forked from OSchip/llvm-project
[MLIR][GPU] Expose GpuParallelLoopMapping as non-test pass.
Reviewed By: bondhugula, herhut Differential Revision: https://reviews.llvm.org/D126199
This commit is contained in:
parent
a5ddd4a238
commit
bcf3d52486
|
@ -60,13 +60,5 @@ ParallelLoopDimMapping getParallelLoopDimMappingAttr(Processor processor,
|
||||||
LogicalResult setMappingAttr(scf::ParallelOp ploopOp,
|
LogicalResult setMappingAttr(scf::ParallelOp ploopOp,
|
||||||
ArrayRef<ParallelLoopDimMapping> mapping);
|
ArrayRef<ParallelLoopDimMapping> mapping);
|
||||||
} // namespace gpu
|
} // namespace gpu
|
||||||
|
|
||||||
/// Maps the parallel loops found in the given function to workgroups. The first
|
|
||||||
/// loop encountered will be mapped to the global workgroup and the second loop
|
|
||||||
/// encountered to the local workgroup. Within each mapping, the first three
|
|
||||||
/// dimensions are mapped to x/y/z hardware ids and all following dimensions are
|
|
||||||
/// mapped to sequential loops.
|
|
||||||
void greedilyMapParallelSCFToGPU(Region &region);
|
|
||||||
|
|
||||||
} // namespace mlir
|
} // namespace mlir
|
||||||
#endif // MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H
|
#endif // MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H
|
||||||
|
|
|
@ -39,6 +39,13 @@ createGpuKernelOutliningPass(StringRef dataLayoutStr = StringRef());
|
||||||
/// Rewrites a function region so that GPU ops execute asynchronously.
|
/// Rewrites a function region so that GPU ops execute asynchronously.
|
||||||
std::unique_ptr<OperationPass<func::FuncOp>> createGpuAsyncRegionPass();
|
std::unique_ptr<OperationPass<func::FuncOp>> createGpuAsyncRegionPass();
|
||||||
|
|
||||||
|
/// Maps the parallel loops found in the given function to workgroups. The first
|
||||||
|
/// loop encountered will be mapped to the global workgroup and the second loop
|
||||||
|
/// encountered to the local workgroup. Within each mapping, the first three
|
||||||
|
/// dimensions are mapped to x/y/z hardware ids and all following dimensions are
|
||||||
|
/// mapped to sequential loops.
|
||||||
|
std::unique_ptr<OperationPass<func::FuncOp>> createGpuMapParallelLoopsPass();
|
||||||
|
|
||||||
/// Collect a set of patterns to rewrite all-reduce ops within the GPU dialect.
|
/// Collect a set of patterns to rewrite all-reduce ops within the GPU dialect.
|
||||||
void populateGpuAllReducePatterns(RewritePatternSet &patterns);
|
void populateGpuAllReducePatterns(RewritePatternSet &patterns);
|
||||||
|
|
||||||
|
|
|
@ -29,4 +29,11 @@ def GpuAsyncRegionPass : Pass<"gpu-async-region", "func::FuncOp"> {
|
||||||
let dependentDialects = ["async::AsyncDialect"];
|
let dependentDialects = ["async::AsyncDialect"];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def GpuMapParallelLoopsPass
|
||||||
|
: Pass<"gpu-map-parallel-loops", "mlir::func::FuncOp"> {
|
||||||
|
let summary = "Greedily maps loops to GPU hardware dimensions.";
|
||||||
|
let constructor = "mlir::createGpuMapParallelLoopsPass()";
|
||||||
|
let description = "Greedily maps loops to GPU hardware dimensions.";
|
||||||
|
}
|
||||||
|
|
||||||
#endif // MLIR_DIALECT_GPU_PASSES
|
#endif // MLIR_DIALECT_GPU_PASSES
|
||||||
|
|
|
@ -13,26 +13,25 @@
|
||||||
|
|
||||||
#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
|
#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
|
||||||
|
|
||||||
|
#include "PassDetail.h"
|
||||||
#include "mlir/Dialect/GPU/GPUDialect.h"
|
#include "mlir/Dialect/GPU/GPUDialect.h"
|
||||||
#include "mlir/Dialect/GPU/Passes.h"
|
#include "mlir/Dialect/GPU/Passes.h"
|
||||||
#include "mlir/Dialect/SCF/SCF.h"
|
#include "mlir/Dialect/SCF/SCF.h"
|
||||||
#include "mlir/IR/AffineMap.h"
|
#include "mlir/IR/AffineMap.h"
|
||||||
#include "mlir/Pass/Pass.h"
|
#include "mlir/Pass/Pass.h"
|
||||||
|
|
||||||
using namespace mlir;
|
|
||||||
using namespace mlir::gpu;
|
|
||||||
using namespace mlir::scf;
|
|
||||||
|
|
||||||
#include "mlir/Dialect/GPU/ParallelLoopMapperAttr.cpp.inc"
|
#include "mlir/Dialect/GPU/ParallelLoopMapperAttr.cpp.inc"
|
||||||
#include "mlir/Dialect/GPU/ParallelLoopMapperEnums.cpp.inc"
|
#include "mlir/Dialect/GPU/ParallelLoopMapperEnums.cpp.inc"
|
||||||
|
|
||||||
namespace mlir {
|
namespace mlir {
|
||||||
namespace gpu {
|
|
||||||
|
|
||||||
StringRef getMappingAttrName() { return "mapping"; }
|
using scf::ParallelOp;
|
||||||
|
|
||||||
ParallelLoopDimMapping getParallelLoopDimMappingAttr(Processor processor,
|
StringRef gpu::getMappingAttrName() { return "mapping"; }
|
||||||
AffineMap map,
|
|
||||||
AffineMap bound) {
|
gpu::ParallelLoopDimMapping
|
||||||
|
gpu::getParallelLoopDimMappingAttr(Processor processor, AffineMap map,
|
||||||
|
AffineMap bound) {
|
||||||
MLIRContext *context = map.getContext();
|
MLIRContext *context = map.getContext();
|
||||||
OpBuilder builder(context);
|
OpBuilder builder(context);
|
||||||
return ParallelLoopDimMapping::get(
|
return ParallelLoopDimMapping::get(
|
||||||
|
@ -40,8 +39,8 @@ ParallelLoopDimMapping getParallelLoopDimMappingAttr(Processor processor,
|
||||||
AffineMapAttr::get(map), AffineMapAttr::get(bound), context);
|
AffineMapAttr::get(map), AffineMapAttr::get(bound), context);
|
||||||
}
|
}
|
||||||
|
|
||||||
LogicalResult setMappingAttr(scf::ParallelOp ploopOp,
|
LogicalResult gpu::setMappingAttr(ParallelOp ploopOp,
|
||||||
ArrayRef<ParallelLoopDimMapping> mapping) {
|
ArrayRef<ParallelLoopDimMapping> mapping) {
|
||||||
// Verify that each processor is mapped to only once.
|
// Verify that each processor is mapped to only once.
|
||||||
llvm::DenseSet<gpu::Processor> specifiedMappings;
|
llvm::DenseSet<gpu::Processor> specifiedMappings;
|
||||||
for (auto dimAttr : mapping) {
|
for (auto dimAttr : mapping) {
|
||||||
|
@ -56,20 +55,17 @@ LogicalResult setMappingAttr(scf::ParallelOp ploopOp,
|
||||||
ArrayAttr::get(ploopOp.getContext(), mappingAsAttrs));
|
ArrayAttr::get(ploopOp.getContext(), mappingAsAttrs));
|
||||||
return success();
|
return success();
|
||||||
}
|
}
|
||||||
} // namespace gpu
|
|
||||||
} // namespace mlir
|
|
||||||
|
|
||||||
|
namespace gpu {
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
enum MappingLevel { MapGrid = 0, MapBlock = 1, Sequential = 2 };
|
enum MappingLevel { MapGrid = 0, MapBlock = 1, Sequential = 2 };
|
||||||
|
} // namespace
|
||||||
|
|
||||||
static constexpr int kNumHardwareIds = 3;
|
static constexpr int kNumHardwareIds = 3;
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
/// Bounded increment on MappingLevel. Increments to the next
|
/// Bounded increment on MappingLevel. Increments to the next
|
||||||
/// level unless Sequential was already reached.
|
/// level unless Sequential was already reached.
|
||||||
MappingLevel &operator++(MappingLevel &mappingLevel) {
|
static MappingLevel &operator++(MappingLevel &mappingLevel) {
|
||||||
if (mappingLevel < Sequential) {
|
if (mappingLevel < Sequential) {
|
||||||
mappingLevel = static_cast<MappingLevel>(mappingLevel + 1);
|
mappingLevel = static_cast<MappingLevel>(mappingLevel + 1);
|
||||||
}
|
}
|
||||||
|
@ -82,8 +78,7 @@ MappingLevel &operator++(MappingLevel &mappingLevel) {
|
||||||
/// TODO: Make this use x for the inner-most loop that is
|
/// TODO: Make this use x for the inner-most loop that is
|
||||||
/// distributed to map to x, the next innermost to y and the next innermost to
|
/// distributed to map to x, the next innermost to y and the next innermost to
|
||||||
/// z.
|
/// z.
|
||||||
static gpu::Processor getHardwareIdForMapping(MappingLevel level,
|
static Processor getHardwareIdForMapping(MappingLevel level, int dimension) {
|
||||||
int dimension) {
|
|
||||||
|
|
||||||
if (dimension >= kNumHardwareIds || level == Sequential)
|
if (dimension >= kNumHardwareIds || level == Sequential)
|
||||||
return Processor::Sequential;
|
return Processor::Sequential;
|
||||||
|
@ -145,6 +140,21 @@ static void mapParallelOp(ParallelOp parallelOp,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void mlir::greedilyMapParallelSCFToGPU(Region &region) {
|
namespace {
|
||||||
region.walk([](ParallelOp parallelOp) { mapParallelOp(parallelOp); });
|
struct GpuMapParallelLoopsPass
|
||||||
|
: public GpuMapParallelLoopsPassBase<GpuMapParallelLoopsPass> {
|
||||||
|
void runOnOperation() override {
|
||||||
|
for (Region &region : getOperation()->getRegions()) {
|
||||||
|
region.walk([](ParallelOp parallelOp) { mapParallelOp(parallelOp); });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
} // namespace gpu
|
||||||
|
} // namespace mlir
|
||||||
|
|
||||||
|
std::unique_ptr<mlir::OperationPass<mlir::func::FuncOp>>
|
||||||
|
mlir::createGpuMapParallelLoopsPass() {
|
||||||
|
return std::make_unique<gpu::GpuMapParallelLoopsPass>();
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
// RUN: mlir-opt -test-gpu-greedy-parallel-loop-mapping -split-input-file %s | FileCheck %s
|
// RUN: mlir-opt -gpu-map-parallel-loops -split-input-file %s | FileCheck %s
|
||||||
|
|
||||||
func.func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
|
func.func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
|
||||||
%arg3 : index) {
|
%arg3 : index) {
|
||||||
|
|
|
@ -3,7 +3,6 @@ add_mlir_library(MLIRGPUTestPasses
|
||||||
TestConvertGPUKernelToCubin.cpp
|
TestConvertGPUKernelToCubin.cpp
|
||||||
TestConvertGPUKernelToHsaco.cpp
|
TestConvertGPUKernelToHsaco.cpp
|
||||||
TestGpuMemoryPromotion.cpp
|
TestGpuMemoryPromotion.cpp
|
||||||
TestGpuParallelLoopMapping.cpp
|
|
||||||
TestGpuRewrite.cpp
|
TestGpuRewrite.cpp
|
||||||
|
|
||||||
EXCLUDE_FROM_LIBMLIR
|
EXCLUDE_FROM_LIBMLIR
|
||||||
|
|
|
@ -1,47 +0,0 @@
|
||||||
//===- TestGPUParallelLoopMapping.cpp - Test pass for GPU loop mapping ----===//
|
|
||||||
//
|
|
||||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
||||||
// See https://llvm.org/LICENSE.txt for license information.
|
|
||||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
||||||
//
|
|
||||||
//===----------------------------------------------------------------------===//
|
|
||||||
//
|
|
||||||
// This file implements the pass testing the utilities for mapping parallel
|
|
||||||
// loops to gpu hardware ids.
|
|
||||||
//
|
|
||||||
//===----------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
|
|
||||||
#include "mlir/Pass/Pass.h"
|
|
||||||
|
|
||||||
using namespace mlir;
|
|
||||||
|
|
||||||
namespace {
|
|
||||||
/// Simple pass for testing the mapping of parallel loops to hardware ids using
|
|
||||||
/// a greedy mapping strategy.
|
|
||||||
struct TestGpuGreedyParallelLoopMappingPass
|
|
||||||
: public PassWrapper<TestGpuGreedyParallelLoopMappingPass,
|
|
||||||
OperationPass<>> {
|
|
||||||
MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(
|
|
||||||
TestGpuGreedyParallelLoopMappingPass)
|
|
||||||
|
|
||||||
StringRef getArgument() const final {
|
|
||||||
return "test-gpu-greedy-parallel-loop-mapping";
|
|
||||||
}
|
|
||||||
StringRef getDescription() const final {
|
|
||||||
return "Greedily maps all parallel loops to gpu hardware ids.";
|
|
||||||
}
|
|
||||||
void runOnOperation() override {
|
|
||||||
for (Region &region : getOperation()->getRegions())
|
|
||||||
greedilyMapParallelSCFToGPU(region);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
namespace mlir {
|
|
||||||
namespace test {
|
|
||||||
void registerTestGpuParallelLoopMappingPass() {
|
|
||||||
PassRegistration<TestGpuGreedyParallelLoopMappingPass>();
|
|
||||||
}
|
|
||||||
} // namespace test
|
|
||||||
} // namespace mlir
|
|
|
@ -79,7 +79,6 @@ void registerTestDynamicPipelinePass();
|
||||||
void registerTestExpandTanhPass();
|
void registerTestExpandTanhPass();
|
||||||
void registerTestComposeSubView();
|
void registerTestComposeSubView();
|
||||||
void registerTestMultiBuffering();
|
void registerTestMultiBuffering();
|
||||||
void registerTestGpuParallelLoopMappingPass();
|
|
||||||
void registerTestIRVisitorsPass();
|
void registerTestIRVisitorsPass();
|
||||||
void registerTestGenericIRVisitorsPass();
|
void registerTestGenericIRVisitorsPass();
|
||||||
void registerTestGenericIRVisitorsInterruptPass();
|
void registerTestGenericIRVisitorsInterruptPass();
|
||||||
|
@ -176,7 +175,6 @@ void registerTestPasses() {
|
||||||
mlir::test::registerTestExpandTanhPass();
|
mlir::test::registerTestExpandTanhPass();
|
||||||
mlir::test::registerTestComposeSubView();
|
mlir::test::registerTestComposeSubView();
|
||||||
mlir::test::registerTestMultiBuffering();
|
mlir::test::registerTestMultiBuffering();
|
||||||
mlir::test::registerTestGpuParallelLoopMappingPass();
|
|
||||||
mlir::test::registerTestIRVisitorsPass();
|
mlir::test::registerTestIRVisitorsPass();
|
||||||
mlir::test::registerTestGenericIRVisitorsPass();
|
mlir::test::registerTestGenericIRVisitorsPass();
|
||||||
mlir::test::registerTestInterfaces();
|
mlir::test::registerTestInterfaces();
|
||||||
|
|
Loading…
Reference in New Issue