[MLIR][GPU] Implement a simple greedy loop mapper.

Summary:
The mapper assigns mapping annotations to loop.parallel operations that
are compatible with the loops-to-GPU conversion pass. The outermost
loop is mapped to the grid dimensions, the next nested loop to the
block dimensions, and all remaining loops to sequential loops.

Differential Revision: https://reviews.llvm.org/D74963
Stephan Herhut 2020-02-21 16:18:22 +01:00
parent 157b3d505f
commit 7a7eacc797
9 changed files with 252 additions and 8 deletions
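
As an illustration (hypothetical operands; the attribute shape follows the
FileCheck expectations in the new test below), a two-dimensional
loop.parallel at the root of a function would carry the following
annotation after the mapper runs:

loop.parallel (%i0, %i1) = (%lb0, %lb1) to (%ub0, %ub1)
                           step (%s0, %s1) {
  ...
} {mapping = [
    {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 0 : i64},
    {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 1 : i64}]}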


@@ -0,0 +1,50 @@
//===- ParallelLoopMapper.h - Utilities for mapping parallel loops to GPU ====//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This header file declares the utilities to generate mappings for parallel
// loops to GPU devices.
//
//===----------------------------------------------------------------------===//
#ifndef MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H
#define MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H
namespace mlir {
class Region;
namespace gpu {
/// Name of the mapping attribute produced by loop mappers.
static constexpr const char *kMappingAttributeName = "mapping";
/// Name of the processor sub-attribute that identifies the hardware id
/// to map a loop to.
static constexpr const char *kProcessorEntryName = "processor";
/// Name of the map sub-attribute that identifies the affine map to apply
/// to the hardware id to compute the iteration number of the loop. This
/// map is expected to be extended by step and lower bound computations:
/// index = map(hardware_id) * step + lowerbound
static constexpr const char *kIndexMapEntryName = "map";
/// Name of the bound sub-attribute that identifies the affine map to
/// compute an upper bound of iterations for the hardware id. This is
/// applied to an upper bound on the number of iterations:
///   launchBound = bound((upperbound - lowerbound) ceildiv step)
static constexpr const char *kBoundMapEntryName = "bound";
} // end namespace gpu
/// Maps the parallel loops found in the given function to workgroups. The first
/// loop encountered will be mapped to the global workgroup and the second loop
/// encountered to the local workgroup. Within each mapping, the first three
/// dimensions are mapped to x/y/z hardware ids and all following dimensions are
/// mapped to sequential loops.
void greedilyMapParallelLoopsToGPU(Region &region);
} // end namespace mlir
#endif // MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H
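
Taken together, the two maps fix the semantics of a mapped dimension. With
the identity maps the mapper emits, a loop dimension with lower bound lb,
upper bound ub and step s executes iteration index hardware_id * s + lb on
each hardware id, and the loops-to-GPU pass launches
(ub - lb) ceildiv s hardware ids along that dimension. For hypothetical
values lb = 0, ub = 35, s = 4, that is ceildiv(35, 4) = 9 hardware ids,
with id 3 executing the iteration at index 12.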


@@ -289,6 +289,9 @@ def ParallelOp : Loop_Op<"parallel",
   let extraClassDeclaration = [{
     Block *getBody() { return &region().front(); }
+    unsigned getNumInductionVars() {
+      return getBody()->getNumArguments();
+    }
     iterator_range<Block::args_iterator> getInductionVars() {
       return {getBody()->args_begin(), getBody()->args_end()};
     }


@@ -17,6 +17,7 @@
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 #include "mlir/Dialect/AffineOps/AffineOps.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
 #include "mlir/Dialect/LoopOps/LoopOps.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/AffineExpr.h"
@@ -508,23 +509,20 @@ struct MappingAnnotation {
 } // namespace

-static constexpr const char *kProcessorEntryName = "processor";
-static constexpr const char *kIndexMapEntryName = "map";
-static constexpr const char *kBoundMapEntryName = "bound";
-
 /// Extracts the mapping annotations from the provided attribute. The attribute
 /// is expected to be of the form
 ///   { processor = <unsigned>, map = <AffineMap>, bound = <AffineMap> }
 /// where the bound is optional.
 static MappingAnnotation extractMappingAnnotation(Attribute attribute) {
   DictionaryAttr dict = attribute.cast<DictionaryAttr>();
-  unsigned processor = dict.get(kProcessorEntryName)
+  unsigned processor = dict.get(gpu::kProcessorEntryName)
                            .cast<IntegerAttr>()
                            .getValue()
                            .getSExtValue();
-  AffineMap map = dict.get(kIndexMapEntryName).cast<AffineMapAttr>().getValue();
+  AffineMap map =
+      dict.get(gpu::kIndexMapEntryName).cast<AffineMapAttr>().getValue();
   AffineMapAttr boundAttr =
-      dict.get(kBoundMapEntryName).dyn_cast_or_null<AffineMapAttr>();
+      dict.get(gpu::kBoundMapEntryName).dyn_cast_or_null<AffineMapAttr>();
   AffineMap bound;
   if (boundAttr)
     bound = boundAttr.getValue();

@@ -583,7 +581,8 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
                                          PatternRewriter &rewriter) {
   // TODO(herhut): Verify that this is a valid GPU mapping.
   // processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6 -> sequential
-  ArrayAttr mapping = parallelOp.getAttrOfType<ArrayAttr>("mapping");
+  ArrayAttr mapping =
+      parallelOp.getAttrOfType<ArrayAttr>(gpu::kMappingAttributeName);
   // TODO(herhut): Support reductions.
   if (!mapping || parallelOp.getNumResults() != 0)


@@ -3,6 +3,7 @@ add_llvm_library(MLIRGPU
   Transforms/AllReduceLowering.cpp
   Transforms/KernelOutlining.cpp
   Transforms/MemoryPromotion.cpp
+  Transforms/ParallelLoopMapper.cpp

   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU


@@ -0,0 +1,89 @@
//===- ParallelLoopMapper.cpp - Utilities for mapping parallel loops to GPU =//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utilities to generate mappings for parallel loops to
// GPU devices.
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/LoopOps/LoopOps.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/Pass/Pass.h"
using namespace mlir;
using namespace mlir::gpu;
using namespace mlir::loop;
namespace {
enum MappingLevel { MapGrid = 0, MapBlock = 1, Sequential = 2 };
static constexpr int kNumHardwareIds = 3;
} // namespace
/// Bounded increment on MappingLevel. Increments to the next
/// level unless Sequential was already reached.
MappingLevel &operator++(MappingLevel &mappingLevel) {
  if (mappingLevel < Sequential) {
    mappingLevel = static_cast<MappingLevel>(mappingLevel + 1);
  }
  return mappingLevel;
}
/// Computes the hardware id to use for a given mapping level. Assigns
/// the x, y and z hardware ids to the first three dimensions and uses
/// the sequential id for all remaining dimensions.
static int64_t getHardwareIdForMapping(MappingLevel level, int dimension) {
  if (dimension >= kNumHardwareIds || level == Sequential)
    return Sequential * kNumHardwareIds;
  return (level * kNumHardwareIds) + dimension;
}
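// For example, getHardwareIdForMapping(MapGrid, 0) yields 0 (block x),
// getHardwareIdForMapping(MapBlock, 2) yields 1 * 3 + 2 = 5 (thread z), and
// any dimension beyond the third yields Sequential * kNumHardwareIds = 6,
// the id reserved for sequential loops.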
/// Add mapping information to the given parallel loop. Do not add
/// mapping information if the loop already has it. Also, don't
/// start a mapping at a nested loop.
static void mapParallelOp(ParallelOp parallelOp,
                          MappingLevel mappingLevel = MapGrid) {
  // Do not try to add a mapping to already mapped loops or nested loops.
  if (parallelOp.getAttr(gpu::kMappingAttributeName) ||
      ((mappingLevel == MapGrid) && parallelOp.getParentOfType<ParallelOp>()))
    return;

  MLIRContext *ctx = parallelOp.getContext();
  Builder b(ctx);
  SmallVector<Attribute, 4> attrs;
  attrs.reserve(parallelOp.getNumInductionVars());
  for (int i = 0, e = parallelOp.getNumInductionVars(); i < e; ++i) {
    SmallVector<NamedAttribute, 3> entries;
    entries.emplace_back(b.getNamedAttr(
        kProcessorEntryName,
        b.getI64IntegerAttr(getHardwareIdForMapping(mappingLevel, i))));
    entries.emplace_back(b.getNamedAttr(
        kIndexMapEntryName, AffineMapAttr::get(b.getDimIdentityMap())));
    entries.emplace_back(b.getNamedAttr(
        kBoundMapEntryName, AffineMapAttr::get(b.getDimIdentityMap())));
    attrs.push_back(DictionaryAttr::get(entries, ctx));
  }
  parallelOp.setAttr(kMappingAttributeName, ArrayAttr::get(attrs, ctx));
  ++mappingLevel;
  // Parallel loop operations are immediately nested, so do not use
  // walk but just iterate over the operations.
  for (Operation &op : *parallelOp.getBody()) {
    if (ParallelOp nested = dyn_cast<ParallelOp>(op))
      mapParallelOp(nested, mappingLevel);
  }
}
void mlir::greedilyMapParallelLoopsToGPU(Region &region) {
  region.walk([](ParallelOp parallelOp) { mapParallelOp(parallelOp); });
}
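
Note that region.walk also visits nested loop.parallel operations, but
mapParallelOp bails out on loops that already carry a mapping attribute or
that are nested inside another parallel loop, so each loop nest is annotated
exactly once, starting from its root: grid dimensions for the outermost loop,
block dimensions for the next level, and the sequential id for everything
below.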


@@ -0,0 +1,61 @@
// RUN: mlir-opt -test-gpu-greedy-parallel-loop-mapping -split-input-file %s | FileCheck %s
func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
                    %arg3 : index) {
  %zero = constant 0 : index
  %one = constant 1 : index
  %four = constant 4 : index
  loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
                             step (%four, %four) {
    loop.parallel (%si0, %si1) = (%zero, %zero) to (%four, %four)
                                 step (%one, %one) {
    }
  }
  return
}
// CHECK-LABEL: func @parallel_loop(
// CHECK: loop.parallel
// CHECK: loop.parallel
// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 3 : i64},
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 4 : i64}]}
// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 0 : i64},
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 1 : i64}]}
// CHECK-NOT: mapping
// -----
func @parallel_loop_4d(%arg0 : index, %arg1 : index, %arg2 : index,
                       %arg3 : index) {
  %zero = constant 0 : index
  %one = constant 1 : index
  %four = constant 4 : index
  loop.parallel (%i0, %i1, %i2, %i3) = (%zero, %zero, %zero, %zero)
      to (%arg0, %arg1, %arg2, %arg3) step (%four, %four, %four, %four) {
    loop.parallel (%si0, %si1, %si2, %si3) = (%zero, %zero, %zero, %zero)
        to (%four, %four, %four, %four) step (%one, %one, %one, %one) {
      loop.parallel (%ti0, %ti1, %ti2, %ti3) = (%zero, %zero, %zero, %zero)
          to (%four, %four, %four, %four) step (%one, %one, %one, %one) {
      }
    }
  }
  return
}
// CHECK-LABEL: func @parallel_loop_4d(
// CHECK: loop.parallel
// CHECK: loop.parallel
// CHECK: loop.parallel
// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64},
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64},
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64},
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}]}
// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 3 : i64},
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 4 : i64},
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 5 : i64},
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}]}
// CHECK: {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 0 : i64},
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 1 : i64},
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 2 : i64},
// CHECK-SAME: {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 6 : i64}]}
// CHECK-NOT: mapping
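
The processor ids in these expectations follow the numbering used by the
conversion pass: 0-2 are block x/y/z, 3-5 are thread x/y/z, and 6 marks a
sequential loop. Since only three hardware ids exist per level, the fourth
dimension of every loop in the 4-d test falls back to id 6.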


@@ -5,6 +5,7 @@ add_llvm_library(MLIRTestTransforms
   TestConstantFold.cpp
   TestLoopFusion.cpp
   TestGpuMemoryPromotion.cpp
+  TestGpuParallelLoopMapping.cpp
   TestInlining.cpp
   TestLinalgTransforms.cpp
   TestLiveness.cpp


@@ -0,0 +1,38 @@
//===- TestGPUParallelLoopMapping.cpp - Test pass for GPU loop mapping ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the pass testing the utilities for mapping parallel
// loops to gpu hardware ids.
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
#include "mlir/Pass/Pass.h"
using namespace mlir;
namespace {
/// Simple pass for testing the mapping of parallel loops to hardware ids using
/// a greedy mapping strategy.
class TestGpuGreedyParallelLoopMappingPass
    : public OperationPass<TestGpuGreedyParallelLoopMappingPass, FuncOp> {
  void runOnOperation() override {
    Operation *op = getOperation();
    for (Region &region : op->getRegions())
      greedilyMapParallelLoopsToGPU(region);
  }
};
} // end namespace
namespace mlir {
void registerTestGpuParallelLoopMappingPass() {
  PassRegistration<TestGpuGreedyParallelLoopMappingPass> registration(
      "test-gpu-greedy-parallel-loop-mapping",
      "Greedily maps all parallel loops to gpu hardware ids.");
}
} // namespace mlir


@@ -50,6 +50,7 @@ void registerTestMemRefDependenceCheck();
 void registerTestMemRefStrideCalculation();
 void registerTestOpaqueLoc();
 void registerTestParallelismDetection();
+void registerTestGpuParallelLoopMappingPass();
 void registerTestVectorConversions();
 void registerTestVectorToLoopsPass();
 void registerVectorizerTestPass();

@@ -103,6 +104,7 @@ void registerTestPasses() {
   registerTestMemRefStrideCalculation();
   registerTestOpaqueLoc();
   registerTestParallelismDetection();
+  registerTestGpuParallelLoopMappingPass();
   registerTestVectorConversions();
   registerTestVectorToLoopsPass();
   registerVectorizerTestPass();