forked from OSchip/llvm-project
[mlir][GPU] Use StructAttr to drive lowering from loop.parallel to
gpu.launch The current implementation of the lowering from loop.parallel to gpu.launch uses a DictionaryAttr to specify the mapping. This change moves the attribute to be auto-generated from a specification as a StructAttr, which greatly simplifies the logic of looking up and creating this attribute. Differential Revision: https://reviews.llvm.org/D76165
This commit is contained in:
parent
b684c1a50f
commit
46bb6613a3
|
@ -1,2 +1,12 @@
|
|||
add_mlir_dialect(GPUOps gpu)
|
||||
add_mlir_dialect(GPUOps gpu GPUOps)
|
||||
add_mlir_doc(GPUOps -gen-dialect-doc GPUDialect Dialects/)
|
||||
|
||||
set(LLVM_TARGET_DEFINITIONS ParallelLoopMapperAttr.td)
|
||||
mlir_tablegen(ParallelLoopMapperAttr.h.inc -gen-struct-attr-decls)
|
||||
mlir_tablegen(ParallelLoopMapperAttr.cpp.inc -gen-struct-attr-defs)
|
||||
add_public_tablegen_target(MLIRParallelLoopMapperAttrGen)
|
||||
|
||||
set(LLVM_TARGET_DEFINITIONS ParallelLoopMapperAttr.td)
|
||||
mlir_tablegen(ParallelLoopMapperEnums.h.inc -gen-enum-decls)
|
||||
mlir_tablegen(ParallelLoopMapperEnums.cpp.inc -gen-enum-defs)
|
||||
add_public_tablegen_target(MLIRParallelLoopMapperEnumsGen)
|
||||
|
|
|
@ -0,0 +1,58 @@
|
|||
//===-- GPUBase.td - GPU dialect definitions ---------------*- tablegen -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Defines the GPU dialect
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef GPU_BASE
|
||||
#define GPU_BASE
|
||||
|
||||
include "mlir/IR/OpBase.td"
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// GPU Dialect.
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
def GPU_Dialect : Dialect {
|
||||
let name = "gpu";
|
||||
let hasOperationAttrVerify = 1;
|
||||
|
||||
let extraClassDeclaration = [{
|
||||
/// Get the name of the attribute used to annotate the modules that contain
|
||||
/// kernel modules.
|
||||
static StringRef getContainerModuleAttrName() {
|
||||
return "gpu.container_module";
|
||||
}
|
||||
/// Get the name of the attribute used to annotate external kernel
|
||||
/// functions.
|
||||
static StringRef getKernelFuncAttrName() { return "gpu.kernel"; }
|
||||
|
||||
/// Get the name of the attribute used to annotate kernel modules.
|
||||
static StringRef getKernelModuleAttrName() { return "gpu.kernel_module"; }
|
||||
|
||||
/// Returns whether the given function is a kernel function, i.e., has the
|
||||
/// 'gpu.kernel' attribute.
|
||||
static bool isKernel(Operation *op);
|
||||
|
||||
/// Returns the number of workgroup (thread, block) dimensions supported in
|
||||
/// the GPU dialect.
|
||||
// TODO(zinenko,herhut): consider generalizing this.
|
||||
static unsigned getNumWorkgroupDimensions() { return 3; }
|
||||
|
||||
/// Returns the numeric value used to identify the workgroup memory address
|
||||
/// space.
|
||||
static unsigned getWorkgroupAddressSpace() { return 3; }
|
||||
|
||||
/// Returns the numeric value used to identify the private memory address
|
||||
/// space.
|
||||
static unsigned getPrivateAddressSpace() { return 5; }
|
||||
}];
|
||||
}
|
||||
|
||||
#endif // GPU_BASE
|
|
@ -13,6 +13,7 @@
|
|||
#ifndef GPU_OPS
|
||||
#define GPU_OPS
|
||||
|
||||
include "mlir/Dialect/GPU/GPUBase.td"
|
||||
include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
|
||||
include "mlir/Interfaces/SideEffects.td"
|
||||
|
||||
|
@ -26,42 +27,6 @@ def IntLikeOrLLVMInt : TypeConstraint<
|
|||
// GPU Dialect operations.
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
def GPU_Dialect : Dialect {
|
||||
let name = "gpu";
|
||||
let hasOperationAttrVerify = 1;
|
||||
|
||||
let extraClassDeclaration = [{
|
||||
/// Get the name of the attribute used to annotate the modules that contain
|
||||
/// kernel modules.
|
||||
static StringRef getContainerModuleAttrName() {
|
||||
return "gpu.container_module";
|
||||
}
|
||||
/// Get the name of the attribute used to annotate external kernel
|
||||
/// functions.
|
||||
static StringRef getKernelFuncAttrName() { return "gpu.kernel"; }
|
||||
|
||||
/// Get the name of the attribute used to annotate kernel modules.
|
||||
static StringRef getKernelModuleAttrName() { return "gpu.kernel_module"; }
|
||||
|
||||
/// Returns whether the given function is a kernel function, i.e., has the
|
||||
/// 'gpu.kernel' attribute.
|
||||
static bool isKernel(Operation *op);
|
||||
|
||||
/// Returns the number of workgroup (thread, block) dimensions supported in
|
||||
/// the GPU dialect.
|
||||
// TODO(zinenko,herhut): consider generalizing this.
|
||||
static unsigned getNumWorkgroupDimensions() { return 3; }
|
||||
|
||||
/// Returns the numeric value used to identify the workgroup memory address
|
||||
/// space.
|
||||
static unsigned getWorkgroupAddressSpace() { return 3; }
|
||||
|
||||
/// Returns the numeric value used to identify the private memory address
|
||||
/// space.
|
||||
static unsigned getPrivateAddressSpace() { return 5; }
|
||||
}];
|
||||
}
|
||||
|
||||
class GPU_Op<string mnemonic, list<OpTrait> traits = []> :
|
||||
Op<GPU_Dialect, mnemonic, traits>;
|
||||
|
||||
|
|
|
@ -14,28 +14,48 @@
|
|||
#ifndef MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H
|
||||
#define MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H
|
||||
|
||||
#include "mlir/IR/Attributes.h"
|
||||
#include "mlir/Support/LLVM.h"
|
||||
#include "llvm/ADT/DenseMap.h"
|
||||
|
||||
#include "mlir/Dialect/GPU/ParallelLoopMapperEnums.h.inc"
|
||||
|
||||
namespace mlir {
|
||||
|
||||
class AffineMap;
|
||||
struct LogicalResult;
|
||||
class Operation;
|
||||
class Region;
|
||||
|
||||
#include "mlir/Dialect/GPU/ParallelLoopMapperAttr.h.inc"
|
||||
|
||||
namespace loop {
|
||||
class ParallelOp;
|
||||
}
|
||||
|
||||
namespace gpu {
|
||||
|
||||
/// Name of the mapping attribute produced by loop mappers.
|
||||
static constexpr const char *kMappingAttributeName = "mapping";
|
||||
/// Name of the processor sub-attribute that identifies the hardware id
|
||||
/// to map a loop to.
|
||||
static constexpr const char *kProcessorEntryName = "processor";
|
||||
/// Name of the map sub-attribute that identifies the affine map to apply
|
||||
/// to the hardware id to compute the iteration number of the loop. This
|
||||
/// map is expected to be extended by step and lower bound computations:
|
||||
/// index = map(hardware_id) * step + lowerbound
|
||||
static constexpr const char *kIndexMapEntryName = "map";
|
||||
/// Name of the bound sub-attribute that identifies the affine map to
|
||||
/// compute an upper bound of iterations for the hardware id. This is
|
||||
/// applied to an upper bound on the number of iterations:
|
||||
/// launchBound = bound(upperbound-lowerbound ceildiv step)
|
||||
static constexpr const char *kBoundMapEntryName = "bound";
|
||||
StringRef getMappingAttrName();
|
||||
|
||||
/// Get the value of the processor in the ParallelLoopDimMapping attribute.
|
||||
inline Processor getProcessor(ParallelLoopDimMapping attr) {
|
||||
return static_cast<Processor>(attr.processor().getInt());
|
||||
}
|
||||
|
||||
/// Helper function to create a ParallelLoopDimMapping attribute.
|
||||
/// TODO(ravishankarm/antiagainst): Replace its uses with an auto-gened method.
|
||||
ParallelLoopDimMapping getParallelLoopDimMappingAttr(Processor processor,
|
||||
AffineMap map,
|
||||
AffineMap bound);
|
||||
|
||||
/// Sets the mapping attribute of a loop.parallel operation. Verifies that the
|
||||
/// mapping passed is valid.
|
||||
/// - the number of DimMapperAttr provided is same as the number of loops of
|
||||
/// the `ploopOp`.
|
||||
/// - the mapping does not map multiple loops to the same processor.
|
||||
LogicalResult setMappingAttr(loop::ParallelOp ploopOp,
|
||||
ArrayRef<ParallelLoopDimMapping> mapping);
|
||||
} // end namespace gpu
|
||||
|
||||
/// Maps the parallel loops found in the given function to workgroups. The first
|
||||
|
@ -46,5 +66,4 @@ static constexpr const char *kBoundMapEntryName = "bound";
|
|||
void greedilyMapParallelLoopsToGPU(Region ®ion);
|
||||
|
||||
} // end namespace mlir
|
||||
|
||||
#endif // MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H
|
||||
|
|
|
@ -0,0 +1,51 @@
|
|||
//===-- ParallelLoopMapperAttr.td - Attribute definition ---*- tablegen -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Defines the attribute used for driving conversion from loop.parallel to
|
||||
// gpu.launch operations
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef PARALLEL_LOOP_MAPPER_ATTR
|
||||
#define PARALLEL_LOOP_MAPPER_ATTR
|
||||
|
||||
include "mlir/Dialect/Affine/IR/AffineOpsBase.td"
|
||||
include "mlir/Dialect/GPU/GPUBase.td"
|
||||
|
||||
def BlockX : I64EnumAttrCase<"BlockX", 0>;
|
||||
def BlockY : I64EnumAttrCase<"BlockY", 1>;
|
||||
def BlockZ : I64EnumAttrCase<"BlockZ", 2>;
|
||||
def ThreadX : I64EnumAttrCase<"ThreadX", 3>;
|
||||
def ThreadY : I64EnumAttrCase<"ThreadY", 4>;
|
||||
def ThreadZ : I64EnumAttrCase<"ThreadZ", 5>;
|
||||
def Sequential : I64EnumAttrCase<"Sequential", 6>;
|
||||
|
||||
def ProcessorAttr : I64EnumAttr<"Processor", "processor for loop mapping", [
|
||||
BlockX, BlockY, BlockZ, ThreadX, ThreadY, ThreadZ, Sequential]> {
|
||||
let cppNamespace = "::mlir::gpu";
|
||||
}
|
||||
|
||||
// Attribute that drives conversion of a loop.parallel to gpu.launch
|
||||
// operation.
|
||||
// processor: the hardware id to map to.
|
||||
// map : An affine map that is used to pre-process hardware ids before
|
||||
// substitution.
|
||||
// bound : An affine map that is used to compute the bound of the hardware
|
||||
// id based on an upper bound of the number of iterations.
|
||||
def ParallelLoopDimMappingAttr :
|
||||
StructAttr<"ParallelLoopDimMapping", GPU_Dialect,
|
||||
[StructFieldAttr<"processor", ProcessorAttr>,
|
||||
StructFieldAttr<"map", AffineMapAttr>,
|
||||
StructFieldAttr<"bound", AffineMapAttr>]>;
|
||||
|
||||
|
||||
def ParallelLoopMappingAttr :
|
||||
TypedArrayAttrBase<ParallelLoopDimMappingAttr,
|
||||
"parallel loop to processor mapping attribute">;
|
||||
|
||||
#endif // PARALLEL_LOOP_MAPPER_ATTR
|
|
@ -500,35 +500,8 @@ struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> {
|
|||
LogicalResult matchAndRewrite(ParallelOp parallelOp,
|
||||
PatternRewriter &rewriter) const override;
|
||||
};
|
||||
|
||||
struct MappingAnnotation {
|
||||
unsigned processor;
|
||||
AffineMap indexMap;
|
||||
AffineMap boundMap;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
/// Extracts the mapping annotations from the provided attribute. The attribute
|
||||
/// is expected to be of the form
|
||||
/// { processor = <unsigned>, map = <AffineMap>, bound = <AffineMap> }
|
||||
/// where the bound is optional.
|
||||
static MappingAnnotation extractMappingAnnotation(Attribute attribute) {
|
||||
DictionaryAttr dict = attribute.cast<DictionaryAttr>();
|
||||
unsigned processor = dict.get(gpu::kProcessorEntryName)
|
||||
.cast<IntegerAttr>()
|
||||
.getValue()
|
||||
.getSExtValue();
|
||||
AffineMap map =
|
||||
dict.get(gpu::kIndexMapEntryName).cast<AffineMapAttr>().getValue();
|
||||
AffineMapAttr boundAttr =
|
||||
dict.get(gpu::kBoundMapEntryName).dyn_cast_or_null<AffineMapAttr>();
|
||||
AffineMap bound;
|
||||
if (boundAttr)
|
||||
bound = boundAttr.getValue();
|
||||
return {processor, map, bound};
|
||||
}
|
||||
|
||||
/// Tries to derive a static upper bound from the defining operation of
|
||||
/// `upperBound`.
|
||||
static Value deriveStaticUpperBound(Value upperBound,
|
||||
|
@ -546,6 +519,30 @@ static Value deriveStaticUpperBound(Value upperBound,
|
|||
return {};
|
||||
}
|
||||
|
||||
/// Returns true for every processor value except gpu::Processor::Sequential.
static bool isMappedToProcessor(gpu::Processor processor) {
  const bool isSequential = (processor == gpu::Processor::Sequential);
  return !isSequential;
}
|
||||
|
||||
/// Translates a block or thread processor into the index used to address the
/// corresponding gpu.launch body argument / launch operand: BlockX..BlockZ map
/// to 0..2 and ThreadX..ThreadZ map to 3..5. Any other value (including
/// Sequential) is a programming error and hits llvm_unreachable.
static unsigned getLaunchOpArgumentNum(gpu::Processor processor) {
  unsigned argumentNum = 0;
  switch (processor) {
  case gpu::Processor::BlockX:
    argumentNum = 0;
    break;
  case gpu::Processor::BlockY:
    argumentNum = 1;
    break;
  case gpu::Processor::BlockZ:
    argumentNum = 2;
    break;
  case gpu::Processor::ThreadX:
    argumentNum = 3;
    break;
  case gpu::Processor::ThreadY:
    argumentNum = 4;
    break;
  case gpu::Processor::ThreadZ:
    argumentNum = 5;
    break;
  default:
    llvm_unreachable(
        "invalid processor type while retrieving launch op argument number");
  }
  return argumentNum;
}
|
||||
|
||||
/// Modifies the current transformation state to capture the effect of the given
|
||||
/// `loop.parallel` operation on index substitutions and the operations to be
|
||||
/// inserted.
|
||||
|
@ -568,16 +565,14 @@ static Value deriveStaticUpperBound(Value upperBound,
|
|||
/// inserted, a sentinel (the `gpu.launch` operation) is inserted into the
|
||||
/// worklist. This signals the processor of the worklist to pop the rewriter
|
||||
/// one scope-level up.
|
||||
static LogicalResult processParallelLoop(ParallelOp parallelOp,
|
||||
gpu::LaunchOp launchOp,
|
||||
BlockAndValueMapping &cloningMap,
|
||||
SmallVectorImpl<Operation *> &worklist,
|
||||
DenseMap<int, Value> &bounds,
|
||||
PatternRewriter &rewriter) {
|
||||
static LogicalResult processParallelLoop(
|
||||
ParallelOp parallelOp, gpu::LaunchOp launchOp,
|
||||
BlockAndValueMapping &cloningMap, SmallVectorImpl<Operation *> &worklist,
|
||||
DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) {
|
||||
// TODO(herhut): Verify that this is a valid GPU mapping.
|
||||
// processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential
|
||||
ArrayAttr mapping =
|
||||
parallelOp.getAttrOfType<ArrayAttr>(gpu::kMappingAttributeName);
|
||||
parallelOp.getAttrOfType<ArrayAttr>(gpu::getMappingAttrName());
|
||||
|
||||
// TODO(herhut): Support reductions.
|
||||
if (!mapping || parallelOp.getNumResults() != 0)
|
||||
|
@ -604,12 +599,17 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
|
|||
Attribute mappingAttribute;
|
||||
Value iv, lowerBound, upperBound, step;
|
||||
std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config;
|
||||
MappingAnnotation annotation = extractMappingAnnotation(mappingAttribute);
|
||||
auto annotation = mappingAttribute.dyn_cast<gpu::ParallelLoopDimMapping>();
|
||||
if (!annotation)
|
||||
return parallelOp.emitOpError()
|
||||
<< "expected mapping attribute for lowering to GPU";
|
||||
Value newIndex;
|
||||
gpu::Processor processor = gpu::getProcessor(annotation);
|
||||
|
||||
if (annotation.processor < gpu::LaunchOp::kNumConfigOperands) {
|
||||
if (isMappedToProcessor(processor)) {
|
||||
// Use the corresponding thread/grid index as replacement for the loop iv.
|
||||
Value operand = launchOp.body().front().getArgument(annotation.processor);
|
||||
Value operand = launchOp.body().front().getArgument(
|
||||
getLaunchOpArgumentNum(processor));
|
||||
// Take the indexmap and add the lower bound and step computations in.
|
||||
// This computes operand * step + lowerBound.
|
||||
// Use an affine map here so that it composes nicely with the provided
|
||||
|
@ -619,11 +619,11 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
|
|||
rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) +
|
||||
rewriter.getAffineSymbolExpr(1));
|
||||
newIndex = rewriter.create<AffineApplyOp>(
|
||||
loc, annotation.indexMap.compose(lowerAndStep),
|
||||
loc, annotation.map().getValue().compose(lowerAndStep),
|
||||
ValueRange{operand, step, lowerBound});
|
||||
// If there was also a bound, insert that, too.
|
||||
// TODO(herhut): Check that we do not assign bounds twice.
|
||||
if (annotation.boundMap) {
|
||||
if (annotation.bound().getValue()) {
|
||||
// We pass as the single operand to the bound-map the number of
|
||||
// iterations, which is (upperBound - lowerBound) ceilDiv step. To
|
||||
// support inner loops with dynamic upper bounds (as generated by e.g.
|
||||
|
@ -663,19 +663,21 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
|
|||
rewriter.getAffineSymbolExpr(1))
|
||||
.ceilDiv(rewriter.getAffineSymbolExpr(2))));
|
||||
Value launchBound = rewriter.create<AffineApplyOp>(
|
||||
loc, annotation.boundMap.compose(stepMap),
|
||||
loc, annotation.bound().getValue().compose(stepMap),
|
||||
ValueRange{
|
||||
ensureLaunchIndependent(
|
||||
cloningMap.lookupOrDefault(upperBound)),
|
||||
ensureLaunchIndependent(
|
||||
cloningMap.lookupOrDefault(lowerBound)),
|
||||
ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});
|
||||
if (bounds.find(annotation.processor) != bounds.end()) {
|
||||
// todo(herhut,ravishankarm): Update the behavior of setMappingAttr
|
||||
// when this condition is relaxed.
|
||||
if (bounds.find(processor) != bounds.end()) {
|
||||
return parallelOp.emitOpError()
|
||||
<< "cannot redefine the bound for processor "
|
||||
<< annotation.processor;
|
||||
<< static_cast<int64_t>(processor);
|
||||
}
|
||||
bounds[annotation.processor] = launchBound;
|
||||
bounds[processor] = launchBound;
|
||||
}
|
||||
if (!boundIsPrecise) {
|
||||
// We are using an approximation, create a surrounding conditional.
|
||||
|
@ -757,7 +759,7 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
|
|||
rewriter.setInsertionPointToStart(&launchOp.body().front());
|
||||
|
||||
BlockAndValueMapping cloningMap;
|
||||
llvm::DenseMap<int, Value> launchBounds;
|
||||
llvm::DenseMap<gpu::Processor, Value> launchBounds;
|
||||
SmallVector<Operation *, 16> worklist;
|
||||
if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
|
||||
launchBounds, rewriter)))
|
||||
|
@ -809,7 +811,8 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
|
|||
// Now that we succeeded creating the launch operation, also update the
|
||||
// bounds.
|
||||
for (auto bound : launchBounds)
|
||||
launchOp.setOperand(std::get<0>(bound), std::get<1>(bound));
|
||||
launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)),
|
||||
std::get<1>(bound));
|
||||
|
||||
rewriter.eraseOp(parallelOp);
|
||||
return success();
|
||||
|
|
|
@ -10,6 +10,8 @@ add_mlir_dialect_library(MLIRGPU
|
|||
|
||||
DEPENDS
|
||||
MLIRGPUOpsIncGen
|
||||
MLIRParallelLoopMapperAttrGen
|
||||
MLIRParallelLoopMapperEnumsGen
|
||||
)
|
||||
target_link_libraries(MLIRGPU
|
||||
PUBLIC
|
||||
|
|
|
@ -23,6 +23,43 @@ using namespace mlir;
|
|||
using namespace mlir::gpu;
|
||||
using namespace mlir::loop;
|
||||
|
||||
#include "mlir/Dialect/GPU/ParallelLoopMapperEnums.cpp.inc"
|
||||
namespace mlir {
|
||||
|
||||
#include "mlir/Dialect/GPU/ParallelLoopMapperAttr.cpp.inc"
|
||||
namespace gpu {
|
||||
|
||||
/// Returns the name of the attribute under which the loop-to-processor mapping
/// is stored on an operation.
StringRef getMappingAttrName() {
  return StringRef("mapping");
}
|
||||
|
||||
/// Builds a ParallelLoopDimMapping attribute from its three components.
///   processor: enumerator stored in the `processor` field as a 64-bit
///              integer attribute.
///   map:       affine map stored in the `map` field. The MLIRContext used to
///              create the attribute is taken from this map.
///   bound:     affine map stored in the `bound` field.
ParallelLoopDimMapping getParallelLoopDimMappingAttr(Processor processor,
                                                     AffineMap map,
                                                     AffineMap bound) {
  MLIRContext *context = map.getContext();
  OpBuilder builder(context);
  // Cast to int64_t so the width of the cast matches the I64 attribute that is
  // being created (previously the value went through int32_t first).
  return ParallelLoopDimMapping::get(
      builder.getI64IntegerAttr(static_cast<int64_t>(processor)),
      AffineMapAttr::get(map), AffineMapAttr::get(bound), context);
}
|
||||
|
||||
/// Attaches `mapping` to `ploopOp` as an array attribute named by
/// getMappingAttrName().
///
/// Before setting the attribute, checks that no two entries of `mapping` name
/// the same non-sequential processor. On a duplicate, an error is emitted on
/// `ploopOp`, the attribute is left untouched and failure is returned.
/// Processor::Sequential may appear any number of times.
LogicalResult setMappingAttr(loop::ParallelOp ploopOp,
                             ArrayRef<ParallelLoopDimMapping> mapping) {
  // Verify that each processor is mapped to only once.
  llvm::DenseSet<gpu::Processor> specifiedMappings;
  for (auto dimAttr : mapping) {
    gpu::Processor processor = getProcessor(dimAttr);
    // `insert` reports via `.second` whether the processor was newly added.
    // Recording the processor here is what makes the duplicate check
    // effective; a pure `count` query on a set that is never filled would
    // always succeed.
    if (processor != gpu::Processor::Sequential &&
        !specifiedMappings.insert(processor).second)
      return ploopOp.emitError(
          "invalid mapping multiple loops to same processor");
  }
  ArrayRef<Attribute> mappingAsAttrs(mapping.data(), mapping.size());
  ploopOp.setAttr(getMappingAttrName(),
                  ArrayAttr::get(mappingAsAttrs, ploopOp.getContext()));
  return success();
}
|
||||
} // namespace gpu
|
||||
} // namespace mlir
|
||||
|
||||
namespace {
|
||||
|
||||
enum MappingLevel { MapGrid = 0, MapBlock = 1, Sequential = 2 };
|
||||
|
@ -43,10 +80,41 @@ MappingLevel &operator++(MappingLevel &mappingLevel) {
|
|||
/// Computes the hardware id to use for a given mapping level. Will
|
||||
/// assign x,y and z hardware ids for the first 3 dimensions and use
|
||||
/// sequential after.
|
||||
static int64_t getHardwareIdForMapping(MappingLevel level, int dimension) {
|
||||
/// TODO(ravishankarm/herhut) : Make this use x for the inner-most loop that is
|
||||
/// distributed to map to x, the next innermost to y and the next innermost to
|
||||
/// z.
|
||||
static gpu::Processor getHardwareIdForMapping(MappingLevel level,
|
||||
int dimension) {
|
||||
|
||||
if (dimension >= kNumHardwareIds || level == Sequential)
|
||||
return Sequential * kNumHardwareIds;
|
||||
return (level * kNumHardwareIds) + dimension;
|
||||
return Processor::Sequential;
|
||||
switch (level) {
|
||||
case MapGrid:
|
||||
switch (dimension) {
|
||||
case 0:
|
||||
return Processor::BlockX;
|
||||
case 1:
|
||||
return Processor::BlockY;
|
||||
case 2:
|
||||
return Processor::BlockZ;
|
||||
default:
|
||||
return Processor::Sequential;
|
||||
}
|
||||
break;
|
||||
case MapBlock:
|
||||
switch (dimension) {
|
||||
case 0:
|
||||
return Processor::ThreadX;
|
||||
case 1:
|
||||
return Processor::ThreadY;
|
||||
case 2:
|
||||
return Processor::ThreadZ;
|
||||
default:
|
||||
return Processor::Sequential;
|
||||
}
|
||||
default:;
|
||||
}
|
||||
return Processor::Sequential;
|
||||
}
|
||||
|
||||
/// Add mapping information to the given parallel loop. Do not add
|
||||
|
@ -55,26 +123,20 @@ static int64_t getHardwareIdForMapping(MappingLevel level, int dimension) {
|
|||
static void mapParallelOp(ParallelOp parallelOp,
|
||||
MappingLevel mappingLevel = MapGrid) {
|
||||
// Do not try to add a mapping to already mapped loops or nested loops.
|
||||
if (parallelOp.getAttr(gpu::kMappingAttributeName) ||
|
||||
if (parallelOp.getAttr(getMappingAttrName()) ||
|
||||
((mappingLevel == MapGrid) && parallelOp.getParentOfType<ParallelOp>()))
|
||||
return;
|
||||
|
||||
MLIRContext *ctx = parallelOp.getContext();
|
||||
Builder b(ctx);
|
||||
SmallVector<Attribute, 4> attrs;
|
||||
SmallVector<ParallelLoopDimMapping, 4> attrs;
|
||||
attrs.reserve(parallelOp.getNumInductionVars());
|
||||
for (int i = 0, e = parallelOp.getNumInductionVars(); i < e; ++i) {
|
||||
SmallVector<NamedAttribute, 3> entries;
|
||||
entries.emplace_back(b.getNamedAttr(
|
||||
kProcessorEntryName,
|
||||
b.getI64IntegerAttr(getHardwareIdForMapping(mappingLevel, i))));
|
||||
entries.emplace_back(b.getNamedAttr(
|
||||
kIndexMapEntryName, AffineMapAttr::get(b.getDimIdentityMap())));
|
||||
entries.emplace_back(b.getNamedAttr(
|
||||
kBoundMapEntryName, AffineMapAttr::get(b.getDimIdentityMap())));
|
||||
attrs.push_back(DictionaryAttr::get(entries, ctx));
|
||||
attrs.push_back(getParallelLoopDimMappingAttr(
|
||||
getHardwareIdForMapping(mappingLevel, i), b.getDimIdentityMap(),
|
||||
b.getDimIdentityMap()));
|
||||
}
|
||||
parallelOp.setAttr(kMappingAttributeName, ArrayAttr::get(attrs, ctx));
|
||||
setMappingAttr(parallelOp, attrs);
|
||||
++mappingLevel;
|
||||
// Parallel loop operations are immediately nested, so do not use
|
||||
// walk but just iterate over the operations.
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
// 2-d parallel loop mapped to block.y and block.x
|
||||
|
||||
func @parallel_loop_bidy_bidx(%arg0 : index, %arg1 : index, %arg2 : index,
|
||||
%arg3 : index, %arg4 : index,
|
||||
%arg3 : index, %arg4 : index,
|
||||
%buf : memref<?x?xf32>,
|
||||
%res : memref<?x?xf32>) {
|
||||
%step = constant 2 : index
|
||||
|
@ -334,7 +334,7 @@ func @parallel_loop_loop_variant_bound(%arg0 : index, %arg1 : index, %arg2 : ind
|
|||
// expected-error@+1 {{failed to legalize operation 'loop.parallel'}}
|
||||
loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
|
||||
step (%four, %four) {
|
||||
// expected-error@+1 {{cannot derive loop-invariant upper bound}}
|
||||
// expected-error@+1 {{cannot derive loop-invariant upper bound}}
|
||||
loop.parallel (%si0, %si1) = (%zero, %zero) to (%i0, %i1)
|
||||
step (%one, %one) {
|
||||
%idx0 = addi %i0, %si0 : index
|
||||
|
|
Loading…
Reference in New Issue