forked from OSchip/llvm-project
[MLIR][GPU] Add error checking to loop.parallel to gpu transform.
Summary: Instead of crashing on malformed input, the pass now produces error messages. Differential Revision: https://reviews.llvm.org/D75468
This commit is contained in:
parent
292ab49d43
commit
10ec1860a8
|
@ -572,6 +572,7 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
|
|||
gpu::LaunchOp launchOp,
|
||||
BlockAndValueMapping &cloningMap,
|
||||
SmallVectorImpl<Operation *> &worklist,
|
||||
DenseMap<int, Value> &bounds,
|
||||
PatternRewriter &rewriter) {
|
||||
// TODO(herhut): Verify that this is a valid GPU mapping.
|
||||
// processor ids: 0-2 -> block [x/y/z], 3-5 -> thread [x/y/z], 6 -> sequential
|
||||
|
@ -631,22 +632,27 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
|
|||
// conditional. If the lower-bound is constant or defined before the
|
||||
// launch, we can use it in the launch bounds. Otherwise fail.
|
||||
if (!launchIndependent(lowerBound) &&
|
||||
!isa<ConstantOp>(lowerBound.getDefiningOp()))
|
||||
!isa_and_nonnull<ConstantOp>(lowerBound.getDefiningOp()))
|
||||
return failure();
|
||||
// The step must also be constant or defined outside of the loop nest.
|
||||
if (!launchIndependent(step) && !isa<ConstantOp>(step.getDefiningOp()))
|
||||
if (!launchIndependent(step) &&
|
||||
!isa_and_nonnull<ConstantOp>(step.getDefiningOp()))
|
||||
return failure();
|
||||
// If the upper-bound is constant or defined before the launch, we can
|
||||
// use it in the launch bounds directly. Otherwise try to derive a bound.
|
||||
bool boundIsPrecise = launchIndependent(upperBound) ||
|
||||
isa<ConstantOp>(upperBound.getDefiningOp());
|
||||
bool boundIsPrecise =
|
||||
launchIndependent(upperBound) ||
|
||||
isa_and_nonnull<ConstantOp>(upperBound.getDefiningOp());
|
||||
{
|
||||
PatternRewriter::InsertionGuard guard(rewriter);
|
||||
rewriter.setInsertionPoint(launchOp);
|
||||
if (!boundIsPrecise) {
|
||||
upperBound = deriveStaticUpperBound(upperBound, rewriter);
|
||||
if (!upperBound)
|
||||
return failure();
|
||||
if (!upperBound) {
|
||||
return parallelOp.emitOpError()
|
||||
<< "cannot derive loop-invariant upper bound for number "
|
||||
"of iterations";
|
||||
}
|
||||
}
|
||||
// Compute the number of iterations needed. We compute this as an
|
||||
// affine expression ceilDiv (upperBound - lowerBound) step. We use
|
||||
|
@ -654,8 +660,8 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
|
|||
AffineMap stepMap =
|
||||
AffineMap::get(0, 3,
|
||||
((rewriter.getAffineSymbolExpr(0) -
|
||||
rewriter.getAffineSymbolExpr(1)).ceilDiv(
|
||||
rewriter.getAffineSymbolExpr(2))));
|
||||
rewriter.getAffineSymbolExpr(1))
|
||||
.ceilDiv(rewriter.getAffineSymbolExpr(2))));
|
||||
Value launchBound = rewriter.create<AffineApplyOp>(
|
||||
loc, annotation.boundMap.compose(stepMap),
|
||||
ValueRange{
|
||||
|
@ -664,7 +670,12 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
|
|||
ensureLaunchIndependent(
|
||||
cloningMap.lookupOrDefault(lowerBound)),
|
||||
ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});
|
||||
launchOp.setOperand(annotation.processor, launchBound);
|
||||
if (bounds.find(annotation.processor) != bounds.end()) {
|
||||
return parallelOp.emitOpError()
|
||||
<< "cannot redefine the bound for processor "
|
||||
<< annotation.processor;
|
||||
}
|
||||
bounds[annotation.processor] = launchBound;
|
||||
}
|
||||
if (!boundIsPrecise) {
|
||||
// We are using an approximation, create a surrounding conditional.
|
||||
|
@ -746,9 +757,10 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
|
|||
rewriter.setInsertionPointToStart(&launchOp.body().front());
|
||||
|
||||
BlockAndValueMapping cloningMap;
|
||||
llvm::DenseMap<int, Value> launchBounds;
|
||||
SmallVector<Operation *, 16> worklist;
|
||||
if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
|
||||
rewriter)))
|
||||
launchBounds, rewriter)))
|
||||
return matchFailure();
|
||||
|
||||
// Whether we have seen any side-effects. Reset when leaving an inner scope.
|
||||
|
@ -770,8 +782,9 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
|
|||
// A nested loop.parallel needs insertion of code to compute indices.
|
||||
// Insert that now. This will also update the worklist with the loop's
|
||||
// body.
|
||||
processParallelLoop(nestedParallel, launchOp, cloningMap, worklist,
|
||||
rewriter);
|
||||
if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap,
|
||||
worklist, launchBounds, rewriter)))
|
||||
return matchFailure();
|
||||
} else if (op == launchOp.getOperation()) {
|
||||
// Found our sentinel value. We have finished the operations from one
|
||||
// nesting level, pop one level back up.
|
||||
|
@ -791,6 +804,11 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
|
|||
}
|
||||
}
|
||||
|
||||
// Now that we succeeded creating the launch operation, also update the
|
||||
// bounds.
|
||||
for (auto bound : launchBounds)
|
||||
launchOp.setOperand(std::get<0>(bound), std::get<1>(bound));
|
||||
|
||||
rewriter.eraseOp(parallelOp);
|
||||
return matchSuccess();
|
||||
}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
// RUN: mlir-opt -convert-parallel-loops-to-gpu -split-input-file %s | FileCheck %s -dump-input-on-failure
|
||||
// RUN: mlir-opt -convert-parallel-loops-to-gpu -split-input-file -verify-diagnostics %s | FileCheck %s -dump-input-on-failure
|
||||
|
||||
// 2-d parallel loop mapped to block.y and block.x
|
||||
|
||||
|
@ -299,3 +299,55 @@ module {
|
|||
// CHECK: return
|
||||
// CHECK: }
|
||||
// CHECK: }
|
||||
|
||||
// -----
|
||||
|
||||
// Mapping to the same processor twice.
|
||||
|
||||
func @parallel_double_map(%arg0 : index, %arg1 : index, %arg2 : index,
|
||||
%arg3 : index,
|
||||
%buf : memref<?x?xf32>,
|
||||
%res : memref<?x?xf32>) {
|
||||
%four = constant 4 : index
|
||||
// expected-error@+2 {{cannot redefine the bound for processor 1}}
|
||||
// expected-error@+1 {{failed to legalize operation 'loop.parallel'}}
|
||||
loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
|
||||
step (%four, %four) {
|
||||
} { mapping = [
|
||||
{processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
|
||||
{processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
|
||||
] }
|
||||
return
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// Loop with loop-variant upper bound.
|
||||
|
||||
func @parallel_loop_loop_variant_bound(%arg0 : index, %arg1 : index, %arg2 : index,
|
||||
%arg3 : index,
|
||||
%buf : memref<?x?xf32>,
|
||||
%res : memref<?x?xf32>) {
|
||||
%zero = constant 0 : index
|
||||
%one = constant 1 : index
|
||||
%four = constant 4 : index
|
||||
// expected-error@+1 {{failed to legalize operation 'loop.parallel'}}
|
||||
loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
|
||||
step (%four, %four) {
|
||||
// expected-error@+1 {{cannot derive loop-invariant upper bound}}
|
||||
loop.parallel (%si0, %si1) = (%zero, %zero) to (%i0, %i1)
|
||||
step (%one, %one) {
|
||||
%idx0 = addi %i0, %si0 : index
|
||||
%idx1 = addi %i1, %si1 : index
|
||||
%val = load %buf[%idx0, %idx1] : memref<?x?xf32>
|
||||
store %val, %res[%idx1, %idx0] : memref<?x?xf32>
|
||||
} { mapping = [
|
||||
{processor = 4, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
|
||||
{processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
|
||||
] }
|
||||
} { mapping = [
|
||||
{processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
|
||||
{processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
|
||||
] }
|
||||
return
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue