[MLIR][GPU] Add error checking to loop.parallel to gpu transform.

Summary:
Instead of crashing on malformed input, the pass now produces error
messages.

Differential Revision: https://reviews.llvm.org/D75468
This commit is contained in:
Stephan Herhut 2020-03-02 18:05:42 +01:00
parent 292ab49d43
commit 10ec1860a8
2 changed files with 83 additions and 13 deletions

View File

@ -572,6 +572,7 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
gpu::LaunchOp launchOp,
BlockAndValueMapping &cloningMap,
SmallVectorImpl<Operation *> &worklist,
DenseMap<int, Value> &bounds,
PatternRewriter &rewriter) {
// TODO(herhut): Verify that this is a valid GPU mapping.
// processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential
@ -631,22 +632,27 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
// conditional. If the lower-bound is constant or defined before the
// launch, we can use it in the launch bounds. Otherwise fail.
if (!launchIndependent(lowerBound) &&
!isa<ConstantOp>(lowerBound.getDefiningOp()))
!isa_and_nonnull<ConstantOp>(lowerBound.getDefiningOp()))
return failure();
// The step must also be constant or defined outside of the loop nest.
if (!launchIndependent(step) && !isa<ConstantOp>(step.getDefiningOp()))
if (!launchIndependent(step) &&
!isa_and_nonnull<ConstantOp>(step.getDefiningOp()))
return failure();
// If the upper-bound is constant or defined before the launch, we can
// use it in the launch bounds directly. Otherwise try derive a bound.
bool boundIsPrecise = launchIndependent(upperBound) ||
isa<ConstantOp>(upperBound.getDefiningOp());
bool boundIsPrecise =
launchIndependent(upperBound) ||
isa_and_nonnull<ConstantOp>(upperBound.getDefiningOp());
{
PatternRewriter::InsertionGuard guard(rewriter);
rewriter.setInsertionPoint(launchOp);
if (!boundIsPrecise) {
upperBound = deriveStaticUpperBound(upperBound, rewriter);
if (!upperBound)
return failure();
if (!upperBound) {
return parallelOp.emitOpError()
<< "cannot derive loop-invariant upper bound for number "
"of iterations";
}
}
// Compute the number of iterations needed. We compute this as an
// affine expression ceilDiv (upperBound - lowerBound) step. We use
@ -654,8 +660,8 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
AffineMap stepMap =
AffineMap::get(0, 3,
((rewriter.getAffineSymbolExpr(0) -
rewriter.getAffineSymbolExpr(1)).ceilDiv(
rewriter.getAffineSymbolExpr(2))));
rewriter.getAffineSymbolExpr(1))
.ceilDiv(rewriter.getAffineSymbolExpr(2))));
Value launchBound = rewriter.create<AffineApplyOp>(
loc, annotation.boundMap.compose(stepMap),
ValueRange{
@ -664,7 +670,12 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
ensureLaunchIndependent(
cloningMap.lookupOrDefault(lowerBound)),
ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});
launchOp.setOperand(annotation.processor, launchBound);
if (bounds.find(annotation.processor) != bounds.end()) {
return parallelOp.emitOpError()
<< "cannot redefine the bound for processor "
<< annotation.processor;
}
bounds[annotation.processor] = launchBound;
}
if (!boundIsPrecise) {
// We are using an approximation, create a surrounding conditional.
@ -746,9 +757,10 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
rewriter.setInsertionPointToStart(&launchOp.body().front());
BlockAndValueMapping cloningMap;
llvm::DenseMap<int, Value> launchBounds;
SmallVector<Operation *, 16> worklist;
if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
rewriter)))
launchBounds, rewriter)))
return matchFailure();
// Whether we have seen any side-effects. Reset when leaving an inner scope.
@ -770,8 +782,9 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
// A nested loop.parallel needs insertion of code to compute indices.
// Insert that now. This will also update the worklist with the loops
// body.
processParallelLoop(nestedParallel, launchOp, cloningMap, worklist,
rewriter);
if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap,
worklist, launchBounds, rewriter)))
return matchFailure();
} else if (op == launchOp.getOperation()) {
// Found our sentinel value. We have finished the operations from one
// nesting level, pop one level back up.
@ -791,6 +804,11 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
}
}
// Now that we succeeded creating the launch operation, also update the
// bounds.
for (auto bound : launchBounds)
launchOp.setOperand(std::get<0>(bound), std::get<1>(bound));
rewriter.eraseOp(parallelOp);
return matchSuccess();
}

View File

@ -1,4 +1,4 @@
// RUN: mlir-opt -convert-parallel-loops-to-gpu -split-input-file %s | FileCheck %s -dump-input-on-failure
// RUN: mlir-opt -convert-parallel-loops-to-gpu -split-input-file -verify-diagnostics %s | FileCheck %s -dump-input-on-failure
// 2-d parallel loop mapped to block.y and block.x
@ -299,3 +299,55 @@ module {
// CHECK: return
// CHECK: }
// CHECK: }
// -----
// Mapping to the same processor twice.
func @parallel_double_map(%arg0 : index, %arg1 : index, %arg2 : index,
%arg3 : index,
%buf : memref<?x?xf32>,
%res : memref<?x?xf32>) {
%four = constant 4 : index
// expected-error@+2 {{cannot redefine the bound for processor 1}}
// expected-error@+1 {{failed to legalize operation 'loop.parallel'}}
loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
step (%four, %four) {
} { mapping = [
{processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
{processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
] }
return
}
// -----
// Loop with loop-variant upper bound.
func @parallel_loop_loop_variant_bound(%arg0 : index, %arg1 : index, %arg2 : index,
%arg3 : index,
%buf : memref<?x?xf32>,
%res : memref<?x?xf32>) {
%zero = constant 0 : index
%one = constant 1 : index
%four = constant 4 : index
// expected-error@+1 {{failed to legalize operation 'loop.parallel'}}
loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
step (%four, %four) {
// expected-error@+1 {{cannot derive loop-invariant upper bound}}
loop.parallel (%si0, %si1) = (%zero, %zero) to (%i0, %i1)
step (%one, %one) {
%idx0 = addi %i0, %si0 : index
%idx1 = addi %i1, %si1 : index
%val = load %buf[%idx0, %idx1] : memref<?x?xf32>
store %val, %res[%idx1, %idx0] : memref<?x?xf32>
} { mapping = [
{processor = 4, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
{processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
] }
} { mapping = [
{processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
{processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
] }
return
}