[MLIR][GPU] Add error checking to loop.parallel to gpu transform.

Summary: Instead of crashing on malformed input, the pass now produces error messages. Differential Revision: https://reviews.llvm.org/D75468
2020-03-02 18:05:42 +01:00 · 2020-03-02 18:05:42 +01:00 · 10ec1860a8
parent 292ab49d43
commit 10ec1860a8
2 changed files with 83 additions and 13 deletions
--- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
+++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
@ -572,6 +572,7 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
                                         gpu::LaunchOp launchOp,
                                         BlockAndValueMapping &cloningMap,
                                         SmallVectorImpl<Operation *> &worklist,
+                                         DenseMap<int, Value> &bounds,
                                         PatternRewriter &rewriter) {
  // TODO(herhut): Verify that this is a valid GPU mapping.
  // processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential
@ -631,22 +632,27 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
        // conditional. If the lower-bound is constant or defined before the
        // launch, we can use it in the launch bounds. Otherwise fail.
        if (!launchIndependent(lowerBound) &&
-            !isa<ConstantOp>(lowerBound.getDefiningOp()))
+            !isa_and_nonnull<ConstantOp>(lowerBound.getDefiningOp()))
          return failure();
        // The step must also be constant or defined outside of the loop nest.
-        if (!launchIndependent(step) && !isa<ConstantOp>(step.getDefiningOp()))
+        if (!launchIndependent(step) &&
+            !isa_and_nonnull<ConstantOp>(step.getDefiningOp()))
          return failure();
        // If the upper-bound is constant or defined before the launch, we can
        // use it in the launch bounds directly. Otherwise try derive a bound.
-        bool boundIsPrecise = launchIndependent(upperBound) ||
-                              isa<ConstantOp>(upperBound.getDefiningOp());
+        bool boundIsPrecise =
+            launchIndependent(upperBound) ||
+            isa_and_nonnull<ConstantOp>(upperBound.getDefiningOp());
        {
          PatternRewriter::InsertionGuard guard(rewriter);
          rewriter.setInsertionPoint(launchOp);
          if (!boundIsPrecise) {
            upperBound = deriveStaticUpperBound(upperBound, rewriter);
-            if (!upperBound)
-              return failure();
+            if (!upperBound) {
+              return parallelOp.emitOpError()
+                     << "cannot derive loop-invariant upper bound for number "
+                        "of iterations";
+            }
          }
          // Compute the number of iterations needed. We compute this as an
          // affine expression ceilDiv (upperBound - lowerBound) step. We use
@ -654,8 +660,8 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
          AffineMap stepMap =
              AffineMap::get(0, 3,
                             ((rewriter.getAffineSymbolExpr(0) -
-                              rewriter.getAffineSymbolExpr(1)).ceilDiv(
-                                  rewriter.getAffineSymbolExpr(2))));
+                               rewriter.getAffineSymbolExpr(1))
+                                  .ceilDiv(rewriter.getAffineSymbolExpr(2))));
          Value launchBound = rewriter.create<AffineApplyOp>(
              loc, annotation.boundMap.compose(stepMap),
              ValueRange{
@ -664,7 +670,12 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp,
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(lowerBound)),
                  ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});
-          launchOp.setOperand(annotation.processor, launchBound);
+          if (bounds.find(annotation.processor) != bounds.end()) {
+            return parallelOp.emitOpError()
+                   << "cannot redefine the bound for processor "
+                   << annotation.processor;
+          }
+          bounds[annotation.processor] = launchBound;
        }
        if (!boundIsPrecise) {
          // We are using an approximation, create a surrounding conditional.
@ -746,9 +757,10 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
  rewriter.setInsertionPointToStart(&launchOp.body().front());

  BlockAndValueMapping cloningMap;
+  llvm::DenseMap<int, Value> launchBounds;
  SmallVector<Operation *, 16> worklist;
  if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
-                                 rewriter)))
+                                 launchBounds, rewriter)))
    return matchFailure();

  // Whether we have seen any side-effects. Reset when leaving an inner scope.
@ -770,8 +782,9 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
      // A nested loop.parallel needs insertion of code to compute indices.
      // Insert that now. This will also update the worklist with the loops
      // body.
-      processParallelLoop(nestedParallel, launchOp, cloningMap, worklist,
-                          rewriter);
+      if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap,
+                                     worklist, launchBounds, rewriter)))
+        return matchFailure();
    } else if (op == launchOp.getOperation()) {
      // Found our sentinel value. We have finished the operations from one
      // nesting level, pop one level back up.
@ -791,6 +804,11 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
    }
  }

+  // Now that we succeeded creating the launch operation, also update the
+  // bounds.
+  for (auto bound : launchBounds)
+    launchOp.setOperand(std::get<0>(bound), std::get<1>(bound));
+
  rewriter.eraseOp(parallelOp);
  return matchSuccess();
 }
--- a/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir
+++ b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir
@ -1,4 +1,4 @@
-// RUN: mlir-opt -convert-parallel-loops-to-gpu -split-input-file %s | FileCheck %s -dump-input-on-failure
+// RUN: mlir-opt -convert-parallel-loops-to-gpu -split-input-file -verify-diagnostics %s | FileCheck %s -dump-input-on-failure

 // 2-d parallel loop mapped to block.y and block.x

@ -299,3 +299,55 @@ module {
 // CHECK:           return
 // CHECK:         }
 // CHECK:       }
+
+// -----
+
+// Mapping to the same processor twice.
+
+func @parallel_double_map(%arg0 : index, %arg1 : index, %arg2 : index,
+                          %arg3 : index,
+                          %buf : memref<?x?xf32>,
+                          %res : memref<?x?xf32>) {
+  %four = constant 4 : index
+  // expected-error@+2 {{cannot redefine the bound for processor 1}}
+  // expected-error@+1 {{failed to legalize operation 'loop.parallel'}}
+  loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
+                                          step (%four, %four)  {
+  } { mapping = [
+      {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
+      {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
+    ] }
+  return
+}
+
+// -----
+
+// Loop with loop-variant upper bound.
+
+func @parallel_loop_loop_variant_bound(%arg0 : index, %arg1 : index, %arg2 : index,
+                                       %arg3 : index,
+                                       %buf : memref<?x?xf32>,
+                                       %res : memref<?x?xf32>) {
+  %zero = constant 0 : index
+  %one = constant 1 : index
+  %four = constant 4 : index
+  // expected-error@+1 {{failed to legalize operation 'loop.parallel'}}
+  loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
+                                          step (%four, %four)  {
+    // expected-error@+1 {{cannot derive loop-invariant upper bound}}                                        
+    loop.parallel (%si0, %si1) = (%zero, %zero) to (%i0, %i1)
+                                            step (%one, %one)  {
+      %idx0 = addi %i0, %si0 : index
+      %idx1 = addi %i1, %si1 : index
+      %val = load %buf[%idx0, %idx1] : memref<?x?xf32>
+      store %val, %res[%idx1, %idx0] : memref<?x?xf32>
+    } { mapping = [
+        {processor = 4, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
+        {processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
+      ] }
+  } { mapping = [
+      {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
+      {processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
+    ] }
+  return
+}