[MLIR] Fix affine parallelize pass.

To control the number of outer parallel loops, the outer loops must be processed first, so the pass now walks the function in pre-order, which fixes the issue.

Reviewed By: bondhugula

Differential Revision: https://reviews.llvm.org/D104361
2021-06-17 01:15:35 +05:30 · 2021-06-17 01:15:35 +05:30 · 51d43bbc46
parent 0e760a0870
commit 51d43bbc46
2 changed files with 26 additions and 6 deletions
--- a/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp
@ -50,14 +50,13 @@ struct ParallelizationCandidate {
 void AffineParallelize::runOnFunction() {
  FuncOp f = getFunction();

-  // The walker proceeds in post-order, but we need to process outer loops first
-  // to control the number of outer parallel loops, so push candidate loops to
-  // the front of a deque.
-  std::deque<ParallelizationCandidate> parallelizableLoops;
-  f.walk([&](AffineForOp loop) {
+  // The walker proceeds in pre-order to process the outer loops first
+  // and control the number of outer parallel loops.
+  std::vector<ParallelizationCandidate> parallelizableLoops;
+  f.walk<WalkOrder::PreOrder>([&](AffineForOp loop) {
    SmallVector<LoopReduction> reductions;
    if (isLoopParallel(loop, parallelReductions ? &reductions : nullptr))
-      parallelizableLoops.emplace_back(loop, std::move(reductions));
+      parallelizableLoops.push_back({loop, std::move(reductions)});
  });

  for (const ParallelizationCandidate &candidate : parallelizableLoops) {
--- a/mlir/test/Dialect/Affine/parallelize.mlir
+++ b/mlir/test/Dialect/Affine/parallelize.mlir
@ -155,6 +155,27 @@ func @max_nested(%m: memref<?x?xf32>, %lb0: index, %lb1: index,
  return
 }

+// MAX-NESTED-LABEL: @max_nested_1
+func @max_nested_1(%arg0: memref<4096x4096xf32>, %arg1: memref<4096x4096xf32>, %arg2: memref<4096x4096xf32>) {
+  %0 = memref.alloc() : memref<4096x4096xf32>
+  // MAX-NESTED: affine.parallel
+  affine.for %arg3 = 0 to 4096 {
+    // MAX-NESTED-NEXT: affine.for
+    affine.for %arg4 = 0 to 4096 {
+      // MAX-NESTED-NEXT: affine.for
+      affine.for %arg5 = 0 to 4096 {
+        %1 = affine.load %arg0[%arg3, %arg5] : memref<4096x4096xf32>
+        %2 = affine.load %arg1[%arg5, %arg4] : memref<4096x4096xf32>
+        %3 = affine.load %0[%arg3, %arg4] : memref<4096x4096xf32>
+        %4 = mulf %1, %2 : f32
+        %5 = addf %3, %4 : f32
+        affine.store %5, %0[%arg3, %arg4] : memref<4096x4096xf32>
+      }
+    }
+  }
+  return
+}
+
 // CHECK-LABEL: @iter_args
 // REDUCE-LABEL: @iter_args
 func @iter_args(%in: memref<10xf32>) {