From 51d43bbc4662202d7f694c43b968fb289a56a355 Mon Sep 17 00:00:00 2001
From: Prashant Kumar
Date: Thu, 17 Jun 2021 01:15:35 +0530
Subject: [PATCH] [MLIR] Fix affine parallelize pass.

To control the number of outer parallel loops, we need to process the
outer loops first, and hence a pre-order walk fixes the issue.

Reviewed By: bondhugula

Differential Revision: https://reviews.llvm.org/D104361
---
 .../Affine/Transforms/AffineParallelize.cpp | 11 +++++------
 mlir/test/Dialect/Affine/parallelize.mlir   | 21 +++++++++++++++++++
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp
index 62519908a248..237094d40006 100644
--- a/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp
@@ -50,14 +50,13 @@ struct ParallelizationCandidate {
 void AffineParallelize::runOnFunction() {
   FuncOp f = getFunction();
 
-  // The walker proceeds in post-order, but we need to process outer loops first
-  // to control the number of outer parallel loops, so push candidate loops to
-  // the front of a deque.
-  std::deque<ParallelizationCandidate> parallelizableLoops;
-  f.walk([&](AffineForOp loop) {
+  // The walker proceeds in pre-order to process the outer loops first
+  // and control the number of outer parallel loops.
+  std::vector<ParallelizationCandidate> parallelizableLoops;
+  f.walk<WalkOrder::PreOrder>([&](AffineForOp loop) {
     SmallVector<LoopReduction> reductions;
     if (isLoopParallel(loop, parallelReductions ? &reductions : nullptr))
-      parallelizableLoops.emplace_back(loop, std::move(reductions));
+      parallelizableLoops.push_back({loop, std::move(reductions)});
   });
 
   for (const ParallelizationCandidate &candidate : parallelizableLoops) {
diff --git a/mlir/test/Dialect/Affine/parallelize.mlir b/mlir/test/Dialect/Affine/parallelize.mlir
index bb98e654a80d..dc5c435c1e9c 100644
--- a/mlir/test/Dialect/Affine/parallelize.mlir
+++ b/mlir/test/Dialect/Affine/parallelize.mlir
@@ -155,6 +155,27 @@ func @max_nested(%m: memref<?x?xf32>, %lb0: index, %lb1: index,
   return
 }
 
+// MAX-NESTED-LABEL: @max_nested_1
+func @max_nested_1(%arg0: memref<4096x4096xf32>, %arg1: memref<4096x4096xf32>, %arg2: memref<4096x4096xf32>) {
+  %0 = memref.alloc() : memref<4096x4096xf32>
+  // MAX-NESTED: affine.parallel
+  affine.for %arg3 = 0 to 4096 {
+    // MAX-NESTED-NEXT: affine.for
+    affine.for %arg4 = 0 to 4096 {
+      // MAX-NESTED-NEXT: affine.for
+      affine.for %arg5 = 0 to 4096 {
+        %1 = affine.load %arg0[%arg3, %arg5] : memref<4096x4096xf32>
+        %2 = affine.load %arg1[%arg5, %arg4] : memref<4096x4096xf32>
+        %3 = affine.load %0[%arg3, %arg4] : memref<4096x4096xf32>
+        %4 = mulf %1, %2 : f32
+        %5 = addf %3, %4 : f32
+        affine.store %5, %0[%arg3, %arg4] : memref<4096x4096xf32>
+      }
+    }
+  }
+  return
+}
+
 // CHECK-LABEL: @iter_args
 // REDUCE-LABEL: @iter_args
 func @iter_args(%in: memref<10xf32>) {
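
For reference, the following is a sketch of the IR shape the MAX-NESTED check
lines in the new @max_nested_1 test pin down, assuming the MAX-NESTED prefix
corresponds to a RUN line that invokes -affine-parallelize with a max-nested=1
option (the RUN lines themselves are outside this hunk). With the pre-order
walk, the outermost candidate is processed before its children, so only the
outermost loop is rewritten to affine.parallel and the two inner loops remain
affine.for:

// Illustrative post-pass form of @max_nested_1 under max-nested=1
// (a sketch, not literal pass output):
func @max_nested_1(%arg0: memref<4096x4096xf32>, %arg1: memref<4096x4096xf32>, %arg2: memref<4096x4096xf32>) {
  %0 = memref.alloc() : memref<4096x4096xf32>
  // Only the outermost band becomes parallel.
  affine.parallel (%arg3) = (0) to (4096) {
    affine.for %arg4 = 0 to 4096 {
      affine.for %arg5 = 0 to 4096 {
        %1 = affine.load %arg0[%arg3, %arg5] : memref<4096x4096xf32>
        %2 = affine.load %arg1[%arg5, %arg4] : memref<4096x4096xf32>
        %3 = affine.load %0[%arg3, %arg4] : memref<4096x4096xf32>
        %4 = mulf %1, %2 : f32
        %5 = addf %3, %4 : f32
        affine.store %5, %0[%arg3, %arg4] : memref<4096x4096xf32>
      }
    }
  }
  return
}

Before this change, the candidates were collected in post-order, so the inner
loops were parallelized first and had no affine.parallel ancestor at the time
they were checked, which defeated the max-nested limit; the MAX-NESTED-NEXT:
affine.for checks above only pass if the middle and innermost loops are left
untouched.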