[Support] Simplify parallelForEach{,N}

* Merge parallel_for_each into parallelForEach (this removes 1 `Fn(...)` call)
* Change parallelForEach to use parallelForEachN
* Move parallelForEachN into Parallel.cpp

My x86-64 `lld` executable is 100KiB smaller.
No noticeable difference in performance.

Reviewed By: lattner

Differential Revision: https://reviews.llvm.org/D117510
2022-01-23 10:35:44 -08:00 · 2022-01-23 10:35:44 -08:00 · 8e382ae91b
parent 818cfb10c5
commit 8e382ae91b
2 changed files with 35 additions and 77 deletions
--- a/llvm/include/llvm/Support/Parallel.h
+++ b/llvm/include/llvm/Support/Parallel.h
@@ -130,64 +130,6 @@ void parallel_sort(RandomAccessIterator Start, RandomAccessIterator End,
 // improving to take the number of available cores into account.)
 enum { MaxTasksPerGroup = 1024 };

-template <class IterTy, class FuncTy>
-void parallel_for_each(IterTy Begin, IterTy End, FuncTy Fn) {
-  // If we have zero or one items, then do not incur the overhead of spinning up
-  // a task group.  They are surprisingly expensive, and because they do not
-  // support nested parallelism, a single entry task group can block parallel
-  // execution underneath them.
-  auto NumItems = std::distance(Begin, End);
-  if (NumItems <= 1) {
-    if (NumItems)
-      Fn(*Begin);
-    return;
-  }
-
-  // Limit the number of tasks to MaxTasksPerGroup to limit job scheduling
-  // overhead on large inputs.
-  ptrdiff_t TaskSize = NumItems / MaxTasksPerGroup;
-  if (TaskSize == 0)
-    TaskSize = 1;
-
-  TaskGroup TG;
-  while (TaskSize < std::distance(Begin, End)) {
-    TG.spawn([=, &Fn] { std::for_each(Begin, Begin + TaskSize, Fn); });
-    Begin += TaskSize;
-  }
-  std::for_each(Begin, End, Fn);
-}
-
-template <class IndexTy, class FuncTy>
-void parallel_for_each_n(IndexTy Begin, IndexTy End, FuncTy Fn) {
-  // If we have zero or one items, then do not incur the overhead of spinning up
-  // a task group.  They are surprisingly expensive, and because they do not
-  // support nested parallelism, a single entry task group can block parallel
-  // execution underneath them.
-  auto NumItems = End - Begin;
-  if (NumItems <= 1) {
-    if (NumItems)
-      Fn(Begin);
-    return;
-  }
-
-  // Limit the number of tasks to MaxTasksPerGroup to limit job scheduling
-  // overhead on large inputs.
-  ptrdiff_t TaskSize = NumItems / MaxTasksPerGroup;
-  if (TaskSize == 0)
-    TaskSize = 1;
-
-  TaskGroup TG;
-  IndexTy I = Begin;
-  for (; I + TaskSize < End; I += TaskSize) {
-    TG.spawn([=, &Fn] {
-      for (IndexTy J = I, E = I + TaskSize; J != E; ++J)
-        Fn(J);
-    });
-  }
-  for (IndexTy J = I; J < End; ++J)
-    Fn(J);
-}
-
 template <class IterTy, class ResultTy, class ReduceFuncTy,
          class TransformFuncTy>
 ResultTy parallel_transform_reduce(IterTy Begin, IterTy End, ResultTy Init,
@@ -251,27 +193,11 @@ void parallelSort(RandomAccessIterator Start, RandomAccessIterator End,
  llvm::sort(Start, End, Comp);
 }

+void parallelForEachN(size_t Begin, size_t End, function_ref<void(size_t)> Fn);
+
 template <class IterTy, class FuncTy>
 void parallelForEach(IterTy Begin, IterTy End, FuncTy Fn) {
-#if LLVM_ENABLE_THREADS
-  if (parallel::strategy.ThreadsRequested != 1) {
-    parallel::detail::parallel_for_each(Begin, End, Fn);
-    return;
-  }
-#endif
-  std::for_each(Begin, End, Fn);
-}
-
-template <class FuncTy>
-void parallelForEachN(size_t Begin, size_t End, FuncTy Fn) {
-#if LLVM_ENABLE_THREADS
-  if (parallel::strategy.ThreadsRequested != 1) {
-    parallel::detail::parallel_for_each_n(Begin, End, Fn);
-    return;
-  }
-#endif
-  for (size_t I = Begin; I != End; ++I)
-    Fn(I);
+  parallelForEachN(0, End - Begin, [&](size_t I) { Fn(Begin[I]); });
 }

 template <class IterTy, class ResultTy, class ReduceFuncTy,
--- a/llvm/lib/Support/Parallel.cpp
+++ b/llvm/lib/Support/Parallel.cpp
@@ -174,3 +174,35 @@ void TaskGroup::spawn(std::function<void()> F) {
 } // namespace parallel
 } // namespace llvm
 #endif // LLVM_ENABLE_THREADS
+
+void llvm::parallelForEachN(size_t Begin, size_t End,
+                            llvm::function_ref<void(size_t)> Fn) {
+  // If we have zero or one items, then do not incur the overhead of spinning up
+  // a task group.  They are surprisingly expensive, and because they do not
+  // support nested parallelism, a single entry task group can block parallel
+  // execution underneath them.
+#if LLVM_ENABLE_THREADS
+  auto NumItems = End - Begin;
+  if (NumItems > 1 && parallel::strategy.ThreadsRequested != 1) {
+    // Limit the number of tasks to MaxTasksPerGroup to limit job scheduling
+    // overhead on large inputs.
+    auto TaskSize = NumItems / parallel::detail::MaxTasksPerGroup;
+    if (TaskSize == 0)
+      TaskSize = 1;
+
+    parallel::detail::TaskGroup TG;
+    for (; Begin + TaskSize < End; Begin += TaskSize) {
+      TG.spawn([=, &Fn] {
+        for (size_t I = Begin, E = Begin + TaskSize; I != E; ++I)
+          Fn(I);
+      });
+    }
+    for (; Begin != End; ++Begin)
+      Fn(Begin);
+    return;
+  }
+#endif
+
+  for (; Begin != End; ++Begin)
+    Fn(Begin);
+}