[Support] Simplify parallelForEach{,N}

* Merge parallel_for_each into parallelForEach (this removes 1 `Fn(...)` call)
* Change parallelForEach to use parallelForEachN
* Move parallelForEachN into Parallel.cpp

My x86-64 `lld` executable is 100KiB smaller.
No noticeable difference in performance.

Reviewed By: lattner

Differential Revision: https://reviews.llvm.org/D117510
This commit is contained in:
Fangrui Song 2022-01-23 10:35:44 -08:00
parent 818cfb10c5
commit 8e382ae91b
2 changed files with 35 additions and 77 deletions

View File

@ -130,64 +130,6 @@ void parallel_sort(RandomAccessIterator Start, RandomAccessIterator End,
// improving to take the number of available cores into account.)
enum { MaxTasksPerGroup = 1024 }; // Divisor used to derive the per-task chunk size (NumItems / MaxTasksPerGroup).
template <class IterTy, class FuncTy>
void parallel_for_each(IterTy Begin, IterTy End, FuncTy Fn) {
  // Task groups are costly to set up and do not nest, so a range with at most
  // one element is handled inline instead of creating one.
  const auto Count = std::distance(Begin, End);
  if (Count == 0)
    return;
  if (Count <= 1) {
    Fn(*Begin);
    return;
  }

  // Cap the number of spawned tasks at roughly MaxTasksPerGroup by giving each
  // task a chunk of Count / MaxTasksPerGroup elements (at least one).
  const ptrdiff_t Quotient = Count / MaxTasksPerGroup;
  const ptrdiff_t ChunkLen = Quotient == 0 ? 1 : Quotient;

  TaskGroup TG;
  // Spawn one task per full chunk for as long as more than one chunk's worth
  // of elements remains. Each task gets its own copy of the chunk start.
  for (; std::distance(Begin, End) > ChunkLen; Begin += ChunkLen)
    TG.spawn([Begin, ChunkLen, &Fn] {
      std::for_each(Begin, Begin + ChunkLen, Fn);
    });

  // The remaining elements are processed on the calling thread while TG is
  // still alive.
  std::for_each(Begin, End, Fn);
}
template <class IndexTy, class FuncTy>
void parallel_for_each_n(IndexTy Begin, IndexTy End, FuncTy Fn) {
  // Task groups are costly to set up and do not nest, so an index range with
  // at most one entry is handled inline instead of creating one.
  const auto Count = End - Begin;
  if (Count <= 1) {
    if (Count != 0)
      Fn(Begin);
    return;
  }

  // Cap the number of spawned tasks at roughly MaxTasksPerGroup by giving each
  // task a chunk of Count / MaxTasksPerGroup indices (at least one).
  const ptrdiff_t Quotient = Count / MaxTasksPerGroup;
  const ptrdiff_t ChunkLen = Quotient == 0 ? 1 : Quotient;

  TaskGroup TG;
  IndexTy Cursor = Begin;
  // Spawn a task for every chunk that ends strictly before End.
  while (Cursor + ChunkLen < End) {
    TG.spawn([Cursor, ChunkLen, &Fn] {
      const IndexTy Stop = Cursor + ChunkLen;
      for (IndexTy Idx = Cursor; Idx != Stop; ++Idx)
        Fn(Idx);
    });
    Cursor += ChunkLen;
  }

  // The remaining indices are processed on the calling thread while TG is
  // still alive.
  for (; Cursor < End; ++Cursor)
    Fn(Cursor);
}
template <class IterTy, class ResultTy, class ReduceFuncTy,
class TransformFuncTy>
ResultTy parallel_transform_reduce(IterTy Begin, IterTy End, ResultTy Init,
@ -251,27 +193,11 @@ void parallelSort(RandomAccessIterator Start, RandomAccessIterator End,
llvm::sort(Start, End, Comp);
}
/// Invokes \p Fn once for every index in the half-open range [Begin, End).
/// This is only a declaration; the function body is not defined here.
void parallelForEachN(size_t Begin, size_t End, function_ref<void(size_t)> Fn);
template <class IterTy, class FuncTy>
void parallelForEach(IterTy Begin, IterTy End, FuncTy Fn) {
  // Applies Fn to every element of [Begin, End). The task-based implementation
  // is used only when threads are compiled in and more than one thread was
  // requested; otherwise this is a plain sequential std::for_each.
#if LLVM_ENABLE_THREADS
  if (parallel::strategy.ThreadsRequested == 1) {
    std::for_each(Begin, End, Fn);
    return;
  }
  parallel::detail::parallel_for_each(Begin, End, Fn);
#else
  std::for_each(Begin, End, Fn);
#endif
}
// NOTE(review): this span comes from a diff view whose +/- markers were
// stripped, so lines from the old and new versions appear interleaved. The
// commit message says parallelForEachN moves into Parallel.cpp, so this
// template overload is presumably the removed version -- confirm against the
// actual header.
template <class FuncTy>
void parallelForEachN(size_t Begin, size_t End, FuncTy Fn) {
#if LLVM_ENABLE_THREADS
// Unless exactly one thread was requested, hand the index range to the
// task-based implementation.
if (parallel::strategy.ThreadsRequested != 1) {
parallel::detail::parallel_for_each_n(Begin, End, Fn);
return;
}
#endif
// Serial path: call Fn on each index in [Begin, End) in increasing order.
for (size_t I = Begin; I != End; ++I)
Fn(I);
// NOTE(review): the call below subscripts Begin, which is a size_t in this
// function, so it cannot belong here. It is presumably the new one-line body
// of the iterator-based parallelForEach ("Change parallelForEach to use
// parallelForEachN") -- confirm against the actual header.
parallelForEachN(0, End - Begin, [&](size_t I) { Fn(Begin[I]); });
}
template <class IterTy, class ResultTy, class ReduceFuncTy,

View File

@ -174,3 +174,35 @@ void TaskGroup::spawn(std::function<void()> F) {
} // namespace parallel
} // namespace llvm
#endif // LLVM_ENABLE_THREADS
void llvm::parallelForEachN(size_t Begin, size_t End,
                            llvm::function_ref<void(size_t)> Fn) {
  // Calls Fn on every index in [Begin, End). A task group is used only when
  // there are at least two indices and more than one thread was requested:
  // task groups are expensive to spin up and, since they do not support
  // nested parallelism, a single-entry group can block parallel execution
  // underneath it.
#if LLVM_ENABLE_THREADS
  const size_t Count = End - Begin;
  const bool Parallelize =
      Count > 1 && parallel::strategy.ThreadsRequested != 1;
  if (Parallelize) {
    // Cap the number of spawned tasks at roughly MaxTasksPerGroup by giving
    // each task a chunk of Count / MaxTasksPerGroup indices (at least one).
    const size_t Quotient = Count / parallel::detail::MaxTasksPerGroup;
    const size_t ChunkLen = Quotient == 0 ? 1 : Quotient;

    parallel::detail::TaskGroup TG;
    // Spawn a task for every chunk that ends strictly before End.
    while (Begin + ChunkLen < End) {
      TG.spawn([Begin, ChunkLen, &Fn] {
        const size_t Stop = Begin + ChunkLen;
        for (size_t Idx = Begin; Idx != Stop; ++Idx)
          Fn(Idx);
      });
      Begin += ChunkLen;
    }

    // The remaining indices are processed on the calling thread while TG is
    // still in scope.
    while (Begin != End) {
      Fn(Begin);
      ++Begin;
    }
    return;
  }
#endif

  // Sequential path: threads disabled, one thread requested, or fewer than
  // two indices.
  while (Begin != End) {
    Fn(Begin);
    ++Begin;
  }
}