forked from OSchip/llvm-project
Reduce number of tasks in parallel_for_each.
TaskGroup has a fairly high overhead, so we don't want to split the work into tasks that are too small. This patch partitions the work into at most 1024 tasks. I compared this patch with the original LLD's parallel_for_each. I reverted r287042 locally for comparison. With this patch, time to self-link lld with debug info changed from 6.23 seconds to 4.62 seconds (-25.8%), with -threads and without -build-id. With both -threads and -build-id, it improved from 11.71 seconds to 4.94 seconds (-57.8%). Full results are below. BTW, GNU gold takes 11.65 seconds to link the same binary. NOW --no-threads --build-id=none 6789.847776 task-clock (msec) # 1.000 CPUs utilized ( +- 1.86% ) 685 context-switches # 0.101 K/sec ( +- 2.82% ) 4 cpu-migrations # 0.001 K/sec ( +- 31.18% ) 1,424,690 page-faults # 0.210 M/sec ( +- 1.07% ) 21,339,542,522 cycles # 3.143 GHz ( +- 1.49% ) 13,092,260,230 stalled-cycles-frontend # 61.35% frontend cycles idle ( +- 2.23% ) <not supported> stalled-cycles-backend 21,462,051,828 instructions # 1.01 insns per cycle # 0.61 stalled cycles per insn ( +- 0.41% ) 3,955,296,378 branches # 582.531 M/sec ( +- 0.39% ) 75,699,909 branch-misses # 1.91% of all branches ( +- 0.08% ) 6.787630744 seconds time elapsed ( +- 1.86% ) --threads --build-id=none 14767.148697 task-clock (msec) # 3.196 CPUs utilized ( +- 2.56% ) 28,891 context-switches # 0.002 M/sec ( +- 1.99% ) 905 cpu-migrations # 0.061 K/sec ( +- 5.49% ) 1,262,122 page-faults # 0.085 M/sec ( +- 1.68% ) 43,116,163,217 cycles # 2.920 GHz ( +- 3.07% ) 33,690,171,242 stalled-cycles-frontend # 78.14% frontend cycles idle ( +- 3.67% ) <not supported> stalled-cycles-backend 22,836,731,536 instructions # 0.53 insns per cycle # 1.48 stalled cycles per insn ( +- 1.13% ) 4,382,712,998 branches # 296.788 M/sec ( +- 1.33% ) 78,622,295 branch-misses # 1.79% of all branches ( +- 0.54% ) 4.621228056 seconds time elapsed ( +- 1.90% ) --threads --build-id=sha1 24594.457135 task-clock (msec) # 4.974 CPUs utilized ( +- 1.78% ) 29,902 
context-switches # 0.001 M/sec ( +- 2.62% ) 1,097 cpu-migrations # 0.045 K/sec ( +- 6.29% ) 1,313,947 page-faults # 0.053 M/sec ( +- 2.36% ) 70,516,415,741 cycles # 2.867 GHz ( +- 0.78% ) 47,570,262,296 stalled-cycles-frontend # 67.46% frontend cycles idle ( +- 0.86% ) <not supported> stalled-cycles-backend 73,124,599,029 instructions # 1.04 insns per cycle # 0.65 stalled cycles per insn ( +- 0.33% ) 10,495,266,104 branches # 426.733 M/sec ( +- 0.41% ) 91,444,149 branch-misses # 0.87% of all branches ( +- 0.83% ) 4.944291711 seconds time elapsed ( +- 1.72% ) PREVIOUS --threads --build-id=none 7307.437544 task-clock (msec) # 1.160 CPUs utilized ( +- 2.34% ) 3,128 context-switches # 0.428 K/sec ( +- 4.37% ) 352 cpu-migrations # 0.048 K/sec ( +- 5.98% ) 1,354,450 page-faults # 0.185 M/sec ( +- 2.20% ) 22,081,733,098 cycles # 3.022 GHz ( +- 1.46% ) 13,709,991,267 stalled-cycles-frontend # 62.09% frontend cycles idle ( +- 1.77% ) <not supported> stalled-cycles-backend 21,634,468,895 instructions # 0.98 insns per cycle # 0.63 stalled cycles per insn ( +- 0.86% ) 3,993,062,361 branches # 546.438 M/sec ( +- 0.83% ) 76,188,819 branch-misses # 1.91% of all branches ( +- 0.19% ) 6.298101157 seconds time elapsed ( +- 2.03% ) --threads --build-id=sha1 12845.420265 task-clock (msec) # 1.097 CPUs utilized ( +- 1.95% ) 4,020 context-switches # 0.313 K/sec ( +- 2.89% ) 369 cpu-migrations # 0.029 K/sec ( +- 6.26% ) 1,464,822 page-faults # 0.114 M/sec ( +- 1.37% ) 40,668,449,813 cycles # 3.166 GHz ( +- 0.96% ) 18,863,982,388 stalled-cycles-frontend # 46.38% frontend cycles idle ( +- 1.82% ) <not supported> stalled-cycles-backend 71,560,499,058 instructions # 1.76 insns per cycle # 0.26 stalled cycles per insn ( +- 0.14% ) 10,044,152,441 branches # 781.925 M/sec ( +- 0.19% ) 87,835,773 branch-misses # 0.87% of all branches ( +- 0.09% ) 11.711773314 seconds time elapsed ( +- 1.51% ) llvm-svn: 287140
This commit is contained in:
parent
f4c5a0e630
commit
87ff6fef0f
|
@ -283,9 +283,20 @@ void parallel_for_each(Iterator begin, Iterator end, Func func) {
|
|||
#else
|
||||
template <class Iterator, class Func>
|
||||
void parallel_for_each(Iterator begin, Iterator end, Func func) {
|
||||
// TaskGroup has a relatively high overhead, so we want to reduce
|
||||
// the number of spawn() calls. We'll create up to 1024 tasks here.
|
||||
// (Note that 1024 is an arbitrary number. This code probably needs
|
||||
// improving to take the number of available cores into account.)
|
||||
ptrdiff_t taskSize = std::distance(begin, end) / 1024;
|
||||
if (taskSize == 0)
|
||||
taskSize = 1;
|
||||
|
||||
TaskGroup tg;
|
||||
for (; begin != end; ++begin)
|
||||
tg.spawn([=, &func] { func(*begin); });
|
||||
while (taskSize <= std::distance(begin, end)) {
|
||||
tg.spawn([=, &func] { std::for_each(begin, begin + taskSize, func); });
|
||||
begin += taskSize;
|
||||
}
|
||||
std::for_each(begin, end, func);
|
||||
}
|
||||
#endif
|
||||
} // end namespace lld
|
||||
|
|
Loading…
Reference in New Issue