forked from OSchip/llvm-project
Improving EPCC performance when linking with hwloc
When linking with libhwloc, the ORDERED EPCC test slows down on big machines (> 48 cores). Performance analysis showed that a cache thrash was occurring and this padding helps alleviate the problem. Also, inside the main spin-wait loop in kmp_wait_release.h, we can eliminate the references to the global shared variables by creating a local variable, oversubscribed, and checking that instead. Differential Revision: http://reviews.llvm.org/D22093 llvm-svn: 274894
This commit is contained in:
parent
50d307680f
commit
4d3c21307c
|
@ -1706,6 +1706,12 @@ typedef struct dispatch_shared_info {
|
||||||
volatile kmp_uint32 *doacross_flags; // shared array of iteration flags (0/1)
|
volatile kmp_uint32 *doacross_flags; // shared array of iteration flags (0/1)
|
||||||
kmp_int32 doacross_num_done; // count finished threads
|
kmp_int32 doacross_num_done; // count finished threads
|
||||||
#endif
|
#endif
|
||||||
|
#if KMP_USE_HWLOC
|
||||||
|
// When linking with libhwloc, the ORDERED EPCC test slows down on big
|
||||||
|
// machines (> 48 cores). Performance analysis showed that a cache thrash
|
||||||
|
// was occurring and this padding helps alleviate the problem.
|
||||||
|
char padding[64];
|
||||||
|
#endif
|
||||||
} dispatch_shared_info_t;
|
} dispatch_shared_info_t;
|
||||||
|
|
||||||
typedef struct kmp_disp {
|
typedef struct kmp_disp {
|
||||||
|
@ -2567,7 +2573,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
|
||||||
int t_size_changed; // team size was changed?: 0: no, 1: yes, -1: changed via omp_set_num_threads() call
|
int t_size_changed; // team size was changed?: 0: no, 1: yes, -1: changed via omp_set_num_threads() call
|
||||||
|
|
||||||
// Read/write by workers as well -----------------------------------------------------------------------
|
// Read/write by workers as well -----------------------------------------------------------------------
|
||||||
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
|
#if (KMP_ARCH_X86 || KMP_ARCH_X86_64) && !KMP_USE_HWLOC
|
||||||
// Using CACHE_LINE=64 reduces memory footprint, but causes a big perf regression of epcc 'parallel'
|
// Using CACHE_LINE=64 reduces memory footprint, but causes a big perf regression of epcc 'parallel'
|
||||||
// and 'barrier' on fxe256lin01. This extra padding serves to fix the performance of epcc 'parallel'
|
// and 'barrier' on fxe256lin01. This extra padding serves to fix the performance of epcc 'parallel'
|
||||||
// and 'barrier' when CACHE_LINE=64. TODO: investigate more and get rid of this padding.
|
// and 'barrier' when CACHE_LINE=64. TODO: investigate more and get rid of this padding.
|
||||||
|
|
|
@ -180,6 +180,12 @@ struct dispatch_shared_info_template {
|
||||||
kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
|
kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
|
||||||
kmp_int32 doacross_num_done; // count finished threads
|
kmp_int32 doacross_num_done; // count finished threads
|
||||||
#endif
|
#endif
|
||||||
|
#if KMP_USE_HWLOC
|
||||||
|
// When linking with libhwloc, the ORDERED EPCC test slows down on big
|
||||||
|
// machines (> 48 cores). Performance analysis showed that a cache thrash
|
||||||
|
// was occurring and this padding helps alleviate the problem.
|
||||||
|
char padding[64];
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
/* ------------------------------------------------------------------------ */
|
/* ------------------------------------------------------------------------ */
|
||||||
|
|
|
@ -97,6 +97,7 @@ __kmp_wait_template(kmp_info_t *this_thr, C *flag, int final_spin
|
||||||
kmp_uint32 hibernate;
|
kmp_uint32 hibernate;
|
||||||
int th_gtid;
|
int th_gtid;
|
||||||
int tasks_completed = FALSE;
|
int tasks_completed = FALSE;
|
||||||
|
int oversubscribed;
|
||||||
|
|
||||||
KMP_FSYNC_SPIN_INIT(spin, NULL);
|
KMP_FSYNC_SPIN_INIT(spin, NULL);
|
||||||
if (flag->done_check()) {
|
if (flag->done_check()) {
|
||||||
|
@ -166,6 +167,7 @@ __kmp_wait_template(kmp_info_t *this_thr, C *flag, int final_spin
|
||||||
hibernate - __kmp_global.g.g_time.dt.t_value));
|
hibernate - __kmp_global.g.g_time.dt.t_value));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
oversubscribed = (TCR_4(__kmp_nth) > __kmp_avail_proc);
|
||||||
KMP_MB();
|
KMP_MB();
|
||||||
|
|
||||||
// Main wait spin loop
|
// Main wait spin loop
|
||||||
|
@ -201,7 +203,7 @@ __kmp_wait_template(kmp_info_t *this_thr, C *flag, int final_spin
|
||||||
}
|
}
|
||||||
|
|
||||||
// If we are oversubscribed, or have waited a bit (and KMP_LIBRARY=throughput), then yield
|
// If we are oversubscribed, or have waited a bit (and KMP_LIBRARY=throughput), then yield
|
||||||
KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
|
KMP_YIELD(oversubscribed);
|
||||||
// TODO: Should it be number of cores instead of thread contexts? Like:
|
// TODO: Should it be number of cores instead of thread contexts? Like:
|
||||||
// KMP_YIELD(TCR_4(__kmp_nth) > __kmp_ncores);
|
// KMP_YIELD(TCR_4(__kmp_nth) > __kmp_ncores);
|
||||||
// Need performance improvement data to make the change...
|
// Need performance improvement data to make the change...
|
||||||
|
|
Loading…
Reference in New Issue