Tidy statistics collection

This removes some statistics counters and timers which were not used,
adds new counters and timers for some language features that were not
previously monitored, and separates the counters and timers into those
which are of interest when investigating user code and those which are
only of interest to the developer of the runtime itself.
The runtime developer statistics are now only collected if the
additional #define KMP_DEVELOPER_STATS is set.
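
Concretely, the developer variants of the stats macros forward to the
normal macros only when that define is set, and otherwise compile away
to nothing. A minimal sketch of the gating, mirroring the kmp_stats.h
hunk below:

    #if (KMP_DEVELOPER_STATS)
    // Forward the developer variants to the real timers/counters.
    # define KMP_TIME_DEVELOPER_BLOCK(n)   KMP_TIME_BLOCK(n)
    # define KMP_COUNT_DEVELOPER_BLOCK(n)  KMP_COUNT_BLOCK(n)
    #else
    // No-ops: developer stats cost nothing unless explicitly enabled.
    # define KMP_TIME_DEVELOPER_BLOCK(n)   ((void)0)
    # define KMP_COUNT_DEVELOPER_BLOCK(n)  ((void)0)
    #endif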

Additional user statistics which are now collected include (an
illustrative example follows the list):
* Count of nested parallelism (omp parallel inside a parallel region)
* Count of omp distribute occurrences
* Count of omp teams occurrences
* Counts of task related statistics (taskyield, task execution, task
  cancellation, task steal)
* Values passed to omp_set_num_threads
* Time spent in omp single and omp master
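
For illustration only, a toy C/OpenMP program that would exercise
several of the newly counted constructs; the pragmas are standard
OpenMP, and the comments map them onto the counter and timer names
added in kmp_stats.h below:

    #include <omp.h>
    #include <stdio.h>

    int main(void)
    {
        omp_set_num_threads(4);                 // value recorded as OMP_set_numthreads
        #pragma omp parallel                    // counted as OMP_PARALLEL
        {
            #pragma omp single                  // timed as OMP_single
            printf("single on thread %d\n", omp_get_thread_num());
            #pragma omp master                  // timed as OMP_master
            {
                #pragma omp parallel num_threads(2) // counted as OMP_NESTED_PARALLEL
                { }
            }
            #pragma omp task                    // counted as TASK_executed, timed as TASK_execution
            {
                #pragma omp taskyield           // counted as OMP_TASKYIELD
            }
        }
        #pragma omp target
        #pragma omp teams num_teams(2)          // counted as OMP_TEAMS
        #pragma omp distribute                  // counted as OMP_DISTRIBUTE
        for (int i = 0; i < 8; ++i)
        { }
        return 0;
    }

Ordinary worksharing loops would likewise feed the OMP_FOR_static /
OMP_FOR_dynamic counters and their iteration-count values.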

None of this affects code compiled without stats gathering enabled,
which is the normal library build mode.

This also fixes the CMake build by linking the stats-gathering library
against the standard C++ library, which it requires. The normal library
does not have this requirement, so its link phase is left unchanged.

Differential Revision: http://reviews.llvm.org/D11759

llvm-svn: 244677
Jonathan Peyton 2015-08-11 21:36:41 +00:00
parent 827529e7a0
commit 45be450070
12 changed files with 233 additions and 131 deletions

View File

@ -254,6 +254,10 @@ set(LIBOMP_STATS FALSE CACHE BOOL
if(LIBOMP_STATS AND (NOT LIBOMP_HAVE_STATS))
libomp_error_say("Stats-gathering functionality requested but not available")
endif()
# The stats functionality requires the std c++ library
if(LIBOMP_STATS)
set(LIBOMP_USE_STDCPPLIB TRUE)
endif()
# OMPT-support
# TODO: Make this a real feature check

View File

@ -149,7 +149,10 @@ endif()
# Remove any cmake-automatic linking of the standard C++ library.
# We neither need (nor want) the standard C++ library dependency even though we compile c++ files.
if(NOT ${LIBOMP_USE_STDCPPLIB})
set(LIBOMP_LINKER_LANGUAGE C)
set(CMAKE_CXX_IMPLICIT_LINK_LIBRARIES)
else()
set(LIBOMP_LINKER_LANGUAGE CXX)
endif()
# Add the OpenMP library
@ -158,7 +161,7 @@ add_library(omp SHARED ${LIBOMP_SOURCE_FILES})
set_target_properties(omp PROPERTIES
PREFIX "" SUFFIX "" OUTPUT_NAME "${LIBOMP_LIB_FILE}"
LINK_FLAGS "${LIBOMP_CONFIGURED_LDFLAGS}"
LINKER_LANGUAGE C # use C Compiler for linking step
LINKER_LANGUAGE ${LIBOMP_LINKER_LANGUAGE}
SKIP_BUILD_RPATH true # have Mac linker -install_name just be "-install_name libomp.dylib"
)

View File

@ -46,7 +46,7 @@ __kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid
void (*reduce)(void *, void *)
USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
KMP_TIME_BLOCK(KMP_linear_gather);
KMP_TIME_DEVELOPER_BLOCK(KMP_linear_gather);
register kmp_team_t *team = this_thr->th.th_team;
register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
register kmp_info_t **other_threads = team->t.t_threads;
@ -123,7 +123,7 @@ __kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gti
int propagate_icvs
USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
KMP_TIME_BLOCK(KMP_linear_release);
KMP_TIME_DEVELOPER_BLOCK(KMP_linear_release);
register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
register kmp_team_t *team;
@ -141,17 +141,18 @@ __kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gti
if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
KMP_START_EXPLICIT_TIMER(USER_icv_copy);
if (propagate_icvs) {
ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
for (i=1; i<nproc; ++i) {
__kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
&team->t.t_implicit_task_taskdata[0].td_icvs);
{
KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
if (propagate_icvs) {
ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
for (i=1; i<nproc; ++i) {
__kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
&team->t.t_implicit_task_taskdata[0].td_icvs);
}
ngo_sync();
}
ngo_sync();
}
KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
#endif // KMP_BARRIER_ICV_PUSH
// Now, release all of the worker threads
@ -217,7 +218,7 @@ __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
void (*reduce)(void *, void *)
USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
KMP_TIME_BLOCK(KMP_tree_gather);
KMP_TIME_DEVELOPER_BLOCK(KMP_tree_gather);
register kmp_team_t *team = this_thr->th.th_team;
register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
register kmp_info_t **other_threads = team->t.t_threads;
@ -312,7 +313,7 @@ __kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
int propagate_icvs
USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
KMP_TIME_BLOCK(KMP_tree_release);
KMP_TIME_DEVELOPER_BLOCK(KMP_tree_release);
register kmp_team_t *team;
register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
register kmp_uint32 nproc;
@ -381,14 +382,15 @@ __kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
#endif /* KMP_CACHE_MANAGE */
#if KMP_BARRIER_ICV_PUSH
KMP_START_EXPLICIT_TIMER(USER_icv_copy);
if (propagate_icvs) {
__kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
team, child_tid, FALSE);
copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
&team->t.t_implicit_task_taskdata[0].td_icvs);
{
KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
if (propagate_icvs) {
__kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
team, child_tid, FALSE);
copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
&team->t.t_implicit_task_taskdata[0].td_icvs);
}
}
KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
#endif // KMP_BARRIER_ICV_PUSH
KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
"go(%p): %u => %u\n", gtid, team->t.t_id, tid,
@ -414,7 +416,7 @@ __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
void (*reduce)(void *, void *)
USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
KMP_TIME_BLOCK(KMP_hyper_gather);
KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_gather);
register kmp_team_t *team = this_thr->th.th_team;
register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
register kmp_info_t **other_threads = team->t.t_threads;
@ -520,7 +522,7 @@ __kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid
int propagate_icvs
USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
KMP_TIME_BLOCK(KMP_hyper_release);
KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_release);
register kmp_team_t *team;
register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
register kmp_info_t **other_threads;
@ -725,7 +727,7 @@ __kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
int gtid, int tid, void (*reduce) (void *, void *)
USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
KMP_TIME_BLOCK(KMP_hier_gather);
KMP_TIME_DEVELOPER_BLOCK(KMP_hier_gather);
register kmp_team_t *team = this_thr->th.th_team;
register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
register kmp_uint32 nproc = this_thr->th.th_team_nproc;
@ -853,7 +855,7 @@ __kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, i
int propagate_icvs
USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
KMP_TIME_BLOCK(KMP_hier_release);
KMP_TIME_DEVELOPER_BLOCK(KMP_hier_release);
register kmp_team_t *team;
register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
register kmp_uint32 nproc;
@ -1035,7 +1037,7 @@ int
__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
void *reduce_data, void (*reduce)(void *, void *))
{
KMP_TIME_BLOCK(KMP_barrier);
KMP_TIME_DEVELOPER_BLOCK(KMP_barrier);
register int tid = __kmp_tid_from_gtid(gtid);
register kmp_info_t *this_thr = __kmp_threads[gtid];
register kmp_team_t *team = this_thr->th.th_team;
@ -1294,7 +1296,7 @@ __kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
void
__kmp_end_split_barrier(enum barrier_type bt, int gtid)
{
KMP_TIME_BLOCK(KMP_end_split_barrier);
KMP_TIME_DEVELOPER_BLOCK(KMP_end_split_barrier);
int tid = __kmp_tid_from_gtid(gtid);
kmp_info_t *this_thr = __kmp_threads[gtid];
kmp_team_t *team = this_thr->th.th_team;
@ -1335,7 +1337,7 @@ __kmp_end_split_barrier(enum barrier_type bt, int gtid)
void
__kmp_join_barrier(int gtid)
{
KMP_TIME_BLOCK(KMP_join_barrier);
KMP_TIME_DEVELOPER_BLOCK(KMP_join_barrier);
register kmp_info_t *this_thr = __kmp_threads[gtid];
register kmp_team_t *team;
register kmp_uint nproc;
@ -1533,7 +1535,7 @@ __kmp_join_barrier(int gtid)
void
__kmp_fork_barrier(int gtid, int tid)
{
KMP_TIME_BLOCK(KMP_fork_barrier);
KMP_TIME_DEVELOPER_BLOCK(KMP_fork_barrier);
kmp_info_t *this_thr = __kmp_threads[gtid];
kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
#if USE_ITT_BUILD
@ -1648,15 +1650,16 @@ __kmp_fork_barrier(int gtid, int tid)
this data before this function is called. We cannot modify __kmp_fork_call() to look at
the fixed ICVs in the master's thread struct, because it is not always the case that the
threads arrays have been allocated when __kmp_fork_call() is executed. */
KMP_START_EXPLICIT_TIMER(USER_icv_copy);
if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
// Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
__kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
{
KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
// Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
__kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
}
}
KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
#endif // KMP_BARRIER_ICV_PULL
if (__kmp_tasking_mode != tskm_immediate_exec) {
@ -1702,7 +1705,7 @@ __kmp_fork_barrier(int gtid, int tid)
void
__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
{
KMP_TIME_BLOCK(KMP_setup_icv_copy);
KMP_TIME_DEVELOPER_BLOCK(KMP_setup_icv_copy);
KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

View File

@ -58,7 +58,7 @@ kmp_int32 __kmpc_cancel(ident_t* loc_ref, kmp_int32 gtid, kmp_int32 cncl_kind) {
break;
}
case cancel_taskgroup:
// cancellation requests for parallel and worksharing constructs
// cancellation requests for a task group
// are handled through the taskgroup structure
{
kmp_taskdata_t* task;
@ -141,7 +141,7 @@ kmp_int32 __kmpc_cancellationpoint(ident_t* loc_ref, kmp_int32 gtid, kmp_int32 c
break;
}
case cancel_taskgroup:
// cancellation requests for parallel and worksharing constructs
// cancellation requests for a task group
// are handled through the taskgroup structure
{
kmp_taskdata_t* task;

View File

@ -280,9 +280,21 @@ Do the actual fork and call the microtask in the relevant number of threads.
void
__kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
{
KMP_STOP_EXPLICIT_TIMER(OMP_serial);
KMP_COUNT_BLOCK(OMP_PARALLEL);
int gtid = __kmp_entry_gtid();
#if (KMP_STATS_ENABLED)
int inParallel = __kmpc_in_parallel(loc);
if (inParallel)
{
KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
}
else
{
KMP_STOP_EXPLICIT_TIMER(OMP_serial);
KMP_COUNT_BLOCK(OMP_PARALLEL);
}
#endif
// maybe to save thr_state is enough here
{
va_list ap;
@ -329,7 +341,10 @@ __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
}
#endif
}
KMP_START_EXPLICIT_TIMER(OMP_serial);
#if (KMP_STATS_ENABLED)
if (!inParallel)
KMP_START_EXPLICIT_TIMER(OMP_serial);
#endif
}
#if OMP_40_ENABLED
@ -370,6 +385,8 @@ __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
va_list ap;
va_start( ap, microtask );
KMP_COUNT_BLOCK(OMP_TEAMS);
// remember teams entry point and nesting level
this_thr->th.th_teams_microtask = microtask;
this_thr->th.th_teams_level = this_thr->th.th_team->t.t_level; // AC: can be >0 on host
@ -715,8 +732,10 @@ __kmpc_master(ident_t *loc, kmp_int32 global_tid)
if( ! TCR_4( __kmp_init_parallel ) )
__kmp_parallel_initialize();
if( KMP_MASTER_GTID( global_tid ))
if( KMP_MASTER_GTID( global_tid )) {
KMP_START_EXPLICIT_TIMER(OMP_master);
status = 1;
}
#if OMPT_SUPPORT && OMPT_TRACE
if (status) {
@ -764,6 +783,7 @@ __kmpc_end_master(ident_t *loc, kmp_int32 global_tid)
KC_TRACE( 10, ("__kmpc_end_master: called T#%d\n", global_tid ) );
KMP_DEBUG_ASSERT( KMP_MASTER_GTID( global_tid ));
KMP_STOP_EXPLICIT_TIMER(OMP_master);
#if OMPT_SUPPORT && OMPT_TRACE
kmp_info_t *this_thr = __kmp_threads[ global_tid ];
@ -1386,6 +1406,9 @@ __kmpc_single(ident_t *loc, kmp_int32 global_tid)
{
KMP_COUNT_BLOCK(OMP_SINGLE);
kmp_int32 rc = __kmp_enter_single( global_tid, loc, TRUE );
if(rc == TRUE) {
KMP_START_EXPLICIT_TIMER(OMP_single);
}
#if OMPT_SUPPORT && OMPT_TRACE
kmp_info_t *this_thr = __kmp_threads[ global_tid ];
@ -1427,6 +1450,7 @@ void
__kmpc_end_single(ident_t *loc, kmp_int32 global_tid)
{
__kmp_exit_single( global_tid );
KMP_STOP_EXPLICIT_TIMER(OMP_single);
#if OMPT_SUPPORT && OMPT_TRACE
kmp_info_t *this_thr = __kmp_threads[ global_tid ];
@ -2191,7 +2215,6 @@ int
__kmpc_test_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
{
KMP_COUNT_BLOCK(OMP_test_lock);
KMP_TIME_BLOCK(OMP_test_lock);
#if KMP_USE_DYNAMIC_LOCK
int rc;

View File

@ -670,6 +670,7 @@ __kmp_dispatch_init(
} else {
pr->ordered = FALSE;
}
if ( schedule == kmp_sch_static ) {
schedule = __kmp_static;
} else {
@ -761,6 +762,19 @@ __kmp_dispatch_init(
tc = 0; // zero-trip
}
// Any half-decent optimizer will remove this test when the blocks are empty since the macros expand to nothing
// when statistics are disabled.
if (schedule == __kmp_static)
{
KMP_COUNT_BLOCK(OMP_FOR_static);
KMP_COUNT_VALUE(FOR_static_iterations, tc);
}
else
{
KMP_COUNT_BLOCK(OMP_FOR_dynamic);
KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
}
pr->u.p.lb = lb;
pr->u.p.ub = ub;
pr->u.p.st = st;
@ -1384,6 +1398,11 @@ __kmp_dispatch_next(
static const int ___kmp_size_type = sizeof( UT );
#endif
// This is potentially slightly misleading: schedule(runtime) will appear here even if the actual runtime schedule
// is static. (Which points out a disadvantage of schedule(runtime): even when static scheduling is used it costs
// more than a compile-time choice to use static scheduling would.)
KMP_TIME_BLOCK(FOR_dynamic_scheduling);
int status;
dispatch_private_info_template< T > * pr;
kmp_info_t * th = __kmp_threads[ gtid ];
@ -2164,7 +2183,6 @@ __kmp_dist_get_bounds(
T *pupper,
typename traits_t< T >::signed_t incr
) {
KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic);
typedef typename traits_t< T >::unsigned_t UT;
typedef typename traits_t< T >::signed_t ST;
register kmp_uint32 team_id;
@ -2222,6 +2240,7 @@ __kmp_dist_get_bounds(
} else {
trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
}
if( trip_count <= nteams ) {
KMP_DEBUG_ASSERT(
__kmp_static == kmp_sch_static_greedy || \
@ -2297,7 +2316,6 @@ void
__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
{
KMP_COUNT_BLOCK(OMP_FOR_dynamic);
KMP_DEBUG_ASSERT( __kmp_init_serial );
__kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}
@ -2308,7 +2326,6 @@ void
__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
{
KMP_COUNT_BLOCK(OMP_FOR_dynamic);
KMP_DEBUG_ASSERT( __kmp_init_serial );
__kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}
@ -2321,7 +2338,6 @@ __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
kmp_int64 lb, kmp_int64 ub,
kmp_int64 st, kmp_int64 chunk )
{
KMP_COUNT_BLOCK(OMP_FOR_dynamic);
KMP_DEBUG_ASSERT( __kmp_init_serial );
__kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}
@ -2334,7 +2350,6 @@ __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
kmp_uint64 lb, kmp_uint64 ub,
kmp_int64 st, kmp_int64 chunk )
{
KMP_COUNT_BLOCK(OMP_FOR_dynamic);
KMP_DEBUG_ASSERT( __kmp_init_serial );
__kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}
@ -2352,7 +2367,6 @@ void
__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
{
KMP_COUNT_BLOCK(OMP_FOR_dynamic);
KMP_DEBUG_ASSERT( __kmp_init_serial );
__kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
__kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
@ -2362,7 +2376,6 @@ void
__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
{
KMP_COUNT_BLOCK(OMP_FOR_dynamic);
KMP_DEBUG_ASSERT( __kmp_init_serial );
__kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
__kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
@ -2372,7 +2385,6 @@ void
__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
{
KMP_COUNT_BLOCK(OMP_FOR_dynamic);
KMP_DEBUG_ASSERT( __kmp_init_serial );
__kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
__kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
@ -2382,7 +2394,6 @@ void
__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
{
KMP_COUNT_BLOCK(OMP_FOR_dynamic);
KMP_DEBUG_ASSERT( __kmp_init_serial );
__kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
__kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );

View File

@ -1495,7 +1495,8 @@ __kmp_fork_call(
kmp_hot_team_ptr_t **p_hot_teams;
#endif
{ // KMP_TIME_BLOCK
KMP_TIME_BLOCK(KMP_fork_call);
KMP_TIME_DEVELOPER_BLOCK(KMP_fork_call);
KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
KA_TRACE( 20, ("__kmp_fork_call: enter T#%d\n", gtid ));
if ( __kmp_stkpadding > 0 && __kmp_root[gtid] != NULL ) {
@ -1620,12 +1621,14 @@ __kmp_fork_call(
}
#endif
KMP_TIME_BLOCK(OMP_work);
__kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
{
KMP_TIME_BLOCK(OMP_work);
__kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
#if OMPT_SUPPORT
, exit_runtime_p
, exit_runtime_p
#endif
);
);
}
#if OMPT_SUPPORT
if (ompt_status & ompt_status_track) {
@ -2224,8 +2227,8 @@ __kmp_fork_call(
} // END of timer KMP_fork_call block
{
//KMP_TIME_BLOCK(OMP_work);
KMP_TIME_BLOCK(USER_master_invoke);
KMP_TIME_BLOCK(OMP_work);
// KMP_TIME_DEVELOPER_BLOCK(USER_master_invoke);
if (! team->t.t_invoke( gtid )) {
KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
}
@ -2280,7 +2283,7 @@ __kmp_join_call(ident_t *loc, int gtid, enum fork_context_e fork_context
#endif /* OMP_40_ENABLED */
)
{
KMP_TIME_BLOCK(KMP_join_call);
KMP_TIME_DEVELOPER_BLOCK(KMP_join_call);
kmp_team_t *team;
kmp_team_t *parent_team;
kmp_info_t *master_th;
@ -2582,6 +2585,7 @@ __kmp_set_num_threads( int new_nth, int gtid )
else if (new_nth > __kmp_max_nth)
new_nth = __kmp_max_nth;
KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
thread = __kmp_threads[gtid];
__kmp_save_internal_controls( thread );
@ -4790,7 +4794,7 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
kmp_internal_control_t *new_icvs,
int argc USE_NESTED_HOT_ARG(kmp_info_t *master) )
{
KMP_TIME_BLOCK(KMP_allocate_team);
KMP_TIME_DEVELOPER_BLOCK(KMP_allocate_team);
int f;
kmp_team_t *team;
int use_hot_team = ! root->r.r_active;
@ -5577,12 +5581,12 @@ __kmp_launch_thread( kmp_info_t *this_thr )
}
#endif
KMP_STOP_EXPLICIT_TIMER(USER_launch_thread_loop);
KMP_STOP_DEVELOPER_EXPLICIT_TIMER(USER_launch_thread_loop);
{
KMP_TIME_BLOCK(USER_worker_invoke);
KMP_TIME_DEVELOPER_BLOCK(USER_worker_invoke);
rc = (*pteam)->t.t_invoke( gtid );
}
KMP_START_EXPLICIT_TIMER(USER_launch_thread_loop);
KMP_START_DEVELOPER_EXPLICIT_TIMER(USER_launch_thread_loop);
KMP_ASSERT( rc );
#if OMPT_SUPPORT
@ -6910,12 +6914,15 @@ __kmp_invoke_task_func( int gtid )
#endif
#endif
rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn),
gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv
{
KMP_TIME_BLOCK(OMP_work);
rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn),
gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv
#if OMPT_SUPPORT
, exit_runtime_p
, exit_runtime_p
#endif
);
);
}
#if OMPT_SUPPORT && OMPT_TRACE
if (ompt_status & ompt_status_track) {

View File

@ -84,6 +84,8 @@ __kmp_for_static_init(
typename traits_t< T >::signed_t chunk
) {
KMP_COUNT_BLOCK(OMP_FOR_static);
KMP_TIME_BLOCK (FOR_static_scheduling);
typedef typename traits_t< T >::unsigned_t UT;
typedef typename traits_t< T >::signed_t ST;
/* this all has to be changed back to TID and such.. */
@ -151,6 +153,7 @@ __kmp_for_static_init(
team_info->microtask);
}
#endif
KMP_COUNT_VALUE (FOR_static_iterations, 0);
return;
}
@ -246,6 +249,7 @@ __kmp_for_static_init(
__kmp_error_construct( kmp_i18n_msg_CnsIterationRangeTooLarge, ct_pdo, loc );
}
}
KMP_COUNT_VALUE (FOR_static_iterations, trip_count);
/* compute remaining parameters */
switch ( schedtype ) {
@ -372,7 +376,7 @@ __kmp_dist_for_static_init(
typename traits_t< T >::signed_t incr,
typename traits_t< T >::signed_t chunk
) {
KMP_COUNT_BLOCK(OMP_DISTR_FOR_static);
KMP_COUNT_BLOCK(OMP_DISTRIBUTE);
typedef typename traits_t< T >::unsigned_t UT;
typedef typename traits_t< T >::signed_t ST;
register kmp_uint32 tid;
@ -437,6 +441,7 @@ __kmp_dist_for_static_init(
} else {
trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
}
*pstride = *pupper - *plower; // just in case (can be unused)
if( trip_count <= nteams ) {
KMP_DEBUG_ASSERT(

View File

@ -521,16 +521,14 @@ void kmp_stats_output_module::outputStats(const char* heading)
// Special handling for synthesized statistics.
// These just have to be coded specially here for now.
// At present we only have one: the total parallel work done in each thread.
// At present we only have a few:
// The total parallel work done in each thread.
// The variance here makes it easy to see load imbalance over the whole program (though, of course,
// it's possible to have a code with awful load balance in every parallel region but perfect load
// balance over the whole program.)
// The time spent in barriers in each thread.
allStats[TIMER_Total_work].addSample ((*it)->getTimer(TIMER_OMP_work)->getTotal());
// Time waiting for work (synthesized)
if ((t != 0) || !timeStat::workerOnly(timer_e(TIMER_OMP_await_work)))
allStats[TIMER_Total_await_work].addSample ((*it)->getTimer(TIMER_OMP_await_work)->getTotal());
// Time in explicit barriers.
allStats[TIMER_Total_barrier].addSample ((*it)->getTimer(TIMER_OMP_barrier)->getTotal());

View File

@ -31,6 +31,11 @@
#include <new> // placement new
#include "kmp_stats_timing.h"
/*
* Enable developer statistics here if you want them. They are more detailed than is useful for application characterisation and
* are intended for the runtime library developer.
*/
// #define KMP_DEVELOPER_STATS 1
/*!
* @ingroup STATS_GATHERING
@ -56,7 +61,7 @@ class stats_flags_e {
* Each thread accumulates its own count, at the end of execution the counts are aggregated treating each thread
* as a separate measurement. (Unless onlyInMaster is set, in which case there's only a single measurement).
* The min,mean,max are therefore the values for the threads.
* Adding the counter here and then putting in a KMP_BLOCK_COUNTER(name) is all you need to do.
* Adding the counter here and then putting a KMP_COUNT_BLOCK(name) at the point you want to count is all you need to do.
* All of the tables and printing is generated from this macro.
* Format is "macro(name, flags, arg)"
*
@ -64,21 +69,30 @@ class stats_flags_e {
*/
#define KMP_FOREACH_COUNTER(macro, arg) \
macro (OMP_PARALLEL, stats_flags_e::onlyInMaster, arg) \
macro (OMP_NESTED_PARALLEL, 0, arg) \
macro (OMP_FOR_static, 0, arg) \
macro (OMP_FOR_dynamic, 0, arg) \
macro (OMP_DISTR_FOR_static, 0, arg) \
macro (OMP_DISTR_FOR_dynamic, 0, arg) \
macro (OMP_DISTRIBUTE, 0, arg) \
macro (OMP_BARRIER, 0, arg) \
macro (OMP_CRITICAL,0, arg) \
macro (OMP_SINGLE, 0, arg) \
macro (OMP_MASTER, 0, arg) \
macro (OMP_TEAMS, 0, arg) \
macro (OMP_set_lock, 0, arg) \
macro (OMP_test_lock, 0, arg) \
macro (OMP_test_lock_failure, 0, arg) \
macro (REDUCE_wait, 0, arg) \
macro (REDUCE_nowait, 0, arg) \
macro (OMP_TASKYIELD, 0, arg) \
macro (TASK_executed, 0, arg) \
macro (TASK_cancelled, 0, arg) \
macro (TASK_stolen, 0, arg) \
macro (LAST,0,arg)
// OMP_PARALLEL_args -- the number of arguments passed to a fork
// FOR_static_iterations -- Number of available parallel chunks of work in a static for
// FOR_dynamic_iterations -- Number of available parallel chunks of work in a dynamic for
// Both adjust for any chunking, so if there were an iteration count of 20 but a chunk size of 10, we'd record 2.
/*!
* \brief Add new timers under KMP_FOREACH_TIMER() macro in kmp_stats.h
*
@ -87,72 +101,45 @@ class stats_flags_e {
*
* \details A timer collects multiple samples of some count in each thread and then finally aggregates over all the threads.
* The count is normally a time (in ticks), hence the name "timer". (But can be any value, so we use this for "number of arguments passed to fork"
* as well, or we could collect "loop iteration count" if we wanted to).
* as well).
* For timers the threads are not significant, it's the individual observations that count, so the statistics are at that level.
* Format is "macro(name, flags, arg)"
*
* @ingroup STATS_GATHERING
* @ingroup STATS_GATHERING2
*/
#define KMP_FOREACH_TIMER(macro, arg) \
macro (OMP_PARALLEL_args, stats_flags_e::onlyInMaster | stats_flags_e::noUnits, arg) \
macro (FOR_static_iterations, stats_flags_e::onlyInMaster | stats_flags_e::noUnits, arg) \
macro (FOR_dynamic_iterations, stats_flags_e::noUnits, arg) \
#define KMP_FOREACH_TIMER(macro, arg) \
macro (OMP_start_end, stats_flags_e::onlyInMaster, arg) \
macro (OMP_serial, stats_flags_e::onlyInMaster, arg) \
macro (OMP_work, 0, arg) \
macro (Total_work, stats_flags_e::synthesized, arg) \
macro (OMP_await_work, stats_flags_e::notInMaster, arg) \
macro (Total_await_work, stats_flags_e::synthesized, arg) \
macro (OMP_barrier, 0, arg) \
macro (Total_barrier, stats_flags_e::synthesized, arg) \
macro (OMP_test_lock, 0, arg) \
macro (FOR_static_iterations, stats_flags_e::noUnits, arg) \
macro (FOR_static_scheduling, 0, arg) \
macro (FOR_dynamic_iterations, stats_flags_e::noUnits, arg) \
macro (FOR_dynamic_scheduling, 0, arg) \
macro (KMP_fork_call, 0, arg) \
macro (KMP_join_call, 0, arg) \
macro (KMP_fork_barrier, stats_flags_e::logEvent, arg) \
macro (KMP_join_barrier, stats_flags_e::logEvent, arg) \
macro (KMP_barrier, 0, arg) \
macro (KMP_end_split_barrier, 0, arg) \
macro (KMP_wait_sleep, 0, arg) \
macro (KMP_release, 0, arg) \
macro (KMP_hier_gather, 0, arg) \
macro (KMP_hier_release, 0, arg) \
macro (KMP_hyper_gather, stats_flags_e::logEvent, arg) \
macro (KMP_hyper_release, stats_flags_e::logEvent, arg) \
macro (KMP_linear_gather, 0, arg) \
macro (KMP_linear_release, 0, arg) \
macro (KMP_tree_gather, 0, arg) \
macro (KMP_tree_release, 0, arg) \
macro (USER_master_invoke, stats_flags_e::logEvent, arg) \
macro (USER_worker_invoke, stats_flags_e::logEvent, arg) \
macro (USER_resume, stats_flags_e::logEvent, arg) \
macro (USER_suspend, stats_flags_e::logEvent, arg) \
macro (USER_launch_thread_loop, stats_flags_e::logEvent, arg) \
macro (KMP_allocate_team, 0, arg) \
macro (KMP_setup_icv_copy, 0, arg) \
macro (USER_icv_copy, 0, arg) \
macro (TASK_execution, 0, arg) \
macro (OMP_set_numthreads, stats_flags_e::noUnits, arg) \
macro (OMP_PARALLEL_args, stats_flags_e::noUnits, arg) \
macro (OMP_single, 0, arg) \
macro (OMP_master, 0, arg) \
KMP_FOREACH_DEVELOPER_TIMER(macro, arg) \
macro (LAST,0, arg)
// OMP_PARALLEL_args -- the number of arguments passed to a fork
// FOR_static_iterations -- Number of available parallel chunks of work in a static for
// FOR_dynamic_iterations -- Number of available parallel chunks of work in a dynamic for
// Both adjust for any chunking, so if there were an iteration count of 20 but a chunk size of 10, we'd record 2.
// OMP_start_end -- time from when OpenMP is initialized until the stats are printed at exit
// OMP_serial -- thread zero time executing serial code
// OMP_work -- elapsed time in code dispatched by a fork (measured in the thread)
// Total_work -- a synthesized statistic summarizing how much parallel work each thread executed.
// OMP_barrier -- time at "real" barriers
// Total_barrier -- a synthesized statistic summarizing how much time at real barriers in each thread
// OMP_set_lock -- time in lock setting
// OMP_test_lock -- time in testing a lock
// LOCK_WAIT -- time waiting for a lock
// FOR_static_scheduling -- time spent doing scheduling for a static "for"
// FOR_dynamic_scheduling -- time spent doing scheduling for a dynamic "for"
// KMP_wait_sleep -- time in __kmp_wait_sleep
// KMP_release -- time in __kmp_release
#if (KMP_DEVELOPER_STATS)
// Timers which are of interest to runtime library developers, not end users.
// These have to be explicitly enabled in addition to the other stats.
// KMP_fork_barrier -- time in __kmp_fork_barrier
// KMP_join_barrier -- time in __kmp_join_barrier
// KMP_barrier -- time in __kmp_barrier
@ -165,6 +152,32 @@ class stats_flags_e {
// KMP_tree_release -- time in __kmp_tree_barrier_release
// KMP_hyper_gather -- time in __kmp_hyper_barrier_gather
// KMP_hyper_release -- time in __kmp_hyper_barrier_release
# define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) \
macro (KMP_fork_call, 0, arg) \
macro (KMP_join_call, 0, arg) \
macro (KMP_fork_barrier, stats_flags_e::logEvent, arg) \
macro (KMP_join_barrier, stats_flags_e::logEvent, arg) \
macro (KMP_barrier, 0, arg) \
macro (KMP_end_split_barrier, 0, arg) \
macro (KMP_hier_gather, 0, arg) \
macro (KMP_hier_release, 0, arg) \
macro (KMP_hyper_gather, stats_flags_e::logEvent, arg) \
macro (KMP_hyper_release, stats_flags_e::logEvent, arg) \
macro (KMP_linear_gather, 0, arg) \
macro (KMP_linear_release, 0, arg) \
macro (KMP_tree_gather, 0, arg) \
macro (KMP_tree_release, 0, arg) \
macro (USER_master_invoke, stats_flags_e::logEvent, arg) \
macro (USER_worker_invoke, stats_flags_e::logEvent, arg) \
macro (USER_resume, stats_flags_e::logEvent, arg) \
macro (USER_suspend, stats_flags_e::logEvent, arg) \
macro (USER_launch_thread_loop, stats_flags_e::logEvent, arg) \
macro (KMP_allocate_team, 0, arg) \
macro (KMP_setup_icv_copy, 0, arg) \
macro (USER_icv_copy, 0, arg)
#else
# define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
#endif
/*!
* \brief Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro.
@ -182,13 +195,21 @@ class stats_flags_e {
*
* @ingroup STATS_GATHERING
*/
#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) \
macro(OMP_serial, 0, arg) \
macro(OMP_start_end, 0, arg) \
macro(USER_icv_copy, 0, arg) \
macro(USER_launch_thread_loop, stats_flags_e::logEvent, arg) \
#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) \
macro(OMP_serial, 0, arg) \
macro(OMP_start_end, 0, arg) \
macro(OMP_single, 0, arg) \
macro(OMP_master, 0, arg) \
KMP_FOREACH_EXPLICIT_DEVELOPER_TIMER(macro,arg) \
macro(LAST, 0, arg)
#if (KMP_DEVELOPER_STATS)
# define KMP_FOREACH_EXPLICIT_DEVELOPER_TIMER(macro, arg) \
macro(USER_launch_thread_loop, stats_flags_e::logEvent, arg)
#else
# define KMP_FOREACH_EXPLICIT_DEVELOPER_TIMER(macro, arg)
#endif
#define ENUMERATE(name,ignore,prefix) prefix##name,
enum timer_e {
KMP_FOREACH_TIMER(ENUMERATE, TIMER_)
@ -689,6 +710,21 @@ extern kmp_stats_output_module __kmp_stats_output;
*/
#define KMP_RESET_STATS() __kmp_reset_stats()
#if (KMP_DEVELOPER_STATS)
# define KMP_TIME_DEVELOPER_BLOCK(n) KMP_TIME_BLOCK(n)
# define KMP_COUNT_DEVELOPER_VALUE(n,v) KMP_COUNT_VALUE(n,v)
# define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n)
# define KMP_START_DEVELOPER_EXPLICIT_TIMER(n) KMP_START_EXPLICIT_TIMER(n)
# define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n) KMP_STOP_EXPLICIT_TIMER(n)
#else
// Null definitions
# define KMP_TIME_DEVELOPER_BLOCK(n) ((void)0)
# define KMP_COUNT_DEVELOPER_VALUE(n,v) ((void)0)
# define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
# define KMP_START_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
# define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
#endif
#else // KMP_STATS_ENABLED
// Null definitions
@ -701,6 +737,11 @@ extern kmp_stats_output_module __kmp_stats_output;
#define KMP_OUTPUT_STATS(heading_string) ((void)0)
#define KMP_RESET_STATS() ((void)0)
#define KMP_TIME_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_COUNT_DEVELOPER_VALUE(n,v) ((void)0)
#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_START_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
#define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
#endif // KMP_STATS_ENABLED
#endif // KMP_STATS_H

View File

@ -17,6 +17,7 @@
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_wait_release.h"
#include "kmp_stats.h"
#if OMPT_SUPPORT
#include "ompt-specific.h"
@ -1136,6 +1137,7 @@ __kmp_invoke_task( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t * current_ta
kmp_team_t * this_team = this_thr->th.th_team;
kmp_taskgroup_t * taskgroup = taskdata->td_taskgroup;
if ((taskgroup && taskgroup->cancel_request) || (this_team->t.t_cancel_request == cancel_parallel)) {
KMP_COUNT_BLOCK(TASK_cancelled);
// this task belongs to a task group and we need to cancel it
discard = 1 /* true */;
}
@ -1146,6 +1148,8 @@ __kmp_invoke_task( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t * current_ta
// Thunks generated by gcc take a different argument list.
//
if (!discard) {
KMP_COUNT_BLOCK(TASK_executed);
KMP_TIME_BLOCK (TASK_execution);
#endif // OMP_40_ENABLED
#ifdef KMP_GOMP_COMPAT
if (taskdata->td_flags.native) {
@ -1356,6 +1360,8 @@ __kmpc_omp_taskyield( ident_t *loc_ref, kmp_int32 gtid, int end_part )
kmp_info_t * thread;
int thread_finished = FALSE;
KMP_COUNT_BLOCK(OMP_TASKYIELD);
KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
gtid, loc_ref, end_part) );
@ -1648,6 +1654,7 @@ __kmp_steal_task( kmp_info_t *victim, kmp_int32 gtid, kmp_task_team_t *task_team
__kmp_release_bootstrap_lock( & victim_td -> td.td_deque_lock );
KMP_COUNT_BLOCK(TASK_stolen);
KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d stole task %p from T#%d: task_team=%p "
"ntasks=%d head=%u tail=%u\n",
gtid, taskdata, __kmp_gtid_from_thread( victim ), task_team,

View File

@ -1688,7 +1688,7 @@ __kmp_suspend_uninitialize_thread( kmp_info_t *th )
template <class C>
static inline void __kmp_suspend_template( int th_gtid, C *flag )
{
KMP_TIME_BLOCK(USER_suspend);
KMP_TIME_DEVELOPER_BLOCK(USER_suspend);
kmp_info_t *th = __kmp_threads[th_gtid];
int status;
typename C::flag_t old_spin;
@ -1826,6 +1826,7 @@ void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag) {
template <class C>
static inline void __kmp_resume_template( int target_gtid, C *flag )
{
KMP_TIME_DEVELOPER_BLOCK(USER_resume);
kmp_info_t *th = __kmp_threads[target_gtid];
int status;
@ -1900,7 +1901,6 @@ void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag) {
void
__kmp_resume_monitor()
{
KMP_TIME_BLOCK(USER_resume);
int status;
#ifdef KMP_DEBUG
int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1;