Adding Hwloc library option for affinity mechanism

These changes allow libhwloc to be used as the topology discovery/affinity
mechanism for libomp.  It is supported on Unix-based systems. The code additions:
* Canonicalize KMP_CPU_* interface macros so bitmask operations are
  implementation independent and work with both hwloc bitmaps and libomp
  bitmaps.  So there are new KMP_CPU_ALLOC_* and KMP_CPU_ITERATE() macros and
  the like. These are all in kmp.h and appropriately placed.
* Hwloc topology discovery code in kmp_affinity.cpp. This uses the hwloc
  interface to create a libomp address2os object which the rest of libomp knows
  how to handle already.
* To build, use -DLIBOMP_USE_HWLOC=on and
  -DLIBOMP_HWLOC_INSTALL_DIR=/path/to/install/dir [default /usr/local]. If CMake
  can't find the library or hwloc.h, then it will tell you and exit.

Differential Revision: http://reviews.llvm.org/D13991

llvm-svn: 254320
This commit is contained in:
Jonathan Peyton 2015-11-30 20:02:59 +00:00
parent 7a096596b2
commit 01dcf36bd5
18 changed files with 790 additions and 139 deletions

View File

@ -159,6 +159,18 @@ Should include stats-gathering code be included in the build?
-DLIBOMP_USE_DEBUGGER=off|on
Should the friendly debugger interface be included in the build?
-DLIBOMP_USE_HWLOC=off|on
Should the Hwloc library be used for affinity?
This option is not supported on Windows.
http://www.open-mpi.org/projects/hwloc
-DLIBOMP_HWLOC_INSTALL_DIR=/path/to/hwloc/install/dir
Default: /usr/local
This option is only used if LIBOMP_USE_HWLOC is on.
Specifies install location of Hwloc. The configuration system will look for
hwloc.h in ${LIBOMP_HWLOC_INSTALL_DIR}/include and the library in
${LIBOMP_HWLOC_INSTALL_DIR}/lib.
================================
How to append flags to the build
================================

View File

@ -135,6 +135,12 @@ set(LIBOMP_FFLAGS "" CACHE STRING
set(LIBOMP_COPY_EXPORTS TRUE CACHE STRING
"Should exports be copied into source exports/ directory?")
# HWLOC-support
set(LIBOMP_USE_HWLOC FALSE CACHE BOOL
"Use Hwloc (http://www.open-mpi.org/projects/hwloc/) library for affinity?")
set(LIBOMP_HWLOC_INSTALL_DIR /usr/local CACHE PATH
"Install path for hwloc library")
# Get the build number from kmp_version.c
libomp_get_build_number("${CMAKE_CURRENT_SOURCE_DIR}" LIBOMP_VERSION_BUILD)
math(EXPR LIBOMP_VERSION_BUILD_YEAR "${LIBOMP_VERSION_BUILD}/10000")
@ -285,6 +291,11 @@ if(LIBOMP_OMPT_SUPPORT AND (NOT LIBOMP_HAVE_OMPT_SUPPORT))
libomp_error_say("OpenMP Tools Interface requested but not available")
endif()
# Error check hwloc support after config-ix has run.
# If the user asked for hwloc but config-ix could not make it usable,
# fail the configure step with an explicit message.
if(LIBOMP_USE_HWLOC)
  if(NOT LIBOMP_HAVE_HWLOC)
    libomp_error_say("Hwloc requested but not available")
  endif()
endif()
# Setting final library name
set(LIBOMP_DEFAULT_LIB_NAME libomp)
if(${PROFILE_LIBRARY})
@ -323,6 +334,7 @@ if(${LIBOMP_STANDALONE_BUILD})
endif()
libomp_say("Use Adaptive locks -- ${LIBOMP_USE_ADAPTIVE_LOCKS}")
libomp_say("Use quad precision -- ${LIBOMP_USE_QUAD_PRECISION}")
libomp_say("Use Hwloc library -- ${LIBOMP_USE_HWLOC}")
endif()
add_subdirectory(src)

View File

@ -151,6 +151,7 @@ endfunction()
function(libomp_get_libflags libflags)
set(libflags_local)
libomp_append(libflags_local "${CMAKE_THREAD_LIBS_INIT}")
libomp_append(libflags_local "${LIBOMP_HWLOC_LIBRARY}" LIBOMP_USE_HWLOC)
if(${IA32})
libomp_append(libflags_local -lirc_pic LIBOMP_HAVE_IRC_PIC_LIBRARY)
endif()

View File

@ -82,10 +82,13 @@ else() # (Unix based systems, Intel(R) MIC Architecture, and Mac)
libomp_append(libomp_test_touch_cflags -m32 LIBOMP_HAVE_M32_FLAG)
endif()
libomp_append(libomp_test_touch_libs ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE})
libomp_append(libomp_test_touch_libs "${LIBOMP_HWLOC_LIBRARY}" LIBOMP_USE_HWLOC)
if(APPLE)
set(libomp_test_touch_env "DYLD_LIBRARY_PATH=.:${LIBOMP_OUTPUT_DIRECTORY}:$ENV{DYLD_LIBRARY_PATH}")
libomp_append(libomp_test_touch_ldflags "-Wl,-rpath,${LIBOMP_HWLOC_LIBRARY_DIR}" LIBOMP_USE_HWLOC)
else()
set(libomp_test_touch_env "LD_LIBRARY_PATH=.:${LIBOMP_OUTPUT_DIRECTORY}:$ENV{LD_LIBRARY_PATH}")
libomp_append(libomp_test_touch_ldflags "-Wl,-rpath=${LIBOMP_HWLOC_LIBRARY_DIR}" LIBOMP_USE_HWLOC)
endif()
endif()
macro(libomp_test_touch_recipe test_touch_dir)
@ -169,8 +172,10 @@ add_custom_target(libomp-test-deps DEPENDS test-deps/.success)
set(libomp_expected_library_deps)
if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
set(libomp_expected_library_deps libc.so.7 libthr.so.3)
libomp_append(libomp_expected_library_deps libhwloc.so.5 LIBOMP_USE_HWLOC)
elseif(CMAKE_SYSTEM_NAME MATCHES "NetBSD")
set(libomp_expected_library_deps libc.so.12 libpthread.so.1 libm.so.0)
libomp_append(libomp_expected_library_deps libhwloc.so.5 LIBOMP_USE_HWLOC)
elseif(APPLE)
set(libomp_expected_library_deps /usr/lib/libSystem.B.dylib)
elseif(WIN32)
@ -203,6 +208,7 @@ else()
libomp_append(libomp_expected_library_deps ld64.so.1)
endif()
libomp_append(libomp_expected_library_deps libpthread.so.0 IF_FALSE STUBS_LIBRARY)
libomp_append(libomp_expected_library_deps libhwloc.so.5 LIBOMP_USE_HWLOC)
endif()
libomp_append(libomp_expected_library_deps libstdc++.so.6 LIBOMP_USE_STDCPPLIB)
endif()

View File

@ -12,6 +12,7 @@
include(CheckCCompilerFlag)
include(CheckCSourceCompiles)
include(CheckCXXCompilerFlag)
include(CheckIncludeFile)
include(CheckLibraryExists)
include(CheckIncludeFiles)
include(LibompCheckLinkerFlag)
@ -211,3 +212,25 @@ else()
endif()
endif()
# Check if HWLOC support is available.
# Sets LIBOMP_HAVE_HWLOC to TRUE only when the header, the library symbol
# check, and the library file itself are all found under
# LIBOMP_HWLOC_INSTALL_DIR.  Also caches LIBOMP_HWLOC_LIBRARY (full path)
# and LIBOMP_HWLOC_LIBRARY_DIR (its directory, used for -rpath).
# Note: test the variable by name, not ${...} -- an unset variable would
# otherwise expand to an invalid/empty if() expression.
if(LIBOMP_USE_HWLOC)
  if(WIN32)
    # Hwloc-based affinity is not wired up for Windows.
    set(LIBOMP_HAVE_HWLOC FALSE)
    libomp_say("Using hwloc not supported on Windows yet")
  else()
    # Probe for hwloc.h under the user-specified install prefix; reset
    # CMAKE_REQUIRED_INCLUDES afterwards so later checks are unaffected.
    set(CMAKE_REQUIRED_INCLUDES "${LIBOMP_HWLOC_INSTALL_DIR}/include")
    check_include_file(hwloc.h LIBOMP_HAVE_HWLOC_H)
    set(CMAKE_REQUIRED_INCLUDES)
    # Verify the library actually exports the entry point we need.
    check_library_exists(hwloc hwloc_topology_init
      "${LIBOMP_HWLOC_INSTALL_DIR}/lib" LIBOMP_HAVE_LIBHWLOC)
    # Locate the library file so it can be linked and rpath'd.  Use the
    # PATHS keyword rather than the legacy positional-path signature.
    find_library(LIBOMP_HWLOC_LIBRARY hwloc PATHS "${LIBOMP_HWLOC_INSTALL_DIR}/lib")
    # Quote the input: it may be LIBOMP_HWLOC_LIBRARY-NOTFOUND.
    get_filename_component(LIBOMP_HWLOC_LIBRARY_DIR "${LIBOMP_HWLOC_LIBRARY}" PATH)
    if(LIBOMP_HAVE_HWLOC_H AND LIBOMP_HAVE_LIBHWLOC AND LIBOMP_HWLOC_LIBRARY)
      set(LIBOMP_HAVE_HWLOC TRUE)
    else()
      set(LIBOMP_HAVE_HWLOC FALSE)
      libomp_say("Could not find hwloc")
    endif()
  endif()
endif()

View File

@ -42,6 +42,9 @@ include_directories(
${LIBOMP_INC_DIR}
${LIBOMP_SRC_DIR}/thirdparty/ittnotify
)
# Add the hwloc headers when building with hwloc support.
# Test the variable by name (not ${...}): dereferencing an unset variable
# produces an empty if() expression, which is a configure-time error.
if(LIBOMP_USE_HWLOC)
  include_directories(${LIBOMP_HWLOC_INSTALL_DIR}/include)
endif()
# Getting correct source files to build library
set(LIBOMP_CFILES)

View File

@ -405,6 +405,9 @@ AffGranTopGroup "%1$s: granularity=%2$s is not supported with KMP_T
AffGranGroupType "%1$s: granularity=group is not supported with KMP_AFFINITY=%2$s. Using \"granularity=core\"."
AffThrPlaceManySockets "KMP_PLACE_THREADS ignored: too many sockets requested."
AffThrPlaceDeprecated "KMP_PLACE_THREADS \"o\" offset designator deprecated, please use @ prefix for offset value."
AffUsingHwloc "%1$s: Affinity capable, using hwloc."
AffIgnoringHwloc "%1$s: Ignoring hwloc mechanism."
AffHwlocErrorOccurred "%1$s: Hwloc failed in %2$s. Relying on internal affinity mechanisms."
# --------------------------------------------------------------------------------------------------

View File

@ -77,10 +77,18 @@
#include "kmp_os.h"
#include "kmp_safe_c_api.h"
#if KMP_STATS_ENABLED
class kmp_stats_list;
#endif
#if KMP_USE_HWLOC
#include "hwloc.h"
extern hwloc_topology_t __kmp_hwloc_topology;
extern int __kmp_hwloc_error;
#endif
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
#include <xmmintrin.h>
#endif
@ -488,6 +496,78 @@ extern size_t __kmp_affin_mask_size;
# define KMP_AFFINITY_ENABLE(mask_size) (__kmp_affin_mask_size = mask_size)
# define KMP_CPU_SETSIZE (__kmp_affin_mask_size * CHAR_BIT)
#if KMP_USE_HWLOC
typedef hwloc_cpuset_t kmp_affin_mask_t;
# define KMP_CPU_SET(i,mask) hwloc_bitmap_set((hwloc_cpuset_t)mask, (unsigned)i)
# define KMP_CPU_ISSET(i,mask) hwloc_bitmap_isset((hwloc_cpuset_t)mask, (unsigned)i)
# define KMP_CPU_CLR(i,mask) hwloc_bitmap_clr((hwloc_cpuset_t)mask, (unsigned)i)
# define KMP_CPU_ZERO(mask) hwloc_bitmap_zero((hwloc_cpuset_t)mask)
# define KMP_CPU_COPY(dest, src) hwloc_bitmap_copy((hwloc_cpuset_t)dest, (hwloc_cpuset_t)src)
# define KMP_CPU_COMPLEMENT(max_bit_number, mask) \
{ \
unsigned i; \
for(i=0;i<(unsigned)max_bit_number+1;i++) { \
if(hwloc_bitmap_isset((hwloc_cpuset_t)mask, i)) { \
hwloc_bitmap_clr((hwloc_cpuset_t)mask, i); \
} else { \
hwloc_bitmap_set((hwloc_cpuset_t)mask, i); \
} \
} \
} \
# define KMP_CPU_UNION(dest, src) hwloc_bitmap_or((hwloc_cpuset_t)dest, (hwloc_cpuset_t)dest, (hwloc_cpuset_t)src)
# define KMP_CPU_SET_ITERATE(i,mask) \
for(i = hwloc_bitmap_first((hwloc_cpuset_t)mask); (int)i != -1; i = hwloc_bitmap_next((hwloc_cpuset_t)mask, i))
# define KMP_CPU_ALLOC(ptr) ptr = (kmp_affin_mask_t*)hwloc_bitmap_alloc()
# define KMP_CPU_FREE(ptr) hwloc_bitmap_free((hwloc_bitmap_t)ptr);
# define KMP_CPU_ALLOC_ON_STACK(ptr) KMP_CPU_ALLOC(ptr)
# define KMP_CPU_FREE_FROM_STACK(ptr) KMP_CPU_FREE(ptr)
# define KMP_CPU_INTERNAL_ALLOC(ptr) KMP_CPU_ALLOC(ptr)
# define KMP_CPU_INTERNAL_FREE(ptr) KMP_CPU_FREE(ptr)
//
// The following macro should be used to index an array of masks.
// The array should be declared as "kmp_affin_mask_t *" and allocated with
// size "__kmp_affin_mask_size * len". The macro takes care of the fact
// that on Windows* OS, sizeof(kmp_affin_mask_t) is really the size of the mask, but
// on Linux* OS, sizeof(kmp_affin_mask_t) is 1.
//
# define KMP_CPU_INDEX(array,i) ((kmp_affin_mask_t*)(array[i]))
# define KMP_CPU_ALLOC_ARRAY(arr, n) { \
arr = (kmp_affin_mask_t *)__kmp_allocate(n*sizeof(kmp_affin_mask_t)); \
unsigned i; \
for(i=0;i<(unsigned)n;i++) { \
arr[i] = hwloc_bitmap_alloc(); \
} \
}
# define KMP_CPU_FREE_ARRAY(arr, n) { \
unsigned i; \
for(i=0;i<(unsigned)n;i++) { \
hwloc_bitmap_free(arr[i]); \
} \
__kmp_free(arr); \
}
# define KMP_CPU_INTERNAL_ALLOC_ARRAY(arr, n) { \
arr = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(n*sizeof(kmp_affin_mask_t)); \
unsigned i; \
for(i=0;i<(unsigned)n;i++) { \
arr[i] = hwloc_bitmap_alloc(); \
} \
}
# define KMP_CPU_INTERNAL_FREE_ARRAY(arr, n) { \
unsigned i; \
for(i=0;i<(unsigned)n;i++) { \
hwloc_bitmap_free(arr[i]); \
} \
KMP_INTERNAL_FREE(arr); \
}
#else /* KMP_USE_HWLOC */
# define KMP_CPU_SET_ITERATE(i,mask) \
for(i = 0; (size_t)i < KMP_CPU_SETSIZE; ++i)
# if KMP_OS_LINUX
//
// On Linux* OS, the mask is actually a vector of length __kmp_affin_mask_size
@ -526,7 +606,7 @@ typedef unsigned char kmp_affin_mask_t;
} \
}
# define KMP_CPU_COMPLEMENT(mask) \
# define KMP_CPU_COMPLEMENT(max_bit_number, mask) \
{ \
size_t __i; \
for (__i = 0; __i < __kmp_affin_mask_size; __i++) { \
@ -605,7 +685,7 @@ extern int __kmp_num_proc_groups;
} \
}
# define KMP_CPU_COMPLEMENT(mask) \
# define KMP_CPU_COMPLEMENT(max_bit_number, mask) \
{ \
int __i; \
for (__i = 0; __i < __kmp_num_proc_groups; __i++) { \
@ -637,7 +717,7 @@ extern kmp_SetThreadGroupAffinity_t __kmp_SetThreadGroupAffinity;
extern int __kmp_get_proc_group(kmp_affin_mask_t const *mask);
# else
# else /* KMP_GROUP_AFFINITY */
typedef DWORD kmp_affin_mask_t; /* for compatibility with older winbase.h */
@ -646,7 +726,7 @@ typedef DWORD kmp_affin_mask_t; /* for compatibility with older winbase.h */
# define KMP_CPU_CLR(i,mask) (*(mask) &= ~(((kmp_affin_mask_t)1) << (i)))
# define KMP_CPU_ZERO(mask) (*(mask) = 0)
# define KMP_CPU_COPY(dest, src) (*(dest) = *(src))
# define KMP_CPU_COMPLEMENT(mask) (*(mask) = ~*(mask))
# define KMP_CPU_COMPLEMENT(max_bit_number, mask) (*(mask) = ~*(mask))
# define KMP_CPU_UNION(dest, src) (*(dest) |= *(src))
# endif /* KMP_GROUP_AFFINITY */
@ -660,6 +740,10 @@ typedef DWORD kmp_affin_mask_t; /* for compatibility with older winbase.h */
# define KMP_CPU_ALLOC(ptr) \
(ptr = ((kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size)))
# define KMP_CPU_FREE(ptr) __kmp_free(ptr)
# define KMP_CPU_ALLOC_ON_STACK(ptr) (ptr = ((kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size)))
# define KMP_CPU_FREE_FROM_STACK(ptr) /* Nothing */
# define KMP_CPU_INTERNAL_ALLOC(ptr) (ptr = ((kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(__kmp_affin_mask_size)))
# define KMP_CPU_INTERNAL_FREE(ptr) KMP_INTERNAL_FREE(ptr)
//
// The following macro should be used to index an array of masks.
@ -670,6 +754,12 @@ typedef DWORD kmp_affin_mask_t; /* for compatibility with older winbase.h */
//
# define KMP_CPU_INDEX(array,i) \
((kmp_affin_mask_t *)(((char *)(array)) + (i) * __kmp_affin_mask_size))
# define KMP_CPU_ALLOC_ARRAY(arr, n) arr = (kmp_affin_mask_t *)__kmp_allocate(n * __kmp_affin_mask_size)
# define KMP_CPU_FREE_ARRAY(arr, n) __kmp_free(arr);
# define KMP_CPU_INTERNAL_ALLOC_ARRAY(arr, n) arr = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(n * __kmp_affin_mask_size)
# define KMP_CPU_INTERNAL_FREE_ARRAY(arr, n) KMP_INTERNAL_FREE(arr);
#endif /* KMP_USE_HWLOC */
//
// Declare local char buffers with this size for printing debug and info
@ -716,6 +806,9 @@ enum affinity_top_method {
affinity_top_method_group,
#endif /* KMP_GROUP_AFFINITY */
affinity_top_method_flat,
#if KMP_USE_HWLOC
affinity_top_method_hwloc,
#endif
affinity_top_method_default
};

View File

@ -50,6 +50,50 @@ void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
//
// Print the affinity mask to the character array in a pretty format.
//
#if KMP_USE_HWLOC
//
// Render "mask" into buf as "{<comma-separated proc list>}" using hwloc's
// list formatter.  If the full list does not fit in buf_len characters,
// the output is truncated on a number boundary and "...}" is appended.
// Returns buf.
//
char *
__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
{
    int num_chars_to_write, num_chars_written;
    char* scan;
    KMP_ASSERT(buf_len >= 40);

    // bufsize of 0 just retrieves the needed buffer size.
    num_chars_to_write = hwloc_bitmap_list_snprintf(buf, 0, (hwloc_bitmap_t)mask);

    // need '{', "xxxxxxxx...xx", '}', '\0' = num_chars_to_write + 3 bytes
    // * num_chars_to_write returned by hwloc_bitmap_list_snprintf does not
    //   take into account the '\0' character.
    if(hwloc_bitmap_iszero((hwloc_bitmap_t)mask)) {
        KMP_SNPRINTF(buf, buf_len, "{<empty>}");
    } else if(num_chars_to_write < buf_len - 3) {
        // no problem fitting the mask into buf_len number of characters
        buf[0] = '{';
        // use buf_len-3 because we have the three characters: '{' '}' '\0' to add to the buffer
        num_chars_written = hwloc_bitmap_list_snprintf(buf+1, buf_len-3, (hwloc_bitmap_t)mask);
        buf[num_chars_written+1] = '}';
        buf[num_chars_written+2] = '\0';
    } else {
        // Need to truncate the affinity mask string and add ellipsis.
        // To do this, we first write out the '{' + str(mask)
        buf[0] = '{';
        hwloc_bitmap_list_snprintf(buf+1, buf_len-7, (hwloc_bitmap_t)mask);
        // then, what we do here is go to the 7th to last character, then go backwards until we are NOT
        // on a digit then write "...}\0". This way it is a clean ellipsis addition and we don't
        // overwrite part of an affinity number. i.e., we avoid something like { 45, 67, 8...} and get
        // { 45, 67,...} instead.
        scan = buf + buf_len - 7;
        // NOTE(review): *scan is evaluated before the "scan >= buf" bounds
        // check; this is safe in practice only because buf[0] == '{' is not
        // a digit and stops the walk -- the condition order is fragile.
        while(*scan >= '0' && *scan <= '9' && scan >= buf)
            scan--;
        *(scan+1) = '.';
        *(scan+2) = '.';
        *(scan+3) = '.';
        *(scan+4) = '}';
        *(scan+5) = '\0';
    }
    return buf;
}
#else
char *
__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
{
@ -102,6 +146,7 @@ __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
KMP_ASSERT(scan <= end);
return buf;
}
#endif // KMP_USE_HWLOC
void
@ -263,6 +308,291 @@ __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
}
}
#if KMP_USE_HWLOC
//
// Build the address2os table (Address labels -> OS proc id) from the hwloc
// topology stored in __kmp_hwloc_topology.  On success, *address2os holds a
// newly allocated, sorted table and the map depth is returned.  Returns 0 on
// the trivial / affinity-none paths (with *address2os NULL or freed).
//
static int
__kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
    kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    __kmp_get_system_affinity(oldMask, TRUE);

    // Depths of the interesting object types within the hwloc topology tree.
    unsigned depth = hwloc_topology_get_depth(__kmp_hwloc_topology);
    int threadLevel = hwloc_get_type_depth(__kmp_hwloc_topology, HWLOC_OBJ_PU);
    int coreLevel = hwloc_get_type_depth(__kmp_hwloc_topology, HWLOC_OBJ_CORE);
    int pkgLevel = hwloc_get_type_depth(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET);
    __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 0;

    //
    // This makes an assumption about the topology being four levels:
    // machines -> packages -> cores -> hardware threads
    //
    // Packages = direct children of the root (machine) object.
    hwloc_obj_t current_level_iterator = hwloc_get_root_obj(__kmp_hwloc_topology);
    hwloc_obj_t child_iterator;
    for(child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, NULL);
        child_iterator != NULL;
        child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, child_iterator))
    {
        nPackages++;
    }
    // Cores per package, counted on the first package object only
    // (assumes a uniform topology -- TODO confirm for asymmetric machines).
    current_level_iterator = hwloc_get_obj_by_depth(__kmp_hwloc_topology, pkgLevel, 0);
    for(child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, NULL);
        child_iterator != NULL;
        child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, child_iterator))
    {
        nCoresPerPkg++;
    }
    // Hardware threads per core, counted on the first core object only.
    current_level_iterator = hwloc_get_obj_by_depth(__kmp_hwloc_topology, coreLevel, 0);
    for(child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, NULL);
        child_iterator != NULL;
        child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, child_iterator))
    {
        __kmp_nThreadsPerCore++;
    }

    if (! KMP_AFFINITY_CAPABLE())
    {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);
        __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        // NOTE(review): oldMask is not freed on this early return --
        // verify whether the leak is intentional.
        return 0;
    }

    //
    // Allocate the data structure to be returned.
    //
    AddrUnsPair *retval = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);

    // Walk every hardware thread (PU); keep only those present in fullMask
    // (the process affinity mask established by the caller).
    unsigned num_hardware_threads = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, threadLevel);
    unsigned i;
    hwloc_obj_t hardware_thread_iterator;
    int nActiveThreads = 0;
    for(i=0;i<num_hardware_threads;i++) {
        hardware_thread_iterator = hwloc_get_obj_by_depth(__kmp_hwloc_topology, threadLevel, i);
        Address addr(3);
        if(! KMP_CPU_ISSET(i, fullMask)) continue;
        // Labels are (package, core-in-package, thread-in-core), derived
        // from the logical indices of the PU and its two ancestors.
        addr.labels[0] = hardware_thread_iterator->parent->parent->logical_index;
        addr.labels[1] = hardware_thread_iterator->parent->logical_index % nCoresPerPkg;
        addr.labels[2] = hardware_thread_iterator->logical_index % __kmp_nThreadsPerCore;
        retval[nActiveThreads] = AddrUnsPair(addr, hardware_thread_iterator->os_index);
        nActiveThreads++;
    }

    //
    // If there's only one thread context to bind to, return now.
    //
    KMP_ASSERT(nActiveThreads > 0);
    if (nActiveThreads == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
            KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(retval);
            KMP_CPU_FREE(oldMask);
            return 0;
        }
        //
        // Form an Address object which only includes the package level.
        //
        Address addr(1);
        addr.labels[0] = retval[0].first.labels[pkgLevel-1];
        retval[0].first = addr;
        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }
        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
        }
        *address2os = retval;
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the table by physical Id.
    //
    qsort(retval, nActiveThreads, sizeof(*retval), __kmp_affinity_cmp_Address_labels);

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return if affinity is not enabled.
    //
    __kmp_ncores = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, coreLevel);

    //
    // Check to see if the machine topology is uniform
    //
    unsigned npackages = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, pkgLevel);
    unsigned ncores = __kmp_ncores;
    unsigned nthreads = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, threadLevel);
    unsigned uniform = (npackages * nCoresPerPkg * __kmp_nThreadsPerCore == nthreads);

    //
    // Print the machine topology summary.
    //
    if (__kmp_affinity_verbose) {
        char mask[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
        KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (uniform) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }
        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);
        __kmp_str_buf_print(&buf, "%d", npackages);
        //for (level = 1; level <= pkgLevel; level++) {
        //    __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
        // }
        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
        __kmp_str_buf_free(&buf);
    }

    if (__kmp_affinity_type == affinity_none) {
        // NOTE(review): retval is not freed on this path -- verify whether
        // the leak is intentional.
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Find any levels with radix 1, and remove them from the map
    // (except for the package level).
    //
    int new_depth = 0;
    int level;
    unsigned proc;
    for (level = 1; level < (int)depth; level++) {
        if ((hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology,level) == 1) && (level != pkgLevel)) {
            continue;
        }
        new_depth++;
    }

    //
    // If we are removing any levels, allocate a new vector to return,
    // and copy the relevant information to it.
    //
    if (new_depth != depth-1) {
        AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
          sizeof(AddrUnsPair) * nActiveThreads);
        for (proc = 0; (int)proc < nActiveThreads; proc++) {
            Address addr(new_depth);
            new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
        }
        int new_level = 0;
        for (level = 1; level < (int)depth; level++) {
            if ((hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology,level) == 1) && (level != pkgLevel)) {
                // A level is being dropped: shift the cached thread/core/pkg
                // level indices down so they stay consistent with the
                // reduced map.
                if (level == threadLevel) {
                    threadLevel = -1;
                }
                else if ((threadLevel >= 0) && (level < threadLevel)) {
                    threadLevel--;
                }
                if (level == coreLevel) {
                    coreLevel = -1;
                }
                else if ((coreLevel >= 0) && (level < coreLevel)) {
                    coreLevel--;
                }
                if (level < pkgLevel) {
                    pkgLevel--;
                }
                continue;
            }
            for (proc = 0; (int)proc < nActiveThreads; proc++) {
                new_retval[proc].first.labels[new_level]
                  = retval[proc].first.labels[level];
            }
            new_level++;
        }
        __kmp_free(retval);
        retval = new_retval;
        // NOTE(review): up to here 'depth' counted hwloc tree levels
        // (map levels + 1); after this assignment it holds the map level
        // count directly, yet the function still returns depth-1 below --
        // verify there is no off-by-one when levels are removed.
        depth = new_depth;
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
        //
        __kmp_affinity_gran_levels = 0;
        if ((threadLevel-1 >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
            __kmp_affinity_gran_levels++;
        }
        if ((coreLevel-1 >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
            __kmp_affinity_gran_levels++;
        }
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels++;
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(retval, nActiveThreads, depth-1, pkgLevel-1,
          coreLevel-1, threadLevel-1);
    }

    KMP_CPU_FREE(oldMask);
    *address2os = retval;
    if(depth == 0) return 0;
    else return depth-1;
}
#endif // KMP_USE_HWLOC
//
// If we don't know how to retrieve the machine's processor topology, or
@ -329,7 +659,7 @@ __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
int avail_ct = 0;
unsigned int i;
for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
KMP_CPU_SET_ITERATE(i, fullMask) {
//
// Skip this proc if it is not included in the machine model.
//
@ -394,7 +724,7 @@ __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
int avail_ct = 0;
int i;
for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
KMP_CPU_SET_ITERATE(i, fullMask) {
//
// Skip this proc if it is not included in the machine model.
//
@ -656,7 +986,7 @@ __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
__kmp_avail_proc * sizeof(apicThreadInfo));
unsigned nApics = 0;
for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
KMP_CPU_SET_ITERATE(i, fullMask) {
//
// Skip this proc if it is not included in the machine model.
//
@ -1167,7 +1497,7 @@ __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
//
unsigned int proc;
int nApics = 0;
for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
KMP_CPU_SET_ITERATE(proc, fullMask) {
//
// Skip this proc if it is not included in the machine model.
//
@ -2282,8 +2612,8 @@ __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
maxOsId = osId;
}
}
kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
(maxOsId + 1) * __kmp_affin_mask_size);
kmp_affin_mask_t *osId2Mask;
KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId+1));
//
// Sort the address2os table according to physical order. Doing so
@ -2314,8 +2644,8 @@ __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
unsigned j = 0; // index of 1st thread on core
unsigned leader = 0;
Address *leaderAddr = &(address2os[0].first);
kmp_affin_mask_t *sum
= (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
kmp_affin_mask_t *sum;
KMP_CPU_ALLOC_ON_STACK(sum);
KMP_CPU_ZERO(sum);
KMP_CPU_SET(address2os[0].second, sum);
for (i = 1; i < numAddrs; i++) {
@ -2365,6 +2695,7 @@ __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
address2os[j].first.leader = (j == leader);
}
unique++;
KMP_CPU_FREE_FROM_STACK(sum);
*maxIndex = maxOsId;
*numUnique = unique;
@ -2384,9 +2715,17 @@ static int nextNewMask;
#define ADD_MASK(_mask) \
{ \
if (nextNewMask >= numNewMasks) { \
int i; \
numNewMasks *= 2; \
newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
numNewMasks * __kmp_affin_mask_size); \
kmp_affin_mask_t* temp; \
KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \
for(i=0;i<numNewMasks/2;i++) { \
kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i); \
kmp_affin_mask_t* dest = KMP_CPU_INDEX(temp, i); \
KMP_CPU_COPY(dest, src); \
} \
KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks/2); \
newMasks = temp; \
} \
KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
nextNewMask++; \
@ -2416,6 +2755,7 @@ __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
unsigned int *out_numMasks, const char *proclist,
kmp_affin_mask_t *osId2Mask, int maxOsId)
{
int i;
const char *scan = proclist;
const char *next = proclist;
@ -2424,11 +2764,10 @@ __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
// so that we can use realloc() to extend it.
//
numNewMasks = 2;
newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
* __kmp_affin_mask_size);
KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
nextNewMask = 0;
kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
__kmp_affin_mask_size);
kmp_affin_mask_t *sumMask;
KMP_CPU_ALLOC(sumMask);
int setSize = 0;
for (;;) {
@ -2632,14 +2971,17 @@ __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
*out_numMasks = nextNewMask;
if (nextNewMask == 0) {
*out_masks = NULL;
KMP_INTERNAL_FREE(newMasks);
KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
return;
}
*out_masks
= (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
__kmp_free(sumMask);
KMP_INTERNAL_FREE(newMasks);
KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
for(i = 0; i < nextNewMask; i++) {
kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i);
kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i);
KMP_CPU_COPY(dest, src);
}
KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
KMP_CPU_FREE(sumMask);
}
@ -2834,7 +3176,7 @@ __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
else if (**scan == '!') {
(*scan)++; // skip '!'
__kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
KMP_CPU_COMPLEMENT(tempMask);
KMP_CPU_COMPLEMENT(maxOsId, tempMask);
}
else if ((**scan >= '0') && (**scan <= '9')) {
next = *scan;
@ -2866,17 +3208,23 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
unsigned int *out_numMasks, const char *placelist,
kmp_affin_mask_t *osId2Mask, int maxOsId)
{
int i,j,count,stride,sign;
const char *scan = placelist;
const char *next = placelist;
numNewMasks = 2;
newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
* __kmp_affin_mask_size);
KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
nextNewMask = 0;
kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
__kmp_affin_mask_size);
// tempMask is modified based on the previous or initial
// place to form the current place
// previousMask contains the previous place
kmp_affin_mask_t *tempMask;
kmp_affin_mask_t *previousMask;
KMP_CPU_ALLOC(tempMask);
KMP_CPU_ZERO(tempMask);
KMP_CPU_ALLOC(previousMask);
KMP_CPU_ZERO(previousMask);
int setSize = 0;
for (;;) {
@ -2910,7 +3258,7 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
"bad explicit places list");
next = scan;
SKIP_DIGITS(next);
int count = __kmp_str_to_int(scan, *next);
count = __kmp_str_to_int(scan, *next);
KMP_ASSERT(count >= 0);
scan = next;
@ -2918,7 +3266,6 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
// valid follow sets are ',' ':' and EOL
//
SKIP_WS(scan);
int stride;
if (*scan == '\0' || *scan == ',') {
stride = +1;
}
@ -2929,7 +3276,7 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
//
// Read stride parameter
//
int sign = +1;
sign = +1;
for (;;) {
SKIP_WS(scan);
if (*scan == '+') {
@ -2954,66 +3301,30 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
stride *= sign;
}
if (stride > 0) {
int i;
for (i = 0; i < count; i++) {
int j;
if (setSize == 0) {
break;
}
ADD_MASK(tempMask);
setSize = 0;
for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
if (! KMP_CPU_ISSET(j - stride, tempMask)) {
KMP_CPU_CLR(j, tempMask);
}
else if ((j > maxOsId) ||
(! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
&& (__kmp_affinity_type != affinity_none))) && i < count - 1) {
KMP_WARNING(AffIgnoreInvalidProcID, j);
}
KMP_CPU_CLR(j, tempMask);
}
else {
KMP_CPU_SET(j, tempMask);
setSize++;
}
}
for (; j >= 0; j--) {
KMP_CPU_CLR(j, tempMask);
}
// Add places determined by initial_place : count : stride
for (i = 0; i < count; i++) {
if (setSize == 0) {
break;
}
}
else {
int i;
for (i = 0; i < count; i++) {
int j;
if (setSize == 0) {
break;
// Add the current place, then build the next place (tempMask) from that
KMP_CPU_COPY(previousMask, tempMask);
ADD_MASK(previousMask);
KMP_CPU_ZERO(tempMask);
setSize = 0;
KMP_CPU_SET_ITERATE(j, previousMask) {
if (! KMP_CPU_ISSET(j, previousMask)) {
continue;
}
ADD_MASK(tempMask);
setSize = 0;
for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride;
j++) {
if (! KMP_CPU_ISSET(j - stride, tempMask)) {
KMP_CPU_CLR(j, tempMask);
}
else if ((j > maxOsId) ||
(! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
&& (__kmp_affinity_type != affinity_none))) && i < count - 1) {
KMP_WARNING(AffIgnoreInvalidProcID, j);
}
KMP_CPU_CLR(j, tempMask);
}
else {
KMP_CPU_SET(j, tempMask);
setSize++;
else if ((j+stride > maxOsId) || (j+stride < 0) ||
(! KMP_CPU_ISSET(j+stride, KMP_CPU_INDEX(osId2Mask, j+stride)))) {
if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
&& (__kmp_affinity_type != affinity_none))) && i < count - 1) {
KMP_WARNING(AffIgnoreInvalidProcID, j+stride);
}
}
for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) {
KMP_CPU_CLR(j, tempMask);
else {
KMP_CPU_SET(j+stride, tempMask);
setSize++;
}
}
}
@ -3038,14 +3349,18 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
*out_numMasks = nextNewMask;
if (nextNewMask == 0) {
*out_masks = NULL;
KMP_INTERNAL_FREE(newMasks);
KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
return;
}
*out_masks
= (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
__kmp_free(tempMask);
KMP_INTERNAL_FREE(newMasks);
KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
KMP_CPU_FREE(tempMask);
KMP_CPU_FREE(previousMask);
for(i = 0; i < nextNewMask; i++) {
kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i);
kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i);
KMP_CPU_COPY(dest, src);
}
KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
}
# endif /* OMP_40_ENABLED */
@ -3140,7 +3455,7 @@ __kmp_aux_affinity_initialize(void)
// processors that we know about on the machine.
//
if (fullMask == NULL) {
fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
KMP_CPU_ALLOC(fullMask);
}
if (KMP_AFFINITY_CAPABLE()) {
if (__kmp_affinity_respect_mask) {
@ -3151,7 +3466,7 @@ __kmp_aux_affinity_initialize(void)
//
unsigned i;
__kmp_avail_proc = 0;
for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
KMP_CPU_SET_ITERATE(i, fullMask) {
if (! KMP_CPU_ISSET(i, fullMask)) {
continue;
}
@ -3193,39 +3508,60 @@ __kmp_aux_affinity_initialize(void)
//
const char *file_name = NULL;
int line = 0;
# if KMP_USE_HWLOC
if (depth < 0) {
if (__kmp_affinity_verbose) {
KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
}
if(!__kmp_hwloc_error) {
depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
if (depth == 0) {
KMP_ASSERT(__kmp_affinity_type == affinity_none);
KMP_ASSERT(address2os == NULL);
return;
} else if(depth < 0 && __kmp_affinity_verbose) {
KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
}
} else if(__kmp_affinity_verbose) {
KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
}
}
# endif
# if KMP_ARCH_X86 || KMP_ARCH_X86_64
if (__kmp_affinity_verbose) {
KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
}
file_name = NULL;
depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
if (depth == 0) {
KMP_ASSERT(__kmp_affinity_type == affinity_none);
KMP_ASSERT(address2os == NULL);
return;
}
if (depth < 0) {
if (__kmp_affinity_verbose) {
if (msg_id != kmp_i18n_null) {
KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
KMP_I18N_STR(DecodingLegacyAPIC));
}
else {
KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
}
KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
}
file_name = NULL;
depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
if (depth == 0) {
KMP_ASSERT(__kmp_affinity_type == affinity_none);
KMP_ASSERT(address2os == NULL);
return;
}
if (depth < 0) {
if (__kmp_affinity_verbose) {
if (msg_id != kmp_i18n_null) {
KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
KMP_I18N_STR(DecodingLegacyAPIC));
}
else {
KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
}
}
file_name = NULL;
depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
if (depth == 0) {
KMP_ASSERT(__kmp_affinity_type == affinity_none);
KMP_ASSERT(address2os == NULL);
return;
}
}
}
# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
@ -3430,6 +3766,50 @@ __kmp_aux_affinity_initialize(void)
KMP_ASSERT(address2os != NULL);
}
# if KMP_USE_HWLOC
else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
if (__kmp_affinity_verbose) {
KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
}
depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
if (depth == 0) {
KMP_ASSERT(__kmp_affinity_type == affinity_none);
KMP_ASSERT(address2os == NULL);
return;
}
# if KMP_DEBUG
AddrUnsPair *otheraddress2os = NULL;
int otherdepth = -1;
# if KMP_MIC
otherdepth = __kmp_affinity_create_apicid_map(&otheraddress2os, &msg_id);
# else
otherdepth = __kmp_affinity_create_x2apicid_map(&otheraddress2os, &msg_id);
# endif
if(otheraddress2os != NULL && address2os != NULL) {
int i;
unsigned arent_equal_flag = 0;
for(i=0;i<__kmp_avail_proc;i++) {
if(otheraddress2os[i] != address2os[i]) arent_equal_flag = 1;
}
if(arent_equal_flag) {
KA_TRACE(10, ("__kmp_aux_affinity_initialize: Hwloc affinity places are different from APICID\n"));
KA_TRACE(10, ("__kmp_aux_affinity_initialize: APICID Table:\n"));
for(i=0;i<__kmp_avail_proc;i++) {
otheraddress2os[i].print(); __kmp_printf("\n");
}
KA_TRACE(10, ("__kmp_aux_affinity_initialize: Hwloc Table:\n"));
for(i=0;i<__kmp_avail_proc;i++) {
address2os[i].print(); __kmp_printf("\n");
}
}
else {
KA_TRACE(10, ("__kmp_aux_affinity_initialize: Hwloc affinity places are same as APICID\n"));
}
}
# endif // KMP_DEBUG
}
# endif // KMP_USE_HWLOC
if (address2os == NULL) {
if (KMP_AFFINITY_CAPABLE()
&& (__kmp_affinity_verbose || (__kmp_affinity_warnings
@ -3608,8 +3988,7 @@ __kmp_aux_affinity_initialize(void)
}
# endif
__kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
__kmp_affinity_num_masks * __kmp_affin_mask_size);
KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
//
// Sort the address2os table according to the current setting of
@ -3679,7 +4058,7 @@ void
__kmp_affinity_uninitialize(void)
{
if (__kmp_affinity_masks != NULL) {
__kmp_free(__kmp_affinity_masks);
KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
__kmp_affinity_masks = NULL;
}
if (fullMask != NULL) {
@ -3909,7 +4288,7 @@ __kmp_aux_set_affinity(void **mask)
unsigned proc;
int num_procs = 0;
for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t*)(*mask))) {
if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
continue;
}
@ -4027,7 +4406,11 @@ __kmp_aux_set_affinity_mask_proc(int proc, void **mask)
}
}
if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
if ((proc < 0)
# if !KMP_USE_HWLOC
|| ((unsigned)proc >= KMP_CPU_SETSIZE)
# endif
) {
return -1;
}
if (! KMP_CPU_ISSET(proc, fullMask)) {
@ -4063,7 +4446,11 @@ __kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
}
}
if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
if ((proc < 0)
# if !KMP_USE_HWLOC
|| ((unsigned)proc >= KMP_CPU_SETSIZE)
# endif
) {
return -1;
}
if (! KMP_CPU_ISSET(proc, fullMask)) {
@ -4099,8 +4486,12 @@ __kmp_aux_get_affinity_mask_proc(int proc, void **mask)
}
}
if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
return 0;
if ((proc < 0)
# if !KMP_USE_HWLOC
|| ((unsigned)proc >= KMP_CPU_SETSIZE)
# endif
) {
return -1;
}
if (! KMP_CPU_ISSET(proc, fullMask)) {
return 0;
@ -4137,7 +4528,8 @@ void __kmp_balanced_affinity( int tid, int nthreads )
KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
"Illegal set affinity operation when not capable");
kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
kmp_affin_mask_t *mask;
KMP_CPU_ALLOC_ON_STACK(mask);
KMP_CPU_ZERO(mask);
// Granularity == thread
@ -4158,9 +4550,11 @@ void __kmp_balanced_affinity( int tid, int nthreads )
tid, buf);
}
__kmp_set_system_affinity( mask, TRUE );
KMP_CPU_FREE_FROM_STACK(mask);
} else { // Non-uniform topology
kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
kmp_affin_mask_t *mask;
KMP_CPU_ALLOC_ON_STACK(mask);
KMP_CPU_ZERO(mask);
// Number of hyper threads per core in HT machine
@ -4334,6 +4728,7 @@ void __kmp_balanced_affinity( int tid, int nthreads )
tid, buf);
}
__kmp_set_system_affinity( mask, TRUE );
KMP_CPU_FREE_FROM_STACK(mask);
}
}

View File

@ -57,6 +57,13 @@ public:
// Inequality: defined as the negation of operator==.
bool operator!=(const Address &b) const {
    return !(*this == b);
}
// Debug aid: dump this address's depth followed by each topology label
// to stdout. No trailing newline is emitted; the caller supplies it.
void print() const {
    printf("Depth: %u --- ", depth);
    for (unsigned level = 0; level < depth; ++level) {
        printf("%u ", labels[level]);
    }
}
};
class AddrUnsPair {
@ -72,6 +79,18 @@ public:
second = b.second;
return *this;
}
// Debug aid: dump the (Address, unsigned) pair to stdout as
// "first = <address> --- second = <n>". No trailing newline.
void print() const {
    printf("first = ");
    first.print();
    printf(" --- second = %u", second);
}
// Two pairs are equal iff both the Address and the unsigned match.
bool operator==(const AddrUnsPair &b) const {
    return (first == b.first) && (second == b.second);
}
// Inequality: defined as the negation of operator==.
bool operator!=(const AddrUnsPair &b) const {
    return !(*this == b);
}
};

View File

@ -51,6 +51,8 @@
#cmakedefine01 LIBOMP_ENABLE_ASSERTIONS
#define KMP_USE_ASSERT LIBOMP_ENABLE_ASSERTIONS
#cmakedefine01 STUBS_LIBRARY
#cmakedefine01 LIBOMP_USE_HWLOC
#define KMP_USE_HWLOC LIBOMP_USE_HWLOC
#define KMP_ARCH_STR "@LIBOMP_LEGAL_ARCH@"
#define KMP_LIBRARY_FILE "@LIBOMP_LIB_FILE@"
#define KMP_VERSION_MAJOR @LIBOMP_VERSION_MAJOR@

View File

@ -257,7 +257,7 @@ FTN_GET_AFFINITY_MAX_PROC( void )
return 0;
}
#if KMP_GROUP_AFFINITY
#if KMP_GROUP_AFFINITY && !KMP_USE_HWLOC
if ( __kmp_num_proc_groups > 1 ) {
return (int)KMP_CPU_SETSIZE;
}
@ -278,7 +278,11 @@ FTN_CREATE_AFFINITY_MASK( void **mask )
if ( ! TCR_4(__kmp_init_middle) ) {
__kmp_middle_initialize();
}
# if KMP_USE_HWLOC
*mask = (hwloc_cpuset_t)hwloc_bitmap_alloc();
# else
*mask = kmpc_malloc( __kmp_affin_mask_size );
# endif
KMP_CPU_ZERO( (kmp_affin_mask_t *)(*mask) );
#endif
}
@ -300,7 +304,11 @@ FTN_DESTROY_AFFINITY_MASK( void **mask )
KMP_FATAL( AffinityInvalidMask, "kmp_destroy_affinity_mask" );
}
}
# if KMP_USE_HWLOC
hwloc_bitmap_free((hwloc_cpuset_t)(*mask));
# else
kmpc_free( *mask );
# endif
*mask = NULL;
#endif
}

View File

@ -33,6 +33,10 @@ __thread kmp_stats_list* __kmp_stats_thread_ptr = &__kmp_stats_list;
// gives reference tick for all events (considered the 0 tick)
tsc_tick_count __kmp_stats_start_time;
#endif
#if KMP_USE_HWLOC
int __kmp_hwloc_error = FALSE;
hwloc_topology_t __kmp_hwloc_topology = NULL;
#endif
/* ----------------------------------------------------- */
/* INITIALIZATION VARIABLES */

View File

@ -3009,6 +3009,11 @@ __kmp_stg_parse_topology_method( char const * name, char const * value,
else if ( __kmp_str_match( "flat", 1, value ) ) {
__kmp_affinity_top_method = affinity_top_method_flat;
}
# if KMP_USE_HWLOC
else if ( __kmp_str_match( "hwloc", 1, value) ) {
__kmp_affinity_top_method = affinity_top_method_hwloc;
}
# endif
else {
KMP_WARNING( StgInvalidValue, name, value );
}
@ -5119,11 +5124,43 @@ __kmp_env_initialize( char const * string ) {
// affinity.
//
const char *var = "KMP_AFFINITY";
# if KMP_USE_HWLOC
if(hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
__kmp_hwloc_error = TRUE;
if(__kmp_affinity_verbose)
KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
}
hwloc_topology_ignore_type(__kmp_hwloc_topology, HWLOC_OBJ_CACHE);
# endif
if ( __kmp_affinity_type == affinity_disabled ) {
KMP_AFFINITY_DISABLE();
}
else if ( ! KMP_AFFINITY_CAPABLE() ) {
# if KMP_USE_HWLOC
const hwloc_topology_support* topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
if(hwloc_topology_load(__kmp_hwloc_topology) < 0) {
__kmp_hwloc_error = TRUE;
if(__kmp_affinity_verbose)
KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
}
// Is the system capable of setting/getting this thread's affinity?
// also, is topology discovery possible? (pu indicates ability to discover processing units)
// and finally, were there no errors when calling any hwloc_* API functions?
if(topology_support->cpubind->set_thisthread_cpubind &&
topology_support->cpubind->get_thisthread_cpubind &&
topology_support->discovery->pu &&
!__kmp_hwloc_error)
{
// enables affinity according to KMP_AFFINITY_CAPABLE() macro
KMP_AFFINITY_ENABLE(TRUE);
} else {
// indicate that hwloc didn't work and disable affinity
__kmp_hwloc_error = TRUE;
KMP_AFFINITY_DISABLE();
}
# else
__kmp_affinity_determine_capable( var );
# endif // KMP_USE_HWLOC
if ( ! KMP_AFFINITY_CAPABLE() ) {
if ( __kmp_affinity_verbose || ( __kmp_affinity_warnings
&& ( __kmp_affinity_type != affinity_default )

View File

@ -175,8 +175,11 @@ __kmp_set_system_affinity( kmp_affin_mask_t const *mask, int abort_on_error )
{
KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
"Illegal set affinity operation when not capable");
#if KMP_USE_HWLOC
int retval = hwloc_set_cpubind(__kmp_hwloc_topology, (hwloc_cpuset_t)mask, HWLOC_CPUBIND_THREAD);
#else
int retval = syscall( __NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask );
#endif
if (retval >= 0) {
return 0;
}
@ -198,7 +201,11 @@ __kmp_get_system_affinity( kmp_affin_mask_t *mask, int abort_on_error )
KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
"Illegal get affinity operation when not capable");
#if KMP_USE_HWLOC
int retval = hwloc_get_cpubind(__kmp_hwloc_topology, (hwloc_cpuset_t)mask, HWLOC_CPUBIND_THREAD);
#else
int retval = syscall( __NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask );
#endif
if (retval >= 0) {
return 0;
}
@ -220,10 +227,12 @@ __kmp_affinity_bind_thread( int which )
KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
"Illegal set affinity operation when not capable");
kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
kmp_affin_mask_t *mask;
KMP_CPU_ALLOC_ON_STACK(mask);
KMP_CPU_ZERO(mask);
KMP_CPU_SET(which, mask);
__kmp_set_system_affinity(mask, TRUE);
KMP_CPU_FREE_FROM_STACK(mask);
}
/*

View File

@ -1,12 +1,23 @@
# CMakeLists.txt file for unit testing OpenMP Library
include(FindPythonInterp)
include(CheckTypeSize)
if(NOT PYTHONINTERP_FOUND)
libomp_warning_say("Could not find Python.")
libomp_warning_say("The check-libomp target will not be available!")
return()
endif()
# Rewrite a CMake truthy/falsy variable in place as the Python literal
# True or False, so it can be substituted verbatim into the lit site
# configuration. A macro (not a function) is used deliberately so the
# assignment lands in the caller's scope.
macro(pythonize_bool var)
  if(${var})
    set(${var} True)
  else()
    set(${var} False)
  endif()
endmacro()
pythonize_bool(LIBOMP_USE_HWLOC)
set(LIBOMP_TEST_CFLAGS "" CACHE STRING
"Extra compiler flags to send to the test compiler")

View File

@ -9,11 +9,20 @@ if 'PYLINT_IMPORT' in os.environ:
config = object()
lit_config = object()
def append_dynamic_library_path(name, value, sep):
if name in config.environment:
config.environment[name] = value + sep + config.environment[name]
def append_dynamic_library_path(path):
    """Prepend *path* to the platform's dynamic-library search variable.

    Chooses PATH (';'-separated) on Windows, DYLD_LIBRARY_PATH on Darwin,
    and LD_LIBRARY_PATH on other Unices, then prepends *path* to the
    existing value in config.environment (or sets it if absent).
    """
    if config.operating_system == 'Windows':
        name = 'PATH'
        sep = ';'
    elif config.operating_system == 'Darwin':
        name = 'DYLD_LIBRARY_PATH'
        sep = ':'
    else:
        # All other Unices use LD_LIBRARY_PATH.
        # (Fix: the rendered diff interleaved a stray leftover line from the
        # deleted old implementation inside this branch; it is removed here.)
        name = 'LD_LIBRARY_PATH'
        sep = ':'
    if name in config.environment:
        config.environment[name] = path + sep + config.environment[name]
    else:
        config.environment[name] = path
# name: The name of this test suite.
config.name = 'libomp'
@ -38,13 +47,15 @@ config.test_cflags = config.test_openmp_flag + \
" " + config.test_extra_cflags
# Setup environment to find dynamic library at runtime
if config.operating_system == 'Windows':
append_dynamic_library_path('PATH', config.library_dir, ";")
elif config.operating_system == 'Darwin':
append_dynamic_library_path('DYLD_LIBRARY_PATH', config.library_dir, ":")
append_dynamic_library_path(config.library_dir)
if config.using_hwloc:
append_dynamic_library_path(config.hwloc_library_dir)
# Rpath modifications for Darwin
if config.operating_system == 'Darwin':
config.test_cflags += " -Wl,-rpath," + config.library_dir
else: # Unices
append_dynamic_library_path('LD_LIBRARY_PATH', config.library_dir, ":")
if config.using_hwloc:
config.test_cflags += " -Wl,-rpath," + config.hwloc_library_dir
# substitutions
config.substitutions.append(("%libomp-compile-and-run", \

View File

@ -7,6 +7,8 @@ config.libomp_obj_root = "@CMAKE_CURRENT_BINARY_DIR@"
config.library_dir = "@LIBOMP_LIBRARY_DIR@"
config.omp_header_directory = "@LIBOMP_BINARY_DIR@/src"
config.operating_system = "@CMAKE_SYSTEM_NAME@"
config.hwloc_library_dir = "@LIBOMP_HWLOC_LIBRARY_DIR@"
config.using_hwloc = @LIBOMP_USE_HWLOC@
# Let the main config do the real work.
lit_config.load_config(config, "@LIBOMP_BASE_DIR@/test/lit.cfg")