[OpenMP] Refactor/Rework topology discovery code

This patch does the following:

1) Introduce kmp_topology_t as the runtime-friendly structure (the
corresponding global variable is __kmp_topology) to determine the
exact machine topology which can vary widely among current and future
architectures. The current design is not easy to expand beyond the assumed
three-layer topology (sockets, cores, and threads), so a rework capable of
using the existing KMP_AFFINITY mechanisms is required.

This new topology structure has:
* The depth and types of the topology
* Ratio count for each consecutive level (e.g., number of cores per
   socket, number of threads per core)
* Absolute count for each level (e.g., 2 sockets, 16 cores, 32 threads)
* Equivalent topology layer map (e.g., Numa domain is equivalent to
   socket, L1/L2 cache equivalent to core)
* Whether it is uniform or not

The hardware threads are represented with the kmp_hw_thread_t
structure. This structure contains the ids (e.g., socket 0, core 1,
thread 0) and other information grabbed from the previous Address
structure. The kmp_topology_t structure contains an array of these.

2) Generalize the KMP_HW_SUBSET environment variable for the new
kmp_topology_t structure. The algorithm doesn't assume any order among
tiles, numa domains, sockets, cores, and threads. Instead it just parses the
environment variable, makes sure it is consistent with the detected topology
(including taking into account equivalent layers) and then trims away
the unneeded subset of hardware threads. To enable this, a new
kmp_hw_subset_t structure is introduced which contains a vector of
items (hardware type, number user wants, offset). Any keyword within
__kmp_hw_get_keyword() can be used as a name and can be shortened as
well. e.g.,
KMP_HW_SUBSET=1s,2numa,4tile,2c,3t can be used on the KNL SNC-4 machine.

3) Simplify topology detection functions so they only do the singular
task of detecting the machine's topology. Printing and all
canonicalizing functionality are now done afterwards, so many lines of
duplicated code are eliminated.

4) Add new ll_caches and numa_domains to OMP_PLACES, and
consequently, KMP_AFFINITY's granularity setting. All the names within
__kmp_hw_get_keyword() are available for use in OMP_PLACES or
KMP_AFFINITY's granularity setting.

5) Simplify and future-proof code that used explicit lists of allowed
affinity settings keywords inside if() conditions.

6) Add x86 CPUID leaf 4 cache detection to existing x2apic id method
so equivalent caches could be detected (in particular for the ll_caches
place).

Differential Revision: https://reviews.llvm.org/D100997
This commit is contained in:
Peyton, Jonathan L 2021-04-16 16:30:26 -05:00
parent 32b500431c
commit 9982f33e2c
10 changed files with 2274 additions and 2545 deletions

View File

@ -124,6 +124,9 @@ ProcGroup "processor group"
ProcGroups "processor groups"
Unknown "unknown"
NoLeaf31Support "cpuid leaf 31 not supported"
HwlocFailed "Hwloc api failure"
LLCache "LL cache"
LLCaches "LL caches"
@ -355,6 +358,7 @@ OmptOutdatedWorkshare "OMPT: Cannot determine workshare type; using the d
"This issue is fixed in an up-to-date compiler."
OmpNoAllocator "Allocator %1$s is not available, will use default allocator."
TopologyGeneric "%1$s: %2$s (%3$d total cores)"
AffGranularityBad "%1$s: granularity setting: %2$s does not exist in topology. Using granularity=%3$s instead."
# --- OpenMP errors detected at runtime ---
#
@ -458,6 +462,11 @@ AffNotUsingHwloc "%1$s: Affinity not capable, using hwloc."
UserDirectedError "%1$s: Encountered user-directed error: %2$s."
UserDirectedWarning "%1$s: Encountered user-directed warning: %2$s."
FailedToCreateTeam "Failed to create teams between lower bound (%1$d) and upper bound (%2$d)."
AffHWSubsetManyGeneric "KMP_HW_SUBSET ignored: %1$s: too many requested."
AffHWSubsetNotExistGeneric "KMP_HW_SUBSET ignored: %1$s: level not detected in machine topology."
AffHWSubsetEqvLayers "KMP_HW_SUBSET ignored: %1$s, %2$s: layers are equivalent, please only specify one."
AffHWSubsetOutOfOrder "KMP_HW_SUBSET ignored: %1$s layer should come after %2$s."
AffEqualTopologyTypes "%1$s: topology layer \"%2$s\" is equivalent to \"%3$s\"."
# --------------------------------------------------------------------------------------------------
-*- HINTS -*-

View File

@ -597,11 +597,11 @@ typedef int PACKED_REDUCTION_METHOD_T;
enum kmp_hw_t : int {
KMP_HW_UNKNOWN = -1,
KMP_HW_MACHINE = 0,
KMP_HW_SOCKET,
KMP_HW_SOCKET = 0,
KMP_HW_PROC_GROUP,
KMP_HW_NUMA,
KMP_HW_DIE,
KMP_HW_LLC,
KMP_HW_L3,
KMP_HW_TILE,
KMP_HW_MODULE,
@ -612,13 +612,16 @@ enum kmp_hw_t : int {
KMP_HW_LAST
};
#define KMP_ASSERT_VALID_HW_TYPE(type) \
#define KMP_DEBUG_ASSERT_VALID_HW_TYPE(type) \
KMP_DEBUG_ASSERT(type >= (kmp_hw_t)0 && type < KMP_HW_LAST)
#define KMP_ASSERT_VALID_HW_TYPE(type) \
KMP_ASSERT(type >= (kmp_hw_t)0 && type < KMP_HW_LAST)
#define KMP_FOREACH_HW_TYPE(type) \
for (kmp_hw_t type = (kmp_hw_t)0; type < KMP_HW_LAST; \
type = (kmp_hw_t)((int)type + 1))
const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural = false);
const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural = false);
/* Only Linux* OS and Windows* OS support thread affinity. */
@ -655,8 +658,6 @@ extern kmp_SetThreadGroupAffinity_t __kmp_SetThreadGroupAffinity;
#if KMP_USE_HWLOC
extern hwloc_topology_t __kmp_hwloc_topology;
extern int __kmp_hwloc_error;
extern int __kmp_numa_detected;
extern int __kmp_tile_depth;
#endif
extern size_t __kmp_affin_mask_size;
@ -784,23 +785,6 @@ enum affinity_type {
affinity_default
};
enum affinity_gran {
affinity_gran_fine = 0,
affinity_gran_thread,
affinity_gran_core,
affinity_gran_tile,
affinity_gran_die,
affinity_gran_numa,
affinity_gran_package,
affinity_gran_node,
#if KMP_GROUP_AFFINITY
// The "group" granularity isn't necessarily coarser than all of the
// other levels, but we put it last in the enum.
affinity_gran_group,
#endif /* KMP_GROUP_AFFINITY */
affinity_gran_default
};
enum affinity_top_method {
affinity_top_method_all = 0, // try all (supported) methods, in order
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
@ -822,7 +806,7 @@ enum affinity_top_method {
#define affinity_respect_mask_default (-1)
extern enum affinity_type __kmp_affinity_type; /* Affinity type */
extern enum affinity_gran __kmp_affinity_gran; /* Affinity granularity */
extern kmp_hw_t __kmp_affinity_gran; /* Affinity granularity */
extern int __kmp_affinity_gran_levels; /* corresponding int value */
extern int __kmp_affinity_dups; /* Affinity duplicate masks */
extern enum affinity_top_method __kmp_affinity_top_method;

File diff suppressed because it is too large Load Diff

View File

@ -598,91 +598,274 @@ class KMPNativeAffinity : public KMPAffinity {
#endif /* KMP_OS_WINDOWS */
#endif /* KMP_AFFINITY_SUPPORTED */
class Address {
class kmp_hw_thread_t {
public:
static const unsigned maxDepth = 32;
unsigned labels[maxDepth];
unsigned childNums[maxDepth];
unsigned depth;
unsigned leader;
Address(unsigned _depth) : depth(_depth), leader(FALSE) {}
Address &operator=(const Address &b) {
depth = b.depth;
for (unsigned i = 0; i < depth; i++) {
labels[i] = b.labels[i];
childNums[i] = b.childNums[i];
}
leader = FALSE;
return *this;
}
bool operator==(const Address &b) const {
if (depth != b.depth)
return false;
for (unsigned i = 0; i < depth; i++)
if (labels[i] != b.labels[i])
return false;
return true;
}
bool isClose(const Address &b, int level) const {
if (depth != b.depth)
return false;
if ((unsigned)level >= depth)
return true;
for (unsigned i = 0; i < (depth - level); i++)
if (labels[i] != b.labels[i])
return false;
return true;
}
bool operator!=(const Address &b) const { return !operator==(b); }
void print() const {
unsigned i;
printf("Depth: %u --- ", depth);
for (i = 0; i < depth; i++) {
printf("%u ", labels[i]);
}
static const int UNKNOWN_ID = -1;
static int compare_ids(const void *a, const void *b);
static int compare_compact(const void *a, const void *b);
int ids[KMP_HW_LAST];
int sub_ids[KMP_HW_LAST];
bool leader;
int os_id;
void print() const;
void clear() {
for (int i = 0; i < (int)KMP_HW_LAST; ++i)
ids[i] = UNKNOWN_ID;
leader = false;
}
};
class AddrUnsPair {
public:
Address first;
unsigned second;
AddrUnsPair(Address _first, unsigned _second)
: first(_first), second(_second) {}
AddrUnsPair &operator=(const AddrUnsPair &b) {
first = b.first;
second = b.second;
return *this;
}
void print() const {
printf("first = ");
first.print();
printf(" --- second = %u", second);
}
bool operator==(const AddrUnsPair &b) const {
if (first != b.first)
return false;
if (second != b.second)
return false;
return true;
}
bool operator!=(const AddrUnsPair &b) const { return !operator==(b); }
class kmp_topology_t {
struct flags_t {
int uniform : 1;
int reserved : 31;
};
static int __kmp_affinity_cmp_Address_labels(const void *a, const void *b) {
const Address *aa = &(((const AddrUnsPair *)a)->first);
const Address *bb = &(((const AddrUnsPair *)b)->first);
unsigned depth = aa->depth;
unsigned i;
KMP_DEBUG_ASSERT(depth == bb->depth);
for (i = 0; i < depth; i++) {
if (aa->labels[i] < bb->labels[i])
int depth;
// The following arrays are all 'depth' long
// Ordered array of the types in the topology
kmp_hw_t *types;
// Keep quick topology ratios, for non-uniform topologies,
// this ratio holds the max number of itemAs per itemB
// e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
int *ratio;
// Storage containing the absolute number of each topology layer
int *count;
// The hardware threads array
// hw_threads is num_hw_threads long
// Each hw_thread's ids and sub_ids are depth deep
int num_hw_threads;
kmp_hw_thread_t *hw_threads;
// Equivalence hash where the key is the hardware topology item
// and the value is the equivalent hardware topology type in the
// types[] array, if the value is KMP_HW_UNKNOWN, then there is no
// known equivalence for the topology type
kmp_hw_t equivalent[KMP_HW_LAST];
// Flags describing the topology
flags_t flags;
// Count each item & get the num x's per y
// e.g., get the number of cores and the number of threads per core
// for each (x, y) in (KMP_HW_* , KMP_HW_*)
void _gather_enumeration_information();
// Remove layers that don't add information to the topology.
// This is done by having the layer take on the id = UNKNOWN_ID (-1)
void _remove_radix1_layers();
// Find out if the topology is uniform
void _discover_uniformity();
// Set all the sub_ids for each hardware thread
void _set_sub_ids();
// Set global affinity variables describing the number of threads per
// core, the number of packages, the number of cores per package, and
// the number of cores.
void _set_globals();
// Set the last level cache equivalent type
void _set_last_level_cache();
public:
// Force use of allocate()/deallocate()
kmp_topology_t() = delete;
kmp_topology_t(const kmp_topology_t &t) = delete;
kmp_topology_t(kmp_topology_t &&t) = delete;
kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
kmp_topology_t &operator=(kmp_topology_t &&t) = delete;
static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
static void deallocate(kmp_topology_t *);
// Functions used in create_map() routines
kmp_hw_thread_t &at(int index) {
KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
return hw_threads[index];
}
const kmp_hw_thread_t &at(int index) const {
KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
return hw_threads[index];
}
int get_num_hw_threads() const { return num_hw_threads; }
void sort_ids() {
qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
kmp_hw_thread_t::compare_ids);
}
// Check if the hardware ids are unique, if they are
// return true, otherwise return false
bool check_ids() const;
// Function to call after the create_map() routine
void canonicalize();
void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);
// Functions used after canonicalize() called
bool filter_hw_subset();
bool is_close(int hwt1, int hwt2, int level) const;
bool is_uniform() const { return flags.uniform; }
// Tell whether a type is a valid type in the topology
// returns KMP_HW_UNKNOWN when there is no equivalent type
kmp_hw_t get_equivalent_type(kmp_hw_t type) const { return equivalent[type]; }
// Set type1 = type2
void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
kmp_hw_t real_type2 = equivalent[type2];
if (real_type2 == KMP_HW_UNKNOWN)
real_type2 = type2;
equivalent[type1] = real_type2;
// This loop is required since any of the types may have been set to
// be equivalent to type1. They all must be checked and reset to type2.
KMP_FOREACH_HW_TYPE(type) {
if (equivalent[type] == type1) {
equivalent[type] = real_type2;
}
}
}
// Calculate number of types corresponding to level1
// per types corresponding to level2 (e.g., number of threads per core)
int calculate_ratio(int level1, int level2) const {
KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
int r = 1;
for (int level = level1; level > level2; --level)
r *= ratio[level];
return r;
}
int get_ratio(int level) const {
KMP_DEBUG_ASSERT(level >= 0 && level < depth);
return ratio[level];
}
int get_depth() const { return depth; };
kmp_hw_t get_type(int level) const {
KMP_DEBUG_ASSERT(level >= 0 && level < depth);
return types[level];
}
int get_level(kmp_hw_t type) const {
KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
int eq_type = equivalent[type];
if (eq_type == KMP_HW_UNKNOWN)
return -1;
for (int i = 0; i < depth; ++i)
if (types[i] == eq_type)
return i;
return -1;
if (aa->labels[i] > bb->labels[i])
return 1;
}
return 0;
int get_count(int level) const {
KMP_DEBUG_ASSERT(level >= 0 && level < depth);
return count[level];
}
#if KMP_AFFINITY_SUPPORTED
void sort_compact() {
qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
kmp_hw_thread_t::compare_compact);
}
#endif
void print(const char *env_var = "KMP_AFFINITY") const;
void dump() const;
};
class kmp_hw_subset_t {
public:
  // One parsed KMP_HW_SUBSET component: the topology layer requested,
  // how many of that layer the user wants, and the starting offset
  // within the layer (e.g., "2s@1" -> num=2, type=KMP_HW_SOCKET, offset=1).
  struct item_t {
    int num;
    kmp_hw_t type;
    int offset;
  };

private:
  int depth; // number of valid entries in items[]
  int capacity; // allocated length of items[]
  item_t *items;
  // Bitmask of the topology layers present in items[] (bit = 1ull << type)
  kmp_uint64 set;
  // True when the user requested absolute (machine-wide) counts by
  // prefixing the KMP_HW_SUBSET value with ':'
  bool absolute;
  // The set must be able to handle up to KMP_HW_LAST number of layers
  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);

public:
  // Force use of allocate()/deallocate()
  kmp_hw_subset_t() = delete;
  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;

  // Create an empty subset with a small initial capacity.
  static kmp_hw_subset_t *allocate() {
    int initial_capacity = 5;
    kmp_hw_subset_t *retval =
        (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
    retval->depth = 0;
    retval->capacity = initial_capacity;
    retval->set = 0ull;
    retval->absolute = false;
    retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
    return retval;
  }
  static void deallocate(kmp_hw_subset_t *subset) {
    __kmp_free(subset->items);
    __kmp_free(subset);
  }
  void set_absolute() { absolute = true; }
  bool is_absolute() const { return absolute; }
  // Append one item, doubling the backing array when it is full.
  void push_back(int num, kmp_hw_t type, int offset) {
    // Grow only when items[] is actually full (the previous check of
    // depth == capacity - 1 left the last allocated slot permanently unused).
    if (depth == capacity) {
      capacity *= 2;
      item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
      for (int i = 0; i < depth; ++i)
        new_items[i] = items[i];
      __kmp_free(items);
      items = new_items;
    }
    items[depth].num = num;
    items[depth].type = type;
    items[depth].offset = offset;
    depth++;
    set |= (1ull << type);
  }
  int get_depth() const { return depth; }
  const item_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  item_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  // Remove the item at index, shifting later items down one slot.
  // NOTE(review): this clears the type's bit in 'set' unconditionally, which
  // assumes each topology type appears at most once in items[] -- confirm
  // the parser rejects duplicates.
  void remove(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    set &= ~(1ull << items[index].type);
    for (int j = index + 1; j < depth; ++j) {
      items[j - 1] = items[j];
    }
    depth--;
  }
  // Was this topology layer mentioned anywhere in the subset?
  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
  // Debug dump of the entire subset to stdout.
  void dump() const {
    printf("**********************\n");
    printf("*** kmp_hw_subset: ***\n");
    printf("* depth: %d\n", depth);
    printf("* items:\n");
    for (int i = 0; i < depth; ++i) {
      printf("num: %d, type: %s, offset: %d\n", items[i].num,
             __kmp_hw_get_keyword(items[i].type), items[i].offset);
    }
    printf("* set: 0x%llx\n", set);
    printf("* absolute: %d\n", absolute);
    printf("**********************\n");
  }
};
extern kmp_topology_t *__kmp_topology;
extern kmp_hw_subset_t *__kmp_hw_subset;
/* A structure for holding machine-specific hierarchy info to be computed once
at init. This structure represents a mapping of threads to the actual machine
@ -721,18 +904,10 @@ public:
kmp_uint32 *numPerLevel;
kmp_uint32 *skipPerLevel;
void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
int hier_depth = adr2os[0].first.depth;
int level = 0;
for (int i = hier_depth - 1; i >= 0; --i) {
int max = -1;
for (int j = 0; j < num_addrs; ++j) {
int next = adr2os[j].first.childNums[i];
if (next > max)
max = next;
}
numPerLevel[level] = max + 1;
++level;
void deriveLevels() {
int hier_depth = __kmp_topology->get_depth();
for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
numPerLevel[level] = __kmp_topology->get_ratio(i);
}
}
@ -747,7 +922,7 @@ public:
}
}
void init(AddrUnsPair *adr2os, int num_addrs) {
void init(int num_addrs) {
kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
&uninitialized, not_initialized, initializing);
if (bool_result == 0) { // Wait for initialization
@ -774,10 +949,8 @@ public:
}
// Sort table by physical ID
if (adr2os) {
qsort(adr2os, num_addrs, sizeof(*adr2os),
__kmp_affinity_cmp_Address_labels);
deriveLevels(adr2os, num_addrs);
if (__kmp_topology && __kmp_topology->get_depth() > 0) {
deriveLevels();
} else {
numPerLevel[0] = maxLeaves;
numPerLevel[1] = num_addrs / maxLeaves;

View File

@ -247,8 +247,6 @@ KMPAffinity *__kmp_affinity_dispatch = NULL;
#if KMP_USE_HWLOC
int __kmp_hwloc_error = FALSE;
hwloc_topology_t __kmp_hwloc_topology = NULL;
int __kmp_numa_detected = FALSE;
int __kmp_tile_depth = 0;
#endif
#if KMP_OS_WINDOWS
@ -263,7 +261,7 @@ kmp_SetThreadGroupAffinity_t __kmp_SetThreadGroupAffinity = NULL;
size_t __kmp_affin_mask_size = 0;
enum affinity_type __kmp_affinity_type = affinity_default;
enum affinity_gran __kmp_affinity_gran = affinity_gran_default;
kmp_hw_t __kmp_affinity_gran = KMP_HW_UNKNOWN;
int __kmp_affinity_gran_levels = -1;
int __kmp_affinity_dups = TRUE;
enum affinity_top_method __kmp_affinity_top_method =
@ -286,15 +284,6 @@ int __kmp_affinity_num_places = 0;
int __kmp_display_affinity = FALSE;
char *__kmp_affinity_format = NULL;
kmp_hws_item_t __kmp_hws_socket = {0, 0};
kmp_hws_item_t __kmp_hws_die = {0, 0};
kmp_hws_item_t __kmp_hws_node = {0, 0};
kmp_hws_item_t __kmp_hws_tile = {0, 0};
kmp_hws_item_t __kmp_hws_core = {0, 0};
kmp_hws_item_t __kmp_hws_proc = {0, 0};
int __kmp_hws_requested = 0;
int __kmp_hws_abs_flag = 0; // absolute or per-item number requested
kmp_int32 __kmp_default_device = 0;
kmp_tasking_mode_t __kmp_tasking_mode = tskm_task_teams;

View File

@ -2069,9 +2069,9 @@ static void __kmp_parse_affinity_env(char const *name, char const *value,
enum affinity_type *out_type,
char **out_proclist, int *out_verbose,
int *out_warn, int *out_respect,
enum affinity_gran *out_gran,
int *out_gran_levels, int *out_dups,
int *out_compact, int *out_offset) {
kmp_hw_t *out_gran, int *out_gran_levels,
int *out_dups, int *out_compact,
int *out_offset) {
char *buffer = NULL; // Copy of env var value.
char *buf = NULL; // Buffer for strtok_r() function.
char *next = NULL; // end of token / start of next.
@ -2087,6 +2087,7 @@ static void __kmp_parse_affinity_env(char const *name, char const *value,
int respect = 0;
int gran = 0;
int dups = 0;
bool set = false;
KMP_ASSERT(value != NULL);
@ -2232,33 +2233,37 @@ static void __kmp_parse_affinity_env(char const *name, char const *value,
SKIP_WS(next);
buf = next;
// Try any hardware topology type for granularity
KMP_FOREACH_HW_TYPE(type) {
const char *name = __kmp_hw_get_keyword(type);
if (__kmp_match_str(name, buf, CCAST(const char **, &next))) {
set_gran(type, -1);
buf = next;
set = true;
break;
}
}
if (!set) {
// Support older names for different granularity layers
if (__kmp_match_str("fine", buf, CCAST(const char **, &next))) {
set_gran(affinity_gran_fine, -1);
set_gran(KMP_HW_THREAD, -1);
buf = next;
} else if (__kmp_match_str("thread", buf, CCAST(const char **, &next))) {
set_gran(affinity_gran_thread, -1);
buf = next;
} else if (__kmp_match_str("core", buf, CCAST(const char **, &next))) {
set_gran(affinity_gran_core, -1);
buf = next;
#if KMP_USE_HWLOC
} else if (__kmp_match_str("tile", buf, CCAST(const char **, &next))) {
set_gran(affinity_gran_tile, -1);
buf = next;
#endif
} else if (__kmp_match_str("die", buf, CCAST(const char **, &next))) {
set_gran(affinity_gran_die, -1);
buf = next;
} else if (__kmp_match_str("package", buf, CCAST(const char **, &next))) {
set_gran(affinity_gran_package, -1);
set = true;
} else if (__kmp_match_str("package", buf,
CCAST(const char **, &next))) {
set_gran(KMP_HW_SOCKET, -1);
buf = next;
set = true;
} else if (__kmp_match_str("node", buf, CCAST(const char **, &next))) {
set_gran(affinity_gran_node, -1);
set_gran(KMP_HW_NUMA, -1);
buf = next;
set = true;
#if KMP_GROUP_AFFINITY
} else if (__kmp_match_str("group", buf, CCAST(const char **, &next))) {
set_gran(affinity_gran_group, -1);
set_gran(KMP_HW_PROC_GROUP, -1);
buf = next;
set = true;
#endif /* KMP_GROUP AFFINITY */
} else if ((*buf >= '0') && (*buf <= '9')) {
int n;
@ -2267,11 +2272,13 @@ static void __kmp_parse_affinity_env(char const *name, char const *value,
n = __kmp_str_to_int(buf, *next);
KMP_ASSERT(n >= 0);
buf = next;
set_gran(affinity_gran_default, n);
set_gran(KMP_HW_UNKNOWN, n);
set = true;
} else {
EMIT_WARN(TRUE, (AffInvalidParam, name, start));
continue;
}
}
} else if (__kmp_match_str("proclist", buf, CCAST(const char **, &next))) {
char *temp_proclist;
@ -2377,20 +2384,20 @@ static void __kmp_parse_affinity_env(char const *name, char const *value,
*out_offset = number[1];
}
if (__kmp_affinity_gran == affinity_gran_default) {
if (__kmp_affinity_gran == KMP_HW_UNKNOWN) {
#if KMP_MIC_SUPPORTED
if (__kmp_mic_type != non_mic) {
if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
KMP_WARNING(AffGranUsing, "KMP_AFFINITY", "fine");
}
__kmp_affinity_gran = affinity_gran_fine;
__kmp_affinity_gran = KMP_HW_THREAD;
} else
#endif
{
if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
KMP_WARNING(AffGranUsing, "KMP_AFFINITY", "core");
}
__kmp_affinity_gran = affinity_gran_core;
__kmp_affinity_gran = KMP_HW_CORE;
}
}
} break;
@ -2475,31 +2482,8 @@ static void __kmp_stg_print_affinity(kmp_str_buf_t *buffer, char const *name,
} else {
__kmp_str_buf_print(buffer, "%s,", "norespect");
}
switch (__kmp_affinity_gran) {
case affinity_gran_default:
__kmp_str_buf_print(buffer, "%s", "granularity=default,");
break;
case affinity_gran_fine:
__kmp_str_buf_print(buffer, "%s", "granularity=fine,");
break;
case affinity_gran_thread:
__kmp_str_buf_print(buffer, "%s", "granularity=thread,");
break;
case affinity_gran_core:
__kmp_str_buf_print(buffer, "%s", "granularity=core,");
break;
case affinity_gran_package:
__kmp_str_buf_print(buffer, "%s", "granularity=package,");
break;
case affinity_gran_node:
__kmp_str_buf_print(buffer, "%s", "granularity=node,");
break;
#if KMP_GROUP_AFFINITY
case affinity_gran_group:
__kmp_str_buf_print(buffer, "%s", "granularity=group,");
break;
#endif /* KMP_GROUP_AFFINITY */
}
__kmp_str_buf_print(buffer, "granularity=%s,",
__kmp_hw_get_keyword(__kmp_affinity_gran, false));
}
if (!KMP_AFFINITY_CAPABLE()) {
__kmp_str_buf_print(buffer, "%s", "disabled");
@ -2571,7 +2555,7 @@ static void __kmp_stg_parse_gomp_cpu_affinity(char const *name,
// GOMP_CPU_AFFINITY => granularity=fine,explicit,proclist=...
__kmp_affinity_proclist = temp_proclist;
__kmp_affinity_type = affinity_explicit;
__kmp_affinity_gran = affinity_gran_fine;
__kmp_affinity_gran = KMP_HW_THREAD;
__kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
} else {
KMP_WARNING(AffSyntaxError, name);
@ -2856,10 +2840,20 @@ static int __kmp_parse_place_list(const char *var, const char *env,
static void __kmp_stg_parse_places(char const *name, char const *value,
void *data) {
struct kmp_place_t {
const char *name;
kmp_hw_t type;
};
int count;
bool set = false;
const char *scan = value;
const char *next = scan;
const char *kind = "\"threads\"";
kmp_place_t std_places[] = {{"threads", KMP_HW_THREAD},
{"cores", KMP_HW_CORE},
{"numa_domains", KMP_HW_NUMA},
{"ll_caches", KMP_HW_LLC},
{"sockets", KMP_HW_SOCKET}};
kmp_setting_t **rivals = (kmp_setting_t **)data;
int rc;
@ -2868,52 +2862,47 @@ static void __kmp_stg_parse_places(char const *name, char const *value,
return;
}
if (__kmp_match_str("threads", scan, &next)) {
// Standard choices
for (size_t i = 0; i < sizeof(std_places) / sizeof(std_places[0]); ++i) {
const kmp_place_t &place = std_places[i];
if (__kmp_match_str(place.name, scan, &next)) {
scan = next;
__kmp_affinity_type = affinity_compact;
__kmp_affinity_gran = affinity_gran_thread;
__kmp_affinity_gran = place.type;
__kmp_affinity_dups = FALSE;
kind = "\"threads\"";
} else if (__kmp_match_str("cores", scan, &next)) {
set = true;
break;
}
}
// Implementation choices for OMP_PLACES based on internal types
if (!set) {
KMP_FOREACH_HW_TYPE(type) {
const char *name = __kmp_hw_get_keyword(type, true);
if (__kmp_match_str("unknowns", scan, &next))
continue;
if (__kmp_match_str(name, scan, &next)) {
scan = next;
__kmp_affinity_type = affinity_compact;
__kmp_affinity_gran = affinity_gran_core;
__kmp_affinity_gran = type;
__kmp_affinity_dups = FALSE;
kind = "\"cores\"";
#if KMP_USE_HWLOC
} else if (__kmp_match_str("tiles", scan, &next)) {
scan = next;
__kmp_affinity_type = affinity_compact;
__kmp_affinity_gran = affinity_gran_tile;
__kmp_affinity_dups = FALSE;
kind = "\"tiles\"";
#endif
} else if (__kmp_match_str("dice", scan, &next) ||
__kmp_match_str("dies", scan, &next)) {
scan = next;
__kmp_affinity_type = affinity_compact;
__kmp_affinity_gran = affinity_gran_die;
__kmp_affinity_dups = FALSE;
kind = "\"dice\"";
} else if (__kmp_match_str("sockets", scan, &next)) {
scan = next;
__kmp_affinity_type = affinity_compact;
__kmp_affinity_gran = affinity_gran_package;
__kmp_affinity_dups = FALSE;
kind = "\"sockets\"";
} else {
set = true;
break;
}
}
}
if (!set) {
if (__kmp_affinity_proclist != NULL) {
KMP_INTERNAL_FREE((void *)__kmp_affinity_proclist);
__kmp_affinity_proclist = NULL;
}
if (__kmp_parse_place_list(name, value, &__kmp_affinity_proclist)) {
__kmp_affinity_type = affinity_explicit;
__kmp_affinity_gran = affinity_gran_fine;
__kmp_affinity_gran = KMP_HW_THREAD;
__kmp_affinity_dups = FALSE;
} else {
// Syntax error fallback
__kmp_affinity_type = affinity_compact;
__kmp_affinity_gran = affinity_gran_core;
__kmp_affinity_gran = KMP_HW_CORE;
__kmp_affinity_dups = FALSE;
}
if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default) {
@ -2921,6 +2910,9 @@ static void __kmp_stg_parse_places(char const *name, char const *value,
}
return;
}
if (__kmp_affinity_gran != KMP_HW_UNKNOWN) {
kind = __kmp_hw_get_keyword(__kmp_affinity_gran);
}
if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default) {
__kmp_nested_proc_bind.bind_types[0] = proc_bind_true;
@ -2985,31 +2977,12 @@ static void __kmp_stg_print_places(kmp_str_buf_t *buffer, char const *name,
} else {
num = 0;
}
if (__kmp_affinity_gran == affinity_gran_thread) {
if (__kmp_affinity_gran != KMP_HW_UNKNOWN) {
const char *name = __kmp_hw_get_keyword(__kmp_affinity_gran, true);
if (num > 0) {
__kmp_str_buf_print(buffer, "='threads(%d)'\n", num);
__kmp_str_buf_print(buffer, "='%s(%d)'\n", name, num);
} else {
__kmp_str_buf_print(buffer, "='threads'\n");
}
} else if (__kmp_affinity_gran == affinity_gran_core) {
if (num > 0) {
__kmp_str_buf_print(buffer, "='cores(%d)' \n", num);
} else {
__kmp_str_buf_print(buffer, "='cores'\n");
}
#if KMP_USE_HWLOC
} else if (__kmp_affinity_gran == affinity_gran_tile) {
if (num > 0) {
__kmp_str_buf_print(buffer, "='tiles(%d)' \n", num);
} else {
__kmp_str_buf_print(buffer, "='tiles'\n");
}
#endif
} else if (__kmp_affinity_gran == affinity_gran_package) {
if (num > 0) {
__kmp_str_buf_print(buffer, "='sockets(%d)'\n", num);
} else {
__kmp_str_buf_print(buffer, "='sockets'\n");
__kmp_str_buf_print(buffer, "='%s'\n", name);
}
} else {
__kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined));
@ -3118,8 +3091,12 @@ static void __kmp_stg_print_topology_method(kmp_str_buf_t *buffer,
break;
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
case affinity_top_method_x2apicid_1f:
value = "x2APIC id leaf 0x1f";
break;
case affinity_top_method_x2apicid:
value = "x2APIC id";
value = "x2APIC id leaf 0xb";
break;
case affinity_top_method_apicid:
@ -4727,12 +4704,92 @@ static void __kmp_stg_print_speculative_statsfile(kmp_str_buf_t *buffer,
// -----------------------------------------------------------------------------
// KMP_HW_SUBSET (was KMP_PLACE_THREADS)
// 2s16c,2t => 2S16C,2T => 2S16C \0 2T
// The longest observable sequence of items is
// Socket-Node-Tile-Core-Thread
// So, let's limit to 5 levels for now
// Return KMP_HW_SUBSET preferred hardware type in case a token is ambiguously
// short. The original KMP_HW_SUBSET environment variable had single letters:
// s, c, t for sockets, cores, threads respectively.
static kmp_hw_t __kmp_hw_subset_break_tie(const kmp_hw_t *possible,
                                          size_t num_possible) {
  // Walk the candidate list in order; the first candidate that is one of
  // the classic single-letter KMP_HW_SUBSET layers (thread, core, socket)
  // wins the tie.
  for (size_t idx = 0; idx < num_possible; ++idx) {
    switch (possible[idx]) {
    case KMP_HW_THREAD:
    case KMP_HW_CORE:
    case KMP_HW_SOCKET:
      return possible[idx];
    default:
      break;
    }
  }
  // None of the classic layers is among the candidates: still ambiguous.
  return KMP_HW_UNKNOWN;
}
// Return hardware type from string or HW_UNKNOWN if string cannot be parsed
// This algorithm is very forgiving to the user in that, the instant it can
// reduce the search space to one, it assumes that is the topology level the
// user wanted, even if it is misspelled later in the token.
static kmp_hw_t __kmp_stg_parse_hw_subset_name(char const *token) {
  size_t index, num_possible, token_length;
  kmp_hw_t possible[KMP_HW_LAST];
  const char *end;

  // Find the end of the hardware token string.
  // Cast to unsigned char before calling <ctype.h> functions: passing a
  // plain char that is negative is undefined behavior (CERT STR37-C).
  end = token;
  token_length = 0;
  while (isalnum((unsigned char)*end) || *end == '_') {
    token_length++;
    end++;
  }

  // Set the possibilities to all hardware types
  num_possible = 0;
  KMP_FOREACH_HW_TYPE(type) { possible[num_possible++] = type; }

  // Eliminate hardware types by comparing the front of the token
  // with hardware names
  // In most cases, the first letter in the token will indicate exactly
  // which hardware type is parsed, e.g., 'C' = Core
  index = 0;
  while (num_possible > 1 && index < token_length) {
    size_t n = num_possible;
    char token_char = (char)toupper((unsigned char)token[index]);
    for (size_t i = 0; i < n; ++i) {
      const char *s;
      kmp_hw_t type = possible[i];
      s = __kmp_hw_get_keyword(type, false);
      if (index < KMP_STRLEN(s)) {
        char c = (char)toupper((unsigned char)s[index]);
        // Mark hardware types for removal when the characters do not match
        if (c != token_char) {
          possible[i] = KMP_HW_UNKNOWN;
          num_possible--;
        }
      }
    }
    // Remove hardware types that this token cannot be
    // (stable compaction of the surviving candidates to the front)
    size_t start = 0;
    for (size_t i = 0; i < n; ++i) {
      if (possible[i] != KMP_HW_UNKNOWN) {
        kmp_hw_t temp = possible[i];
        possible[i] = possible[start];
        possible[start] = temp;
        start++;
      }
    }
    KMP_ASSERT(start == num_possible);
    index++;
  }
  // Attempt to break a tie if user has very short token
  // (e.g., is 'T' tile or thread?)
  if (num_possible > 1)
    return __kmp_hw_subset_break_tie(possible, num_possible);
  if (num_possible == 1)
    return possible[0];
  return KMP_HW_UNKNOWN;
}
// The longest observable sequence of items can only be HW_LAST length
// The input string is usually short enough, let's use 512 limit for now
#define MAX_T_LEVEL 5
#define MAX_T_LEVEL KMP_HW_LAST
#define MAX_STR_LEN 512
static void __kmp_stg_parse_hw_subset(char const *name, char const *value,
void *data) {
@ -4751,12 +4808,13 @@ static void __kmp_stg_parse_hw_subset(char const *name, char const *value,
char input[MAX_STR_LEN];
size_t len = 0, mlen = MAX_STR_LEN;
int level = 0;
// Canonize the string (remove spaces, unify delimiters, etc.)
bool absolute = false;
// Canonicalize the string (remove spaces, unify delimiters, etc.)
char *pos = CCAST(char *, value);
while (*pos && mlen) {
if (*pos != ' ') { // skip spaces
if (len == 0 && *pos == ':') {
__kmp_hws_abs_flag = 1; // if the first symbol is ":", skip it
absolute = true;
} else {
input[len] = (char)(toupper(*pos));
if (input[len] == 'X')
@ -4769,10 +4827,10 @@ static void __kmp_stg_parse_hw_subset(char const *name, char const *value,
mlen--;
pos++;
}
if (len == 0 || mlen == 0)
if (len == 0 || mlen == 0) {
goto err; // contents is either empty or too long
}
input[len] = '\0';
__kmp_hws_requested = 1; // mark that subset requested
// Split by delimiter
pos = input;
components[level++] = pos;
@ -4782,146 +4840,69 @@ static void __kmp_stg_parse_hw_subset(char const *name, char const *value,
*pos = '\0'; // modify input and avoid more copying
components[level++] = ++pos; // expect something after ","
}
__kmp_hw_subset = kmp_hw_subset_t::allocate();
if (absolute)
__kmp_hw_subset->set_absolute();
// Check each component
for (int i = 0; i < level; ++i) {
int offset = 0;
int num = atoi(components[i]); // each component should start with a number
if (num <= 0) {
goto err; // only positive integers are valid for count
}
if ((pos = strchr(components[i], '@'))) {
offset = atoi(pos + 1); // save offset
*pos = '\0'; // cut the offset from the component
}
pos = components[i] + strspn(components[i], digits);
if (pos == components[i])
if (pos == components[i]) {
goto err;
}
// detect the component type
switch (*pos) {
case 'S': // Socket
if (__kmp_hws_socket.num > 0)
goto err; // duplicate is not allowed
__kmp_hws_socket.num = num;
__kmp_hws_socket.offset = offset;
break;
case 'N': // NUMA Node
if (__kmp_hws_node.num > 0)
goto err; // duplicate is not allowed
__kmp_hws_node.num = num;
__kmp_hws_node.offset = offset;
break;
case 'D': // Die
if (__kmp_hws_die.num > 0)
goto err; // duplicate is not allowed
__kmp_hws_die.num = num;
__kmp_hws_die.offset = offset;
break;
case 'L': // Cache
if (*(pos + 1) == '2') { // L2 - Tile
if (__kmp_hws_tile.num > 0)
goto err; // duplicate is not allowed
__kmp_hws_tile.num = num;
__kmp_hws_tile.offset = offset;
} else if (*(pos + 1) == '3') { // L3 - Socket
if (__kmp_hws_socket.num > 0 || __kmp_hws_die.num > 0)
goto err; // duplicate is not allowed
__kmp_hws_socket.num = num;
__kmp_hws_socket.offset = offset;
} else if (*(pos + 1) == '1') { // L1 - Core
if (__kmp_hws_core.num > 0)
goto err; // duplicate is not allowed
__kmp_hws_core.num = num;
__kmp_hws_core.offset = offset;
}
break;
case 'C': // Core (or Cache?)
if (*(pos + 1) != 'A') {
if (__kmp_hws_core.num > 0)
goto err; // duplicate is not allowed
__kmp_hws_core.num = num;
__kmp_hws_core.offset = offset;
} else { // Cache
char *d = pos + strcspn(pos, digits); // find digit
if (*d == '2') { // L2 - Tile
if (__kmp_hws_tile.num > 0)
goto err; // duplicate is not allowed
__kmp_hws_tile.num = num;
__kmp_hws_tile.offset = offset;
} else if (*d == '3') { // L3 - Socket
if (__kmp_hws_socket.num > 0 || __kmp_hws_die.num > 0)
goto err; // duplicate is not allowed
__kmp_hws_socket.num = num;
__kmp_hws_socket.offset = offset;
} else if (*d == '1') { // L1 - Core
if (__kmp_hws_core.num > 0)
goto err; // duplicate is not allowed
__kmp_hws_core.num = num;
__kmp_hws_core.offset = offset;
} else {
kmp_hw_t type = __kmp_stg_parse_hw_subset_name(pos);
if (type == KMP_HW_UNKNOWN) {
goto err;
}
}
break;
case 'T': // Thread
if (__kmp_hws_proc.num > 0)
goto err; // duplicate is not allowed
__kmp_hws_proc.num = num;
__kmp_hws_proc.offset = offset;
break;
default:
if (__kmp_hw_subset->specified(type)) {
goto err;
}
__kmp_hw_subset->push_back(num, type, offset);
}
return;
err:
KMP_WARNING(AffHWSubsetInvalid, name, value);
__kmp_hws_requested = 0; // mark that subset not requested
if (__kmp_hw_subset) {
kmp_hw_subset_t::deallocate(__kmp_hw_subset);
__kmp_hw_subset = nullptr;
}
return;
}
// Print the KMP_HW_SUBSET setting in canonical form, e.g.,
// " KMP_HW_SUBSET='2s,4c,2t'". Each item is printed as
// "<num><keyword>[@<offset>]". Nothing is printed when no subset was
// successfully requested (__kmp_hw_subset is NULL).
// BUG FIX: the block previously interleaved deleted pre-refactor lines
// (references to the removed __kmp_hws_* globals) with the new
// __kmp_hw_subset-based implementation, leaving unbalanced braces; only the
// coherent post-refactor implementation is kept.
static void __kmp_stg_print_hw_subset(kmp_str_buf_t *buffer, char const *name,
                                      void *data) {
  kmp_str_buf_t buf;
  int depth;
  if (!__kmp_hw_subset)
    return;
  __kmp_str_buf_init(&buf);
  if (__kmp_env_format)
    KMP_STR_BUF_PRINT_NAME_EX(name);
  else
    __kmp_str_buf_print(buffer, " %s='", name);
  // Emit one item per requested topology layer, comma-separated.
  depth = __kmp_hw_subset->get_depth();
  for (int i = 0; i < depth; ++i) {
    const auto &item = __kmp_hw_subset->at(i);
    __kmp_str_buf_print(&buf, "%s%d%s", (i > 0 ? "," : ""), item.num,
                        __kmp_hw_get_keyword(item.type));
    if (item.offset)
      __kmp_str_buf_print(&buf, "@%d", item.offset);
  }
  __kmp_str_buf_print(buffer, "%s'\n", buf.str);
  __kmp_str_buf_free(&buf);
}
#if USE_ITT_BUILD
// -----------------------------------------------------------------------------
@ -5762,7 +5743,7 @@ void __kmp_env_initialize(char const *string) {
// Reset the affinity flags to their default values,
// in case this is called from kmp_set_defaults().
__kmp_affinity_type = affinity_default;
__kmp_affinity_gran = affinity_gran_default;
__kmp_affinity_gran = KMP_HW_UNKNOWN;
__kmp_affinity_top_method = affinity_top_method_default;
__kmp_affinity_respect_mask = affinity_respect_mask_default;
}
@ -5772,7 +5753,7 @@ void __kmp_env_initialize(char const *string) {
aff_str = __kmp_env_blk_var(&block, "OMP_PROC_BIND");
if (aff_str != NULL) {
__kmp_affinity_type = affinity_default;
__kmp_affinity_gran = affinity_gran_default;
__kmp_affinity_gran = KMP_HW_UNKNOWN;
__kmp_affinity_top_method = affinity_top_method_default;
__kmp_affinity_respect_mask = affinity_respect_mask_default;
}
@ -5844,12 +5825,19 @@ void __kmp_env_initialize(char const *string) {
if (!TCR_4(__kmp_init_middle)) {
#if KMP_USE_HWLOC
// Force using hwloc when either tiles or numa nodes requested within
// KMP_HW_SUBSET and no other topology method is requested
if ((__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0 ||
__kmp_affinity_gran == affinity_gran_tile) &&
(__kmp_affinity_top_method == affinity_top_method_default)) {
// KMP_HW_SUBSET or granularity setting and no other topology method
// is requested
if (__kmp_hw_subset &&
__kmp_affinity_top_method == affinity_top_method_default)
if (__kmp_hw_subset->specified(KMP_HW_NUMA) ||
__kmp_hw_subset->specified(KMP_HW_TILE) ||
__kmp_affinity_gran == KMP_HW_TILE ||
__kmp_affinity_gran == KMP_HW_NUMA)
__kmp_affinity_top_method = affinity_top_method_hwloc;
// Force using hwloc when tiles or numa nodes requested for OMP_PLACES
if (__kmp_affinity_gran == KMP_HW_NUMA ||
__kmp_affinity_gran == KMP_HW_TILE)
__kmp_affinity_top_method = affinity_top_method_hwloc;
}
#endif
// Determine if the machine/OS is actually capable of supporting
// affinity.
@ -5879,7 +5867,7 @@ void __kmp_env_initialize(char const *string) {
}
__kmp_affinity_type = affinity_disabled;
__kmp_affinity_respect_mask = 0;
__kmp_affinity_gran = affinity_gran_fine;
__kmp_affinity_gran = KMP_HW_THREAD;
}
}
@ -5937,44 +5925,27 @@ void __kmp_env_initialize(char const *string) {
__kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
}
if (__kmp_affinity_top_method == affinity_top_method_default) {
if (__kmp_affinity_gran == affinity_gran_default) {
if (__kmp_affinity_gran == KMP_HW_UNKNOWN) {
__kmp_affinity_top_method = affinity_top_method_group;
__kmp_affinity_gran = affinity_gran_group;
} else if (__kmp_affinity_gran == affinity_gran_group) {
__kmp_affinity_gran = KMP_HW_PROC_GROUP;
} else if (__kmp_affinity_gran == KMP_HW_PROC_GROUP) {
__kmp_affinity_top_method = affinity_top_method_group;
} else {
__kmp_affinity_top_method = affinity_top_method_all;
}
} else if (__kmp_affinity_top_method == affinity_top_method_group) {
if (__kmp_affinity_gran == affinity_gran_default) {
__kmp_affinity_gran = affinity_gran_group;
} else if ((__kmp_affinity_gran != affinity_gran_group) &&
(__kmp_affinity_gran != affinity_gran_fine) &&
(__kmp_affinity_gran != affinity_gran_thread)) {
const char *str = NULL;
switch (__kmp_affinity_gran) {
case affinity_gran_core:
str = "core";
break;
case affinity_gran_package:
str = "package";
break;
case affinity_gran_node:
str = "node";
break;
case affinity_gran_tile:
str = "tile";
break;
default:
KMP_DEBUG_ASSERT(0);
}
if (__kmp_affinity_gran == KMP_HW_UNKNOWN) {
__kmp_affinity_gran = KMP_HW_PROC_GROUP;
} else if ((__kmp_affinity_gran != KMP_HW_PROC_GROUP) &&
(__kmp_affinity_gran != KMP_HW_THREAD)) {
const char *str = __kmp_hw_get_keyword(__kmp_affinity_gran);
KMP_WARNING(AffGranTopGroup, var, str);
__kmp_affinity_gran = affinity_gran_fine;
__kmp_affinity_gran = KMP_HW_THREAD;
}
} else {
if (__kmp_affinity_gran == affinity_gran_default) {
__kmp_affinity_gran = affinity_gran_core;
} else if (__kmp_affinity_gran == affinity_gran_group) {
if (__kmp_affinity_gran == KMP_HW_UNKNOWN) {
__kmp_affinity_gran = KMP_HW_CORE;
} else if (__kmp_affinity_gran == KMP_HW_PROC_GROUP) {
const char *str = NULL;
switch (__kmp_affinity_type) {
case affinity_physical:
@ -5997,7 +5968,7 @@ void __kmp_env_initialize(char const *string) {
KMP_DEBUG_ASSERT(0);
}
KMP_WARNING(AffGranGroupType, var, str);
__kmp_affinity_gran = affinity_gran_core;
__kmp_affinity_gran = KMP_HW_CORE;
}
}
} else
@ -6039,15 +6010,15 @@ void __kmp_env_initialize(char const *string) {
__kmp_affinity_type = affinity_none;
}
}
if ((__kmp_affinity_gran == affinity_gran_default) &&
if ((__kmp_affinity_gran == KMP_HW_UNKNOWN) &&
(__kmp_affinity_gran_levels < 0)) {
#if KMP_MIC_SUPPORTED
if (__kmp_mic_type != non_mic) {
__kmp_affinity_gran = affinity_gran_fine;
__kmp_affinity_gran = KMP_HW_THREAD;
} else
#endif
{
__kmp_affinity_gran = affinity_gran_core;
__kmp_affinity_gran = KMP_HW_CORE;
}
}
if (__kmp_affinity_top_method == affinity_top_method_default) {

View File

@ -0,0 +1,71 @@
// RUN: %libomp-compile -D_GNU_SOURCE
// RUN: env KMP_AFFINITY=granularity=thread,compact %libomp-run
// RUN: env KMP_AFFINITY=granularity=core,compact %libomp-run
// RUN: env KMP_AFFINITY=granularity=socket,compact %libomp-run
// REQUIRES: linux
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "libomp_test_affinity.h"
#include "libomp_test_topology.h"
// Compare place lists. Make sure every place in p1 is in p2.
// Returns EXIT_SUCCESS when p1 is a subset of p2, EXIT_FAILURE otherwise.
static int compare_places(const place_list_t *p1, const place_list_t *p2) {
  int i, j;
  for (i = 0; i < p1->num_places; ++i) {
    int found = 0;
    for (j = 0; j < p2->num_places; ++j) {
      if (affinity_mask_equal(p1->masks[i], p2->masks[j])) {
        found = 1;
        break;
      }
    }
    if (!found) {
      printf("Found place in p1 not in p2!\n");
      printf("p1 places:\n");
      topology_print_places(p1);
      printf("\n");
      printf("p2 places:\n");
      // BUG FIX: previously printed p1 again here, hiding the difference.
      topology_print_places(p2);
      return EXIT_FAILURE;
    }
  }
  return EXIT_SUCCESS;
}
// Determine the granularity requested through KMP_AFFINITY and verify that
// every OpenMP place corresponds to a topology object of that granularity
// detected independently through sysfs.
static int check_places() {
  int status;
  const char *value = getenv("KMP_AFFINITY");
  if (!value) {
    // BUG FIX: the message previously named OMP_PLACES, but this test is
    // driven by KMP_AFFINITY (see the RUN lines above).
    fprintf(stderr, "error: must set KMP_AFFINITY envirable for this test!\n");
    return EXIT_FAILURE;
  }
  place_list_t *places, *openmp_places;
  if (strstr(value, "socket")) {
    places = topology_alloc_type_places(TOPOLOGY_OBJ_SOCKET);
  } else if (strstr(value, "core")) {
    places = topology_alloc_type_places(TOPOLOGY_OBJ_CORE);
  } else if (strstr(value, "thread")) {
    places = topology_alloc_type_places(TOPOLOGY_OBJ_THREAD);
  } else {
    fprintf(
        stderr,
        "error: KMP_AFFINITY granularity must be one of thread,core,socket!\n");
    return EXIT_FAILURE;
  }
  openmp_places = topology_alloc_openmp_places();
  status = compare_places(openmp_places, places);
  topology_free_places(places);
  topology_free_places(openmp_places);
  return status;
}
// Entry point: run the place check only when the initial thread may run on
// every logical processor; otherwise skip (successfully).
int main() {
  if (topology_using_full_mask())
    return check_places();
  printf("Thread does not have access to all logical processors. Skipping "
         "test.\n");
  return EXIT_SUCCESS;
}

View File

@ -0,0 +1,127 @@
// RUN: %libomp-compile -D_GNU_SOURCE
// RUN: env OMP_PLACES=threads %libomp-run
// RUN: env OMP_PLACES=cores %libomp-run
// RUN: env OMP_PLACES=sockets %libomp-run
// REQUIRES: linux
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "libomp_test_affinity.h"
#include "libomp_test_topology.h"
// Check the OpenMP place list against the topology KMP_HW_SUBSET restricted
// us to: 'nsockets' sockets, each with 'ncores_per_socket' cores, each with
// 'nthreads_per_core' threads. 'type' is the granularity of the place list.
// Returns EXIT_SUCCESS or EXIT_FAILURE.
static int compare_hw_subset_places(const place_list_t *openmp_places,
                                    topology_obj_type_t type, int nsockets,
                                    int ncores_per_socket,
                                    int nthreads_per_core) {
  int i, expected_total, expected_per_place;
  if (type == TOPOLOGY_OBJ_THREAD) {
    expected_total = nsockets * ncores_per_socket * nthreads_per_core;
    expected_per_place = 1;
  } else if (type == TOPOLOGY_OBJ_CORE) {
    expected_total = nsockets * ncores_per_socket;
    expected_per_place = nthreads_per_core;
  } else {
    expected_total = nsockets;
    // BUG FIX: a socket place contains every hardware thread in the socket,
    // i.e., cores per socket *times* threads per core, not only the cores.
    expected_per_place = ncores_per_socket * nthreads_per_core;
  }
  if (openmp_places->num_places != expected_total) {
    fprintf(stderr, "error: KMP_HW_SUBSET did not half each resource layer!\n");
    printf("openmp_places places:\n");
    topology_print_places(openmp_places);
    printf("\n");
    return EXIT_FAILURE;
  }
  for (i = 0; i < openmp_places->num_places; ++i) {
    int count = affinity_mask_count(openmp_places->masks[i]);
    if (count != expected_per_place) {
      fprintf(stderr, "error: place %d has %d OS procs instead of %d\n", i,
              count, expected_per_place);
      return EXIT_FAILURE;
    }
  }
  return EXIT_SUCCESS;
}
// Detect the machine's sockets/cores/threads via sysfs, set KMP_HW_SUBSET to
// half of every level, and verify the resulting OpenMP place list matches.
// Skips (EXIT_SUCCESS) on machines where the check is not meaningful.
static int check_places() {
  char buf[100];
  topology_obj_type_t type;
  const char *value;
  int status = EXIT_SUCCESS;
  place_list_t *openmp_places = NULL;
  place_list_t *threads = topology_alloc_type_places(TOPOLOGY_OBJ_THREAD);
  place_list_t *cores = topology_alloc_type_places(TOPOLOGY_OBJ_CORE);
  place_list_t *sockets = topology_alloc_type_places(TOPOLOGY_OBJ_SOCKET);
  // BUG FIX: every early return below previously leaked threads/cores/sockets;
  // all exits now funnel through a single cleanup block.
  if (threads->num_places <= 1) {
    printf("Only one hardware thread to execute on. Skipping test.\n");
    goto cleanup;
  }
  value = getenv("OMP_PLACES");
  if (!value) {
    fprintf(stderr,
            "error: OMP_PLACES must be set to one of threads,cores,sockets!\n");
    status = EXIT_FAILURE;
    goto cleanup;
  }
  if (strcmp(value, "threads") == 0)
    type = TOPOLOGY_OBJ_THREAD;
  else if (strcmp(value, "cores") == 0)
    type = TOPOLOGY_OBJ_CORE;
  else if (strcmp(value, "sockets") == 0)
    type = TOPOLOGY_OBJ_SOCKET;
  else {
    fprintf(stderr,
            "error: OMP_PLACES must be one of threads,cores,sockets!\n");
    status = EXIT_FAILURE;
    goto cleanup;
  }
  // Calculate of num threads per core, num cores per socket, & num sockets
  if (cores->num_places <= 0) {
    printf("Invalid number of cores (%d). Skipping test.\n", cores->num_places);
    goto cleanup;
  } else if (sockets->num_places <= 0) {
    // BUG FIX: this message previously printed cores->num_places.
    printf("Invalid number of sockets (%d). Skipping test.\n",
           sockets->num_places);
    goto cleanup;
  }
  {
    int nthreads_per_core = threads->num_places / cores->num_places;
    int ncores_per_socket = cores->num_places / sockets->num_places;
    int nsockets = sockets->num_places;
    if (nsockets * ncores_per_socket * nthreads_per_core !=
        threads->num_places) {
      printf("Only uniform topologies can be tested. Skipping test.\n");
      goto cleanup;
    }
    // Use half the resources of every level
    if (nthreads_per_core > 1)
      nthreads_per_core /= 2;
    if (ncores_per_socket > 1)
      ncores_per_socket /= 2;
    if (nsockets > 1)
      nsockets /= 2;
    snprintf(buf, sizeof(buf), "%ds,%dc,%dt", nsockets, ncores_per_socket,
             nthreads_per_core);
    setenv("KMP_HW_SUBSET", buf, 1);
    openmp_places = topology_alloc_openmp_places();
    status = compare_hw_subset_places(openmp_places, type, nsockets,
                                      ncores_per_socket, nthreads_per_core);
  }
cleanup:
  topology_free_places(threads);
  topology_free_places(cores);
  topology_free_places(sockets);
  if (openmp_places)
    topology_free_places(openmp_places);
  return status;
}
// Entry point: run the KMP_HW_SUBSET check only when the initial thread may
// run on every logical processor; otherwise skip (successfully).
int main() {
  if (topology_using_full_mask())
    return check_places();
  printf("Thread does not have access to all logical processors. Skipping "
         "test.\n");
  return EXIT_SUCCESS;
}

View File

@ -0,0 +1,231 @@
#ifndef LIBOMP_TEST_TOPOLOGY_H
#define LIBOMP_TEST_TOPOLOGY_H
#include "libomp_test_affinity.h"
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <errno.h>
#include <ctype.h>
#include <omp.h>
// Topology object granularities understood by the helpers in this header,
// ordered finest (thread) to coarsest (socket). TOPOLOGY_OBJ_MAX is a count
// of real object types, not an object type itself.
typedef enum topology_obj_type_t {
  TOPOLOGY_OBJ_THREAD,
  TOPOLOGY_OBJ_CORE,
  TOPOLOGY_OBJ_SOCKET,
  TOPOLOGY_OBJ_MAX
} topology_obj_type_t;

// A list of places: num_places entries, each an affinity mask describing the
// OS processors belonging to one place. Produced by the topology_alloc_*
// functions and released with topology_free_places().
typedef struct place_list_t {
  int num_places; // number of entries in 'masks'
  affinity_mask_t **masks; // heap-allocated array of per-place masks
} place_list_t;
// Return the first character in file 'f' that is not a whitespace character
// including newlines and carriage returns, or EOF when no such character
// remains.
static int get_first_nonspace_from_file(FILE *f) {
  int c;
  do {
    c = fgetc(f);
    // isspace() already classifies '\n' and '\r' as whitespace, so the old
    // explicit checks for them were redundant; EOF is tested first so
    // isspace() never sees it.
  } while (c != EOF && isspace(c));
  return c;
}
// Parse a decimal integer from 'f' into '*number'.
// Returns 1 when an integer was read, 0 when the next input is not an
// integer, and EOF when the end of file has been reached.
static int get_integer_from_file(FILE *f, int *number) {
  int matched = fscanf(f, "%d", number);
  // The EOF indicator takes precedence over a successful match so callers
  // can detect the end of the input stream.
  if (feof(f))
    return EOF;
  return (matched == 1) ? 1 : 0;
}
// Read a siblings list file from Linux /sys/devices/system/cpu/cpu?/topology/*
// The file format is a comma-separated list of CPU numbers and ranges, e.g.,
// "0-3,8,10-11". Each listed CPU is set in a freshly allocated affinity mask.
// Returns the mask on success; on a syntax error the mask is freed and NULL
// is returned. Exits the process if the file cannot be opened.
static affinity_mask_t *topology_get_mask_from_file(const char *filename) {
  int status = EXIT_SUCCESS;
  FILE *f = fopen(filename, "r");
  if (!f) {
    perror(filename);
    exit(EXIT_FAILURE);
  }
  affinity_mask_t *mask = affinity_mask_alloc();
  while (1) {
    int c, i, n, lower, upper;
    // Read the first integer
    // NOTE(review): get_integer_from_file() reports EOF even when an integer
    // was matched right before end-of-file; sysfs files end with a newline,
    // so in practice the last value is never dropped — confirm if reused on
    // other inputs.
    n = get_integer_from_file(f, &lower);
    if (n == EOF) {
      break;
    } else if (n == 0) {
      fprintf(stderr, "syntax error: expected integer\n");
      status = EXIT_FAILURE;
      break;
    }
    // Now either a , or -
    c = get_first_nonspace_from_file(f);
    if (c == EOF || c == ',') {
      // Single CPU number; set it and continue (or stop at end of file).
      affinity_mask_set(mask, lower);
      if (c == EOF)
        break;
    } else if (c == '-') {
      // Range "lower-upper": read the upper bound and set every CPU in it.
      n = get_integer_from_file(f, &upper);
      if (n == EOF || n == 0) {
        fprintf(stderr, "syntax error: expected integer\n");
        status = EXIT_FAILURE;
        break;
      }
      for (i = lower; i <= upper; ++i)
        affinity_mask_set(mask, i);
      // After a range only end-of-file or another comma-separated item is
      // valid.
      c = get_first_nonspace_from_file(f);
      if (c == EOF) {
        break;
      } else if (c == ',') {
        continue;
      } else {
        fprintf(stderr, "syntax error: unexpected character: '%c (%d)'\n", c,
                c);
        status = EXIT_FAILURE;
        break;
      }
    } else {
      fprintf(stderr, "syntax error: unexpected character: '%c (%d)'\n", c, c);
      status = EXIT_FAILURE;
      break;
    }
  }
  fclose(f);
  // On any syntax error return NULL rather than a partially filled mask.
  if (status == EXIT_FAILURE) {
    affinity_mask_free(mask);
    mask = NULL;
  }
  return mask;
}
// Count the logical CPUs exposed by the OS by probing the
// /sys/devices/system/cpu/cpu<N> directories in increasing order until one
// is missing. Always returns at least 1 so callers can size allocations and
// loop bounds safely. ('(void)' added: '()' declared an unprototyped
// function in pre-C23 C.)
static int topology_get_num_cpus(void) {
  char buf[1024];
  // Count the number of cpus
  int cpu = 0;
  while (1) {
    snprintf(buf, sizeof(buf), "/sys/devices/system/cpu/cpu%d", cpu);
    DIR *dir = opendir(buf);
    if (dir) {
      closedir(dir);
      cpu++;
    } else {
      break;
    }
  }
  if (cpu == 0)
    cpu = 1; // e.g., sysfs unavailable; pretend there is one CPU
  return cpu;
}
// Return whether the current thread has access to all logical processors
// (1 when its affinity mask covers every CPU, 0 otherwise).
static int topology_using_full_mask() {
  int cpu;
  int result = 1;
  int num_cpus = topology_get_num_cpus();
  affinity_mask_t *mask = affinity_mask_alloc();
  get_thread_affinity(mask);
  for (cpu = 0; cpu < num_cpus; ++cpu) {
    if (!affinity_mask_isset(mask, cpu)) {
      result = 0;
      break;
    }
  }
  affinity_mask_free(mask);
  return result;
}
// Return array of masks representing OMP_PLACES keyword (e.g., sockets, cores,
// threads), derived from the Linux sysfs topology files. The returned list is
// heap-allocated; release it with topology_free_places(). (Fixed: removed the
// unused local 'num_places'.)
static place_list_t *topology_alloc_type_places(topology_obj_type_t type) {
  char buf[1024];
  int i, cpu, num_unique;
  int num_cpus = topology_get_num_cpus();
  place_list_t *places = (place_list_t *)malloc(sizeof(place_list_t));
  affinity_mask_t **masks =
      (affinity_mask_t **)malloc(sizeof(affinity_mask_t *) * num_cpus);
  num_unique = 0;
  for (cpu = 0; cpu < num_cpus; ++cpu) {
    affinity_mask_t *mask;
    if (type == TOPOLOGY_OBJ_CORE) {
      // thread_siblings_list: all hardware threads sharing this cpu's core
      snprintf(buf, sizeof(buf),
               "/sys/devices/system/cpu/cpu%d/topology/thread_siblings_list",
               cpu);
      mask = topology_get_mask_from_file(buf);
    } else if (type == TOPOLOGY_OBJ_SOCKET) {
      // core_siblings_list: all hardware threads sharing this cpu's socket
      snprintf(buf, sizeof(buf),
               "/sys/devices/system/cpu/cpu%d/topology/core_siblings_list",
               cpu);
      mask = topology_get_mask_from_file(buf);
    } else if (type == TOPOLOGY_OBJ_THREAD) {
      // A thread place is the cpu itself
      mask = affinity_mask_alloc();
      affinity_mask_set(mask, cpu);
    } else {
      fprintf(stderr, "Unknown topology type (%d)\n", (int)type);
      exit(EXIT_FAILURE);
    }
    // Check for unique topology objects above the thread level: multiple
    // cpus in the same core/socket yield identical masks; keep one copy.
    if (type != TOPOLOGY_OBJ_THREAD) {
      for (i = 0; i < num_unique; ++i) {
        if (affinity_mask_equal(masks[i], mask)) {
          affinity_mask_free(mask);
          mask = NULL;
          break;
        }
      }
    }
    if (mask)
      masks[num_unique++] = mask;
  }
  places->num_places = num_unique;
  places->masks = masks;
  return places;
}
// Snapshot the runtime's current place list (via omp_get_num_places() and
// friends) into a place_list_t of affinity masks. The returned list is
// heap-allocated; release it with topology_free_places().
static place_list_t *topology_alloc_openmp_places() {
  int place, i;
  int num_places = omp_get_num_places();
  place_list_t *places = (place_list_t *)malloc(sizeof(place_list_t));
  affinity_mask_t **masks =
      (affinity_mask_t **)malloc(sizeof(affinity_mask_t *) * num_places);
  for (place = 0; place < num_places; ++place) {
    int num_procs = omp_get_place_num_procs(place);
    int *ids = (int *)malloc(sizeof(int) * num_procs);
    omp_get_place_proc_ids(place, ids);
    affinity_mask_t *mask = affinity_mask_alloc();
    for (i = 0; i < num_procs; ++i)
      affinity_mask_set(mask, ids[i]);
    // BUG FIX: 'ids' was leaked once per place.
    free(ids);
    masks[place] = mask;
  }
  places->num_places = num_places;
  places->masks = masks;
  return places;
}
// Release a place list produced by topology_alloc_type_places() or
// topology_alloc_openmp_places(), including every mask it owns.
static void topology_free_places(place_list_t *places) {
  for (int idx = 0; idx < places->num_places; ++idx)
    affinity_mask_free(places->masks[idx]);
  free(places->masks);
  free(places);
}
// Dump each place's affinity mask in human-readable form, one line per place.
static void topology_print_places(const place_list_t *p) {
  char str[1024];
  for (int idx = 0; idx < p->num_places; ++idx) {
    affinity_mask_snprintf(str, sizeof(str), p->masks[idx]);
    printf("Place %d: %s\n", idx, str);
  }
}
#endif

View File

@ -0,0 +1,83 @@
// RUN: %libomp-compile -D_GNU_SOURCE
// RUN: env OMP_PLACES=threads %libomp-run
// RUN: env OMP_PLACES=cores %libomp-run
// RUN: env OMP_PLACES=sockets %libomp-run
// REQUIRES: linux
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "libomp_test_affinity.h"
#include "libomp_test_topology.h"
// Compare place lists. The order is not taken into consideration here.
// The OS detection might have the cores/sockets in a different
// order from the runtime. Returns EXIT_SUCCESS when both lists contain the
// same set of places, EXIT_FAILURE otherwise.
static int compare_places(const place_list_t *p1, const place_list_t *p2) {
  int i, j;
  if (p1->num_places != p2->num_places) {
    fprintf(stderr, "error: places do not have same number of places! (p1 has "
                    "%d, p2 has %d)\n",
            p1->num_places, p2->num_places);
    printf("p1 places:\n");
    topology_print_places(p1);
    printf("\n");
    printf("p2 places:\n");
    // BUG FIX: previously printed p1 again here, hiding the actual content
    // of p2 in the diagnostic output.
    topology_print_places(p2);
    return EXIT_FAILURE;
  }
  for (i = 0; i < p1->num_places; ++i) {
    int found = 0;
    for (j = 0; j < p2->num_places; ++j) {
      if (affinity_mask_equal(p1->masks[i], p2->masks[j])) {
        found = 1;
        break;
      }
    }
    if (!found) {
      printf("Found difference in places!\n");
      printf("p1 places:\n");
      topology_print_places(p1);
      printf("\n");
      printf("p2 places:\n");
      // BUG FIX: previously printed p1 again here as well.
      topology_print_places(p2);
      return EXIT_FAILURE;
    }
  }
  return EXIT_SUCCESS;
}
static int check_places() {
int status;
const char *value = getenv("OMP_PLACES");
if (!value) {
fprintf(stderr, "error: must set OMP_PLACES envirable for this test!\n");
return EXIT_FAILURE;
}
place_list_t *places, *openmp_places;
if (strcmp(value, "sockets") == 0) {
places = topology_alloc_type_places(TOPOLOGY_OBJ_SOCKET);
} else if (strcmp(value, "cores") == 0) {
places = topology_alloc_type_places(TOPOLOGY_OBJ_CORE);
} else if (strcmp(value, "threads") == 0) {
places = topology_alloc_type_places(TOPOLOGY_OBJ_THREAD);
} else {
fprintf(stderr,
"error: OMP_PLACES must be one of threads,cores,sockets!\n");
return EXIT_FAILURE;
}
openmp_places = topology_alloc_openmp_places();
status = compare_places(places, openmp_places);
topology_free_places(places);
topology_free_places(openmp_places);
return status;
}
// Entry point: run the OMP_PLACES check only when the initial thread may run
// on every logical processor; otherwise skip (successfully).
int main() {
  if (topology_using_full_mask())
    return check_places();
  printf("Thread does not have access to all logical processors. Skipping "
         "test.\n");
  return EXIT_SUCCESS;
}