KMP_HW_SUBSET extended with NUMA support when HWLOC enabled

Differential Revision: https://reviews.llvm.org/D31600 llvm-svn: 300220
2017-04-13 17:15:07 +00:00 · 2017-04-13 17:15:07 +00:00 · 4a9a89241b
parent 7840dc8451
commit 4a9a89241b
7 changed files with 828 additions and 374 deletions
--- a/openmp/runtime/src/dllexports
+++ b/openmp/runtime/src/dllexports
@ -351,7 +351,7 @@ kmpc_set_defaults                           224
    %ifdef OMP_30
        __kmpc_omp_taskyield                235
    %endif # OMP_30
-    __kmpc_place_threads                    236
+#    __kmpc_place_threads                    236
 %endif

 # OpenMP 4.0 entry points
--- a/openmp/runtime/src/i18n/en_US.txt
+++ b/openmp/runtime/src/i18n/en_US.txt
@ -38,7 +38,7 @@ Language "English"
 Country  "USA"
 LangId   "1033"
 Version  "2"
-Revision "20160714"
+Revision "20161216"



@ -388,8 +388,8 @@ OBSOLETE                     "%1$s: granularity=core will be used."
 EnvLockWarn                  "%1$s must be set prior to first OMP lock call or critical section; ignored."
 FutexNotSupported            "futex system call not supported; %1$s=%2$s ignored."
 AffGranUsing                 "%1$s: granularity=%2$s will be used."
-AffHWSubsetInvalid           "%1$s: invalid value \"%2$s\", valid format is \"Ns[@N],Nc[@N],Nt "
-                             "(nSockets@offset, nCores@offset, nTthreads per core)\"."
+AffHWSubsetInvalid           "%1$s: invalid value \"%2$s\", valid format is \"N<item>[@N][,...][,Nt] "
+                             "(<item> can be S, N, L2, C, T  for Socket, NUMA Node, L2 Cache, Core, Thread)\"."
 AffHWSubsetUnsupported       "KMP_HW_SUBSET ignored: unsupported architecture."
 AffHWSubsetManyCores         "KMP_HW_SUBSET ignored: too many cores requested."
 SyntaxErrorUsing             "%1$s: syntax error, using %2$s."
@ -411,6 +411,10 @@ AffHwlocErrorOccurred        "%1$s: Hwloc failed in %2$s. Relying on internal af
 EnvSerialWarn                "%1$s must be set prior to OpenMP runtime library initialization; ignored."
 EnvVarDeprecated             "%1$s variable deprecated, please use %2$s instead."
 RedMethodNotSupported        "KMP_FORCE_REDUCTION: %1$s method is not supported; using critical."
+AffHWSubsetNoHWLOC           "KMP_HW_SUBSET ignored: unsupported item requested for non-HWLOC topology method (KMP_TOPOLOGY_METHOD)"
+AffHWSubsetManyNodes         "KMP_HW_SUBSET ignored: too many NUMA Nodes requested."
+AffHWSubsetManyTiles         "KMP_HW_SUBSET ignored: too many L2 Caches requested."
+AffHWSubsetManyProcs         "KMP_HW_SUBSET ignored: too many Procs requested."


 # --------------------------------------------------------------------------------------------------
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@ -774,11 +774,19 @@ typedef enum kmp_cancel_kind_t {
 } kmp_cancel_kind_t;
 #endif // OMP_40_ENABLED

-extern int __kmp_place_num_sockets;
-extern int __kmp_place_socket_offset;
-extern int __kmp_place_num_cores;
-extern int __kmp_place_core_offset;
-extern int __kmp_place_num_threads_per_core;
+// KMP_HW_SUBSET support:
+typedef struct kmp_hws_item { // one KMP_HW_SUBSET request: a count plus an offset
+    int num; // requested number of items; 0 is treated as "not set" and replaced by the available count
+    int offset; // number of leading items skipped before the requested ones
+} kmp_hws_item_t;
+
+extern kmp_hws_item_t __kmp_hws_socket;
+extern kmp_hws_item_t __kmp_hws_node;
+extern kmp_hws_item_t __kmp_hws_tile;
+extern kmp_hws_item_t __kmp_hws_core;
+extern kmp_hws_item_t __kmp_hws_proc;
+extern int __kmp_hws_requested;
+extern int __kmp_hws_abs_flag; // absolute or per-item number requested

 /* ------------------------------------------------------------------------ */
 /* ------------------------------------------------------------------------ */
@ -3494,9 +3502,6 @@ KMP_EXPORT kmp_int32 __kmp_get_reduce_method( void );
 KMP_EXPORT kmp_uint64 __kmpc_get_taskid();
 KMP_EXPORT kmp_uint64 __kmpc_get_parent_taskid();

-// this function exported for testing of KMP_PLACE_THREADS functionality
-KMP_EXPORT void __kmpc_place_threads(int,int,int,int,int);
-
 /* ------------------------------------------------------------------------ */
 /* ------------------------------------------------------------------------ */

--- a/openmp/runtime/src/kmp_affinity.cpp
+++ b/openmp/runtime/src/kmp_affinity.cpp
@ -3405,102 +3405,665 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
 #undef ADD_MASK
 #undef ADD_MASK_OSID

+#if KMP_USE_HWLOC
+static int
+__kmp_hwloc_count_children_by_type( // count objects of the given type in the subtree rooted at o
+    hwloc_topology_t t, hwloc_obj_t o, hwloc_obj_type_t type, hwloc_obj_t* f) // t is only forwarded to the recursive calls; f is an in/out pointer
+{ // returns the number of matches found; o itself counts as one match if its type matches
+    if (!hwloc_compare_types(o->type, type)) { // a zero comparison result is treated as a match
+      if (*f == NULL)
+        *f = o; // output first descendant found; written only while *f is still NULL, so callers reset it to NULL first
+      return 1; // a matching object is not searched any deeper
+    }
+    int sum = 0;
+    for (unsigned i = 0; i < o->arity; i++) // recurse into every child and add up their counts
+      sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f);
+    return sum; // will be 0 if no one found (as PU arity is 0)
+}
+
+static int
+__kmp_hwloc_count_children_by_depth( // count objects located at the given depth in the subtree rooted at o
+    hwloc_topology_t t, hwloc_obj_t o, unsigned depth, hwloc_obj_t* f) // t is only forwarded to the recursive calls; f is an in/out pointer
+{ // returns the number of matches found; o itself counts as one match if it sits at that depth
+    if (o->depth == depth) { // match is decided by depth only, the object type is not inspected
+      if (*f == NULL)
+        *f = o; // output first descendant found; written only while *f is still NULL, so callers reset it to NULL first
+      return 1; // a matching object is not searched any deeper
+    }
+    int sum = 0;
+    for (unsigned i = 0; i < o->arity; i++) // recurse into every child and add up their counts
+      sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f);
+    return sum; // will be 0 if no one found (as PU arity is 0)
+}
+
+static int
+__kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o)
+{ // skip PUs descendants of the object o: clear them from __kmp_affin_fullMask and return how many were cleared
+    int skipped = 0;
+    hwloc_obj_t hT = NULL; // must start as NULL so the counting helper stores the first PU here
+    int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); // N = number of PUs under o, hT = first of them
+    for (int i = 0; i < N; ++i) { // NOTE(review): visits N PUs starting at hT, assumes the PUs of o are consecutive in hwloc's PU order - confirm
+      KMP_DEBUG_ASSERT(hT);
+      unsigned idx = hT->os_index; // the mask is indexed by the PU's os_index
+      if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { // PUs already absent from the mask are not counted
+        KMP_CPU_CLR(idx, __kmp_affin_fullMask);
+        KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
+        ++skipped;
+      }
+      hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); // advance to the next PU object
+    }
+    return skipped; // count number of skipped units
+}
+
+static int
+__kmp_hwloc_obj_has_PUs(hwloc_topology_t t, hwloc_obj_t o)
+{ // check if obj has PUs present in fullMask: returns 1 if at least one PU under o is set in __kmp_affin_fullMask, else 0
+    hwloc_obj_t hT = NULL; // must start as NULL so the counting helper stores the first PU here
+    int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); // N = number of PUs under o, hT = first of them
+    for (int i = 0; i < N; ++i) { // NOTE(review): visits N PUs starting at hT, assumes the PUs of o are consecutive in hwloc's PU order - confirm
+      KMP_DEBUG_ASSERT(hT);
+      unsigned idx = hT->os_index; // the mask is indexed by the PU's os_index
+      if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask))
+        return 1; // found PU, no need to look at the remaining ones
+      hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); // advance to the next PU object
+    }
+    return 0; // no PUs found
+}
+#endif // KMP_USE_HWLOC
+
 static void
 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
 {
-    int i, j, k, n_old = 0, n_new = 0, proc_num = 0;
-    if (__kmp_place_num_sockets == 0 &&
-        __kmp_place_num_cores == 0 &&
-        __kmp_place_num_threads_per_core == 0 )
-        goto _exit;   // no topology limiting actions requested, exit
-    if (__kmp_place_num_sockets == 0)
-        __kmp_place_num_sockets = nPackages;    // use all available sockets
-    if (__kmp_place_num_cores == 0)
-        __kmp_place_num_cores = nCoresPerPkg;   // use all available cores
-    if (__kmp_place_num_threads_per_core == 0 ||
-        __kmp_place_num_threads_per_core > __kmp_nThreadsPerCore)
-        __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts
+    AddrUnsPair *newAddr;
+    if (__kmp_hws_requested == 0)
+      goto _exit;   // no topology limiting actions requested, exit
+#if KMP_USE_HWLOC
+    if (__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
+      // Number of subobjects calculated dynamically, this works fine for
+      // any non-uniform topology.
+      // L2 cache objects are determined by depth, other objects - by type.
+      hwloc_topology_t tp = __kmp_hwloc_topology;
+      int nS=0, nN=0, nL=0, nC=0, nT=0; // logical index including skipped
+      int nCr=0, nTr=0; // number of requested units
+      int nPkg=0, nCo=0, n_new=0, n_old = 0, nCpP=0, nTpC=0; // counters
+      hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
+      int L2depth, idx;

-    if ( !__kmp_affinity_uniform_topology() ) {
+      // check support of extensions ----------------------------------
+      int numa_support = 0, tile_support = 0;
+      if (__kmp_pu_os_idx)
+        hT = hwloc_get_pu_obj_by_os_index(
+          tp, __kmp_pu_os_idx[__kmp_avail_proc - 1]);
+      else
+        hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, __kmp_avail_proc - 1);
+      if (hT == NULL) { // something's gone wrong
+        KMP_WARNING(AffHWSubsetUnsupported);
+        goto _exit;
+      }
+      // check NUMA node
+      hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
+      hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
+      if (hN != NULL && hN->depth > hS->depth) {
+        numa_support = 1; // 1 in case socket includes node(s)
+      } else if (__kmp_hws_node.num > 0) {
+        // don't support sockets inside NUMA node (no such HW found for testing)
+        KMP_WARNING(AffHWSubsetUnsupported);
+        goto _exit;
+      }
+      // check L2 cahce, get object by depth because of multiple caches
+      L2depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
+      hL = hwloc_get_ancestor_obj_by_depth(tp, L2depth, hT);
+      if (hL != NULL && __kmp_hwloc_count_children_by_type(
+          tp, hL, HWLOC_OBJ_CORE, &hC) > 1) {
+        tile_support = 1; // no sense to count L2 if it includes single core
+      } else if (__kmp_hws_tile.num > 0) {
+        if (__kmp_hws_core.num == 0) {
+          __kmp_hws_core = __kmp_hws_tile; // replace L2 with core
+          __kmp_hws_tile.num = 0;
+        } else {
+          // L2 and core are both requested, but represent same object
+          KMP_WARNING(AffHWSubsetInvalid);
+          goto _exit;
+        }
+      }
+      // end of check of extensions -----------------------------------
+
+      // fill in unset items, validate settings -----------------------
+      if (__kmp_hws_socket.num == 0)
+        __kmp_hws_socket.num = nPackages;    // use all available sockets
+      if (__kmp_hws_socket.offset >= nPackages) {
+          KMP_WARNING(AffHWSubsetManySockets);
+          goto _exit;
+      }
+      if (numa_support) {
+        int NN = __kmp_hwloc_count_children_by_type(
+          tp, hS, HWLOC_OBJ_NUMANODE, &hN); // num nodes in socket
+        if (__kmp_hws_node.num == 0)
+          __kmp_hws_node.num = NN; // use all available nodes
+        if (__kmp_hws_node.offset >= NN) {
+          KMP_WARNING(AffHWSubsetManyNodes);
+          goto _exit;
+        }
+        if (tile_support) {
+          // get num tiles in node
+          int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL);
+          if (__kmp_hws_tile.num == 0) {
+            __kmp_hws_tile.num = NL + 1;
+          } // use all available tiles, some node may have more tiles, thus +1
+          if (__kmp_hws_tile.offset >= NL) {
+            KMP_WARNING(AffHWSubsetManyTiles);
+            goto _exit;
+          }
+          int NC = __kmp_hwloc_count_children_by_type(
+            tp, hL, HWLOC_OBJ_CORE, &hC); // num cores in tile
+          if (__kmp_hws_core.num == 0)
+            __kmp_hws_core.num = NC;   // use all available cores
+          if (__kmp_hws_core.offset >= NC) {
+            KMP_WARNING(AffHWSubsetManyCores);
+            goto _exit;
+          }
+        } else { // tile_support
+          int NC = __kmp_hwloc_count_children_by_type(
+            tp, hN, HWLOC_OBJ_CORE, &hC); // num cores in node
+          if (__kmp_hws_core.num == 0)
+            __kmp_hws_core.num = NC;   // use all available cores
+          if (__kmp_hws_core.offset >= NC) {
+            KMP_WARNING(AffHWSubsetManyCores);
+            goto _exit;
+          }
+        } // tile_support
+      } else { // numa_support
+        if (tile_support) {
+          // get num tiles in socket
+          int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL);
+          if (__kmp_hws_tile.num == 0)
+            __kmp_hws_tile.num = NL; // use all available tiles
+          if (__kmp_hws_tile.offset >= NL) {
+            KMP_WARNING(AffHWSubsetManyTiles);
+            goto _exit;
+          }
+          int NC = __kmp_hwloc_count_children_by_type(
+            tp, hL, HWLOC_OBJ_CORE, &hC); // num cores in tile
+          if (__kmp_hws_core.num == 0)
+            __kmp_hws_core.num = NC;   // use all available cores
+          if (__kmp_hws_core.offset >= NC) {
+            KMP_WARNING(AffHWSubsetManyCores);
+            goto _exit;
+          }
+        } else { // tile_support
+          int NC = __kmp_hwloc_count_children_by_type(
+            tp, hS, HWLOC_OBJ_CORE, &hC); // num cores in socket
+          if (__kmp_hws_core.num == 0)
+            __kmp_hws_core.num = NC;   // use all available cores
+          if (__kmp_hws_core.offset >= NC) {
+            KMP_WARNING(AffHWSubsetManyCores);
+            goto _exit;
+          }
+        } // tile_support
+      }
+      if (__kmp_hws_proc.num == 0)
+        __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all available procs
+      if (__kmp_hws_proc.offset >= __kmp_nThreadsPerCore) {
+        KMP_WARNING(AffHWSubsetManyProcs);
+        goto _exit;
+      }
+      // end of validation --------------------------------------------
+
+      if (pAddr) // pAddr is NULL in case of affinity_none
+        newAddr = (AddrUnsPair *)__kmp_allocate(
+          sizeof(AddrUnsPair) * __kmp_avail_proc); // max size
+      // main loop to form HW subset ----------------------------------
+      hS = NULL;
+      int NP = hwloc_get_nbobjs_by_type(tp, HWLOC_OBJ_PACKAGE);
+      for (int s = 0; s < NP; ++s) {
+        // Check Socket -----------------------------------------------
+        hS = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hS);
+        if (!__kmp_hwloc_obj_has_PUs(tp, hS))
+          continue; // skip socket if all PUs are out of fullMask
+        ++nS; // only count objects those have PUs in affinity mask
+        if (nS <= __kmp_hws_socket.offset ||
+            nS > __kmp_hws_socket.num + __kmp_hws_socket.offset) {
+          n_old += __kmp_hwloc_skip_PUs_obj(tp, hS); // skip socket
+          continue; // move to next socket
+        }
+        nCr = 0; // count number of cores per socket
+        // socket requested, go down the topology tree
+        // check 4 cases: (+NUMA+Tile), (+NUMA-Tile), (-NUMA+Tile), (-NUMA-Tile)
+        if (numa_support) {
+          nN = 0;
+          hN = NULL;
+          int NN = __kmp_hwloc_count_children_by_type(
+            tp, hS, HWLOC_OBJ_NUMANODE, &hN); // num nodes in current socket
+          for (int n = 0; n < NN; ++n) {
+            // Check NUMA Node ----------------------------------------
+            if (!__kmp_hwloc_obj_has_PUs(tp, hN)) {
+              hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
+              continue; // skip node if all PUs are out of fullMask
+            }
+            ++nN;
+            if (nN <= __kmp_hws_node.offset ||
+                nN > __kmp_hws_node.num + __kmp_hws_node.offset) {
+              // skip node as not requested
+              n_old += __kmp_hwloc_skip_PUs_obj(tp, hN); // skip node
+              hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
+              continue; // move to next node
+            }
+            // node requested, go down the topology tree
+            if (tile_support) {
+              nL = 0;
+              hL = NULL;
+              int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL);
+              for (int l = 0; l < NL; ++l) {
+                // Check L2 (tile) ------------------------------------
+                if (!__kmp_hwloc_obj_has_PUs(tp, hL)) {
+                  hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
+                  continue; // skip tile if all PUs are out of fullMask
+                }
+                ++nL;
+                if (nL <= __kmp_hws_tile.offset ||
+                    nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) {
+                  // skip tile as not requested
+                  n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile
+                  hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
+                  continue; // move to next tile
+                }
+                // tile requested, go down the topology tree
+                nC = 0;
+                hC = NULL;
+                int NC = __kmp_hwloc_count_children_by_type(
+                  tp, hL, HWLOC_OBJ_CORE, &hC); // num cores in current tile
+                for (int c = 0; c < NC; ++c) {
+                  // Check Core ---------------------------------------
+                  if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
+                    hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+                    continue; // skip core if all PUs are out of fullMask
+                  }
+                  ++nC;
+                  if (nC <= __kmp_hws_core.offset ||
+                      nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
+                    // skip node as not requested
+                    n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
+                    hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+                    continue; // move to next node
+                  }
+                  // core requested, go down to PUs
+                  nT = 0;
+                  nTr = 0;
+                  hT = NULL;
+                  int NT = __kmp_hwloc_count_children_by_type(
+                    tp, hC, HWLOC_OBJ_PU, &hT); // num procs in current core
+                  for (int t = 0; t < NT; ++t) {
+                    // Check PU ---------------------------------------
+                    idx = hT->os_index;
+                    if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
+                      hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+                      continue; // skip PU if not in fullMask
+                    }
+                    ++nT;
+                    if (nT <= __kmp_hws_proc.offset ||
+                        nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
+                      // skip PU
+                      KMP_CPU_CLR(idx, __kmp_affin_fullMask);
+                      ++n_old;
+                      KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
+                      hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+                      continue; // move to next node
+                    }
+                    ++nTr;
+                    if (pAddr) // collect requested thread's data
+                      newAddr[n_new] = (*pAddr)[n_old];
+                    ++n_new;
+                    ++n_old;
+                    hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+                  } // threads loop
+                  if (nTr > 0) {
+                    ++nCr; // num cores per socket
+                    ++nCo; // total num cores
+                    if (nTr > nTpC)
+                      nTpC = nTr; // calc max threads per core
+                  }
+                  hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+                } // cores loop
+                hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
+              } // tiles loop
+            } else { // tile_support
+              // no tiles, check cores
+              nC = 0;
+              hC = NULL;
+              int NC = __kmp_hwloc_count_children_by_type(
+                tp, hN, HWLOC_OBJ_CORE, &hC); // num cores in current node
+              for (int c = 0; c < NC; ++c) {
+                // Check Core ---------------------------------------
+                if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
+                  hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+                  continue; // skip core if all PUs are out of fullMask
+                }
+                ++nC;
+                if (nC <= __kmp_hws_core.offset ||
+                    nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
+                  // skip node as not requested
+                  n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
+                  hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+                  continue; // move to next node
+                }
+                // core requested, go down to PUs
+                nT = 0;
+                nTr = 0;
+                hT = NULL;
+                int NT = __kmp_hwloc_count_children_by_type(
+                  tp, hC, HWLOC_OBJ_PU, &hT);
+                for (int t = 0; t < NT; ++t) {
+                  // Check PU ---------------------------------------
+                  idx = hT->os_index;
+                  if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
+                    hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+                    continue; // skip PU if not in fullMask
+                  }
+                  ++nT;
+                  if (nT <= __kmp_hws_proc.offset ||
+                      nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
+                    // skip PU
+                    KMP_CPU_CLR(idx, __kmp_affin_fullMask);
+                    ++n_old;
+                    KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
+                    hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+                    continue; // move to next node
+                  }
+                  ++nTr;
+                  if (pAddr) // collect requested thread's data
+                    newAddr[n_new] = (*pAddr)[n_old];
+                  ++n_new;
+                  ++n_old;
+                  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+                } // threads loop
+                if (nTr > 0) {
+                  ++nCr; // num cores per socket
+                  ++nCo; // total num cores
+                  if (nTr > nTpC)
+                    nTpC = nTr; // calc max threads per core
+                }
+                hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+              } // cores loop
+            } // tiles support
+            hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
+          } // nodes loop
+        } else { // numa_support
+          // no NUMA support
+          if (tile_support) {
+            nL = 0;
+            hL = NULL;
+            int NL = __kmp_hwloc_count_children_by_depth(
+              tp, hS, L2depth, &hL); // num tiles in current socket
+            for (int l = 0; l < NL; ++l) {
+              // Check L2 (tile) ------------------------------------
+              if (!__kmp_hwloc_obj_has_PUs(tp, hL)) {
+                hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
+                continue; // skip tile if all PUs are out of fullMask
+              }
+              ++nL;
+              if (nL <= __kmp_hws_tile.offset ||
+                  nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) {
+                // skip tile as not requested
+                n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile
+                hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
+                continue; // move to next tile
+              }
+              // tile requested, go down the topology tree
+              nC = 0;
+              hC = NULL;
+              int NC = __kmp_hwloc_count_children_by_type(
+                tp, hL, HWLOC_OBJ_CORE, &hC); // num cores per tile
+              for (int c = 0; c < NC; ++c) {
+                // Check Core ---------------------------------------
+                if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
+                  hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+                  continue; // skip core if all PUs are out of fullMask
+                }
+                ++nC;
+                if (nC <= __kmp_hws_core.offset ||
+                    nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
+                  // skip node as not requested
+                  n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
+                  hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+                  continue; // move to next node
+                }
+                // core requested, go down to PUs
+                nT = 0;
+                nTr = 0;
+                hT = NULL;
+                int NT = __kmp_hwloc_count_children_by_type(
+                  tp, hC, HWLOC_OBJ_PU, &hT); // num procs per core
+                for (int t = 0; t < NT; ++t) {
+                  // Check PU ---------------------------------------
+                  idx = hT->os_index;
+                  if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
+                    hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+                    continue; // skip PU if not in fullMask
+                  }
+                  ++nT;
+                  if (nT <= __kmp_hws_proc.offset ||
+                      nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
+                    // skip PU
+                    KMP_CPU_CLR(idx, __kmp_affin_fullMask);
+                    ++n_old;
+                    KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
+                    hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+                    continue; // move to next node
+                  }
+                  ++nTr;
+                  if (pAddr) // collect requested thread's data
+                    newAddr[n_new] = (*pAddr)[n_old];
+                  ++n_new;
+                  ++n_old;
+                  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+                } // threads loop
+                if (nTr > 0) {
+                  ++nCr; // num cores per socket
+                  ++nCo; // total num cores
+                  if (nTr > nTpC)
+                    nTpC = nTr; // calc max threads per core
+                }
+                hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+              } // cores loop
+              hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
+            } // tiles loop
+          } else { // tile_support
+            // no tiles, check cores
+            nC = 0;
+            hC = NULL;
+            int NC = __kmp_hwloc_count_children_by_type(
+              tp, hS, HWLOC_OBJ_CORE, &hC); // num cores in socket
+            for (int c = 0; c < NC; ++c) {
+              // Check Core -------------------------------------------
+              if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
+                hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+                continue; // skip core if all PUs are out of fullMask
+              }
+              ++nC;
+              if (nC <= __kmp_hws_core.offset ||
+                  nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
+                // skip node as not requested
+                n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
+                hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+                continue; // move to next node
+              }
+              // core requested, go down to PUs
+              nT = 0;
+              nTr = 0;
+              hT = NULL;
+              int NT = __kmp_hwloc_count_children_by_type(
+                tp, hC, HWLOC_OBJ_PU, &hT); // num procs per core
+              for (int t = 0; t < NT; ++t) {
+                // Check PU ---------------------------------------
+                idx = hT->os_index;
+                if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
+                  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+                  continue; // skip PU if not in fullMask
+                }
+                ++nT;
+                if (nT <= __kmp_hws_proc.offset ||
+                    nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
+                  // skip PU
+                  KMP_CPU_CLR(idx, __kmp_affin_fullMask);
+                  ++n_old;
+                  KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
+                  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+                  continue; // move to next node
+                }
+                ++nTr;
+                if (pAddr) // collect requested thread's data
+                  newAddr[n_new] = (*pAddr)[n_old];
+                ++n_new;
+                ++n_old;
+                hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+              } // threads loop
+              if (nTr > 0) {
+                ++nCr; // num cores per socket
+                ++nCo; // total num cores
+                if (nTr > nTpC)
+                  nTpC = nTr; // calc max threads per core
+              }
+              hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+            } // cores loop
+          } // tiles support
+        } // numa_support
+        if (nCr > 0) { // found cores?
+          ++nPkg; // num sockets
+          if (nCr > nCpP)
+            nCpP = nCr; // calc max cores per socket
+        }
+      } // sockets loop
+
+      // check the subset is valid
+      KMP_DEBUG_ASSERT(n_old == __kmp_avail_proc);
+      KMP_DEBUG_ASSERT(nPkg > 0);
+      KMP_DEBUG_ASSERT(nCpP > 0);
+      KMP_DEBUG_ASSERT(nTpC > 0);
+      KMP_DEBUG_ASSERT(nCo > 0);
+      KMP_DEBUG_ASSERT(nPkg <= nPackages);
+      KMP_DEBUG_ASSERT(nCpP <= nCoresPerPkg);
+      KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore);
+      KMP_DEBUG_ASSERT(nCo <= __kmp_ncores);
+
+      nPackages = nPkg;             // correct num sockets
+      nCoresPerPkg = nCpP;          // correct num cores per socket
+      __kmp_nThreadsPerCore = nTpC; // correct num threads per core
+      __kmp_avail_proc = n_new;     // correct num procs
+      __kmp_ncores = nCo;           // correct num cores
+      // hwloc topology method end
+    } else
+#endif // KMP_USE_HWLOC
+    {
+      int n_old = 0, n_new = 0, proc_num = 0;
+      if (__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0) {
+        KMP_WARNING(AffHWSubsetNoHWLOC);
+        goto _exit;
+      }
+      if (__kmp_hws_socket.num == 0)
+        __kmp_hws_socket.num = nPackages;    // use all available sockets
+      if (__kmp_hws_core.num == 0)
+        __kmp_hws_core.num = nCoresPerPkg;   // use all available cores
+      if (__kmp_hws_proc.num == 0 ||
+        __kmp_hws_proc.num > __kmp_nThreadsPerCore)
+        __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts
+      if ( !__kmp_affinity_uniform_topology() ) {
        KMP_WARNING( AffHWSubsetNonUniform );
        goto _exit; // don't support non-uniform topology
-    }
-    if ( depth > 3 ) {
+      }
+      if ( depth > 3 ) {
        KMP_WARNING( AffHWSubsetNonThreeLevel );
        goto _exit; // don't support not-3-level topology
-    }
-    if (__kmp_place_socket_offset + __kmp_place_num_sockets > nPackages) {
+      }
+      if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) {
        KMP_WARNING(AffHWSubsetManySockets);
        goto _exit;
-    }
-    if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
+      }
+      if ( __kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg ) {
        KMP_WARNING( AffHWSubsetManyCores );
        goto _exit;
-    }
-
-    AddrUnsPair *newAddr;
-    if (pAddr) // pAddr is NULL in case of affinity_none
+      }
+      // Form the requested subset
+      if (pAddr) // pAddr is NULL in case of affinity_none
        newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
-            __kmp_place_num_sockets * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
-
-    for (i = 0; i < nPackages; ++i) {
-        if (i < __kmp_place_socket_offset ||
-            i >= __kmp_place_socket_offset + __kmp_place_num_sockets) {
-            n_old += nCoresPerPkg * __kmp_nThreadsPerCore; // skip not-requested socket
-            if (__kmp_pu_os_idx != NULL) {
-                for (j = 0; j < nCoresPerPkg; ++j) { // walk through skipped socket
-                    for (k = 0; k < __kmp_nThreadsPerCore; ++k) {
-                        KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
-                        ++proc_num;
-                    }
-                }
+          __kmp_hws_socket.num * __kmp_hws_core.num * __kmp_hws_proc.num);
+      for (int i = 0; i < nPackages; ++i) {
+        if (i < __kmp_hws_socket.offset ||
+            i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) {
+          // skip not-requested socket
+          n_old += nCoresPerPkg * __kmp_nThreadsPerCore;
+          if (__kmp_pu_os_idx != NULL) {
+            // walk through skipped socket
+            for (int j = 0; j < nCoresPerPkg; ++j) {
+              for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
+                KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
+                ++proc_num;
+              }
            }
+          }
        } else {
-            for (j = 0; j < nCoresPerPkg; ++j) { // walk through requested socket
-                if (j < __kmp_place_core_offset ||
-                    j >= __kmp_place_core_offset + __kmp_place_num_cores) {
-                    n_old += __kmp_nThreadsPerCore; // skip not-requested core
-                    if (__kmp_pu_os_idx != NULL) {
-                        for (k = 0; k < __kmp_nThreadsPerCore; ++k) { // walk through skipped core
-                            KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
-                            ++proc_num;
-                        }
-                    }
-                } else {
-                    for (k = 0; k < __kmp_nThreadsPerCore; ++k) { // walk through requested core
-                        if (k < __kmp_place_num_threads_per_core) {
-                            if (pAddr)
-                                newAddr[n_new] = (*pAddr)[n_old]; // collect requested thread's data
-                            n_new++;
-                        } else {
-                            if (__kmp_pu_os_idx != NULL)
-                                KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
-                        }
-                        n_old++;
-                        ++proc_num;
-                    }
+          // walk through requested socket
+          for (int j = 0; j < nCoresPerPkg; ++j) {
+            if (j < __kmp_hws_core.offset ||
+                j >= __kmp_hws_core.offset + __kmp_hws_core.num)
+            { // skip not-requested core
+              n_old += __kmp_nThreadsPerCore;
+              if (__kmp_pu_os_idx != NULL) {
+                for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
+                  KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
+                  ++proc_num;
                }
+              }
+            } else {
+              // walk through requested core
+              for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
+                if (k < __kmp_hws_proc.num) {
+                  if (pAddr) // collect requested thread's data
+                    newAddr[n_new] = (*pAddr)[n_old];
+                  n_new++;
+                } else {
+                  if (__kmp_pu_os_idx != NULL)
+                    KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
+                }
+                n_old++;
+                ++proc_num;
+              }
            }
+          }
        }
-    }
-    KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore);
-    KMP_DEBUG_ASSERT(n_new == __kmp_place_num_sockets * __kmp_place_num_cores *
-                     __kmp_place_num_threads_per_core);
-
-    nPackages = __kmp_place_num_sockets;                      // correct nPackages
-    nCoresPerPkg = __kmp_place_num_cores;                     // correct nCoresPerPkg
-    __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
-    __kmp_avail_proc = n_new;                                 // correct avail_proc
-    __kmp_ncores = nPackages * __kmp_place_num_cores;         // correct ncores
-
+      }
+      KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore);
+      KMP_DEBUG_ASSERT(n_new == __kmp_hws_socket.num * __kmp_hws_core.num *
+                       __kmp_hws_proc.num);
+      nPackages = __kmp_hws_socket.num;           // correct nPackages
+      nCoresPerPkg = __kmp_hws_core.num;          // correct nCoresPerPkg
+      __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore
+      __kmp_avail_proc = n_new;                   // correct avail_proc
+      __kmp_ncores = nPackages * __kmp_hws_core.num; // correct ncores
+    } // non-hwloc topology method
    if (pAddr) {
-        __kmp_free( *pAddr );
-        *pAddr = newAddr;      // replace old topology with new one
+      __kmp_free( *pAddr );
+      *pAddr = newAddr;      // replace old topology with new one
+    }
+    if (__kmp_affinity_verbose) {
+      char m[KMP_AFFIN_MASK_PRINT_LEN];
+      __kmp_affinity_print_mask(m,KMP_AFFIN_MASK_PRINT_LEN,__kmp_affin_fullMask);
+      if (__kmp_affinity_respect_mask) {
+        KMP_INFORM(InitOSProcSetRespect, "KMP_HW_SUBSET", m);
+      } else {
+        KMP_INFORM(InitOSProcSetNotRespect, "KMP_HW_SUBSET", m);
+      }
+      KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc);
+      kmp_str_buf_t buf;
+      __kmp_str_buf_init(&buf);
+      __kmp_str_buf_print(&buf, "%d", nPackages);
+      KMP_INFORM(TopologyExtra, "KMP_HW_SUBSET", buf.str, nCoresPerPkg,
+        __kmp_nThreadsPerCore, __kmp_ncores);
+      __kmp_str_buf_free(&buf);
    }
 _exit:
    if (__kmp_pu_os_idx != NULL) {
-        __kmp_free(__kmp_pu_os_idx);
-        __kmp_pu_os_idx = NULL;
+      __kmp_free(__kmp_pu_os_idx);
+      __kmp_pu_os_idx = NULL;
    }
 }

--- a/openmp/runtime/src/kmp_csupport.cpp
+++ b/openmp/runtime/src/kmp_csupport.cpp
@ -3038,18 +3038,6 @@ __kmpc_get_parent_taskid() {

 } // __kmpc_get_parent_taskid

-void __kmpc_place_threads(int nS, int sO, int nC, int cO, int nT)
-{
-    if ( ! __kmp_init_serial ) {
-        __kmp_serial_initialize();
-    }
-    __kmp_place_num_sockets = nS;
-    __kmp_place_socket_offset = sO;
-    __kmp_place_num_cores = nC;
-    __kmp_place_core_offset = cO;
-    __kmp_place_num_threads_per_core = nT;
-}
-
 #if OMP_45_ENABLED
 /*!
@ingroup WORK_SHARING
--- a/openmp/runtime/src/kmp_global.cpp
+++ b/openmp/runtime/src/kmp_global.cpp
@ -264,11 +264,13 @@ kmp_nested_proc_bind_t __kmp_nested_proc_bind = { NULL, 0, 0 };
 int __kmp_affinity_num_places = 0;
 #endif

-int __kmp_place_num_sockets = 0;
-int __kmp_place_socket_offset = 0;
-int __kmp_place_num_cores = 0;
-int __kmp_place_core_offset = 0;
-int __kmp_place_num_threads_per_core = 0;
+kmp_hws_item_t __kmp_hws_socket = {0, 0}; // KMP_HW_SUBSET socket item ("S", also "L3"): requested num and offset
+kmp_hws_item_t __kmp_hws_node = {0, 0}; // KMP_HW_SUBSET NUMA node item ("N"): requested num and offset
+kmp_hws_item_t __kmp_hws_tile = {0, 0}; // KMP_HW_SUBSET tile item ("L2"): requested num and offset
+kmp_hws_item_t __kmp_hws_core = {0, 0}; // KMP_HW_SUBSET core item ("C", also "L1"): requested num and offset
+kmp_hws_item_t __kmp_hws_proc = {0, 0}; // KMP_HW_SUBSET thread item ("T"): requested num and offset
+int __kmp_hws_requested = 0; // set to 1 once a non-empty KMP_HW_SUBSET value is accepted for parsing, reset to 0 on a parse error
+int __kmp_hws_abs_flag = 0; // absolute or per-item number requested; set to 1 when the value starts with ":"

 #if OMP_40_ENABLED
 kmp_int32 __kmp_default_device = 0;
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@ -24,6 +24,7 @@
 #include "kmp_lock.h"
 #include "kmp_io.h"
 #include "kmp_affinity.h"
+#include <ctype.h>   // toupper()

 static int __kmp_env_toPrint( char const * name, int flag );

@ -3108,6 +3109,12 @@ __kmp_stg_print_topology_method( kmp_str_buf_t * buffer, char const * name,
        break;
 #  endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

+# if KMP_USE_HWLOC
+        case affinity_top_method_hwloc:
+        value = "hwloc";
+        break;
+# endif
+
        case affinity_top_method_cpuinfo:
        value = "cpuinfo";
        break;
@ -4297,275 +4304,152 @@ __kmp_stg_print_speculative_statsfile( kmp_str_buf_t * buffer, char const * name
 // KMP_HW_SUBSET (was KMP_PLACE_THREADS)
 // -------------------------------------------------------------------------------------------------

+// The longest observable sequence of items is
+// Socket-Node-Tile-Core-Thread
+// So, let's limit to 5 levels for now
+// The input string is usually short enough, let's use 512 limit for now
+#define MAX_T_LEVEL 5
+#define MAX_STR_LEN 512
 static void
 __kmp_stg_parse_hw_subset( char const * name, char const * value, void * data ) {
-    // Value example: 5Cx2Tx15O
-    // Which means "use 5 cores with offset 15, 2 threads per core"
-    // AC: extended to sockets level, examples of
-    //     "use 2 sockets with offset 6, 2 cores with offset 2 per socket, 2 threads per core":
-    //     2s,6o,2c,2o,2t; 2s,6o,2c,2t,2o; 2s@6,2c@2,2t
-    //     To not break legacy code core-offset can be last;
-    //     postfix "o" or prefix @ can be offset designator.
-    // Note: not all syntax errors are analyzed, some may be skipped.
-#define CHECK_DELIM(_x)   (*(_x) == ',' || *(_x) == 'x')
-    static int parsed = 0;
-    int         num;
-    int single_warning = 0;
-    int flagS = 0, flagC = 0, flagT = 0, flagSO = 0, flagCO = 0;
-    const char *next = value;
-    const char *prev;
-
-    if( strcmp(name, "KMP_PLACE_THREADS") == 0 ) {
-        KMP_INFORM(EnvVarDeprecated,name,"KMP_HW_SUBSET");
-        if( parsed == 1 ) {
-            return; // already parsed KMP_HW_SUBSET
-        }
+  // Value example: 1s,5c@3,2T
+  // Which means "use 1 socket, 5 cores with offset 3, 2 threads per core"
+  static int parsed = 0;
+  if( strcmp(name, "KMP_PLACE_THREADS") == 0 ) {
+    KMP_INFORM(EnvVarDeprecated,name,"KMP_HW_SUBSET");
+    if( parsed == 1 ) {
+      return; // already parsed KMP_HW_SUBSET
    }
-    parsed = 1;
+  }
+  parsed = 1;

-    SKIP_WS(next);  // skip white spaces
-    if (*next == '\0')
-        return;   // no data provided, retain default values
-    if( strcmp(name, "KMP_PLACE_THREADS") == 0 ) {
-        KMP_INFORM(EnvVarDeprecated,name,"KMP_HW_SUBSET");
-        if( parsed == 1 ) {
-            return; // already parsed KMP_HW_SUBSET
-        }
+  char *components[MAX_T_LEVEL];
+  char const *digits = "0123456789";
+  char input[MAX_STR_LEN];
+  size_t len = 0, mlen = MAX_STR_LEN;
+  int level = 0;
+  // Canonize the string (remove spaces, unify delimiters, etc.)
+  char *pos = (char *)value;
+  while (*pos && mlen) {
+    if (*pos != ' ') { // skip spaces
+      if (len == 0 && *pos == ':') {
+        __kmp_hws_abs_flag = 1; // if the first symbol is ":", skip it
+      } else {
+        // <ctype.h> functions require a value representable as unsigned char;
+        // passing a plain (possibly negative) char is undefined behavior.
+        input[len] = (char)toupper((unsigned char)*pos);
+        if (input[len] == 'X')
+          input[len] = ','; // unify delimiters of levels
+        // Use isdigit() rather than strchr(digits, c): strchr also matches the
+        // terminating '\0', which turned a trailing 'O' into an offset marker.
+        if (input[len] == 'O' && isdigit((unsigned char)*(pos + 1)))
+          input[len] = '@'; // unify delimiters of offset
+        len++;
+      }
    }
-    parsed = 1;
-
-    SKIP_WS(next);  // skip white spaces
-    if (*next == '\0')
-        return;   // no data provided, retain default values
-    // Get num_sockets first (or whatever specified)
-    if (*next >= '0' && *next <= '9') {
-        prev = next;
-        SKIP_DIGITS(next);
-        num = __kmp_str_to_int(prev, *next);
-        SKIP_WS(next);
-        if (*next == 's' || *next == 'S') {  // e.g. "2s"
-            __kmp_place_num_sockets = num;
-            flagS = 1; // got num sockets
-            next++;
-            if (*next == '@') { // socket offset, e.g. "2s@4"
-                flagSO = 1;
-                prev = ++next;  // don't allow spaces for simplicity
-                if (!(*next >= '0' && *next <= '9')) {
-                    KMP_WARNING(AffHWSubsetInvalid, name, value);
-                    return;
-                }
-                SKIP_DIGITS(next);
-                num = __kmp_str_to_int(prev, *next);
-                __kmp_place_socket_offset = num;
-            }
-        } else if (*next == 'c' || *next == 'C') {
-            __kmp_place_num_cores = num;
-            flagS = flagC = 1; // sockets were not specified - use default
-            next++;
-            if (*next == '@') { // core offset, e.g. "2c@6"
-                flagCO = 1;
-                prev = ++next;  // don't allow spaces for simplicity
-                if (!(*next >= '0' && *next <= '9')) {
-                    KMP_WARNING(AffHWSubsetInvalid, name, value);
-                    return;
-                }
-                SKIP_DIGITS(next);
-                num = __kmp_str_to_int(prev, *next);
-                __kmp_place_core_offset = num;
-            }
-        } else if (CHECK_DELIM(next)) {
-            __kmp_place_num_cores = num; // no letter-designator - num cores
-            flagS = flagC = 1; // sockets were not specified - use default
-            next++;
-        } else if (*next == 't' || *next == 'T') {
-            __kmp_place_num_threads_per_core = num;
-            // sockets, cores were not specified - use default
-            return;   // we ignore offset value in case all cores are used
-        } else if (*next == '\0') {
-            __kmp_place_num_cores = num;
-            return;   // the only value provided - set num cores
+    mlen--;
+    pos++;
+  }
+  if (len == 0 || mlen == 0)
+    goto err; // contents is either empty or too long
+  input[len] = '\0';
+  __kmp_hws_requested = 1; // mark that subset requested
+  // Split by delimiter
+  pos = input;
+  components[level++] = pos;
+  while ((pos = strchr(pos, ','))) {
+    // Check the bound BEFORE storing: components[] holds MAX_T_LEVEL entries,
+    // so one more component would be written past the end of the array.
+    if (level >= MAX_T_LEVEL)
+      goto err; // too many components provided
+    *pos = '\0'; // modify input and avoid more copying
+    components[level++] = ++pos; // expect something after ","
+  }
+  // Check each component
+  for (int i = 0; i < level; ++i) {
+    int offset = 0;
+    int num = atoi(components[i]); // each component should start with a number
+    if ((pos = strchr(components[i], '@'))) {
+      // The offset must be an unsigned decimal number: reject an empty,
+      // non-numeric or negative offset instead of silently using atoi()'s result.
+      if (!isdigit((unsigned char)*(pos + 1)))
+        goto err;
+      offset = atoi(pos + 1); // save offset
+      *pos = '\0'; // cut the offset from the component
+    }
+    pos = components[i] + strspn(components[i], digits);
+    if (pos == components[i])
+      goto err; // no leading number in the component
+    // detect the component type
+    switch (*pos) {
+    case 'S': // Socket
+      if (__kmp_hws_socket.num > 0)
+        goto err; // duplicate is not allowed
+      __kmp_hws_socket.num = num;
+      __kmp_hws_socket.offset = offset;
+      break;
+    case 'N': // NUMA Node
+      if (__kmp_hws_node.num > 0)
+        goto err; // duplicate is not allowed
+      __kmp_hws_node.num = num;
+      __kmp_hws_node.offset = offset;
+      break;
+    case 'L': // Cache
+      if (*(pos + 1) == '2') { // L2 - Tile
+        if (__kmp_hws_tile.num > 0)
+          goto err; // duplicate is not allowed
+        __kmp_hws_tile.num = num;
+        __kmp_hws_tile.offset = offset;
+      } else if (*(pos + 1) == '3') { // L3 - Socket
+        if (__kmp_hws_socket.num > 0)
+          goto err; // duplicate is not allowed
+        __kmp_hws_socket.num = num;
+        __kmp_hws_socket.offset = offset;
+      } else if (*(pos + 1) == '1') { // L1 - Core
+        if (__kmp_hws_core.num > 0)
+          goto err; // duplicate is not allowed
+        __kmp_hws_core.num = num;
+        __kmp_hws_core.offset = offset;
+      } else {
+        // Unknown cache level: report it instead of silently ignoring the
+        // component (matches the handling of the "CA..." cache spelling).
+        goto err;
+      }
+      break;
+    case 'C': // Core (or Cache?)
+      if (*(pos + 1) != 'A') {
+        if (__kmp_hws_core.num > 0)
+          goto err; // duplicate is not allowed
+        __kmp_hws_core.num = num;
+        __kmp_hws_core.offset = offset;
+      } else { // Cache
+        char *d = pos + strcspn(pos, digits); // find digit
+        if (*d == '2') { // L2 - Tile
+          if (__kmp_hws_tile.num > 0)
+            goto err; // duplicate is not allowed
+          __kmp_hws_tile.num = num;
+          __kmp_hws_tile.offset = offset;
+        } else if (*d == '3') { // L3 - Socket
+          if (__kmp_hws_socket.num > 0)
+            goto err; // duplicate is not allowed
+          __kmp_hws_socket.num = num;
+          __kmp_hws_socket.offset = offset;
+        } else if (*d == '1') { // L1 - Core
+          if (__kmp_hws_core.num > 0)
+            goto err; // duplicate is not allowed
+          __kmp_hws_core.num = num;
+          __kmp_hws_core.offset = offset;
        } else {
-            KMP_WARNING(AffHWSubsetInvalid, name, value);
-            return;
+          goto err;
        }
-    } else {
-        KMP_WARNING(AffHWSubsetInvalid, name, value);
-        return;
+      }
+      break;
+    case 'T': // Thread
+      if (__kmp_hws_proc.num > 0)
+        goto err; // duplicate is not allowed
+      __kmp_hws_proc.num = num;
+      __kmp_hws_proc.offset = offset;
+      break;
+    default:
+      goto err;
    }
-    KMP_DEBUG_ASSERT(flagS); // num sockets should already be set here
-    SKIP_WS(next);
-    if (*next == '\0')
-        return;   // " n  " - something like this
-    if (CHECK_DELIM(next)) {
-        next++;   // skip delimiter
-        SKIP_WS(next);
-    }
-
-    // Get second value (could be offset, num_cores, num_threads)
-    if (*next >= '0' && *next <= '9') {
-        prev = next;
-        SKIP_DIGITS(next);
-        num = __kmp_str_to_int(prev, *next);
-        SKIP_WS(next);
-        if (*next == 'c' || *next == 'C') {
-            KMP_DEBUG_ASSERT(flagC == 0);
-            __kmp_place_num_cores = num;
-            flagC = 1;
-            next++;
-            if (*next == '@') { // core offset, e.g. "2c@6"
-                flagCO = 1;
-                prev = ++next;  // don't allow spaces for simplicity
-                if (!(*next >= '0' && *next <= '9')) {
-                    KMP_WARNING(AffHWSubsetInvalid, name, value);
-                    return;
-                }
-                SKIP_DIGITS(next);
-                num = __kmp_str_to_int(prev, *next);
-                __kmp_place_core_offset = num;
-            }
-        } else if (*next == 'o' || *next == 'O') { // offset specified
-            KMP_WARNING(AffHWSubsetDeprecated);
-            single_warning = 1;
-            if (flagC) { // whether num_cores already specified (sockets skipped)
-                KMP_DEBUG_ASSERT(!flagCO); // either "o" or @, not both
-                __kmp_place_core_offset = num;
-            } else {
-                KMP_DEBUG_ASSERT(!flagSO); // either "o" or @, not both
-                __kmp_place_socket_offset = num;
-            }
-            next++;
-        } else if (*next == 't' || *next == 'T') {
-            KMP_DEBUG_ASSERT(flagT == 0);
-            __kmp_place_num_threads_per_core = num;
-            flagC = 1; // num_cores could be skipped ?
-            flagT = 1;
-            next++; // can have core-offset specified after num threads
-        } else if (*next == '\0') {
-            KMP_DEBUG_ASSERT(flagC); // 4x2 means 4 cores 2 threads per core
-            __kmp_place_num_threads_per_core = num;
-            return;   // two values provided without letter-designator
-        } else {
-            KMP_WARNING(AffHWSubsetInvalid, name, value);
-            return;
-        }
-    } else {
-        KMP_WARNING(AffHWSubsetInvalid, name, value);
-        return;
-    }
-    SKIP_WS(next);
-    if (*next == '\0')
-        return;   // " Ns,Nc  " - something like this
-    if (CHECK_DELIM(next)) {
-        next++;   // skip delimiter
-        SKIP_WS(next);
-    }
-
-    // Get third value (could be core-offset, num_cores, num_threads)
-    if (*next >= '0' && *next <= '9') {
-        prev = next;
-        SKIP_DIGITS(next);
-        num = __kmp_str_to_int(prev, *next);
-        SKIP_WS(next);
-        if (*next == 't' || *next == 'T') {
-            KMP_DEBUG_ASSERT(flagT == 0);
-            __kmp_place_num_threads_per_core = num;
-            if (flagC == 0)
-                return; // num_cores could be skipped (e.g. 2s,4o,2t)
-            flagT = 1;
-            next++; // can have core-offset specified later (e.g. 2s,1c,2t,3o)
-        } else if (*next == 'c' || *next == 'C') {
-            KMP_DEBUG_ASSERT(flagC == 0);
-            __kmp_place_num_cores = num;
-            flagC = 1;
-            next++;
-            //KMP_DEBUG_ASSERT(*next != '@'); // socket offset used "o" designator
-        } else if (*next == 'o' || *next == 'O') {
-            KMP_WARNING(AffHWSubsetDeprecated);
-            single_warning = 1;
-            KMP_DEBUG_ASSERT(flagC);
-            //KMP_DEBUG_ASSERT(!flagSO); // socket offset couldn't use @ designator
-            __kmp_place_core_offset = num;
-            next++;
-        } else {
-            KMP_WARNING(AffHWSubsetInvalid, name, value);
-            return;
-        }
-    } else {
-        KMP_WARNING(AffHWSubsetInvalid, name, value);
-        return;
-    }
-    KMP_DEBUG_ASSERT(flagC);
-    SKIP_WS(next);
-    if ( *next == '\0' )
-            return;
-    if (CHECK_DELIM(next)) {
-        next++;   // skip delimiter
-        SKIP_WS(next);
-    }
-
-    // Get 4-th value (could be core-offset, num_threads)
-    if (*next >= '0' && *next <= '9') {
-        prev = next;
-        SKIP_DIGITS(next);
-        num = __kmp_str_to_int(prev, *next);
-        SKIP_WS(next);
-        if (*next == 'o' || *next == 'O') {
-            if (!single_warning) { // warn once
-                KMP_WARNING(AffHWSubsetDeprecated);
-            }
-            KMP_DEBUG_ASSERT(!flagSO); // socket offset couldn't use @ designator
-            __kmp_place_core_offset = num;
-            next++;
-        } else if (*next == 't' || *next == 'T') {
-            KMP_DEBUG_ASSERT(flagT == 0);
-            __kmp_place_num_threads_per_core = num;
-            flagT = 1;
-            next++; // can have core-offset specified after num threads
-        } else {
-            KMP_WARNING(AffHWSubsetInvalid, name, value);
-            return;
-        }
-    } else {
-        KMP_WARNING(AffHWSubsetInvalid, name, value);
-        return;
-    }
-    SKIP_WS(next);
-    if ( *next == '\0' )
-        return;
-    if (CHECK_DELIM(next)) {
-        next++;   // skip delimiter
-        SKIP_WS(next);
-    }
-
-    // Get 5-th value (could be core-offset, num_threads)
-    if (*next >= '0' && *next <= '9') {
-        prev = next;
-        SKIP_DIGITS(next);
-        num = __kmp_str_to_int(prev, *next);
-        SKIP_WS(next);
-        if (*next == 'o' || *next == 'O') {
-            if (!single_warning) { // warn once
-                KMP_WARNING(AffHWSubsetDeprecated);
-            }
-            KMP_DEBUG_ASSERT(flagT);
-            KMP_DEBUG_ASSERT(!flagSO); // socket offset couldn't use @ designator
-            __kmp_place_core_offset = num;
-        } else if (*next == 't' || *next == 'T') {
-            KMP_DEBUG_ASSERT(flagT == 0);
-            __kmp_place_num_threads_per_core = num;
-        } else {
-            KMP_WARNING(AffHWSubsetInvalid, name, value);
-        }
-    } else {
-        KMP_WARNING(AffHWSubsetInvalid, name, value);
-    }
-    return;
-#undef CHECK_DELIM
+  }
+  return;
+err:
+  KMP_WARNING(AffHWSubsetInvalid, name, value);
+  __kmp_hws_requested = 0; // mark that subset not requested
+  return;
 }

 static void
 __kmp_stg_print_hw_subset( kmp_str_buf_t * buffer, char const * name, void * data ) {
-    if (__kmp_place_num_sockets + __kmp_place_num_cores + __kmp_place_num_threads_per_core) {
+    if (__kmp_hws_requested) {
        int comma = 0;
        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);
@ -4573,26 +4457,34 @@ __kmp_stg_print_hw_subset( kmp_str_buf_t * buffer, char const * name, void * dat
            KMP_STR_BUF_PRINT_NAME_EX(name);
        else
            __kmp_str_buf_print(buffer, "   %s='", name);
-        if (__kmp_place_num_sockets) {
-            __kmp_str_buf_print(&buf, "%ds", __kmp_place_num_sockets);
-            if (__kmp_place_socket_offset)
-                __kmp_str_buf_print(&buf, "@%d", __kmp_place_socket_offset);
+        if (__kmp_hws_socket.num) {
+            __kmp_str_buf_print(&buf, "%ds", __kmp_hws_socket.num);
+            if (__kmp_hws_socket.offset)
+                __kmp_str_buf_print(&buf, "@%d", __kmp_hws_socket.offset);
            comma = 1;
        }
-        if (__kmp_place_num_cores) {
-            __kmp_str_buf_print(&buf, "%s%dc", comma?",":"", __kmp_place_num_cores);
-            if (__kmp_place_core_offset)
-                __kmp_str_buf_print(&buf, "@%d", __kmp_place_core_offset);
+        if (__kmp_hws_node.num) {
+            __kmp_str_buf_print(&buf, "%s%dn", comma?",":"", __kmp_hws_node.num);
+            if (__kmp_hws_node.offset)
+                __kmp_str_buf_print(&buf, "@%d", __kmp_hws_node.offset);
            comma = 1;
        }
-        if (__kmp_place_num_threads_per_core)
-            __kmp_str_buf_print(&buf, "%s%dt", comma?",":"", __kmp_place_num_threads_per_core);
+        if (__kmp_hws_tile.num) {
+            __kmp_str_buf_print(&buf, "%s%dL2", comma?",":"", __kmp_hws_tile.num);
+            if (__kmp_hws_tile.offset)
+                __kmp_str_buf_print(&buf, "@%d", __kmp_hws_tile.offset);
+            comma = 1;
+        }
+        if (__kmp_hws_core.num) {
+            __kmp_str_buf_print(&buf, "%s%dc", comma?",":"", __kmp_hws_core.num);
+            if (__kmp_hws_core.offset)
+                __kmp_str_buf_print(&buf, "@%d", __kmp_hws_core.offset);
+            comma = 1;
+        }
+        if (__kmp_hws_proc.num)
+            __kmp_str_buf_print(&buf, "%s%dt", comma?",":"", __kmp_hws_proc.num);
        __kmp_str_buf_print(buffer, "%s'\n", buf.str );
        __kmp_str_buf_free(&buf);
-/*
-    } else {
-        __kmp_str_buf_print( buffer, "   %s: %s \n", name, KMP_I18N_STR( NotDefined ) );
-*/
    }
 }