Merge pull request !33092 from TuDouNi/numa
i-robot 2022-06-08 05:30:09 +00:00 committed by Gitee
commit f3b70ff9db
10 changed files with 93 additions and 36 deletions

View File

@@ -980,7 +980,7 @@ Contributions of any kind are welcome!
 #### DataSet
-- [STABLE] If the libnuma library is installed in the environment, you can run `export DATASET_ENABLE_NUMA=True` to configure NUMA binding. In multi-card training scenarios, the training data processing speed can be improved, thereby improving the network training efficiency.
+- [STABLE] If the libnuma library is installed in the environment, you can run `export DATASET_ENABLE_NUMA=True` or `export MS_ENABLE_NUMA=True` to configure NUMA binding. In multi-card training scenarios, the training data processing speed can be improved, thereby improving the network training efficiency.
 - [STABLE] Unify API Tensor structure of Training/Inference interfaces in C++ SDK.
 - [STABLE] Optimize duplicated Decode in data preprocess using cache, improve preprocess efficiency.
 - [STABLE] Support eager mode to run data augmentation in Python & C++.
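For orientation only (this note and the sketch below are not part of the merged diff): the release note above means NUMA binding can be requested through either environment variable. A minimal, hypothetical C++ check in that spirit; note that the dataset Python change in this merge compares against `'True'` while the new `GraphScheduler` check compares against `"1"`, so the illustration accepts both values.

```cpp
#include <cstdlib>
#include <string>

// Hypothetical helper, not part of the MindSpore sources: report whether
// either NUMA switch is set. The GraphScheduler change below checks for "1",
// the dataset Python change checks for "True"; this sketch accepts both.
bool NumaBindingRequested() {
  auto is_set = [](const char *name) {
    const char *value = std::getenv(name);
    return value != nullptr && (std::string(value) == "1" || std::string(value) == "True");
  };
  return is_set("MS_ENABLE_NUMA") || is_set("DATASET_ENABLE_NUMA");
}
```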

View File

@@ -20,7 +20,7 @@
 #include "minddata/dataset/engine/datasetops/dataset_op.h"
 #include "minddata/dataset/engine/datasetops/device_queue_op.h"
 #if defined(ENABLE_GPUQUE) || defined(ENABLE_TDTQUE)
-#include "minddata/dataset/util/numa_interface.h"
+#include "mindspore/core/utils/numa_interface.h"
 #endif
 #include "minddata/dataset/util/task_manager.h"
 #include "minddata/dataset/util/service.h"
@@ -45,9 +45,7 @@ ExecutionTree::ExecutionTree() : id_count_(0), tree_state_(kDeTStateInit) {
 ExecutionTree::~ExecutionTree() {
 #if defined(ENABLE_GPUQUE) || defined(ENABLE_TDTQUE)
   if (numa_enable_) {
-    if (handle_ != nullptr) {
-      ReleaseLibrary(handle_);
-    }
+    handle_ = nullptr;
   }
 #if defined(ENABLE_TDTQUE)
   DeviceQueueOp *op = dynamic_cast<DeviceQueueOp *>(root_.get());
@@ -162,7 +160,7 @@ Status ExecutionTree::Launch() {
         RETURN_STATUS_UNEXPECTED("Numa package (libnuma.so) not found.");
       }
     }
-    RETURN_IF_NOT_OK(NumaBind(handle_, rank_id_));
+    RETURN_IF_NOT_OK(NumaBind(handle_.get(), rank_id_));
     MS_LOG(INFO) << "Numa bind memory and cpu successful.";
   }
 #endif

View File

@@ -235,7 +235,7 @@ class ExecutionTree {
   // but for distribute scenario, this rank_id come from _get_global_rank() in python
   int32_t rank_id_;
   bool numa_enable_;
-  void *handle_;
+  std::shared_ptr<void> handle_;
 #endif
 };
 }  // namespace dataset

View File

@@ -1,6 +1,3 @@
 file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
 set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
-if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "Linux")
-    LIST(REMOVE_ITEM _CURRENT_SRC_FILES numa_interface.cc)
-endif()
 add_library(utils OBJECT ${_CURRENT_SRC_FILES})

View File

@@ -60,12 +60,19 @@
 #include "abstract/ops/primitive_infer_map.h"
 #include "mindspore/core/utils/file_utils.h"
+#if !defined(_WIN32) && !defined(_WIN64) && !defined(__APPLE__)
+#include "utils/numa_interface.h"
+#endif
 namespace mindspore {
 namespace runtime {
 using distributed::cluster::ClusterContext;
 using distributed::collective::CollectiveManager;
 using distributed::recovery::RecoveryContext;
 namespace {
+constexpr char kNumaEnableEnv[] = "MS_ENABLE_NUMA";
+constexpr char kNumaEnableEnv2[] = "DATASET_ENABLE_NUMA";
 bool IsNeedInsertCopyActor(const DeviceContext *from_device_context, const DeviceContext *to_device_context) {
   MS_EXCEPTION_IF_NULL(from_device_context);
   MS_EXCEPTION_IF_NULL(to_device_context);
@@ -359,6 +366,7 @@ void GraphScheduler::Initialize() {
   }
   init_ = true;
+  BindNumaNode();
   (void)kKernelTypeToLinkFunc.emplace(KernelTransformType::kDeviceDataSourceActor,
                                       &GraphScheduler::LinkDataArrowForBaseActor);
   (void)kKernelTypeToLinkFunc.emplace(KernelTransformType::kHostDataSourceActor,
@@ -2234,5 +2242,30 @@ void GraphScheduler::DumpDeviceTensorStore(const GraphCompilerInfo &graph_compil
     }
   }
 }
+void GraphScheduler::BindNumaNode() {
+  auto numa_enable = common::GetEnv(kNumaEnableEnv);
+  auto numa_enable2 = common::GetEnv(kNumaEnableEnv2);
+  if ((numa_enable.empty() || numa_enable != "1") && (numa_enable2.empty() || numa_enable2 != "1")) {
+    return;
+  }
+#if !defined(_WIN32) && !defined(_WIN64) && !defined(__APPLE__) && !defined(ENABLE_ANDROID)
+  uint32_t rank_id = CommManager::GetInstance().GetRank();
+  MS_LOG(INFO) << "Bind numa node for rank " << rank_id;
+  if (numa_handle_ == nullptr) {
+    numa_handle_ = GetNumaAdapterHandle();
+    if (numa_handle_ == nullptr) {
+      MS_LOG(EXCEPTION) << "Load numa library failed.";
+    }
+  }
+  auto ret = NumaBind(numa_handle_.get(), rank_id);
+  if (ret != StatusCode::kSuccess) {
+    MS_LOG(EXCEPTION) << "Bind numa node failed, ret = " << ret.GetErrDescription();
+  }
+  MS_LOG(INFO) << "Numa bind memory and cpu successful.";
+#endif
+}
 }  // namespace runtime
 }  // namespace mindspore

View File

@@ -189,6 +189,9 @@ class BACKEND_EXPORT GraphScheduler {
   void DumpActor(const ActorSet *actor_set, const GraphCompilerInfo &graph_compiler_info) const;
   void DumpDeviceTensorStore(const GraphCompilerInfo &graph_compiler_info, std::ofstream &ofs) const;
+  // bind thread pool to same numa node
+  void BindNumaNode();
   // The global maps, only be cleared in the deconstruction.
   mindspore::HashMap<ActorInfo, ActorSetPtr> actors_;
@@ -214,6 +217,8 @@
   // Whether actor running by the persistent execution order.
   bool execution_order_running_{false};
+  // numa library handle
+  std::shared_ptr<void> numa_handle_{};
   bool init_{false};
 };

View File

@@ -42,6 +42,15 @@ file(GLOB_RECURSE CORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
 set(CORE_SRC_LIST ${CORE_SRC_LIST} ${CORE_OPS_LIST})
+if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "Linux")
+    LIST(REMOVE_ITEM CORE_SRC_LIST utils/numa_interface.cc)
+endif()
+if(ENABLE_SECURITY)
+    file(GLOB_RECURSE _INFER_SUMMARY_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "ops/*_summary.cc")
+    list(REMOVE_ITEM CORE_SRC_LIST ${_INFER_SUMMARY_FILES})
+endif()
 file(GLOB_RECURSE PROTO_FILE RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "proto/*.proto")
 if(NOT(BUILD_LITE))
     ms_protobuf_generate_py(PROTO_SRCS PY_HDRS PY_PYS ${PROTO_FILE})

View File

@@ -13,11 +13,28 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "minddata/dataset/util/numa_interface.h"
+#include "utils/numa_interface.h"
 #include <dlfcn.h>
+#include <memory>
+#include <mutex>
+#include "utils/log_adapter.h"
+#define RETURN_STATUS_UNEXPECTED(_e)                                \
+  do {                                                              \
+    return Status(StatusCode::kCoreFailed, __LINE__, __FILE__, _e); \
+  } while (false)
 namespace mindspore {
-namespace dataset {
+namespace {
+struct bitmask {
+  uint64_t size;
+  uint64_t *maskp;
+};
+std::weak_ptr<void> g_numa_lib_handle;
+std::mutex g_numa_lib_handle_mutex;
+}  // namespace
 inline void *LoadLibrary(const char *name) {
   if (name == nullptr) {
     return nullptr;
@@ -45,9 +62,16 @@ void ReleaseLibrary(void *handle) {
   }
 }
-void *GetNumaAdapterHandle() {
+std::shared_ptr<void> GetNumaAdapterHandle() {
+  std::lock_guard<std::mutex> lock(g_numa_lib_handle_mutex);
+  auto shared = g_numa_lib_handle.lock();
+  if (shared != nullptr) {
+    return shared;
+  }
   void *handle = LoadLibrary("libnuma.so");
-  return handle;
+  shared = std::shared_ptr<void>(handle, ReleaseLibrary);
+  g_numa_lib_handle = shared;
+  return shared;
 }
 Status NumaBind(void *handle, const int32_t &rank_id) {
@@ -100,5 +124,4 @@ Status NumaBind(void *handle, const int32_t &rank_id) {
   }
   return Status::OK();
 }
-}  // namespace dataset
 }  // namespace mindspore
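Aside (not part of the diff): GetNumaAdapterHandle above caches the dlopen handle in a mutex-guarded weak_ptr, so concurrent callers share a single handle and the ReleaseLibrary deleter only runs after the last shared_ptr copy is destroyed. A generic sketch of the same idiom, assuming nothing beyond the standard library (the names here are illustrative):

```cpp
#include <memory>
#include <mutex>

// Generic sketch of the caching idiom used by GetNumaAdapterHandle: a weak_ptr
// does not keep the resource alive, so it is torn down once every caller has
// dropped its shared_ptr, yet concurrent callers reuse one live instance.
template <typename T, typename Factory>
std::shared_ptr<T> GetOrCreateShared(std::weak_ptr<T> *cache, std::mutex *mu, Factory make) {
  std::lock_guard<std::mutex> lock(*mu);
  if (auto existing = cache->lock()) {
    return existing;  // An instance is still alive; hand out another reference.
  }
  std::shared_ptr<T> created = make();
  *cache = created;  // Remember it weakly; do not extend its lifetime.
  return created;
}
```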

View File

@@ -13,33 +13,22 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_UTIL_NUMA_INTERFACE_H_
-#define MINDSPORE_CCSRC_MINDDATA_DATASET_UTIL_NUMA_INTERFACE_H_
+#ifndef MINDSPORE_CORE_UTILS_NUMA_INTERFACE_H_
+#define MINDSPORE_CORE_UTILS_NUMA_INTERFACE_H_
-#include "minddata/dataset/util/log_adapter.h"
-#include "minddata/dataset/util/status.h"
+#include <memory>
+#include "include/api/status.h"
+#include "utils/visible.h"
 namespace mindspore {
-namespace dataset {
-struct bitmask {
-  uint64_t size;
-  uint64_t *maskp;
-};
-// Now we separate the link from _c_dataengine with numa,
+// Now we separate the link from mindspore binary with numa,
 // and we use dlopen("libnuma") instead. This function will
 // return a handle which you can do NumaBind and ReleaseLibrary.
-void *GetNumaAdapterHandle();
+MS_CORE_API std::shared_ptr<void> GetNumaAdapterHandle();
 // Totally this function will do:
 // 1. Get function pointer of numa api
 // 2. Do numa_bind
-Status NumaBind(void *handle, const int32_t &rank_id);
-// Release the numa handle for avoid memory leak, we should
-// not allow handle is nullptr before we use it.
-void ReleaseLibrary(void *handle);
-}  // namespace dataset
+MS_CORE_API Status NumaBind(void *handle, const int32_t &rank_id);
 }  // namespace mindspore
-#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_UTIL_NUMA_INTERFACE_H_
+#endif  // MINDSPORE_CORE_UTILS_NUMA_INTERFACE_H_
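Aside (a usage sketch for the relocated header, not part of the diff): callers obtain the shared libnuma handle and pass the raw pointer to NumaBind, exactly as GraphScheduler::BindNumaNode does above; the handle is released automatically once the last shared_ptr copy goes away. The wrapper function below is hypothetical.

```cpp
#include <cstdint>
#include <memory>
#include "utils/numa_interface.h"

// Hypothetical wrapper, not part of the MindSpore sources: try to bind the
// calling process to the NUMA node derived from rank_id when libnuma exists.
bool TryBindToNumaNode(int32_t rank_id) {
  std::shared_ptr<void> handle = mindspore::GetNumaAdapterHandle();
  if (handle == nullptr) {
    return false;  // libnuma.so could not be loaded; silently skip binding.
  }
  // NumaBind resolves the numa_* symbols from the handle and calls numa_bind.
  if (mindspore::NumaBind(handle.get(), rank_id) != mindspore::StatusCode::kSuccess) {
    return false;
  }
  return true;  // The ReleaseLibrary deleter runs when the last copy is gone.
}
```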

View File

@@ -65,6 +65,9 @@ def _init_device_info():
     from mindspore.parallel._utils import _get_global_rank
     numa_enable = False
     numa_enable_env = os.getenv("DATASET_ENABLE_NUMA", None)
     if numa_enable_env and numa_enable_env.strip() == 'True':
         numa_enable = True
+    numa_enable_env = os.getenv("MS_ENABLE_NUMA", None)
+    if numa_enable_env and numa_enable_env.strip() == 'True':
+        numa_enable = True
     if context.get_context("device_target") == "GPU":