Merge pull request !33092 from TuDouNi/numa
i-robot 2022-06-08 05:30:09 +00:00 committed by Gitee
commit f3b70ff9db
10 changed files with 93 additions and 36 deletions

View File

@@ -980,7 +980,7 @@ Contributions of any kind are welcome!
 #### DataSet
-- [STABLE] If the libnuma library is installed in the environment, you can run `export DATASET_ENABLE_NUMA=True` to configure NUMA binding. In multi-card training scenarios, the training data processing speed can be improved, thereby improving the network training efficiency.
+- [STABLE] If the libnuma library is installed in the environment, you can run `export DATASET_ENABLE_NUMA=True` or `export MS_ENABLE_NUMA=True` to configure NUMA binding. In multi-card training scenarios, the training data processing speed can be improved, thereby improving the network training efficiency.
 - [STABLE] Unify API Tensor structure of Training/Inference interfaces in C++ SDK.
 - [STABLE] Optimize duplicated Decode in data preprocess using cache, improve preprocess efficiency.
 - [STABLE] Support eager mode to run data augmentation in Python & C++.
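For orientation only (this note and the sketch below are not part of the merged diff): the release note above means NUMA binding can be requested through either environment variable. A minimal, hypothetical C++ check in that spirit; note that the dataset Python change in this merge compares against `'True'` while the new `GraphScheduler` check compares against `"1"`, so the illustration accepts both values.

```cpp
#include <cstdlib>
#include <string>

// Hypothetical helper, not part of the MindSpore sources: report whether
// either NUMA switch is set. The GraphScheduler change below checks for "1",
// the dataset Python change checks for "True"; this sketch accepts both.
bool NumaBindingRequested() {
  auto is_set = [](const char *name) {
    const char *value = std::getenv(name);
    return value != nullptr && (std::string(value) == "1" || std::string(value) == "True");
  };
  return is_set("MS_ENABLE_NUMA") || is_set("DATASET_ENABLE_NUMA");
}
```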

View File

@@ -20,7 +20,7 @@
 #include "minddata/dataset/engine/datasetops/dataset_op.h"
 #include "minddata/dataset/engine/datasetops/device_queue_op.h"
 #if defined(ENABLE_GPUQUE) || defined(ENABLE_TDTQUE)
-#include "minddata/dataset/util/numa_interface.h"
+#include "mindspore/core/utils/numa_interface.h"
 #endif
 #include "minddata/dataset/util/task_manager.h"
 #include "minddata/dataset/util/service.h"
@@ -45,9 +45,7 @@ ExecutionTree::ExecutionTree() : id_count_(0), tree_state_(kDeTStateInit) {
 ExecutionTree::~ExecutionTree() {
 #if defined(ENABLE_GPUQUE) || defined(ENABLE_TDTQUE)
   if (numa_enable_) {
-    if (handle_ != nullptr) {
-      ReleaseLibrary(handle_);
-    }
+    handle_ = nullptr;
   }
 #if defined(ENABLE_TDTQUE)
   DeviceQueueOp *op = dynamic_cast<DeviceQueueOp *>(root_.get());
@@ -162,7 +160,7 @@ Status ExecutionTree::Launch() {
         RETURN_STATUS_UNEXPECTED("Numa package (libnuma.so) not found.");
       }
     }
-    RETURN_IF_NOT_OK(NumaBind(handle_, rank_id_));
+    RETURN_IF_NOT_OK(NumaBind(handle_.get(), rank_id_));
     MS_LOG(INFO) << "Numa bind memory and cpu successful.";
   }
 #endif

View File

@@ -235,7 +235,7 @@ class ExecutionTree {
   // but for distribute scenario, this rank_id come from _get_global_rank() in python
   int32_t rank_id_;
   bool numa_enable_;
-  void *handle_;
+  std::shared_ptr<void> handle_;
 #endif
 };
 }  // namespace dataset

View File

@@ -1,6 +1,3 @@
 file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
 set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
-if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "Linux")
-    LIST(REMOVE_ITEM _CURRENT_SRC_FILES numa_interface.cc)
-endif()
 add_library(utils OBJECT ${_CURRENT_SRC_FILES})

View File

@@ -60,12 +60,19 @@
 #include "abstract/ops/primitive_infer_map.h"
 #include "mindspore/core/utils/file_utils.h"
+#if !defined(_WIN32) && !defined(_WIN64) && !defined(__APPLE__)
+#include "utils/numa_interface.h"
+#endif
 namespace mindspore {
 namespace runtime {
 using distributed::cluster::ClusterContext;
 using distributed::collective::CollectiveManager;
 using distributed::recovery::RecoveryContext;
 namespace {
+constexpr char kNumaEnableEnv[] = "MS_ENABLE_NUMA";
+constexpr char kNumaEnableEnv2[] = "DATASET_ENABLE_NUMA";
 bool IsNeedInsertCopyActor(const DeviceContext *from_device_context, const DeviceContext *to_device_context) {
   MS_EXCEPTION_IF_NULL(from_device_context);
   MS_EXCEPTION_IF_NULL(to_device_context);
@@ -359,6 +366,7 @@ void GraphScheduler::Initialize() {
   }
   init_ = true;
+  BindNumaNode();
   (void)kKernelTypeToLinkFunc.emplace(KernelTransformType::kDeviceDataSourceActor,
                                       &GraphScheduler::LinkDataArrowForBaseActor);
   (void)kKernelTypeToLinkFunc.emplace(KernelTransformType::kHostDataSourceActor,
@@ -2234,5 +2242,30 @@ void GraphScheduler::DumpDeviceTensorStore(const GraphCompilerInfo &graph_compil
     }
   }
 }
+void GraphScheduler::BindNumaNode() {
+  auto numa_enable = common::GetEnv(kNumaEnableEnv);
+  auto numa_enable2 = common::GetEnv(kNumaEnableEnv2);
+  if ((numa_enable.empty() || numa_enable != "1") && (numa_enable2.empty() || numa_enable2 != "1")) {
+    return;
+  }
+#if !defined(_WIN32) && !defined(_WIN64) && !defined(__APPLE__) && !defined(ENABLE_ANDROID)
+  uint32_t rank_id = CommManager::GetInstance().GetRank();
+  MS_LOG(INFO) << "Bind numa node for rank " << rank_id;
+  if (numa_handle_ == nullptr) {
+    numa_handle_ = GetNumaAdapterHandle();
+    if (numa_handle_ == nullptr) {
+      MS_LOG(EXCEPTION) << "Load numa library failed.";
+    }
+  }
+  auto ret = NumaBind(numa_handle_.get(), rank_id);
+  if (ret != StatusCode::kSuccess) {
+    MS_LOG(EXCEPTION) << "Bind numa node failed, ret = " << ret.GetErrDescription();
+  }
+  MS_LOG(INFO) << "Numa bind memory and cpu successful.";
+#endif
+}
 }  // namespace runtime
 }  // namespace mindspore

View File

@@ -189,6 +189,9 @@ class BACKEND_EXPORT GraphScheduler {
   void DumpActor(const ActorSet *actor_set, const GraphCompilerInfo &graph_compiler_info) const;
   void DumpDeviceTensorStore(const GraphCompilerInfo &graph_compiler_info, std::ofstream &ofs) const;
+  // bind thread pool to same numa node
+  void BindNumaNode();
   // The global maps, only be cleared in the deconstruction.
   mindspore::HashMap<ActorInfo, ActorSetPtr> actors_;
@@ -214,6 +217,8 @@
   // Whether actor running by the persistent execution order.
   bool execution_order_running_{false};
+  // numa library handle
+  std::shared_ptr<void> numa_handle_{};
   bool init_{false};
 };

View File

@@ -42,6 +42,15 @@ file(GLOB_RECURSE CORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
 set(CORE_SRC_LIST ${CORE_SRC_LIST} ${CORE_OPS_LIST})
+if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "Linux")
+    LIST(REMOVE_ITEM CORE_SRC_LIST utils/numa_interface.cc)
+endif()
+if(ENABLE_SECURITY)
+    file(GLOB_RECURSE _INFER_SUMMARY_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "ops/*_summary.cc")
+    list(REMOVE_ITEM CORE_SRC_LIST ${_INFER_SUMMARY_FILES})
+endif()
 file(GLOB_RECURSE PROTO_FILE RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "proto/*.proto")
 if(NOT(BUILD_LITE))
     ms_protobuf_generate_py(PROTO_SRCS PY_HDRS PY_PYS ${PROTO_FILE})

View File

@@ -13,11 +13,28 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "minddata/dataset/util/numa_interface.h"
+#include "utils/numa_interface.h"
 #include <dlfcn.h>
+#include <memory>
+#include <mutex>
+#include "utils/log_adapter.h"
+#define RETURN_STATUS_UNEXPECTED(_e)                                \
+  do {                                                              \
+    return Status(StatusCode::kCoreFailed, __LINE__, __FILE__, _e); \
+  } while (false)
 namespace mindspore {
-namespace dataset {
+namespace {
+struct bitmask {
+  uint64_t size;
+  uint64_t *maskp;
+};
+std::weak_ptr<void> g_numa_lib_handle;
+std::mutex g_numa_lib_handle_mutex;
+}  // namespace
 inline void *LoadLibrary(const char *name) {
   if (name == nullptr) {
     return nullptr;
@@ -45,9 +62,16 @@ void ReleaseLibrary(void *handle) {
   }
 }
-void *GetNumaAdapterHandle() {
+std::shared_ptr<void> GetNumaAdapterHandle() {
+  std::lock_guard<std::mutex> lock(g_numa_lib_handle_mutex);
+  auto shared = g_numa_lib_handle.lock();
+  if (shared != nullptr) {
+    return shared;
+  }
   void *handle = LoadLibrary("libnuma.so");
-  return handle;
+  shared = std::shared_ptr<void>(handle, ReleaseLibrary);
+  g_numa_lib_handle = shared;
+  return shared;
 }
 Status NumaBind(void *handle, const int32_t &rank_id) {
@@ -100,5 +124,4 @@ Status NumaBind(void *handle, const int32_t &rank_id) {
   }
   return Status::OK();
 }
-}  // namespace dataset
 }  // namespace mindspore
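Aside (not part of the diff): GetNumaAdapterHandle above caches the dlopen handle in a mutex-guarded weak_ptr, so concurrent callers share a single handle and the ReleaseLibrary deleter only runs after the last shared_ptr copy is destroyed. A generic sketch of the same idiom, assuming nothing beyond the standard library (the names here are illustrative):

```cpp
#include <memory>
#include <mutex>

// Generic sketch of the caching idiom used by GetNumaAdapterHandle: a weak_ptr
// does not keep the resource alive, so it is torn down once every caller has
// dropped its shared_ptr, yet concurrent callers reuse one live instance.
template <typename T, typename Factory>
std::shared_ptr<T> GetOrCreateShared(std::weak_ptr<T> *cache, std::mutex *mu, Factory make) {
  std::lock_guard<std::mutex> lock(*mu);
  if (auto existing = cache->lock()) {
    return existing;  // An instance is still alive; hand out another reference.
  }
  std::shared_ptr<T> created = make();
  *cache = created;  // Remember it weakly; do not extend its lifetime.
  return created;
}
```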

View File

@@ -13,33 +13,22 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_UTIL_NUMA_INTERFACE_H_
-#define MINDSPORE_CCSRC_MINDDATA_DATASET_UTIL_NUMA_INTERFACE_H_
+#ifndef MINDSPORE_CORE_UTILS_NUMA_INTERFACE_H_
+#define MINDSPORE_CORE_UTILS_NUMA_INTERFACE_H_
-#include "minddata/dataset/util/log_adapter.h"
-#include "minddata/dataset/util/status.h"
+#include <memory>
+#include "include/api/status.h"
+#include "utils/visible.h"
 namespace mindspore {
-namespace dataset {
-struct bitmask {
-  uint64_t size;
-  uint64_t *maskp;
-};
-// Now we separate the link from _c_dataengine with numa,
+// Now we separate the link from mindspore binary with numa,
 // and we use dlopen("libnuma") instead. This function will
 // return a handle which you can do NumaBind and ReleaseLibrary.
-void *GetNumaAdapterHandle();
+MS_CORE_API std::shared_ptr<void> GetNumaAdapterHandle();
 // Totally this function will do:
 // 1. Get function pointer of numa api
 // 2. Do numa_bind
-Status NumaBind(void *handle, const int32_t &rank_id);
-// Release the numa handle for avoid memory leak, we should
-// not allow handle is nullptr before we use it.
-void ReleaseLibrary(void *handle);
-}  // namespace dataset
+MS_CORE_API Status NumaBind(void *handle, const int32_t &rank_id);
 }  // namespace mindspore
-#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_UTIL_NUMA_INTERFACE_H_
+#endif  // MINDSPORE_CORE_UTILS_NUMA_INTERFACE_H_
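Aside (a usage sketch for the relocated header, not part of the diff): callers obtain the shared libnuma handle and pass the raw pointer to NumaBind, exactly as GraphScheduler::BindNumaNode does above; the handle is released automatically once the last shared_ptr copy goes away. The wrapper function below is hypothetical.

```cpp
#include <cstdint>
#include <memory>
#include "utils/numa_interface.h"

// Hypothetical wrapper, not part of the MindSpore sources: try to bind the
// calling process to the NUMA node derived from rank_id when libnuma exists.
bool TryBindToNumaNode(int32_t rank_id) {
  std::shared_ptr<void> handle = mindspore::GetNumaAdapterHandle();
  if (handle == nullptr) {
    return false;  // libnuma.so could not be loaded; silently skip binding.
  }
  // NumaBind resolves the numa_* symbols from the handle and calls numa_bind.
  if (mindspore::NumaBind(handle.get(), rank_id) != mindspore::StatusCode::kSuccess) {
    return false;
  }
  return true;  // The ReleaseLibrary deleter runs when the last copy is gone.
}
```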

View File

@@ -65,6 +65,9 @@ def _init_device_info():
     from mindspore.parallel._utils import _get_global_rank
     numa_enable = False
     numa_enable_env = os.getenv("DATASET_ENABLE_NUMA", None)
     if numa_enable_env and numa_enable_env.strip() == 'True':
         numa_enable = True
+    numa_enable_env = os.getenv("MS_ENABLE_NUMA", None)
+    if numa_enable_env and numa_enable_env.strip() == 'True':
+        numa_enable = True
     if context.get_context("device_target") == "GPU":