From b8345d03b6872f19dad4fda75486c0aae16e4fe6 Mon Sep 17 00:00:00 2001 From: yanghaoran Date: Sat, 19 Dec 2020 21:17:31 +0800 Subject: [PATCH] Synchronize latest Ascend software 18 Dec 2020, with profiler fixes --- build.sh | 3 - cmake/dependency_graphengine.cmake | 3 - cmake/dependency_graphenginev2.cmake | 2 +- graphengine | 2 +- mindspore/_check_version.py | 2 +- mindspore/ccsrc/CMakeLists.txt | 8 +- .../ccsrc/backend/session/ascend_session.cc | 7 +- mindspore/ccsrc/pipeline/jit/pipeline.cc | 25 +- mindspore/ccsrc/runtime/device/CMakeLists.txt | 1 + .../device/ascend/ascend_kernel_runtime.cc | 121 +++++--- .../device/ascend/ascend_kernel_runtime.h | 16 +- .../device/ascend/ascend_memory_manager.cc | 2 +- .../device/ascend/profiling/plugin_impl.cc | 42 --- .../device/ascend/profiling/plugin_impl.h | 45 --- .../profiling/profiling_callback_register.cc | 93 ++++++ .../profiling/profiling_callback_register.h | 82 ++++++ .../ascend/profiling/profiling_engine_impl.cc | 37 --- .../ascend/profiling/profiling_engine_impl.h | 39 --- .../ascend/profiling/profiling_manager.cc | 275 +++++++++++------- .../ascend/profiling/profiling_manager.h | 32 +- .../ascend/profiling/profiling_utils.cc | 66 +++-- .../device/ascend/profiling/profiling_utils.h | 6 +- .../profiling/reporter/desc_reporter.cc | 9 +- .../ccsrc/runtime/device/kernel_adjust.cc | 4 + .../ccsrc/runtime/device/kernel_runtime.h | 2 + mindspore/ccsrc/utils/comm_manager.cc | 8 +- mindspore/communication/_hccl_management.py | 16 +- .../transformer/test_transformer.py | 7 +- tests/ut/cpp/CMakeLists.txt | 2 - tests/ut/cpp/device/ascend_profiling_test.cc | 124 -------- tests/ut/cpp/stub/hccl/hccl_stub.cc | 12 +- tests/ut/cpp/stub/profiling/profiling_stub.cc | 5 +- tests/ut/cpp/stub/runtime/runtime_stub.cc | 10 +- 33 files changed, 590 insertions(+), 518 deletions(-) delete mode 100644 mindspore/ccsrc/runtime/device/ascend/profiling/plugin_impl.cc delete mode 100644 mindspore/ccsrc/runtime/device/ascend/profiling/plugin_impl.h create mode 100644 mindspore/ccsrc/runtime/device/ascend/profiling/profiling_callback_register.cc create mode 100644 mindspore/ccsrc/runtime/device/ascend/profiling/profiling_callback_register.h delete mode 100644 mindspore/ccsrc/runtime/device/ascend/profiling/profiling_engine_impl.cc delete mode 100644 mindspore/ccsrc/runtime/device/ascend/profiling/profiling_engine_impl.h delete mode 100644 tests/ut/cpp/device/ascend_profiling_test.cc diff --git a/build.sh b/build.sh index ac5b93aa434..1acb9730397 100755 --- a/build.sh +++ b/build.sh @@ -381,9 +381,6 @@ checkopts "$@" echo "---------------- MindSpore: build start ----------------" mkdir -pv "${BUILD_PATH}/package/mindspore/lib" git submodule update --init graphengine -cd "${BASEPATH}/graphengine" -git submodule update --init metadef -cd "${BASEPATH}" if [[ "X$ENABLE_AKG" = "Xon" ]] && [[ "X$ENABLE_D" = "Xon" || "X$ENABLE_GPU" = "Xon" ]]; then git submodule update --init --recursive akg fi diff --git a/cmake/dependency_graphengine.cmake b/cmake/dependency_graphengine.cmake index db15eb630da..23fb53cfaef 100644 --- a/cmake/dependency_graphengine.cmake +++ b/cmake/dependency_graphengine.cmake @@ -38,7 +38,6 @@ elseif (DEFINED ENV{D_LINK_PATH}) find_library(slog libslog.so ${GE_LIB_PATH}) find_library(mmpa libmmpa.a ${GE_LIB_PATH}) find_library(runtime libruntime.so ${GE_LIB_PATH}) - find_library(msprof libmsprof.so ${GE_LIB_PATH}) find_library(register libregister.so ${GE_LIB_PATH}) find_library(hccl libhccl.so ${GE_LIB_PATH}) find_library(cce libcce.so ${GE_LIB_PATH}) @@ -59,7 +58,6 @@ else() find_library(cce libcce.so ${ASCEND_RUNTIME_PATH}) find_library(hccl libhccl.so ${ASCEND_RUNTIME_PATH}) find_library(runtime libruntime.so ${ASCEND_RUNTIME_PATH}) - find_library(msprof libmsprof.so ${ASCEND_RUNTIME_PATH}) find_library(register libregister.so ${ASCEND_RUNTIME_PATH}) find_library(resource libresource.so ${ASCEND_RUNTIME_PATH}) find_library(error_manager liberror_manager.so ${ASCEND_RUNTIME_PATH}) @@ -68,7 +66,6 @@ else() find_library(cce libcce.so ${ASCEND_TOOLKIT_RUNTIME_PATH}) find_library(hccl libhccl.so ${ASCEND_TOOLKIT_RUNTIME_PATH}) find_library(runtime libruntime.so ${ASCEND_TOOLKIT_RUNTIME_PATH}) - find_library(msprof libmsprof.so ${ASCEND_TOOLKIT_RUNTIME_PATH}) find_library(register libregister.so ${ASCEND_TOOLKIT_RUNTIME_PATH}) find_library(resource libresource.so ${ASCEND_TOOLKIT_RUNTIME_PATH}) find_library(error_manager liberror_manager.so ${ASCEND_TOOLKIT_RUNTIME_PATH}) diff --git a/cmake/dependency_graphenginev2.cmake b/cmake/dependency_graphenginev2.cmake index 3d003e7f8e6..1af0604f1f6 100644 --- a/cmake/dependency_graphenginev2.cmake +++ b/cmake/dependency_graphenginev2.cmake @@ -26,7 +26,7 @@ if (ENABLE_D OR ENABLE_ACL OR ENABLE_TESTCASES) # use slog, error manager, mmpa in non ascend mode, e.g. tests set(GE_PREBUILD_PATH ${GE_SOURCE_DIR}/third_party/prebuild/${CMAKE_HOST_SYSTEM_PROCESSOR}) set(ENABLE_MS_TESTCASES TRUE) - find_submodule_lib(slog libslog.so ${GE_PREBUILD_PATH}) + find_submodule_lib(slog libalog.so ${GE_PREBUILD_PATH}) find_submodule_lib(error_manager liberror_manager.so ${GE_PREBUILD_PATH}) find_submodule_lib(static_mmpa libmmpa.a ${GE_PREBUILD_PATH}) endif() diff --git a/graphengine b/graphengine index 20a0326976d..9a7b271674f 160000 --- a/graphengine +++ b/graphengine @@ -1 +1 @@ -Subproject commit 20a0326976db65ca01f43ae4ccdd85677faaeb5e +Subproject commit 9a7b271674f343157c316b1455aee628c43cffdc diff --git a/mindspore/_check_version.py b/mindspore/_check_version.py index eade05988ee..865af779ea3 100644 --- a/mindspore/_check_version.py +++ b/mindspore/_check_version.py @@ -122,7 +122,7 @@ class AscendEnvChecker(EnvChecker): """ascend environment check""" def __init__(self): - self.version = ["1.76.T21.0.B210"] + self.version = ["1.76.22.0.220"] atlas_nnae_version = "/usr/local/Ascend/nnae/latest/fwkacllib/version.info" atlas_toolkit_version = "/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/version.info" hisi_fwk_version = "/usr/local/Ascend/fwkacllib/version.info" diff --git a/mindspore/ccsrc/CMakeLists.txt b/mindspore/ccsrc/CMakeLists.txt index 120354a454e..793a5df5989 100644 --- a/mindspore/ccsrc/CMakeLists.txt +++ b/mindspore/ccsrc/CMakeLists.txt @@ -248,17 +248,17 @@ if (ENABLE_D) find_library(RUNTIME_LIB runtime ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) find_library(TSDCLIENT tsdclient HINTS ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH} ${ASCEND_DRIVER_BACK_PATH}) find_library(DATATRANSFER datatransfer HINTS ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH} ${ASCEND_DRIVER_BACK_PATH}) - find_library(PROFILING msprofiler ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) - find_library(PROFILING_SHARED msprof ${ASCEND_DRIVER_PATH}) + find_library(PROFILING msprofiler_fwk ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) find_library(REGISTER register ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) find_library(OPTILING optiling ${ASCEND_OPP_PATH}) # hccl_adpter find_library(HCCL_ADPTER hcom_graph_adaptor ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) find_library(HCCL_BUILDER hcom_opskernel_builder ${ASCEND_RUNTIME_PATH}/plugin/opskernel ${ASCEND_TOOLKIT_RUNTIME_PATH}/plugin/opskernel) - add_library(ms_profile SHARED ${PROFILING}) + add_library(ms_profile SHARED ${CMAKE_CURRENT_SOURCE_DIR}/runtime/device/ascend/profiling/profiling_callback_register.cc) set_target_properties(ms_profile PROPERTIES LINKER_LANGUAGE CXX) - target_link_libraries(ms_profile -Wl,--start-group ${PROFILING_SHARED} ${PROFILING} mindspore::protobuf -Wl,--end-group) + target_link_options(ms_profile PRIVATE -Wl,-init,common_log_init) + target_link_libraries(ms_profile -Wl,--start-group -Wl,--whole-archive ${PROFILING} -Wl,--no-whole-archive mindspore::protobuf -Wl,--end-group) target_link_libraries(mindspore ge_runtime ${CCE_LIB} ${RUNTIME_LIB} ${TSDCLIENT} ${HCCL} ${DATATRANSFER} ${HCCL_ADPTER} ${REGISTER} -Wl,--no-as-needed ${OPTILING} ${HCCL_BUILDER}) target_link_libraries(mindspore -Wl,--start-group proto_input mindspore::protobuf -Wl,--end-group) diff --git a/mindspore/ccsrc/backend/session/ascend_session.cc b/mindspore/ccsrc/backend/session/ascend_session.cc index 99284177417..f7e574ebcfe 100644 --- a/mindspore/ccsrc/backend/session/ascend_session.cc +++ b/mindspore/ccsrc/backend/session/ascend_session.cc @@ -422,12 +422,7 @@ GraphInfo GetSingleOpGraphInfo(const PrimitivePtr &prim, const std::vectorCreateContext(); -} +void AscendSession::Init(uint32_t device_id) { InitExecutor(kAscendDevice, device_id); } void AscendSession::UnifyMindIR(const KernelGraphPtr &graph) { auto context_ptr = MsContext::GetInstance(); diff --git a/mindspore/ccsrc/pipeline/jit/pipeline.cc b/mindspore/ccsrc/pipeline/jit/pipeline.cc index 817ec6509a7..6e6375fabde 100644 --- a/mindspore/ccsrc/pipeline/jit/pipeline.cc +++ b/mindspore/ccsrc/pipeline/jit/pipeline.cc @@ -1019,7 +1019,6 @@ void InitHccl() { mindspore::parse::python_adapter::set_python_env_flag(true); auto ms_context = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(ms_context); - (void)context::OpenTsd(ms_context); uint32_t device_id = ms_context->get_param(MS_CTX_DEVICE_ID); std::string device_name = ms_context->get_param(MS_CTX_DEVICE_TARGET); ms_context->set_param(MS_CTX_ENABLE_HCCL, true); @@ -1027,10 +1026,14 @@ void InitHccl() { ms_context->get_param(MS_CTX_DEVICE_TARGET) == kAscendDevice) { auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(device_name, device_id); MS_EXCEPTION_IF_NULL(runtime_instance); + runtime_instance->PreInit(); + (void)context::OpenTsd(ms_context); if (!runtime_instance->Init()) { MS_LOG(ERROR) << "Kernel runtime init error."; return; } + } else { + (void)context::OpenTsd(ms_context); } #endif } @@ -1060,9 +1063,29 @@ void ReleaseGeTsd() { } } +void StartUpProfiling() { + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + if (!ms_context->get_param(MS_CTX_ENABLE_PROFILING)) { + return; + } + MS_LOG(INFO) << "Startup profiling"; + // Start up profiling before OpenTsd + uint32_t device_id = ms_context->get_param(MS_CTX_DEVICE_ID); + std::string device_name = ms_context->get_param(MS_CTX_DEVICE_TARGET); + if (ms_context->backend_policy() == "ms" && + ms_context->get_param(MS_CTX_DEVICE_TARGET) == kAscendDevice) { + auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(device_name, device_id); + MS_EXCEPTION_IF_NULL(runtime_instance); + runtime_instance->PreInit(); + } +} + void InitBackend() { // set python env flag mindspore::parse::python_adapter::set_python_env_flag(true); + // Startup profiling before open tsd + StartUpProfiling(); // open tsd before ge initialize auto ms_context = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(ms_context); diff --git a/mindspore/ccsrc/runtime/device/CMakeLists.txt b/mindspore/ccsrc/runtime/device/CMakeLists.txt index 37ce7a93c34..bd7153f758d 100644 --- a/mindspore/ccsrc/runtime/device/CMakeLists.txt +++ b/mindspore/ccsrc/runtime/device/CMakeLists.txt @@ -64,6 +64,7 @@ if (ENABLE_GPU) # add_library(_mindspore_device_cuda_obj OBJECT ${CUDA_SRC_LIST}) endif () +list(REMOVE_ITEM D_SRC_LIST "ascend/profiling/profiling_callback_register.cc") set_property(SOURCE ${DEVICE_SRC_LIST} ${D_SRC_LIST} ${CPU_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE) add_library(_mindspore_runtime_device_obj OBJECT ${DEVICE_SRC_LIST} ${D_SRC_LIST} ${CPU_SRC_LIST}) diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc index 75e98ba8a56..ef4072c7678 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc @@ -45,6 +45,8 @@ #include "toolchain/adx_datadump_server.h" #include "utils/shape_utils.h" #include "utils/trace_base.h" +#include "graphengine/inc/external/acl/error_codes/rt_error_codes.h" +#include "debug/anf_ir_dump.h" #ifdef MEM_REUSE_DEBUG #include "backend/optimizer/mem_reuse/mem_reuse_checker.h" #endif @@ -54,6 +56,7 @@ #include "utils/config_manager.h" #include "runtime/device/ascend/profiling/reporter/op_name_task_stream_reporter.h" #include "runtime/hccl_adapter/hccl_adapter.h" +#include "runtime/device/ascend/profiling/profiling_callback_register.h" #include "backend/kernel_compiler/hccl/hccl_context.h" #ifdef ENABLE_TDTQUE #include "tdt/tdt_host_interface.h" @@ -71,11 +74,9 @@ constexpr uint32_t kTupleTaskId = 0; constexpr uint32_t kTupleStreamId = 1; constexpr uint32_t kTupleArgs = 2; constexpr uint32_t kProfilingMaxTaskIdInStream = 65531; +constexpr auto kModuleName = "MindSpore"; -namespace mindspore { -namespace device { -namespace ascend { -static const size_t PRAMATER_OUTPUT_INDEX = 0; +namespace mindspore::device::ascend { static thread_local rtContext_t thread_local_rt_context{nullptr}; namespace { std::string GetRankId() { @@ -110,7 +111,9 @@ std::string GetRankId() { } } // namespace -std::vector AscendKernelRuntime::exception_infoes_; +std::vector AscendKernelRuntime::task_fail_infoes_ = {}; +uint32_t AscendKernelRuntime::current_graph_id_ = 0; +std::map AscendKernelRuntime::overflow_tasks_; AscendKernelRuntime::~AscendKernelRuntime() { graph_model_map_.clear(); } void AscendKernelRuntime::SetContext() { @@ -255,6 +258,11 @@ void AscendKernelRuntime::ReleaseDeviceRes() { mem_manager_->FreeDeviceMemory(); } + auto rt_ret = rtRegTaskFailCallbackByModule(kModuleName, nullptr); + if (rt_ret != RT_ERROR_NONE) { + MS_LOG(EXCEPTION) << "Reg SetTaskFailCallback failed, error: " << rt_ret; + } + (void)DestroySingleOpHccl(); (void)DestroyHccl(); (void)ResetDevice(device_id); @@ -262,6 +270,13 @@ void AscendKernelRuntime::ReleaseDeviceRes() { MS_LOG(INFO) << "Ascend finalize end"; } +void AscendKernelRuntime::PreInit() { + auto ret = ProfilingManager::GetInstance().StartupProfiling(device_id_); + if (!ret) { + MS_EXCEPTION(DeviceProcessError) << "StartupProfiling failed."; + } +} + bool AscendKernelRuntime::Init() { if (initialized_) { InnerSetContext(); @@ -269,24 +284,21 @@ bool AscendKernelRuntime::Init() { } OpTilingCalculater::GetInstance().Init(); // Start up profiling before rtSetDevice - bool ret = ProfilingManager::GetInstance().StartupProfiling(device_id_); - if (!ret) { - MS_EXCEPTION(DeviceProcessError) << "StartupProfiling failed."; - } - ret = InitDevice(); + bool ret = InitDevice(); if (!ret) { return ret; } + SetDebugger(); mem_manager_ = std::make_shared(); MS_EXCEPTION_IF_NULL(mem_manager_); mem_manager_->MallocDeviceMemory(); // Set callback func when exception error - auto rt_ret = rtSetTaskFailCallback(ExceptionCallback); + auto rt_ret = rtRegTaskFailCallbackByModule(kModuleName, TaskFailCallback); if (rt_ret != RT_ERROR_NONE) { - MS_LOG(EXCEPTION) << "SetTaskFailCallback failed, error: " << rt_ret; + MS_LOG(EXCEPTION) << "Reg SetTaskFailCallback failed, error: " << rt_ret; } initialized_ = true; @@ -525,42 +537,57 @@ void AscendKernelRuntime::LaunchDataDump(GraphId graph_id) { } } -void AscendKernelRuntime::ExceptionCallback(rtExceptionInfo *exception_info) { +void AscendKernelRuntime::TaskFailCallback(rtTaskFailInfo *task_fail_info) { + MS_EXCEPTION_IF_NULL(task_fail_info); static std::mutex exception_mutex; std::lock_guard lock(exception_mutex); - exception_infoes_.push_back(*exception_info); + if (task_fail_info->retcode == ACL_ERROR_RT_AICORE_OVER_FLOW) { + auto key = std::to_string(task_fail_info->streamid) + std::to_string(task_fail_info->taskid); + auto find_iter = overflow_tasks_.find(key); + if (find_iter == overflow_tasks_.end()) { + overflow_tasks_[key] = 1; + } else { + if (overflow_tasks_[key] == 5) { + auto node_name = AscendKernelRuntime::GetErrorNodeName(task_fail_info->streamid, task_fail_info->taskid); + MS_LOG(WARNING) << "Node run task overflow, node name: " << node_name; + overflow_tasks_.erase(find_iter); + } else { + overflow_tasks_[key]++; + } + } + } else { + MS_LOG(WARNING) << "Task fail infos task_id: " << task_fail_info->taskid + << ", stream_id: " << task_fail_info->streamid << ", tid: " << task_fail_info->tid + << ", device_id: " << task_fail_info->deviceid << ", retcode: " << task_fail_info->retcode; + task_fail_infoes_.push_back(*task_fail_info); + } +} + +string AscendKernelRuntime::GetErrorNodeName(uint32_t streamid, uint32_t taskid) { + auto runtime_info_map = ModelRunner::Instance().GetRuntimeInfoMap(AscendKernelRuntime::current_graph_id_); + for (const auto &iter : runtime_info_map) { + auto task_id = std::get(*iter.second); + auto stream_id = std::get(*iter.second); + if (task_id == taskid && stream_id == streamid) { + MS_LOG(ERROR) << "Node: " << iter.first << ", run task error."; + return iter.first; + } + } + return ""; } void AscendKernelRuntime::DumpTaskExceptionInfo(const session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(graph); - std::vector full_scope_name{}; - // Find node name(full scope name) - auto runtime_info_map = ModelRunner::Instance().GetRuntimeInfoMap(graph->graph_id()); - MS_LOG(ERROR) << "Exception_infos_ size: " << exception_infoes_.size() << ". first example: " - << ", task_id: " << exception_infoes_.at(0).taskid - << ", stream_id: " << exception_infoes_.at(0).streamid << ", tid: " << exception_infoes_.at(0).tid - << ", device_id: " << exception_infoes_.at(0).deviceid; - - for (const auto &exception_info : exception_infoes_) { - for (const auto &iter : runtime_info_map) { - auto task_id = std::get(*iter.second); - auto stream_id = std::get(*iter.second); - if (task_id == exception_info.taskid && stream_id == exception_info.streamid) { - full_scope_name.push_back(iter.first); - MS_LOG(ERROR) << "Node: " << iter.first << ", run task error."; - } - } - } + auto full_scope_name = + AscendKernelRuntime::GetErrorNodeName(task_fail_infoes_.at(0).streamid, task_fail_infoes_.at(0).taskid); // Dump error data in local path - const std::string local_path = std::string("./task_error_dump/") + std::to_string(exception_infoes_.at(0).deviceid); + const std::string local_path = std::string("./task_error_dump/") + std::to_string(task_fail_infoes_.at(0).deviceid); for (const auto &node : graph->execution_order()) { - for (auto &name : full_scope_name) { - if (node->fullname_with_scope() == name) { - MS_LOG(ERROR) << "Begin to dump node (" << name << ") task error input/output data in local path." - << " trace: " << trace::DumpSourceLines(node); - E2eDumpUtil::DumpInputImpl(node, false, local_path, &name, nullptr); - E2eDumpUtil::DumpOutputImpl(node, false, local_path, &name, nullptr); - } + if (node->fullname_with_scope() == full_scope_name) { + MS_LOG(ERROR) << "Begin to dump node (" << full_scope_name << ") task error input/output data in local path." + << " trace: " << trace::DumpSourceLines(node); + E2eDumpUtil::DumpInputImpl(node, false, local_path, &full_scope_name, nullptr); + E2eDumpUtil::DumpOutputImpl(node, false, local_path, &full_scope_name, nullptr); } } } @@ -571,7 +598,8 @@ bool AscendKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink) { #if defined(_WIN32) || defined(_WIN64) auto start_time = std::chrono::steady_clock::now(); #else - struct timeval start_time, end_time; + struct timeval start_time {}; + struct timeval end_time {}; (void)gettimeofday(&start_time, nullptr); #endif if (is_task_sink) { @@ -630,6 +658,7 @@ bool AscendKernelRuntime::RunDynamicKernelAsync(const session::KernelGraph *grap } bool AscendKernelRuntime::RunTask(const session::KernelGraph *graph) { + current_graph_id_ = graph->graph_id(); InnerSetContext(); MS_EXCEPTION_IF_NULL(graph); if (graph->is_dynamic_shape()) { @@ -656,7 +685,8 @@ bool AscendKernelRuntime::RunTask(const session::KernelGraph *graph) { bool status = ModelRunner::Instance().RunModel(graph->graph_id(), input_tensors, output_tensors); if (!status) { DumpTaskExceptionInfo(graph); - + std::string file_name = "task_error_debug" + std::to_string(current_graph_id_) + ".ir"; + DumpIR(file_name, std::shared_ptr(const_cast(graph))); #ifdef ENABLE_TDTQUE // Run task error, we should call TdtHostDestroy to release tdt to avoid DeviceQueueOp hostPush hung // case1: cpu usage 100% cause thread/process exit, but some tdt thread remain in backend @@ -667,10 +697,9 @@ bool AscendKernelRuntime::RunTask(const session::KernelGraph *graph) { MS_LOG(INFO) << "Destroy tsd success."; } #endif - return false; } - exception_infoes_.clear(); + task_fail_infoes_.clear(); return true; } @@ -857,6 +886,4 @@ void AscendKernelRuntime::KernelLaunchProfiling(const std::string &kernel_name) MS_LOG(EXCEPTION) << "Too many profiling data"; } } -} // namespace ascend -} // namespace device -} // namespace mindspore +} // namespace mindspore::device::ascend diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h index 3daa1215646..bb4c2c9206b 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h @@ -32,9 +32,7 @@ using ge::model_runner::TaskInfo; using std::unordered_map; using std::vector; -namespace mindspore { -namespace device { -namespace ascend { +namespace mindspore::device::ascend { class AscendKernelRuntime : public KernelRuntime { public: AscendKernelRuntime() = default; @@ -56,6 +54,7 @@ class AscendKernelRuntime : public KernelRuntime { void SetContext() override; void CreateContext() override; void *context() const override { return rt_context_; } + void PreInit() override; protected: DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, @@ -80,8 +79,9 @@ class AscendKernelRuntime : public KernelRuntime { bool CheckGraphIdValid(GraphId graph_id) const; void DistributeDebugTask(NotNull graph, NotNull> model_handle); void LaunchDataDump(GraphId graph_id); + static string GetErrorNodeName(uint32_t streamid, uint32_t taskid); static void DumpTaskExceptionInfo(const session::KernelGraph *graph); - static void ExceptionCallback(rtExceptionInfo *exception_info); + static void TaskFailCallback(rtTaskFailInfo *task_fail_info); void ReportProfilingData(); rtContext_t rt_context_{nullptr}; @@ -90,11 +90,11 @@ class AscendKernelRuntime : public KernelRuntime { unordered_map> graph_model_map_; unordered_map> graph_data_dumper_; std::map, std::string> stream_id_task_id_op_name_map_; - static std::vector exception_infoes_; + static uint32_t current_graph_id_; + static std::map overflow_tasks_; + static std::vector task_fail_infoes_; }; MS_REG_KERNEL_RUNTIME(kAscendDevice, AscendKernelRuntime); -} // namespace ascend -} // namespace device -} // namespace mindspore +} // namespace mindspore::device::ascend #endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_KERNEL_RUNTIME_H_ diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.cc index 94096d0b6b5..a4e9462c77c 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.cc @@ -31,7 +31,7 @@ void AscendMemoryManager::MallocDeviceMemory() { device_mem_size_ = context_mem == 0 ? kAscendDeviceMemSize : context_mem; auto ret = rtMalloc(reinterpret_cast(&device_mem_base_), device_mem_size_, RT_MEMORY_HBM); if (ret != ACL_RT_SUCCESS) { - if (ret == ACL_ERROR_RT_DRV_INTERNEL_ERROR) { + if (ret == ACL_ERROR_RT_DRV_INTERNAL_ERROR) { auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); unsigned int device_id = context_ptr->get_param(MS_CTX_DEVICE_ID); diff --git a/mindspore/ccsrc/runtime/device/ascend/profiling/plugin_impl.cc b/mindspore/ccsrc/runtime/device/ascend/profiling/plugin_impl.cc deleted file mode 100644 index 4886c00a8e0..00000000000 --- a/mindspore/ccsrc/runtime/device/ascend/profiling/plugin_impl.cc +++ /dev/null @@ -1,42 +0,0 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "runtime/device/ascend/profiling/plugin_impl.h" -#include -#include "utils/log_adapter.h" -using std::string; - -namespace mindspore { -namespace device { -namespace ascend { -Reporter *PluginImpl::reporter_ = nullptr; - -PluginImpl::PluginImpl(const std::string &module) : module_(module) { MS_LOG(INFO) << "Create PluginImpl."; } - -int PluginImpl::Init(const Reporter *reporter) { - MS_LOG(INFO) << "PluginImpl init"; - MS_EXCEPTION_IF_NULL(reporter); - reporter_ = const_cast(reporter); - return 0; -} - -int PluginImpl::UnInit() { - MS_LOG(INFO) << " PluginImpl Uninit "; - reporter_ = nullptr; - return 0; -} -} // namespace ascend -} // namespace device -} // namespace mindspore diff --git a/mindspore/ccsrc/runtime/device/ascend/profiling/plugin_impl.h b/mindspore/ccsrc/runtime/device/ascend/profiling/plugin_impl.h deleted file mode 100644 index 0150c5b41be..00000000000 --- a/mindspore/ccsrc/runtime/device/ascend/profiling/plugin_impl.h +++ /dev/null @@ -1,45 +0,0 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_PROFILING_PLUGIN_IMPL_H_ -#define MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_PROFILING_PLUGIN_IMPL_H_ - -#include -#include "./prof_engine.h" -using Msprof::Engine::PluginIntf; -using Msprof::Engine::Reporter; -using std::string; - -namespace mindspore { -namespace device { -namespace ascend { -class PluginImpl : public PluginIntf { - public: - explicit PluginImpl(const std::string &module); - ~PluginImpl() override = default; - int Init(const Reporter *reporter) override; - int UnInit() override; - static Reporter *GetPluginReporter() { return reporter_; } - - private: - static Reporter *reporter_; - std::string module_; -}; -} // namespace ascend -} // namespace device -} // namespace mindspore - -#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_PROFILING_PLUGIN_IMPL_H_ diff --git a/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_callback_register.cc b/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_callback_register.cc new file mode 100644 index 00000000000..202f28f7b05 --- /dev/null +++ b/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_callback_register.cc @@ -0,0 +1,93 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "runtime/device/ascend/profiling/profiling_callback_register.h" +#include "runtime/base.h" + +namespace Analysis { +namespace Dvvp { +namespace ProfilerCommon { +extern int32_t MsprofilerInit(); +} // namespace ProfilerCommon +} // namespace Dvvp +} // namespace Analysis + +namespace { +constexpr Status PROF_SUCCESS = 0; +constexpr Status PROF_FAILED = 0xFFFFFFFF; +} // namespace + +Status RegProfCtrlCallback(MsprofCtrlCallback func) { + if (VMCallbackRegister::GetInstance().registed()) { + return VMCallbackRegister::GetInstance().DoRegProfCtrlCallback(func); + } else { + return PROF_SUCCESS; + } +} + +Status RegProfSetDeviceCallback(MsprofSetDeviceCallback func) { + if (VMCallbackRegister::GetInstance().registed()) { + return VMCallbackRegister::GetInstance().DoRegProfSetDeviceCallback(func); + } else { + return PROF_SUCCESS; + } +} + +Status RegProfReporterCallback(MsprofReporterCallback func) { + if (VMCallbackRegister::GetInstance().registed()) { + return VMCallbackRegister::GetInstance().DoRegProfReporterCallback(func); + } else { + return PROF_SUCCESS; + } +} + +Status ProfCommandHandle(ProfCommandHandleType type, void *data, uint32_t len) { + if (VMCallbackRegister::GetInstance().registed()) { + return VMCallbackRegister::GetInstance().DoProfCommandHandle(type, data, len); + } else { + return PROF_SUCCESS; + } +} + +bool IsInitialize() { return true; } + +VMCallbackRegister &VMCallbackRegister::GetInstance() { + static VMCallbackRegister instance; + return instance; +} + +bool VMCallbackRegister::Registe(Status (*pRegProfCtrlCallback)(MsprofCtrlCallback), + Status (*pRegProfSetDeviceCallback)(MsprofSetDeviceCallback), + Status (*pRegProfReporterCallback)(MsprofReporterCallback), + Status (*pProfCommandHandle)(ProfCommandHandleType, void *, uint32_t)) { + if (!registed_) { + pRegProfCtrlCallback_ = pRegProfCtrlCallback; + pRegProfSetDeviceCallback_ = pRegProfSetDeviceCallback; + pRegProfReporterCallback_ = pRegProfReporterCallback; + pProfCommandHandle_ = pProfCommandHandle; + registed_ = true; + ForceMsprofilerInit(); + return true; + } + return false; +} + +void VMCallbackRegister::ForceMsprofilerInit() { + if (!ms_profile_inited_) { + Analysis::Dvvp::ProfilerCommon::MsprofilerInit(); + ms_profile_inited_ = true; + } +} diff --git a/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_callback_register.h b/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_callback_register.h new file mode 100644 index 00000000000..a90d7e836d6 --- /dev/null +++ b/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_callback_register.h @@ -0,0 +1,82 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_PROFILING_PROFILING_CALLBACK_REGISTER_H_ +#define MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_PROFILING_PROFILING_CALLBACK_REGISTER_H_ + +#include "toolchain/prof_callback.h" + +#define MAX_DEV_NUM (64) + +using Status = uint32_t; +enum ProfCommandHandleType { + kProfCommandhandleInit = 0, + kProfCommandhandleStart, + kProfCommandhandleStop, + kProfCommandhandleFinalize, + kProfCommandhandleModelSubscribe, + kProfCommandhandleModelUnsubscribe +}; + +struct ProfCommandHandleData { + uint64_t profSwitch; + uint32_t devNums; // length of device id list + uint32_t devIdList[MAX_DEV_NUM]; + uint32_t modelId; +}; + +Status RegProfCtrlCallback(MsprofCtrlCallback func); +Status RegProfSetDeviceCallback(MsprofSetDeviceCallback func); +Status RegProfReporterCallback(MsprofReporterCallback func); +Status ProfCommandHandle(ProfCommandHandleType type, void *data, uint32_t len); +bool IsInitialize(); + +class __attribute__((visibility("default"))) VMCallbackRegister { + public: + static VMCallbackRegister &GetInstance(); + VMCallbackRegister(const VMCallbackRegister &) = delete; + VMCallbackRegister &operator=(const VMCallbackRegister &) = delete; + bool Registe(Status (*pRegProfCtrlCallback)(MsprofCtrlCallback), + Status (*pRegProfSetDeviceCallback)(MsprofSetDeviceCallback), + Status (*pRegProfReporterCallback)(MsprofReporterCallback), + Status (*pProfCommandHandle)(ProfCommandHandleType, void *, uint32_t)); + void ForceMsprofilerInit(); + bool registed() { return registed_; } + Status DoRegProfCtrlCallback(MsprofCtrlCallback func) { return pRegProfCtrlCallback_(func); } + Status DoRegProfSetDeviceCallback(MsprofSetDeviceCallback func) { return pRegProfSetDeviceCallback_(func); } + Status DoRegProfReporterCallback(MsprofReporterCallback func) { return pRegProfReporterCallback_(func); } + Status DoProfCommandHandle(ProfCommandHandleType type, void *data, uint32_t len) { + return pProfCommandHandle_(type, data, len); + } + + private: + VMCallbackRegister() + : registed_(false), + ms_profile_inited_(false), + pRegProfCtrlCallback_(nullptr), + pRegProfSetDeviceCallback_(nullptr), + pRegProfReporterCallback_(nullptr), + pProfCommandHandle_(nullptr) {} + ~VMCallbackRegister() = default; + + bool registed_; + bool ms_profile_inited_; + Status (*pRegProfCtrlCallback_)(MsprofCtrlCallback); + Status (*pRegProfSetDeviceCallback_)(MsprofSetDeviceCallback); + Status (*pRegProfReporterCallback_)(MsprofReporterCallback); + Status (*pProfCommandHandle_)(ProfCommandHandleType, void *, uint32_t); +}; +#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_PROFILING_PROFILING_CALLBACK_REGISTER_H_ diff --git a/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_engine_impl.cc b/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_engine_impl.cc deleted file mode 100644 index 1f35cba0f74..00000000000 --- a/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_engine_impl.cc +++ /dev/null @@ -1,37 +0,0 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "runtime/device/ascend/profiling/profiling_engine_impl.h" -#include "utils/log_adapter.h" -#include "runtime/device/ascend/profiling/plugin_impl.h" - -namespace mindspore { -namespace device { -namespace ascend { -PluginIntf *ProfilingEngineImpl::CreatePlugin() { - MS_LOG(INFO) << "Create Plugin."; - return new (std::nothrow) PluginImpl("Framework"); -} - -int ProfilingEngineImpl::ReleasePlugin(PluginIntf *plugin) { - if (plugin != nullptr) { - delete plugin; - plugin = nullptr; - } - return 0; -} -} // namespace ascend -} // namespace device -} // namespace mindspore diff --git a/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_engine_impl.h b/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_engine_impl.h deleted file mode 100644 index cdb175fe5c7..00000000000 --- a/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_engine_impl.h +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_PROFILING_PROFILING_ENGINE_IMPL_H_ -#define MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_PROFILING_PROFILING_ENGINE_IMPL_H_ - -#include "./prof_engine.h" - -using Msprof::Engine::EngineIntf; -using Msprof::Engine::PluginIntf; - -namespace mindspore { -namespace device { -namespace ascend { -class ProfilingEngineImpl : public EngineIntf { - public: - ProfilingEngineImpl() = default; - ~ProfilingEngineImpl() override = default; - - PluginIntf *CreatePlugin() override; - int ReleasePlugin(PluginIntf *plugin) override; -}; -} // namespace ascend -} // namespace device -} // namespace mindspore - -#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_PROFILING_PROFILING_ENGINE_IMPL_H_ diff --git a/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_manager.cc b/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_manager.cc index efa0cd99f04..049ed132d93 100644 --- a/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_manager.cc +++ b/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_manager.cc @@ -19,18 +19,20 @@ #include #include "securec/include/securec.h" #include "./prof_mgr_core.h" -#include "runtime/device/ascend/profiling/plugin_impl.h" -#include "runtime/device/ascend/profiling/profiling_engine_impl.h" #include "utils/log_adapter.h" #include "utils/ms_context.h" #include "utils/ms_utils.h" #include "utils/convert_utils.h" #include "runtime/base.h" #include "toolchain/prof_acl_api.h" +#include "runtime/device/ascend/profiling/profiling_callback_register.h" namespace { constexpr uint32_t kProfilingDeviceNum = 1; -} +constexpr auto kRtSetDeviceRegName = "profiling"; +constexpr Status PROF_SUCCESS = 0; +constexpr Status PROF_FAILED = 0xFFFFFFFF; +} // namespace namespace mindspore { namespace device { @@ -40,9 +42,7 @@ ProfilingManager &ProfilingManager::GetInstance() { return inst; } -ProfilingManager::ProfilingManager() : device_id_(0), prof_handle_(nullptr) { - engine_0_ = std::make_shared(); -} +ProfilingManager::ProfilingManager() : device_id_(0), prof_cb_({0}) {} uint64_t ProfilingManager::GetJobId() const { const char *job_id = std::getenv("JOB_ID"); @@ -58,14 +58,10 @@ bool ProfilingManager::ReportProfilingData(const map &op_taskI MS_LOG(WARNING) << "op_taskId_map is empty."; return false; } - auto reporter = PluginImpl::GetPluginReporter(); - if (reporter == nullptr) { - MS_LOG(ERROR) << "No profiling data report!"; - return false; - } + MS_LOG(INFO) << "DistributeTask: op tasId map size = " << op_taskId_map.size(); - Msprof::Engine::ReporterData reporter_data = {}; + ReporterData reporter_data = {}; for (const auto &iter : op_taskId_map) { auto data = iter.second + ' ' + std::to_string(iter.first) + ';'; reporter_data.deviceId = UintToInt(device_id_); @@ -76,43 +72,67 @@ bool ProfilingManager::ReportProfilingData(const map &op_taskI MS_LOG(ERROR) << "memcpy_s error, errorno(" << ret << ")"; return false; } - ret = reporter->Report(&reporter_data); - if (ret != 0) { - MS_LOG(ERROR) << "reporter data fail, errorno(" << ret << ")"; + int32_t cb_ret = CallMsprofReport(NOT_NULL(&reporter_data)); + if (cb_ret != 0) { + MS_LOG(ERROR) << "reporter data fail, errorno(" << cb_ret << ")"; return false; } } return true; } -static std::vector Split(const std::string &str, const char delim) { - std::vector elems; - - if (str.empty()) { - elems.emplace_back(""); - return elems; - } - - std::stringstream ss(str); - std::string item; - - while (getline(ss, item, delim)) { - elems.push_back(item); - } - auto str_size = str.size(); - if (str_size > 0 && str[str_size - 1] == delim) { - elems.emplace_back(""); - } - - return elems; -} - uint64_t GetProfilingModule() { return PROF_MODEL_EXECUTE_MASK | PROF_RUNTIME_API_MASK | PROF_RUNTIME_TRACE_MASK | PROF_SCHEDULE_TIMELINE_MASK | PROF_SCHEDULE_TRACE_MASK | PROF_TASK_TIME_MASK | PROF_SUBTASK_TIME_MASK | PROF_AICPU_TRACE_MASK | PROF_AICORE_METRICS_MASK | PROF_AIVECTORCORE_METRICS_MASK | PROF_MODEL_LOAD_MASK; } +Status ProfilingManager::PluginInit() const { + if (prof_cb_.msprofReporterCallback == nullptr) { + MS_LOG(ERROR) << "MsprofReporterCallback callback is nullptr."; + return PROF_FAILED; + } + return prof_cb_.msprofReporterCallback(static_cast(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK), + static_cast(MsprofReporterCallbackType::MSPROF_REPORTER_INIT), + nullptr, 0); +} + +void ProfilingManager::PluginUnInit() const { + if (prof_cb_.msprofReporterCallback == nullptr) { + MS_LOG(ERROR) << "MsprofReporterCallback callback is nullptr."; + return; + } + int32_t cb_ret = prof_cb_.msprofReporterCallback( + static_cast(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK), + static_cast(MsprofReporterCallbackType::MSPROF_REPORTER_UNINIT), nullptr, 0); + if (cb_ret != 0) { + MS_LOG(WARNING) << "profiling plugin uninit failed, ret:%d" << cb_ret; + } +} + +Status ProfilingManager::GetProfConf(NotNull prof) { + string job_id = std::to_string(GetJobId()); + + if (memcpy_s(prof->jobId, sizeof(prof->jobId), job_id.c_str(), sizeof(job_id.c_str())) != EOK) { + MS_LOG(ERROR) << "Copy job_id failed."; + return PROF_FAILED; + } + + auto context = MsContext::GetInstance(); + if (context == nullptr) { + MS_LOG(ERROR) << "Context is nullptr."; + return PROF_FAILED; + } + + const string prof_options_str = context->get_param(MS_CTX_PROFILING_OPTIONS); + + if (memcpy_s(prof->options, MSPROF_OPTIONS_DEF_LEN_MAX, prof_options_str.c_str(), prof_options_str.size()) != EOK) { + MS_LOG(ERROR) << "Copy profiling_options failed"; + return PROF_FAILED; + } + return PROF_SUCCESS; +} + bool ProfilingManager::StartupProfiling(uint32_t device_id) { auto is_profiling = IsProfiling(); if (!is_profiling) { @@ -120,42 +140,14 @@ bool ProfilingManager::StartupProfiling(uint32_t device_id) { return true; } device_id_ = device_id; - // register Framework to profiling - int result = Msprof::Engine::RegisterEngine("Framework", engine_0_.get()); - if (result != 0) { - MS_LOG(ERROR) << "Register profiling Engine failed."; + + struct MsprofGeOptions prof_conf = {0}; + if (GetProfConf(NOT_NULL(&prof_conf)) != PROF_SUCCESS) { + MS_LOG(ERROR) << "Get prof conf failed."; return false; } - auto context = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(context); - const string prof_options_str = context->get_param(MS_CTX_PROFILING_OPTIONS); - std::vector opts = Split(prof_options_str, ':'); - if (opts.empty()) { - MS_LOG(WARNING) << "Profiling is enabled, but profiling option is not set!"; - return true; - } - // current one docker only use one device` - nlohmann::json p_device; - // JOBID - auto job_id = GetJobId(); - p_device["jobID"] = std::to_string(job_id); - // device_id - p_device["deviceID"] = std::to_string(device_id); - // features:'training_trace', 'task_trace' etc - nlohmann::json features; - for (std::vector::size_type i = 0; i < opts.size(); i++) { - nlohmann::json f; - f["name"] = opts[i]; - features[i] = f; - } - p_device["features"] = features; - // only one device, but sProfMgrStartUp API require for device list - nlohmann::json devices; - devices[0] = p_device; - nlohmann::json startCfg; - startCfg["startCfg"] = devices; - if (!ProfStartUp(startCfg)) { + if (!ProfStartUp(NOT_NULL(&prof_conf))) { MS_LOG(ERROR) << "ProfMgrStartUp failed."; return false; } @@ -168,28 +160,24 @@ uint32_t GetCurrentDeviceId() { return context->get_param(MS_CTX_DEVICE_ID); } -bool ProfilingManager::ProfStartUp(const nlohmann::json &startCfg) { - // convert json to string - std::stringstream ss; - ss << startCfg; - std::string cfg = ss.str(); - MS_LOG(INFO) << "profiling config " << cfg; +bool ProfilingManager::ProfStartUp(NotNull prof_conf) { + MS_LOG(INFO) << "Prof start up. "; - auto module = GetProfilingModule(); - auto device_id = GetCurrentDeviceId(); - auto ret = rtProfilerStart(module, kProfilingDeviceNum, &device_id); - if (ret != RT_ERROR_NONE) { - MS_LOG(INFO) << "Call rtProfilerStart failed, ret:" << ret; + if (prof_cb_.msprofCtrlCallback == nullptr) { + MS_LOG(ERROR) << "MsprofCtrlCallback callback is nullptr."; return false; } - // call profiling startup API - ProfMgrCfg prof_cfg = {cfg}; - prof_handle_ = ProfMgrStartUp(&prof_cfg); - if (prof_handle_ == nullptr) { - MS_LOG(ERROR) << "Startup profiling failed."; + // call profiling start up api + int32_t cb_ret = + prof_cb_.msprofCtrlCallback(static_cast(MsprofCtrlCallbackType::MSPROF_CTRL_INIT_GE_OPTIONS), + static_cast(prof_conf.get()), sizeof(MsprofGeOptions)); + if (cb_ret != PROF_SUCCESS) { + MS_LOG(ERROR) << "Call msprofCtrlCallback failed, ret: " << cb_ret; return false; } + + MS_LOG(INFO) << "Start up profiling success."; return true; } @@ -199,12 +187,10 @@ bool ProfilingManager::StopProfiling() { MS_LOG(INFO) << "No need profiling. please export PROFILING_MODE and in train mode."; return true; } - Msprof::Engine::Reporter *reporter = PluginImpl::GetPluginReporter(); - if (reporter != nullptr) { - auto ret = reporter->Flush(); - MS_LOG(INFO) << "report data end, ret = " << ret; - } + // plugin unregister + PluginUnInit(); + // stop runtime profiler auto module = GetProfilingModule(); uint32_t device_ids[kProfilingDeviceNum] = {GetCurrentDeviceId()}; @@ -214,18 +200,109 @@ bool ProfilingManager::StopProfiling() { return false; } - if (prof_handle_ != nullptr) { - int result = ProfMgrStop(prof_handle_); - if (result != 0) { - MS_LOG(ERROR) << "ProfMgr stop return fail:" << result << "."; - prof_handle_ = nullptr; - return false; - } - prof_handle_ = nullptr; + // stop profiling + if (prof_cb_.msprofCtrlCallback == nullptr) { + MS_LOG(ERROR) << "MsprofCtrlCallback callback is nullptr."; + return false; } + int32_t cb_ret = + prof_cb_.msprofCtrlCallback(static_cast(MsprofCtrlCallbackType::MSPROF_CTRL_FINALIZE), nullptr, 0); + if (cb_ret != 0) { + MS_LOG(WARNING) << "Call msprofCtrlCallback failed, ret: " << cb_ret; + return false; + } return true; } + +Status ProfilingManager::CallMsprofReport(NotNull reporter_data) const { + if (prof_cb_.msprofReporterCallback == nullptr) { + MS_LOG(ERROR) << "MsprofReporterCallback callback is nullptr."; + return PROF_FAILED; + } + return prof_cb_.msprofReporterCallback(static_cast(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK), + static_cast(MsprofReporterCallbackType::MSPROF_REPORTER_REPORT), + static_cast(reporter_data.get()), sizeof(ReporterData)); +} + +Status RegProfCtrlCallback(MsprofCtrlCallback func) { + if (func == nullptr) { + MS_LOG(ERROR) << "Msprof ctrl callback is nullptr."; + return PROF_FAILED; + } + if (ProfilingManager::GetInstance().GetMsprofCallback().msprofCtrlCallback != nullptr) { + MS_LOG(WARNING) << "Msprof ctrl callback is exist, just ignore it."; + } else { + MS_LOG(INFO) << "GE register Msprof ctrl callback."; + ProfilingManager::GetInstance().SetMsprofCtrlCallback(func); + } + return PROF_SUCCESS; +} + +Status RegProfSetDeviceCallback(MsprofSetDeviceCallback func) { + if (func == nullptr) { + MS_LOG(ERROR) << "MsprofSetDeviceCallback callback is nullptr."; + return PROF_FAILED; + } + ProfilingManager::GetInstance().SetMsprofSetDeviceCallback(func); + // Pass MsprofSetDeviceCallback to runtime + MS_LOG(INFO) << "GE pass setdevice callback to runtime."; + Status rt_ret = rtRegDeviceStateCallback(kRtSetDeviceRegName, static_cast(func)); + if (rt_ret != PROF_SUCCESS) { + MS_LOG(ERROR) << "Pass MsprofSetDeviceCallback to runtime failed!"; + return rt_ret; + } + return PROF_SUCCESS; +} + +Status RegProfReporterCallback(MsprofReporterCallback func) { + if (func == nullptr) { + MS_LOG(ERROR) << "MsprofReporterCallback callback is nullptr."; + return PROF_FAILED; + } + if (ProfilingManager::GetInstance().GetMsprofCallback().msprofReporterCallback != nullptr) { + MS_LOG(WARNING) << "Msprof reporter callback is exist, just ignore it."; + } else { + MS_LOG(INFO) << "GE register Msprof reporter callback."; + ProfilingManager::GetInstance().SetMsprofReporterCallback(func); + // Pass MsprofReporterCallback to runtime + Status rt_ret = rtSetMsprofReporterCallback(func); + if (rt_ret != PROF_SUCCESS) { + MS_LOG(ERROR) << "Pass MsprofReporterCallback to runtime failed, ret: " << rt_ret; + return rt_ret; + } + // Pass MsprofReporterCallback to hccl + } + return PROF_SUCCESS; +} + +Status ProfCommandHandle(ProfCommandHandleType type, void *data, uint32_t len) { + MS_LOG(INFO) << "ProfCommandHandle start, type:" << type; + if (type == kProfCommandhandleInit) { + auto cb_ret = ProfilingManager::GetInstance().PluginInit(); + if (cb_ret != PROF_SUCCESS) { + MS_LOG(ERROR) << "Profiling plugin int failed."; + return PROF_FAILED; + } + + // call runtime profiler API + auto module = GetProfilingModule(); + auto device_id = GetCurrentDeviceId(); + auto ret = rtProfilerStart(module, kProfilingDeviceNum, &device_id); + if (ret != RT_ERROR_NONE) { + MS_LOG(ERROR) << "Call rtProfilerStart failed, ret:" << ret; + return PROF_FAILED; + } + } + return PROF_SUCCESS; +} + +bool DoRegiste() { + MS_LOG(INFO) << "VM profiling register start"; + return VMCallbackRegister::GetInstance().Registe(RegProfCtrlCallback, RegProfSetDeviceCallback, + RegProfReporterCallback, ProfCommandHandle); +} +static bool doRegiste = DoRegiste(); } // namespace ascend } // namespace device } // namespace mindspore diff --git a/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_manager.h b/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_manager.h index 60f5e1a9dd7..bbec7ba1c4a 100644 --- a/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_manager.h +++ b/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_manager.h @@ -23,13 +23,21 @@ #include #include "utils/contract.h" #include "utils/ms_context.h" +#include "toolchain/prof_callback.h" +#include "runtime/device/ascend/profiling/profiling_callback_register.h" using std::map; using std::string; +using Status = uint32_t; namespace mindspore { namespace device { namespace ascend { -class ProfilingEngineImpl; +struct MsprofCallback { + MsprofCtrlCallback msprofCtrlCallback; + MsprofSetDeviceCallback msprofSetDeviceCallback; + MsprofReporterCallback msprofReporterCallback; +}; + class ProfilingManager { public: static ProfilingManager &GetInstance(); @@ -43,17 +51,31 @@ class ProfilingManager { MS_EXCEPTION_IF_NULL(context); return context->get_param(MS_CTX_ENABLE_PROFILING); } + Status PluginInit() const; + void PluginUnInit() const; + Status CallMsprofReport(NotNull reporter_data) const; + struct MsprofCallback &GetMsprofCallback() { + return prof_cb_; + } + void SetMsprofCtrlCallback(MsprofCtrlCallback func) { prof_cb_.msprofCtrlCallback = func; } + void SetMsprofReporterCallback(MsprofReporterCallback func) { prof_cb_.msprofReporterCallback = func; } + void SetMsprofSetDeviceCallback(MsprofSetDeviceCallback func) { prof_cb_.msprofSetDeviceCallback = func; } + Status GetProfConf(NotNull prof); protected: ProfilingManager(); - ~ProfilingManager() { prof_handle_ = nullptr; } + ~ProfilingManager() {} private: - bool ProfStartUp(const nlohmann::json &json); - std::shared_ptr engine_0_; + bool ProfStartUp(NotNull prof_conf); uint32_t device_id_; - void *prof_handle_; + MsprofCallback prof_cb_; }; + +Status RegProfCtrlCallback(MsprofCtrlCallback func); +Status RegProfSetDeviceCallback(MsprofSetDeviceCallback func); +Status RegProfReporterCallback(MsprofReporterCallback func); +Status ProfCommandHandle(ProfCommandHandleType type, void *data, uint32_t len); } // namespace ascend } // namespace device } // namespace mindspore diff --git a/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_utils.cc b/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_utils.cc index e93761fcb98..23602716a3e 100644 --- a/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_utils.cc +++ b/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_utils.cc @@ -24,14 +24,15 @@ #include "runtime/device/ascend/profiling/reporter/task_desc_reporter.h" #include "utils/ms_context.h" #include "runtime/device/ascend/profiling/reporter/point_reporter.h" +#include "nlohmann/json.hpp" namespace mindspore { namespace device { namespace ascend { constexpr uint32_t kMaxProfilingNodeNum = 100; constexpr char kCustomNode[] = "PROFILING_CUSTOM_"; -constexpr char kFpStartNode[] = "PROFILING_FP_START"; -constexpr char kBpEndNode[] = "PROFILING_BP_END"; +constexpr char kFpStartNode[] = "fp_point"; +constexpr char kBpEndNode[] = "bp_point"; constexpr char kIterEndNode[] = "PROFILING_ITER_END"; // PROFILING_CUSTOM_LOGID_START 3 constexpr uint64_t kProfilingFpStartLogId = 1; @@ -42,14 +43,29 @@ std::map> ProfilingUtils::graph_kernel_name_; std::map>> ProfilingUtils::graph_point_; uint32_t ProfilingUtils::custom_node_index_ = 1; +nlohmann::json GetContextProfilingOption() { + auto context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context); + const string prof_options_str = context->get_param(MS_CTX_PROFILING_OPTIONS); + nlohmann::json j; + try { + j = nlohmann::json::parse(prof_options_str); + } catch (nlohmann::json::parse_error &e) { + MS_LOG(EXCEPTION) << "Parse profiling option json failed, error:" << e.what(); + } + return j; +} + ProfilingTraceInfo ProfilingUtils::GetProfilingTraceFromEnv(NotNull graph_ptr) { MS_LOG(INFO) << "get env start"; custom_node_index_ = 1; auto &cnode_exec_order = graph_ptr->execution_order(); + auto profiling_option = GetContextProfilingOption(); + ProfilingTraceInfo profiling_trace; - profiling_trace.trace_begin = GetTraceBegin(cnode_exec_order); - profiling_trace.trace_bp_end = GetTraceBpEnd(cnode_exec_order); - profiling_trace.trace_netoutput = GetTraceNetoutput(cnode_exec_order); + profiling_trace.trace_begin = GetTraceBegin(cnode_exec_order, profiling_option); + profiling_trace.trace_bp_end = GetTraceBpEnd(cnode_exec_order, profiling_option); + profiling_trace.trace_netoutput = GetTraceNetoutput(cnode_exec_order, profiling_option); for (uint32_t i = 1; i <= kMaxProfilingNodeNum; ++i) { std::string env_str = std::string(kCustomNode) + std::to_string(i); @@ -80,10 +96,14 @@ void ProfilingUtils::GetTraceHccl(const std::vector &cnode_exec_order, } } -std::string ProfilingUtils::GetTraceBegin(const std::vector &cnode_exec_order) { - const char *trace_begin = std::getenv(kFpStartNode); - if (trace_begin != nullptr) { - return std::string(trace_begin); +std::string ProfilingUtils::GetTraceBegin(const std::vector &cnode_exec_order, const nlohmann::json &option) { + auto iter = option.find(kFpStartNode); + if (iter != option.end() && iter->is_string()) { + std::string trace_begin_str = *iter; + if (!trace_begin_str.empty()) { + MS_LOG(INFO) << "Get fp_point from profiling_option:" << trace_begin_str; + return trace_begin_str; + } } std::string fp_start_str; @@ -124,12 +144,16 @@ void ProfilingUtils::GetCNodeOutputRealNode(const std::string &node_name, const } } -std::string ProfilingUtils::GetTraceBpEnd(const std::vector &cnode_exec_order) { - const char *trace_bp_end = std::getenv(kBpEndNode); - - if (trace_bp_end != nullptr) { - return std::string(trace_bp_end); +std::string ProfilingUtils::GetTraceBpEnd(const std::vector &cnode_exec_order, const nlohmann::json &option) { + auto bp_point = option.find(kBpEndNode); + if (bp_point != option.end() && bp_point->is_string()) { + std::string bp_point_str = *bp_point; + if (!bp_point_str.empty()) { + MS_LOG(INFO) << "Get bp_point from profiling_option:" << bp_point_str; + return bp_point_str; + } } + std::string bp_end_str; // Contain hccl kernel auto iter = cnode_exec_order.rbegin(); @@ -179,9 +203,17 @@ std::string ProfilingUtils::GetGraphLastTbeKernelName(const std::vector &cnode_exec_order) { - const char *trace_netoutput = std::getenv(kIterEndNode); - return trace_netoutput == nullptr ? GetGraphLastTbeKernelName(cnode_exec_order) : std::string(trace_netoutput); +std::string ProfilingUtils::GetTraceNetoutput(const std::vector &cnode_exec_order, + const nlohmann::json &option) { + auto iter_end = option.find(kIterEndNode); + if (iter_end != option.end() && iter_end->is_string()) { + std::string iter_end_str = *iter_end; + if (!iter_end_str.empty()) { + MS_LOG(INFO) << "Get iter_end from profiling_option:" << iter_end_str; + return iter_end_str; + } + } + return GetGraphLastTbeKernelName(cnode_exec_order); } NotNull ProfilingUtils::CreateProfilingCNode(const ProfilingContent &profiling_content, diff --git a/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_utils.h b/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_utils.h index 468e8f8394a..f5ab6ebb434 100644 --- a/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_utils.h +++ b/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_utils.h @@ -118,9 +118,9 @@ class ProfilingUtils { NotNull graph_ptr); static CNodePtr CreateProfilingCNodeWithStream(const AnfNodePtr &anf_node, const ProfilingContent &profiling_content, NotNull graph_ptr); - static std::string GetTraceBegin(const std::vector &cnode_exec_order); - static std::string GetTraceBpEnd(const std::vector &cnode_exec_order); - static std::string GetTraceNetoutput(const std::vector &cnode_exec_order); + static std::string GetTraceBegin(const std::vector &cnode_exec_order, const nlohmann::json &option); + static std::string GetTraceBpEnd(const std::vector &cnode_exec_order, const nlohmann::json &option); + static std::string GetTraceNetoutput(const std::vector &cnode_exec_order, const nlohmann::json &option); static std::string GetGraphLastTbeKernelName(const std::vector &cnode_exec_order); static void GetTraceHccl(const std::vector &cnode_exec_order, NotNull profiling_trace); diff --git a/mindspore/ccsrc/runtime/device/ascend/profiling/reporter/desc_reporter.cc b/mindspore/ccsrc/runtime/device/ascend/profiling/reporter/desc_reporter.cc index 87e2bbcb064..582b9a125d8 100644 --- a/mindspore/ccsrc/runtime/device/ascend/profiling/reporter/desc_reporter.cc +++ b/mindspore/ccsrc/runtime/device/ascend/profiling/reporter/desc_reporter.cc @@ -16,7 +16,7 @@ #include #include "runtime/device/ascend/profiling/reporter/desc_reporter.h" -#include "runtime/device/ascend/profiling/plugin_impl.h" +#include "runtime/device/ascend/profiling/profiling_manager.h" #include "utils/log_adapter.h" constexpr size_t kReportMaxLen = 2048; @@ -27,16 +27,13 @@ namespace ascend { DescReporter::~DescReporter() = default; void DescReporter::ReportByLine(const std::string &data, const std::string &file_name) const { - auto reporter = PluginImpl::GetPluginReporter(); - MS_EXCEPTION_IF_NULL(reporter); - auto tot_size = data.size(); size_t cur_size = 0; while (cur_size < tot_size) { size_t remain_size = tot_size - cur_size; size_t report_size = std::min(remain_size, kReportMaxLen); - Msprof::Engine::ReporterData report_data{}; + ReporterData report_data{}; report_data.deviceId = device_id_; report_data.dataLen = report_size; report_data.data = (unsigned char *)data.c_str() + cur_size; @@ -44,7 +41,7 @@ void DescReporter::ReportByLine(const std::string &data, const std::string &file if (ret != 0) { MS_LOG(EXCEPTION) << "Memcpy_s report data tag failed"; } - auto report_ret = reporter->Report(&report_data); + auto report_ret = ProfilingManager::GetInstance().CallMsprofReport(NOT_NULL(&report_data)); if (report_ret != 0) { MS_LOG(EXCEPTION) << "Report data failed"; } diff --git a/mindspore/ccsrc/runtime/device/kernel_adjust.cc b/mindspore/ccsrc/runtime/device/kernel_adjust.cc index 088c42be374..7b8360ba97c 100644 --- a/mindspore/ccsrc/runtime/device/kernel_adjust.cc +++ b/mindspore/ccsrc/runtime/device/kernel_adjust.cc @@ -515,6 +515,10 @@ CNodePtr KernelAdjust::CreateStreamAssignAddnOP(const std::shared_ptrset_abstract(switch_loop_input.at(kCurLoopCountParamName)->abstract()); + // add AssignAdd op to kernel ref node map + session::AnfWithOutIndex final_pair = std::make_pair(assign_add_one, 0); + session::KernelWithIndex kernel_with_index = AnfAlgo::VisitKernel(AnfAlgo::GetInputNode(assign_add_one, 0), 0); + kernel_graph_ptr->AddRefCorrespondPairs(final_pair, kernel_with_index); return assign_add_one; } diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.h b/mindspore/ccsrc/runtime/device/kernel_runtime.h index e9a563ef1d6..1b046cc9b2c 100644 --- a/mindspore/ccsrc/runtime/device/kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/kernel_runtime.h @@ -99,6 +99,8 @@ class KernelRuntime { #endif } + virtual void PreInit() {} + protected: virtual DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, TypeId type_id) = 0; diff --git a/mindspore/ccsrc/utils/comm_manager.cc b/mindspore/ccsrc/utils/comm_manager.cc index 6411479f77c..c92d33a9ab1 100644 --- a/mindspore/ccsrc/utils/comm_manager.cc +++ b/mindspore/ccsrc/utils/comm_manager.cc @@ -67,26 +67,26 @@ bool CommManager::CreateGroupSync(const string &group, const vector(rank_id_list).data())); + HcomCreateGroup(group.c_str(), UlongToUint(rank_size), vector(rank_id_list).data())); return true; } bool CommManager::GetRankID(const string &group, unsigned int *rank_id) const { HCCL_GROUP_CHECK_EMPTY(group); - HCCL_RUN_CHECK(string("get rank_id"), group, hcom_get_rank_id(group.c_str(), rank_id)); + HCCL_RUN_CHECK(string("get rank_id"), group, HcomGetRankId(group.c_str(), rank_id)); return true; } bool CommManager::GetRankSize(const string &group, unsigned int *rank_size) const { HCCL_GROUP_CHECK_EMPTY(group); - HCCL_RUN_CHECK(string("get rank size"), group, hcom_get_rank_size(group.c_str(), rank_size)); + HCCL_RUN_CHECK(string("get rank size"), group, HcomGetRankSize(group.c_str(), rank_size)); return true; } bool CommManager::DestroyGroup(const string &group) const { HCCL_GROUP_CHECK_EMPTY(group); HCCL_GROUP_CHECK_IS_WORLD(group); - HCCL_RUN_CHECK(string("destroy communicate group"), group, hcom_destroy_group(group.c_str())); + HCCL_RUN_CHECK(string("destroy communicate group"), group, HcomDestroyGroup(group.c_str())); return true; } #elif defined(ENABLE_GPU) diff --git a/mindspore/communication/_hccl_management.py b/mindspore/communication/_hccl_management.py index 3c834116826..f9610cac6cb 100644 --- a/mindspore/communication/_hccl_management.py +++ b/mindspore/communication/_hccl_management.py @@ -110,7 +110,7 @@ def create_group(group, rank_num, rank_ids): c_array_rank_ids = c_array(ctypes.c_uint, rank_ids) c_rank_num = ctypes.c_uint(rank_num) c_group = c_str(group) - ret = HCCL_LIB_CTYPES.hcom_create_group(c_group, c_rank_num, c_array_rank_ids) + ret = HCCL_LIB_CTYPES.HcomCreateGroup(c_group, c_rank_num, c_array_rank_ids) if ret != 0: raise RuntimeError('Create group error.') else: @@ -129,7 +129,7 @@ def destroy_group(group): """ check_group(group) c_group = c_str(group) - ret = HCCL_LIB_CTYPES.hcom_destroy_group(c_group) + ret = HCCL_LIB_CTYPES.HcomDestroyGroup(c_group) if ret != 0: raise RuntimeError('Destroy group error.') @@ -147,7 +147,7 @@ def get_rank_size(group="hccl_world_group"): check_group(group) c_group = c_str(group) c_rank_size = ctypes.c_uint() - ret = HCCL_LIB_CTYPES.hcom_get_rank_size(c_group, ctypes.byref(c_rank_size)) + ret = HCCL_LIB_CTYPES.HcomGetRankSize(c_group, ctypes.byref(c_rank_size)) if ret != 0: raise RuntimeError('Get rank size error.') @@ -164,7 +164,7 @@ def get_rank_id(group="hccl_world_group"): check_group(group) c_group = c_str(group) c_rank_id = ctypes.c_uint() - ret = HCCL_LIB_CTYPES.hcom_get_rank_id(c_group, ctypes.byref(c_rank_id)) + ret = HCCL_LIB_CTYPES.HcomGetRankId(c_group, ctypes.byref(c_rank_id)) if ret != 0: raise RuntimeError('Get rank id error.') @@ -184,7 +184,7 @@ def get_local_rank_size(group="hccl_world_group"): check_group(group) c_group = c_str(group) c_local_rank_size = ctypes.c_uint() - ret = HCCL_LIB_CTYPES.hcom_get_local_rank_size(c_group, ctypes.byref(c_local_rank_size)) + ret = HCCL_LIB_CTYPES.HcomGetLocalRankSize(c_group, ctypes.byref(c_local_rank_size)) if ret != 0: raise RuntimeError('Get local rank size error.') @@ -203,7 +203,7 @@ def get_local_rank_id(group="hccl_world_group"): check_group(group) c_group = c_str(group) c_local_rank_id = ctypes.c_uint() - ret = HCCL_LIB_CTYPES.hcom_get_local_rank_id(c_group, ctypes.byref(c_local_rank_id)) + ret = HCCL_LIB_CTYPES.HcomGetLocalRankId(c_group, ctypes.byref(c_local_rank_id)) if ret != 0: raise RuntimeError('Get local rank id error.') @@ -225,7 +225,7 @@ def get_world_rank_from_group_rank(group, group_rank_id): c_group = c_str(group) c_group_rank_id = ctypes.c_uint(group_rank_id) c_world_rank_id = ctypes.c_uint() - ret = HCCL_LIB_CTYPES.hcom_get_world_rank_from_group_rank(c_group, c_group_rank_id, ctypes.byref(c_world_rank_id)) + ret = HCCL_LIB_CTYPES.HcomGetWorldRankFromGroupRank(c_group, c_group_rank_id, ctypes.byref(c_world_rank_id)) if ret != 0: raise RuntimeError('Get world rank from group rank error.') @@ -247,7 +247,7 @@ def get_group_rank_from_world_rank(world_rank_id, group): c_group = c_str(group) c_world_rank_id = ctypes.c_uint(world_rank_id) c_group_rank_id = ctypes.c_uint() - ret = HCCL_LIB_CTYPES.hcom_get_group_rank_from_world_rank(c_world_rank_id, c_group, ctypes.byref(c_group_rank_id)) + ret = HCCL_LIB_CTYPES.HcomGetGroupRankFromWorldRank(c_world_rank_id, c_group, ctypes.byref(c_group_rank_id)) if ret != 0: raise RuntimeError('Get group rank from world rank error.') diff --git a/tests/st/model_zoo_tests/transformer/test_transformer.py b/tests/st/model_zoo_tests/transformer/test_transformer.py index 11001f92e38..327d58fdd42 100644 --- a/tests/st/model_zoo_tests/transformer/test_transformer.py +++ b/tests/st/model_zoo_tests/transformer/test_transformer.py @@ -164,10 +164,11 @@ def test_transformer(): # assertion occurs while the loss value, overflow state or loss_scale value is wrong loss_value = np.array(callback.loss_list) - assert np.allclose(loss_value[0], 11.241604, 0, 0.000005) + assert np.allclose(loss_value[0], 11.241624, 0, 0.000005) + + expect_loss_value = [11.241624, 11.243232, 11.217465, 11.204196, 11.2138195, + 11.215386, 11.19053, 11.150403, 11.191858, 11.160057] - expect_loss_value = [11.241604, 11.243231, 11.217458, 11.204156, 11.213805, - 11.215374, 11.19065, 11.150393, 11.191824, 11.160044] print("loss value: {}".format(loss_value)) assert np.allclose(loss_value[0:10], expect_loss_value, 0, 0.0005) diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt index ec17dbd47e5..a020330c16a 100644 --- a/tests/ut/cpp/CMakeLists.txt +++ b/tests/ut/cpp/CMakeLists.txt @@ -85,8 +85,6 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} # dont remove the 4 lines above "../../../mindspore/ccsrc/debug/data_dump/dump_json_parser.cc" "../../../mindspore/ccsrc/debug/common.cc" - "../../../mindspore/ccsrc/runtime/device/ascend/profiling/profiling_manager.cc" - "../../../mindspore/ccsrc/runtime/device/ascend/profiling/profiling_engine_impl.cc" "../../../mindspore/ccsrc/runtime/device/kernel_runtime.cc" "../../../mindspore/ccsrc/runtime/device/memory_manager.cc" "../../../mindspore/ccsrc/runtime/device/kernel_runtime_manager.cc" diff --git a/tests/ut/cpp/device/ascend_profiling_test.cc b/tests/ut/cpp/device/ascend_profiling_test.cc deleted file mode 100644 index 858f99c429e..00000000000 --- a/tests/ut/cpp/device/ascend_profiling_test.cc +++ /dev/null @@ -1,124 +0,0 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include -#include - -#include "./prof_reporter.h" -#include "common/common_test.h" -#include "runtime/device/ascend/profiling/profiling_manager.h" -#define private public -#include "runtime/device/ascend/profiling/plugin_impl.h" -#undef private -#include "runtime/device/ascend/profiling/profiling_engine_impl.h" - -namespace mindspore { -namespace device { -namespace ascend { -class stubReporter : public Reporter { - public: - stubReporter() = default; - ~stubReporter() = default; - - int Report(const Msprof::Engine::ReporterData *data) override; - int Flush() override; -}; - -int stubReporter::Report(const Msprof::Engine::ReporterData *data) { return 0; } - -int stubReporter::Flush() { return 0; } - -class TestAscendProfiling : public UT::Common { - public: - TestAscendProfiling() {} -}; - -TEST_F(TestAscendProfiling, test_profiling_GetJobId) { - auto job_id = ProfilingManager::GetInstance().GetJobId(); - printf("get job_id:%ld\n", job_id); -} - -int test_profiling_start() { - (void)setenv("PROFILING_MODE", "true", 1); - (void)setenv("PROFILING_OPTIONS", "training_trace:task_trace", 1); - auto ret = ProfilingManager::GetInstance().StartupProfiling(0); - (void)unsetenv("PROFILING_MODE"); - (void)unsetenv("PROFILING_OPTIONS"); - return ret; -} - -TEST_F(TestAscendProfiling, test_profiling_start) { - auto ret = test_profiling_start(); - ASSERT_EQ(ret, true); -} - -int test_profiling_stop() { - (void)setenv("PROFILING_MODE", "true", 1); - auto engine = std::make_shared(); - auto report = std::make_shared(); - auto plug = engine->CreatePlugin(); - plug->Init(report.get()); - auto ret = ProfilingManager::GetInstance().StopProfiling(); - plug->UnInit(); - engine->ReleasePlugin(plug); - (void)unsetenv("PROFILING_OPTIONS"); - return ret; -} - -TEST_F(TestAscendProfiling, test_profiling_stop) { - auto ret = test_profiling_stop(); - ASSERT_EQ(ret, true); -} - -int test_profiling_rpt() { - (void)setenv("PROFILING_MODE", "true", 1); - std::map op_taskId_map; - op_taskId_map[1] = "add"; - op_taskId_map[2] = "mul"; - auto engine = std::make_shared(); - auto report = std::make_shared(); - auto plug = engine->CreatePlugin(); - plug->Init(report.get()); - ProfilingManager::GetInstance().ReportProfilingData(op_taskId_map); - plug->UnInit(); - engine->ReleasePlugin(plug); - (void)unsetenv("PROFILING_OPTIONS"); - return 0; -} - -TEST_F(TestAscendProfiling, test_profiling_rpt) { - auto ret = test_profiling_rpt(); - ASSERT_EQ(ret, false); -} - -int test_profiling_rpt_abnormal() { - std::map op_taskId_map; - ProfilingManager::GetInstance().ReportProfilingData(op_taskId_map); - (void)setenv("PROFILING_MODE", "true", 1); - ProfilingManager::GetInstance().ReportProfilingData(op_taskId_map); - op_taskId_map[1] = "add"; - op_taskId_map[2] = "mul"; - ProfilingManager::GetInstance().ReportProfilingData(op_taskId_map); - (void)unsetenv("PROFILING_OPTIONS"); - return 0; -} - -TEST_F(TestAscendProfiling, test_profiling_rpt_abnormal) { - auto ret = test_profiling_rpt_abnormal(); - ASSERT_EQ(ret, false); -} -} // namespace ascend -} // namespace device -} // namespace mindspore diff --git a/tests/ut/cpp/stub/hccl/hccl_stub.cc b/tests/ut/cpp/stub/hccl/hccl_stub.cc index 9601a1fdb9c..9778acc09ff 100644 --- a/tests/ut/cpp/stub/hccl/hccl_stub.cc +++ b/tests/ut/cpp/stub/hccl/hccl_stub.cc @@ -63,32 +63,32 @@ HcclResult hcom_reduce_scatter(const char *tag, void *inputPtr, void *outputPtr, } /* 获取group内的rank个数 */ -HcclResult hcom_get_rank_size(const char *group, u32 *rankSize) { return HCCL_SUCCESS; } +HcclResult HcomGetRankSize(const char *group, u32 *rankSize) { return HCCL_SUCCESS; } /* python获取上云场景内的rank个数 */ HcclResult hcom_python_get_rank_size(u32 *rankSize) { return HCCL_SUCCESS; } /* 获取本rank的id */ -HcclResult hcom_get_rank_id(const char *group, u32 *rankId) { return HCCL_SUCCESS; } +HcclResult HcomGetRankId(const char *group, u32 *rankId) { return HCCL_SUCCESS; } /* 获取本rank的id */ HcclResult hcom_python_get_rank_id(u32 *rankId) { return HCCL_SUCCESS; } /* 获取本rank的id */ -HcclResult hcom_get_world_rank_from_group_rank(const char *group, u32 groupRank, u32 *worldRank) { +HcclResult HcomGetWorldRankFromGroupRank(const char *group, u32 groupRank, u32 *worldRank) { return HCCL_SUCCESS; } /* 获取通信域的rank个数 */ -HcclResult hcom_get_group_rank_from_world_rank(u32 worldRank, const char *group, u32 *groupRank) { +HcclResult HcomGetGroupRankFromWorldRank(u32 worldRank, const char *group, u32 *groupRank) { return HCCL_SUCCESS; } /* 创建group */ -HcclResult hcom_create_group(const char *group, u32 rankNum, u32 *rankIds) { return HCCL_SUCCESS; } +HcclResult HcomCreateGroup(const char *group, u32 rankNum, u32 *rankIds) { return HCCL_SUCCESS; } /* 销毁group */ -HcclResult hcom_destroy_group(const char *group) { return HCCL_SUCCESS; } +HcclResult HcomDestroyGroup(const char *group) { return HCCL_SUCCESS; } /* 发送消息 */ HcclResult hcom_send(const char *tag, void *inputPtr, u64 count, HcclDataType dataType, u32 destRank, u32 srTag, diff --git a/tests/ut/cpp/stub/profiling/profiling_stub.cc b/tests/ut/cpp/stub/profiling/profiling_stub.cc index ac09c524f4e..86df5facde5 100644 --- a/tests/ut/cpp/stub/profiling/profiling_stub.cc +++ b/tests/ut/cpp/stub/profiling/profiling_stub.cc @@ -15,7 +15,6 @@ */ #include #include "prof_mgr_core.h" -#include namespace Msprof { namespace Engine { @@ -51,3 +50,7 @@ void* ProfMgrStartUp(const ProfMgrCfg* cfg) { return const_cast(reinterpr * PROFILING_FAILED -1 (failed) */ int ProfMgrStop(void* handle) { return 0; } + +namespace Analysis::Dvvp::ProfilerCommon { +uint32_t MsprofilerInit() { return 0; } +} diff --git a/tests/ut/cpp/stub/runtime/runtime_stub.cc b/tests/ut/cpp/stub/runtime/runtime_stub.cc index 19c6e1a8bcd..73ba1aed6be 100644 --- a/tests/ut/cpp/stub/runtime/runtime_stub.cc +++ b/tests/ut/cpp/stub/runtime/runtime_stub.cc @@ -151,4 +151,12 @@ int AdxDataDumpServerUnInit() { return 0; } RTS_API rtError_t rtGetTaskIdAndStreamID(uint32_t *taskid, uint32_t *streamid) { return RT_ERROR_NONE; } -RTS_API rtError_t rtSetTaskFailCallback(rtTaskFailCallback callback) {return RT_ERROR_NONE; } \ No newline at end of file +RTS_API rtError_t rtSetTaskFailCallback(rtTaskFailCallback callback) {return RT_ERROR_NONE; } + +RTS_API rtError_t rtRegDeviceStateCallback(const char *regName, rtDeviceStateCallback callback) {return RT_ERROR_NONE; } + +RTS_API rtError_t rtSetMsprofReporterCallback(MsprofReporterCallback callback) {return RT_ERROR_NONE; } + +RTS_API rtError_t rtRegTaskFailCallbackByModule(const char *moduleName, rtTaskFailCallbackByModule callback) { + return RT_ERROR_NONE; +}