From d532af3a9a967abde3179a8c6a92331c00310ce8 Mon Sep 17 00:00:00 2001 From: zhoufeng Date: Mon, 9 Nov 2020 15:44:55 +0800 Subject: [PATCH] new hccl interface Signed-off-by: zhoufeng --- cmake/dependency_graphengine.cmake | 13 +- cmake/package.cmake | 2 +- graphengine | 2 +- mindspore/ccsrc/CMakeLists.txt | 45 +++-- .../kernel_compiler/hccl/hccl_kernel.cc | 45 +++-- .../hccl/hcom_all_broadcast.cc | 22 +-- .../kernel_compiler/hccl/hcom_all_gather.cc | 22 +-- .../kernel_compiler/hccl/hcom_all_reduce.cc | 22 +-- .../hccl/hcom_all_reduce_scatter.cc | 21 +-- .../device/ascend/ascend_kernel_runtime.cc | 17 +- .../device/ascend/tasksink/runtime_utils.cc | 106 ----------- .../device/ascend/tasksink/runtime_utils.h | 39 ----- .../ccsrc/runtime/hccl_adapter/CMakeLists.txt | 8 + .../ccsrc/runtime/hccl_adapter/converter.cc | 129 ++++++++++++++ .../ccsrc/runtime/hccl_adapter/converter.h | 38 ++++ .../runtime/hccl_adapter/hccl_adapter.cc | 165 ++++++++++++++++++ .../ccsrc/runtime/hccl_adapter/hccl_adapter.h | 43 +++++ .../runtime/hccl_adapter/hcom_graph_adaptor.h | 32 ++++ mindspore/core/utils/log_adapter.cc | 3 +- mindspore/core/utils/log_adapter.h | 1 + tests/ut/cpp/stub/ge/ge_task_launch_stub.cc | 21 +-- 21 files changed, 518 insertions(+), 278 deletions(-) delete mode 100644 mindspore/ccsrc/runtime/device/ascend/tasksink/runtime_utils.cc delete mode 100644 mindspore/ccsrc/runtime/device/ascend/tasksink/runtime_utils.h create mode 100644 mindspore/ccsrc/runtime/hccl_adapter/CMakeLists.txt create mode 100644 mindspore/ccsrc/runtime/hccl_adapter/converter.cc create mode 100644 mindspore/ccsrc/runtime/hccl_adapter/converter.h create mode 100644 mindspore/ccsrc/runtime/hccl_adapter/hccl_adapter.cc create mode 100644 mindspore/ccsrc/runtime/hccl_adapter/hccl_adapter.h create mode 100644 mindspore/ccsrc/runtime/hccl_adapter/hcom_graph_adaptor.h diff --git a/cmake/dependency_graphengine.cmake b/cmake/dependency_graphengine.cmake index b6912d869c4..1f8c13fc89b 100644 --- a/cmake/dependency_graphengine.cmake +++ b/cmake/dependency_graphengine.cmake @@ -6,10 +6,15 @@ include(${GE_SOURCE_DIR}/cmake/ge_utils.cmake) include(${GE_SOURCE_DIR}/cmake/external_libs/json.cmake) include(${GE_SOURCE_DIR}/cmake/external_libs/eigen.cmake) include(${GE_SOURCE_DIR}/cmake/external_libs/gtest.cmake) -include(${GE_SOURCE_DIR}/cmake/external_libs/protobuf.cmake) include(${GE_SOURCE_DIR}/cmake/external_libs/onnx.cmake) include(${GE_SOURCE_DIR}/cmake/external_libs/securec.cmake) - +if (ENABLE_D) + set(AS_MS_COMP TRUE) + include(${GE_SOURCE_DIR}/cmake/external_libs/protobuf.cmake) + unset(AS_MS_COMP) +else () + include(${GE_SOURCE_DIR}/cmake/external_libs/protobuf.cmake) +endif () # for UT, find slog and error_manager from local prebuild if (NOT ENABLE_D AND NOT ENABLE_ACL) set(GE_PREBUILD_PATH ${GE_SOURCE_DIR}/third_party/prebuild/${CMAKE_HOST_SYSTEM_PROCESSOR}) @@ -79,8 +84,12 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__FILE__='\"$(subst $(realpath ${CMAKE add_subdirectory(${GE_SOURCE_DIR}/src/common/graph) if (ENABLE_ACL OR ENABLE_D) add_subdirectory(${GE_SOURCE_DIR}/src/ge/common) + target_compile_definitions(graph PRIVATE google=ascend_private) + set_target_properties(graph PROPERTIES SKIP_BUILD_RPATH TRUE) if (ENABLE_D) add_subdirectory(${GE_SOURCE_DIR}/src/ge/ge_runtime) + target_compile_definitions(ge_runtime PRIVATE google=ascend_private) + set_target_properties(ge_runtime PROPERTIES SKIP_BUILD_RPATH TRUE) endif () endif () diff --git a/cmake/package.cmake b/cmake/package.cmake index 988f8089f22..5da32b81e17 100644 --- 
a/cmake/package.cmake +++ b/cmake/package.cmake @@ -216,7 +216,7 @@ if (NOT ENABLE_GE) if (ENABLE_D) install( - TARGETS ms_profile + TARGETS ms_profile hccl_adapter DESTINATION ${INSTALL_LIB_DIR} COMPONENT mindspore ) diff --git a/graphengine b/graphengine index 412ebe82c96..383f7f751d6 160000 --- a/graphengine +++ b/graphengine @@ -1 +1 @@ -Subproject commit 412ebe82c96620b5f7c942a7ab87a45bf14c5621 +Subproject commit 383f7f751d6612e9dbde9e22a2960098fdbf3792 diff --git a/mindspore/ccsrc/CMakeLists.txt b/mindspore/ccsrc/CMakeLists.txt index e46e47e633a..66996e97fd2 100644 --- a/mindspore/ccsrc/CMakeLists.txt +++ b/mindspore/ccsrc/CMakeLists.txt @@ -174,7 +174,7 @@ foreach (_comp ${SUB_COMP}) string(REPLACE "/" "_" sub ${_comp}) if (TARGET _mindspore_${sub}_obj) list(APPEND SUB_OBJECTS_SRC $) - add_dependencies(_mindspore_${sub}_obj proto_input ) + add_dependencies(_mindspore_${sub}_obj proto_input) endif () endforeach () @@ -229,28 +229,26 @@ if (ENABLE_D) endif() MESSAGE("USE DAV LIB PATH: ${ASCEND_PATH}") - find_library(HCCL hccl ${ASCEND_RUNTIME_PATH}) - find_library(CCE_LIB cce ${ASCEND_RUNTIME_PATH}) - find_library(RUNTIME_LIB runtime ${ASCEND_RUNTIME_PATH}) - find_library(TSDCLIENT tsdclient HINTS ${ASCEND_RUNTIME_PATH} ${ASCEND_DRIVER_BACK_PATH}) - find_library(DATATRANSFER datatransfer HINTS ${ASCEND_RUNTIME_PATH} ${ASCEND_DRIVER_BACK_PATH}) - find_library(PROFILING msprofiler ${ASCEND_RUNTIME_PATH}) - # for atlas env - find_library(HCCL hccl ${ASCEND_TOOLKIT_RUNTIME_PATH}) - find_library(CCE_LIB cce ${ASCEND_TOOLKIT_RUNTIME_PATH}) - find_library(RUNTIME_LIB runtime ${ASCEND_TOOLKIT_RUNTIME_PATH}) - find_library(TSDCLIENT tsdclient HINTS ${ASCEND_TOOLKIT_RUNTIME_PATH} ${ASCEND_DRIVER_BACK_PATH}) - find_library(DATATRANSFER datatransfer HINTS ${ASCEND_TOOLKIT_RUNTIME_PATH} ${ASCEND_DRIVER_BACK_PATH}) - find_library(PROFILING msprofiler ${ASCEND_TOOLKIT_RUNTIME_PATH}) - + find_library(HCCL hccl ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) + find_library(CCE_LIB cce ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) + find_library(RUNTIME_LIB runtime ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) + find_library(TSDCLIENT tsdclient HINTS ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH} ${ASCEND_DRIVER_BACK_PATH}) + find_library(DATATRANSFER datatransfer HINTS ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH} ${ASCEND_DRIVER_BACK_PATH}) + find_library(PROFILING msprofiler ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) find_library(PROFILING_SHARED msprof ${ASCEND_DRIVER_PATH}) - find_library(REGISTER register ${ASCEND_RUNTIME_PATH}) + find_library(REGISTER register ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) find_library(OPTILING optiling ${ASCEND_OPP_PATH}) add_library(ms_profile SHARED ${PROFILING}) set_target_properties(ms_profile PROPERTIES LINKER_LANGUAGE CXX) target_link_libraries(ms_profile -Wl,--start-group ${PROFILING_SHARED} ${PROFILING} mindspore::protobuf -Wl,--end-group) - target_link_libraries(mindspore ms_profile ge_runtime ${CCE_LIB} ${RUNTIME_LIB} ${TSDCLIENT} ${HCCL} ${DATATRANSFER} ${REGISTER} -Wl,--no-as-needed ${OPTILING} -Wl,--as-needed) + target_link_libraries(mindspore ms_profile ge_runtime ${CCE_LIB} ${RUNTIME_LIB} ${TSDCLIENT} ${HCCL} ${DATATRANSFER} + ${REGISTER} -Wl,--no-as-needed ${OPTILING} -Wl,--as-needed) target_link_libraries(mindspore -Wl,--start-group proto_input mindspore::protobuf -Wl,--end-group) + # hccl_adpter + find_library(HCCL_ADPTER hcom_graph_adaptor ${ASCEND_RUNTIME_PATH} 
${ASCEND_TOOLKIT_RUNTIME_PATH}) + find_library(HCCL_BUILDER hcom_opskernel_builder ${ASCEND_RUNTIME_PATH}/plugin/opskernel ${ASCEND_TOOLKIT_RUNTIME_PATH}/plugin/opskernel) + add_subdirectory(runtime/hccl_adapter) + target_link_libraries(hccl_adapter PRIVATE mindspore ${HCCL_ADPTER} ${REGISTER} -Wl,--no-as-needed ${HCCL_BUILDER}) elseif (CMAKE_SYSTEM_NAME MATCHES "Windows") target_link_libraries(mindspore -Wl,--start-group proto_input mindspore::protobuf mindspore::sentencepiece -Wl,--end-group) else () @@ -274,11 +272,14 @@ elseif (CMAKE_SYSTEM_NAME MATCHES "Windows") else () MESSAGE(FATAL_ERROR "other platform: ${CMAKE_SYSTEM_NAME}") endif () -set(MINDSPORE_RPATH ${ORIGIN_PATH}/lib) + if (ENABLE_D) set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/nnae/latest/fwkacllib/lib64) set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64) set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/fwkacllib/lib64) + set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel) + set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel) + set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel) set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/add-ons) set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/opp/op_impl/built-in/ai_core/tbe/op_tiling) set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/op_tiling) @@ -286,9 +287,16 @@ if (ENABLE_D) elseif (ENABLE_GPU) set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/cuda/lib64) endif () +set(HCCL_ADPT_RPATH ${ORIGIN_PATH}:${MINDSPORE_RPATH}) +set(MINDSPORE_RPATH ${ORIGIN_PATH}/lib:${MINDSPORE_RPATH}) set_target_properties(_c_expression PROPERTIES INSTALL_RPATH ${MINDSPORE_RPATH}) +if (ENABLE_D) + set_target_properties(hccl_adapter PROPERTIES INSTALL_RPATH ${HCCL_ADPT_RPATH}) + target_link_libraries(_c_expression PRIVATE hccl_adapter) +endif () + if (CMAKE_SYSTEM_NAME MATCHES "Windows") target_link_libraries(mindspore mindspore::pybind11_module) target_link_libraries(mindspore mindspore_gvar) @@ -352,6 +360,7 @@ if (ENABLE_D) find_library(adump_server libadump_server.a ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) target_link_libraries(_c_expression PRIVATE ${adump_server}) target_link_libraries(inference PRIVATE ${adump_server}) + target_link_libraries(inference PRIVATE mindspore_core hccl_adapter) endif() if (ENABLE_CPU) diff --git a/mindspore/ccsrc/backend/kernel_compiler/hccl/hccl_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/hccl/hccl_kernel.cc index c4b108b0212..9ad10704f0f 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/hccl/hccl_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/hccl/hccl_kernel.cc @@ -17,16 +17,15 @@ #include "backend/kernel_compiler/hccl/hccl_kernel.h" #include -#include "runtime/device/ascend/tasksink/runtime_utils.h" #include "backend/session/anf_runtime_algorithm.h" #include "utils/utils.h" #include "utils/ms_context.h" #include "runtime/device/kernel_runtime.h" #include "runtime/device/ascend/executor/hccl_dynamic_kernel.h" +#include "runtime/hccl_adapter/hccl_adapter.h" using HcclTaskInfoPtr = std::shared_ptr; using ge::model_runner::HcclTaskInfo; -using mindspore::device::ascend::tasksink::RuntimeUtils; namespace { static std::map kMsOpNameToHcomHcclType = { @@ -145,35 +144,45 @@ const std::vector &HcclKernel::GetOutputSizeList() const { const std::vector 
&HcclKernel::GetWorkspaceSizeList() const { return workspace_size_list_; } -std::vector HcclKernel::GenTask(const std::vector &inputs, - const std::vector &workspace, +std::vector HcclKernel::GenTask(const std::vector &inputs, const std::vector &, const std::vector &outputs, uint32_t stream_id) { if (inputs.empty() || outputs.empty()) { MS_LOG(EXCEPTION) << "Inputs or outputs is empty"; } stream_id_ = stream_id; - std::string hccl_type = AnfAlgo::GetCNodeName(anf_node_); MS_EXCEPTION_IF_NULL(inputs.at(0)); auto input_data_addr = inputs.at(0)->addr; MS_EXCEPTION_IF_NULL(outputs.at(0)); auto output_data_addr = outputs.at(0)->addr; - void *workspace_address = nullptr; - const int64_t workspace_num = 0; std::vector private_def; HcclDataType data_type = hccl_data_type_list_[0]; - MS_LOG(INFO) << "HCCL Task : stream_id=" << stream_id << ", ws_num=" << workspace_num << ", count=" << hccl_count_ - << ", root_id=" << root_id_ << ", op_type=" << static_cast(op_type_) - << ", data_type=" << static_cast(data_type); + std::vector task_info; + bool ret = hccl::GenTask(anf_node_, data_type, &task_info); + if (!ret) { + MS_LOG(EXCEPTION) << "Gen Task for " << anf_node_->DebugString() << " failed."; + } - auto context_ptr = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(context_ptr); - HcclTaskInfoPtr task_info_ptr = std::make_shared( - kernel_name_, stream_id, hccl_type, input_data_addr, output_data_addr, workspace_address, workspace_num, 0, - private_def, nullptr, hccl_count_, root_id_, op_type_, data_type, group_, RuntimeUtils::HcomBindModel, - RuntimeUtils::HcomUnbindModel, RuntimeUtils::HcomDistribute, NeedDump()); - MS_EXCEPTION_IF_NULL(task_info_ptr); - return {task_info_ptr}; + std::vector results; + for (auto &task : task_info) { + MS_LOG(INFO) << "HCCL Task : stream_id=" << stream_id << ", count=" << hccl_count_ << ", root_id=" << root_id_ + << ", op_type=" << static_cast(op_type_) << ", data_type=" << static_cast(data_type) + << ", workspace_size=" << task.workspace_size << ", stream_num=" << task.stream_num + << ", private_def_size=" << task.private_def.size(); + + private_def.resize(task.private_def.size()); + auto sec_ret = memcpy_s(private_def.data(), private_def.size(), task.private_def.data(), task.private_def.size()); + if (sec_ret != 0) { + MS_LOG(EXCEPTION) << "Set data memcpy_s failed, ret = " << sec_ret; + } + + results.emplace_back(std::make_shared( + kernel_name_, stream_id, hccl::GetHcclType(anf_node_), input_data_addr, output_data_addr, task.workspace_size, + task.stream_num, private_def, hccl::GetHcclOpsKernelInfoStore(), hccl_count_, root_id_, op_type_, data_type, + group_, NeedDump())); + } + + return results; } device::DynamicKernelPtr HcclKernel::GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) { diff --git a/mindspore/ccsrc/backend/kernel_compiler/hccl/hcom_all_broadcast.cc b/mindspore/ccsrc/backend/kernel_compiler/hccl/hcom_all_broadcast.cc index 3fff96d1b29..1fb30c51901 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/hccl/hcom_all_broadcast.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/hccl/hcom_all_broadcast.cc @@ -20,26 +20,10 @@ namespace mindspore { namespace kernel { -bool HcomAllBroadCastKernel::Launch(const std::vector &inputs, +bool HcomAllBroadCastKernel::Launch(const std::vector & /*inputs*/, const std::vector & /*workspace*/, - const std::vector & /*outputs*/, void *stream_ptr) { - auto context_ptr = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(context_ptr); - if (context_ptr->get_param(MS_CTX_ENABLE_TASK_SINK)) { - return true; - } - 
if (inputs.empty() || hccl_data_type_list_.empty()) { - MS_LOG(ERROR) << "BroadCast param is empty"; - return false; - } - const char *tag = "Hccl-BroadCast"; - MS_EXCEPTION_IF_NULL(inputs[0]); - HcclResult ret = - hcom_broadcast(tag, inputs[0]->addr, hccl_count_, hccl_data_type_list_[0], root_id_, nullptr, stream_ptr); - if (ret != HCCL_SUCCESS) { - MS_LOG(ERROR) << "HcomBroadcastOp : hcom_broadcast fail, return: " << static_cast(ret); - return false; - } + const std::vector & /*outputs*/, void * /*stream_ptr*/) { + MS_LOG(INFO) << "HcomAllBroadCast launch"; return true; } } // namespace kernel diff --git a/mindspore/ccsrc/backend/kernel_compiler/hccl/hcom_all_gather.cc b/mindspore/ccsrc/backend/kernel_compiler/hccl/hcom_all_gather.cc index db8d2edf739..5bc3be9da50 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/hccl/hcom_all_gather.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/hccl/hcom_all_gather.cc @@ -20,24 +20,10 @@ namespace mindspore { namespace kernel { -bool HcomAllGatherKernel::Launch(const std::vector &inputs, const std::vector & /*workspace*/, - const std::vector &outputs, void *stream_ptr) { - auto context_ptr = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(context_ptr); - if (context_ptr->get_param(MS_CTX_ENABLE_TASK_SINK)) { - return true; - } - if (inputs.empty() || hccl_data_type_list_.empty()) { - MS_LOG(ERROR) << "AllGather param is empty"; - return false; - } - const char *tag = "Hccl-AllGather"; - HcclResult ret = - hcom_all_gather(tag, inputs[0]->addr, outputs[0]->addr, hccl_count_, hccl_data_type_list_[0], nullptr, stream_ptr); - if (ret != HCCL_SUCCESS) { - MS_LOG(ERROR) << "HcomAllGatherKernelOp : hcom_all_gather fail, return: " << static_cast(ret); - return false; - } +bool HcomAllGatherKernel::Launch(const std::vector & /*inputs*/, + const std::vector & /*workspace*/, + const std::vector & /*outputs*/, void * /*stream_ptr*/) { + MS_LOG(INFO) << "HcomAllGather launch"; return true; } } // namespace kernel diff --git a/mindspore/ccsrc/backend/kernel_compiler/hccl/hcom_all_reduce.cc b/mindspore/ccsrc/backend/kernel_compiler/hccl/hcom_all_reduce.cc index 62a4868d33d..7a1dae02d10 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/hccl/hcom_all_reduce.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/hccl/hcom_all_reduce.cc @@ -20,24 +20,10 @@ namespace mindspore { namespace kernel { -bool HcomAllReduceKernel::Launch(const std::vector &inputs, const std::vector & /*workspace*/, - const std::vector &outputs, void *stream_ptr) { - auto context_ptr = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(context_ptr); - if (context_ptr->get_param(MS_CTX_ENABLE_TASK_SINK)) { - return true; - } - if (inputs.empty() || outputs.empty() || hccl_data_type_list_.empty()) { - MS_LOG(ERROR) << "AllReduce param is empty"; - return false; - } - const char *tag = "Hccl-AllReduce"; - HcclResult ret = hcom_all_reduce(tag, inputs[0]->addr, outputs[0]->addr, hccl_count_, hccl_data_type_list_[0], - op_type_, nullptr, stream_ptr); - if (ret != HCCL_SUCCESS) { - MS_LOG(ERROR) << "HcomAllReduceKernelOp : hcom_all_reduce fail, return: " << static_cast(ret); - return false; - } +bool HcomAllReduceKernel::Launch(const std::vector & /*inputs*/, + const std::vector & /*workspace*/, + const std::vector & /*outputs*/, void * /*stream_ptr*/) { + MS_LOG(INFO) << "HcomAllReduce launch"; return true; } } // namespace kernel diff --git a/mindspore/ccsrc/backend/kernel_compiler/hccl/hcom_all_reduce_scatter.cc b/mindspore/ccsrc/backend/kernel_compiler/hccl/hcom_all_reduce_scatter.cc index 
08a2415eaf7..a72db696f1d 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/hccl/hcom_all_reduce_scatter.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/hccl/hcom_all_reduce_scatter.cc @@ -20,25 +20,10 @@ namespace mindspore { namespace kernel { -bool HcomAllReduceScatterKernel::Launch(const std::vector &inputs, +bool HcomAllReduceScatterKernel::Launch(const std::vector & /*inputs*/, const std::vector & /*workspace*/, - const std::vector &outputs, void *stream_ptr) { - auto context_ptr = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(context_ptr); - if (context_ptr->get_param(MS_CTX_ENABLE_TASK_SINK)) { - return true; - } - if (inputs.empty() || outputs.empty() || hccl_data_type_list_.empty()) { - MS_LOG(ERROR) << "ReduceScatter param is empty"; - return false; - } - const char *tag = "Hccl-ReduceScatter"; - HcclResult ret = hcom_reduce_scatter(tag, inputs[0]->addr, outputs[0]->addr, hccl_count_, hccl_data_type_list_[0], - op_type_, nullptr, stream_ptr); - if (ret != HCCL_SUCCESS) { - MS_LOG(ERROR) << "HcomReduceScatterOp : hcom_reduce_scatter fail, return: " << static_cast(ret); - return false; - } + const std::vector & /*outputs*/, void * /*stream_ptr*/) { + MS_LOG(INFO) << "HcomAllReduceScatter launch"; return true; } } // namespace kernel diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc index 42f4381d679..5fef61b3c02 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc @@ -55,6 +55,7 @@ #include "profiler/device/ascend/rt_callback_manager.h" #include "utils/config_manager.h" #include "runtime/device/ascend/profiling/reporter/op_name_task_stream_reporter.h" +#include "runtime/hccl_adapter/hccl_adapter.h" using ge::model_runner::ModelRunner; using mindspore::device::ascend::ProfilingManager; @@ -796,10 +797,10 @@ bool AscendKernelRuntime::HcclInit() { return false; } MS_LOG(INFO) << "MINDSPORE_HCCL_CONFIG_PATH : " << full_path << ", RANK_ID: " << rank_id_str; - HcclResult res = hcom_init(full_path, rank_id_str.c_str()); + bool ret = hccl::InitHccl(context_ptr->get_param(MS_CTX_DEVICE_ID), rank_id_str, full_path); free(full_path); - if (res != HCCL_SUCCESS) { - MS_LOG(ERROR) << "Hcom init failed, res is " << static_cast(res); + if (!ret) { + MS_LOG(ERROR) << "Hcom init failed."; return false; } return true; @@ -816,12 +817,14 @@ bool AscendKernelRuntime::DestroyHccl() { if (!HcclExecutorManager::GetInstance().Finalize()) { MS_LOG(ERROR) << "Dynamic Shape Hccl Finalize Failed"; } - HcclResult res = hcom_destroy(); - if (res != HCCL_SUCCESS) { + + bool res = hccl::FinalizeHccl(); + if (!res) { MS_LOG(ERROR) << "Hccl destroy failed"; return false; } - MS_LOG(INFO) << "Hccl destroy successful, status = " << res << "."; + + MS_LOG(INFO) << "Hccl destroy successful."; context_ptr->set_param(MS_CTX_ENABLE_HCCL, false); return true; } @@ -855,7 +858,7 @@ void AscendKernelRuntime::KernelLaunchProfiling(const std::string &kernel_name) auto try_emplace_ret = stream_id_task_id_op_name_map_.try_emplace(stream_task_pair, kernel_name); if (!try_emplace_ret.second) { MS_LOG(WARNING) << "Profiling duplicate key, task_id:" << stream_task_pair.second - << " stream_id:" << stream_task_pair.first << " name:" << kernel_name; + << " stream_id:" << stream_task_pair.first << " name:" << kernel_name; } if (stream_id_task_id_op_name_map_.size() > kProfilingMaxTaskIdInStream) { MS_LOG(EXCEPTION) << "Too many profiling data"; diff 
--git a/mindspore/ccsrc/runtime/device/ascend/tasksink/runtime_utils.cc b/mindspore/ccsrc/runtime/device/ascend/tasksink/runtime_utils.cc deleted file mode 100644 index 4b8c97689ff..00000000000 --- a/mindspore/ccsrc/runtime/device/ascend/tasksink/runtime_utils.cc +++ /dev/null @@ -1,106 +0,0 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "runtime/device/ascend/tasksink/runtime_utils.h" - -#include - -#include "hccl/hcom.h" -#include "utils/log_adapter.h" -#include "hccl/hccl_types.h" -#include "utils/utils.h" - -constexpr auto kHcomBroadcast = "hcom_broadcast_"; -constexpr auto kHcomAllGather = "hcom_all_gather_"; -constexpr auto kHcomAllReduce = "hcom_all_reduce_"; -constexpr auto kHcomReduceScatter = "hcom_reduce_scatter_"; -constexpr auto kUnderline = "_"; -namespace mindspore { -namespace device { -namespace ascend { -namespace tasksink { -bool RuntimeUtils::HcomBindModel(rtModel_t model, rtStream_t stream) { - HcclResult ret = hcom_bind_model(model, stream); - if (ret != HCCL_SUCCESS) { - MS_LOG(ERROR) << "Call hcom_bind_model failed, ret: 0x" << static_cast(ret); - return false; - } - return true; -} - -bool RuntimeUtils::HcomUnbindModel(rtModel_t model) { - HcclResult ret = hcom_unbind_model(model); - if (ret != HCCL_SUCCESS) { - MS_LOG(ERROR) << "Call hcom_unbind_model failed, ret: 0x" << static_cast(ret); - return false; - } - return true; -} - -bool RuntimeUtils::HcomDistribute(const std::shared_ptr &task_info, rtStream_t stream) { - MS_LOG(INFO) << "hccl distribute start"; - MS_EXCEPTION_IF_NULL(task_info); - HcclResult ret; - static uint32_t task_counter = 0; - auto hccl_group = task_info->group(); - if (task_info->hccl_type() == kBroadcastOpName) { - // call hcom broadcast interface to run op - const string tag_broadcast = kHcomBroadcast + std::to_string(task_counter++) + kUnderline + std::to_string(0); - ret = hcom_broadcast(tag_broadcast.c_str(), task_info->input_data_addr(), static_cast(task_info->count()), - static_cast(task_info->data_type()), static_cast(task_info->root_id()), - hccl_group.c_str(), stream); - if (ret != HCCL_SUCCESS) { - MS_LOG(ERROR) << "hcom_broadcast fail, return ret: " << static_cast(ret); - return false; - } - } else if (task_info->hccl_type() == kAllGatherOpName) { - // call hcom allgather interface to run op - const string tag_all_gather = kHcomAllGather + std::to_string(task_counter++) + kUnderline + std::to_string(0); - ret = hcom_all_gather(tag_all_gather.c_str(), task_info->input_data_addr(), task_info->output_data_addr(), - static_cast(task_info->count()), static_cast(task_info->data_type()), - hccl_group.c_str(), stream); - if (ret != HCCL_SUCCESS) { - MS_LOG(ERROR) << "hcom_all_gather fail, return ret: " << ret; - return false; - } - } else if (task_info->hccl_type() == kAllReduceOpName) { - // call hcom allreduce interface to run op - const string tag_all_reduce = kHcomAllReduce + std::to_string(task_counter++) + kUnderline + std::to_string(0); - ret = 
hcom_all_reduce(tag_all_reduce.c_str(), task_info->input_data_addr(), task_info->output_data_addr(), - static_cast(task_info->count()), static_cast(task_info->data_type()), - static_cast(task_info->op_type()), hccl_group.c_str(), stream); - if (ret != HCCL_SUCCESS) { - MS_LOG(ERROR) << "hcom_all_reduce fail, return ret: " << ret; - return false; - } - } else if (task_info->hccl_type() == kReduceScatterOpName) { - // call hcom reducescatter interface to run op - const string tag_reduce_scatter = - kHcomReduceScatter + std::to_string(task_counter++) + kUnderline + std::to_string(0); - ret = hcom_reduce_scatter(tag_reduce_scatter.c_str(), task_info->input_data_addr(), task_info->output_data_addr(), - static_cast(task_info->count()), static_cast(task_info->data_type()), - static_cast(task_info->op_type()), hccl_group.c_str(), stream); - if (ret != HCCL_SUCCESS) { - MS_LOG(ERROR) << "hcom_reduce_scatter fail, return ret: " << ret; - return false; - } - } - return true; -} -} // namespace tasksink -} // namespace ascend -} // namespace device -} // namespace mindspore diff --git a/mindspore/ccsrc/runtime/device/ascend/tasksink/runtime_utils.h b/mindspore/ccsrc/runtime/device/ascend/tasksink/runtime_utils.h deleted file mode 100644 index b353be681f5..00000000000 --- a/mindspore/ccsrc/runtime/device/ascend/tasksink/runtime_utils.h +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_TASKSINK_RUNTIME_UTILS_H_ -#define MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_TASKSINK_RUNTIME_UTILS_H_ - -#include -#include "runtime/rt.h" -#include "framework/ge_runtime/task_info.h" - -using ge::model_runner::HcclTaskInfo; - -namespace mindspore { -namespace device { -namespace ascend { -namespace tasksink { -class RuntimeUtils { - public: - static bool HcomBindModel(rtModel_t model, rtStream_t stream); - static bool HcomUnbindModel(rtModel_t model); - static bool HcomDistribute(const std::shared_ptr &task_info, rtStream_t stream); -}; -} // namespace tasksink -} // namespace ascend -} // namespace device -} // namespace mindspore -#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_TASKSINK_RUNTIME_UTILS_H_ diff --git a/mindspore/ccsrc/runtime/hccl_adapter/CMakeLists.txt b/mindspore/ccsrc/runtime/hccl_adapter/CMakeLists.txt new file mode 100644 index 00000000000..5b313f52a33 --- /dev/null +++ b/mindspore/ccsrc/runtime/hccl_adapter/CMakeLists.txt @@ -0,0 +1,8 @@ +file(GLOB_RECURSE HCCL_ADAPTER_SRC_LIST ./*.cc) +set_property(SOURCE ${HCCL_ADAPTER_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_HCCL_ADPT) +add_library(hccl_adapter SHARED ${HCCL_ADAPTER_SRC_LIST}) +target_include_directories(hccl_adapter PRIVATE ${CMAKE_BINARY_DIR}/proto/ge) +add_dependencies(hccl_adapter graph) +if (CMAKE_SYSTEM_NAME MATCHES "Linux") + target_link_options(hccl_adapter PRIVATE -Wl,-init,mindspore_log_init) +endif () \ No newline at end of file diff --git a/mindspore/ccsrc/runtime/hccl_adapter/converter.cc b/mindspore/ccsrc/runtime/hccl_adapter/converter.cc new file mode 100644 index 00000000000..9ecefcd5d54 --- /dev/null +++ b/mindspore/ccsrc/runtime/hccl_adapter/converter.cc @@ -0,0 +1,129 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "runtime/hccl_adapter/converter.h" +#include +#include +#include +#define google ascend_private +#include "register/ops_kernel_builder_registry.h" +#include "graph/compute_graph.h" +#include "graph/debug/ge_attr_define.h" +#undef google +#include "backend/session/anf_runtime_algorithm.h" +#include "utils/log_adapter.h" +#include "utils/ms_utils.h" +#include "mindspore/core/base/core_ops.h" +#include "transform/graph_ir/util.h" + +static constexpr char kGeOpNameHcclAllRudece[] = "HcomAllReduce"; +static constexpr char kGeOpNameHcclAllGather[] = "HcomAllGather"; +static constexpr char kGeOpNameHcclBroadcast[] = "HcomBroadcast"; +static constexpr char kGeOpNameHcclReduceScatter[] = "HcomReduceScatter"; +static constexpr char kGeNodeAttrUsedStreamNum[] = "used_stream_num"; +static constexpr char kStubDataStructureName[] = "any_name_can_work"; + +static ge::DataType ConvertHcclDTypeToGeDType(HcclDataType datatype) { + static map kHcomDataTypeMap = { + {HCCL_DATA_TYPE_FP32, ge::DT_FLOAT}, + {HCCL_DATA_TYPE_FP16, ge::DT_FLOAT16}, + {HCCL_DATA_TYPE_INT8, ge::DT_INT8}, + {HCCL_DATA_TYPE_INT32, ge::DT_INT32}, + }; + + auto iter = kHcomDataTypeMap.find(datatype); + if (iter == kHcomDataTypeMap.end()) { + MS_LOG(EXCEPTION) << "Unknown hccl data type " << datatype; + } + + return iter->second; +} + +namespace mindspore::hccl { +std::string GetGeNodeName(const CNodePtr &cnode) { + MS_EXCEPTION_IF_NULL(cnode); + if (IsPrimitiveCNode(cnode, prim::kPrimAllReduce)) { + return kGeOpNameHcclAllRudece; + } else if (IsPrimitiveCNode(cnode, prim::kPrimAllGather)) { + return kGeOpNameHcclAllGather; + } else if (IsPrimitiveCNode(cnode, prim::kPrimBroadcast)) { + return kGeOpNameHcclBroadcast; + } else if (IsPrimitiveCNode(cnode, prim::kPrimReduceScatter)) { + return kGeOpNameHcclReduceScatter; + } + + MS_LOG(EXCEPTION) << "Unknown hccl node type " << cnode->DebugString(); +} + +std::tuple GenerateStubGeNode(const AnfNodePtr &anf_node, HcclDataType datatype) { + MS_EXCEPTION_IF_NULL(anf_node); + auto cnode = anf_node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + std::string ge_node_name = GetGeNodeName(cnode); + + ge::OpDescPtr op_desc = std::make_shared(kStubDataStructureName, ge_node_name); + MS_EXCEPTION_IF_NULL(op_desc); + for (size_t i = 1; i < cnode->size(); ++i) { + auto &input = cnode->input(i); + std::vector ge_shape; + auto ms_shape = AnfAlgo::GetOutputInferShape(input, 0); + std::transform(ms_shape.begin(), ms_shape.end(), std::back_inserter(ge_shape), + [](size_t in) { return static_cast(in); }); + op_desc->AddInputDesc( + ge::GeTensorDesc(ge::GeShape(ge_shape), ge::Format::FORMAT_NCHW, + transform::TransformUtil::ConvertDataType(AnfAlgo::GetOutputInferDataType(input, 0)))); + } + + // set node data type + bool ret = ge::AttrUtils::SetDataType(*op_desc, ge::HCOM_ATTR_DATA_TYPE, ConvertHcclDTypeToGeDType(datatype)); + if (!ret) { + MS_LOG(EXCEPTION) << "Set attr " << ge::HCOM_ATTR_DATA_TYPE << " for ge node of " << cnode->DebugString() + << " failed."; + } + + // set rank size + if (AnfAlgo::HasNodeAttr(kAttrRankSize, cnode)) { + auto rank_size = AnfAlgo::GetNodeAttr(cnode, kAttrRankSize); + ret = ge::AttrUtils::SetInt(*op_desc, ge::HCOM_ATTR_RANK_SIZE, rank_size); + if (!ret) { + MS_LOG(EXCEPTION) << "Set attr " << ge::HCOM_ATTR_RANK_SIZE << " for ge node of " << cnode->DebugString() + << " failed."; + } + } + + ge::ComputeGraphPtr ge_graph = std::make_shared(kStubDataStructureName); + MS_EXCEPTION_IF_NULL(ge_graph); + auto ge_node = ge_graph->AddNode(op_desc); + return {ge_node, ge_graph}; 
+} + +HcclTaskInfo ParseDomiTask(const ge::OpDescPtr &op, const domi::TaskDef &task_def) { + MS_EXCEPTION_IF_NULL(op); + // workspace size + auto workspace_sizes = op->GetWorkspaceBytes(); + if (workspace_sizes.size() != 1) { + MS_LOG(EXCEPTION) << "Unexpected workspace size " << workspace_sizes.size(); + } + int64_t workspace_size = workspace_sizes[0]; + // stream num + int64_t stream_num; + bool ret = ge::AttrUtils::GetInt(*op, kGeNodeAttrUsedStreamNum, stream_num); + if (!ret) { + MS_LOG(EXCEPTION) << "Get attr " << kGeNodeAttrUsedStreamNum << " for ge node " << op->GetType() << " failed."; + } + + return {task_def.private_def(), workspace_size, stream_num}; +} +} // namespace mindspore::hccl diff --git a/mindspore/ccsrc/runtime/hccl_adapter/converter.h b/mindspore/ccsrc/runtime/hccl_adapter/converter.h new file mode 100644 index 00000000000..6b6bda3a47c --- /dev/null +++ b/mindspore/ccsrc/runtime/hccl_adapter/converter.h @@ -0,0 +1,38 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_RUNTIME_HCCL_ADAPTER_CONVERTER_H +#define MINDSPORE_RUNTIME_HCCL_ADAPTER_CONVERTER_H + +#include +#include +#include +#include +#define google ascend_private +#include "graph/node.h" +#include "common/opskernel/ops_kernel_info_types.h" +#include "proto/task.pb.h" +#undef google +#include "runtime/hccl_adapter/hccl_adapter.h" +#include "mindspore/core/ir/anf.h" + +namespace mindspore::hccl { +// return graph ptr to keep reference count +std::tuple GenerateStubGeNode(const AnfNodePtr &anf_node, HcclDataType datatype); +HcclTaskInfo ParseDomiTask(const ge::OpDescPtr &op, const domi::TaskDef &task_def); +std::string GetGeNodeName(const CNodePtr &cnode); +} // namespace mindspore::hccl +#endif // MINDSPORE_RUNTIME_HCCL_ADAPTER_CONVERTER_H diff --git a/mindspore/ccsrc/runtime/hccl_adapter/hccl_adapter.cc b/mindspore/ccsrc/runtime/hccl_adapter/hccl_adapter.cc new file mode 100644 index 00000000000..5ce9faa8d6f --- /dev/null +++ b/mindspore/ccsrc/runtime/hccl_adapter/hccl_adapter.cc @@ -0,0 +1,165 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "runtime/hccl_adapter/hccl_adapter.h" +#include +#include +#define google ascend_private +#include "register/ops_kernel_builder_registry.h" +#include "common/opskernel/ops_kernel_info_store.h" +#include "external/ge/ge_api_types.h" +#undef google +#include "utils/log_adapter.h" +#include "utils/ms_utils.h" +#include "runtime/hccl_adapter/converter.h" +#include "runtime/hccl_adapter/hcom_graph_adaptor.h" + +static constexpr const char *kHcclOpsKernelInfoStore = "ops_kernel_info_hccl"; +static constexpr const char *kHcclDeployModeEnv = "DEPLOY_MODE"; +// following global var, thread safety is not guaranteed +static std::shared_ptr ops_kernel_info_store = nullptr; +static ge::OpsKernelBuilderPtr ops_kernel_builder = nullptr; + +namespace mindspore::hccl { +static std::map GenHcclOptions(uint32_t device_id, std::string_view rank_id, + std::string_view rank_file) { + auto env_deploy_mode = common::GetEnv(kHcclDeployModeEnv); + if (env_deploy_mode.empty()) { + MS_LOG(WARNING) << kHcclDeployModeEnv << " is not set in ENV. Now set to default value 0"; + env_deploy_mode = "0"; + } + + return std::map({{ge::OPTION_EXEC_IS_USEHCOM, "1"}, + {ge::OPTION_EXEC_IS_USEHVD, "0"}, + {ge::OPTION_EXEC_HCCL_FLAG, "1"}, + {ge::OPTION_EXEC_DEVICE_ID, std::to_string(device_id)}, + {ge::OPTION_EXEC_RANK_ID, rank_id.data()}, + {ge::OPTION_EXEC_POD_NAME, rank_id.data()}, + {ge::OPTION_EXEC_RANK_TABLE_FILE, rank_file.data()}, + {ge::OPTION_GRAPH_RUN_MODE, "1"}, + {ge::OPTION_EXEC_HCCL_FLAG, "1"}, + {ge::OPTION_EXEC_DEPLOY_MODE, env_deploy_mode}}); +} + +bool InitHccl(uint32_t device_id, std::string_view rank_id, std::string_view rank_file) { + MS_LOG(INFO) << "Start init hccl adapter."; + // get ops_kernel_builder + std::map all_builders = ge::OpsKernelBuilderRegistry::GetInstance().GetAll(); + if (all_builders.size() != 1) { + MS_LOG(EXCEPTION) << "Builders size should be 1 (hccl builder), but is " << all_builders.size(); + } + + MS_LOG(INFO) << "Get builder " << all_builders.begin()->first; + ops_kernel_builder = all_builders.begin()->second; + MS_EXCEPTION_IF_NULL(ops_kernel_builder); + // init ops_kernel_builder + auto options = GenHcclOptions(device_id, rank_id, rank_file); + auto ret = ops_kernel_builder->Initialize(options); + if (ret != ge::SUCCESS) { + MS_LOG(EXCEPTION) << "Init builder failed, ret = " << ret; + } + + // get ops_kernel_info_store + ret = ::Initialize(options); + if (ret != ge::SUCCESS) { + MS_LOG(EXCEPTION) << "Init plugin so failed, ret = " << ret; + } + + std::map> all_ops_kernel_info_stores; + ::GetOpsKernelInfoStores(all_ops_kernel_info_stores); + for (auto &[name, ptr] : all_ops_kernel_info_stores) { + if (name == kHcclOpsKernelInfoStore) { + ops_kernel_info_store = ptr; + break; + } + } + MS_EXCEPTION_IF_NULL(ops_kernel_info_store); + ret = ops_kernel_info_store->Initialize(options); + if (ret != ge::SUCCESS) { + MS_LOG(EXCEPTION) << "Init info store failed, ret = " << ret; + } + MS_LOG(INFO) << "Init hccl adapter success."; + return true; +} + +bool FinalizeHccl() { + MS_LOG(INFO) << "Start destroy hccl adapter."; + if (ops_kernel_info_store != nullptr) { + auto ret = ops_kernel_info_store->Finalize(); + if (ret != ge::SUCCESS) { + MS_LOG(ERROR) << "Destory info store failed, ret = " << ret; + return false; + } + } + + if (ops_kernel_builder != nullptr) { + auto ret = ops_kernel_builder->Finalize(); + if (ret != ge::SUCCESS) { + MS_LOG(ERROR) << "Destory builder failed, ret = " << ret; + return false; + } + } + + ::Finalize(); + 
ge::OpsKernelBuilderRegistry::GetInstance().UnregisterAll(); + ops_kernel_info_store.reset(); + ops_kernel_builder.reset(); + MS_LOG(INFO) << "Destroy hccl adapter success."; + return true; +} + +bool GenTask(const AnfNodePtr &node, HcclDataType datatype, std::vector *task_info_lists) { + MS_EXCEPTION_IF_NULL(ops_kernel_builder); + MS_EXCEPTION_IF_NULL(task_info_lists); + MS_LOG(INFO) << "Start generate task for hccl node " << node->DebugString(); + auto [ge_node, ge_graph] = GenerateStubGeNode(node, datatype); + MS_EXCEPTION_IF_NULL(ge_node); + auto op = ge_node->GetOpDesc(); + MS_EXCEPTION_IF_NULL(op); + + MS_LOG(INFO) << "Start to call CalcOpRunningParam"; + ge::Status ret = ops_kernel_builder->CalcOpRunningParam(*ge_node); + if (ret != ge::SUCCESS) { + MS_LOG(ERROR) << "OpsKernelBuilder CalcOpRunningParam failed, ret = " << ret; + return false; + } + MS_LOG(INFO) << "Start to call GenerateTask"; + ge::RunContext unused_ctx; + std::vector domi_tasks; + ret = ops_kernel_builder->GenerateTask(*ge_node, unused_ctx, domi_tasks); + if (ret != ge::SUCCESS) { + MS_LOG(ERROR) << "OpsKernelBuilder GenerateTask failed, ret = " << ret; + return false; + } + + task_info_lists->clear(); + std::transform(domi_tasks.begin(), domi_tasks.end(), std::back_inserter(*task_info_lists), + [&op](const domi::TaskDef &task_def) -> HcclTaskInfo { return ParseDomiTask(op, task_def); }); + MS_LOG(INFO) << "Generate task for node " << node->DebugString() << " success."; + ge_graph.reset(); + return true; +} + +bool CalcOpRunningParam(const AnfNodePtr &node) { return true; } + +void *GetHcclOpsKernelInfoStore() { return ops_kernel_info_store.get(); } + +std::string GetHcclType(const AnfNodePtr &node) { + MS_EXCEPTION_IF_NULL(node); + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + return GetGeNodeName(cnode); +} +} // namespace mindspore::hccl diff --git a/mindspore/ccsrc/runtime/hccl_adapter/hccl_adapter.h b/mindspore/ccsrc/runtime/hccl_adapter/hccl_adapter.h new file mode 100644 index 00000000000..e5ba2beedb2 --- /dev/null +++ b/mindspore/ccsrc/runtime/hccl_adapter/hccl_adapter.h @@ -0,0 +1,43 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_RUNTIME_HCCL_ADAPTER_HCCL_ADAPTER_H +#define MINDSPORE_RUNTIME_HCCL_ADAPTER_HCCL_ADAPTER_H + +#include +#include +#include +#include "mindspore/core/ir/anf.h" +#include "external/hccl/hccl_types.h" + +#define MS_API __attribute__((visibility("default"))) + +namespace mindspore::hccl { +struct MS_API HcclTaskInfo { + std::string private_def; + int64_t workspace_size; + int64_t stream_num; +}; + +MS_API bool InitHccl(uint32_t device_id, std::string_view rank_id, std::string_view rank_file); +MS_API bool FinalizeHccl(); +MS_API bool GenTask(const AnfNodePtr &node, HcclDataType datatype, std::vector *task_info_lists); +MS_API bool CalcOpRunningParam(const AnfNodePtr &node); +MS_API void *GetHcclOpsKernelInfoStore(); +MS_API std::string GetHcclType(const AnfNodePtr &node); +} // namespace mindspore::hccl +#undef MS_API +#endif // MINDSPORE_RUNTIME_HCCL_ADAPTER_HCCL_ADAPTER_H diff --git a/mindspore/ccsrc/runtime/hccl_adapter/hcom_graph_adaptor.h b/mindspore/ccsrc/runtime/hccl_adapter/hcom_graph_adaptor.h new file mode 100644 index 00000000000..08fadfebde3 --- /dev/null +++ b/mindspore/ccsrc/runtime/hccl_adapter/hcom_graph_adaptor.h @@ -0,0 +1,32 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_RUNTIME_HCCL_ADAPTER_HCOM_GRAPH_ADAPTOR_H +#define MINDSPORE_RUNTIME_HCCL_ADAPTER_HCOM_GRAPH_ADAPTOR_H + +#include +#include +#include +#include "mindspore/core/ir/anf.h" +#include "common/opskernel/ops_kernel_info_store.h" + +extern "C" { +ge::Status Initialize(const std::map &); +ge::Status Finalize(); +void GetOpsKernelInfoStores(std::map> &); +} + +#endif // MINDSPORE_RUNTIME_HCCL_ADAPTER_HCOM_GRAPH_ADAPTOR_H diff --git a/mindspore/core/utils/log_adapter.cc b/mindspore/core/utils/log_adapter.cc index bc41a1716c6..d2504a3c7dc 100644 --- a/mindspore/core/utils/log_adapter.cc +++ b/mindspore/core/utils/log_adapter.cc @@ -181,7 +181,8 @@ static const char *GetSubModuleName(SubModuleId module_id) { "VM", // SM_VM "PROFILER", // SM_PROFILER "PS", // SM_PS - "LITE" // SM_LITE + "LITE", // SM_LITE + "HCCL_ADPT" // SM_HCCL_ADPT }; return sub_module_names[module_id % NUM_SUBMODUES]; diff --git a/mindspore/core/utils/log_adapter.h b/mindspore/core/utils/log_adapter.h index 4617a4dab85..266b4678542 100644 --- a/mindspore/core/utils/log_adapter.h +++ b/mindspore/core/utils/log_adapter.h @@ -125,6 +125,7 @@ enum SubModuleId : int { SM_PROFILER, // profiler SM_PS, // Parameter Server SM_LITE, // LITE + SM_HCCL_ADPT, // Hccl Adapter NUM_SUBMODUES // number of submodules }; diff --git a/tests/ut/cpp/stub/ge/ge_task_launch_stub.cc b/tests/ut/cpp/stub/ge/ge_task_launch_stub.cc index e2502baf8b0..331c23dba13 100644 --- a/tests/ut/cpp/stub/ge/ge_task_launch_stub.cc +++ b/tests/ut/cpp/stub/ge/ge_task_launch_stub.cc @@ -15,7 +15,7 @@ */ #include #include "framework/ge_runtime/model_runner.h" -#include "runtime/device/ascend/tasksink/runtime_utils.h" +#include "runtime/hccl_adapter/hccl_adapter.h" namespace ge { namespace model_runner { @@ -60,15 +60,12 @@ const std::map> &ModelRunner::GetRunti } // namespace ge namespace mindspore { -namespace device { -namespace ascend { -namespace tasksink { -bool RuntimeUtils::HcomBindModel(rtModel_t model, rtStream_t stream) { return true; } - -bool RuntimeUtils::HcomUnbindModel(rtModel_t model) { return true; } - -bool RuntimeUtils::HcomDistribute(const std::shared_ptr &task_info, rtStream_t stream) { return true; } -} // namespace tasksink -} // namespace ascend -} // namespace device +namespace hccl { +bool InitHccl(uint32_t, std::string_view, std::string_view) { return true; } +bool FinalizeHccl() { return true; } +bool GenTask(const AnfNodePtr &, HcclDataType, std::vector *) { return true; } +bool CalcOpRunningParam(const AnfNodePtr &) { return true; } +void *GetHcclOpsKernelInfoStore() { return nullptr; } +std::string GetHcclType(const AnfNodePtr &) { return ""; } +} // namespace hccl } // namespace mindspore
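
Notes on the new hccl adapter interface (illustrative C++ sketches; any name marked as an assumption is not taken from the patch).

The patch replaces the direct hcom_* calls (hcom_init, hcom_broadcast, hcom_all_reduce, ...) with a small adapter API exported from runtime/hccl_adapter/hccl_adapter.h. Below is a minimal sketch of how a runtime component might drive that API, mirroring AscendKernelRuntime::HcclInit/DestroyHccl and HcclKernel::GenTask; the wrapper function RunHcclNodeSketch and its arguments are assumptions for illustration only.

    #include <string>
    #include <vector>
    #include "runtime/hccl_adapter/hccl_adapter.h"

    bool RunHcclNodeSketch(const mindspore::AnfNodePtr &node, uint32_t device_id,
                           const std::string &rank_id, const std::string &rank_table) {
      // Bring up the HCCL ops kernel builder and info store (replaces hcom_init()).
      if (!mindspore::hccl::InitHccl(device_id, rank_id, rank_table)) {
        return false;
      }
      // Ask the adapter to generate task descriptions for one communication node; this replaces
      // the fixed HcclTaskInfo that used to be built with the RuntimeUtils::Hcom* callbacks.
      std::vector<mindspore::hccl::HcclTaskInfo> tasks;
      if (!mindspore::hccl::GenTask(node, HCCL_DATA_TYPE_FP32, &tasks)) {
        return false;
      }
      // Each HcclTaskInfo carries the opaque private_def blob plus the workspace size and stream
      // count that hccl_kernel.cc copies into ge::model_runner::HcclTaskInfo objects.
      // Tear everything down again (replaces hcom_destroy()).
      return mindspore::hccl::FinalizeHccl();
    }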
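
InitHccl configures both the HCCL ops kernel builder and the "ops_kernel_info_hccl" info store from a single option set built by GenHcclOptions, defaulting DEPLOY_MODE to "0" when the environment variable is unset. A condensed sketch of that option set follows; the container type std::map<std::string, std::string> and the helper name BuildHcclOptionsSketch are assumptions written out for clarity, while the ge::OPTION_* keys and values are the ones used in hccl_adapter.cc.

    #include <cstdint>
    #include <map>
    #include <string>
    #include "external/ge/ge_api_types.h"

    std::map<std::string, std::string> BuildHcclOptionsSketch(uint32_t device_id,
                                                              const std::string &rank_id,
                                                              const std::string &rank_table,
                                                              const std::string &deploy_mode /* "0" if DEPLOY_MODE is unset */) {
      return {{ge::OPTION_EXEC_IS_USEHCOM, "1"},
              {ge::OPTION_EXEC_IS_USEHVD, "0"},
              {ge::OPTION_EXEC_HCCL_FLAG, "1"},
              {ge::OPTION_EXEC_DEVICE_ID, std::to_string(device_id)},
              {ge::OPTION_EXEC_RANK_ID, rank_id},
              {ge::OPTION_EXEC_POD_NAME, rank_id},
              {ge::OPTION_EXEC_RANK_TABLE_FILE, rank_table},
              {ge::OPTION_GRAPH_RUN_MODE, "1"},
              {ge::OPTION_EXEC_DEPLOY_MODE, deploy_mode}};
    }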
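
converter.cc bridges two vocabularies: GetGeNodeName picks the GE operator type for each MindSpore communication primitive, and ConvertHcclDTypeToGeDType translates HcclDataType into ge::DataType before stamping HCOM_ATTR_DATA_TYPE onto the stub op. The pairs below are the ones used in the patch; the map containers with explicit template arguments and the header "graph/types.h" for ge::DataType are assumptions.

    #include <map>
    #include <string>
    #include "external/hccl/hccl_types.h"
    #include "graph/types.h"  // assumed location of ge::DataType / ge::DT_* values

    // MindSpore primitive name -> GE HCCL operator type used for the stub node.
    const std::map<std::string, std::string> kMsOpToGeOp = {
        {"AllReduce", "HcomAllReduce"},
        {"AllGather", "HcomAllGather"},
        {"Broadcast", "HcomBroadcast"},
        {"ReduceScatter", "HcomReduceScatter"},
    };

    // HCCL element type -> GE tensor data type (ConvertHcclDTypeToGeDType).
    const std::map<HcclDataType, ge::DataType> kHcclToGeDType = {
        {HCCL_DATA_TYPE_FP32, ge::DT_FLOAT},
        {HCCL_DATA_TYPE_FP16, ge::DT_FLOAT16},
        {HCCL_DATA_TYPE_INT8, ge::DT_INT8},
        {HCCL_DATA_TYPE_INT32, ge::DT_INT32},
    };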
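
hcom_graph_adaptor.h declares the plain C-linkage entry points exported by libhcom_graph_adaptor; InitHccl calls Initialize with the option set above and then picks the store registered under "ops_kernel_info_hccl" out of the map filled by GetOpsKernelInfoStores. A minimal sketch of that lookup; the helper name LoadHcclInfoStoreSketch is an assumption.

    #include <map>
    #include <memory>
    #include <string>
    #include "runtime/hccl_adapter/hcom_graph_adaptor.h"

    std::shared_ptr<ge::OpsKernelInfoStore> LoadHcclInfoStoreSketch(
        const std::map<std::string, std::string> &options) {
      // Initialize the plugin with the same option set handed to the ops kernel builder.
      if (::Initialize(options) != ge::SUCCESS) {
        return nullptr;
      }
      std::map<std::string, std::shared_ptr<ge::OpsKernelInfoStore>> stores;
      ::GetOpsKernelInfoStores(stores);
      auto iter = stores.find("ops_kernel_info_hccl");
      return iter == stores.end() ? nullptr : iter->second;
    }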
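
The new SM_HCCL_ADPT submodule id, together with the SUBMODULE_ID compile definition set per source file in runtime/hccl_adapter/CMakeLists.txt, routes every MS_LOG call in the adapter to the "HCCL_ADPT" tag. Trivial illustration, assuming the usual log_adapter.h macro; the function name is hypothetical.

    #include "utils/log_adapter.h"

    // Compiled with -DSUBMODULE_ID=mindspore::SubModuleId::SM_HCCL_ADPT (set on each source in
    // the new CMakeLists.txt), so this line is reported under the HCCL_ADPT submodule.
    void HelloFromHcclAdapterSketch() { MS_LOG(INFO) << "hccl adapter initialized"; }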