forked from mindspore-Ecosystem/mindspore
new hccl interface
Signed-off-by: zhoufeng <zhoufeng54@huawei.com>
This commit is contained in:
parent
e86e990089
commit
d532af3a9a
|
@ -6,10 +6,15 @@ include(${GE_SOURCE_DIR}/cmake/ge_utils.cmake)
|
|||
include(${GE_SOURCE_DIR}/cmake/external_libs/json.cmake)
|
||||
include(${GE_SOURCE_DIR}/cmake/external_libs/eigen.cmake)
|
||||
include(${GE_SOURCE_DIR}/cmake/external_libs/gtest.cmake)
|
||||
include(${GE_SOURCE_DIR}/cmake/external_libs/protobuf.cmake)
|
||||
include(${GE_SOURCE_DIR}/cmake/external_libs/onnx.cmake)
|
||||
include(${GE_SOURCE_DIR}/cmake/external_libs/securec.cmake)
|
||||
|
||||
if (ENABLE_D)
|
||||
set(AS_MS_COMP TRUE)
|
||||
include(${GE_SOURCE_DIR}/cmake/external_libs/protobuf.cmake)
|
||||
unset(AS_MS_COMP)
|
||||
else ()
|
||||
include(${GE_SOURCE_DIR}/cmake/external_libs/protobuf.cmake)
|
||||
endif ()
|
||||
# for UT, find slog and error_manager from local prebuild
|
||||
if (NOT ENABLE_D AND NOT ENABLE_ACL)
|
||||
set(GE_PREBUILD_PATH ${GE_SOURCE_DIR}/third_party/prebuild/${CMAKE_HOST_SYSTEM_PROCESSOR})
|
||||
|
@ -79,8 +84,12 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__FILE__='\"$(subst $(realpath ${CMAKE
|
|||
add_subdirectory(${GE_SOURCE_DIR}/src/common/graph)
|
||||
if (ENABLE_ACL OR ENABLE_D)
|
||||
add_subdirectory(${GE_SOURCE_DIR}/src/ge/common)
|
||||
target_compile_definitions(graph PRIVATE google=ascend_private)
|
||||
set_target_properties(graph PROPERTIES SKIP_BUILD_RPATH TRUE)
|
||||
if (ENABLE_D)
|
||||
add_subdirectory(${GE_SOURCE_DIR}/src/ge/ge_runtime)
|
||||
target_compile_definitions(ge_runtime PRIVATE google=ascend_private)
|
||||
set_target_properties(ge_runtime PROPERTIES SKIP_BUILD_RPATH TRUE)
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
|
|
|
@ -216,7 +216,7 @@ if (NOT ENABLE_GE)
|
|||
|
||||
if (ENABLE_D)
|
||||
install(
|
||||
TARGETS ms_profile
|
||||
TARGETS ms_profile hccl_adapter
|
||||
DESTINATION ${INSTALL_LIB_DIR}
|
||||
COMPONENT mindspore
|
||||
)
|
||||
|
|
|
@ -1 +1 @@
|
|||
Subproject commit 412ebe82c96620b5f7c942a7ab87a45bf14c5621
|
||||
Subproject commit 383f7f751d6612e9dbde9e22a2960098fdbf3792
|
|
@ -174,7 +174,7 @@ foreach (_comp ${SUB_COMP})
|
|||
string(REPLACE "/" "_" sub ${_comp})
|
||||
if (TARGET _mindspore_${sub}_obj)
|
||||
list(APPEND SUB_OBJECTS_SRC $<TARGET_OBJECTS:_mindspore_${sub}_obj>)
|
||||
add_dependencies(_mindspore_${sub}_obj proto_input )
|
||||
add_dependencies(_mindspore_${sub}_obj proto_input)
|
||||
endif ()
|
||||
endforeach ()
|
||||
|
||||
|
@ -229,28 +229,26 @@ if (ENABLE_D)
|
|||
endif()
|
||||
|
||||
MESSAGE("USE DAV LIB PATH: ${ASCEND_PATH}")
|
||||
find_library(HCCL hccl ${ASCEND_RUNTIME_PATH})
|
||||
find_library(CCE_LIB cce ${ASCEND_RUNTIME_PATH})
|
||||
find_library(RUNTIME_LIB runtime ${ASCEND_RUNTIME_PATH})
|
||||
find_library(TSDCLIENT tsdclient HINTS ${ASCEND_RUNTIME_PATH} ${ASCEND_DRIVER_BACK_PATH})
|
||||
find_library(DATATRANSFER datatransfer HINTS ${ASCEND_RUNTIME_PATH} ${ASCEND_DRIVER_BACK_PATH})
|
||||
find_library(PROFILING msprofiler ${ASCEND_RUNTIME_PATH})
|
||||
# for atlas env
|
||||
find_library(HCCL hccl ${ASCEND_TOOLKIT_RUNTIME_PATH})
|
||||
find_library(CCE_LIB cce ${ASCEND_TOOLKIT_RUNTIME_PATH})
|
||||
find_library(RUNTIME_LIB runtime ${ASCEND_TOOLKIT_RUNTIME_PATH})
|
||||
find_library(TSDCLIENT tsdclient HINTS ${ASCEND_TOOLKIT_RUNTIME_PATH} ${ASCEND_DRIVER_BACK_PATH})
|
||||
find_library(DATATRANSFER datatransfer HINTS ${ASCEND_TOOLKIT_RUNTIME_PATH} ${ASCEND_DRIVER_BACK_PATH})
|
||||
find_library(PROFILING msprofiler ${ASCEND_TOOLKIT_RUNTIME_PATH})
|
||||
|
||||
find_library(HCCL hccl ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
|
||||
find_library(CCE_LIB cce ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
|
||||
find_library(RUNTIME_LIB runtime ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
|
||||
find_library(TSDCLIENT tsdclient HINTS ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH} ${ASCEND_DRIVER_BACK_PATH})
|
||||
find_library(DATATRANSFER datatransfer HINTS ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH} ${ASCEND_DRIVER_BACK_PATH})
|
||||
find_library(PROFILING msprofiler ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
|
||||
find_library(PROFILING_SHARED msprof ${ASCEND_DRIVER_PATH})
|
||||
find_library(REGISTER register ${ASCEND_RUNTIME_PATH})
|
||||
find_library(REGISTER register ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
|
||||
find_library(OPTILING optiling ${ASCEND_OPP_PATH})
|
||||
add_library(ms_profile SHARED ${PROFILING})
|
||||
set_target_properties(ms_profile PROPERTIES LINKER_LANGUAGE CXX)
|
||||
target_link_libraries(ms_profile -Wl,--start-group ${PROFILING_SHARED} ${PROFILING} mindspore::protobuf -Wl,--end-group)
|
||||
target_link_libraries(mindspore ms_profile ge_runtime ${CCE_LIB} ${RUNTIME_LIB} ${TSDCLIENT} ${HCCL} ${DATATRANSFER} ${REGISTER} -Wl,--no-as-needed ${OPTILING} -Wl,--as-needed)
|
||||
target_link_libraries(mindspore ms_profile ge_runtime ${CCE_LIB} ${RUNTIME_LIB} ${TSDCLIENT} ${HCCL} ${DATATRANSFER}
|
||||
${REGISTER} -Wl,--no-as-needed ${OPTILING} -Wl,--as-needed)
|
||||
target_link_libraries(mindspore -Wl,--start-group proto_input mindspore::protobuf -Wl,--end-group)
|
||||
# hccl_adpter
|
||||
find_library(HCCL_ADPTER hcom_graph_adaptor ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
|
||||
find_library(HCCL_BUILDER hcom_opskernel_builder ${ASCEND_RUNTIME_PATH}/plugin/opskernel ${ASCEND_TOOLKIT_RUNTIME_PATH}/plugin/opskernel)
|
||||
add_subdirectory(runtime/hccl_adapter)
|
||||
target_link_libraries(hccl_adapter PRIVATE mindspore ${HCCL_ADPTER} ${REGISTER} -Wl,--no-as-needed ${HCCL_BUILDER})
|
||||
elseif (CMAKE_SYSTEM_NAME MATCHES "Windows")
|
||||
target_link_libraries(mindspore -Wl,--start-group proto_input mindspore::protobuf mindspore::sentencepiece -Wl,--end-group)
|
||||
else ()
|
||||
|
@ -274,11 +272,14 @@ elseif (CMAKE_SYSTEM_NAME MATCHES "Windows")
|
|||
else ()
|
||||
MESSAGE(FATAL_ERROR "other platform: ${CMAKE_SYSTEM_NAME}")
|
||||
endif ()
|
||||
set(MINDSPORE_RPATH ${ORIGIN_PATH}/lib)
|
||||
|
||||
if (ENABLE_D)
|
||||
set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/nnae/latest/fwkacllib/lib64)
|
||||
set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64)
|
||||
set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/fwkacllib/lib64)
|
||||
set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel)
|
||||
set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel)
|
||||
set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel)
|
||||
set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/add-ons)
|
||||
set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/opp/op_impl/built-in/ai_core/tbe/op_tiling)
|
||||
set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/op_tiling)
|
||||
|
@ -286,9 +287,16 @@ if (ENABLE_D)
|
|||
elseif (ENABLE_GPU)
|
||||
set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/cuda/lib64)
|
||||
endif ()
|
||||
set(HCCL_ADPT_RPATH ${ORIGIN_PATH}:${MINDSPORE_RPATH})
|
||||
set(MINDSPORE_RPATH ${ORIGIN_PATH}/lib:${MINDSPORE_RPATH})
|
||||
|
||||
set_target_properties(_c_expression PROPERTIES INSTALL_RPATH ${MINDSPORE_RPATH})
|
||||
|
||||
if (ENABLE_D)
|
||||
set_target_properties(hccl_adapter PROPERTIES INSTALL_RPATH ${HCCL_ADPT_RPATH})
|
||||
target_link_libraries(_c_expression PRIVATE hccl_adapter)
|
||||
endif ()
|
||||
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Windows")
|
||||
target_link_libraries(mindspore mindspore::pybind11_module)
|
||||
target_link_libraries(mindspore mindspore_gvar)
|
||||
|
@ -352,6 +360,7 @@ if (ENABLE_D)
|
|||
find_library(adump_server libadump_server.a ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
|
||||
target_link_libraries(_c_expression PRIVATE ${adump_server})
|
||||
target_link_libraries(inference PRIVATE ${adump_server})
|
||||
target_link_libraries(inference PRIVATE mindspore_core hccl_adapter)
|
||||
endif()
|
||||
|
||||
if (ENABLE_CPU)
|
||||
|
|
|
@ -17,16 +17,15 @@
|
|||
#include "backend/kernel_compiler/hccl/hccl_kernel.h"
|
||||
|
||||
#include <map>
|
||||
#include "runtime/device/ascend/tasksink/runtime_utils.h"
|
||||
#include "backend/session/anf_runtime_algorithm.h"
|
||||
#include "utils/utils.h"
|
||||
#include "utils/ms_context.h"
|
||||
#include "runtime/device/kernel_runtime.h"
|
||||
#include "runtime/device/ascend/executor/hccl_dynamic_kernel.h"
|
||||
#include "runtime/hccl_adapter/hccl_adapter.h"
|
||||
|
||||
using HcclTaskInfoPtr = std::shared_ptr<ge::model_runner::HcclTaskInfo>;
|
||||
using ge::model_runner::HcclTaskInfo;
|
||||
using mindspore::device::ascend::tasksink::RuntimeUtils;
|
||||
|
||||
namespace {
|
||||
static std::map<std::string, std::string> kMsOpNameToHcomHcclType = {
|
||||
|
@ -145,35 +144,45 @@ const std::vector<size_t> &HcclKernel::GetOutputSizeList() const {
|
|||
|
||||
const std::vector<size_t> &HcclKernel::GetWorkspaceSizeList() const { return workspace_size_list_; }
|
||||
|
||||
std::vector<TaskInfoPtr> HcclKernel::GenTask(const std::vector<AddressPtr> &inputs,
|
||||
const std::vector<AddressPtr> &workspace,
|
||||
std::vector<TaskInfoPtr> HcclKernel::GenTask(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
|
||||
const std::vector<AddressPtr> &outputs, uint32_t stream_id) {
|
||||
if (inputs.empty() || outputs.empty()) {
|
||||
MS_LOG(EXCEPTION) << "Inputs or outputs is empty";
|
||||
}
|
||||
stream_id_ = stream_id;
|
||||
std::string hccl_type = AnfAlgo::GetCNodeName(anf_node_);
|
||||
MS_EXCEPTION_IF_NULL(inputs.at(0));
|
||||
auto input_data_addr = inputs.at(0)->addr;
|
||||
MS_EXCEPTION_IF_NULL(outputs.at(0));
|
||||
auto output_data_addr = outputs.at(0)->addr;
|
||||
void *workspace_address = nullptr;
|
||||
const int64_t workspace_num = 0;
|
||||
std::vector<uint8_t> private_def;
|
||||
HcclDataType data_type = hccl_data_type_list_[0];
|
||||
|
||||
MS_LOG(INFO) << "HCCL Task : stream_id=" << stream_id << ", ws_num=" << workspace_num << ", count=" << hccl_count_
|
||||
<< ", root_id=" << root_id_ << ", op_type=" << static_cast<int>(op_type_)
|
||||
<< ", data_type=" << static_cast<int>(data_type);
|
||||
std::vector<hccl::HcclTaskInfo> task_info;
|
||||
bool ret = hccl::GenTask(anf_node_, data_type, &task_info);
|
||||
if (!ret) {
|
||||
MS_LOG(EXCEPTION) << "Gen Task for " << anf_node_->DebugString() << " failed.";
|
||||
}
|
||||
|
||||
auto context_ptr = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(context_ptr);
|
||||
HcclTaskInfoPtr task_info_ptr = std::make_shared<HcclTaskInfo>(
|
||||
kernel_name_, stream_id, hccl_type, input_data_addr, output_data_addr, workspace_address, workspace_num, 0,
|
||||
private_def, nullptr, hccl_count_, root_id_, op_type_, data_type, group_, RuntimeUtils::HcomBindModel,
|
||||
RuntimeUtils::HcomUnbindModel, RuntimeUtils::HcomDistribute, NeedDump());
|
||||
MS_EXCEPTION_IF_NULL(task_info_ptr);
|
||||
return {task_info_ptr};
|
||||
std::vector<TaskInfoPtr> results;
|
||||
for (auto &task : task_info) {
|
||||
MS_LOG(INFO) << "HCCL Task : stream_id=" << stream_id << ", count=" << hccl_count_ << ", root_id=" << root_id_
|
||||
<< ", op_type=" << static_cast<int>(op_type_) << ", data_type=" << static_cast<int>(data_type)
|
||||
<< ", workspace_size=" << task.workspace_size << ", stream_num=" << task.stream_num
|
||||
<< ", private_def_size=" << task.private_def.size();
|
||||
|
||||
private_def.resize(task.private_def.size());
|
||||
auto sec_ret = memcpy_s(private_def.data(), private_def.size(), task.private_def.data(), task.private_def.size());
|
||||
if (sec_ret != 0) {
|
||||
MS_LOG(EXCEPTION) << "Set data memcpy_s failed, ret = " << sec_ret;
|
||||
}
|
||||
|
||||
results.emplace_back(std::make_shared<HcclTaskInfo>(
|
||||
kernel_name_, stream_id, hccl::GetHcclType(anf_node_), input_data_addr, output_data_addr, task.workspace_size,
|
||||
task.stream_num, private_def, hccl::GetHcclOpsKernelInfoStore(), hccl_count_, root_id_, op_type_, data_type,
|
||||
group_, NeedDump()));
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
device::DynamicKernelPtr HcclKernel::GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) {
|
||||
|
|
|
@ -20,26 +20,10 @@
|
|||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
bool HcomAllBroadCastKernel::Launch(const std::vector<AddressPtr> &inputs,
|
||||
bool HcomAllBroadCastKernel::Launch(const std::vector<AddressPtr> & /*inputs*/,
|
||||
const std::vector<AddressPtr> & /*workspace*/,
|
||||
const std::vector<AddressPtr> & /*outputs*/, void *stream_ptr) {
|
||||
auto context_ptr = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(context_ptr);
|
||||
if (context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK)) {
|
||||
return true;
|
||||
}
|
||||
if (inputs.empty() || hccl_data_type_list_.empty()) {
|
||||
MS_LOG(ERROR) << "BroadCast param is empty";
|
||||
return false;
|
||||
}
|
||||
const char *tag = "Hccl-BroadCast";
|
||||
MS_EXCEPTION_IF_NULL(inputs[0]);
|
||||
HcclResult ret =
|
||||
hcom_broadcast(tag, inputs[0]->addr, hccl_count_, hccl_data_type_list_[0], root_id_, nullptr, stream_ptr);
|
||||
if (ret != HCCL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "HcomBroadcastOp : hcom_broadcast fail, return: " << static_cast<int>(ret);
|
||||
return false;
|
||||
}
|
||||
const std::vector<AddressPtr> & /*outputs*/, void * /*stream_ptr*/) {
|
||||
MS_LOG(INFO) << "HcomAllBroadCast launch";
|
||||
return true;
|
||||
}
|
||||
} // namespace kernel
|
||||
|
|
|
@ -20,24 +20,10 @@
|
|||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
bool HcomAllGatherKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> & /*workspace*/,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
|
||||
auto context_ptr = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(context_ptr);
|
||||
if (context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK)) {
|
||||
return true;
|
||||
}
|
||||
if (inputs.empty() || hccl_data_type_list_.empty()) {
|
||||
MS_LOG(ERROR) << "AllGather param is empty";
|
||||
return false;
|
||||
}
|
||||
const char *tag = "Hccl-AllGather";
|
||||
HcclResult ret =
|
||||
hcom_all_gather(tag, inputs[0]->addr, outputs[0]->addr, hccl_count_, hccl_data_type_list_[0], nullptr, stream_ptr);
|
||||
if (ret != HCCL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "HcomAllGatherKernelOp : hcom_all_gather fail, return: " << static_cast<int>(ret);
|
||||
return false;
|
||||
}
|
||||
bool HcomAllGatherKernel::Launch(const std::vector<AddressPtr> & /*inputs*/,
|
||||
const std::vector<AddressPtr> & /*workspace*/,
|
||||
const std::vector<AddressPtr> & /*outputs*/, void * /*stream_ptr*/) {
|
||||
MS_LOG(INFO) << "HcomAllGather launch";
|
||||
return true;
|
||||
}
|
||||
} // namespace kernel
|
||||
|
|
|
@ -20,24 +20,10 @@
|
|||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
bool HcomAllReduceKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> & /*workspace*/,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
|
||||
auto context_ptr = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(context_ptr);
|
||||
if (context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK)) {
|
||||
return true;
|
||||
}
|
||||
if (inputs.empty() || outputs.empty() || hccl_data_type_list_.empty()) {
|
||||
MS_LOG(ERROR) << "AllReduce param is empty";
|
||||
return false;
|
||||
}
|
||||
const char *tag = "Hccl-AllReduce";
|
||||
HcclResult ret = hcom_all_reduce(tag, inputs[0]->addr, outputs[0]->addr, hccl_count_, hccl_data_type_list_[0],
|
||||
op_type_, nullptr, stream_ptr);
|
||||
if (ret != HCCL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "HcomAllReduceKernelOp : hcom_all_reduce fail, return: " << static_cast<int>(ret);
|
||||
return false;
|
||||
}
|
||||
bool HcomAllReduceKernel::Launch(const std::vector<AddressPtr> & /*inputs*/,
|
||||
const std::vector<AddressPtr> & /*workspace*/,
|
||||
const std::vector<AddressPtr> & /*outputs*/, void * /*stream_ptr*/) {
|
||||
MS_LOG(INFO) << "HcomAllReduce launch";
|
||||
return true;
|
||||
}
|
||||
} // namespace kernel
|
||||
|
|
|
@ -20,25 +20,10 @@
|
|||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
bool HcomAllReduceScatterKernel::Launch(const std::vector<AddressPtr> &inputs,
|
||||
bool HcomAllReduceScatterKernel::Launch(const std::vector<AddressPtr> & /*inputs*/,
|
||||
const std::vector<AddressPtr> & /*workspace*/,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
|
||||
auto context_ptr = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(context_ptr);
|
||||
if (context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK)) {
|
||||
return true;
|
||||
}
|
||||
if (inputs.empty() || outputs.empty() || hccl_data_type_list_.empty()) {
|
||||
MS_LOG(ERROR) << "ReduceScatter param is empty";
|
||||
return false;
|
||||
}
|
||||
const char *tag = "Hccl-ReduceScatter";
|
||||
HcclResult ret = hcom_reduce_scatter(tag, inputs[0]->addr, outputs[0]->addr, hccl_count_, hccl_data_type_list_[0],
|
||||
op_type_, nullptr, stream_ptr);
|
||||
if (ret != HCCL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "HcomReduceScatterOp : hcom_reduce_scatter fail, return: " << static_cast<int>(ret);
|
||||
return false;
|
||||
}
|
||||
const std::vector<AddressPtr> & /*outputs*/, void * /*stream_ptr*/) {
|
||||
MS_LOG(INFO) << "HcomAllReduceScatter launch";
|
||||
return true;
|
||||
}
|
||||
} // namespace kernel
|
||||
|
|
|
@ -55,6 +55,7 @@
|
|||
#include "profiler/device/ascend/rt_callback_manager.h"
|
||||
#include "utils/config_manager.h"
|
||||
#include "runtime/device/ascend/profiling/reporter/op_name_task_stream_reporter.h"
|
||||
#include "runtime/hccl_adapter/hccl_adapter.h"
|
||||
|
||||
using ge::model_runner::ModelRunner;
|
||||
using mindspore::device::ascend::ProfilingManager;
|
||||
|
@ -796,10 +797,10 @@ bool AscendKernelRuntime::HcclInit() {
|
|||
return false;
|
||||
}
|
||||
MS_LOG(INFO) << "MINDSPORE_HCCL_CONFIG_PATH : " << full_path << ", RANK_ID: " << rank_id_str;
|
||||
HcclResult res = hcom_init(full_path, rank_id_str.c_str());
|
||||
bool ret = hccl::InitHccl(context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID), rank_id_str, full_path);
|
||||
free(full_path);
|
||||
if (res != HCCL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "Hcom init failed, res is " << static_cast<int>(res);
|
||||
if (!ret) {
|
||||
MS_LOG(ERROR) << "Hcom init failed.";
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
@ -816,12 +817,14 @@ bool AscendKernelRuntime::DestroyHccl() {
|
|||
if (!HcclExecutorManager::GetInstance().Finalize()) {
|
||||
MS_LOG(ERROR) << "Dynamic Shape Hccl Finalize Failed";
|
||||
}
|
||||
HcclResult res = hcom_destroy();
|
||||
if (res != HCCL_SUCCESS) {
|
||||
|
||||
bool res = hccl::FinalizeHccl();
|
||||
if (!res) {
|
||||
MS_LOG(ERROR) << "Hccl destroy failed";
|
||||
return false;
|
||||
}
|
||||
MS_LOG(INFO) << "Hccl destroy successful, status = " << res << ".";
|
||||
|
||||
MS_LOG(INFO) << "Hccl destroy successful.";
|
||||
context_ptr->set_param<bool>(MS_CTX_ENABLE_HCCL, false);
|
||||
return true;
|
||||
}
|
||||
|
@ -855,7 +858,7 @@ void AscendKernelRuntime::KernelLaunchProfiling(const std::string &kernel_name)
|
|||
auto try_emplace_ret = stream_id_task_id_op_name_map_.try_emplace(stream_task_pair, kernel_name);
|
||||
if (!try_emplace_ret.second) {
|
||||
MS_LOG(WARNING) << "Profiling duplicate key, task_id:" << stream_task_pair.second
|
||||
<< " stream_id:" << stream_task_pair.first << " name:" << kernel_name;
|
||||
<< " stream_id:" << stream_task_pair.first << " name:" << kernel_name;
|
||||
}
|
||||
if (stream_id_task_id_op_name_map_.size() > kProfilingMaxTaskIdInStream) {
|
||||
MS_LOG(EXCEPTION) << "Too many profiling data";
|
||||
|
|
|
@ -1,106 +0,0 @@
|
|||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "runtime/device/ascend/tasksink/runtime_utils.h"
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "hccl/hcom.h"
|
||||
#include "utils/log_adapter.h"
|
||||
#include "hccl/hccl_types.h"
|
||||
#include "utils/utils.h"
|
||||
|
||||
constexpr auto kHcomBroadcast = "hcom_broadcast_";
|
||||
constexpr auto kHcomAllGather = "hcom_all_gather_";
|
||||
constexpr auto kHcomAllReduce = "hcom_all_reduce_";
|
||||
constexpr auto kHcomReduceScatter = "hcom_reduce_scatter_";
|
||||
constexpr auto kUnderline = "_";
|
||||
namespace mindspore {
|
||||
namespace device {
|
||||
namespace ascend {
|
||||
namespace tasksink {
|
||||
bool RuntimeUtils::HcomBindModel(rtModel_t model, rtStream_t stream) {
|
||||
HcclResult ret = hcom_bind_model(model, stream);
|
||||
if (ret != HCCL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "Call hcom_bind_model failed, ret: 0x" << static_cast<int>(ret);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool RuntimeUtils::HcomUnbindModel(rtModel_t model) {
|
||||
HcclResult ret = hcom_unbind_model(model);
|
||||
if (ret != HCCL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "Call hcom_unbind_model failed, ret: 0x" << static_cast<int>(ret);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool RuntimeUtils::HcomDistribute(const std::shared_ptr<HcclTaskInfo> &task_info, rtStream_t stream) {
|
||||
MS_LOG(INFO) << "hccl distribute start";
|
||||
MS_EXCEPTION_IF_NULL(task_info);
|
||||
HcclResult ret;
|
||||
static uint32_t task_counter = 0;
|
||||
auto hccl_group = task_info->group();
|
||||
if (task_info->hccl_type() == kBroadcastOpName) {
|
||||
// call hcom broadcast interface to run op
|
||||
const string tag_broadcast = kHcomBroadcast + std::to_string(task_counter++) + kUnderline + std::to_string(0);
|
||||
ret = hcom_broadcast(tag_broadcast.c_str(), task_info->input_data_addr(), static_cast<u64>(task_info->count()),
|
||||
static_cast<HcclDataType>(task_info->data_type()), static_cast<u32>(task_info->root_id()),
|
||||
hccl_group.c_str(), stream);
|
||||
if (ret != HCCL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "hcom_broadcast fail, return ret: " << static_cast<int>(ret);
|
||||
return false;
|
||||
}
|
||||
} else if (task_info->hccl_type() == kAllGatherOpName) {
|
||||
// call hcom allgather interface to run op
|
||||
const string tag_all_gather = kHcomAllGather + std::to_string(task_counter++) + kUnderline + std::to_string(0);
|
||||
ret = hcom_all_gather(tag_all_gather.c_str(), task_info->input_data_addr(), task_info->output_data_addr(),
|
||||
static_cast<u64>(task_info->count()), static_cast<HcclDataType>(task_info->data_type()),
|
||||
hccl_group.c_str(), stream);
|
||||
if (ret != HCCL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "hcom_all_gather fail, return ret: " << ret;
|
||||
return false;
|
||||
}
|
||||
} else if (task_info->hccl_type() == kAllReduceOpName) {
|
||||
// call hcom allreduce interface to run op
|
||||
const string tag_all_reduce = kHcomAllReduce + std::to_string(task_counter++) + kUnderline + std::to_string(0);
|
||||
ret = hcom_all_reduce(tag_all_reduce.c_str(), task_info->input_data_addr(), task_info->output_data_addr(),
|
||||
static_cast<u64>(task_info->count()), static_cast<HcclDataType>(task_info->data_type()),
|
||||
static_cast<HcclReduceOp>(task_info->op_type()), hccl_group.c_str(), stream);
|
||||
if (ret != HCCL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "hcom_all_reduce fail, return ret: " << ret;
|
||||
return false;
|
||||
}
|
||||
} else if (task_info->hccl_type() == kReduceScatterOpName) {
|
||||
// call hcom reducescatter interface to run op
|
||||
const string tag_reduce_scatter =
|
||||
kHcomReduceScatter + std::to_string(task_counter++) + kUnderline + std::to_string(0);
|
||||
ret = hcom_reduce_scatter(tag_reduce_scatter.c_str(), task_info->input_data_addr(), task_info->output_data_addr(),
|
||||
static_cast<u64>(task_info->count()), static_cast<HcclDataType>(task_info->data_type()),
|
||||
static_cast<HcclReduceOp>(task_info->op_type()), hccl_group.c_str(), stream);
|
||||
if (ret != HCCL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "hcom_reduce_scatter fail, return ret: " << ret;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
} // namespace tasksink
|
||||
} // namespace ascend
|
||||
} // namespace device
|
||||
} // namespace mindspore
|
|
@ -1,39 +0,0 @@
|
|||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_TASKSINK_RUNTIME_UTILS_H_
|
||||
#define MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_TASKSINK_RUNTIME_UTILS_H_
|
||||
|
||||
#include <memory>
|
||||
#include "runtime/rt.h"
|
||||
#include "framework/ge_runtime/task_info.h"
|
||||
|
||||
using ge::model_runner::HcclTaskInfo;
|
||||
|
||||
namespace mindspore {
|
||||
namespace device {
|
||||
namespace ascend {
|
||||
namespace tasksink {
|
||||
class RuntimeUtils {
|
||||
public:
|
||||
static bool HcomBindModel(rtModel_t model, rtStream_t stream);
|
||||
static bool HcomUnbindModel(rtModel_t model);
|
||||
static bool HcomDistribute(const std::shared_ptr<HcclTaskInfo> &task_info, rtStream_t stream);
|
||||
};
|
||||
} // namespace tasksink
|
||||
} // namespace ascend
|
||||
} // namespace device
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_TASKSINK_RUNTIME_UTILS_H_
|
|
@ -0,0 +1,8 @@
|
|||
file(GLOB_RECURSE HCCL_ADAPTER_SRC_LIST ./*.cc)
|
||||
set_property(SOURCE ${HCCL_ADAPTER_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_HCCL_ADPT)
|
||||
add_library(hccl_adapter SHARED ${HCCL_ADAPTER_SRC_LIST})
|
||||
target_include_directories(hccl_adapter PRIVATE ${CMAKE_BINARY_DIR}/proto/ge)
|
||||
add_dependencies(hccl_adapter graph)
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
target_link_options(hccl_adapter PRIVATE -Wl,-init,mindspore_log_init)
|
||||
endif ()
|
|
@ -0,0 +1,129 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "runtime/hccl_adapter/converter.h"
|
||||
#include <map>
|
||||
#include <algorithm>
|
||||
#include <tuple>
|
||||
#define google ascend_private
|
||||
#include "register/ops_kernel_builder_registry.h"
|
||||
#include "graph/compute_graph.h"
|
||||
#include "graph/debug/ge_attr_define.h"
|
||||
#undef google
|
||||
#include "backend/session/anf_runtime_algorithm.h"
|
||||
#include "utils/log_adapter.h"
|
||||
#include "utils/ms_utils.h"
|
||||
#include "mindspore/core/base/core_ops.h"
|
||||
#include "transform/graph_ir/util.h"
|
||||
|
||||
static constexpr char kGeOpNameHcclAllRudece[] = "HcomAllReduce";
|
||||
static constexpr char kGeOpNameHcclAllGather[] = "HcomAllGather";
|
||||
static constexpr char kGeOpNameHcclBroadcast[] = "HcomBroadcast";
|
||||
static constexpr char kGeOpNameHcclReduceScatter[] = "HcomReduceScatter";
|
||||
static constexpr char kGeNodeAttrUsedStreamNum[] = "used_stream_num";
|
||||
static constexpr char kStubDataStructureName[] = "any_name_can_work";
|
||||
|
||||
static ge::DataType ConvertHcclDTypeToGeDType(HcclDataType datatype) {
|
||||
static map<HcclDataType, ge::DataType> kHcomDataTypeMap = {
|
||||
{HCCL_DATA_TYPE_FP32, ge::DT_FLOAT},
|
||||
{HCCL_DATA_TYPE_FP16, ge::DT_FLOAT16},
|
||||
{HCCL_DATA_TYPE_INT8, ge::DT_INT8},
|
||||
{HCCL_DATA_TYPE_INT32, ge::DT_INT32},
|
||||
};
|
||||
|
||||
auto iter = kHcomDataTypeMap.find(datatype);
|
||||
if (iter == kHcomDataTypeMap.end()) {
|
||||
MS_LOG(EXCEPTION) << "Unknown hccl data type " << datatype;
|
||||
}
|
||||
|
||||
return iter->second;
|
||||
}
|
||||
|
||||
namespace mindspore::hccl {
|
||||
std::string GetGeNodeName(const CNodePtr &cnode) {
|
||||
MS_EXCEPTION_IF_NULL(cnode);
|
||||
if (IsPrimitiveCNode(cnode, prim::kPrimAllReduce)) {
|
||||
return kGeOpNameHcclAllRudece;
|
||||
} else if (IsPrimitiveCNode(cnode, prim::kPrimAllGather)) {
|
||||
return kGeOpNameHcclAllGather;
|
||||
} else if (IsPrimitiveCNode(cnode, prim::kPrimBroadcast)) {
|
||||
return kGeOpNameHcclBroadcast;
|
||||
} else if (IsPrimitiveCNode(cnode, prim::kPrimReduceScatter)) {
|
||||
return kGeOpNameHcclReduceScatter;
|
||||
}
|
||||
|
||||
MS_LOG(EXCEPTION) << "Unknown hccl node type " << cnode->DebugString();
|
||||
}
|
||||
|
||||
std::tuple<ge::NodePtr, ge::ComputeGraphPtr> GenerateStubGeNode(const AnfNodePtr &anf_node, HcclDataType datatype) {
|
||||
MS_EXCEPTION_IF_NULL(anf_node);
|
||||
auto cnode = anf_node->cast<CNodePtr>();
|
||||
MS_EXCEPTION_IF_NULL(cnode);
|
||||
std::string ge_node_name = GetGeNodeName(cnode);
|
||||
|
||||
ge::OpDescPtr op_desc = std::make_shared<ge::OpDesc>(kStubDataStructureName, ge_node_name);
|
||||
MS_EXCEPTION_IF_NULL(op_desc);
|
||||
for (size_t i = 1; i < cnode->size(); ++i) {
|
||||
auto &input = cnode->input(i);
|
||||
std::vector<int64_t> ge_shape;
|
||||
auto ms_shape = AnfAlgo::GetOutputInferShape(input, 0);
|
||||
std::transform(ms_shape.begin(), ms_shape.end(), std::back_inserter(ge_shape),
|
||||
[](size_t in) { return static_cast<int64_t>(in); });
|
||||
op_desc->AddInputDesc(
|
||||
ge::GeTensorDesc(ge::GeShape(ge_shape), ge::Format::FORMAT_NCHW,
|
||||
transform::TransformUtil::ConvertDataType(AnfAlgo::GetOutputInferDataType(input, 0))));
|
||||
}
|
||||
|
||||
// set node data type
|
||||
bool ret = ge::AttrUtils::SetDataType(*op_desc, ge::HCOM_ATTR_DATA_TYPE, ConvertHcclDTypeToGeDType(datatype));
|
||||
if (!ret) {
|
||||
MS_LOG(EXCEPTION) << "Set attr " << ge::HCOM_ATTR_DATA_TYPE << " for ge node of " << cnode->DebugString()
|
||||
<< " failed.";
|
||||
}
|
||||
|
||||
// set rank size
|
||||
if (AnfAlgo::HasNodeAttr(kAttrRankSize, cnode)) {
|
||||
auto rank_size = AnfAlgo::GetNodeAttr<int64_t>(cnode, kAttrRankSize);
|
||||
ret = ge::AttrUtils::SetInt(*op_desc, ge::HCOM_ATTR_RANK_SIZE, rank_size);
|
||||
if (!ret) {
|
||||
MS_LOG(EXCEPTION) << "Set attr " << ge::HCOM_ATTR_RANK_SIZE << " for ge node of " << cnode->DebugString()
|
||||
<< " failed.";
|
||||
}
|
||||
}
|
||||
|
||||
ge::ComputeGraphPtr ge_graph = std::make_shared<ge::ComputeGraph>(kStubDataStructureName);
|
||||
MS_EXCEPTION_IF_NULL(ge_graph);
|
||||
auto ge_node = ge_graph->AddNode(op_desc);
|
||||
return {ge_node, ge_graph};
|
||||
}
|
||||
|
||||
// Translate one GE domi task definition into mindspore's HcclTaskInfo,
// pulling the workspace size and used-stream count off the op descriptor.
HcclTaskInfo ParseDomiTask(const ge::OpDescPtr &op, const domi::TaskDef &task_def) {
  MS_EXCEPTION_IF_NULL(op);
  // Exactly one workspace is expected for an hccl node.
  const auto &ws_bytes = op->GetWorkspaceBytes();
  if (ws_bytes.size() != 1) {
    MS_LOG(EXCEPTION) << "Unexpected workspace size " << ws_bytes.size();
  }
  // Number of streams the generated task occupies, written by GE during task generation.
  int64_t used_stream_num = 0;
  if (!ge::AttrUtils::GetInt(*op, kGeNodeAttrUsedStreamNum, used_stream_num)) {
    MS_LOG(EXCEPTION) << "Get attr " << kGeNodeAttrUsedStreamNum << " for ge node " << op->GetType() << " failed.";
  }
  return {task_def.private_def(), ws_bytes[0], used_stream_num};
}
|
||||
} // namespace mindspore::hccl
|
|
@ -0,0 +1,38 @@
|
|||
/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_RUNTIME_HCCL_ADAPTER_CONVERTER_H
#define MINDSPORE_RUNTIME_HCCL_ADAPTER_CONVERTER_H

#include <string>
#include <vector>
#include <memory>
#include <tuple>
// GE uses ascend_private protobuf; remap the protobuf namespace only around
// the GE/proto headers so it does not leak into the rest of this header.
#define google ascend_private
#include "graph/node.h"
#include "common/opskernel/ops_kernel_info_types.h"
#include "proto/task.pb.h"
#undef google
#include "runtime/hccl_adapter/hccl_adapter.h"
#include "mindspore/core/ir/anf.h"

namespace mindspore::hccl {
// Build a stub GE node for an hccl cnode; the returned graph ptr keeps the
// node alive (GE nodes are owned by their ComputeGraph).
std::tuple<ge::NodePtr, ge::ComputeGraphPtr> GenerateStubGeNode(const AnfNodePtr &anf_node, HcclDataType datatype);
// Translate one GE domi task definition into an HcclTaskInfo.
HcclTaskInfo ParseDomiTask(const ge::OpDescPtr &op, const domi::TaskDef &task_def);
// Map a MindSpore hccl cnode to its GE op type name.
std::string GetGeNodeName(const CNodePtr &cnode);
}  // namespace mindspore::hccl
#endif  // MINDSPORE_RUNTIME_HCCL_ADAPTER_CONVERTER_H
|
|
@ -0,0 +1,165 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "runtime/hccl_adapter/hccl_adapter.h"
|
||||
#include <map>
|
||||
#include <algorithm>
|
||||
#define google ascend_private
|
||||
#include "register/ops_kernel_builder_registry.h"
|
||||
#include "common/opskernel/ops_kernel_info_store.h"
|
||||
#include "external/ge/ge_api_types.h"
|
||||
#undef google
|
||||
#include "utils/log_adapter.h"
|
||||
#include "utils/ms_utils.h"
|
||||
#include "runtime/hccl_adapter/converter.h"
|
||||
#include "runtime/hccl_adapter/hcom_graph_adaptor.h"
|
||||
|
||||
static constexpr const char *kHcclOpsKernelInfoStore = "ops_kernel_info_hccl";
|
||||
static constexpr const char *kHcclDeployModeEnv = "DEPLOY_MODE";
|
||||
// following global var, thread safety is not guaranteed
|
||||
static std::shared_ptr<ge::OpsKernelInfoStore> ops_kernel_info_store = nullptr;
|
||||
static ge::OpsKernelBuilderPtr ops_kernel_builder = nullptr;
|
||||
|
||||
namespace mindspore::hccl {
|
||||
static std::map<std::string, std::string> GenHcclOptions(uint32_t device_id, std::string_view rank_id,
|
||||
std::string_view rank_file) {
|
||||
auto env_deploy_mode = common::GetEnv(kHcclDeployModeEnv);
|
||||
if (env_deploy_mode.empty()) {
|
||||
MS_LOG(WARNING) << kHcclDeployModeEnv << " is not set in ENV. Now set to default value 0";
|
||||
env_deploy_mode = "0";
|
||||
}
|
||||
|
||||
return std::map<std::string, std::string>({{ge::OPTION_EXEC_IS_USEHCOM, "1"},
|
||||
{ge::OPTION_EXEC_IS_USEHVD, "0"},
|
||||
{ge::OPTION_EXEC_HCCL_FLAG, "1"},
|
||||
{ge::OPTION_EXEC_DEVICE_ID, std::to_string(device_id)},
|
||||
{ge::OPTION_EXEC_RANK_ID, rank_id.data()},
|
||||
{ge::OPTION_EXEC_POD_NAME, rank_id.data()},
|
||||
{ge::OPTION_EXEC_RANK_TABLE_FILE, rank_file.data()},
|
||||
{ge::OPTION_GRAPH_RUN_MODE, "1"},
|
||||
{ge::OPTION_EXEC_HCCL_FLAG, "1"},
|
||||
{ge::OPTION_EXEC_DEPLOY_MODE, env_deploy_mode}});
|
||||
}
|
||||
|
||||
// Initialize the hccl adapter. The sequence is order-sensitive:
//   1) fetch the single registered OpsKernelBuilder (expected: hccl's) and
//      initialize it,
//   2) initialize the hccl plugin shared object via the extern "C"
//      ::Initialize entry point,
//   3) fetch and initialize the hccl OpsKernelInfoStore from the plugin.
// On any failure MS_LOG(EXCEPTION) throws; returns true on success.
// NOTE(review): writes the file-level globals ops_kernel_builder /
// ops_kernel_info_store — thread safety is explicitly not guaranteed.
bool InitHccl(uint32_t device_id, std::string_view rank_id, std::string_view rank_file) {
  MS_LOG(INFO) << "Start init hccl adapter.";
  // get ops_kernel_builder
  std::map<std::string, ge::OpsKernelBuilderPtr> all_builders = ge::OpsKernelBuilderRegistry::GetInstance().GetAll();
  if (all_builders.size() != 1) {
    // Exactly one builder (the hccl one) must be registered at this point.
    MS_LOG(EXCEPTION) << "Builders size should be 1 (hccl builder), but is " << all_builders.size();
  }

  MS_LOG(INFO) << "Get builder " << all_builders.begin()->first;
  ops_kernel_builder = all_builders.begin()->second;
  MS_EXCEPTION_IF_NULL(ops_kernel_builder);
  // init ops_kernel_builder
  auto options = GenHcclOptions(device_id, rank_id, rank_file);
  auto ret = ops_kernel_builder->Initialize(options);
  if (ret != ge::SUCCESS) {
    MS_LOG(EXCEPTION) << "Init builder failed, ret = " << ret;
  }

  // get ops_kernel_info_store
  // ::Initialize is the extern "C" plugin entry point (hcom_graph_adaptor.h),
  // not a recursive call into this function.
  ret = ::Initialize(options);
  if (ret != ge::SUCCESS) {
    MS_LOG(EXCEPTION) << "Init plugin so failed, ret = " << ret;
  }

  std::map<std::string, std::shared_ptr<ge::OpsKernelInfoStore>> all_ops_kernel_info_stores;
  ::GetOpsKernelInfoStores(all_ops_kernel_info_stores);
  // Pick the hccl store by name; the plugin may expose others.
  for (auto &[name, ptr] : all_ops_kernel_info_stores) {
    if (name == kHcclOpsKernelInfoStore) {
      ops_kernel_info_store = ptr;
      break;
    }
  }
  MS_EXCEPTION_IF_NULL(ops_kernel_info_store);
  ret = ops_kernel_info_store->Initialize(options);
  if (ret != ge::SUCCESS) {
    MS_LOG(EXCEPTION) << "Init info store failed, ret = " << ret;
  }
  MS_LOG(INFO) << "Init hccl adapter success.";
  return true;
}
|
||||
|
||||
bool FinalizeHccl() {
|
||||
MS_LOG(INFO) << "Start destroy hccl adapter.";
|
||||
if (ops_kernel_info_store != nullptr) {
|
||||
auto ret = ops_kernel_info_store->Finalize();
|
||||
if (ret != ge::SUCCESS) {
|
||||
MS_LOG(ERROR) << "Destory info store failed, ret = " << ret;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (ops_kernel_builder != nullptr) {
|
||||
auto ret = ops_kernel_builder->Finalize();
|
||||
if (ret != ge::SUCCESS) {
|
||||
MS_LOG(ERROR) << "Destory builder failed, ret = " << ret;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
::Finalize();
|
||||
ge::OpsKernelBuilderRegistry::GetInstance().UnregisterAll();
|
||||
ops_kernel_info_store.reset();
|
||||
ops_kernel_builder.reset();
|
||||
MS_LOG(INFO) << "Destroy hccl adapter success.";
|
||||
return true;
|
||||
}
|
||||
|
||||
bool GenTask(const AnfNodePtr &node, HcclDataType datatype, std::vector<HcclTaskInfo> *task_info_lists) {
|
||||
MS_EXCEPTION_IF_NULL(ops_kernel_builder);
|
||||
MS_EXCEPTION_IF_NULL(task_info_lists);
|
||||
MS_LOG(INFO) << "Start generate task for hccl node " << node->DebugString();
|
||||
auto [ge_node, ge_graph] = GenerateStubGeNode(node, datatype);
|
||||
MS_EXCEPTION_IF_NULL(ge_node);
|
||||
auto op = ge_node->GetOpDesc();
|
||||
MS_EXCEPTION_IF_NULL(op);
|
||||
|
||||
MS_LOG(INFO) << "Start to call CalcOpRunningParam";
|
||||
ge::Status ret = ops_kernel_builder->CalcOpRunningParam(*ge_node);
|
||||
if (ret != ge::SUCCESS) {
|
||||
MS_LOG(ERROR) << "OpsKernelBuilder CalcOpRunningParam failed, ret = " << ret;
|
||||
return false;
|
||||
}
|
||||
MS_LOG(INFO) << "Start to call GenerateTask";
|
||||
ge::RunContext unused_ctx;
|
||||
std::vector<domi::TaskDef> domi_tasks;
|
||||
ret = ops_kernel_builder->GenerateTask(*ge_node, unused_ctx, domi_tasks);
|
||||
if (ret != ge::SUCCESS) {
|
||||
MS_LOG(ERROR) << "OpsKernelBuilder GenerateTask failed, ret = " << ret;
|
||||
return false;
|
||||
}
|
||||
|
||||
task_info_lists->clear();
|
||||
std::transform(domi_tasks.begin(), domi_tasks.end(), std::back_inserter(*task_info_lists),
|
||||
[&op](const domi::TaskDef &task_def) -> HcclTaskInfo { return ParseDomiTask(op, task_def); });
|
||||
MS_LOG(INFO) << "Generate task for node " << node->DebugString() << " success.";
|
||||
ge_graph.reset();
|
||||
return true;
|
||||
}
|
||||
|
||||
// Intentional no-op: running params for hccl nodes are computed inside GenTask
// (via the GE builder's CalcOpRunningParam on the stub node), so this public
// hook always reports success.
bool CalcOpRunningParam(const AnfNodePtr &node) { return true; }
|
||||
|
||||
// Expose the raw hccl OpsKernelInfoStore pointer (type-erased to void* so
// callers need not include GE headers). Null until InitHccl has succeeded;
// ownership stays with this module's global shared_ptr.
void *GetHcclOpsKernelInfoStore() { return ops_kernel_info_store.get(); }
|
||||
|
||||
std::string GetHcclType(const AnfNodePtr &node) {
|
||||
MS_EXCEPTION_IF_NULL(node);
|
||||
auto cnode = node->cast<CNodePtr>();
|
||||
MS_EXCEPTION_IF_NULL(cnode);
|
||||
return GetGeNodeName(cnode);
|
||||
}
|
||||
} // namespace mindspore::hccl
|
|
@ -0,0 +1,43 @@
|
|||
/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_RUNTIME_HCCL_ADAPTER_HCCL_ADAPTER_H
#define MINDSPORE_RUNTIME_HCCL_ADAPTER_HCCL_ADAPTER_H

#include <string>
#include <vector>
#include <memory>
#include "mindspore/core/ir/anf.h"
#include "external/hccl/hccl_types.h"

// Exported symbol marker for the hccl_adapter shared library; undefined at the
// end of this header so it does not leak to includers.
#define MS_API __attribute__((visibility("default")))

namespace mindspore::hccl {
// One generated hccl task, as parsed from a GE domi task definition.
struct MS_API HcclTaskInfo {
  std::string private_def;  // opaque per-task payload forwarded to the runtime
  int64_t workspace_size;   // workspace bytes required by the task
  int64_t stream_num;       // number of streams the task occupies
};

// Initialize the adapter (builder + plugin + info store). Throws on failure.
MS_API bool InitHccl(uint32_t device_id, std::string_view rank_id, std::string_view rank_file);
// Tear down in reverse order; returns false if a GE Finalize call fails.
MS_API bool FinalizeHccl();
// Generate hccl tasks for one node into task_info_lists (cleared first).
MS_API bool GenTask(const AnfNodePtr &node, HcclDataType datatype, std::vector<HcclTaskInfo> *task_info_lists);
// No-op hook; see implementation for rationale.
MS_API bool CalcOpRunningParam(const AnfNodePtr &node);
// Raw pointer to the hccl OpsKernelInfoStore (null before InitHccl).
MS_API void *GetHcclOpsKernelInfoStore();
// GE op type name for an hccl node.
MS_API std::string GetHcclType(const AnfNodePtr &node);
}  // namespace mindspore::hccl
#undef MS_API
#endif  // MINDSPORE_RUNTIME_HCCL_ADAPTER_HCCL_ADAPTER_H
|
|
@ -0,0 +1,32 @@
|
|||
/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_RUNTIME_HCCL_ADAPTER_HCOM_GRAPH_ADAPTOR_H
#define MINDSPORE_RUNTIME_HCCL_ADAPTER_HCOM_GRAPH_ADAPTOR_H

#include <string>
#include <map>
#include <memory>
#include "mindspore/core/ir/anf.h"
#include "common/opskernel/ops_kernel_info_store.h"

// C entry points exported by the hccl plugin shared object (hcom graph
// adaptor). Declared here and resolved at link/load time; not implemented in
// this module.
extern "C" {
// Initialize the plugin with GE-style options.
ge::Status Initialize(const std::map<std::string, std::string> &);
// Tear down the plugin.
ge::Status Finalize();
// Fill the map with the plugin's ops kernel info stores, keyed by name.
void GetOpsKernelInfoStores(std::map<std::string, std::shared_ptr<ge::OpsKernelInfoStore>> &);
}

#endif  // MINDSPORE_RUNTIME_HCCL_ADAPTER_HCOM_GRAPH_ADAPTOR_H
|
|
@ -181,7 +181,8 @@ static const char *GetSubModuleName(SubModuleId module_id) {
|
|||
"VM", // SM_VM
|
||||
"PROFILER", // SM_PROFILER
|
||||
"PS", // SM_PS
|
||||
"LITE" // SM_LITE
|
||||
"LITE", // SM_LITE
|
||||
"HCCL_ADPT" // SM_HCCL_ADPT
|
||||
};
|
||||
|
||||
return sub_module_names[module_id % NUM_SUBMODUES];
|
||||
|
|
|
@ -125,6 +125,7 @@ enum SubModuleId : int {
|
|||
SM_PROFILER, // profiler
|
||||
SM_PS, // Parameter Server
|
||||
SM_LITE, // LITE
|
||||
SM_HCCL_ADPT, // Hccl Adapter
|
||||
NUM_SUBMODUES // number of submodules
|
||||
};
|
||||
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
*/
|
||||
#include <vector>
|
||||
#include "framework/ge_runtime/model_runner.h"
|
||||
#include "runtime/device/ascend/tasksink/runtime_utils.h"
|
||||
#include "runtime/hccl_adapter/hccl_adapter.h"
|
||||
|
||||
namespace ge {
|
||||
namespace model_runner {
|
||||
|
@ -60,15 +60,12 @@ const std::map<std::string, std::shared_ptr<RuntimeInfo>> &ModelRunner::GetRunti
|
|||
} // namespace ge
|
||||
|
||||
namespace mindspore {
namespace device {
namespace ascend {
namespace tasksink {
// UT stubs: always report success without touching the runtime.
bool RuntimeUtils::HcomBindModel(rtModel_t model, rtStream_t stream) { return true; }

bool RuntimeUtils::HcomUnbindModel(rtModel_t model) { return true; }

bool RuntimeUtils::HcomDistribute(const std::shared_ptr<HcclTaskInfo> &task_info, rtStream_t stream) { return true; }
}  // namespace tasksink
}  // namespace ascend
}  // namespace device
namespace hccl {
// UT stubs for the hccl adapter public API (see hccl_adapter.h); no-ops that
// succeed so tests can run without an Ascend device or hccl plugin.
bool InitHccl(uint32_t, std::string_view, std::string_view) { return true; }
bool FinalizeHccl() { return true; }
bool GenTask(const AnfNodePtr &, HcclDataType, std::vector<HcclTaskInfo> *) { return true; }
bool CalcOpRunningParam(const AnfNodePtr &) { return true; }
void *GetHcclOpsKernelInfoStore() { return nullptr; }
std::string GetHcclType(const AnfNodePtr &) { return ""; }
}  // namespace hccl
}  // namespace mindspore
|
||||
|
|
Loading…
Reference in New Issue