diff --git a/mindspore/ccsrc/CMakeLists.txt b/mindspore/ccsrc/CMakeLists.txt index cc062c9c097..9ccfe9f4473 100644 --- a/mindspore/ccsrc/CMakeLists.txt +++ b/mindspore/ccsrc/CMakeLists.txt @@ -267,6 +267,8 @@ if(ENABLE_D) find_library(REGISTER register ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) find_library(PLATFORM platform ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) find_library(OPTILING optiling ${ASCEND_OPP_PATH} ${ASCEND_TOOLKIT_OPP_PATH}) + find_library(ACL ascendcl ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) + # hccl_adpter find_library(HCCL_ADPTER hcom_graph_adaptor ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) find_library(HCCL_RA ra ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) @@ -281,7 +283,7 @@ if(ENABLE_D) mindspore::protobuf -Wl,--end-group) target_link_libraries(mindspore ge_runtime ${CCE_LIB} ${RUNTIME_LIB} ${TSDCLIENT} ${HCCL} ${DATATRANSFER} ${HCCL_ADPTER} ${REGISTER} -Wl,--no-as-needed ${OPTILING} ${HCCL_BUILDER} - ${HCCL_RA} ${PLATFORM}) + ${HCCL_RA} ${PLATFORM} ${ACL}) target_link_libraries(mindspore -Wl,--start-group proto_input mindspore::protobuf -Wl,--end-group) elseif(CMAKE_SYSTEM_NAME MATCHES "Windows") target_link_libraries(mindspore -Wl,--start-group proto_input mindspore::protobuf mindspore::sentencepiece diff --git a/mindspore/ccsrc/utils/context/context_extends.cc b/mindspore/ccsrc/utils/context/context_extends.cc index b329aba52b6..6e3edc87baa 100644 --- a/mindspore/ccsrc/utils/context/context_extends.cc +++ b/mindspore/ccsrc/utils/context/context_extends.cc @@ -22,7 +22,6 @@ #include #include "pybind11/pybind11.h" - #include "utils/ms_utils.h" #include "utils/convert_utils_base.h" @@ -46,7 +45,7 @@ bool OpenTsd(const std::shared_ptr &ms_context_ptr) { } if (ms_context_ptr->get_param(MS_CTX_TSD_REF)) { - MS_LOG(DEBUG) << "TDT Dataset client is already opened."; + MS_LOG(DEBUG) << "ACLTDT Dataset client is already opened."; ms_context_ptr->increase_param(MS_CTX_TSD_REF); return true; } @@ -56,10 +55,8 @@ bool OpenTsd(const std::shared_ptr &ms_context_ptr) { return true; } - unsigned int device_id; - unsigned int rank_size = 1; - - device_id = ms_context_ptr->get_param(MS_CTX_DEVICE_ID); + uint32_t rank_size = 1; + uint32_t device_id = ms_context_ptr->get_param(MS_CTX_DEVICE_ID); auto rank_size_env = common::GetEnv("RANK_SIZE"); if (rank_size_env.empty()) { @@ -81,14 +78,14 @@ bool OpenTsd(const std::shared_ptr &ms_context_ptr) { } ms_context_ptr->increase_param(MS_CTX_TSD_REF); #ifdef ENABLE_TDTQUE - int32_t initStatus = tdt::TdtHostInit(device_id); - if (initStatus != TDT_OK_CODE) { - MS_LOG(EXCEPTION) << "Init tsd failed, status = " << initStatus << "."; + acltdtChannelHandle *acl_handle = ms_context_ptr->get_acl_tdt_channel_handle(); + if (acl_handle == nullptr) { + MS_LOG(EXCEPTION) << "Get acltdt handle failed"; return false; } - ms_context_ptr->tdt_print_ = std::thread(TensorPrint()); + ms_context_ptr->acl_tdt_print = std::thread(TensorPrint(acl_handle)); #endif - MS_LOG(INFO) << "Open and init tsd successful, tsd reference = " + MS_LOG(INFO) << "Get the acltdt handle successful, tsd reference = " << ms_context_ptr->get_param(MS_CTX_TSD_REF) << "."; return true; } @@ -103,28 +100,34 @@ bool CloseTsd(const std::shared_ptr &ms_context_ptr, bool force) { ms_context_ptr->decrease_param(MS_CTX_TSD_REF); if (force || ms_context_ptr->get_param(MS_CTX_TSD_REF) == 0) { ms_context_ptr->set_param(MS_CTX_TSD_REF, 0); + #ifdef ENABLE_TDTQUE - int32_t stopStatus = tdt::TdtHostStop(KNpuLog); - if (stopStatus != TDT_OK_CODE) { - MS_LOG(EXCEPTION) << "Stop tsd failed, status = " << stopStatus << "."; - return false; + acltdtChannelHandle *acl_handle = ms_context_ptr->get_acl_tdt_channel_handle(); + aclError stopStatus = acltdtStopChannel(acl_handle); + if (stopStatus != ACL_SUCCESS) { + MS_LOG(ERROR) << "Failed stop acl data channel for host queue "; + } else { + MS_LOG(INFO) << "Succeed stop acl data channel for host queue "; } + MS_LOG(INFO) << "Succeed run cancellation callback of out-feed dequeue op "; + py::gil_scoped_release gil_release; - int32_t destroyStatus = tdt::TdtHostDestroy(); - if (destroyStatus != TDT_OK_CODE) { - MS_LOG(EXCEPTION) << "Destroy tsd failed, status = " << destroyStatus << "."; - return false; + aclError destrodStatus = acltdtDestroyChannel(acl_handle); + if (destrodStatus != ACL_SUCCESS) { + MS_LOG(ERROR) << "Failed destroy acl channel for out-feed dequeue op "; + } else { + MS_LOG(INFO) << "Succeed destroy acl channel for out-feed dequeue op "; } try { - if (ms_context_ptr->tdt_print_.joinable()) { - MS_LOG(INFO) << "join tdt host receive process"; - ms_context_ptr->tdt_print_.join(); + if (ms_context_ptr->acl_tdt_print.joinable()) { + MS_LOG(INFO) << "join acl tdt host receive process"; + ms_context_ptr->acl_tdt_print.join(); } } catch (const std::exception &e) { MS_LOG(ERROR) << "tdt thread join failed: " << e.what(); } #endif - auto device_id = ms_context_ptr->get_param(MS_CTX_DEVICE_ID); + uint32_t device_id = ms_context_ptr->get_param(MS_CTX_DEVICE_ID); auto ret = rtDeviceReset(device_id); if (ret != RT_ERROR_NONE) { MS_LOG(EXCEPTION) << "Device " << device_id << " call rtDeviceReset failed, ret[" << static_cast(ret) << "]"; @@ -133,10 +136,9 @@ bool CloseTsd(const std::shared_ptr &ms_context_ptr, bool force) { ms_context_ptr->set_param(MS_CTX_IS_PYNATIVE_GE_INIT, false); MS_LOG(INFO) << "Call rtDeviceReset, destroy and close tsd successful, ret[" << static_cast(ret) << "]"; } else { - MS_LOG(DEBUG) << "TDT Dataset client is used, no need to close, tsd reference = " + MS_LOG(DEBUG) << "Acltdt Dataset client is used, no need to close, tsd reference = " << ms_context_ptr->get_param(MS_CTX_TSD_REF) << "."; } - return true; } #else @@ -308,6 +310,7 @@ bool PynativeInitGe(const std::shared_ptr &ms_context_ptr) { ms_context_ptr->get_param(MS_CTX_GE_REF) || ms_context_ptr->get_param(MS_CTX_TSD_REF)) { return true; } + (void)OpenTsd(ms_context_ptr); (void)InitGe(ms_context_ptr); ms_context_ptr->set_param(MS_CTX_IS_PYNATIVE_GE_INIT, true); diff --git a/mindspore/ccsrc/utils/context/context_extends.h b/mindspore/ccsrc/utils/context/context_extends.h index 36e6036e173..3d8ca425643 100644 --- a/mindspore/ccsrc/utils/context/context_extends.h +++ b/mindspore/ccsrc/utils/context/context_extends.h @@ -24,8 +24,8 @@ #include "utils/tensorprint_utils.h" #ifndef NO_DLIB +#include "acl/acl_tdt.h" #include "tdt/tsd_client.h" -#include "tdt/tdt_host_interface.h" #include "tdt/data_common.h" #include "runtime/dev.h" #endif @@ -35,8 +35,8 @@ namespace mindspore { namespace context { -bool OpenTsd(const std::shared_ptr &inst_context); -bool CloseTsd(const std::shared_ptr &inst_context, bool force = false); +bool OpenTsd(const std::shared_ptr &ms_context_ptr); +bool CloseTsd(const std::shared_ptr &ms_context_ptr, bool force = false); void SetHcclOptions(const std::shared_ptr &inst_context, std::map *ge_options); void GetGeOptions(const std::shared_ptr &inst_context, std::map *ge_options); void SetDisableReuseMemoryFlag(std::map *ge_options); diff --git a/mindspore/ccsrc/utils/tensorprint_utils.cc b/mindspore/ccsrc/utils/tensorprint_utils.cc index a92d1e2e463..173843a5d81 100644 --- a/mindspore/ccsrc/utils/tensorprint_utils.cc +++ b/mindspore/ccsrc/utils/tensorprint_utils.cc @@ -1,5 +1,5 @@ /** - * Copyright 2020 Huawei Technologies Co., Ltd + * Copyright 2020-2021 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,75 +23,48 @@ #include "pybind11/pybind11.h" #include "utils/ms_utils.h" #include "utils/shape_utils.h" -#ifndef NO_DLIB -#include "tdt/tsd_client.h" -#include "tdt/tdt_host_interface.h" -#include "tdt/data_common.h" -#endif namespace py = pybind11; namespace mindspore { -const char kShapeSeperator[] = ","; -const char kShapeScalar[] = "[0]"; -const char kShapeNone[] = "[]"; -static std::map print_type_map = { - {"int8_t", TypeId::kNumberTypeInt8}, {"uint8_t", TypeId::kNumberTypeUInt8}, - {"int16_t", TypeId::kNumberTypeInt16}, {"uint16_t", TypeId::kNumberTypeUInt16}, - {"int32_t", TypeId::kNumberTypeInt32}, {"uint32_t", TypeId::kNumberTypeUInt32}, - {"int64_t", TypeId::kNumberTypeInt64}, {"uint64_t", TypeId::kNumberTypeUInt64}, - {"float16", TypeId::kNumberTypeFloat16}, {"float", TypeId::kNumberTypeFloat32}, - {"double", TypeId::kNumberTypeFloat64}, {"bool", TypeId::kNumberTypeBool}}; -static std::map type_size_map = { - {"int8_t", sizeof(int8_t)}, {"uint8_t", sizeof(uint8_t)}, {"int16_t", sizeof(int16_t)}, - {"uint16_t", sizeof(uint16_t)}, {"int32_t", sizeof(int32_t)}, {"uint32_t", sizeof(uint32_t)}, - {"int64_t", sizeof(int64_t)}, {"uint64_t", sizeof(uint64_t)}, {"float16", sizeof(float) / 2}, - {"float", sizeof(float)}, {"double", sizeof(double)}, {"bool", sizeof(bool)}}; +#ifndef NO_DLIB +static std::map print_acl_data_type_map = { + {ACL_INT8, TypeId::kNumberTypeInt8}, {ACL_UINT8, TypeId::kNumberTypeUInt8}, + {ACL_INT16, TypeId::kNumberTypeInt16}, {ACL_UINT16, TypeId::kNumberTypeUInt16}, + {ACL_INT32, TypeId::kNumberTypeInt32}, {ACL_UINT32, TypeId::kNumberTypeUInt32}, + {ACL_INT64, TypeId::kNumberTypeInt64}, {ACL_UINT64, TypeId::kNumberTypeUInt64}, + {ACL_FLOAT16, TypeId::kNumberTypeFloat16}, {ACL_FLOAT, TypeId::kNumberTypeFloat32}, + {ACL_DOUBLE, TypeId::kNumberTypeFloat64}, {ACL_BOOL, TypeId::kNumberTypeBool}}; -std::string GetParseType(const std::string &tensorType_) { - static const std::map print_parse_map = { - {"int8_t", "Int8"}, {"uint8_t", "Uint8"}, {"int16_t", "Int16"}, {"uint16_t", "Uint16"}, - {"int32_t", "Int32"}, {"uint32_t", "Uint32"}, {"int64_t", "Int64"}, {"uint64_t", "Uint64"}, - {"float16", "Float16"}, {"float", "Float32"}, {"double", "Float64"}, {"bool", "Bool"}}; - auto type_iter = print_parse_map.find(tensorType_); - if (type_iter == print_parse_map.end()) { - MS_LOG(EXCEPTION) << "type of tensor need to print is not support " << tensorType_; +static std::map acl_data_type_size_map = { + {ACL_INT8, sizeof(int8_t)}, {ACL_UINT8, sizeof(uint8_t)}, {ACL_INT16, sizeof(int16_t)}, + {ACL_UINT16, sizeof(uint16_t)}, {ACL_INT32, sizeof(int32_t)}, {ACL_UINT32, sizeof(uint32_t)}, + {ACL_INT64, sizeof(int64_t)}, {ACL_UINT64, sizeof(uint64_t)}, {ACL_FLOAT16, sizeof(float) / 2}, + {ACL_FLOAT, sizeof(float)}, {ACL_DOUBLE, sizeof(double)}, {ACL_BOOL, sizeof(bool)}}; + +std::string GetParseType(const aclDataType &acl_data_type) { + static const std::map print_tensor_parse_map = { + {ACL_INT8, "Int8"}, {ACL_UINT8, "Uint8"}, {ACL_INT16, "Int16"}, {ACL_UINT16, "Uint16"}, + {ACL_INT32, "Int32"}, {ACL_UINT32, "Uint32"}, {ACL_INT64, "Int64"}, {ACL_UINT64, "Uint64"}, + {ACL_FLOAT16, "Float16"}, {ACL_FLOAT, "Float32"}, {ACL_DOUBLE, "Float64"}, {ACL_BOOL, "Bool"}}; + auto type_iter = print_tensor_parse_map.find(acl_data_type); + if (type_iter == print_tensor_parse_map.end()) { + MS_LOG(EXCEPTION) << "type of tensor need to print is not support " << acl_data_type; } return type_iter->second; } -bool ParseTensorShape(const std::string &input_shape_str, ShapeVector *const tensor_shape, size_t *dims) { - if (tensor_shape == nullptr) { - return false; - } - MS_EXCEPTION_IF_NULL(dims); - std::string shape_str = input_shape_str; - if (shape_str.size() <= 2) { - return false; - } - (void)shape_str.erase(shape_str.begin()); - shape_str.pop_back(); - shape_str += kShapeSeperator; - string::size_type pos_begin = 0; - string::size_type pos_end = shape_str.find(kShapeSeperator); - while (pos_end != std::string::npos) { - string dim_str = shape_str.substr(pos_begin, pos_end - pos_begin); - tensor_shape->emplace_back(std::stoi(dim_str)); - (*dims) = (*dims) * std::stoul(dim_str); - pos_begin = pos_end + sizeof(kShapeSeperator) - 1; - pos_end = shape_str.find(kShapeSeperator, pos_begin); - } - return true; -} - bool PrintTensorToString(const char *str_data_ptr, mindspore::tensor::Tensor *const print_tensor, const size_t &memory_size) { MS_EXCEPTION_IF_NULL(str_data_ptr); MS_EXCEPTION_IF_NULL(print_tensor); auto *tensor_data_ptr = static_cast(print_tensor->data_c()); MS_EXCEPTION_IF_NULL(tensor_data_ptr); - auto cp_ret = - memcpy_s(tensor_data_ptr, static_cast(print_tensor->data().nbytes()), str_data_ptr, memory_size); + + size_t dest_size = static_cast(print_tensor->data().nbytes()); + size_t target_size = memory_size; + + auto cp_ret = memcpy_s(tensor_data_ptr, dest_size, str_data_ptr, target_size); if (cp_ret != EOK) { MS_LOG(ERROR) << "Print op Failed to copy the memory to py::tensor " << cp_ret; return false; @@ -100,10 +73,10 @@ bool PrintTensorToString(const char *str_data_ptr, mindspore::tensor::Tensor *co } template -void PrintScalarToString(const char *str_data_ptr, const string &tensor_type, std::ostringstream *const buf) { +void PrintScalarToString(const char *str_data_ptr, const aclDataType &acl_data_type, std::ostringstream *const buf) { MS_EXCEPTION_IF_NULL(str_data_ptr); MS_EXCEPTION_IF_NULL(buf); - *buf << "Tensor(shape=[], dtype=" << GetParseType(tensor_type) << ", value="; + *buf << "Tensor(shape=[], dtype=" << GetParseType(acl_data_type) << ", value="; const T *data_ptr = reinterpret_cast(str_data_ptr); if constexpr (std::is_same::value || std::is_same::value) { const int int_data = static_cast(*data_ptr); @@ -113,11 +86,12 @@ void PrintScalarToString(const char *str_data_ptr, const string &tensor_type, st } } -void PrintScalarToBoolString(const char *str_data_ptr, const string &tensor_type, std::ostringstream *const buf) { +void PrintScalarToBoolString(const char *str_data_ptr, const aclDataType &acl_data_type, + std::ostringstream *const buf) { MS_EXCEPTION_IF_NULL(str_data_ptr); MS_EXCEPTION_IF_NULL(buf); const bool *data_ptr = reinterpret_cast(str_data_ptr); - *buf << "Tensor(shape=[], dtype=" << GetParseType(tensor_type) << ", value="; + *buf << "Tensor(shape=[], dtype=" << GetParseType(acl_data_type) << ", value="; if (*data_ptr) { *buf << "True)\n"; } else { @@ -125,89 +99,99 @@ void PrintScalarToBoolString(const char *str_data_ptr, const string &tensor_type } } -void convertDataItem2Scalar(const char *str_data_ptr, const string &tensor_type, std::ostringstream *const buf) { +void convertDataItem2Scalar(const char *str_data_ptr, const aclDataType &acl_data_type, std::ostringstream *const buf) { MS_EXCEPTION_IF_NULL(str_data_ptr); MS_EXCEPTION_IF_NULL(buf); - auto type_iter = print_type_map.find(tensor_type); + auto type_iter = print_acl_data_type_map.find(acl_data_type); auto type_id = type_iter->second; if (type_id == TypeId::kNumberTypeBool) { - PrintScalarToBoolString(str_data_ptr, tensor_type, buf); + PrintScalarToBoolString(str_data_ptr, acl_data_type, buf); } else if (type_id == TypeId::kNumberTypeInt8) { - PrintScalarToString(str_data_ptr, tensor_type, buf); + PrintScalarToString(str_data_ptr, acl_data_type, buf); } else if (type_id == TypeId::kNumberTypeUInt8) { - PrintScalarToString(str_data_ptr, tensor_type, buf); + PrintScalarToString(str_data_ptr, acl_data_type, buf); } else if (type_id == TypeId::kNumberTypeInt16) { - PrintScalarToString(str_data_ptr, tensor_type, buf); + PrintScalarToString(str_data_ptr, acl_data_type, buf); } else if (type_id == TypeId::kNumberTypeUInt16) { - PrintScalarToString(str_data_ptr, tensor_type, buf); + PrintScalarToString(str_data_ptr, acl_data_type, buf); } else if (type_id == TypeId::kNumberTypeInt32) { - PrintScalarToString(str_data_ptr, tensor_type, buf); + PrintScalarToString(str_data_ptr, acl_data_type, buf); } else if (type_id == TypeId::kNumberTypeUInt32) { - PrintScalarToString(str_data_ptr, tensor_type, buf); + PrintScalarToString(str_data_ptr, acl_data_type, buf); } else if (type_id == TypeId::kNumberTypeInt64) { - PrintScalarToString(str_data_ptr, tensor_type, buf); + PrintScalarToString(str_data_ptr, acl_data_type, buf); } else if (type_id == TypeId::kNumberTypeUInt64) { - PrintScalarToString(str_data_ptr, tensor_type, buf); + PrintScalarToString(str_data_ptr, acl_data_type, buf); } else if (type_id == TypeId::kNumberTypeFloat16) { - PrintScalarToString(str_data_ptr, tensor_type, buf); + PrintScalarToString(str_data_ptr, acl_data_type, buf); } else if (type_id == TypeId::kNumberTypeFloat32) { - PrintScalarToString(str_data_ptr, tensor_type, buf); + PrintScalarToString(str_data_ptr, acl_data_type, buf); } else if (type_id == TypeId::kNumberTypeFloat64) { - PrintScalarToString(str_data_ptr, tensor_type, buf); + PrintScalarToString(str_data_ptr, acl_data_type, buf); } else { - MS_LOG(EXCEPTION) << "Cannot print scalar because of unsupported data type: " << tensor_type << "."; + MS_LOG(EXCEPTION) << "Cannot print scalar because of unsupported data type: " << GetParseType(acl_data_type) << "."; } } -bool judgeLengthValid(const size_t str_len, const string &tensor_type) { - auto type_iter = type_size_map.find(tensor_type); - if (type_iter == type_size_map.end()) { +bool judgeLengthValid(const size_t str_len, const aclDataType &acl_data_type) { + auto type_iter = acl_data_type_size_map.find(acl_data_type); + if (type_iter == acl_data_type_size_map.end()) { MS_LOG(EXCEPTION) << "type of scalar to print is not support."; } return str_len == type_iter->second; } -#ifndef NO_DLIB -bool ConvertDataItem2Tensor(const std::vector &items) { +bool ConvertDataset2Tensor(acltdtDataset *acl_dataset) { // Acquire Python GIL py::gil_scoped_acquire gil_acquire; std::ostringstream buf; bool ret_end_sequence = false; - for (auto &item : items) { - if (item.dataType_ == tdt::TDT_END_OF_SEQUENCE) { + + size_t acl_dataset_size = acltdtGetDatasetSize(acl_dataset); + + for (size_t i = 0; i < acl_dataset_size; i++) { + acltdtDataItem *item = acltdtGetDataItem(acl_dataset, i); + if (acltdtGetTensorTypeFromItem(item) == ACL_TENSOR_DATA_END_OF_SEQUENCE) { ret_end_sequence = true; + MS_LOG(INFO) << "end of sequence" << std::endl; break; } - std::shared_ptr str_data_ptr = std::static_pointer_cast(item.dataPtr_); - MS_EXCEPTION_IF_NULL(str_data_ptr); - if (item.tensorShape_ == kShapeScalar || item.tensorShape_ == kShapeNone) { - if (!judgeLengthValid(str_data_ptr->size(), item.tensorType_)) { + + size_t dim_num = acltdtGetDimNumFromItem(item); + void *acl_addr = acltdtGetDataAddrFromItem(item); + size_t acl_data_size = acltdtGetDataSizeFromItem(item); + aclDataType acl_data_type = acltdtGetDataTypeFromItem(item); + char *acl_data = reinterpret_cast(acl_addr); + acl_data = const_cast(reinterpret_cast(acl_data)->c_str()); + MS_EXCEPTION_IF_NULL(acl_data); + + ShapeVector tensorShape; + tensorShape.resize(dim_num); + + if (acltdtGetDimsFromItem(item, tensorShape.data(), dim_num) != ACL_SUCCESS) { + MS_LOG(ERROR) << "ACL failed get dim-size from acl channel data"; + } + + if ((tensorShape.size() == 1 && tensorShape[0] == 0) || tensorShape.size() == 0) { + if (!judgeLengthValid(acl_data_size, acl_data_type)) { MS_LOG(EXCEPTION) << "Print op receive data length is invalid."; } - convertDataItem2Scalar(str_data_ptr->data(), item.tensorType_, &buf); + convertDataItem2Scalar(acl_data, acl_data_type, &buf); continue; } - ShapeVector tensor_shape; - size_t totaldims = 1; - if (!ParseTensorShape(item.tensorShape_, &tensor_shape, &totaldims)) { - MS_LOG(ERROR) << "Tensor print can not parse tensor shape, receive info" << item.tensorShape_; - continue; - } - - if (item.tensorType_ == "string") { - std::string data(reinterpret_cast(str_data_ptr->c_str()), item.dataLen_); + if (acl_data_type == ACL_STRING) { + std::string data(reinterpret_cast(acl_data), acl_data_size); buf << data << std::endl; } else { - auto type_iter = print_type_map.find(item.tensorType_); - if (type_iter == print_type_map.end()) { - MS_LOG(ERROR) << "type of tensor need to print is not support " << item.tensorType_; + auto type_iter = print_acl_data_type_map.find(acl_data_type); + if (type_iter == print_acl_data_type_map.end()) { + MS_LOG(ERROR) << "type of tensor need to print is not support " << GetParseType(acl_data_type); continue; } auto type_id = type_iter->second; - mindspore::tensor::Tensor print_tensor(type_id, tensor_shape); - auto memory_size = totaldims * type_size_map[item.tensorType_]; - if (PrintTensorToString(str_data_ptr->data(), &print_tensor, memory_size)) { + mindspore::tensor::Tensor print_tensor(type_id, tensorShape); + if (PrintTensorToString(acl_data, &print_tensor, acl_data_size)) { buf << print_tensor.ToStringNoLimit() << std::endl; } } @@ -216,44 +200,63 @@ bool ConvertDataItem2Tensor(const std::vector &items) { return ret_end_sequence; } -bool SaveDataItem2File(const std::vector &items, const std::string &print_file_path, prntpb::Print print, - std::fstream *output) { +bool SaveDataset2File(acltdtDataset *acl_dataset, const std::string &print_file_path, prntpb::Print print, + std::fstream *output) { bool ret_end_thread = false; - for (auto &item : items) { - if (item.dataType_ == tdt::TDT_END_OF_SEQUENCE) { + + for (size_t i = 0; i < acltdtGetDatasetSize(acl_dataset); i++) { + acltdtDataItem *item = acltdtGetDataItem(acl_dataset, i); + MS_EXCEPTION_IF_NULL(item); + acltdtTensorType acl_tensor_type = acltdtGetTensorTypeFromItem(item); + + if (acl_tensor_type == ACL_TENSOR_DATA_END_OF_SEQUENCE) { + MS_LOG(INFO) << "Acl channel received end-of-sequence for print op."; ret_end_thread = true; break; + } else if (acl_tensor_type == ACL_TENSOR_DATA_ABNORMAL) { + MS_LOG(INFO) << "Acl channel received abnormal for print op."; + return true; + } else if (acl_tensor_type == ACL_TENSOR_DATA_UNDEFINED) { + MS_LOG(INFO) << "Acl channel received undefined message type for print op."; + return false; } + prntpb::Print_Value *value = print.add_value(); - std::shared_ptr str_data_ptr = std::static_pointer_cast(item.dataPtr_); - MS_EXCEPTION_IF_NULL(str_data_ptr); - if (item.tensorShape_ == kShapeScalar || item.tensorShape_ == kShapeNone) { - if (!judgeLengthValid(str_data_ptr->size(), item.tensorType_)) { + size_t dim_num = acltdtGetDimNumFromItem(item); + void *acl_addr = acltdtGetDataAddrFromItem(item); + size_t acl_data_size = acltdtGetDataSizeFromItem(item); + aclDataType acl_data_type = acltdtGetDataTypeFromItem(item); + char *acl_data = reinterpret_cast(acl_addr); + MS_EXCEPTION_IF_NULL(acl_data); + + ShapeVector tensorShape; + tensorShape.resize(dim_num); + + if (acltdtGetDimsFromItem(item, tensorShape.data(), dim_num) != ACL_SUCCESS) { + MS_LOG(ERROR) << "ACL failed get dim-size from acl channel data"; + } + + if ((tensorShape.size() == 1 && tensorShape[0] == 0) || tensorShape.size() == 0) { + if (!judgeLengthValid(acl_data_size, acl_data_type)) { MS_LOG(ERROR) << "Print op receive data length is invalid."; ret_end_thread = true; } } - ShapeVector tensor_shape; - size_t totaldims = 1; - if (!ParseTensorShape(item.tensorShape_, &tensor_shape, &totaldims)) { - MS_LOG(ERROR) << "Tensor print can not parse tensor shape, receive info" << item.tensorShape_; - ret_end_thread = true; - } - - if (item.tensorType_ == "string") { - std::string data(reinterpret_cast(str_data_ptr->c_str()), item.dataLen_); + if (acl_data_type == ACL_STRING) { + std::string data(reinterpret_cast(acl_data), acl_data_size); value->set_desc(data); } else { - auto parse_type = GetParseType(item.tensorType_); + auto parse_type = GetParseType(acl_data_type); prntpb::TensorProto *tensor = value->mutable_tensor(); - if (!(item.tensorShape_ == kShapeScalar) && !(item.tensorShape_ == kShapeNone)) { - for (const auto &dim : tensor_shape) { + if (tensorShape.size() > 1 || (tensorShape.size() == 1 && tensorShape[0] != 1)) { + for (const auto &dim : tensorShape) { tensor->add_dims(static_cast<::google::protobuf::int64>(dim)); } } + tensor->set_tensor_type(parse_type); - std::string data(reinterpret_cast(str_data_ptr->c_str()), item.dataLen_); + std::string data(reinterpret_cast(acl_data), acl_data_size); tensor->set_tensor_content(data); } @@ -274,29 +277,37 @@ void TensorPrint::operator()() { std::string print_file_path = ms_context->get_param(MS_CTX_PRINT_FILE_PATH); if (print_file_path == "") { while (true) { - std::vector bundle; - if (tdt::TdtHostPopData("_npu_log", bundle) != 0) { + acltdtDataset *acl_dataset = acltdtCreateDataset(); + if (acl_dataset == nullptr) { + MS_LOG(ERROR) << "Failed create acl dateaset."; + } + if (acltdtReceiveTensor(acl_handle_, acl_dataset, -1 /* no timeout */) != ACL_SUCCESS) { + MS_LOG(ERROR) << "Acltdt receive tensor failed"; break; } - if (ConvertDataItem2Tensor(bundle)) { + if (ConvertDataset2Tensor(acl_dataset)) { break; } } } else { std::fstream output(print_file_path, std::ios::out | std::ios::trunc | std::ios::binary); while (true) { - std::vector bundle; - if (tdt::TdtHostPopData("_npu_log", bundle) != 0) { + acltdtDataset *acl_dataset = acltdtCreateDataset(); + if (acl_dataset == nullptr) { + MS_LOG(ERROR) << "Failed create acl dateaset."; + } + if (acltdtReceiveTensor(acl_handle_, acl_dataset, -1 /* no timeout */) != ACL_SUCCESS) { + MS_LOG(ERROR) << "Acltdt receive tensor failed"; break; } - if (SaveDataItem2File(bundle, print_file_path, print, &output)) { + if (SaveDataset2File(acl_dataset, print_file_path, print, &output)) { break; } } output.close(); std::string path_string = print_file_path; if (chmod(common::SafeCStr(path_string), S_IRUSR) == -1) { - MS_LOG(ERROR) << "Modify file:" << print_file_path << " to r fail."; + MS_LOG(ERROR) << "Modify file:" << print_file_path << " to fail."; return; } } diff --git a/mindspore/ccsrc/utils/tensorprint_utils.h b/mindspore/ccsrc/utils/tensorprint_utils.h index b150368f71f..cf84e710991 100644 --- a/mindspore/ccsrc/utils/tensorprint_utils.h +++ b/mindspore/ccsrc/utils/tensorprint_utils.h @@ -20,9 +20,10 @@ #include #include "ir/dtype/type.h" #ifndef NO_DLIB +#include "acl/acl_tdt.h" #include "tdt/tsd_client.h" -#include "tdt/tdt_host_interface.h" #include "tdt/data_common.h" +#include "tdt/tdt_host_interface.h" #include "proto/print.pb.h" #include "utils/ms_context.h" #endif @@ -32,7 +33,11 @@ class TensorPrint { TensorPrint() {} ~TensorPrint() = default; #ifndef NO_DLIB + explicit TensorPrint(acltdtChannelHandle *acl_handle) { acl_handle_ = acl_handle; } void operator()(); + + private: + acltdtChannelHandle *acl_handle_ = nullptr; #endif }; } // namespace mindspore diff --git a/mindspore/core/utils/ms_context.cc b/mindspore/core/utils/ms_context.cc index a2328059c33..57c996f858d 100644 --- a/mindspore/core/utils/ms_context.cc +++ b/mindspore/core/utils/ms_context.cc @@ -50,6 +50,7 @@ MsContext::MsContext(const std::string &policy, const std::string &target) { } else { set_param(MS_CTX_DEVICE_ID, 0); } + set_param(MS_CTX_MAX_CALL_DEPTH, MAX_CALL_DEPTH_DEFAULT); set_param(MS_CTX_DEVICE_TARGET, target); set_param(MS_CTX_EXECUTION_MODE, kPynativeMode); @@ -108,4 +109,22 @@ std::string MsContext::backend_policy() const { } return "unknown"; } + +#ifdef ENABLE_TDTQUE +acltdtChannelHandle *MsContext::get_acl_tdt_channel_handle() { + if (acl_handle == nullptr) { + std::string kReceivePrefix = "TF_RECEIVE_"; + std::string channel_name = "_npu_log"; + uint32_t device_id = get_param(MS_CTX_DEVICE_ID); + acl_handle = acltdtCreateChannel(device_id, (kReceivePrefix + channel_name).c_str()); + if (acl_handle == nullptr) { + MS_LOG(ERROR) << "Failed to create acltdt handle : " << channel_name; + return nullptr; + } + MS_LOG(INFO) << "Success to create acltdt handle: " << channel_name; + return acl_handle; + } + return acl_handle; +} +#endif } // namespace mindspore diff --git a/mindspore/core/utils/ms_context.h b/mindspore/core/utils/ms_context.h index f82e7876852..e137f047493 100644 --- a/mindspore/core/utils/ms_context.h +++ b/mindspore/core/utils/ms_context.h @@ -24,7 +24,10 @@ #include #include #include "utils/log_adapter.h" - +#include "utils/ms_utils.h" +#ifndef NO_DLIB +#include "acl/acl_tdt.h" +#endif namespace mindspore { enum MsBackendPolicy { kMsBackendGeOnly = 0, @@ -130,11 +133,13 @@ class MsContext { std::string backend_policy() const; bool set_backend_policy(const std::string &policy); - +#ifdef ENABLE_TDTQUE + acltdtChannelHandle *get_acl_tdt_channel_handle(); +#endif static void device_seter(DeviceSeter device) { seter_ = device; } static void device_type_seter(DeviceTypeSeter device_type) { device_type_seter_ = device_type; } - std::thread tdt_print_; + std::thread acl_tdt_print; template void set_param(MsCtxParam param, const T &value) { @@ -169,6 +174,9 @@ class MsContext { std::string string_params_[MsCtxParam::NUM_STRING_PARAMS]; MsBackendPolicy backend_policy_; +#ifdef ENABLE_TDTQUE + acltdtChannelHandle *acl_handle = nullptr; +#endif }; // set method implementation for type bool/int/uint32_t/float/std::string