diff --git a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_build.cc b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_build.cc index 0730ad35707..5cb0d411e50 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_build.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_build.cc @@ -31,6 +31,7 @@ #include "proto/node_def.pb.h" #include "backend/session/anf_runtime_algorithm.h" #include "backend/kernel_compiler/aicpu/aicpu_util.h" +#include "backend/kernel_compiler/aicpu/aicpu_kernel_load.h" #include "backend/session/kernel_graph.h" #include "backend/kernel_compiler/common_utils.h" #include "backend/kernel_compiler/oplib/oplib.h" @@ -423,6 +424,11 @@ KernelModPtr AicpuOpBuild(const std::shared_ptr &anf_node) { if (!SetIOSize(anf_node, kernel_mod_ptr)) { MS_LOG(EXCEPTION) << "Set input output size list failed."; } + + if (!AicpuOpKernelLoad::GetInstance().LoadAicpuKernelSo(anf_node, kernel_mod_ptr)) { + MS_LOG(EXCEPTION) << "Aicpu kernel so load failed. task is " << anf_node->fullname_with_scope(); + } + return kernel_mod_ptr; } } // namespace kernel diff --git a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_load.cc b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_load.cc new file mode 100644 index 00000000000..44b8c99cfae --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_load.cc @@ -0,0 +1,373 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "backend/kernel_compiler/aicpu/aicpu_kernel_load.h" +#include +#include +#include +#include +#include +#include +#include "runtime/kernel.h" +#include "runtime/mem.h" +#include "runtime/context.h" +#include "utils/utils.h" +#include "utils/file_utils.h" +#include "backend/session/anf_runtime_algorithm.h" + +namespace mindspore { +namespace kernel { +bool AicpuOpKernelLoad::GetBinaryFileName(const std::string &so_name, const std::string &bin_folder_path, + std::string *bin_file_path) { + MS_EXCEPTION_IF_NULL(bin_file_path); + const auto &iter = so_name_and_realpath_map_.find(so_name); + if (iter != so_name_and_realpath_map_.end()) { + *bin_file_path = iter->second; + MS_LOG(INFO) << "so " << so_name << " has bin file path " << bin_file_path; + return true; + } + + std::string bin_file_name(bin_folder_path); + if (bin_file_name.empty()) { + bin_file_name = "./"; + } else if (bin_file_name.back() != '/') { + bin_file_name.append("/"); + } + + bin_file_name += so_name; + auto real_file_path = FileUtils::GetRealPath(bin_file_name.c_str()); + if (!real_file_path.has_value()) { + MS_LOG(ERROR) << "Get real path failed, path=" << bin_file_name; + return false; + } + + auto real_file_path_value = real_file_path.value(); + if (access(real_file_path_value.c_str(), F_OK) == -1) { + MS_LOG(ERROR) << "Kernel so path:" << real_file_path_value << " is not existed!"; + return false; + } + + *bin_file_path = real_file_path_value; + so_name_and_realpath_map_[so_name] = *bin_file_path; + return true; +} + +bool AicpuOpKernelLoad::ReadBytesFromBinaryFile(const std::string &file_name, std::vector *buffer) const { + std::ifstream file(file_name.c_str(), std::ios::binary | std::ios::ate); + if (!file.is_open()) { + MS_LOG(ERROR) << "Open file [" << file_name << "] failed"; + return false; + } + + std::streamsize size = file.tellg(); + if (size <= 0) { + file.close(); + MS_LOG(ERROR) << "Empty file [" << file_name << "], please check this file."; + return false; + } + if (size > INT_MAX) { + file.close(); + MS_LOG(ERROR) << "File [" << file_name << "] size [" << size << "] is out of limit[" << INT_MAX << "]"; + return false; + } + + file.seekg(0, std::ios::beg); + buffer->resize(size); + file.read(buffer->data(), size); + file.close(); + return true; +} + +bool AicpuOpKernelLoad::GetSoNeedLoadPath(const std::string &so_name, std::string *file_path) const { + MS_EXCEPTION_IF_NULL(file_path); + Dl_info dl_info; + if (dladdr(reinterpret_cast(const_cast(this)), &dl_info) == 0) { + MS_LOG(ERROR) << "Get dladdr failed!"; + return false; + } + std::string cust_kernel_so_path(dl_info.dli_fname); + + auto pos = cust_kernel_so_path.find_last_of('/'); + if (cust_kernel_so_path.empty() || pos == std::string::npos) { + MS_LOG(ERROR) << "Current path [" << cust_kernel_so_path << "] is invalid."; + return false; + } + auto real_cust_kernel_so_path = cust_kernel_so_path.substr(0, pos) + "/lib/"; + + if (real_cust_kernel_so_path.size() > PATH_MAX) { + MS_LOG(ERROR) << "Current path [" << real_cust_kernel_so_path << "] is too long."; + return false; + } + + *file_path = real_cust_kernel_so_path; + return true; +} + +bool AicpuOpKernelLoad::PackageBinaryFile(const std::string &so_name, + std::map *so_name_with_bin_info) { + std::string bin_folder_path; + bool ret = GetSoNeedLoadPath(so_name, &bin_folder_path); + if (!ret) { + MS_LOG(ERROR) << "GetSoNeedLoadPath failed."; + return false; + } + + std::string bin_file_path; + ret = GetBinaryFileName(so_name, bin_folder_path, &bin_file_path); + if (!ret) { + MS_LOG(ERROR) << "GetBinaryFileName failed."; + return false; + } + + std::vector buffer; + ret = ReadBytesFromBinaryFile(bin_file_path, &buffer); + if (!ret) { + MS_LOG(ERROR) << "ReadBytesFromBinaryFile failed."; + return false; + } + + OpKernelBinPtr cust_aicpu_kernel_ptr = std::make_shared(so_name, std::move(buffer)); + if (cust_aicpu_kernel_ptr == nullptr) { + MS_LOG(ERROR) << "Create OpKernelBin object failed."; + return false; + } + so_name_with_bin_info->insert({so_name, cust_aicpu_kernel_ptr}); + + return true; +} + +bool AicpuOpKernelLoad::LoadAicpuKernelSo(const AnfNodePtr &node, + const std::shared_ptr &kernel_mod_ptr) { + std::lock_guard lock(cust_aicpu_mutex_); + MS_EXCEPTION_IF_NULL(node); + MS_EXCEPTION_IF_NULL(kernel_mod_ptr); + CNodePtr cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + if (!AnfAlgo::HasNodeAttr(kAttrCustAicpu, cnode)) { + MS_LOG(INFO) << "Current aicpu ops:" << cnode->fullname_with_scope() << " isn't a custom ops."; + return true; + } + + std::string so_name = "lib" + AnfAlgo::GetNodeAttr(cnode, kAttrCustAicpu) + ".so"; + if (so_name == kLibAicpuKernelSoName || so_name == kLibCpuKernelSoName) { + MS_LOG(INFO) << "Aicpu so:" << so_name << " is default so."; + return true; + } + + kernel_mod_ptr->SetCustSo(so_name); + rtContext_t rt_cur_ctx = nullptr; + auto rt_error = rtCtxGetCurrent(&rt_cur_ctx); + if (rt_error != RT_ERROR_NONE) { + MS_LOG(ERROR) << "Call rtCtxGetCurrent failed, ret = 0x" << rt_error; + return false; + } + // use current context as resource key + uintptr_t resource_id = reinterpret_cast(rt_cur_ctx); + auto it = cust_aicpu_so_.find(resource_id); + if (it != cust_aicpu_so_.end()) { + auto it_so_name = it->second.find(so_name); + if (it_so_name != it->second.end()) { + MS_LOG(INFO) << "Cust aicpu so:" << so_name << " has been loaded."; + return true; + } + } + + std::map so_name_with_bin_info; + if (!PackageBinaryFile(so_name, &so_name_with_bin_info)) { + MS_LOG(ERROR) << "Package binary file failed."; + return false; + } + + if (it == cust_aicpu_so_.end()) { + cust_aicpu_so_[resource_id] = so_name_with_bin_info; + MS_LOG(INFO) << "Load new aicpu so:" << so_name << "success, resource id:" << resource_id << "."; + return true; + } + auto it_so_name = it->second.find(so_name); + if (it_so_name == it->second.end()) { + it->second.insert(so_name_with_bin_info.begin(), so_name_with_bin_info.end()); + MS_LOG(INFO) << "Load cust aicpu so:" << so_name << "success, resource id:" << resource_id << "."; + return true; + } + return true; +} + +bool AicpuOpKernelLoad::CacheBinaryFileToDevice(const uintptr_t &resource_id, std::vector *allocated_mem, + void **batch_args) { + auto it = cust_aicpu_so_.find(resource_id); + if (it == cust_aicpu_so_.end()) { + MS_LOG(ERROR) << "Context id:" << resource_id << " is invalid."; + return false; + } + + rtError_t status; + std::vector v_cust_so; + for (const auto &it_so : it->second) { + const auto &so_name = it_so.first; + const void *aicpu_data = it_so.second->GetBinData(); + uint32_t aicpu_data_length = it_so.second->GetBinDataSize(); + void *d_aicpu_data = nullptr; + void *d_so_name = nullptr; + + status = rtMalloc(&d_aicpu_data, aicpu_data_length, RT_MEMORY_HBM); + if (status != RT_ERROR_NONE) { + MS_LOG(ERROR) << "Call rtMalloc failed, size:" << aicpu_data_length << ", ret = 0x" << status; + return false; + } + allocated_mem->emplace_back(d_aicpu_data); + + status = rtMalloc(&d_so_name, so_name.size(), RT_MEMORY_HBM); + if (status != RT_ERROR_NONE) { + MS_LOG(ERROR) << "Call rtMalloc failed, size:" << so_name.size() << ", ret = 0x" << status; + return false; + } + allocated_mem->emplace_back(d_so_name); + + status = rtMemcpy(d_aicpu_data, aicpu_data_length, aicpu_data, aicpu_data_length, RT_MEMCPY_HOST_TO_DEVICE); + if (status != RT_ERROR_NONE) { + MS_LOG(ERROR) << "Call rtMemcpy failed, ret = 0x" << status; + return false; + } + + status = rtMemcpy(d_so_name, so_name.size(), reinterpret_cast(so_name.c_str()), so_name.size(), + RT_MEMCPY_HOST_TO_DEVICE); + if (status != RT_ERROR_NONE) { + MS_LOG(ERROR) << "Call rtMemcpy failed, ret = 0x" << status; + return false; + } + + CustAicpuSoBuf cust_aicpu_so_buf; + cust_aicpu_so_buf.kernelSoBuf = static_cast(reinterpret_cast(d_aicpu_data)); + cust_aicpu_so_buf.kernelSoBufLen = aicpu_data_length; + cust_aicpu_so_buf.kernelSoName = static_cast(reinterpret_cast(d_so_name)); + cust_aicpu_so_buf.kernelSoNameLen = so_name.size(); + v_cust_so.emplace_back(cust_aicpu_so_buf); + } + + void *args = nullptr; + uint32_t args_size = sizeof(CustAicpuSoBuf) * v_cust_so.size(); + status = rtMalloc(&args, args_size, RT_MEMORY_HBM); + if (status != RT_ERROR_NONE) { + MS_LOG(ERROR) << "Call rtMalloc failed, size:" << args_size << ", ret = 0x" << status; + return false; + } + allocated_mem->emplace_back(args); + status = rtMemcpy(args, args_size, v_cust_so.data(), args_size, RT_MEMCPY_HOST_TO_DEVICE); + if (status != RT_ERROR_NONE) { + MS_LOG(ERROR) << "Call rtMemcpy failed, ret = 0x" << status; + return false; + } + + BatchLoadOpFromBufArgs batch_cust_so; + batch_cust_so.soNum = v_cust_so.size(); + batch_cust_so.args = static_cast(reinterpret_cast(args)); + + uint32_t batch_args_size = sizeof(BatchLoadOpFromBufArgs); + status = rtMalloc(batch_args, batch_args_size, RT_MEMORY_HBM); + if (status != RT_ERROR_NONE) { + MS_LOG(ERROR) << "Call rtMalloc failed, size:" << batch_args_size << ", ret = 0x" << status; + return false; + } + allocated_mem->emplace_back(*batch_args); + status = rtMemcpy(*batch_args, batch_args_size, static_cast(&batch_cust_so), batch_args_size, + RT_MEMCPY_HOST_TO_DEVICE); + if (status != RT_ERROR_NONE) { + MS_LOG(ERROR) << "Call rtMemcpy failed, ret = 0x" << status; + return false; + } + + return true; +} + +bool AicpuOpKernelLoad::LaunchAicpuKernelSo() { + std::lock_guard lock(cust_aicpu_mutex_); + if (cust_aicpu_so_.empty()) { + return true; + } + + rtContext_t rt_cur_ctx = nullptr; + rtError_t status = RT_ERROR_NONE; + status = rtCtxGetCurrent(&rt_cur_ctx); + if (status != RT_ERROR_NONE) { + MS_LOG(ERROR) << "Call rtCtxGetCurrent failed, ret = 0x" << status; + return false; + } + // use current context as resource key + uintptr_t resource_id = reinterpret_cast(rt_cur_ctx); + auto it = cust_aicpu_so_.find(resource_id); + if (it == cust_aicpu_so_.end()) { + MS_LOG(INFO) << "Cust aicpu so map is empty, context id:" << resource_id; + return true; + } + + std::vector allocated_mem; + void *batch_args = nullptr; + uint32_t batch_args_size = sizeof(BatchLoadOpFromBufArgs); + bool ret = CacheBinaryFileToDevice(resource_id, &allocated_mem, &batch_args); + allocated_mem_list_.emplace_back(std::move(allocated_mem)); + if (!ret) { + MS_LOG(ERROR) << "CacheBinaryFileToDevice is failed."; + return false; + } + + rtStream_t stream = nullptr; + status = rtStreamCreate(&stream, 0); + if (status != RT_ERROR_NONE) { + MS_LOG(ERROR) << "Call rtStreamCreate failed, ret = 0x" << status; + return false; + } + stream_list_.emplace_back(stream); + // launch "batchLoadsoFrombuf" event to device. + std::string load_event(kBatchLoadBuf); + status = rtCpuKernelLaunch(nullptr, load_event.c_str(), 1, batch_args, batch_args_size, nullptr, stream); + if (status != RT_ERROR_NONE) { + MS_LOG(ERROR) << "Call rtCpuKernelLaunch failed, ret = 0x" << status; + return false; + } + status = rtStreamSynchronize(stream); + if (status != RT_ERROR_NONE) { + MS_LOG(ERROR) << "Call rtStreamSynchronize failed, ret = 0x" << status; + return false; + } + + MS_LOG(INFO) << "Aicpu kernel so launch success."; + return true; +} + +void AicpuOpKernelLoad::FreeDeviceMemory() { + for (auto allocated_mem : allocated_mem_list_) { + for (auto mem : allocated_mem) { + if (mem == nullptr) { + continue; + } + auto rt_error = rtFree(mem); + if (rt_error != RT_ERROR_NONE) { + MS_LOG(EXCEPTION) << "Call rtFree failed, ret = 0x" << rt_error; + } + } + } + for (auto stream : stream_list_) { + if (stream != nullptr) { + auto rt_error = rtStreamDestroy(stream); + if (rt_error != RT_ERROR_NONE) { + MS_LOG(EXCEPTION) << "Call rtStreamDestroy failed, ret = 0x" << rt_error; + } + } + } + so_name_and_realpath_map_.clear(); + cust_aicpu_so_.clear(); +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_load.h b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_load.h new file mode 100644 index 00000000000..3fa973c38c2 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_load.h @@ -0,0 +1,78 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AICPU_AICPU_KERNEL_LOAD_H_ +#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AICPU_AICPU_KERNEL_LOAD_H_ + +#include +#include +#include +#include + +#include "runtime/base.h" +#include "base/base.h" +#include "ir/anf.h" +#include "backend/kernel_compiler/aicpu/aicpu_util.h" +#include "backend/kernel_compiler/aicpu/aicpu_kernel_mod.h" + +namespace mindspore { +namespace kernel { +constexpr auto kBatchLoadBuf = "batchLoadsoFrombuf"; + +#pragma pack(push, 1) +struct CustAicpuSoBuf { + uint64_t kernelSoBuf; + uint32_t kernelSoBufLen; + uint64_t kernelSoName; + uint32_t kernelSoNameLen; +}; + +struct BatchLoadOpFromBufArgs { + uint32_t soNum; + uint64_t args; +}; +#pragma pack(pop) + +class AicpuOpKernelLoad { + public: + AicpuOpKernelLoad() = default; + ~AicpuOpKernelLoad() = default; + + static AicpuOpKernelLoad &GetInstance() { + static AicpuOpKernelLoad instance; + return instance; + } + + bool LaunchAicpuKernelSo(); + bool LoadAicpuKernelSo(const AnfNodePtr &node, const std::shared_ptr &kernel_mod_ptr); + void FreeDeviceMemory(); + + private: + bool GetBinaryFileName(const std::string &so_name, const std::string &bin_folder_path, std::string *bin_file_path); + bool ReadBytesFromBinaryFile(const std::string &file_name, std::vector *buffer) const; + bool GetSoNeedLoadPath(const std::string &so_name, std::string *file_path) const; + bool PackageBinaryFile(const std::string &so_name, std::map *so_name_with_bin_info); + bool CacheBinaryFileToDevice(const uintptr_t &resource_id, std::vector *allocated_mem, void **batch_args); + + std::map so_name_and_realpath_map_; + std::map> cust_aicpu_so_; + std::mutex cust_aicpu_mutex_; + std::vector stream_list_; + std::vector> allocated_mem_list_; +}; +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AICPU_AICPU_KERNEL_LOAD_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_mod.cc b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_mod.cc index 37b6095e84b..2f71e0b7914 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_mod.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_mod.cc @@ -36,9 +36,6 @@ using HostDynamicKernel = mindspore::device::ascend::HostDynamicKernel; namespace mindspore { namespace kernel { -constexpr auto AICPU_OPS_SO_NAME = "libaicpu_kernels.so"; -constexpr auto CUST_AICPU_OPS_SO_NAME = "libcpu_kernels.so"; - AicpuOpKernelMod::AicpuOpKernelMod() : anf_node_(nullptr) {} AicpuOpKernelMod::~AicpuOpKernelMod() { @@ -63,6 +60,10 @@ void AicpuOpKernelMod::SetOutputList(const std::vector &outputList) { o void AicpuOpKernelMod::SetNodeDef(const std::string &nodeDef) { (void)node_def_str_.assign(nodeDef); } void AicpuOpKernelMod::SetExtInfo(const std::string &ext_info) { ext_info_ = ext_info; } void AicpuOpKernelMod::SetNodeName(const std::string &node_name) { node_name_ = node_name; } +void AicpuOpKernelMod::SetCustSo(const std::string &cust_so) { + node_so_ = cust_so; + cust_kernel_ = true; +} void AicpuOpKernelMod::SetAnfNode(const mindspore::AnfNodePtr &anf_node) { MS_EXCEPTION_IF_NULL(anf_node); anf_node_ = anf_node; @@ -72,15 +73,17 @@ void AicpuOpKernelMod::CreateCpuKernelInfo(const std::vector &inputs const std::vector &outputs) { MS_LOG(INFO) << "CreateCpuKernelInfoOffline start"; - if (kCustAiCpuKernelOps.find(node_name_) != kCustAiCpuKernelOps.end()) { - node_so_ = CUST_AICPU_OPS_SO_NAME; - node_name_ = kCustRunApi; - } else if (kCacheKernelOps.find(node_name_) != kCacheKernelOps.end()) { - node_so_ = AICPU_OPS_SO_NAME; - node_name_ = kCustRunApi; - } else { - if (node_so_ != CUST_AICPU_OPS_SO_NAME) { - node_so_ = AICPU_OPS_SO_NAME; + if (!cust_kernel_) { + if (kCpuKernelOps.find(node_name_) != kCpuKernelOps.end()) { + node_so_ = kLibCpuKernelSoName; + node_name_ = kCpuRunApi; + } else if (kCacheKernelOps.find(node_name_) != kCacheKernelOps.end()) { + node_so_ = kLibAicpuKernelSoName; + node_name_ = kCpuRunApi; + } else { + if (node_so_ != kLibCpuKernelSoName) { + node_so_ = kLibAicpuKernelSoName; + } } } // InputOutputAddr @@ -149,12 +152,16 @@ bool AicpuOpKernelMod::Launch(const std::vector &inputs, const std:: if (node_name_ == kStack) { node_name_ = kPack; } + auto flag = RT_KERNEL_DEFAULT; + if (cust_kernel_) { + flag = RT_KERNEL_CUSTOM_AICPU; + } MS_LOG(INFO) << "Aicpu launch, node_so_:" << node_so_ << ", node name:" << node_name_ << ", args_size:" << args_.length(); - if (rtCpuKernelLaunch(reinterpret_cast(node_so_.c_str()), - reinterpret_cast(node_name_.c_str()), 1, - reinterpret_cast(args_.data()), static_cast(args_.length()), nullptr, - stream_) != RT_ERROR_NONE) { + if (rtCpuKernelLaunchWithFlag(reinterpret_cast(node_so_.c_str()), + reinterpret_cast(node_name_.c_str()), 1, + reinterpret_cast(args_.data()), static_cast(args_.length()), + nullptr, stream_, flag) != RT_ERROR_NONE) { MS_LOG(ERROR) << "Aicpu op launch failed!"; return false; @@ -168,15 +175,17 @@ std::vector AicpuOpKernelMod::GenTask(const std::vector MS_LOG(INFO) << "AicpuOpKernelMod GenTask start"; stream_id_ = stream_id; - if (kCustAiCpuKernelOps.find(node_name_) != kCustAiCpuKernelOps.end()) { - node_so_ = CUST_AICPU_OPS_SO_NAME; - node_name_ = kCustRunApi; - } else if (kCacheKernelOps.find(node_name_) != kCacheKernelOps.end()) { - node_so_ = AICPU_OPS_SO_NAME; - node_name_ = kCustRunApi; - } else { - if (node_so_ != CUST_AICPU_OPS_SO_NAME) { - node_so_ = AICPU_OPS_SO_NAME; + if (!cust_kernel_) { + if (kCpuKernelOps.find(node_name_) != kCpuKernelOps.end()) { + node_so_ = kLibCpuKernelSoName; + node_name_ = kCpuRunApi; + } else if (kCacheKernelOps.find(node_name_) != kCacheKernelOps.end()) { + node_so_ = kLibAicpuKernelSoName; + node_name_ = kCpuRunApi; + } else { + if (node_so_ != kLibCpuKernelSoName) { + node_so_ = kLibAicpuKernelSoName; + } } } std::vector input_data_addrs; @@ -197,7 +206,7 @@ std::vector AicpuOpKernelMod::GenTask(const std::vector AicpuTaskInfoPtr task_info_ptr = std::make_shared( unique_name_, stream_id, node_so_, node_name_, node_def_str_, ext_info_, input_data_addrs, output_data_addrs, - NeedDump()); + NeedDump(), cust_kernel_); MS_LOG(INFO) << "AicpuOpKernelMod GenTask end"; return {task_info_ptr}; diff --git a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_mod.h b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_mod.h index 71768416ed5..c498e67c2fe 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_mod.h +++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_kernel_mod.h @@ -39,6 +39,7 @@ class AicpuOpKernelMod : public AscendKernelMod { void SetNodeDef(const std::string &nodeDef); void SetExtInfo(const std::string &ext_info); void SetNodeName(const std::string &node_name); + void SetCustSo(const std::string &cust_so); /** * @brief Build AICPU Engine kernel structure, and allocate device memory for offline task generate @@ -56,6 +57,7 @@ class AicpuOpKernelMod : public AscendKernelMod { const std::vector &GetWorkspaceSizeList() const override; private: + bool cust_kernel_{false}; std::string args_; std::string node_def_str_; std::string node_name_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_util.h b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_util.h index cbb2ff990bd..ac3854e3575 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_util.h +++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_util.h @@ -17,6 +17,8 @@ #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AICPU_AICPU_UTIL_H_ #include +#include +#include #include #include #include @@ -24,6 +26,8 @@ #include "backend/kernel_compiler/kernel.h" namespace mindspore { namespace kernel { +constexpr auto kLibAicpuKernelSoName = "libaicpu_kernels.so"; +constexpr auto kLibCpuKernelSoName = "libcpu_kernels.so"; constexpr auto kInitDataSetQueue = "InitDataSetQueue"; constexpr auto kInitData = "InitData"; constexpr auto kGetNext = "GetNext"; @@ -55,7 +59,7 @@ constexpr auto kUpdateCache = "UpdateCache"; constexpr auto kCacheSwapTable = "CacheSwapTable"; constexpr auto kSubAndFilter = "SubAndFilter"; constexpr auto kPadAndShift = "PadAndShift"; -constexpr auto kCustRunApi = "RunCpuKernel"; +constexpr auto kCpuRunApi = "RunCpuKernel"; constexpr auto kDropout2D = "Dropout2D"; constexpr auto kDropout3D = "Dropout3D"; constexpr auto kMaskedSelect = "MaskedSelect"; @@ -65,8 +69,8 @@ constexpr auto kSearchSorted = "SearchSorted"; constexpr auto kResizeBilinear = "ResizeBilinear"; constexpr auto kResizeBilinearGrad = "ResizeBilinearGrad"; constexpr auto kScatterElements = "ScatterElements"; -const std::set kCustAiCpuKernelOps{kIdentity, kMaskedSelect, kMaskedSelectGrad, kDynamicStitch, - kSearchSorted, kResizeBilinear, kResizeBilinearGrad, kScatterElements}; +const std::set kCpuKernelOps{kIdentity, kMaskedSelect, kMaskedSelectGrad, kDynamicStitch, + kSearchSorted, kResizeBilinear, kResizeBilinearGrad, kScatterElements}; const std::set kCacheKernelOps{kUpdateCache, kCacheSwapTable, kSubAndFilter, kPadAndShift, kDropout3D, kDropout2D}; const std::set kDynamicInputOps{ @@ -118,6 +122,24 @@ class AicpuOpUtil { // kernel id static uint64_t KernelId_; }; + +class OpKernelBin { + public: + OpKernelBin(std::string name, std::vector &&data) : name_(std::move(name)), data_(std::move(data)) {} + ~OpKernelBin() = default; + + const std::string &GetName() const { return name_; } + const uint8_t *GetBinData() const { return (const uint8_t *)data_.data(); } + size_t GetBinDataSize() const { return data_.size(); } + OpKernelBin(const OpKernelBin &) = delete; + const OpKernelBin &operator=(const OpKernelBin &) = delete; + + private: + std::string name_; + std::vector data_; +}; + +using OpKernelBinPtr = std::shared_ptr; } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc index 2c5ee91ea59..019b059b777 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc @@ -34,6 +34,7 @@ #include "runtime/device/ascend/tasksink/task_generator.h" #include "backend/session/anf_runtime_algorithm.h" #include "backend/session/kernel_build_client.h" +#include "backend/kernel_compiler/aicpu/aicpu_kernel_load.h" #ifndef ENABLE_SECURITY #include "runtime/device/ascend/profiling/profiling_manager.h" #include "runtime/device/ascend/profiling/profiling_utils.h" @@ -286,6 +287,7 @@ void AscendKernelRuntime::ReleaseDeviceRes() { if (mem_manager_ != nullptr) { mem_manager_->FreeDeviceMemory(); } + mindspore::kernel::AicpuOpKernelLoad::GetInstance().FreeDeviceMemory(); auto rt_ret = rtRegTaskFailCallbackByModule(kModuleName, nullptr); if (rt_ret != RT_ERROR_NONE) { @@ -438,6 +440,9 @@ bool AscendKernelRuntime::Load(const session::KernelGraph &graph, bool is_task_s if (!LoadTask(graph)) { return false; } + if (!mindspore::kernel::AicpuOpKernelLoad::GetInstance().LaunchAicpuKernelSo()) { + return false; + } return true; } diff --git a/mindspore/ccsrc/runtime/device/ascend/ge_runtime/task/aicpu_task.cc b/mindspore/ccsrc/runtime/device/ascend/ge_runtime/task/aicpu_task.cc index 3e58c58ebd3..74a3d1a8866 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ge_runtime/task/aicpu_task.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ge_runtime/task/aicpu_task.cc @@ -73,13 +73,14 @@ void AicpuTask::Distribute() { // for data dump input_output_addr_ = reinterpret_cast(reinterpret_cast(args_) + io_addr_offset); auto dump_flag = task_info_->dump_flag() ? RT_KERNEL_DUMPFLAG : RT_KERNEL_DEFAULT; + auto cpu_flag = task_info_->cust_aicpu() ? RT_KERNEL_CUSTOM_AICPU : dump_flag; MS_LOG(INFO) << "Distribute AicpuTask start, args_size = " << args_size << ", io_addrs_num =" << io_addrs_num << ", so_name = " << task_info_->so_name() << ", kernel_name = " << task_info_->kernel_name() << ", dump_flag = " << dump_flag; rt_ret = rtCpuKernelLaunchWithFlag(reinterpret_cast(task_info_->so_name().data()), reinterpret_cast(task_info_->kernel_name().data()), 1, args_, - args_size, nullptr, stream_, dump_flag); + args_size, nullptr, stream_, cpu_flag); if (rt_ret != RT_ERROR_NONE) { MS_LOG(EXCEPTION) << "Call rt api rtCpuKernelLaunchWithFlag failed, ret: " << rt_ret; } diff --git a/mindspore/ccsrc/runtime/device/ascend/ge_runtime/task_info.h b/mindspore/ccsrc/runtime/device/ascend/ge_runtime/task_info.h index 6afb0c3c839..8a992f75105 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ge_runtime/task_info.h +++ b/mindspore/ccsrc/runtime/device/ascend/ge_runtime/task_info.h @@ -119,14 +119,15 @@ class AicpuTaskInfo : public TaskInfo { AicpuTaskInfo(const std::string &op_name, uint32_t stream_id, const std::string &so_name, const std::string &kernel_name, const std::string &node_def, const std::string &ext_info, const std::vector &input_data_addrs, const std::vector &output_data_addrs, - bool dump_flag) + bool dump_flag, bool cust_aicpu = false) : TaskInfo(op_name, stream_id, TaskInfoType::AICPU, dump_flag), so_name_(so_name), kernel_name_(kernel_name), node_def_(node_def), ext_info_(ext_info), input_data_addrs_(input_data_addrs), - output_data_addrs_(output_data_addrs) {} + output_data_addrs_(output_data_addrs), + cust_aicpu_(cust_aicpu) {} ~AicpuTaskInfo() override {} const std::string &so_name() const { return so_name_; } @@ -135,6 +136,7 @@ class AicpuTaskInfo : public TaskInfo { const std::vector &input_data_addrs() const { return input_data_addrs_; } const std::vector &output_data_addrs() const { return output_data_addrs_; } const std::string &ext_info() const { return ext_info_; } + const bool &cust_aicpu() const { return cust_aicpu_; } private: std::string so_name_; @@ -143,6 +145,7 @@ class AicpuTaskInfo : public TaskInfo { std::string ext_info_; std::vector input_data_addrs_; std::vector output_data_addrs_; + bool cust_aicpu_; }; class LabelSetTaskInfo : public TaskInfo { diff --git a/mindspore/ccsrc/utils/utils.h b/mindspore/ccsrc/utils/utils.h index a52c581e36b..af3963be9d2 100644 --- a/mindspore/ccsrc/utils/utils.h +++ b/mindspore/ccsrc/utils/utils.h @@ -492,6 +492,7 @@ constexpr auto kAttrDstType = "dst_type"; constexpr auto kAttrDump = "dump"; constexpr auto kAttrSkipNopOpAddr = "skip_nop_op_addr"; constexpr auto kAttrFuncType = "func_type"; +constexpr auto kAttrCustAicpu = "cust_aicpu"; // custom operator func type constexpr auto kCustomTypeAOT = "aot"; diff --git a/mindspore/ops/primitive.py b/mindspore/ops/primitive.py index 5f4050cdecc..b66d8aee685 100644 --- a/mindspore/ops/primitive.py +++ b/mindspore/ops/primitive.py @@ -631,6 +631,38 @@ def prim_attr_register(fn): return deco +def custom_aicpu_register(custom_aicpu_so="mindspore_aicpu_kernels"): + """Register custom aicpu attribute. + + Args: + custom_aicpu_so (str): Path of the dynamic library loaded by the aicpu ops. + Default: "mindspore_aicpu_kernels" + """ + + def deco(fn): + def wrapper(self, *args, **kwargs): + if not isinstance(custom_aicpu_so, str): + raise ValueError(f"custom_aicpu_so must be a str, but got {custom_aicpu_so}") + class_name = self.__class__.__name__ + if hasattr(self.__class__, "substitute_name"): + class_name = self.__class__.substitute_name + if isinstance(self, PrimitiveWithInfer): + PrimitiveWithInfer.__init__(self, class_name) + elif isinstance(self, PrimitiveWithCheck): + PrimitiveWithCheck.__init__(self, class_name) + else: + Primitive.__init__(self, self.__class__.__name__) + attr_name = "cust_aicpu" + self.add_prim_attr(attr_name, custom_aicpu_so) + self.init_attrs[attr_name] = custom_aicpu_so + ret = fn(self, *args, **kwargs) + return ret + + return wrapper + + return deco + + def constexpr(fn=None, get_instance=True, name=None): """ Creates a PrimitiveWithInfer operator that can infer the value at compile time. We can use it to define a function diff --git a/tests/ut/cpp/stub/aicpu/aicpu_stub.cc b/tests/ut/cpp/stub/aicpu/aicpu_stub.cc index 5516d1fdc85..7d2b3a3c6e6 100644 --- a/tests/ut/cpp/stub/aicpu/aicpu_stub.cc +++ b/tests/ut/cpp/stub/aicpu/aicpu_stub.cc @@ -14,6 +14,7 @@ * limitations under the License. */ #include "backend/kernel_compiler/kernel.h" +#include "backend/kernel_compiler/aicpu/aicpu_kernel_load.h" namespace mindspore { namespace kernel { @@ -22,5 +23,8 @@ namespace kernel { * @brief build op and return a callable mod */ KernelModPtr AicpuOpBuild(const AnfNodePtr &anf_node) { return nullptr; } + +bool AicpuOpKernelLoad::LaunchAicpuKernelSo() { return true; } +void AicpuOpKernelLoad::FreeDeviceMemory() {} } // namespace kernel } // namespace mindspore