forked from mindspore-Ecosystem/mindspore
!26310 MindSpore support load custom aicpu kernels.
Merge pull request !26310 from linqingke/aicpu
This commit is contained in:
commit
9d6248194e
|
@ -31,6 +31,7 @@
|
|||
#include "proto/node_def.pb.h"
|
||||
#include "backend/session/anf_runtime_algorithm.h"
|
||||
#include "backend/kernel_compiler/aicpu/aicpu_util.h"
|
||||
#include "backend/kernel_compiler/aicpu/aicpu_kernel_load.h"
|
||||
#include "backend/session/kernel_graph.h"
|
||||
#include "backend/kernel_compiler/common_utils.h"
|
||||
#include "backend/kernel_compiler/oplib/oplib.h"
|
||||
|
@ -423,6 +424,11 @@ KernelModPtr AicpuOpBuild(const std::shared_ptr<AnfNode> &anf_node) {
|
|||
if (!SetIOSize(anf_node, kernel_mod_ptr)) {
|
||||
MS_LOG(EXCEPTION) << "Set input output size list failed.";
|
||||
}
|
||||
|
||||
if (!AicpuOpKernelLoad::GetInstance().LoadAicpuKernelSo(anf_node, kernel_mod_ptr)) {
|
||||
MS_LOG(EXCEPTION) << "Aicpu kernel so load failed. task is " << anf_node->fullname_with_scope();
|
||||
}
|
||||
|
||||
return kernel_mod_ptr;
|
||||
}
|
||||
} // namespace kernel
|
||||
|
|
|
@ -0,0 +1,373 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "backend/kernel_compiler/aicpu/aicpu_kernel_load.h"
|
||||
#include <dlfcn.h>
|
||||
#include <unistd.h>
|
||||
#include <utility>
|
||||
#include <string>
|
||||
#include <ios>
|
||||
#include <fstream>
|
||||
#include "runtime/kernel.h"
|
||||
#include "runtime/mem.h"
|
||||
#include "runtime/context.h"
|
||||
#include "utils/utils.h"
|
||||
#include "utils/file_utils.h"
|
||||
#include "backend/session/anf_runtime_algorithm.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
bool AicpuOpKernelLoad::GetBinaryFileName(const std::string &so_name, const std::string &bin_folder_path,
|
||||
std::string *bin_file_path) {
|
||||
MS_EXCEPTION_IF_NULL(bin_file_path);
|
||||
const auto &iter = so_name_and_realpath_map_.find(so_name);
|
||||
if (iter != so_name_and_realpath_map_.end()) {
|
||||
*bin_file_path = iter->second;
|
||||
MS_LOG(INFO) << "so " << so_name << " has bin file path " << bin_file_path;
|
||||
return true;
|
||||
}
|
||||
|
||||
std::string bin_file_name(bin_folder_path);
|
||||
if (bin_file_name.empty()) {
|
||||
bin_file_name = "./";
|
||||
} else if (bin_file_name.back() != '/') {
|
||||
bin_file_name.append("/");
|
||||
}
|
||||
|
||||
bin_file_name += so_name;
|
||||
auto real_file_path = FileUtils::GetRealPath(bin_file_name.c_str());
|
||||
if (!real_file_path.has_value()) {
|
||||
MS_LOG(ERROR) << "Get real path failed, path=" << bin_file_name;
|
||||
return false;
|
||||
}
|
||||
|
||||
auto real_file_path_value = real_file_path.value();
|
||||
if (access(real_file_path_value.c_str(), F_OK) == -1) {
|
||||
MS_LOG(ERROR) << "Kernel so path:" << real_file_path_value << " is not existed!";
|
||||
return false;
|
||||
}
|
||||
|
||||
*bin_file_path = real_file_path_value;
|
||||
so_name_and_realpath_map_[so_name] = *bin_file_path;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AicpuOpKernelLoad::ReadBytesFromBinaryFile(const std::string &file_name, std::vector<char> *buffer) const {
|
||||
std::ifstream file(file_name.c_str(), std::ios::binary | std::ios::ate);
|
||||
if (!file.is_open()) {
|
||||
MS_LOG(ERROR) << "Open file [" << file_name << "] failed";
|
||||
return false;
|
||||
}
|
||||
|
||||
std::streamsize size = file.tellg();
|
||||
if (size <= 0) {
|
||||
file.close();
|
||||
MS_LOG(ERROR) << "Empty file [" << file_name << "], please check this file.";
|
||||
return false;
|
||||
}
|
||||
if (size > INT_MAX) {
|
||||
file.close();
|
||||
MS_LOG(ERROR) << "File [" << file_name << "] size [" << size << "] is out of limit[" << INT_MAX << "]";
|
||||
return false;
|
||||
}
|
||||
|
||||
file.seekg(0, std::ios::beg);
|
||||
buffer->resize(size);
|
||||
file.read(buffer->data(), size);
|
||||
file.close();
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AicpuOpKernelLoad::GetSoNeedLoadPath(const std::string &so_name, std::string *file_path) const {
|
||||
MS_EXCEPTION_IF_NULL(file_path);
|
||||
Dl_info dl_info;
|
||||
if (dladdr(reinterpret_cast<void *>(const_cast<AicpuOpKernelLoad *>(this)), &dl_info) == 0) {
|
||||
MS_LOG(ERROR) << "Get dladdr failed!";
|
||||
return false;
|
||||
}
|
||||
std::string cust_kernel_so_path(dl_info.dli_fname);
|
||||
|
||||
auto pos = cust_kernel_so_path.find_last_of('/');
|
||||
if (cust_kernel_so_path.empty() || pos == std::string::npos) {
|
||||
MS_LOG(ERROR) << "Current path [" << cust_kernel_so_path << "] is invalid.";
|
||||
return false;
|
||||
}
|
||||
auto real_cust_kernel_so_path = cust_kernel_so_path.substr(0, pos) + "/lib/";
|
||||
|
||||
if (real_cust_kernel_so_path.size() > PATH_MAX) {
|
||||
MS_LOG(ERROR) << "Current path [" << real_cust_kernel_so_path << "] is too long.";
|
||||
return false;
|
||||
}
|
||||
|
||||
*file_path = real_cust_kernel_so_path;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AicpuOpKernelLoad::PackageBinaryFile(const std::string &so_name,
|
||||
std::map<std::string, OpKernelBinPtr> *so_name_with_bin_info) {
|
||||
std::string bin_folder_path;
|
||||
bool ret = GetSoNeedLoadPath(so_name, &bin_folder_path);
|
||||
if (!ret) {
|
||||
MS_LOG(ERROR) << "GetSoNeedLoadPath failed.";
|
||||
return false;
|
||||
}
|
||||
|
||||
std::string bin_file_path;
|
||||
ret = GetBinaryFileName(so_name, bin_folder_path, &bin_file_path);
|
||||
if (!ret) {
|
||||
MS_LOG(ERROR) << "GetBinaryFileName failed.";
|
||||
return false;
|
||||
}
|
||||
|
||||
std::vector<char> buffer;
|
||||
ret = ReadBytesFromBinaryFile(bin_file_path, &buffer);
|
||||
if (!ret) {
|
||||
MS_LOG(ERROR) << "ReadBytesFromBinaryFile failed.";
|
||||
return false;
|
||||
}
|
||||
|
||||
OpKernelBinPtr cust_aicpu_kernel_ptr = std::make_shared<OpKernelBin>(so_name, std::move(buffer));
|
||||
if (cust_aicpu_kernel_ptr == nullptr) {
|
||||
MS_LOG(ERROR) << "Create OpKernelBin object failed.";
|
||||
return false;
|
||||
}
|
||||
so_name_with_bin_info->insert({so_name, cust_aicpu_kernel_ptr});
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AicpuOpKernelLoad::LoadAicpuKernelSo(const AnfNodePtr &node,
|
||||
const std::shared_ptr<AicpuOpKernelMod> &kernel_mod_ptr) {
|
||||
std::lock_guard<std::mutex> lock(cust_aicpu_mutex_);
|
||||
MS_EXCEPTION_IF_NULL(node);
|
||||
MS_EXCEPTION_IF_NULL(kernel_mod_ptr);
|
||||
CNodePtr cnode = node->cast<CNodePtr>();
|
||||
MS_EXCEPTION_IF_NULL(cnode);
|
||||
if (!AnfAlgo::HasNodeAttr(kAttrCustAicpu, cnode)) {
|
||||
MS_LOG(INFO) << "Current aicpu ops:" << cnode->fullname_with_scope() << " isn't a custom ops.";
|
||||
return true;
|
||||
}
|
||||
|
||||
std::string so_name = "lib" + AnfAlgo::GetNodeAttr<std::string>(cnode, kAttrCustAicpu) + ".so";
|
||||
if (so_name == kLibAicpuKernelSoName || so_name == kLibCpuKernelSoName) {
|
||||
MS_LOG(INFO) << "Aicpu so:" << so_name << " is default so.";
|
||||
return true;
|
||||
}
|
||||
|
||||
kernel_mod_ptr->SetCustSo(so_name);
|
||||
rtContext_t rt_cur_ctx = nullptr;
|
||||
auto rt_error = rtCtxGetCurrent(&rt_cur_ctx);
|
||||
if (rt_error != RT_ERROR_NONE) {
|
||||
MS_LOG(ERROR) << "Call rtCtxGetCurrent failed, ret = 0x" << rt_error;
|
||||
return false;
|
||||
}
|
||||
// use current context as resource key
|
||||
uintptr_t resource_id = reinterpret_cast<uintptr_t>(rt_cur_ctx);
|
||||
auto it = cust_aicpu_so_.find(resource_id);
|
||||
if (it != cust_aicpu_so_.end()) {
|
||||
auto it_so_name = it->second.find(so_name);
|
||||
if (it_so_name != it->second.end()) {
|
||||
MS_LOG(INFO) << "Cust aicpu so:" << so_name << " has been loaded.";
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
std::map<std::string, OpKernelBinPtr> so_name_with_bin_info;
|
||||
if (!PackageBinaryFile(so_name, &so_name_with_bin_info)) {
|
||||
MS_LOG(ERROR) << "Package binary file failed.";
|
||||
return false;
|
||||
}
|
||||
|
||||
if (it == cust_aicpu_so_.end()) {
|
||||
cust_aicpu_so_[resource_id] = so_name_with_bin_info;
|
||||
MS_LOG(INFO) << "Load new aicpu so:" << so_name << "success, resource id:" << resource_id << ".";
|
||||
return true;
|
||||
}
|
||||
auto it_so_name = it->second.find(so_name);
|
||||
if (it_so_name == it->second.end()) {
|
||||
it->second.insert(so_name_with_bin_info.begin(), so_name_with_bin_info.end());
|
||||
MS_LOG(INFO) << "Load cust aicpu so:" << so_name << "success, resource id:" << resource_id << ".";
|
||||
return true;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AicpuOpKernelLoad::CacheBinaryFileToDevice(const uintptr_t &resource_id, std::vector<void *> *allocated_mem,
|
||||
void **batch_args) {
|
||||
auto it = cust_aicpu_so_.find(resource_id);
|
||||
if (it == cust_aicpu_so_.end()) {
|
||||
MS_LOG(ERROR) << "Context id:" << resource_id << " is invalid.";
|
||||
return false;
|
||||
}
|
||||
|
||||
rtError_t status;
|
||||
std::vector<CustAicpuSoBuf> v_cust_so;
|
||||
for (const auto &it_so : it->second) {
|
||||
const auto &so_name = it_so.first;
|
||||
const void *aicpu_data = it_so.second->GetBinData();
|
||||
uint32_t aicpu_data_length = it_so.second->GetBinDataSize();
|
||||
void *d_aicpu_data = nullptr;
|
||||
void *d_so_name = nullptr;
|
||||
|
||||
status = rtMalloc(&d_aicpu_data, aicpu_data_length, RT_MEMORY_HBM);
|
||||
if (status != RT_ERROR_NONE) {
|
||||
MS_LOG(ERROR) << "Call rtMalloc failed, size:" << aicpu_data_length << ", ret = 0x" << status;
|
||||
return false;
|
||||
}
|
||||
allocated_mem->emplace_back(d_aicpu_data);
|
||||
|
||||
status = rtMalloc(&d_so_name, so_name.size(), RT_MEMORY_HBM);
|
||||
if (status != RT_ERROR_NONE) {
|
||||
MS_LOG(ERROR) << "Call rtMalloc failed, size:" << so_name.size() << ", ret = 0x" << status;
|
||||
return false;
|
||||
}
|
||||
allocated_mem->emplace_back(d_so_name);
|
||||
|
||||
status = rtMemcpy(d_aicpu_data, aicpu_data_length, aicpu_data, aicpu_data_length, RT_MEMCPY_HOST_TO_DEVICE);
|
||||
if (status != RT_ERROR_NONE) {
|
||||
MS_LOG(ERROR) << "Call rtMemcpy failed, ret = 0x" << status;
|
||||
return false;
|
||||
}
|
||||
|
||||
status = rtMemcpy(d_so_name, so_name.size(), reinterpret_cast<const void *>(so_name.c_str()), so_name.size(),
|
||||
RT_MEMCPY_HOST_TO_DEVICE);
|
||||
if (status != RT_ERROR_NONE) {
|
||||
MS_LOG(ERROR) << "Call rtMemcpy failed, ret = 0x" << status;
|
||||
return false;
|
||||
}
|
||||
|
||||
CustAicpuSoBuf cust_aicpu_so_buf;
|
||||
cust_aicpu_so_buf.kernelSoBuf = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(d_aicpu_data));
|
||||
cust_aicpu_so_buf.kernelSoBufLen = aicpu_data_length;
|
||||
cust_aicpu_so_buf.kernelSoName = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(d_so_name));
|
||||
cust_aicpu_so_buf.kernelSoNameLen = so_name.size();
|
||||
v_cust_so.emplace_back(cust_aicpu_so_buf);
|
||||
}
|
||||
|
||||
void *args = nullptr;
|
||||
uint32_t args_size = sizeof(CustAicpuSoBuf) * v_cust_so.size();
|
||||
status = rtMalloc(&args, args_size, RT_MEMORY_HBM);
|
||||
if (status != RT_ERROR_NONE) {
|
||||
MS_LOG(ERROR) << "Call rtMalloc failed, size:" << args_size << ", ret = 0x" << status;
|
||||
return false;
|
||||
}
|
||||
allocated_mem->emplace_back(args);
|
||||
status = rtMemcpy(args, args_size, v_cust_so.data(), args_size, RT_MEMCPY_HOST_TO_DEVICE);
|
||||
if (status != RT_ERROR_NONE) {
|
||||
MS_LOG(ERROR) << "Call rtMemcpy failed, ret = 0x" << status;
|
||||
return false;
|
||||
}
|
||||
|
||||
BatchLoadOpFromBufArgs batch_cust_so;
|
||||
batch_cust_so.soNum = v_cust_so.size();
|
||||
batch_cust_so.args = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(args));
|
||||
|
||||
uint32_t batch_args_size = sizeof(BatchLoadOpFromBufArgs);
|
||||
status = rtMalloc(batch_args, batch_args_size, RT_MEMORY_HBM);
|
||||
if (status != RT_ERROR_NONE) {
|
||||
MS_LOG(ERROR) << "Call rtMalloc failed, size:" << batch_args_size << ", ret = 0x" << status;
|
||||
return false;
|
||||
}
|
||||
allocated_mem->emplace_back(*batch_args);
|
||||
status = rtMemcpy(*batch_args, batch_args_size, static_cast<void *>(&batch_cust_so), batch_args_size,
|
||||
RT_MEMCPY_HOST_TO_DEVICE);
|
||||
if (status != RT_ERROR_NONE) {
|
||||
MS_LOG(ERROR) << "Call rtMemcpy failed, ret = 0x" << status;
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AicpuOpKernelLoad::LaunchAicpuKernelSo() {
|
||||
std::lock_guard<std::mutex> lock(cust_aicpu_mutex_);
|
||||
if (cust_aicpu_so_.empty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
rtContext_t rt_cur_ctx = nullptr;
|
||||
rtError_t status = RT_ERROR_NONE;
|
||||
status = rtCtxGetCurrent(&rt_cur_ctx);
|
||||
if (status != RT_ERROR_NONE) {
|
||||
MS_LOG(ERROR) << "Call rtCtxGetCurrent failed, ret = 0x" << status;
|
||||
return false;
|
||||
}
|
||||
// use current context as resource key
|
||||
uintptr_t resource_id = reinterpret_cast<uintptr_t>(rt_cur_ctx);
|
||||
auto it = cust_aicpu_so_.find(resource_id);
|
||||
if (it == cust_aicpu_so_.end()) {
|
||||
MS_LOG(INFO) << "Cust aicpu so map is empty, context id:" << resource_id;
|
||||
return true;
|
||||
}
|
||||
|
||||
std::vector<void *> allocated_mem;
|
||||
void *batch_args = nullptr;
|
||||
uint32_t batch_args_size = sizeof(BatchLoadOpFromBufArgs);
|
||||
bool ret = CacheBinaryFileToDevice(resource_id, &allocated_mem, &batch_args);
|
||||
allocated_mem_list_.emplace_back(std::move(allocated_mem));
|
||||
if (!ret) {
|
||||
MS_LOG(ERROR) << "CacheBinaryFileToDevice is failed.";
|
||||
return false;
|
||||
}
|
||||
|
||||
rtStream_t stream = nullptr;
|
||||
status = rtStreamCreate(&stream, 0);
|
||||
if (status != RT_ERROR_NONE) {
|
||||
MS_LOG(ERROR) << "Call rtStreamCreate failed, ret = 0x" << status;
|
||||
return false;
|
||||
}
|
||||
stream_list_.emplace_back(stream);
|
||||
// launch "batchLoadsoFrombuf" event to device.
|
||||
std::string load_event(kBatchLoadBuf);
|
||||
status = rtCpuKernelLaunch(nullptr, load_event.c_str(), 1, batch_args, batch_args_size, nullptr, stream);
|
||||
if (status != RT_ERROR_NONE) {
|
||||
MS_LOG(ERROR) << "Call rtCpuKernelLaunch failed, ret = 0x" << status;
|
||||
return false;
|
||||
}
|
||||
status = rtStreamSynchronize(stream);
|
||||
if (status != RT_ERROR_NONE) {
|
||||
MS_LOG(ERROR) << "Call rtStreamSynchronize failed, ret = 0x" << status;
|
||||
return false;
|
||||
}
|
||||
|
||||
MS_LOG(INFO) << "Aicpu kernel so launch success.";
|
||||
return true;
|
||||
}
|
||||
|
||||
void AicpuOpKernelLoad::FreeDeviceMemory() {
|
||||
for (auto allocated_mem : allocated_mem_list_) {
|
||||
for (auto mem : allocated_mem) {
|
||||
if (mem == nullptr) {
|
||||
continue;
|
||||
}
|
||||
auto rt_error = rtFree(mem);
|
||||
if (rt_error != RT_ERROR_NONE) {
|
||||
MS_LOG(EXCEPTION) << "Call rtFree failed, ret = 0x" << rt_error;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto stream : stream_list_) {
|
||||
if (stream != nullptr) {
|
||||
auto rt_error = rtStreamDestroy(stream);
|
||||
if (rt_error != RT_ERROR_NONE) {
|
||||
MS_LOG(EXCEPTION) << "Call rtStreamDestroy failed, ret = 0x" << rt_error;
|
||||
}
|
||||
}
|
||||
}
|
||||
so_name_and_realpath_map_.clear();
|
||||
cust_aicpu_so_.clear();
|
||||
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,78 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AICPU_AICPU_KERNEL_LOAD_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AICPU_AICPU_KERNEL_LOAD_H_
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
|
||||
#include "runtime/base.h"
|
||||
#include "base/base.h"
|
||||
#include "ir/anf.h"
|
||||
#include "backend/kernel_compiler/aicpu/aicpu_util.h"
|
||||
#include "backend/kernel_compiler/aicpu/aicpu_kernel_mod.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
constexpr auto kBatchLoadBuf = "batchLoadsoFrombuf";
|
||||
|
||||
// The two structures below are copied to device memory byte for byte, so they are packed to 1 byte
// and their sizes are pinned with static_assert.
#pragma pack(push, 1)
// Describes one custom aicpu library that has been copied to device memory.
struct CustAicpuSoBuf {
  uint64_t kernelSoBuf{0};      // device address of the library content
  uint32_t kernelSoBufLen{0};   // size of the library content in bytes
  uint64_t kernelSoName{0};     // device address of the library name (not '\0' terminated)
  uint32_t kernelSoNameLen{0};  // length of the library name in bytes
};

// Argument of the batch-load event: an array of CustAicpuSoBuf in device memory.
struct BatchLoadOpFromBufArgs {
  uint32_t soNum{0};  // number of CustAicpuSoBuf entries
  uint64_t args{0};   // device address of the first CustAicpuSoBuf
};
#pragma pack(pop)

static_assert(sizeof(CustAicpuSoBuf) == 24, "CustAicpuSoBuf must stay packed to 24 bytes");
static_assert(sizeof(BatchLoadOpFromBufArgs) == 12, "BatchLoadOpFromBufArgs must stay packed to 12 bytes");
|
||||
|
||||
class AicpuOpKernelLoad {
|
||||
public:
|
||||
AicpuOpKernelLoad() = default;
|
||||
~AicpuOpKernelLoad() = default;
|
||||
|
||||
static AicpuOpKernelLoad &GetInstance() {
|
||||
static AicpuOpKernelLoad instance;
|
||||
return instance;
|
||||
}
|
||||
|
||||
bool LaunchAicpuKernelSo();
|
||||
bool LoadAicpuKernelSo(const AnfNodePtr &node, const std::shared_ptr<AicpuOpKernelMod> &kernel_mod_ptr);
|
||||
void FreeDeviceMemory();
|
||||
|
||||
private:
|
||||
bool GetBinaryFileName(const std::string &so_name, const std::string &bin_folder_path, std::string *bin_file_path);
|
||||
bool ReadBytesFromBinaryFile(const std::string &file_name, std::vector<char> *buffer) const;
|
||||
bool GetSoNeedLoadPath(const std::string &so_name, std::string *file_path) const;
|
||||
bool PackageBinaryFile(const std::string &so_name, std::map<std::string, OpKernelBinPtr> *so_name_with_bin_info);
|
||||
bool CacheBinaryFileToDevice(const uintptr_t &resource_id, std::vector<void *> *allocated_mem, void **batch_args);
|
||||
|
||||
std::map<std::string, std::string> so_name_and_realpath_map_;
|
||||
std::map<uintptr_t, std::map<std::string, OpKernelBinPtr>> cust_aicpu_so_;
|
||||
std::mutex cust_aicpu_mutex_;
|
||||
std::vector<rtStream_t> stream_list_;
|
||||
std::vector<std::vector<void *>> allocated_mem_list_;
|
||||
};
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AICPU_AICPU_KERNEL_LOAD_H_
|
|
@ -36,9 +36,6 @@ using HostDynamicKernel = mindspore::device::ascend::HostDynamicKernel;
|
|||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
constexpr auto AICPU_OPS_SO_NAME = "libaicpu_kernels.so";
|
||||
constexpr auto CUST_AICPU_OPS_SO_NAME = "libcpu_kernels.so";
|
||||
|
||||
AicpuOpKernelMod::AicpuOpKernelMod() : anf_node_(nullptr) {}
|
||||
|
||||
AicpuOpKernelMod::~AicpuOpKernelMod() {
|
||||
|
@ -63,6 +60,10 @@ void AicpuOpKernelMod::SetOutputList(const std::vector<int64_t> &outputList) { o
|
|||
void AicpuOpKernelMod::SetNodeDef(const std::string &nodeDef) { (void)node_def_str_.assign(nodeDef); }
|
||||
void AicpuOpKernelMod::SetExtInfo(const std::string &ext_info) { ext_info_ = ext_info; }
|
||||
void AicpuOpKernelMod::SetNodeName(const std::string &node_name) { node_name_ = node_name; }
|
||||
// Record the custom library this kernel is launched from and flag the kernel as a custom aicpu kernel.
void AicpuOpKernelMod::SetCustSo(const std::string &cust_so) {
  (void)node_so_.assign(cust_so);
  cust_kernel_ = true;
}
|
||||
void AicpuOpKernelMod::SetAnfNode(const mindspore::AnfNodePtr &anf_node) {
|
||||
MS_EXCEPTION_IF_NULL(anf_node);
|
||||
anf_node_ = anf_node;
|
||||
|
@ -72,15 +73,17 @@ void AicpuOpKernelMod::CreateCpuKernelInfo(const std::vector<AddressPtr> &inputs
|
|||
const std::vector<AddressPtr> &outputs) {
|
||||
MS_LOG(INFO) << "CreateCpuKernelInfoOffline start";
|
||||
|
||||
if (kCustAiCpuKernelOps.find(node_name_) != kCustAiCpuKernelOps.end()) {
|
||||
node_so_ = CUST_AICPU_OPS_SO_NAME;
|
||||
node_name_ = kCustRunApi;
|
||||
if (!cust_kernel_) {
|
||||
if (kCpuKernelOps.find(node_name_) != kCpuKernelOps.end()) {
|
||||
node_so_ = kLibCpuKernelSoName;
|
||||
node_name_ = kCpuRunApi;
|
||||
} else if (kCacheKernelOps.find(node_name_) != kCacheKernelOps.end()) {
|
||||
node_so_ = AICPU_OPS_SO_NAME;
|
||||
node_name_ = kCustRunApi;
|
||||
node_so_ = kLibAicpuKernelSoName;
|
||||
node_name_ = kCpuRunApi;
|
||||
} else {
|
||||
if (node_so_ != CUST_AICPU_OPS_SO_NAME) {
|
||||
node_so_ = AICPU_OPS_SO_NAME;
|
||||
if (node_so_ != kLibCpuKernelSoName) {
|
||||
node_so_ = kLibAicpuKernelSoName;
|
||||
}
|
||||
}
|
||||
}
|
||||
// InputOutputAddr
|
||||
|
@ -149,12 +152,16 @@ bool AicpuOpKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::
|
|||
if (node_name_ == kStack) {
|
||||
node_name_ = kPack;
|
||||
}
|
||||
auto flag = RT_KERNEL_DEFAULT;
|
||||
if (cust_kernel_) {
|
||||
flag = RT_KERNEL_CUSTOM_AICPU;
|
||||
}
|
||||
MS_LOG(INFO) << "Aicpu launch, node_so_:" << node_so_ << ", node name:" << node_name_
|
||||
<< ", args_size:" << args_.length();
|
||||
if (rtCpuKernelLaunch(reinterpret_cast<const void *>(node_so_.c_str()),
|
||||
if (rtCpuKernelLaunchWithFlag(reinterpret_cast<const void *>(node_so_.c_str()),
|
||||
reinterpret_cast<const void *>(node_name_.c_str()), 1,
|
||||
reinterpret_cast<const void *>(args_.data()), static_cast<uint32_t>(args_.length()), nullptr,
|
||||
stream_) != RT_ERROR_NONE) {
|
||||
reinterpret_cast<const void *>(args_.data()), static_cast<uint32_t>(args_.length()),
|
||||
nullptr, stream_, flag) != RT_ERROR_NONE) {
|
||||
MS_LOG(ERROR) << "Aicpu op launch failed!";
|
||||
|
||||
return false;
|
||||
|
@ -168,15 +175,17 @@ std::vector<TaskInfoPtr> AicpuOpKernelMod::GenTask(const std::vector<AddressPtr>
|
|||
MS_LOG(INFO) << "AicpuOpKernelMod GenTask start";
|
||||
|
||||
stream_id_ = stream_id;
|
||||
if (kCustAiCpuKernelOps.find(node_name_) != kCustAiCpuKernelOps.end()) {
|
||||
node_so_ = CUST_AICPU_OPS_SO_NAME;
|
||||
node_name_ = kCustRunApi;
|
||||
if (!cust_kernel_) {
|
||||
if (kCpuKernelOps.find(node_name_) != kCpuKernelOps.end()) {
|
||||
node_so_ = kLibCpuKernelSoName;
|
||||
node_name_ = kCpuRunApi;
|
||||
} else if (kCacheKernelOps.find(node_name_) != kCacheKernelOps.end()) {
|
||||
node_so_ = AICPU_OPS_SO_NAME;
|
||||
node_name_ = kCustRunApi;
|
||||
node_so_ = kLibAicpuKernelSoName;
|
||||
node_name_ = kCpuRunApi;
|
||||
} else {
|
||||
if (node_so_ != CUST_AICPU_OPS_SO_NAME) {
|
||||
node_so_ = AICPU_OPS_SO_NAME;
|
||||
if (node_so_ != kLibCpuKernelSoName) {
|
||||
node_so_ = kLibAicpuKernelSoName;
|
||||
}
|
||||
}
|
||||
}
|
||||
std::vector<void *> input_data_addrs;
|
||||
|
@ -197,7 +206,7 @@ std::vector<TaskInfoPtr> AicpuOpKernelMod::GenTask(const std::vector<AddressPtr>
|
|||
|
||||
AicpuTaskInfoPtr task_info_ptr = std::make_shared<mindspore::ge::model_runner::AicpuTaskInfo>(
|
||||
unique_name_, stream_id, node_so_, node_name_, node_def_str_, ext_info_, input_data_addrs, output_data_addrs,
|
||||
NeedDump());
|
||||
NeedDump(), cust_kernel_);
|
||||
|
||||
MS_LOG(INFO) << "AicpuOpKernelMod GenTask end";
|
||||
return {task_info_ptr};
|
||||
|
|
|
@ -39,6 +39,7 @@ class AicpuOpKernelMod : public AscendKernelMod {
|
|||
void SetNodeDef(const std::string &nodeDef);
|
||||
void SetExtInfo(const std::string &ext_info);
|
||||
void SetNodeName(const std::string &node_name);
|
||||
void SetCustSo(const std::string &cust_so);
|
||||
|
||||
/**
|
||||
* @brief Build AICPU Engine kernel structure, and allocate device memory for offline task generate
|
||||
|
@ -56,6 +57,7 @@ class AicpuOpKernelMod : public AscendKernelMod {
|
|||
const std::vector<size_t> &GetWorkspaceSizeList() const override;
|
||||
|
||||
private:
|
||||
bool cust_kernel_{false};
|
||||
std::string args_;
|
||||
std::string node_def_str_;
|
||||
std::string node_name_;
|
||||
|
|
|
@ -17,6 +17,8 @@
|
|||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AICPU_AICPU_UTIL_H_
|
||||
|
||||
#include <cstdint>
|
||||
#include <utility>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <set>
|
||||
|
@ -24,6 +26,8 @@
|
|||
#include "backend/kernel_compiler/kernel.h"
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
constexpr auto kLibAicpuKernelSoName = "libaicpu_kernels.so";
|
||||
constexpr auto kLibCpuKernelSoName = "libcpu_kernels.so";
|
||||
constexpr auto kInitDataSetQueue = "InitDataSetQueue";
|
||||
constexpr auto kInitData = "InitData";
|
||||
constexpr auto kGetNext = "GetNext";
|
||||
|
@ -55,7 +59,7 @@ constexpr auto kUpdateCache = "UpdateCache";
|
|||
constexpr auto kCacheSwapTable = "CacheSwapTable";
|
||||
constexpr auto kSubAndFilter = "SubAndFilter";
|
||||
constexpr auto kPadAndShift = "PadAndShift";
|
||||
constexpr auto kCustRunApi = "RunCpuKernel";
|
||||
constexpr auto kCpuRunApi = "RunCpuKernel";
|
||||
constexpr auto kDropout2D = "Dropout2D";
|
||||
constexpr auto kDropout3D = "Dropout3D";
|
||||
constexpr auto kMaskedSelect = "MaskedSelect";
|
||||
|
@ -65,7 +69,7 @@ constexpr auto kSearchSorted = "SearchSorted";
|
|||
constexpr auto kResizeBilinear = "ResizeBilinear";
|
||||
constexpr auto kResizeBilinearGrad = "ResizeBilinearGrad";
|
||||
constexpr auto kScatterElements = "ScatterElements";
|
||||
const std::set<std::string> kCustAiCpuKernelOps{kIdentity, kMaskedSelect, kMaskedSelectGrad, kDynamicStitch,
|
||||
const std::set<std::string> kCpuKernelOps{kIdentity, kMaskedSelect, kMaskedSelectGrad, kDynamicStitch,
|
||||
kSearchSorted, kResizeBilinear, kResizeBilinearGrad, kScatterElements};
|
||||
const std::set<std::string> kCacheKernelOps{kUpdateCache, kCacheSwapTable, kSubAndFilter,
|
||||
kPadAndShift, kDropout3D, kDropout2D};
|
||||
|
@ -118,6 +122,24 @@ class AicpuOpUtil {
|
|||
// kernel id
|
||||
static uint64_t KernelId_;
|
||||
};
|
||||
|
||||
/**
 * Owns the raw bytes of a kernel binary together with its name. Copying is disabled.
 */
class OpKernelBin {
 public:
  /**
   * @param name Name of the binary.
   * @param data Binary content; ownership of the buffer is taken over by this object.
   */
  OpKernelBin(std::string name, std::vector<char> &&data) : name_(std::move(name)), data_(std::move(data)) {}
  ~OpKernelBin() = default;

  const std::string &GetName() const { return name_; }
  // Pointer to the first byte of the content; valid for as long as this object lives.
  const uint8_t *GetBinData() const { return reinterpret_cast<const uint8_t *>(data_.data()); }
  // Size of the content in bytes.
  size_t GetBinDataSize() const { return data_.size(); }
  OpKernelBin(const OpKernelBin &) = delete;
  const OpKernelBin &operator=(const OpKernelBin &) = delete;

 private:
  std::string name_;
  std::vector<char> data_;
};
|
||||
|
||||
using OpKernelBinPtr = std::shared_ptr<OpKernelBin>;
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
||||
|
|
|
@ -34,6 +34,7 @@
|
|||
#include "runtime/device/ascend/tasksink/task_generator.h"
|
||||
#include "backend/session/anf_runtime_algorithm.h"
|
||||
#include "backend/session/kernel_build_client.h"
|
||||
#include "backend/kernel_compiler/aicpu/aicpu_kernel_load.h"
|
||||
#ifndef ENABLE_SECURITY
|
||||
#include "runtime/device/ascend/profiling/profiling_manager.h"
|
||||
#include "runtime/device/ascend/profiling/profiling_utils.h"
|
||||
|
@ -286,6 +287,7 @@ void AscendKernelRuntime::ReleaseDeviceRes() {
|
|||
if (mem_manager_ != nullptr) {
|
||||
mem_manager_->FreeDeviceMemory();
|
||||
}
|
||||
mindspore::kernel::AicpuOpKernelLoad::GetInstance().FreeDeviceMemory();
|
||||
|
||||
auto rt_ret = rtRegTaskFailCallbackByModule(kModuleName, nullptr);
|
||||
if (rt_ret != RT_ERROR_NONE) {
|
||||
|
@ -438,6 +440,9 @@ bool AscendKernelRuntime::Load(const session::KernelGraph &graph, bool is_task_s
|
|||
if (!LoadTask(graph)) {
|
||||
return false;
|
||||
}
|
||||
if (!mindspore::kernel::AicpuOpKernelLoad::GetInstance().LaunchAicpuKernelSo()) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -73,13 +73,14 @@ void AicpuTask::Distribute() {
|
|||
// for data dump
|
||||
input_output_addr_ = reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(args_) + io_addr_offset);
|
||||
auto dump_flag = task_info_->dump_flag() ? RT_KERNEL_DUMPFLAG : RT_KERNEL_DEFAULT;
|
||||
auto cpu_flag = task_info_->cust_aicpu() ? RT_KERNEL_CUSTOM_AICPU : dump_flag;
|
||||
|
||||
MS_LOG(INFO) << "Distribute AicpuTask start, args_size = " << args_size << ", io_addrs_num =" << io_addrs_num
|
||||
<< ", so_name = " << task_info_->so_name() << ", kernel_name = " << task_info_->kernel_name()
|
||||
<< ", dump_flag = " << dump_flag;
|
||||
rt_ret = rtCpuKernelLaunchWithFlag(reinterpret_cast<const void *>(task_info_->so_name().data()),
|
||||
reinterpret_cast<const void *>(task_info_->kernel_name().data()), 1, args_,
|
||||
args_size, nullptr, stream_, dump_flag);
|
||||
args_size, nullptr, stream_, cpu_flag);
|
||||
if (rt_ret != RT_ERROR_NONE) {
|
||||
MS_LOG(EXCEPTION) << "Call rt api rtCpuKernelLaunchWithFlag failed, ret: " << rt_ret;
|
||||
}
|
||||
|
|
|
@ -119,14 +119,15 @@ class AicpuTaskInfo : public TaskInfo {
|
|||
AicpuTaskInfo(const std::string &op_name, uint32_t stream_id, const std::string &so_name,
|
||||
const std::string &kernel_name, const std::string &node_def, const std::string &ext_info,
|
||||
const std::vector<void *> &input_data_addrs, const std::vector<void *> &output_data_addrs,
|
||||
bool dump_flag)
|
||||
bool dump_flag, bool cust_aicpu = false)
|
||||
: TaskInfo(op_name, stream_id, TaskInfoType::AICPU, dump_flag),
|
||||
so_name_(so_name),
|
||||
kernel_name_(kernel_name),
|
||||
node_def_(node_def),
|
||||
ext_info_(ext_info),
|
||||
input_data_addrs_(input_data_addrs),
|
||||
output_data_addrs_(output_data_addrs) {}
|
||||
output_data_addrs_(output_data_addrs),
|
||||
cust_aicpu_(cust_aicpu) {}
|
||||
~AicpuTaskInfo() override {}
|
||||
|
||||
const std::string &so_name() const { return so_name_; }
|
||||
|
@ -135,6 +136,7 @@ class AicpuTaskInfo : public TaskInfo {
|
|||
const std::vector<void *> &input_data_addrs() const { return input_data_addrs_; }
|
||||
const std::vector<void *> &output_data_addrs() const { return output_data_addrs_; }
|
||||
const std::string &ext_info() const { return ext_info_; }
|
||||
const bool &cust_aicpu() const { return cust_aicpu_; }
|
||||
|
||||
private:
|
||||
std::string so_name_;
|
||||
|
@ -143,6 +145,7 @@ class AicpuTaskInfo : public TaskInfo {
|
|||
std::string ext_info_;
|
||||
std::vector<void *> input_data_addrs_;
|
||||
std::vector<void *> output_data_addrs_;
|
||||
bool cust_aicpu_;
|
||||
};
|
||||
|
||||
class LabelSetTaskInfo : public TaskInfo {
|
||||
|
|
|
@ -493,6 +493,7 @@ constexpr auto kAttrDstType = "dst_type";
|
|||
constexpr auto kAttrDump = "dump";
|
||||
constexpr auto kAttrSkipNopOpAddr = "skip_nop_op_addr";
|
||||
constexpr auto kAttrFuncType = "func_type";
|
||||
constexpr auto kAttrCustAicpu = "cust_aicpu";
|
||||
|
||||
// custom operator func type
|
||||
constexpr auto kCustomTypeAOT = "aot";
|
||||
|
|
|
@ -631,6 +631,38 @@ def prim_attr_register(fn):
|
|||
return deco
|
||||
|
||||
|
||||
def custom_aicpu_register(custom_aicpu_so="mindspore_aicpu_kernels"):
    """Decorator factory that tags a primitive's ``__init__`` with the ``cust_aicpu`` attribute.

    The wrapped initializer first runs the matching base initializer (``PrimitiveWithInfer``,
    ``PrimitiveWithCheck`` or ``Primitive``), then stores `custom_aicpu_so` both as the primitive
    attribute ``cust_aicpu`` and in ``init_attrs``, and finally calls the decorated function.

    Args:
        custom_aicpu_so (str): Value stored in the ``cust_aicpu`` attribute, identifying the dynamic
            library loaded by the aicpu ops. Default: "mindspore_aicpu_kernels".

    Raises:
        ValueError: When the wrapped initializer runs and `custom_aicpu_so` is not a str.
    """

    def deco(fn):
        def wrapper(self, *args, **kwargs):
            if not isinstance(custom_aicpu_so, str):
                raise ValueError(f"custom_aicpu_so must be a str, but got {custom_aicpu_so}")

            cls = self.__class__
            # A class may publish another operator name through `substitute_name`.
            class_name = cls.substitute_name if hasattr(cls, "substitute_name") else cls.__name__
            if isinstance(self, PrimitiveWithInfer):
                PrimitiveWithInfer.__init__(self, class_name)
            elif isinstance(self, PrimitiveWithCheck):
                PrimitiveWithCheck.__init__(self, class_name)
            else:
                # Plain primitives are always registered under their own class name.
                Primitive.__init__(self, cls.__name__)

            attr_name = "cust_aicpu"
            self.add_prim_attr(attr_name, custom_aicpu_so)
            self.init_attrs[attr_name] = custom_aicpu_so
            return fn(self, *args, **kwargs)

        return wrapper

    return deco
|
||||
|
||||
|
||||
def constexpr(fn=None, get_instance=True, name=None):
|
||||
"""
|
||||
Creates a PrimitiveWithInfer operator that can infer the value at compile time. We can use it to define a function
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
#include "backend/kernel_compiler/kernel.h"
|
||||
#include "backend/kernel_compiler/aicpu/aicpu_kernel_load.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
|
@ -22,5 +23,8 @@ namespace kernel {
|
|||
* @brief build op and return a callable mod
|
||||
*/
|
||||
KernelModPtr AicpuOpBuild(const AnfNodePtr &anf_node) { return nullptr; }
|
||||
|
||||
bool AicpuOpKernelLoad::LaunchAicpuKernelSo() { return true; }
|
||||
void AicpuOpKernelLoad::FreeDeviceMemory() {}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
Loading…
Reference in New Issue