forked from mindspore-Ecosystem/mindspore

ascend device context

This commit is contained in:
parent fc4f8812a5
commit 07e5ed9f16
@@ -192,8 +192,11 @@ void GenOpOutputStubTensor(const KernelGraphPtr &single_op_graph, const CNodePtr
device_info.format_ = output_format;
device_info.data_type_ = TypeIdToType(output_type);
stub_output_tensor->set_device_info(device_info);
device::DeviceAddressPtr device_address =
std::make_shared<device::ascend::AscendDeviceAddress>(nullptr, 0, output_format, output_type);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
device::DeviceAddressPtr device_address = std::make_shared<device::ascend::AscendDeviceAddress>(
nullptr, 0, output_format, output_type, kAscendDevice, device_id);
stub_output_tensor->set_device_address(device_address);
output_tensor_info.output_stub_tensor = stub_output_tensor;
auto kernel_info = dynamic_cast<const device::KernelInfo *>(output_node->kernel_info());
@@ -721,7 +724,7 @@ void AscendSession::BatchBuildKernel(const std::vector<std::shared_ptr<SessionTa

std::vector<CNodePtr> atomic_node_to_build;
for (auto &graph : graphs) {
device::ascend::InsertAtomicCleanOp(graph);
device::ascend::InsertAtomicCleanOps(graph);
const auto &nodes = graph->execution_order();
std::copy(nodes.begin(), nodes.end(), std::back_inserter(atomic_node_to_build));
}
@@ -998,10 +1001,10 @@ void AscendSession::BuildOpsInGraph(const GraphId &graph_id, const std::map<AnfN
InitRuntimeResource();
// Compile all kernels parallel
BuildKernel(kernels);
// Some new kernel may be added after InsertAtomicCleanOp, so collect and build kernels again
// Some new kernel may be added after InsertAtomicCleanOps, so collect and build kernels again
kernels.clear();
for (const auto &graph_item : single_op_graphs) {
device::ascend::InsertAtomicCleanOp(graph_item.first);
device::ascend::InsertAtomicCleanOps(graph_item.first);
const auto &execution_order = graph_item.first->execution_order();
std::copy(execution_order.begin(), execution_order.end(), std::back_inserter(kernels));
}
@@ -1078,7 +1081,7 @@ void AscendSession::AdjustKernel(const std::shared_ptr<KernelGraph> &kernel_grap
// Insert CLearZero op
// prepare for next step from json get atomic info
BuildKernel(kernel_graph);
device::ascend::InsertAtomicCleanOp(kernel_graph);
device::ascend::InsertAtomicCleanOps(kernel_graph);
device::KernelAdjust::GetInstance().InsertDeviceLoopCtrl(kernel_graph);
device::KernelAdjust::GetInstance().ProcessLoopSink(kernel_graph);
#ifdef ENABLE_DUMP_IR
@@ -1098,7 +1101,7 @@ void AscendSession::RunOpAdjustKernel(const std::shared_ptr<KernelGraph> &kernel
// Insert CLearZero op
// prepare for next step from json get atomic info
BuildKernel(kernel_graph);
device::ascend::InsertAtomicCleanOp(kernel_graph);
device::ascend::InsertAtomicCleanOps(kernel_graph);
MS_LOG(INFO) << "Finish!";
}
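The pattern applied across these hunks is the same: every AscendDeviceAddress is now created with the device name and device id read from MsContext, so the address knows which device it belongs to. A minimal sketch of that pattern, assuming the MindSpore internal headers used in this diff (the wrapper function and the ms_context include path are illustrative, not part of the commit):

// Sketch only: the construction pattern this commit applies at each call site.
#include "runtime/device/ascend/ascend_device_address.h"
#include "utils/ms_context.h"  // assumed include path for MsContext / MS_CTX_DEVICE_ID

namespace mindspore {
namespace device {
namespace ascend {
DeviceAddressPtr MakeStubDeviceAddress(const std::string &output_format, TypeId output_type) {
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
  // New overload: the address also records kAscendDevice and the device id, so
  // AscendDeviceAddress::BindDevice() can later look up its device context.
  return std::make_shared<AscendDeviceAddress>(nullptr, 0, output_format, output_type, kAscendDevice, device_id);
}
}  // namespace ascend
}  // namespace device
}  // namespace mindspore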
@@ -26,6 +26,8 @@
#include "runtime/device/memory_manager.h"
#include "runtime/device/convert_tensor_utils.h"
#include "runtime/device/ascend/ascend_launch_transdata.h"
#include "runtime/hardware/device_context_manager.h"
#include "runtime/hardware/ascend/ascend_device_context.h"
#include "ir/dtype/type.h"
#include "ir/tensor.h"
#include "abstract/utils.h"
@@ -162,6 +164,25 @@ bool SyncDeviceToHostAndFloatToFloat64(void *dst, size_t dst_size, const void *s
return true;
}

void AscendDeviceAddress::BindDevice() const {
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
if (!MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
return;
}

// Bind device by device name and device id on the current thread.
if (device_name_ != "") {
auto device_context =
device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_name_, device_id_});
auto ascend_device_context = dynamic_cast<AscendDeviceContext *>(device_context);
MS_EXCEPTION_IF_NULL(ascend_device_context);
if (!ascend_device_context->BindDeviceToCurrentThread()) {
MS_LOG(EXCEPTION) << "BindDeviceToCurrentThread failed.";
}
}
}
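The interface seen by callers does not change; the bind happens inside the existing sync entry points. A hedged caller-side sketch (the free function is illustrative only):

// SyncDeviceToHost() now calls BindDevice() first, so a MindRT worker thread that has
// never set an Ascend rtContext can still copy device memory safely.
bool CopyOutputToHost(const mindspore::device::ascend::AscendDeviceAddress &addr, void *host_buffer, size_t size) {
  return addr.SyncDeviceToHost(size, host_buffer);
}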

void AscendDeviceAddress::SyncStream() const {
MS_LOG(DEBUG) << "SyncStream Start!";
auto ms_context = MsContext::GetInstance();
@@ -183,6 +204,7 @@ void AscendDeviceAddress::SyncStream() const {

bool AscendDeviceAddress::SyncDeviceToHost(size_t size, void *const host_ptr) const {
MS_EXCEPTION_IF_NULL(host_ptr);
BindDevice();
SyncStream();
SyncMemory(host_ptr, ptr_, size, RT_MEMCPY_DEVICE_TO_HOST);
return true;
@@ -190,6 +212,7 @@ bool AscendDeviceAddress::SyncDeviceToHost(size_t size, void *const host_ptr) co

bool AscendDeviceAddress::SyncHostToDevice(size_t size, const void *host_ptr) const {
MS_EXCEPTION_IF_NULL(host_ptr);
BindDevice();
SyncMemory(ptr_, host_ptr, size, RT_MEMCPY_HOST_TO_DEVICE);
return true;
}
@@ -201,6 +224,7 @@ bool AscendDeviceAddress::SyncDeviceToHost(const ShapeVector &shape, size_t size
if (type_id_ > kMonadTypeBegin && type_id_ < kMonadTypeEnd) {
return true;
}
BindDevice();
SyncStream();
bool sync_ok = false;
std::vector<size_t> host_shape;
@@ -368,7 +392,7 @@ bool AscendDeviceAddress::SyncHostToDevice(const ShapeVector &shape, size_t size
if (type_id_ > kMonadTypeBegin && type_id_ < kMonadTypeEnd) {
return true;
}

BindDevice();
bool sync_ok = false;
std::vector<size_t> host_shape;
(void)std::transform(shape.begin(), shape.end(), std::back_inserter(host_shape), LongToSize);
@@ -416,6 +440,7 @@ bool AscendDeviceAddress::SyncDeviceToDevice(const ShapeVector &, size_t size, T
if (type_id_ > kMonadTypeBegin && type_id_ < kMonadTypeEnd) {
return true;
}
BindDevice();
bool sync_ok = false;
if (format_ == format && type_id_ == type) {
if (!DataSync(ptr_, src_ptr, size)) {
@@ -36,11 +36,14 @@ namespace ascend {
class AscendDeviceAddress : public DeviceAddress {
public:
explicit AscendDeviceAddress(void *ptr, size_t size) : DeviceAddress(ptr, size) {}
explicit AscendDeviceAddress(void *ptr, size_t size, const std::string &format, TypeId type_id)
: DeviceAddress(ptr, size, format, type_id) {}
explicit AscendDeviceAddress(void *ptr, size_t size, const std::string &device_name, uint32_t device_id)
: DeviceAddress(ptr, size, device_name, device_id) {}
explicit AscendDeviceAddress(void *ptr, size_t size, const std::string &format, TypeId type_id,
const KernelWithIndex &node_index)
: DeviceAddress(ptr, size, format, type_id, node_index) {}
const std::string &device_name, uint32_t device_id)
: DeviceAddress(ptr, size, format, type_id, device_name, device_id) {}
explicit AscendDeviceAddress(void *ptr, size_t size, const std::string &format, TypeId type_id,
const KernelWithIndex &node_index, const std::string &device_name, uint32_t device_id)
: DeviceAddress(ptr, size, format, type_id, node_index, device_name, device_id) {}
~AscendDeviceAddress() override;
bool SyncDeviceToHost(size_t size, void *const host_ptr) const override;
bool SyncHostToDevice(size_t size, const void *host_ptr) const override;
@@ -71,6 +74,7 @@ class AscendDeviceAddress : public DeviceAddress {
const std::string &ori_format,
const std::string &dst_format) const;
mutable std::shared_ptr<LaunchKernel> launch_transdata_{nullptr};
void BindDevice() const;
};
using AscendDeviceAddressPtr = std::shared_ptr<AscendDeviceAddress>;
} // namespace ascend
@@ -404,12 +404,19 @@ bool AscendKernelRuntime::KernelMemNotReuse(const AnfNodePtr &node) {

DeviceAddressPtr AscendKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
TypeId type_id) const {
return std::make_shared<AscendDeviceAddress>(device_ptr, device_size, format, type_id);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
return std::make_shared<AscendDeviceAddress>(device_ptr, device_size, format, type_id, kAscendDevice, device_id);
}

DeviceAddressPtr AscendKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
TypeId type_id, const KernelWithIndex &node_index) const {
return std::make_shared<AscendDeviceAddress>(device_ptr, device_size, format, type_id, node_index);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
return std::make_shared<AscendDeviceAddress>(device_ptr, device_size, format, type_id, node_index, kAscendDevice,
device_id);
}

bool AscendKernelRuntime::Load(const session::KernelGraph &graph, bool is_task_sink) {
@@ -71,6 +71,9 @@ class AscendKernelRuntime : public KernelRuntime {
void *compute_stream() const override { return stream_; }
void *communication_stream() const override { return communication_stream_; }
void *GetModelStream(uint32_t graph_id) const override;
// add for MindRT
void ReleaseDeviceRes() override;
void SetCurrentContext();

protected:
DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
@@ -87,10 +90,8 @@ class AscendKernelRuntime : public KernelRuntime {
static bool HcclInit();
static bool NeedDestroyHccl();
static bool DestroyHccl();
void SetCurrentContext();

void ClearGraphModelMap();
void ReleaseDeviceRes() override;
bool GraphWithEmptyTaskList(const session::KernelGraph &graph) const;
bool CheckGraphIdValid(GraphId graph_id) const;
#ifndef ENABLE_SECURITY
@@ -147,7 +147,7 @@ void AiCoreDynamicKernel::AllocateWorkspace() {

workspace_addr_.clear();
for (auto size : workspaces_size_) {
auto device_address_ptr = std::make_shared<AscendDeviceAddress>(nullptr, size);
auto device_address_ptr = std::make_shared<AscendDeviceAddress>(nullptr, size, kAscendDevice, device_id);
auto device_ptr = runtime_instance->MallocMem(MemType::kDynamicMem, size, device_address_ptr);
if (device_ptr == nullptr) {
MS_LOG(EXCEPTION) << "MallocMem from memory pool failed. Node info :" << cnode->fullname_with_scope();
@@ -338,7 +338,6 @@ void ProcessAtomicFusion(const std::vector<CNodePtr> &kernels, CleanOpsMap *clea
InsertFusionAtomicOp(first_node, fusion_clear_inputs, clean_size_list, clean_ops);
}
}
} // namespace

void InsertAtomicOps(const std::vector<CNodePtr> &kernels, CleanOpsMap *clean_ops) {
// fusion
@@ -358,9 +357,9 @@ void InsertAtomicOps(const std::vector<CNodePtr> &kernels, CleanOpsMap *clean_op
}
}

std::map<AnfNodePtr, std::vector<size_t>> GetCommunicationOpInputInfo(const std::vector<CNodePtr> &exe_orders) {
std::map<AnfNodePtr, std::vector<size_t>> GetCommunicationOpInputInfo(const std::vector<CNodePtr> &kernels) {
std::map<AnfNodePtr, std::vector<size_t>> comm_input_info_map;
for (auto &kernel : exe_orders) {
for (auto &kernel : kernels) {
MS_EXCEPTION_IF_NULL(kernel);
auto input_num = AnfAlgo::GetInputTensorNum(kernel);
if (mindspore::session::AnfRuntimeAlgorithm::IsCommunicationOp(kernel)) {
@@ -401,12 +400,12 @@ std::map<AnfNodePtr, std::vector<size_t>> GetCommunicationOpInputInfo(const std:
return comm_input_info_map;
}

void AddNeedInsertAtomicAttrForAllOps(const std::vector<CNodePtr> &exe_orders) {
if (exe_orders.empty()) {
void TagNeedInsertAtomicAttr(const std::vector<CNodePtr> &nodes) {
if (nodes.empty()) {
return;
}
std::map<AnfNodePtr, std::vector<size_t>> comm_input_info_map = GetCommunicationOpInputInfo(exe_orders);
for (const auto &anf_node : exe_orders) {
std::map<AnfNodePtr, std::vector<size_t>> comm_input_info_map = GetCommunicationOpInputInfo(nodes);
for (const auto &anf_node : nodes) {
if (comm_input_info_map.find(anf_node) != comm_input_info_map.end()) {
auto indexes = comm_input_info_map[anf_node];
if (AnfAlgo::HasNodeAttr(kAttrAtomicOutputIndexs, anf_node)) {
@@ -433,23 +432,24 @@ std::vector<CNodePtr> GatherAllAtomicOps(const CleanOpsMap &node_maps) {
}
return all_atomics;
}
} // namespace

void InsertAtomicCleanOpForMindRT(const std::vector<CNodePtr> &exe_orders, CleanOpsMap *maps) {
void InsertAtomicCleanOps(const std::vector<CNodePtr> &nodes, CleanOpsMap *maps) {
MS_EXCEPTION_IF_NULL(maps);
// assign attr
AddNeedInsertAtomicAttrForAllOps(exe_orders);
TagNeedInsertAtomicAttr(nodes);
// insert atomic
InsertAtomicOps(exe_orders, maps);
InsertAtomicOps(nodes, maps);
std::vector<CNodePtr> all_atomics = GatherAllAtomicOps(*maps);
// build atomic
KernelBuild(all_atomics);
}
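After the rename there are two public entry points: the node-list overload above for MindRT (the caller keeps the node-to-atomic map and patches the execution order itself, see AscendDeviceContext::UpdateExecOrder later in this commit) and the graph overload below for the session/task-sink path. A hedged sketch of both call forms, assuming it compiles inside the same namespace as this file; a real caller uses one or the other, not both:

// Sketch only, not part of the commit.
namespace mindspore {
namespace device {
namespace ascend {
void BuildAtomicCleanSketch(const KernelGraphPtr &kernel_graph, CleanOpsMap *node_atomics) {
  // Session / task-sink path: clean ops are inserted straight into the graph.
  InsertAtomicCleanOps(kernel_graph);
  // MindRT path: clean ops are only collected into node_atomics for the caller to place.
  InsertAtomicCleanOps(kernel_graph->execution_order(), node_atomics);
}
}  // namespace ascend
}  // namespace device
}  // namespace mindspore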

void InsertAtomicCleanOp(const KernelGraphPtr &kernel_graph) {
void InsertAtomicCleanOps(const KernelGraphPtr &kernel_graph) {
MS_EXCEPTION_IF_NULL(kernel_graph);
const auto &exe_orders = kernel_graph->execution_order();
// assign attr
AddNeedInsertAtomicAttrForAllOps(exe_orders);
TagNeedInsertAtomicAttr(exe_orders);
// insert atomic
CleanOpsMap node_to_cleans;
InsertAtomicOps(exe_orders, &node_to_cleans);
@@ -32,36 +32,15 @@ using CleanOpsMap = std::map<CNodePtr, std::vector<CNodePtr>>;
*/
bool KernelBuild(const std::vector<CNodePtr> &kernels);

/**
* @brief preprocess of kernel build for ascend, e.g. inserting clear_zero node for max_pool, bn.
* Must DO these changes just before kernel build, and after all of other optimizations on AnfGraph
*/
void InsertAtomicCleanOp(const KernelGraphPtr &kernel_graph);

/**
* @brief preprocess for mind rt
* */
void InsertAtomicCleanOpForMindRT(const std::vector<CNodePtr> &exe_orders, CleanOpsMap *maps);

/**
* @brief communication op input info.
* */
CommOpInputInfo GetCommunicationOpInputInfo(const std::vector<CNodePtr> &exe_orders);

/**
* @brief insert atomic
* */
void InsertAtomicOps(const std::vector<CNodePtr> &exe_orders, CleanOpsMap *clean_ops);
*/
void InsertAtomicCleanOps(const KernelGraphPtr &kernel_graph);

/**
* @brief gather all atomics
* @brief insert atomic for mind rt
* */
std::vector<CNodePtr> GatherAllAtomicOps(const CleanOpsMap &node_maps);

/**
* @brief add attr for op if need insert atomic
* */
void AddNeedInsertAtomicAttrForAllOps(const std::vector<CNodePtr> &exe_orders);
void InsertAtomicCleanOps(const std::vector<CNodePtr> &nodes, CleanOpsMap *maps);
} // namespace ascend
} // namespace device
} // namespace mindspore
@@ -38,6 +38,7 @@ class CPUDeviceContext;
namespace ascend {
class AscendKernelRuntime;
class AscendMemoryManager;
class AscendDeviceContext;
#ifndef ENABLE_SECURITY
class DataDumper;
#endif
@@ -71,9 +72,21 @@ class DeviceAddress : public mindspore::DeviceSync {
explicit DeviceAddress(void *ptr, size_t size, const std::string &format, TypeId type_id,
const KernelWithIndex &node_index)
: ptr_(ptr), size_(size), format_(format), type_id_(type_id), node_index_(node_index) {}
explicit DeviceAddress(void *ptr, size_t size, const std::string &format, TypeId type_id,
const std::string &device_name, uint32_t device_id)

explicit DeviceAddress(void *ptr, size_t size, const std::string &device_name, uint32_t device_id)
: ptr_(ptr), size_(size), device_name_(device_name), device_id_(device_id) {}
explicit DeviceAddress(void *ptr, size_t size, const string &format, TypeId type_id, const std::string &device_name,
uint32_t device_id)
: ptr_(ptr), size_(size), format_(format), type_id_(type_id), device_name_(device_name), device_id_(device_id) {}
explicit DeviceAddress(void *ptr, size_t size, const std::string &format, TypeId type_id,
const KernelWithIndex &node_index, const std::string &device_name, uint32_t device_id)
: ptr_(ptr),
size_(size),
format_(format),
type_id_(type_id),
node_index_(node_index),
device_name_(device_name),
device_id_(device_id) {}
virtual ~DeviceAddress() { ptr_ = nullptr; }

const void *GetPtr() const { return ptr_; }
@@ -133,6 +146,7 @@ class DeviceAddress : public mindspore::DeviceSync {
friend class mindspore::device::gpu::GPUDeviceContext;
friend class mindspore::device::ascend::AscendKernelRuntime;
friend class mindspore::device::ascend::AscendMemoryManager;
friend class mindspore::device::ascend::AscendDeviceContext;
#ifndef ENABLE_SECURITY
friend class mindspore::device::ascend::DataDumper;
#endif
@@ -989,7 +989,12 @@ void KernelAdjust::AssignLoopCtrlTensorMem(const session::KernelGraph &kernel_gr
auto format = AnfAlgo::GetOutputFormat(param, 0);
auto type_id = AnfAlgo::GetOutputDeviceDataType(param, 0);

device_address = std::make_shared<device::ascend::AscendDeviceAddress>(nullptr, size, format, type_id);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
device_address =
std::make_shared<device::ascend::AscendDeviceAddress>(nullptr, size, format, type_id, kAscendDevice, device_id);

if (runtime_instance->MallocMem(kStaticMem, size, device_address) == nullptr) {
MS_LOG(EXCEPTION) << "Cannot alloc static memory for device loop control parameter " << name
<< " , tensor size is : " << size;
@@ -119,6 +119,11 @@ class KernelRuntime {
virtual DeviceAddressPtr AssignExtraStaticMem(const TensorPtr &tensor, const AnfNodePtr &node, size_t index);
virtual void *GetModelStream(uint32_t graph_id) const { return nullptr; }

// add for MindRT
std::shared_ptr<MemoryManager> GetMemoryManager() { return mem_manager_; }
void AssignStaticMemoryOutput(const session::KernelGraph &graph);
void AssignDynamicMemory(const session::KernelGraph &graph);

protected:
virtual DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
TypeId type_id) const = 0;
@@ -128,7 +133,6 @@ class KernelRuntime {
virtual bool KernelMemNotReuse(const AnfNodePtr &node);

void AssignStaticMemory(const session::KernelGraph &graph);
void AssignDynamicMemory(const session::KernelGraph &graph);
void AssignNodeOutputMem(MemType type, const AnfNodePtr &node, int index);
void AssignWorkSpaceMem(MemType type, const AnfNodePtr &node);

@@ -154,7 +158,6 @@ class KernelRuntime {
const AnfNodePtr &kernel, bool mock);

void AssignCommunicationMem(const session::KernelGraph &graph);
void AssignStaticMemoryOutput(const session::KernelGraph &graph);
bool LaunchKernelMod(const session::KernelGraph &graph, bool mock = false);
void LaunchKernelEvent(const std::vector<std::vector<std::function<void()>>> &run_events, size_t index) const;
void DebugStreamSync(const CNodePtr &kernel);
@@ -1,6 +1,10 @@
file(GLOB_RECURSE HARDWARE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
"device_context_manager.cc")

if(ENABLE_D)
file(GLOB_RECURSE HARDWARE_D_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "ascend/*.cc")
endif()

if(ENABLE_GPU)
file(GLOB_RECURSE HARDWARE_GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "gpu/*.cc")
endif()
@ -0,0 +1,346 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "runtime/hardware/ascend/ascend_device_context.h"
|
||||
#include <algorithm>
|
||||
#include <set>
|
||||
#include "backend/optimizer/ascend/ascend_backend_optimization.h"
|
||||
#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
|
||||
#include "backend/session/ascend_auto_monad.h"
|
||||
#include "utils/context/graph_kernel_flags.h"
|
||||
#include "runtime/device/ascend/kernel_select_ascend.h"
|
||||
#include "runtime/device/kernel_adjust.h"
|
||||
#include "runtime/device/ascend/ascend_stream_assign.h"
|
||||
#include "runtime/device/ascend/kernel_build_ascend.h"
|
||||
#include "runtime/hardware/ascend/ascend_graph_optimization.h"
|
||||
|
||||
#ifndef ENABLE_SECURITY
|
||||
#include "debug/data_dump/dump_json_parser.h"
|
||||
#include "toolchain/adx_datadump_server.h"
|
||||
#include "debug/anf_ir_dump.h"
|
||||
#include "debug/dump_proto.h"
|
||||
#include "debug/data_dump/e2e_dump.h"
|
||||
#endif
|
||||
|
||||
namespace mindspore {
|
||||
namespace device {
|
||||
namespace ascend {
|
||||
using KernelGraph = mindspore::session::KernelGraph;
|
||||
|
||||
#ifndef ENABLE_SECURITY
|
||||
void DumpInit(uint32_t device_id) {
|
||||
auto &json_parser = DumpJsonParser::GetInstance();
|
||||
json_parser.Parse();
|
||||
json_parser.CopyDumpJsonToDir(device_id);
|
||||
json_parser.CopyHcclJsonToDir(device_id);
|
||||
json_parser.CopyMSCfgJsonToDir(device_id);
|
||||
if (json_parser.async_dump_enabled()) {
|
||||
if (AdxDataDumpServerInit() != 0) {
|
||||
MS_LOG(EXCEPTION) << "Adx data dump server init failed";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void DumpSetup(const KernelGraphPtr &graph) {
|
||||
MS_LOG(DEBUG) << "Start!";
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
E2eDump::DumpSetup(graph.get());
|
||||
MS_LOG(DEBUG) << "Finish!";
|
||||
}
|
||||
|
||||
void Dump(const KernelGraphPtr &graph, uint32_t rank_id) {
|
||||
MS_LOG(DEBUG) << "Start!";
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
E2eDump::DumpData(graph.get(), rank_id);
|
||||
MS_LOG(DEBUG) << "Finish!";
|
||||
}
|
||||
#endif
|
||||
|
||||
void AscendDeviceContext::Initialize() {
|
||||
MS_LOG(INFO) << "Status record: Enter Initialize...";
|
||||
if (initialized_) {
|
||||
MS_EXCEPTION_IF_NULL(runtime_instance_);
|
||||
runtime_instance_->SetCurrentContext();
|
||||
return;
|
||||
}
|
||||
|
||||
MS_LOG(INFO) << "Status record: Initialize start...";
|
||||
auto ms_context = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(ms_context);
|
||||
auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
|
||||
runtime_instance_ = dynamic_cast<AscendKernelRuntime *>(
|
||||
device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id));
|
||||
MS_EXCEPTION_IF_NULL(runtime_instance_);
|
||||
if (!runtime_instance_->Init()) {
|
||||
MS_LOG(EXCEPTION) << "Kernel runtime init error.";
|
||||
}
|
||||
mem_manager_ = runtime_instance_->GetMemoryManager();
|
||||
MS_EXCEPTION_IF_NULL(mem_manager_);
|
||||
|
||||
auto env_rank_id = common::GetEnv("RANK_ID");
|
||||
if (ms_context->get_param<bool>(MS_CTX_ENABLE_HCCL) && !env_rank_id.empty()) {
|
||||
// get actual rank id if it's distribution training case.
|
||||
rank_id_ = GetRankId();
|
||||
}
|
||||
#ifndef ENABLE_SECURITY
|
||||
DumpInit(rank_id_);
|
||||
#endif
|
||||
initialized_ = true;
|
||||
MS_LOG(INFO) << "Status record: Initialize success.";
|
||||
}
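Callers are not meant to construct the context directly; it is fetched through DeviceContextManager with the same key that MS_REGISTER_DEVICE(kAscendDevice, AscendDeviceContext) registers at the bottom of this file. A minimal sketch, assuming the include shown in this diff (the wrapper name is illustrative):

// Sketch only: fetch the registered Ascend backend and make sure it is initialized.
#include "runtime/hardware/device_context_manager.h"

namespace mindspore {
namespace device {
namespace ascend {
void EnsureAscendContextInitialized(uint32_t device_id) {
  auto device_context = DeviceContextManager::GetInstance().GetOrCreateDeviceContext({kAscendDevice, device_id});
  MS_EXCEPTION_IF_NULL(device_context);
  device_context->Initialize();  // re-entry is cheap: it only restores the current rtContext
}
}  // namespace ascend
}  // namespace device
}  // namespace mindspore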
|
||||
|
||||
void AscendDeviceContext::Destroy() {
|
||||
MS_LOG(INFO) << "Status record: Enter Destroy...";
|
||||
if (!initialized_) {
|
||||
return;
|
||||
}
|
||||
MS_LOG(INFO) << "Status record: Destroy start...";
|
||||
rank_id_ = 0;
|
||||
if (runtime_instance_ != nullptr) {
|
||||
runtime_instance_->ReleaseDeviceRes();
|
||||
runtime_instance_ = nullptr;
|
||||
}
|
||||
initialized_ = false;
|
||||
MS_LOG(INFO) << "Status record: Destroy success.";
|
||||
}
|
||||
|
||||
std::vector<GraphSegmentPtr> AscendDeviceContext::PartitionGraph(
|
||||
const FuncGraphPtr &func_graph, const std::vector<GraphSegmentPtr> &default_partition_segments) {
|
||||
return std::vector<GraphSegmentPtr>();
|
||||
}
|
||||
|
||||
void AscendDeviceContext::UnifyMindIR(const KernelGraphPtr &graph) const {
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
AscendGraphOptimization::GetInstance().UnifyMindIR(graph);
|
||||
}
|
||||
|
||||
void AscendDeviceContext::OptimizeGraph(const KernelGraphPtr &graph) const {
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
AscendGraphOptimization::GetInstance().OptimizeGraph(graph);
|
||||
}
|
||||
|
||||
void AscendDeviceContext::SetOperatorInfo(const std::vector<CNodePtr> &nodes) const {
|
||||
AscendGraphOptimization::GetInstance().SetOperatorInfo(nodes);
|
||||
}
|
||||
|
||||
void AscendDeviceContext::CreateKernel(const std::vector<CNodePtr> &nodes) const {
|
||||
MS_LOG(INFO) << "CreateKernel Start...";
|
||||
struct timeval start_time, end_time;
|
||||
(void)gettimeofday(&start_time, nullptr);
|
||||
auto ret = device::ascend::KernelBuild(nodes);
|
||||
if (!ret) {
|
||||
MS_LOG(EXCEPTION) << "Kernel build error.";
|
||||
}
|
||||
(void)gettimeofday(&end_time, nullptr);
|
||||
const uint64_t kUSecondInSecond = 1000000;
|
||||
uint64_t cost = kUSecondInSecond * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec);
|
||||
cost += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec);
|
||||
MS_LOG(INFO) << "CreateKernel finish run in " << PRIu64 << " us " << cost;
|
||||
}
|
||||
|
||||
void AscendDeviceContext::UpdateExecOrder(const KernelGraphPtr &graph) const {
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
std::vector<CNodePtr> new_orders;
|
||||
auto nodes = graph->execution_order();
|
||||
for (const auto &node : nodes) {
|
||||
if (node_atomics_.find(node) != node_atomics_.end()) {
|
||||
auto atomics = node_atomics_[node];
|
||||
(void)std::copy(atomics.begin(), atomics.end(), std::back_inserter(new_orders));
|
||||
}
|
||||
new_orders.push_back(node);
|
||||
}
|
||||
graph->set_execution_order(new_orders);
|
||||
node_atomics_.clear();
|
||||
}
|
||||
|
||||
void AscendDeviceContext::PreprocessBeforeRunGraph(const KernelGraphPtr &graph) const {
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
MS_LOG(INFO) << "PreprocessBeforeRunGraph Start for graph " << graph->graph_id();
|
||||
device::ascend::InsertAtomicCleanOps(graph->execution_order(), &node_atomics_);
|
||||
if (graph->is_executing_sink()) {
|
||||
UpdateExecOrder(graph);
|
||||
device::KernelAdjust::GetInstance().InsertDeviceLoopCtrl(graph);
|
||||
device::KernelAdjust::GetInstance().ProcessLoopSink(graph);
|
||||
AscendStreamAssign::GetInstance().AssignStream(NOT_NULL(graph));
|
||||
CreateKernel(graph->execution_order());
|
||||
AllocateGraphMemory(NOT_NULL(graph));
|
||||
LoadModel(NOT_NULL(graph));
|
||||
MS_LOG(INFO) << "PreprocessBeforeRunGraph success.";
|
||||
return;
|
||||
}
|
||||
MS_LOG(INFO) << "PreprocessBeforeRunGraph success.";
|
||||
}
|
||||
|
||||
void AscendDeviceContext::AllocateGraphMemory(const NotNull<KernelGraphPtr> &root_graph) const {
|
||||
MS_EXCEPTION_IF_NULL(runtime_instance_);
|
||||
runtime_instance_->ClearGlobalIdleMem();
|
||||
memo_.clear();
|
||||
AssignInputMemory(root_graph, NOT_NULL(&memo_));
|
||||
device::KernelAdjust::GetInstance().AssignLoopCtrlMemory(*root_graph.get());
|
||||
runtime_instance_->AssignStaticMemoryOutput(*root_graph.get());
|
||||
mem_manager_->ResetDynamicMemory();
|
||||
runtime_instance_->AssignDynamicMemory(*root_graph.get());
|
||||
runtime_instance_->UpdateRefNodeOutputMem(*root_graph.get());
|
||||
}
|
||||
|
||||
void AscendDeviceContext::AssignInputMemory(const NotNull<KernelGraphPtr> &graph,
|
||||
NotNull<std::set<KernelGraphPtr> *> const memo) const {
|
||||
if (memo->find(graph) != memo->end()) {
|
||||
return;
|
||||
}
|
||||
memo->insert(graph.get());
|
||||
|
||||
MS_LOG(INFO) << "Start to assign static memory for Parameter and Value node in graph: " << graph->graph_id();
|
||||
runtime_instance_->AssignStaticMemoryInput(*graph.get());
|
||||
runtime_instance_->AssignStaticMemoryValueNode(*graph.get());
|
||||
for (auto &child_graph : graph->child_graph_order()) {
|
||||
AssignInputMemory(NOT_NULL(child_graph.lock()), memo);
|
||||
}
|
||||
MS_LOG(INFO) << "Finish assigning static memory for Parameter and Value node in graph: " << graph->graph_id();
|
||||
}
|
||||
|
||||
void AscendDeviceContext::LoadModel(const NotNull<KernelGraphPtr> &root_graph) const {
|
||||
MS_LOG(INFO) << "Start LoadModel for graph " << root_graph->graph_id();
|
||||
MS_EXCEPTION_IF_NULL(runtime_instance_);
|
||||
bool ret_ok = runtime_instance_->Load(*root_graph.get(), true);
|
||||
if (!ret_ok) {
|
||||
MS_LOG(EXCEPTION) << "Load task error!";
|
||||
}
|
||||
MS_LOG(INFO) << "Finish!";
|
||||
}
|
||||
|
||||
bool AscendDeviceContext::AllocateMemory(DeviceAddress *const &address, size_t size) const {
|
||||
MS_EXCEPTION_IF_NULL(address);
|
||||
MS_EXCEPTION_IF_NULL(runtime_instance_);
|
||||
runtime_instance_->SetCurrentContext();
|
||||
auto device_ptr = mem_manager_->MallocMemFromMemPool(size);
|
||||
if (!device_ptr) {
|
||||
return false;
|
||||
}
|
||||
address->ptr_ = device_ptr;
|
||||
address->size_ = size;
|
||||
address->from_mem_pool_ = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
void AscendDeviceContext::FreeMemory(DeviceAddress *const &address) const {
|
||||
MS_EXCEPTION_IF_NULL(address);
|
||||
MS_EXCEPTION_IF_NULL(address->ptr_);
|
||||
if (!address->from_mem_pool()) {
|
||||
return;
|
||||
}
|
||||
mem_manager_->FreeMemFromMemPool(address->ptr_);
|
||||
address->ptr_ = nullptr;
|
||||
}
|
||||
|
||||
bool AscendDeviceContext::AllocateContinuousMemory(const std::vector<DeviceAddressPtr> &addr_list, size_t total_size,
|
||||
const std::vector<size_t> &size_list) const {
|
||||
MS_EXCEPTION_IF_NULL(runtime_instance_);
|
||||
runtime_instance_->SetCurrentContext();
|
||||
return mem_manager_->MallocContinuousMemFromMemPool(addr_list, total_size, size_list);
|
||||
}
|
||||
|
||||
bool AscendDeviceContext::ExecuteGraph(const KernelGraphPtr &graph) const {
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
const uint64_t kUSecondInSecond = 1000000;
|
||||
bool ret = false;
|
||||
if (graph->is_executing_sink()) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
auto start_time = std::chrono::steady_clock::now();
|
||||
#else
|
||||
struct timeval start_time {};
|
||||
struct timeval end_time {};
|
||||
(void)gettimeofday(&start_time, nullptr);
|
||||
#endif
|
||||
MS_EXCEPTION_IF_NULL(runtime_instance_);
|
||||
#ifndef ENABLE_SECURITY
|
||||
DumpSetup(graph);
|
||||
#endif
|
||||
{
|
||||
std::lock_guard<std::mutex> locker(launch_mutex_);
|
||||
ret = runtime_instance_->RunTask(*graph);
|
||||
}
|
||||
#ifndef ENABLE_SECURITY
|
||||
Dump(graph, GetRankID());
|
||||
#endif
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
auto end_time = std::chrono::steady_clock::now();
|
||||
std::chrono::duration<double, std::ratio<1, kUSecondInSecond>> cost = end_time - start_time;
|
||||
MS_LOG(INFO) << "Call MS Run Success in " << cost.count() << " us";
|
||||
#else
|
||||
(void)gettimeofday(&end_time, nullptr);
|
||||
uint64_t cost = kUSecondInSecond * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec);
|
||||
cost += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec);
|
||||
MS_LOG(INFO) << "Call MS Run Success in " << cost << " us";
|
||||
#endif
|
||||
} else {
|
||||
MS_LOG(EXCEPTION) << graph->ToString() << " does not sink, should launch kernels";
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool AscendDeviceContext::LaunchGraph(const KernelGraphPtr &graph) const {
|
||||
MS_LOG(INFO) << "Status record: start launch graph. graph id: " << graph->graph_id();
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
MS_EXCEPTION_IF_NULL(runtime_instance_);
|
||||
runtime_instance_->SetCurrentContext();
|
||||
device::KernelAdjust::GetInstance().LoadDeviceLoopCtrlParameters(graph);
|
||||
auto ret = ExecuteGraph(graph);
|
||||
MS_LOG(INFO) << "Status record: end launch graph. graph id: " << graph->graph_id();
|
||||
return ret;
|
||||
}
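Per-step execution is just LaunchGraph on a graph that has already been through PreprocessBeforeRunGraph (tasks loaded, loop-control parameters assigned). A hedged sketch of a run step; the wrapper and the trailing SyncStream call are illustrative, not part of the commit:

// Illustrative wrapper: launch one step of an already-loaded, sink-mode graph.
namespace mindspore {
namespace device {
namespace ascend {
bool RunOneStepSketch(const AscendDeviceContext &context, const KernelGraphPtr &graph) {
  if (!context.LaunchGraph(graph)) {
    MS_LOG(ERROR) << "Launch graph " << graph->graph_id() << " failed.";
    return false;
  }
  return context.SyncStream();
}
}  // namespace ascend
}  // namespace device
}  // namespace mindspore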
|
||||
|
||||
bool AscendDeviceContext::SyncStream(size_t stream_id) const {
|
||||
MS_EXCEPTION_IF_NULL(runtime_instance_);
|
||||
return runtime_instance_->SyncStream();
|
||||
}
|
||||
bool AscendDeviceContext::IsExecutingSink(const KernelGraphPtr &graph) const { return true; }
|
||||
bool AscendDeviceContext::IsLoopCountSink(const KernelGraphPtr &graph) const { return true; }
|
||||
|
||||
// kernel by kernel mode interface
|
||||
void AscendDeviceContext::OptimizeSingleOpGraph(const KernelGraphPtr &graph) const {
|
||||
MS_LOG(ERROR) << "!!! Ascend with MindRT not support kernel by kernel mode. !!! ";
|
||||
}
|
||||
|
||||
void AscendDeviceContext::PreprocessBeforeRunSingleOpGraph(const KernelGraphPtr &graph) const {
|
||||
MS_LOG(ERROR) << "!!! Ascend with MindRT not support kernel by kernel mode. !!! ";
|
||||
}
|
||||
|
||||
void AscendDeviceContext::UpdateDynamicShape(const CNodePtr &kernel) const {
|
||||
MS_LOG(ERROR) << "!!! Ascend with MindRT not support function UpdateDynamicShape. !!! ";
|
||||
}
|
||||
|
||||
std::shared_ptr<Bucket> AscendDeviceContext::CreateBucket(uint32_t bucket_id, uint32_t bucket_size) const {
|
||||
MS_LOG(ERROR) << "!!! Ascend with MindRT not support function CreateBucket. !!! ";
|
||||
return DeviceContext::CreateBucket(bucket_id, bucket_size);
|
||||
}
|
||||
|
||||
bool AscendDeviceContext::LaunchKernel(const CNodePtr &kernel, const vector<AddressPtr> &inputs,
|
||||
const vector<AddressPtr> &workspace, const vector<AddressPtr> &outputs,
|
||||
bool is_dynamic_shape) const {
|
||||
MS_LOG(ERROR) << "!!! Ascend with MindRT not support kernel by kernel mode. !!! ";
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AscendDeviceContext::BindDeviceToCurrentThread() const {
|
||||
runtime_instance_->SetCurrentContext();
|
||||
return true;
|
||||
}
|
||||
|
||||
MS_REGISTER_DEVICE(kAscendDevice, AscendDeviceContext);
|
||||
} // namespace ascend
|
||||
} // namespace device
|
||||
} // namespace mindspore
|
|
@@ -0,0 +1,158 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_ASCEND_DEVICE_CONTEXT_H_
#define MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_ASCEND_DEVICE_CONTEXT_H_

#include <vector>
#include <memory>
#include <string>
#include <set>
#include <map>
#include "runtime/hardware/device_context.h"
#include "runtime/hardware/device_context_manager.h"
#include "runtime/device/memory_manager.h"
#include "runtime/device/ascend/ascend_kernel_runtime.h"
#include "runtime/device/ascend/ascend_device_address.h"

namespace mindspore {
namespace device {
namespace ascend {
class AscendDeviceContext : public DeviceContext {
 public:
  explicit AscendDeviceContext(const DeviceContextKey &device_context_key)
      : DeviceContext(device_context_key), mem_manager_(nullptr), initialized_(false) {}
  ~AscendDeviceContext() override = default;

  // Initialize the device context.
  void Initialize() override;

  // Destroy device context and release device resource.
  void Destroy() override;

  // Get rank id for distributed training.
  uint32_t GetRankID() const override { return rank_id_; }

  // Partition the function graph through the device capability and return the partition segments.
  // The second parameter is the default partition segments which are provided by the framework.
  // Device can reprocess the default partition segments to new segments, also can partition the function graph again.
  // If Device can launch the whole graph and not expect partitioning the function graph, then return the empty
  // segments. The default behavior is return the default partition segments.
  std::vector<GraphSegmentPtr> PartitionGraph(const FuncGraphPtr &func_graph,
                                              const std::vector<GraphSegmentPtr> &default_partition_segments) override;

  // Optimize the kernel graph for graph mode.
  void OptimizeGraph(const KernelGraphPtr &graph) const override;

  // Optimize the single operator graph for PyNative mode.
  void OptimizeSingleOpGraph(const KernelGraphPtr &graph) const override;

  // Select the matching backend kernels according to the data type and format of input and output for all
  // execution operators, and set final device data type and format information for backend kernels, device
  // data type and format which replace original data type and format will use for executing kernels.
  void SetOperatorInfo(const std::vector<CNodePtr> &nodes) const override;

  // Generate 'KernelMod' for all kernels and set 'KernelMod' into kernel,
  // 'KernelMod' is real executive object of kernel.
  void CreateKernel(const std::vector<CNodePtr> &nodes) const override;

  // Adjust kernel graph before run graph, used in Graph Mode.
  void PreprocessBeforeRunGraph(const KernelGraphPtr &graph) const override;
  // Adjust single op kernel graph before run graph, used in PyNative Mode.
  void PreprocessBeforeRunSingleOpGraph(const KernelGraphPtr &graph) const override;

  // Infer kernel shape and update abstract info for dynamic shape kernel.
  void UpdateDynamicShape(const CNodePtr &kernel) const override;

  // Relevant function to allocate and free device memory.
  bool AllocateMemory(DeviceAddress *const &address, size_t size) const override;
  void FreeMemory(DeviceAddress *const &address) const override;

  // Allocate continuous device memory end to end into 'addr_list'.
  // Communication operators may need continuous memory for input and output
  // to optimize the communication performance.
  bool AllocateContinuousMemory(const std::vector<DeviceAddressPtr> &addr_list, size_t total_size,
                                const std::vector<size_t> &size_list) const override;

  // Create concrete device address according different device type.
  DeviceAddressPtr CreateDeviceAddress(void *const device_ptr, size_t device_size, const string &format,
                                       TypeId type_id) const override {
    return std::make_shared<AscendDeviceAddress>(device_ptr, device_size, format, type_id,
                                                 device_context_key_.device_name_, device_context_key_.device_id_);
  }

  // Get device address type according different device type, such GPU, Ascend.
  DeviceAddressType GetDeviceAddressType() const override { return DeviceAddressType::kAscend; }

  // Launch graph, device such as Ascend support the whole graph sink to the device executing.
  bool LaunchGraph(const KernelGraphPtr &graph) const override;

  // Launch a kernel via 'KernelMod' of the kernel.
  bool LaunchKernel(const CNodePtr &kernel, const std::vector<AddressPtr> &inputs,
                    const std::vector<AddressPtr> &workspace, const std::vector<AddressPtr> &outputs,
                    bool is_dynamic_shape = false) const override;

  // Synchronize stream, device such as GPU and Ascend need stream to launch kernel asynchronously,
  // using 'SyncStream' to block thread and wait for completing all tasks in stream.
  // Devices that do not need stream could ignore the implementation of this function.
  bool SyncStream(size_t stream_id = 0) const override;

  // Create and initialize bucket for every allreduce operator. Bucket is used in PyNative distributed training mode,
  // one bucket handles all resource to launch and sync allreduce operator.
  std::shared_ptr<Bucket> CreateBucket(uint32_t bucket_id, uint32_t bucket_size) const override;

  // Unify the MindIR, the default behavior uses the common unified MindIR.
  void UnifyMindIR(const KernelGraphPtr &graph) const override;

  // Whether the graph sink executing through the device capability, the default behavior is not sink and return false.
  bool IsExecutingSink(const KernelGraphPtr &graph) const override;
  // Whether the graph loop sink executing through the device capability, the default behavior is not loop sink and
  // return false.
  bool IsLoopCountSink(const KernelGraphPtr &graph) const override;

  // set rt_context_ to this thread to control device
  bool BindDeviceToCurrentThread() const;

 private:
  // Graph loader interface
  void AllocateGraphMemory(const NotNull<KernelGraphPtr> &root_graph) const;
  void AssignInputMemory(const NotNull<KernelGraphPtr> &graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
  void LoadModel(const NotNull<KernelGraphPtr> &root_graph) const;
  void UpdateExecOrder(const KernelGraphPtr &graph) const;

  // Kernel Runtime --- only for task sink
  AscendKernelRuntime *runtime_instance_{nullptr};
  std::shared_ptr<MemoryManager> mem_manager_{nullptr};
  // rank id of physical device
  uint32_t rank_id_{0};
  bool initialized_{false};

  // LaunchGraph interface
  bool ExecuteGraph(const KernelGraphPtr &graph) const;
  // The ExecuteGraph is not thread safety specifically, it is not recommended that multiple threads access the same
  // func at the same time, so need the launch mutex when multiple threads launch the graph.
  mutable std::mutex launch_mutex_;
  // The graphs has been traversed when the graph id traversed recursively.
  // Note: Please clean the set before each use.
  mutable std::set<KernelGraphPtr> memo_;
  // Using node to get it's atomics
  mutable std::map<CNodePtr, std::vector<CNodePtr>> node_atomics_;
};
} // namespace ascend
} // namespace device
} // namespace mindspore

#endif // MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_ASCEND_DEVICE_CONTEXT_H_
@ -0,0 +1,288 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "runtime/hardware/ascend/ascend_graph_optimization.h"
|
||||
#include <set>
|
||||
#include "backend/optimizer/common/common_backend_optimization.h"
|
||||
#include "backend/optimizer/ascend/ascend_backend_optimization.h"
|
||||
#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
|
||||
#include "backend/session/ascend_auto_monad.h"
|
||||
#include "utils/context/graph_kernel_flags.h"
|
||||
#include "runtime/device/ascend/kernel_select_ascend.h"
|
||||
#include "runtime/device/kernel_adjust.h"
|
||||
|
||||
#ifndef ENABLE_SECURITY
|
||||
#include "debug/anf_ir_dump.h"
|
||||
#include "debug/dump_proto.h"
|
||||
#endif
|
||||
|
||||
namespace mindspore {
|
||||
namespace device {
|
||||
namespace ascend {
|
||||
using AscendAutoMonad = mindspore::session::AscendAutoMonad;
|
||||
|
||||
void AscendGraphOptimization::OptimizeGraph(const KernelGraphPtr &graph) {
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
MS_LOG(INFO) << "Status record: start optimize graph. graph id: " << graph->graph_id();
|
||||
|
||||
// empty graph dont entry to backend
|
||||
if (graph->execution_order().empty()) {
|
||||
MS_LOG(INFO) << graph->ToString() << " is empty graph.";
|
||||
AnfAlgo::InsertMakeTupleForOutput(NOT_NULL(graph));
|
||||
graph->set_executable(false);
|
||||
MS_LOG(INFO) << "Status record: end optimize graph. graph id: " << graph->graph_id();
|
||||
}
|
||||
|
||||
OptimizeGraphWithoutDeviceInfo(graph);
|
||||
SelectKernel(graph);
|
||||
OptimizeGraphWithDeviceInfo(graph);
|
||||
OptimizeExecutionOrder(graph);
|
||||
PostOptimization(graph);
|
||||
MS_LOG(INFO) << "Status record: end optimize graph. graph id: " << graph->graph_id();
|
||||
}
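This singleton is driven from AscendDeviceContext (the UnifyMindIR and OptimizeGraph overrides earlier in this commit); IR fusion, kernel selection, hardware optimization and execution-order adjustment are all folded into OptimizeGraph rather than being separate calls. A hedged sketch of that call order (wrapper name illustrative):

// Sketch only: the order in which the device context drives this class.
namespace mindspore {
namespace device {
namespace ascend {
void CompileGraphSketch(const KernelGraphPtr &graph) {
  auto &graph_opt = AscendGraphOptimization::GetInstance();
  graph_opt.UnifyMindIR(graph);    // common + Ascend-specific MindIR unification
  graph_opt.OptimizeGraph(graph);  // IR fusion, kernel selection, hardware opt, exec order
}
}  // namespace ascend
}  // namespace device
}  // namespace mindspore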
|
||||
|
||||
void AscendGraphOptimization::OptimizeGraphWithoutDeviceInfo(const KernelGraphPtr &graph) {
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
HandleControlFlow(NOT_NULL(graph));
|
||||
|
||||
// add all graphs to manager first, so that don't have to make new manager in following passes.
|
||||
auto manager = Manage(graph, true);
|
||||
memo_.clear();
|
||||
AddGraphToManager(NOT_NULL(graph), NOT_NULL(manager));
|
||||
|
||||
memo_.clear();
|
||||
IRFusionOptimization(graph);
|
||||
}
|
||||
|
||||
void AscendGraphOptimization::OptimizeGraphWithDeviceInfo(const KernelGraphPtr &graph) {
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
memo_.clear();
|
||||
HardWareOptimization(graph);
|
||||
AnfAlgo::InsertMakeTupleForOutput(NOT_NULL(graph));
|
||||
}
|
||||
|
||||
void AscendGraphOptimization::OptimizeExecutionOrder(const KernelGraphPtr &graph) {
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
MS_LOG(INFO) << "Status record: start optimize execution order. graph id: " << graph->graph_id();
|
||||
// root root_graph validate,include generate execute order and so on
|
||||
RootGraphExecutorValidate(NOT_NULL(graph));
|
||||
|
||||
#ifdef ENABLE_DUMP_IR
|
||||
auto context_ptr = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(context_ptr);
|
||||
bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
|
||||
if (save_graphs) {
|
||||
DumpIRProto(graph, "before_removeNop_" + std::to_string(graph->graph_id()));
|
||||
}
|
||||
#endif
|
||||
|
||||
opt::HideNopNode(graph.get());
|
||||
|
||||
auto execution_order = graph->execution_order();
|
||||
AnfAlgo::ReorderExecList(NOT_NULL(&execution_order));
|
||||
graph->set_execution_order(execution_order);
|
||||
|
||||
#ifndef ENABLE_SECURITY
|
||||
// insert profiling point
|
||||
device::KernelAdjust::GetInstance().Profiling(NOT_NULL(graph.get()));
|
||||
#endif
|
||||
|
||||
device::KernelAdjust::GetInstance().InsertOverflowCheckOperations(NOT_NULL(graph));
|
||||
|
||||
#ifdef ENABLE_DUMP_IR
|
||||
if (save_graphs) {
|
||||
DumpIR("after_adjust_kernel.ir", graph);
|
||||
}
|
||||
#endif
|
||||
MS_LOG(INFO) << "Status record: end optimize execution order. graph id: " << graph->graph_id();
|
||||
}
|
||||
|
||||
void AscendGraphOptimization::PostOptimization(const KernelGraphPtr &graph) {
|
||||
MS_LOG(INFO) << "Status record: start post optimization. graph id: " << graph->graph_id();
|
||||
// copy child graph ref output map to father graph ref output map
|
||||
memo_.clear();
|
||||
UpdateRefOutputMap(graph);
|
||||
graph->SetInputNodes();
|
||||
graph->SetOptimizerFlag();
|
||||
MS_LOG(INFO) << "Status record: end post optimization. graph id: " << graph->graph_id();
|
||||
}
|
||||
|
||||
void AscendGraphOptimization::HardWareOptimization(const KernelGraphPtr &graph) {
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
MS_LOG(INFO) << "Status record: start hardware optimize. graph id: " << graph->graph_id();
|
||||
if (memo_.find(graph) != memo_.end()) {
|
||||
return;
|
||||
}
|
||||
memo_.insert(graph);
|
||||
opt::AscendBackendOptimization(graph);
|
||||
opt::CommonFinalOptimization(graph);
|
||||
if (graphkernel::GraphKernelFlags::GetInstance().IsEnableGraphKernel()) {
|
||||
graphkernel::GraphKernelOptimize(graph);
|
||||
graph->SetExecOrderByDefault();
|
||||
}
|
||||
MS_LOG(INFO) << "Status record: end hardware optimize. graph id: " << graph->graph_id();
|
||||
|
||||
for (auto &child_graph : graph->child_graph_order()) {
|
||||
HardWareOptimization(child_graph.lock());
|
||||
}
|
||||
}
|
||||
|
||||
void AscendGraphOptimization::AddGraphToManager(const NotNull<KernelGraphPtr> graph,
|
||||
NotNull<FuncGraphManagerPtr> manager) {
|
||||
if (memo_.find(graph) != memo_.end()) {
|
||||
return;
|
||||
}
|
||||
memo_.insert(graph.get());
|
||||
manager->AddFuncGraph(graph.get(), false);
|
||||
|
||||
for (auto &child_graph : graph->child_graph_order()) {
|
||||
AddGraphToManager(NOT_NULL(child_graph.lock()), manager);
|
||||
}
|
||||
}
|
||||
|
||||
void AscendGraphOptimization::IRFusionOptimization(const KernelGraphPtr &graph) {
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
if (memo_.find(graph) != memo_.end()) {
|
||||
return;
|
||||
}
|
||||
memo_.insert(graph);
|
||||
|
||||
opt::AscendBackendIRFusionOptimization(graph);
|
||||
|
||||
#ifdef ENABLE_DUMP_IR
|
||||
auto context_ptr = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(context_ptr);
|
||||
bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
|
||||
if (save_graphs) {
|
||||
std::string file_name = "select_kernel_before_graph_" + std::to_string(graph->graph_id()) + ".ir";
|
||||
DumpIR(file_name, graph);
|
||||
}
|
||||
#endif
|
||||
|
||||
for (auto &child_graph : graph->child_graph_order()) {
|
||||
IRFusionOptimization(NOT_NULL(child_graph.lock()));
|
||||
}
|
||||
}
|
||||
|
||||
void AscendGraphOptimization::HandleControlFlow(const NotNull<KernelGraphPtr> graph) {
|
||||
MS_LOG(INFO) << "Status record: start handle control flow. graph id: " << graph->graph_id();
|
||||
AscendAutoMonad auto_monad(graph);
|
||||
auto_monad.Run();
|
||||
MS_LOG(INFO) << "Status record: end handle control flow. graph id: " << graph->graph_id();
|
||||
}
|
||||
|
||||
void AscendGraphOptimization::RootGraphExecutorValidate(NotNull<KernelGraphPtr> graph) {
|
||||
MS_LOG(INFO) << "Status record: start graph executor validate. graph id: " << graph->graph_id();
|
||||
AscendAutoMonad auto_monad(graph);
|
||||
auto_monad.GenerateExecuteOrder();
|
||||
MS_LOG(INFO) << "Status record: end graph executor validate. graph id: " << graph->graph_id();
|
||||
}
|
||||
|
||||
void AscendGraphOptimization::RecurseSelectKernelInfo(const KernelGraphPtr &graph) {
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
if (memo_.find(graph) != memo_.end()) {
|
||||
return;
|
||||
}
|
||||
memo_.insert(graph);
|
||||
MS_LOG(INFO) << "Status record: start select kernel info. graph id: " << graph->graph_id();
|
||||
SetOperatorInfo(graph->execution_order());
|
||||
MS_LOG(INFO) << "Status record: end select kernel info. graph id: " << graph->graph_id();
|
||||
|
||||
#ifdef ENABLE_DUMP_IR
|
||||
auto context_ptr = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(context_ptr);
|
||||
bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
|
||||
if (save_graphs) {
|
||||
std::string file_name = "select_kernel_after_graph_" + std::to_string(graph->graph_id()) + ".ir";
|
||||
DumpIR(file_name, graph);
|
||||
}
|
||||
#endif
|
||||
|
||||
for (auto &child_graph : graph->child_graph_order()) {
|
||||
RecurseSelectKernelInfo(child_graph.lock());
|
||||
}
|
||||
}
|
||||
|
||||
void AscendGraphOptimization::SelectKernel(const KernelGraphPtr &graph) {
|
||||
MS_LOG(INFO) << "Status record: start select kernel info. graph id: " << graph->graph_id();
|
||||
raise_precision_count_ = 0;
|
||||
reduce_precision_count_ = 0;
|
||||
memo_.clear();
|
||||
RecurseSelectKernelInfo(graph);
|
||||
auto ms_context = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(ms_context);
|
||||
if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kGraphMode) {
|
||||
if (raise_precision_count_ > 0) {
|
||||
MS_LOG(WARNING) << "There are " << raise_precision_count_
|
||||
<< " node/nodes used raise precision to selected the kernel!";
|
||||
}
|
||||
if (reduce_precision_count_ > 0) {
|
||||
MS_LOG(WARNING) << "There are " << reduce_precision_count_
|
||||
<< " node/nodes used reduce precision to selected the kernel!";
|
||||
}
|
||||
}
|
||||
MS_LOG(INFO) << "Status record: end select kernel info. graph id: " << graph->graph_id();
|
||||
}
|
||||
|
||||
void AscendGraphOptimization::UpdateRefOutputMap(const KernelGraphPtr &graph) {
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
if (memo_.find(graph) != memo_.end()) {
|
||||
return;
|
||||
}
|
||||
memo_.insert(graph);
|
||||
|
||||
for (auto &child_graph : graph->child_graph_order()) {
|
||||
auto child_graph_ptr = child_graph.lock();
|
||||
MS_EXCEPTION_IF_NULL(child_graph_ptr);
|
||||
UpdateRefOutputMap(NOT_NULL(child_graph_ptr));
|
||||
// copy ref map to final graph
|
||||
auto child_ref_map = child_graph_ptr->GetRefMap();
|
||||
for (auto &item : child_ref_map) {
|
||||
if (graph->IsInRefOutputMap(item.first)) {
|
||||
MS_LOG(WARNING) << "The ref pair <" << item.first.first->DebugString() << ", " << item.first.second
|
||||
<< "> is already in " << graph->ToString();
|
||||
continue;
|
||||
}
|
||||
graph->AddRefCorrespondPairs(item.first, item.second);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void AscendGraphOptimization::UnifyMindIR(const KernelGraphPtr &graph) {
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
MS_LOG(INFO) << "Status record: start unify mindir. graph id: " << graph->graph_id();
|
||||
opt::CommonUnifyMindIR(graph);
|
||||
opt::AscendUnifyMindIR(graph);
|
||||
MS_LOG(INFO) << "Status record: end unify mindir. graph id: " << graph->graph_id();
|
||||
}
|
||||
|
||||
void AscendGraphOptimization::SetOperatorInfo(const std::vector<CNodePtr> &nodes) {
|
||||
for (const auto &node : nodes) {
|
||||
auto status = device::ascend::SelectKernelInfo(node);
|
||||
AnfAlgo::EraseNodeAttr(kAttrPynativeNextOpName, node);
|
||||
AnfAlgo::EraseNodeAttr(kAttrPynativeNextIndex, node);
|
||||
if (status == device::ascend::kStatusRaisePrecision) {
|
||||
raise_precision_count_++;
|
||||
} else if (status == device::ascend::kStatusReducePrecision) {
|
||||
reduce_precision_count_++;
|
||||
}
|
||||
MS_LOG(DEBUG) << "Select ApplyKernel: " << node->DebugString();
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace ascend
|
||||
} // namespace device
|
||||
} // namespace mindspore
|
|
@@ -0,0 +1,77 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_MINDSPORE_ASCEND_GRAPH_OPTIMIZATION_H
#define MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_MINDSPORE_ASCEND_GRAPH_OPTIMIZATION_H

#include <vector>
#include <memory>
#include <string>
#include <set>
#include <map>
#include "runtime/hardware/device_context.h"
#include "runtime/hardware/device_context_manager.h"
#include "runtime/device/memory_manager.h"
#include "runtime/device/ascend/ascend_kernel_runtime.h"
#include "runtime/device/ascend/ascend_device_address.h"

namespace mindspore {
namespace device {
namespace ascend {
class AscendGraphOptimization {
 public:
  static AscendGraphOptimization &GetInstance() {
    static AscendGraphOptimization instance;
    return instance;
  }
  AscendGraphOptimization() = default;
  ~AscendGraphOptimization() = default;
  AscendGraphOptimization(const AscendGraphOptimization &) = delete;
  AscendGraphOptimization &operator=(const AscendGraphOptimization &) = delete;

  void OptimizeGraph(const KernelGraphPtr &graph);
  void SetOperatorInfo(const std::vector<CNodePtr> &nodes);
  void UnifyMindIR(const KernelGraphPtr &graph);

 private:
  // Graph Optimized level-2 interface
  void OptimizeGraphWithoutDeviceInfo(const KernelGraphPtr &graph);
  void OptimizeGraphWithDeviceInfo(const KernelGraphPtr &graph);
  void OptimizeExecutionOrder(const KernelGraphPtr &graph);
  void PostOptimization(const KernelGraphPtr &graph);

  // Graph Optimized level-3 interface
  void IRFusionOptimization(const KernelGraphPtr &graph);
  void UpdateRefOutputMap(const KernelGraphPtr &graph);
  void AddGraphToManager(const NotNull<KernelGraphPtr> graph, NotNull<FuncGraphManagerPtr> manager);
  void SelectKernel(const KernelGraphPtr &graph);
  void RecurseSelectKernelInfo(const KernelGraphPtr &graph);
  void HardWareOptimization(const KernelGraphPtr &graph);
  void HandleControlFlow(const NotNull<KernelGraphPtr> graph);
  void RootGraphExecutorValidate(NotNull<KernelGraphPtr> graph);

  // Number of operators whose precision changes after select kernel
  size_t raise_precision_count_{0};
  size_t reduce_precision_count_{0};
  // The graphs has been traversed when the graph id traversed recursively.
  // Note: Please clean the set before each use.
  std::set<KernelGraphPtr> memo_;
};
} // namespace ascend
} // namespace device
} // namespace mindspore

#endif // MINDSPORE_ASCEND_GRAPH_OPTIMIZATION_H
@@ -136,6 +136,8 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
"../../../mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc"
"../../../mindspore/ccsrc/runtime/device/ascend/ascend_memory_pool.cc"
"../../../mindspore/ccsrc/runtime/device/ascend/lic_manager.cc"
"../../../mindspore/ccsrc/runtime/hardware/ascend/ascend_device_context.cc"
"../../../mindspore/ccsrc/runtime/hardware/ascend/ascend_graph_optimization.cc"
"../../../mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.cc"
"../../../mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel_factory.cc"
"../../../mindspore/ccsrc/backend/kernel_compiler/cpu/sparse_apply_adam_cpu_kernel.cc"