diff --git a/mindspore/ccsrc/backend/session/ascend_session.cc b/mindspore/ccsrc/backend/session/ascend_session.cc index 2bc82f64187..5880b9911bf 100644 --- a/mindspore/ccsrc/backend/session/ascend_session.cc +++ b/mindspore/ccsrc/backend/session/ascend_session.cc @@ -192,8 +192,11 @@ void GenOpOutputStubTensor(const KernelGraphPtr &single_op_graph, const CNodePtr device_info.format_ = output_format; device_info.data_type_ = TypeIdToType(output_type); stub_output_tensor->set_device_info(device_info); - device::DeviceAddressPtr device_address = - std::make_shared(nullptr, 0, output_format, output_type); + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + auto device_id = ms_context->get_param(MS_CTX_DEVICE_ID); + device::DeviceAddressPtr device_address = std::make_shared( + nullptr, 0, output_format, output_type, kAscendDevice, device_id); stub_output_tensor->set_device_address(device_address); output_tensor_info.output_stub_tensor = stub_output_tensor; auto kernel_info = dynamic_cast(output_node->kernel_info()); @@ -721,7 +724,7 @@ void AscendSession::BatchBuildKernel(const std::vector atomic_node_to_build; for (auto &graph : graphs) { - device::ascend::InsertAtomicCleanOp(graph); + device::ascend::InsertAtomicCleanOps(graph); const auto &nodes = graph->execution_order(); std::copy(nodes.begin(), nodes.end(), std::back_inserter(atomic_node_to_build)); } @@ -998,10 +1001,10 @@ void AscendSession::BuildOpsInGraph(const GraphId &graph_id, const std::mapexecution_order(); std::copy(execution_order.begin(), execution_order.end(), std::back_inserter(kernels)); } @@ -1078,7 +1081,7 @@ void AscendSession::AdjustKernel(const std::shared_ptr &kernel_grap // Insert CLearZero op // prepare for next step from json get atomic info BuildKernel(kernel_graph); - device::ascend::InsertAtomicCleanOp(kernel_graph); + device::ascend::InsertAtomicCleanOps(kernel_graph); device::KernelAdjust::GetInstance().InsertDeviceLoopCtrl(kernel_graph); device::KernelAdjust::GetInstance().ProcessLoopSink(kernel_graph); #ifdef ENABLE_DUMP_IR @@ -1098,7 +1101,7 @@ void AscendSession::RunOpAdjustKernel(const std::shared_ptr &kernel // Insert CLearZero op // prepare for next step from json get atomic info BuildKernel(kernel_graph); - device::ascend::InsertAtomicCleanOp(kernel_graph); + device::ascend::InsertAtomicCleanOps(kernel_graph); MS_LOG(INFO) << "Finish!"; } diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc index 3c613ce1385..c7a605d0a77 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc @@ -26,6 +26,8 @@ #include "runtime/device/memory_manager.h" #include "runtime/device/convert_tensor_utils.h" #include "runtime/device/ascend/ascend_launch_transdata.h" +#include "runtime/hardware/device_context_manager.h" +#include "runtime/hardware/ascend/ascend_device_context.h" #include "ir/dtype/type.h" #include "ir/tensor.h" #include "abstract/utils.h" @@ -162,6 +164,25 @@ bool SyncDeviceToHostAndFloatToFloat64(void *dst, size_t dst_size, const void *s return true; } +void AscendDeviceAddress::BindDevice() const { + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + if (!MsContext::GetInstance()->get_param(MS_CTX_ENABLE_MINDRT)) { + return; + } + + // Bind device by device name and device id on the current thread. 
+ if (device_name_ != "") { + auto device_context = + device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_name_, device_id_}); + auto ascend_device_context = dynamic_cast(device_context); + MS_EXCEPTION_IF_NULL(ascend_device_context); + if (!ascend_device_context->BindDeviceToCurrentThread()) { + MS_LOG(EXCEPTION) << "BindDeviceToCurrentThread failed."; + } + } +} + void AscendDeviceAddress::SyncStream() const { MS_LOG(DEBUG) << "SyncStream Start!"; auto ms_context = MsContext::GetInstance(); @@ -183,6 +204,7 @@ void AscendDeviceAddress::SyncStream() const { bool AscendDeviceAddress::SyncDeviceToHost(size_t size, void *const host_ptr) const { MS_EXCEPTION_IF_NULL(host_ptr); + BindDevice(); SyncStream(); SyncMemory(host_ptr, ptr_, size, RT_MEMCPY_DEVICE_TO_HOST); return true; @@ -190,6 +212,7 @@ bool AscendDeviceAddress::SyncDeviceToHost(size_t size, void *const host_ptr) co bool AscendDeviceAddress::SyncHostToDevice(size_t size, const void *host_ptr) const { MS_EXCEPTION_IF_NULL(host_ptr); + BindDevice(); SyncMemory(ptr_, host_ptr, size, RT_MEMCPY_HOST_TO_DEVICE); return true; } @@ -201,6 +224,7 @@ bool AscendDeviceAddress::SyncDeviceToHost(const ShapeVector &shape, size_t size if (type_id_ > kMonadTypeBegin && type_id_ < kMonadTypeEnd) { return true; } + BindDevice(); SyncStream(); bool sync_ok = false; std::vector host_shape; @@ -368,7 +392,7 @@ bool AscendDeviceAddress::SyncHostToDevice(const ShapeVector &shape, size_t size if (type_id_ > kMonadTypeBegin && type_id_ < kMonadTypeEnd) { return true; } - + BindDevice(); bool sync_ok = false; std::vector host_shape; (void)std::transform(shape.begin(), shape.end(), std::back_inserter(host_shape), LongToSize); @@ -416,6 +440,7 @@ bool AscendDeviceAddress::SyncDeviceToDevice(const ShapeVector &, size_t size, T if (type_id_ > kMonadTypeBegin && type_id_ < kMonadTypeEnd) { return true; } + BindDevice(); bool sync_ok = false; if (format_ == format && type_id_ == type) { if (!DataSync(ptr_, src_ptr, size)) { diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.h b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.h index 5cc58d97942..5a82e3185cf 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.h +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.h @@ -36,11 +36,14 @@ namespace ascend { class AscendDeviceAddress : public DeviceAddress { public: explicit AscendDeviceAddress(void *ptr, size_t size) : DeviceAddress(ptr, size) {} - explicit AscendDeviceAddress(void *ptr, size_t size, const std::string &format, TypeId type_id) - : DeviceAddress(ptr, size, format, type_id) {} + explicit AscendDeviceAddress(void *ptr, size_t size, const std::string &device_name, uint32_t device_id) + : DeviceAddress(ptr, size, device_name, device_id) {} explicit AscendDeviceAddress(void *ptr, size_t size, const std::string &format, TypeId type_id, - const KernelWithIndex &node_index) - : DeviceAddress(ptr, size, format, type_id, node_index) {} + const std::string &device_name, uint32_t device_id) + : DeviceAddress(ptr, size, format, type_id, device_name, device_id) {} + explicit AscendDeviceAddress(void *ptr, size_t size, const std::string &format, TypeId type_id, + const KernelWithIndex &node_index, const std::string &device_name, uint32_t device_id) + : DeviceAddress(ptr, size, format, type_id, node_index, device_name, device_id) {} ~AscendDeviceAddress() override; bool SyncDeviceToHost(size_t size, void *const host_ptr) const override; bool SyncHostToDevice(size_t size, const 
void *host_ptr) const override; @@ -71,6 +74,7 @@ class AscendDeviceAddress : public DeviceAddress { const std::string &ori_format, const std::string &dst_format) const; mutable std::shared_ptr launch_transdata_{nullptr}; + void BindDevice() const; }; using AscendDeviceAddressPtr = std::shared_ptr; } // namespace ascend diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc index 928b9479d15..5881c5d33bb 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc @@ -404,12 +404,19 @@ bool AscendKernelRuntime::KernelMemNotReuse(const AnfNodePtr &node) { DeviceAddressPtr AscendKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, TypeId type_id) const { - return std::make_shared(device_ptr, device_size, format, type_id); + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + auto device_id = ms_context->get_param(MS_CTX_DEVICE_ID); + return std::make_shared(device_ptr, device_size, format, type_id, kAscendDevice, device_id); } DeviceAddressPtr AscendKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, TypeId type_id, const KernelWithIndex &node_index) const { - return std::make_shared(device_ptr, device_size, format, type_id, node_index); + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + auto device_id = ms_context->get_param(MS_CTX_DEVICE_ID); + return std::make_shared(device_ptr, device_size, format, type_id, node_index, kAscendDevice, + device_id); } bool AscendKernelRuntime::Load(const session::KernelGraph &graph, bool is_task_sink) { diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h index 601b6f96ef6..c6f8e6a2beb 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h @@ -71,6 +71,9 @@ class AscendKernelRuntime : public KernelRuntime { void *compute_stream() const override { return stream_; } void *communication_stream() const override { return communication_stream_; } void *GetModelStream(uint32_t graph_id) const override; + // add for MindRT + void ReleaseDeviceRes() override; + void SetCurrentContext(); protected: DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, @@ -87,10 +90,8 @@ class AscendKernelRuntime : public KernelRuntime { static bool HcclInit(); static bool NeedDestroyHccl(); static bool DestroyHccl(); - void SetCurrentContext(); void ClearGraphModelMap(); - void ReleaseDeviceRes() override; bool GraphWithEmptyTaskList(const session::KernelGraph &graph) const; bool CheckGraphIdValid(GraphId graph_id) const; #ifndef ENABLE_SECURITY diff --git a/mindspore/ccsrc/runtime/device/ascend/executor/ai_core_dynamic_kernel.cc b/mindspore/ccsrc/runtime/device/ascend/executor/ai_core_dynamic_kernel.cc index ed1a8ae795a..a3a69e3d10a 100644 --- a/mindspore/ccsrc/runtime/device/ascend/executor/ai_core_dynamic_kernel.cc +++ b/mindspore/ccsrc/runtime/device/ascend/executor/ai_core_dynamic_kernel.cc @@ -147,7 +147,7 @@ void AiCoreDynamicKernel::AllocateWorkspace() { workspace_addr_.clear(); for (auto size : workspaces_size_) { - auto device_address_ptr = std::make_shared(nullptr, size); + auto device_address_ptr = std::make_shared(nullptr, size, kAscendDevice, 
device_id); auto device_ptr = runtime_instance->MallocMem(MemType::kDynamicMem, size, device_address_ptr); if (device_ptr == nullptr) { MS_LOG(EXCEPTION) << "MallocMem from memory pool failed. Node info :" << cnode->fullname_with_scope(); diff --git a/mindspore/ccsrc/runtime/device/ascend/kernel_build_ascend.cc b/mindspore/ccsrc/runtime/device/ascend/kernel_build_ascend.cc index 1b18077d19c..bc0b288052e 100644 --- a/mindspore/ccsrc/runtime/device/ascend/kernel_build_ascend.cc +++ b/mindspore/ccsrc/runtime/device/ascend/kernel_build_ascend.cc @@ -338,7 +338,6 @@ void ProcessAtomicFusion(const std::vector &kernels, CleanOpsMap *clea InsertFusionAtomicOp(first_node, fusion_clear_inputs, clean_size_list, clean_ops); } } -} // namespace void InsertAtomicOps(const std::vector &kernels, CleanOpsMap *clean_ops) { // fusion @@ -358,9 +357,9 @@ void InsertAtomicOps(const std::vector &kernels, CleanOpsMap *clean_op } } -std::map> GetCommunicationOpInputInfo(const std::vector &exe_orders) { +std::map> GetCommunicationOpInputInfo(const std::vector &kernels) { std::map> comm_input_info_map; - for (auto &kernel : exe_orders) { + for (auto &kernel : kernels) { MS_EXCEPTION_IF_NULL(kernel); auto input_num = AnfAlgo::GetInputTensorNum(kernel); if (mindspore::session::AnfRuntimeAlgorithm::IsCommunicationOp(kernel)) { @@ -401,12 +400,12 @@ std::map> GetCommunicationOpInputInfo(const std: return comm_input_info_map; } -void AddNeedInsertAtomicAttrForAllOps(const std::vector &exe_orders) { - if (exe_orders.empty()) { +void TagNeedInsertAtomicAttr(const std::vector &nodes) { + if (nodes.empty()) { return; } - std::map> comm_input_info_map = GetCommunicationOpInputInfo(exe_orders); - for (const auto &anf_node : exe_orders) { + std::map> comm_input_info_map = GetCommunicationOpInputInfo(nodes); + for (const auto &anf_node : nodes) { if (comm_input_info_map.find(anf_node) != comm_input_info_map.end()) { auto indexes = comm_input_info_map[anf_node]; if (AnfAlgo::HasNodeAttr(kAttrAtomicOutputIndexs, anf_node)) { @@ -433,23 +432,24 @@ std::vector GatherAllAtomicOps(const CleanOpsMap &node_maps) { } return all_atomics; } +} // namespace -void InsertAtomicCleanOpForMindRT(const std::vector &exe_orders, CleanOpsMap *maps) { +void InsertAtomicCleanOps(const std::vector &nodes, CleanOpsMap *maps) { MS_EXCEPTION_IF_NULL(maps); // assign attr - AddNeedInsertAtomicAttrForAllOps(exe_orders); + TagNeedInsertAtomicAttr(nodes); // insert atomic - InsertAtomicOps(exe_orders, maps); + InsertAtomicOps(nodes, maps); std::vector all_atomics = GatherAllAtomicOps(*maps); // build atomic KernelBuild(all_atomics); } -void InsertAtomicCleanOp(const KernelGraphPtr &kernel_graph) { +void InsertAtomicCleanOps(const KernelGraphPtr &kernel_graph) { MS_EXCEPTION_IF_NULL(kernel_graph); const auto &exe_orders = kernel_graph->execution_order(); // assign attr - AddNeedInsertAtomicAttrForAllOps(exe_orders); + TagNeedInsertAtomicAttr(exe_orders); // insert atomic CleanOpsMap node_to_cleans; InsertAtomicOps(exe_orders, &node_to_cleans); diff --git a/mindspore/ccsrc/runtime/device/ascend/kernel_build_ascend.h b/mindspore/ccsrc/runtime/device/ascend/kernel_build_ascend.h index 26345ef5f20..a30e1fdfc29 100644 --- a/mindspore/ccsrc/runtime/device/ascend/kernel_build_ascend.h +++ b/mindspore/ccsrc/runtime/device/ascend/kernel_build_ascend.h @@ -32,36 +32,15 @@ using CleanOpsMap = std::map>; */ bool KernelBuild(const std::vector &kernels); -/** - * @brief preprocess of kernel build for ascend, e.g. inserting clear_zero node for max_pool, bn. 
- * Must DO these changes just before kernel build, and after all of other optimizations on AnfGraph - */ -void InsertAtomicCleanOp(const KernelGraphPtr &kernel_graph); - -/** - * @brief preprocess for mind rt - * */ -void InsertAtomicCleanOpForMindRT(const std::vector &exe_orders, CleanOpsMap *maps); - -/** - * @brief communication op input info. - * */ -CommOpInputInfo GetCommunicationOpInputInfo(const std::vector &exe_orders); - /** * @brief insert atomic - * */ -void InsertAtomicOps(const std::vector &exe_orders, CleanOpsMap *clean_ops); + */ +void InsertAtomicCleanOps(const KernelGraphPtr &kernel_graph); /** - * @brief gather all atomics + * @brief insert atomic for mind rt * */ -std::vector GatherAllAtomicOps(const CleanOpsMap &node_maps); - -/** - * @brief add attr for op if need insert atomic - * */ -void AddNeedInsertAtomicAttrForAllOps(const std::vector &exe_orders); +void InsertAtomicCleanOps(const std::vector &nodes, CleanOpsMap *maps); } // namespace ascend } // namespace device } // namespace mindspore diff --git a/mindspore/ccsrc/runtime/device/device_address.h b/mindspore/ccsrc/runtime/device/device_address.h index b1cbff61685..00910539cf4 100644 --- a/mindspore/ccsrc/runtime/device/device_address.h +++ b/mindspore/ccsrc/runtime/device/device_address.h @@ -38,6 +38,7 @@ class CPUDeviceContext; namespace ascend { class AscendKernelRuntime; class AscendMemoryManager; +class AscendDeviceContext; #ifndef ENABLE_SECURITY class DataDumper; #endif @@ -71,9 +72,21 @@ class DeviceAddress : public mindspore::DeviceSync { explicit DeviceAddress(void *ptr, size_t size, const std::string &format, TypeId type_id, const KernelWithIndex &node_index) : ptr_(ptr), size_(size), format_(format), type_id_(type_id), node_index_(node_index) {} - explicit DeviceAddress(void *ptr, size_t size, const std::string &format, TypeId type_id, - const std::string &device_name, uint32_t device_id) + + explicit DeviceAddress(void *ptr, size_t size, const std::string &device_name, uint32_t device_id) + : ptr_(ptr), size_(size), device_name_(device_name), device_id_(device_id) {} + explicit DeviceAddress(void *ptr, size_t size, const string &format, TypeId type_id, const std::string &device_name, + uint32_t device_id) : ptr_(ptr), size_(size), format_(format), type_id_(type_id), device_name_(device_name), device_id_(device_id) {} + explicit DeviceAddress(void *ptr, size_t size, const std::string &format, TypeId type_id, + const KernelWithIndex &node_index, const std::string &device_name, uint32_t device_id) + : ptr_(ptr), + size_(size), + format_(format), + type_id_(type_id), + node_index_(node_index), + device_name_(device_name), + device_id_(device_id) {} virtual ~DeviceAddress() { ptr_ = nullptr; } const void *GetPtr() const { return ptr_; } @@ -133,6 +146,7 @@ class DeviceAddress : public mindspore::DeviceSync { friend class mindspore::device::gpu::GPUDeviceContext; friend class mindspore::device::ascend::AscendKernelRuntime; friend class mindspore::device::ascend::AscendMemoryManager; + friend class mindspore::device::ascend::AscendDeviceContext; #ifndef ENABLE_SECURITY friend class mindspore::device::ascend::DataDumper; #endif diff --git a/mindspore/ccsrc/runtime/device/kernel_adjust.cc b/mindspore/ccsrc/runtime/device/kernel_adjust.cc index 4fa834578de..a76e0932f40 100644 --- a/mindspore/ccsrc/runtime/device/kernel_adjust.cc +++ b/mindspore/ccsrc/runtime/device/kernel_adjust.cc @@ -989,7 +989,12 @@ void KernelAdjust::AssignLoopCtrlTensorMem(const session::KernelGraph &kernel_gr auto format = 
AnfAlgo::GetOutputFormat(param, 0); auto type_id = AnfAlgo::GetOutputDeviceDataType(param, 0); - device_address = std::make_shared(nullptr, size, format, type_id); + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + auto device_id = ms_context->get_param(MS_CTX_DEVICE_ID); + device_address = + std::make_shared(nullptr, size, format, type_id, kAscendDevice, device_id); + if (runtime_instance->MallocMem(kStaticMem, size, device_address) == nullptr) { MS_LOG(EXCEPTION) << "Cannot alloc static memory for device loop control parameter " << name << " , tensor size is : " << size; diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.h b/mindspore/ccsrc/runtime/device/kernel_runtime.h index 4834d4ecaec..08dc7d7d0eb 100644 --- a/mindspore/ccsrc/runtime/device/kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/kernel_runtime.h @@ -119,6 +119,11 @@ class KernelRuntime { virtual DeviceAddressPtr AssignExtraStaticMem(const TensorPtr &tensor, const AnfNodePtr &node, size_t index); virtual void *GetModelStream(uint32_t graph_id) const { return nullptr; } + // add for MindRT + std::shared_ptr GetMemoryManager() { return mem_manager_; } + void AssignStaticMemoryOutput(const session::KernelGraph &graph); + void AssignDynamicMemory(const session::KernelGraph &graph); + protected: virtual DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, TypeId type_id) const = 0; @@ -128,7 +133,6 @@ class KernelRuntime { virtual bool KernelMemNotReuse(const AnfNodePtr &node); void AssignStaticMemory(const session::KernelGraph &graph); - void AssignDynamicMemory(const session::KernelGraph &graph); void AssignNodeOutputMem(MemType type, const AnfNodePtr &node, int index); void AssignWorkSpaceMem(MemType type, const AnfNodePtr &node); @@ -154,7 +158,6 @@ class KernelRuntime { const AnfNodePtr &kernel, bool mock); void AssignCommunicationMem(const session::KernelGraph &graph); - void AssignStaticMemoryOutput(const session::KernelGraph &graph); bool LaunchKernelMod(const session::KernelGraph &graph, bool mock = false); void LaunchKernelEvent(const std::vector>> &run_events, size_t index) const; void DebugStreamSync(const CNodePtr &kernel); diff --git a/mindspore/ccsrc/runtime/hardware/CMakeLists.txt b/mindspore/ccsrc/runtime/hardware/CMakeLists.txt index e1be6200a6f..8847812b589 100644 --- a/mindspore/ccsrc/runtime/hardware/CMakeLists.txt +++ b/mindspore/ccsrc/runtime/hardware/CMakeLists.txt @@ -1,6 +1,10 @@ file(GLOB_RECURSE HARDWARE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "device_context_manager.cc") +if(ENABLE_D) + file(GLOB_RECURSE HARDWARE_D_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "ascend/*.cc") +endif() + if(ENABLE_GPU) file(GLOB_RECURSE HARDWARE_GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "gpu/*.cc") endif() diff --git a/mindspore/ccsrc/runtime/hardware/ascend/ascend_device_context.cc b/mindspore/ccsrc/runtime/hardware/ascend/ascend_device_context.cc new file mode 100644 index 00000000000..b5b6c56b205 --- /dev/null +++ b/mindspore/ccsrc/runtime/hardware/ascend/ascend_device_context.cc @@ -0,0 +1,346 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "runtime/hardware/ascend/ascend_device_context.h" +#include +#include +#include "backend/optimizer/ascend/ascend_backend_optimization.h" +#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h" +#include "backend/session/ascend_auto_monad.h" +#include "utils/context/graph_kernel_flags.h" +#include "runtime/device/ascend/kernel_select_ascend.h" +#include "runtime/device/kernel_adjust.h" +#include "runtime/device/ascend/ascend_stream_assign.h" +#include "runtime/device/ascend/kernel_build_ascend.h" +#include "runtime/hardware/ascend/ascend_graph_optimization.h" + +#ifndef ENABLE_SECURITY +#include "debug/data_dump/dump_json_parser.h" +#include "toolchain/adx_datadump_server.h" +#include "debug/anf_ir_dump.h" +#include "debug/dump_proto.h" +#include "debug/data_dump/e2e_dump.h" +#endif + +namespace mindspore { +namespace device { +namespace ascend { +using KernelGraph = mindspore::session::KernelGraph; + +#ifndef ENABLE_SECURITY +void DumpInit(uint32_t device_id) { + auto &json_parser = DumpJsonParser::GetInstance(); + json_parser.Parse(); + json_parser.CopyDumpJsonToDir(device_id); + json_parser.CopyHcclJsonToDir(device_id); + json_parser.CopyMSCfgJsonToDir(device_id); + if (json_parser.async_dump_enabled()) { + if (AdxDataDumpServerInit() != 0) { + MS_LOG(EXCEPTION) << "Adx data dump server init failed"; + } + } +} + +void DumpSetup(const KernelGraphPtr &graph) { + MS_LOG(DEBUG) << "Start!"; + MS_EXCEPTION_IF_NULL(graph); + E2eDump::DumpSetup(graph.get()); + MS_LOG(DEBUG) << "Finish!"; +} + +void Dump(const KernelGraphPtr &graph, uint32_t rank_id) { + MS_LOG(DEBUG) << "Start!"; + MS_EXCEPTION_IF_NULL(graph); + E2eDump::DumpData(graph.get(), rank_id); + MS_LOG(DEBUG) << "Finish!"; +} +#endif + +void AscendDeviceContext::Initialize() { + MS_LOG(INFO) << "Status record: Enter Initialize..."; + if (initialized_) { + MS_EXCEPTION_IF_NULL(runtime_instance_); + runtime_instance_->SetCurrentContext(); + return; + } + + MS_LOG(INFO) << "Status record: Initialize start..."; + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + auto device_id = ms_context->get_param(MS_CTX_DEVICE_ID); + runtime_instance_ = dynamic_cast( + device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id)); + MS_EXCEPTION_IF_NULL(runtime_instance_); + if (!runtime_instance_->Init()) { + MS_LOG(EXCEPTION) << "Kernel runtime init error."; + } + mem_manager_ = runtime_instance_->GetMemoryManager(); + MS_EXCEPTION_IF_NULL(mem_manager_); + + auto env_rank_id = common::GetEnv("RANK_ID"); + if (ms_context->get_param(MS_CTX_ENABLE_HCCL) && !env_rank_id.empty()) { + // get actual rank id if it's distribution training case. 
+ rank_id_ = GetRankId(); + } +#ifndef ENABLE_SECURITY + DumpInit(rank_id_); +#endif + initialized_ = true; + MS_LOG(INFO) << "Status record: Initialize success."; +} + +void AscendDeviceContext::Destroy() { + MS_LOG(INFO) << "Status record: Enter Destroy..."; + if (!initialized_) { + return; + } + MS_LOG(INFO) << "Status record: Destroy start..."; + rank_id_ = 0; + if (runtime_instance_ != nullptr) { + runtime_instance_->ReleaseDeviceRes(); + runtime_instance_ = nullptr; + } + initialized_ = false; + MS_LOG(INFO) << "Status record: Destroy success."; +} + +std::vector AscendDeviceContext::PartitionGraph( + const FuncGraphPtr &func_graph, const std::vector &default_partition_segments) { + return std::vector(); +} + +void AscendDeviceContext::UnifyMindIR(const KernelGraphPtr &graph) const { + MS_EXCEPTION_IF_NULL(graph); + AscendGraphOptimization::GetInstance().UnifyMindIR(graph); +} + +void AscendDeviceContext::OptimizeGraph(const KernelGraphPtr &graph) const { + MS_EXCEPTION_IF_NULL(graph); + AscendGraphOptimization::GetInstance().OptimizeGraph(graph); +} + +void AscendDeviceContext::SetOperatorInfo(const std::vector &nodes) const { + AscendGraphOptimization::GetInstance().SetOperatorInfo(nodes); +} + +void AscendDeviceContext::CreateKernel(const std::vector &nodes) const { + MS_LOG(INFO) << "CreateKernel Start..."; + struct timeval start_time, end_time; + (void)gettimeofday(&start_time, nullptr); + auto ret = device::ascend::KernelBuild(nodes); + if (!ret) { + MS_LOG(EXCEPTION) << "Kernel build error."; + } + (void)gettimeofday(&end_time, nullptr); + const uint64_t kUSecondInSecond = 1000000; + uint64_t cost = kUSecondInSecond * static_cast(end_time.tv_sec - start_time.tv_sec); + cost += static_cast(end_time.tv_usec - start_time.tv_usec); + MS_LOG(INFO) << "CreateKernel finish run in " << PRIu64 << " us " << cost; +} + +void AscendDeviceContext::UpdateExecOrder(const KernelGraphPtr &graph) const { + MS_EXCEPTION_IF_NULL(graph); + std::vector new_orders; + auto nodes = graph->execution_order(); + for (const auto &node : nodes) { + if (node_atomics_.find(node) != node_atomics_.end()) { + auto atomics = node_atomics_[node]; + (void)std::copy(atomics.begin(), atomics.end(), std::back_inserter(new_orders)); + } + new_orders.push_back(node); + } + graph->set_execution_order(new_orders); + node_atomics_.clear(); +} + +void AscendDeviceContext::PreprocessBeforeRunGraph(const KernelGraphPtr &graph) const { + MS_EXCEPTION_IF_NULL(graph); + MS_LOG(INFO) << "PreprocessBeforeRunGraph Start for graph " << graph->graph_id(); + device::ascend::InsertAtomicCleanOps(graph->execution_order(), &node_atomics_); + if (graph->is_executing_sink()) { + UpdateExecOrder(graph); + device::KernelAdjust::GetInstance().InsertDeviceLoopCtrl(graph); + device::KernelAdjust::GetInstance().ProcessLoopSink(graph); + AscendStreamAssign::GetInstance().AssignStream(NOT_NULL(graph)); + CreateKernel(graph->execution_order()); + AllocateGraphMemory(NOT_NULL(graph)); + LoadModel(NOT_NULL(graph)); + MS_LOG(INFO) << "PreprocessBeforeRunGraph success."; + return; + } + MS_LOG(INFO) << "PreprocessBeforeRunGraph success."; +} + +void AscendDeviceContext::AllocateGraphMemory(const NotNull &root_graph) const { + MS_EXCEPTION_IF_NULL(runtime_instance_); + runtime_instance_->ClearGlobalIdleMem(); + memo_.clear(); + AssignInputMemory(root_graph, NOT_NULL(&memo_)); + device::KernelAdjust::GetInstance().AssignLoopCtrlMemory(*root_graph.get()); + runtime_instance_->AssignStaticMemoryOutput(*root_graph.get()); + 
mem_manager_->ResetDynamicMemory(); + runtime_instance_->AssignDynamicMemory(*root_graph.get()); + runtime_instance_->UpdateRefNodeOutputMem(*root_graph.get()); +} + +void AscendDeviceContext::AssignInputMemory(const NotNull &graph, + NotNull *> const memo) const { + if (memo->find(graph) != memo->end()) { + return; + } + memo->insert(graph.get()); + + MS_LOG(INFO) << "Start to assign static memory for Parameter and Value node in graph: " << graph->graph_id(); + runtime_instance_->AssignStaticMemoryInput(*graph.get()); + runtime_instance_->AssignStaticMemoryValueNode(*graph.get()); + for (auto &child_graph : graph->child_graph_order()) { + AssignInputMemory(NOT_NULL(child_graph.lock()), memo); + } + MS_LOG(INFO) << "Finish assigning static memory for Parameter and Value node in graph: " << graph->graph_id(); +} + +void AscendDeviceContext::LoadModel(const NotNull &root_graph) const { + MS_LOG(INFO) << "Start LoadModel for graph " << root_graph->graph_id(); + MS_EXCEPTION_IF_NULL(runtime_instance_); + bool ret_ok = runtime_instance_->Load(*root_graph.get(), true); + if (!ret_ok) { + MS_LOG(EXCEPTION) << "Load task error!"; + } + MS_LOG(INFO) << "Finish!"; +} + +bool AscendDeviceContext::AllocateMemory(DeviceAddress *const &address, size_t size) const { + MS_EXCEPTION_IF_NULL(address); + MS_EXCEPTION_IF_NULL(runtime_instance_); + runtime_instance_->SetCurrentContext(); + auto device_ptr = mem_manager_->MallocMemFromMemPool(size); + if (!device_ptr) { + return false; + } + address->ptr_ = device_ptr; + address->size_ = size; + address->from_mem_pool_ = true; + return true; +} + +void AscendDeviceContext::FreeMemory(DeviceAddress *const &address) const { + MS_EXCEPTION_IF_NULL(address); + MS_EXCEPTION_IF_NULL(address->ptr_); + if (!address->from_mem_pool()) { + return; + } + mem_manager_->FreeMemFromMemPool(address->ptr_); + address->ptr_ = nullptr; +} + +bool AscendDeviceContext::AllocateContinuousMemory(const std::vector &addr_list, size_t total_size, + const std::vector &size_list) const { + MS_EXCEPTION_IF_NULL(runtime_instance_); + runtime_instance_->SetCurrentContext(); + return mem_manager_->MallocContinuousMemFromMemPool(addr_list, total_size, size_list); +} + +bool AscendDeviceContext::ExecuteGraph(const KernelGraphPtr &graph) const { + MS_EXCEPTION_IF_NULL(graph); + const uint64_t kUSecondInSecond = 1000000; + bool ret = false; + if (graph->is_executing_sink()) { +#if defined(_WIN32) || defined(_WIN64) + auto start_time = std::chrono::steady_clock::now(); +#else + struct timeval start_time {}; + struct timeval end_time {}; + (void)gettimeofday(&start_time, nullptr); +#endif + MS_EXCEPTION_IF_NULL(runtime_instance_); +#ifndef ENABLE_SECURITY + DumpSetup(graph); +#endif + { + std::lock_guard locker(launch_mutex_); + ret = runtime_instance_->RunTask(*graph); + } +#ifndef ENABLE_SECURITY + Dump(graph, GetRankID()); +#endif +#if defined(_WIN32) || defined(_WIN64) + auto end_time = std::chrono::steady_clock::now(); + std::chrono::duration> cost = end_time - start_time; + MS_LOG(INFO) << "Call MS Run Success in " << cost.count() << " us"; +#else + (void)gettimeofday(&end_time, nullptr); + uint64_t cost = kUSecondInSecond * static_cast(end_time.tv_sec - start_time.tv_sec); + cost += static_cast(end_time.tv_usec - start_time.tv_usec); + MS_LOG(INFO) << "Call MS Run Success in " << cost << " us"; +#endif + } else { + MS_LOG(EXCEPTION) << graph->ToString() << " does not sink, should launch kernels"; + } + return ret; +} + +bool AscendDeviceContext::LaunchGraph(const KernelGraphPtr &graph) const 
{ + MS_LOG(INFO) << "Status record: start launch graph. graph id: " << graph->graph_id(); + MS_EXCEPTION_IF_NULL(graph); + MS_EXCEPTION_IF_NULL(runtime_instance_); + runtime_instance_->SetCurrentContext(); + device::KernelAdjust::GetInstance().LoadDeviceLoopCtrlParameters(graph); + auto ret = ExecuteGraph(graph); + MS_LOG(INFO) << "Status record: end launch graph. graph id: " << graph->graph_id(); + return ret; +} + +bool AscendDeviceContext::SyncStream(size_t stream_id) const { + MS_EXCEPTION_IF_NULL(runtime_instance_); + return runtime_instance_->SyncStream(); +} +bool AscendDeviceContext::IsExecutingSink(const KernelGraphPtr &graph) const { return true; } +bool AscendDeviceContext::IsLoopCountSink(const KernelGraphPtr &graph) const { return true; } + +// kernel by kernel mode interface +void AscendDeviceContext::OptimizeSingleOpGraph(const KernelGraphPtr &graph) const { + MS_LOG(ERROR) << "!!! Ascend with MindRT not support kernel by kernel mode. !!! "; +} + +void AscendDeviceContext::PreprocessBeforeRunSingleOpGraph(const KernelGraphPtr &graph) const { + MS_LOG(ERROR) << "!!! Ascend with MindRT not support kernel by kernel mode. !!! "; +} + +void AscendDeviceContext::UpdateDynamicShape(const CNodePtr &kernel) const { + MS_LOG(ERROR) << "!!! Ascend with MindRT not support function UpdateDynamicShape. !!! "; +} + +std::shared_ptr AscendDeviceContext::CreateBucket(uint32_t bucket_id, uint32_t bucket_size) const { + MS_LOG(ERROR) << "!!! Ascend with MindRT not support function CreateBucket. !!! "; + return DeviceContext::CreateBucket(bucket_id, bucket_size); +} + +bool AscendDeviceContext::LaunchKernel(const CNodePtr &kernel, const vector &inputs, + const vector &workspace, const vector &outputs, + bool is_dynamic_shape) const { + MS_LOG(ERROR) << "!!! Ascend with MindRT not support kernel by kernel mode. !!! "; + return true; +} + +bool AscendDeviceContext::BindDeviceToCurrentThread() const { + runtime_instance_->SetCurrentContext(); + return true; +} + +MS_REGISTER_DEVICE(kAscendDevice, AscendDeviceContext); +} // namespace ascend +} // namespace device +} // namespace mindspore diff --git a/mindspore/ccsrc/runtime/hardware/ascend/ascend_device_context.h b/mindspore/ccsrc/runtime/hardware/ascend/ascend_device_context.h new file mode 100644 index 00000000000..12609e08998 --- /dev/null +++ b/mindspore/ccsrc/runtime/hardware/ascend/ascend_device_context.h @@ -0,0 +1,158 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_ASCEND_DEVICE_CONTEXT_H_ +#define MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_ASCEND_DEVICE_CONTEXT_H_ + +#include +#include +#include +#include +#include +#include "runtime/hardware/device_context.h" +#include "runtime/hardware/device_context_manager.h" +#include "runtime/device/memory_manager.h" +#include "runtime/device/ascend/ascend_kernel_runtime.h" +#include "runtime/device/ascend/ascend_device_address.h" + +namespace mindspore { +namespace device { +namespace ascend { +class AscendDeviceContext : public DeviceContext { + public: + explicit AscendDeviceContext(const DeviceContextKey &device_context_key) + : DeviceContext(device_context_key), mem_manager_(nullptr), initialized_(false) {} + ~AscendDeviceContext() override = default; + + // Initialize the device context. + void Initialize() override; + + // Destroy device context and release device resource. + void Destroy() override; + + // Get rank id for distributed training. + uint32_t GetRankID() const override { return rank_id_; } + + // Partition the function graph through the device capability and return the partition segments. + // The second parameter is the default partition segments which are provided by the framework. + // Device can reprocess the default partition segments to new segments, also can partition the function graph again. + // If Device can launch the whole graph and not expect partitioning the function graph, then return the empty + // segments. The default behavior is return the default partition segments. + std::vector PartitionGraph(const FuncGraphPtr &func_graph, + const std::vector &default_partition_segments) override; + + // Optimize the kernel graph for graph mode. + void OptimizeGraph(const KernelGraphPtr &graph) const override; + + // Optimize the single operator graph for PyNative mode. + void OptimizeSingleOpGraph(const KernelGraphPtr &graph) const override; + + // Select the matching backend kernels according to the data type and format of input and output for all + // execution operators, and set final device data type and format information for backend kernels, device + // data type and format which replace original data type and format will use for executing kernels. + void SetOperatorInfo(const std::vector &nodes) const override; + + // Generate 'KernelMod' for all kernels and set 'KernelMod' into kernel, + // 'KernelMod' is real executive object of kernel. + void CreateKernel(const std::vector &nodes) const override; + + // Adjust kernel graph before run graph, used in Graph Mode. + void PreprocessBeforeRunGraph(const KernelGraphPtr &graph) const override; + // Adjust single op kernel graph before run graph, used in PyNative Mode. + void PreprocessBeforeRunSingleOpGraph(const KernelGraphPtr &graph) const override; + + // Infer kernel shape and update abstract info for dynamic shape kernel. + void UpdateDynamicShape(const CNodePtr &kernel) const override; + + // Relevant function to allocate and free device memory. + bool AllocateMemory(DeviceAddress *const &address, size_t size) const override; + void FreeMemory(DeviceAddress *const &address) const override; + + // Allocate continuous device memory end to end into 'addr_list'. + // Communication operators may need continuous memory for input and output + // to optimize the communication performance. 
+ bool AllocateContinuousMemory(const std::vector &addr_list, size_t total_size, + const std::vector &size_list) const override; + + // Create concrete device address according different device type. + DeviceAddressPtr CreateDeviceAddress(void *const device_ptr, size_t device_size, const string &format, + TypeId type_id) const override { + return std::make_shared(device_ptr, device_size, format, type_id, + device_context_key_.device_name_, device_context_key_.device_id_); + } + + // Get device address type according different device type, such GPU, Ascend. + DeviceAddressType GetDeviceAddressType() const override { return DeviceAddressType::kAscend; } + + // Launch graph, device such as Ascend support the whole graph sink to the device executing. + bool LaunchGraph(const KernelGraphPtr &graph) const override; + + // Launch a kernel via 'KernelMod' of the kernel. + bool LaunchKernel(const CNodePtr &kernel, const std::vector &inputs, + const std::vector &workspace, const std::vector &outputs, + bool is_dynamic_shape = false) const override; + + // Synchronize stream, device such as GPU and Ascend need stream to launch kernel asynchronously, + // using 'SyncStream' to block thread and wait for completing all tasks in stream. + // Devices that do not need stream could ignore the implementation of this function. + bool SyncStream(size_t stream_id = 0) const override; + + // Create and initialize bucket for every allreduce operator. Bucket is used in PyNative distributed training mode, + // one bucket handles all resource to launch and sync allreduce operator. + std::shared_ptr CreateBucket(uint32_t bucket_id, uint32_t bucket_size) const override; + + // Unify the MindIR, the default behavior uses the common unified MindIR. + void UnifyMindIR(const KernelGraphPtr &graph) const override; + + // Whether the graph sink executing through the device capability, the default behavior is not sink and return false. + bool IsExecutingSink(const KernelGraphPtr &graph) const override; + // Whether the graph loop sink executing through the device capability, the default behavior is not loop sink and + // return false. + bool IsLoopCountSink(const KernelGraphPtr &graph) const override; + + // set rt_context_ to this thread to control device + bool BindDeviceToCurrentThread() const; + + private: + // Graph loader interface + void AllocateGraphMemory(const NotNull &root_graph) const; + void AssignInputMemory(const NotNull &graph, NotNull *> memo) const; + void LoadModel(const NotNull &root_graph) const; + void UpdateExecOrder(const KernelGraphPtr &graph) const; + + // Kernel Runtime --- only for task sink + AscendKernelRuntime *runtime_instance_{nullptr}; + std::shared_ptr mem_manager_{nullptr}; + // rank id of physical device + uint32_t rank_id_{0}; + bool initialized_{false}; + + // LaunchGraph interface + bool ExecuteGraph(const KernelGraphPtr &graph) const; + // The ExecuteGraph is not thread safety specifically, it is not recommended that multiple threads access the same + // func at the same time, so need the launch mutex when multiple threads launch the graph. + mutable std::mutex launch_mutex_; + // The graphs has been traversed when the graph id traversed recursively. + // Note: Please clean the set before each use. 
+ mutable std::set memo_; + // Using node to get it's atomics + mutable std::map> node_atomics_; +}; +} // namespace ascend +} // namespace device +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_ASCEND_DEVICE_CONTEXT_H_ diff --git a/mindspore/ccsrc/runtime/hardware/ascend/ascend_graph_optimization.cc b/mindspore/ccsrc/runtime/hardware/ascend/ascend_graph_optimization.cc new file mode 100644 index 00000000000..29aa65f5167 --- /dev/null +++ b/mindspore/ccsrc/runtime/hardware/ascend/ascend_graph_optimization.cc @@ -0,0 +1,288 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "runtime/hardware/ascend/ascend_graph_optimization.h" +#include +#include "backend/optimizer/common/common_backend_optimization.h" +#include "backend/optimizer/ascend/ascend_backend_optimization.h" +#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h" +#include "backend/session/ascend_auto_monad.h" +#include "utils/context/graph_kernel_flags.h" +#include "runtime/device/ascend/kernel_select_ascend.h" +#include "runtime/device/kernel_adjust.h" + +#ifndef ENABLE_SECURITY +#include "debug/anf_ir_dump.h" +#include "debug/dump_proto.h" +#endif + +namespace mindspore { +namespace device { +namespace ascend { +using AscendAutoMonad = mindspore::session::AscendAutoMonad; + +void AscendGraphOptimization::OptimizeGraph(const KernelGraphPtr &graph) { + MS_EXCEPTION_IF_NULL(graph); + MS_LOG(INFO) << "Status record: start optimize graph. graph id: " << graph->graph_id(); + + // empty graph dont entry to backend + if (graph->execution_order().empty()) { + MS_LOG(INFO) << graph->ToString() << " is empty graph."; + AnfAlgo::InsertMakeTupleForOutput(NOT_NULL(graph)); + graph->set_executable(false); + MS_LOG(INFO) << "Status record: end optimize graph. graph id: " << graph->graph_id(); + } + + OptimizeGraphWithoutDeviceInfo(graph); + SelectKernel(graph); + OptimizeGraphWithDeviceInfo(graph); + OptimizeExecutionOrder(graph); + PostOptimization(graph); + MS_LOG(INFO) << "Status record: end optimize graph. graph id: " << graph->graph_id(); +} + +void AscendGraphOptimization::OptimizeGraphWithoutDeviceInfo(const KernelGraphPtr &graph) { + MS_EXCEPTION_IF_NULL(graph); + HandleControlFlow(NOT_NULL(graph)); + + // add all graphs to manager first, so that don't have to make new manager in following passes. + auto manager = Manage(graph, true); + memo_.clear(); + AddGraphToManager(NOT_NULL(graph), NOT_NULL(manager)); + + memo_.clear(); + IRFusionOptimization(graph); +} + +void AscendGraphOptimization::OptimizeGraphWithDeviceInfo(const KernelGraphPtr &graph) { + MS_EXCEPTION_IF_NULL(graph); + memo_.clear(); + HardWareOptimization(graph); + AnfAlgo::InsertMakeTupleForOutput(NOT_NULL(graph)); +} + +void AscendGraphOptimization::OptimizeExecutionOrder(const KernelGraphPtr &graph) { + MS_EXCEPTION_IF_NULL(graph); + MS_LOG(INFO) << "Status record: start optimize execution order. 
graph id: " << graph->graph_id(); + // root root_graph validate,include generate execute order and so on + RootGraphExecutorValidate(NOT_NULL(graph)); + +#ifdef ENABLE_DUMP_IR + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + bool save_graphs = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_FLAG); + if (save_graphs) { + DumpIRProto(graph, "before_removeNop_" + std::to_string(graph->graph_id())); + } +#endif + + opt::HideNopNode(graph.get()); + + auto execution_order = graph->execution_order(); + AnfAlgo::ReorderExecList(NOT_NULL(&execution_order)); + graph->set_execution_order(execution_order); + +#ifndef ENABLE_SECURITY + // insert profiling point + device::KernelAdjust::GetInstance().Profiling(NOT_NULL(graph.get())); +#endif + + device::KernelAdjust::GetInstance().InsertOverflowCheckOperations(NOT_NULL(graph)); + +#ifdef ENABLE_DUMP_IR + if (save_graphs) { + DumpIR("after_adjust_kernel.ir", graph); + } +#endif + MS_LOG(INFO) << "Status record: end optimize execution order. graph id: " << graph->graph_id(); +} + +void AscendGraphOptimization::PostOptimization(const KernelGraphPtr &graph) { + MS_LOG(INFO) << "Status record: start post optimization. graph id: " << graph->graph_id(); + // copy child graph ref output map to father graph ref output map + memo_.clear(); + UpdateRefOutputMap(graph); + graph->SetInputNodes(); + graph->SetOptimizerFlag(); + MS_LOG(INFO) << "Status record: end post optimization. graph id: " << graph->graph_id(); +} + +void AscendGraphOptimization::HardWareOptimization(const KernelGraphPtr &graph) { + MS_EXCEPTION_IF_NULL(graph); + MS_LOG(INFO) << "Status record: start hardware optimize. graph id: " << graph->graph_id(); + if (memo_.find(graph) != memo_.end()) { + return; + } + memo_.insert(graph); + opt::AscendBackendOptimization(graph); + opt::CommonFinalOptimization(graph); + if (graphkernel::GraphKernelFlags::GetInstance().IsEnableGraphKernel()) { + graphkernel::GraphKernelOptimize(graph); + graph->SetExecOrderByDefault(); + } + MS_LOG(INFO) << "Status record: end hardware optimize. graph id: " << graph->graph_id(); + + for (auto &child_graph : graph->child_graph_order()) { + HardWareOptimization(child_graph.lock()); + } +} + +void AscendGraphOptimization::AddGraphToManager(const NotNull graph, + NotNull manager) { + if (memo_.find(graph) != memo_.end()) { + return; + } + memo_.insert(graph.get()); + manager->AddFuncGraph(graph.get(), false); + + for (auto &child_graph : graph->child_graph_order()) { + AddGraphToManager(NOT_NULL(child_graph.lock()), manager); + } +} + +void AscendGraphOptimization::IRFusionOptimization(const KernelGraphPtr &graph) { + MS_EXCEPTION_IF_NULL(graph); + if (memo_.find(graph) != memo_.end()) { + return; + } + memo_.insert(graph); + + opt::AscendBackendIRFusionOptimization(graph); + +#ifdef ENABLE_DUMP_IR + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + bool save_graphs = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_FLAG); + if (save_graphs) { + std::string file_name = "select_kernel_before_graph_" + std::to_string(graph->graph_id()) + ".ir"; + DumpIR(file_name, graph); + } +#endif + + for (auto &child_graph : graph->child_graph_order()) { + IRFusionOptimization(NOT_NULL(child_graph.lock())); + } +} + +void AscendGraphOptimization::HandleControlFlow(const NotNull graph) { + MS_LOG(INFO) << "Status record: start handle control flow. 
graph id: " << graph->graph_id(); + AscendAutoMonad auto_monad(graph); + auto_monad.Run(); + MS_LOG(INFO) << "Status record: end handle control flow. graph id: " << graph->graph_id(); +} + +void AscendGraphOptimization::RootGraphExecutorValidate(NotNull graph) { + MS_LOG(INFO) << "Status record: start graph executor validate. graph id: " << graph->graph_id(); + AscendAutoMonad auto_monad(graph); + auto_monad.GenerateExecuteOrder(); + MS_LOG(INFO) << "Status record: end graph executor validate. graph id: " << graph->graph_id(); +} + +void AscendGraphOptimization::RecurseSelectKernelInfo(const KernelGraphPtr &graph) { + MS_EXCEPTION_IF_NULL(graph); + if (memo_.find(graph) != memo_.end()) { + return; + } + memo_.insert(graph); + MS_LOG(INFO) << "Status record: start select kernel info. graph id: " << graph->graph_id(); + SetOperatorInfo(graph->execution_order()); + MS_LOG(INFO) << "Status record: end select kernel info. graph id: " << graph->graph_id(); + +#ifdef ENABLE_DUMP_IR + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + bool save_graphs = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_FLAG); + if (save_graphs) { + std::string file_name = "select_kernel_after_graph_" + std::to_string(graph->graph_id()) + ".ir"; + DumpIR(file_name, graph); + } +#endif + + for (auto &child_graph : graph->child_graph_order()) { + RecurseSelectKernelInfo(child_graph.lock()); + } +} + +void AscendGraphOptimization::SelectKernel(const KernelGraphPtr &graph) { + MS_LOG(INFO) << "Status record: start select kernel info. graph id: " << graph->graph_id(); + raise_precision_count_ = 0; + reduce_precision_count_ = 0; + memo_.clear(); + RecurseSelectKernelInfo(graph); + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + if (ms_context->get_param(MS_CTX_EXECUTION_MODE) == kGraphMode) { + if (raise_precision_count_ > 0) { + MS_LOG(WARNING) << "There are " << raise_precision_count_ + << " node/nodes used raise precision to selected the kernel!"; + } + if (reduce_precision_count_ > 0) { + MS_LOG(WARNING) << "There are " << reduce_precision_count_ + << " node/nodes used reduce precision to selected the kernel!"; + } + } + MS_LOG(INFO) << "Status record: end select kernel info. graph id: " << graph->graph_id(); +} + +void AscendGraphOptimization::UpdateRefOutputMap(const KernelGraphPtr &graph) { + MS_EXCEPTION_IF_NULL(graph); + if (memo_.find(graph) != memo_.end()) { + return; + } + memo_.insert(graph); + + for (auto &child_graph : graph->child_graph_order()) { + auto child_graph_ptr = child_graph.lock(); + MS_EXCEPTION_IF_NULL(child_graph_ptr); + UpdateRefOutputMap(NOT_NULL(child_graph_ptr)); + // copy ref map to final graph + auto child_ref_map = child_graph_ptr->GetRefMap(); + for (auto &item : child_ref_map) { + if (graph->IsInRefOutputMap(item.first)) { + MS_LOG(WARNING) << "The ref pair <" << item.first.first->DebugString() << ", " << item.first.second + << "> is already in " << graph->ToString(); + continue; + } + graph->AddRefCorrespondPairs(item.first, item.second); + } + } +} + +void AscendGraphOptimization::UnifyMindIR(const KernelGraphPtr &graph) { + MS_EXCEPTION_IF_NULL(graph); + MS_LOG(INFO) << "Status record: start unify mindir. graph id: " << graph->graph_id(); + opt::CommonUnifyMindIR(graph); + opt::AscendUnifyMindIR(graph); + MS_LOG(INFO) << "Status record: end unify mindir. 
graph id: " << graph->graph_id(); +} + +void AscendGraphOptimization::SetOperatorInfo(const std::vector &nodes) { + for (const auto &node : nodes) { + auto status = device::ascend::SelectKernelInfo(node); + AnfAlgo::EraseNodeAttr(kAttrPynativeNextOpName, node); + AnfAlgo::EraseNodeAttr(kAttrPynativeNextIndex, node); + if (status == device::ascend::kStatusRaisePrecision) { + raise_precision_count_++; + } else if (status == device::ascend::kStatusReducePrecision) { + reduce_precision_count_++; + } + MS_LOG(DEBUG) << "Select ApplyKernel: " << node->DebugString(); + } +} + +} // namespace ascend +} // namespace device +} // namespace mindspore diff --git a/mindspore/ccsrc/runtime/hardware/ascend/ascend_graph_optimization.h b/mindspore/ccsrc/runtime/hardware/ascend/ascend_graph_optimization.h new file mode 100644 index 00000000000..3d2a3433558 --- /dev/null +++ b/mindspore/ccsrc/runtime/hardware/ascend/ascend_graph_optimization.h @@ -0,0 +1,77 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_MINDSPORE_ASCEND_GRAPH_OPTIMIZATION_H +#define MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_MINDSPORE_ASCEND_GRAPH_OPTIMIZATION_H + +#include +#include +#include +#include +#include +#include "runtime/hardware/device_context.h" +#include "runtime/hardware/device_context_manager.h" +#include "runtime/device/memory_manager.h" +#include "runtime/device/ascend/ascend_kernel_runtime.h" +#include "runtime/device/ascend/ascend_device_address.h" + +namespace mindspore { +namespace device { +namespace ascend { +class AscendGraphOptimization { + public: + static AscendGraphOptimization &GetInstance() { + static AscendGraphOptimization instance; + return instance; + } + AscendGraphOptimization() = default; + ~AscendGraphOptimization() = default; + AscendGraphOptimization(const AscendGraphOptimization &) = delete; + AscendGraphOptimization &operator=(const AscendGraphOptimization &) = delete; + + void OptimizeGraph(const KernelGraphPtr &graph); + void SetOperatorInfo(const std::vector &nodes); + void UnifyMindIR(const KernelGraphPtr &graph); + + private: + // Graph Optimized level-2 interface + void OptimizeGraphWithoutDeviceInfo(const KernelGraphPtr &graph); + void OptimizeGraphWithDeviceInfo(const KernelGraphPtr &graph); + void OptimizeExecutionOrder(const KernelGraphPtr &graph); + void PostOptimization(const KernelGraphPtr &graph); + + // Graph Optimized level-3 interface + void IRFusionOptimization(const KernelGraphPtr &graph); + void UpdateRefOutputMap(const KernelGraphPtr &graph); + void AddGraphToManager(const NotNull graph, NotNull manager); + void SelectKernel(const KernelGraphPtr &graph); + void RecurseSelectKernelInfo(const KernelGraphPtr &graph); + void HardWareOptimization(const KernelGraphPtr &graph); + void HandleControlFlow(const NotNull graph); + void RootGraphExecutorValidate(NotNull graph); + + // Number of operators whose precision changes after select kernel + size_t 
raise_precision_count_{0};
+  size_t reduce_precision_count_{0};
+  // Graphs that have already been traversed during the recursive optimization passes.
+  // Note: Please clear the set before each use.
+  std::set<KernelGraphPtr> memo_;
+};
+}  // namespace ascend
+}  // namespace device
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_MINDSPORE_ASCEND_GRAPH_OPTIMIZATION_H
diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt
index a94e3f60196..57a846b916e 100644
--- a/tests/ut/cpp/CMakeLists.txt
+++ b/tests/ut/cpp/CMakeLists.txt
@@ -136,6 +136,8 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
     "../../../mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc"
     "../../../mindspore/ccsrc/runtime/device/ascend/ascend_memory_pool.cc"
     "../../../mindspore/ccsrc/runtime/device/ascend/lic_manager.cc"
+    "../../../mindspore/ccsrc/runtime/hardware/ascend/ascend_device_context.cc"
+    "../../../mindspore/ccsrc/runtime/hardware/ascend/ascend_graph_optimization.cc"
     "../../../mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.cc"
     "../../../mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel_factory.cc"
     "../../../mindspore/ccsrc/backend/kernel_compiler/cpu/sparse_apply_adam_cpu_kernel.cc"
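
The recurring pattern in this patch is that every AscendDeviceAddress now carries the device name and device id it was created for, and binds that device to the calling thread before any host/device copy (SyncDeviceToHost, SyncHostToDevice, SyncDeviceToDevice). Below is a minimal, self-contained sketch of that pattern; the class and method names mirror the patch, but the bodies are stand-ins and no real Ascend runtime or MindSpore API is called.

// Toy model of the device-aware DeviceAddress pattern introduced by this patch.
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

namespace toy {
void BindDeviceToCurrentThread(const std::string &device_name, uint32_t device_id) {
  // Real code would set the runtime context for this thread here.
  std::cout << "bind " << device_name << ":" << device_id << " to current thread\n";
}

class DeviceAddress {
 public:
  DeviceAddress(void *ptr, size_t size, std::string device_name, uint32_t device_id)
      : ptr_(ptr), size_(size), device_name_(std::move(device_name)), device_id_(device_id) {}

  bool SyncDeviceToHost(size_t size, void *host_ptr) const {
    BindDevice();  // bind before every copy, as the patched Sync* methods do
    std::memcpy(host_ptr, ptr_, std::min(size, size_));  // stand-in for the device memcpy
    return true;
  }

 private:
  void BindDevice() const {
    if (!device_name_.empty()) {
      BindDeviceToCurrentThread(device_name_, device_id_);
    }
  }
  void *ptr_;
  size_t size_;
  std::string device_name_;
  uint32_t device_id_;
};
}  // namespace toy

int main() {
  std::vector<float> device_buf{1.f, 2.f, 3.f};  // pretend this buffer lives on the device
  std::vector<float> host_buf(3);
  toy::DeviceAddress addr(device_buf.data(), device_buf.size() * sizeof(float), "Ascend", 0);
  addr.SyncDeviceToHost(host_buf.size() * sizeof(float), host_buf.data());
  std::cout << host_buf[0] << " " << host_buf[1] << " " << host_buf[2] << "\n";
  return 0;
}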
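
PreprocessBeforeRunGraph records the generated atomic-clean nodes per kernel in node_atomics_, and UpdateExecOrder then splices each clean node in front of the kernel it serves before stream assignment. A small self-contained sketch of that splice follows, with plain strings standing in for CNodePtr; the container layout is an assumption for illustration only.

// Toy version of the UpdateExecOrder splice: each kernel may have helper
// "atomic clean" nodes that must run immediately before it.
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> exec_order{"conv", "allreduce", "matmul"};
  std::map<std::string, std::vector<std::string>> node_atomics{
      {"allreduce", {"atomic_clean_0"}}, {"matmul", {"atomic_clean_1"}}};

  std::vector<std::string> new_order;
  for (const auto &node : exec_order) {
    auto it = node_atomics.find(node);
    if (it != node_atomics.end()) {
      // run the clean ops first so the kernel sees zero-initialized output/workspace memory
      new_order.insert(new_order.end(), it->second.begin(), it->second.end());
    }
    new_order.push_back(node);
  }

  for (const auto &node : new_order) std::cout << node << "\n";
  // prints: conv, atomic_clean_0, allreduce, atomic_clean_1, matmul
  return 0;
}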