ascend device context

LaiYongqiang 2021-10-28 14:57:29 +08:00 committed by hwjiaorui
parent fc4f8812a5
commit 07e5ed9f16
17 changed files with 975 additions and 59 deletions

View File

@ -192,8 +192,11 @@ void GenOpOutputStubTensor(const KernelGraphPtr &single_op_graph, const CNodePtr
device_info.format_ = output_format;
device_info.data_type_ = TypeIdToType(output_type);
stub_output_tensor->set_device_info(device_info);
device::DeviceAddressPtr device_address =
std::make_shared<device::ascend::AscendDeviceAddress>(nullptr, 0, output_format, output_type);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
device::DeviceAddressPtr device_address = std::make_shared<device::ascend::AscendDeviceAddress>(
nullptr, 0, output_format, output_type, kAscendDevice, device_id);
stub_output_tensor->set_device_address(device_address);
output_tensor_info.output_stub_tensor = stub_output_tensor;
auto kernel_info = dynamic_cast<const device::KernelInfo *>(output_node->kernel_info());
@ -721,7 +724,7 @@ void AscendSession::BatchBuildKernel(const std::vector<std::shared_ptr<SessionTa
std::vector<CNodePtr> atomic_node_to_build;
for (auto &graph : graphs) {
device::ascend::InsertAtomicCleanOp(graph);
device::ascend::InsertAtomicCleanOps(graph);
const auto &nodes = graph->execution_order();
std::copy(nodes.begin(), nodes.end(), std::back_inserter(atomic_node_to_build));
}
@ -998,10 +1001,10 @@ void AscendSession::BuildOpsInGraph(const GraphId &graph_id, const std::map<AnfN
InitRuntimeResource();
// Compile all kernels parallel
BuildKernel(kernels);
// Some new kernel may be added after InsertAtomicCleanOp, so collect and build kernels again
// Some new kernel may be added after InsertAtomicCleanOps, so collect and build kernels again
kernels.clear();
for (const auto &graph_item : single_op_graphs) {
device::ascend::InsertAtomicCleanOp(graph_item.first);
device::ascend::InsertAtomicCleanOps(graph_item.first);
const auto &execution_order = graph_item.first->execution_order();
std::copy(execution_order.begin(), execution_order.end(), std::back_inserter(kernels));
}
@ -1078,7 +1081,7 @@ void AscendSession::AdjustKernel(const std::shared_ptr<KernelGraph> &kernel_grap
// Insert ClearZero op
// prepare for next step: get atomic info from json
BuildKernel(kernel_graph);
device::ascend::InsertAtomicCleanOp(kernel_graph);
device::ascend::InsertAtomicCleanOps(kernel_graph);
device::KernelAdjust::GetInstance().InsertDeviceLoopCtrl(kernel_graph);
device::KernelAdjust::GetInstance().ProcessLoopSink(kernel_graph);
#ifdef ENABLE_DUMP_IR
@ -1098,7 +1101,7 @@ void AscendSession::RunOpAdjustKernel(const std::shared_ptr<KernelGraph> &kernel
// Insert ClearZero op
// prepare for next step: get atomic info from json
BuildKernel(kernel_graph);
device::ascend::InsertAtomicCleanOp(kernel_graph);
device::ascend::InsertAtomicCleanOps(kernel_graph);
MS_LOG(INFO) << "Finish!";
}
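The pattern in the hunk above, reading the device id from MsContext and passing kAscendDevice along with it into the AscendDeviceAddress constructor, recurs at several call sites in this commit. A minimal sketch of that pattern factored into a helper; the name CreateStubAscendAddress is hypothetical and not part of this change:

device::DeviceAddressPtr CreateStubAscendAddress(const std::string &output_format, TypeId output_type) {
  // Hypothetical helper, not part of this commit: shows the repeated construction
  // pattern for device-aware addresses introduced by this change.
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  // The device id comes from the global context; the device name is always kAscendDevice here.
  auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
  return std::make_shared<device::ascend::AscendDeviceAddress>(nullptr, 0, output_format, output_type,
                                                               kAscendDevice, device_id);
}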

View File

@ -26,6 +26,8 @@
#include "runtime/device/memory_manager.h"
#include "runtime/device/convert_tensor_utils.h"
#include "runtime/device/ascend/ascend_launch_transdata.h"
#include "runtime/hardware/device_context_manager.h"
#include "runtime/hardware/ascend/ascend_device_context.h"
#include "ir/dtype/type.h"
#include "ir/tensor.h"
#include "abstract/utils.h"
@ -162,6 +164,25 @@ bool SyncDeviceToHostAndFloatToFloat64(void *dst, size_t dst_size, const void *s
return true;
}
void AscendDeviceAddress::BindDevice() const {
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
if (!MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
return;
}
// Bind device by device name and device id on the current thread.
if (device_name_ != "") {
auto device_context =
device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_name_, device_id_});
auto ascend_device_context = dynamic_cast<AscendDeviceContext *>(device_context);
MS_EXCEPTION_IF_NULL(ascend_device_context);
if (!ascend_device_context->BindDeviceToCurrentThread()) {
MS_LOG(EXCEPTION) << "BindDeviceToCurrentThread failed.";
}
}
}
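Under MindRT, actor threads may touch a device address without ever having set an Ascend context, so the sync entry points below call BindDevice() before copying. A hedged caller-side sketch of the host read path; the ReadBack helper is hypothetical:

// Hypothetical caller-side view: addr was constructed with {kAscendDevice, device_id}.
std::vector<uint8_t> ReadBack(const device::ascend::AscendDeviceAddress &addr, size_t size) {
  std::vector<uint8_t> host(size);
  // Internally this now runs BindDevice(), then SyncStream(), then the device-to-host copy.
  if (!addr.SyncDeviceToHost(size, host.data())) {
    MS_LOG(EXCEPTION) << "SyncDeviceToHost failed.";
  }
  return host;
}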
void AscendDeviceAddress::SyncStream() const {
MS_LOG(DEBUG) << "SyncStream Start!";
auto ms_context = MsContext::GetInstance();
@ -183,6 +204,7 @@ void AscendDeviceAddress::SyncStream() const {
bool AscendDeviceAddress::SyncDeviceToHost(size_t size, void *const host_ptr) const {
MS_EXCEPTION_IF_NULL(host_ptr);
BindDevice();
SyncStream();
SyncMemory(host_ptr, ptr_, size, RT_MEMCPY_DEVICE_TO_HOST);
return true;
@ -190,6 +212,7 @@ bool AscendDeviceAddress::SyncDeviceToHost(size_t size, void *const host_ptr) co
bool AscendDeviceAddress::SyncHostToDevice(size_t size, const void *host_ptr) const {
MS_EXCEPTION_IF_NULL(host_ptr);
BindDevice();
SyncMemory(ptr_, host_ptr, size, RT_MEMCPY_HOST_TO_DEVICE);
return true;
}
@ -201,6 +224,7 @@ bool AscendDeviceAddress::SyncDeviceToHost(const ShapeVector &shape, size_t size
if (type_id_ > kMonadTypeBegin && type_id_ < kMonadTypeEnd) {
return true;
}
BindDevice();
SyncStream();
bool sync_ok = false;
std::vector<size_t> host_shape;
@ -368,7 +392,7 @@ bool AscendDeviceAddress::SyncHostToDevice(const ShapeVector &shape, size_t size
if (type_id_ > kMonadTypeBegin && type_id_ < kMonadTypeEnd) {
return true;
}
BindDevice();
bool sync_ok = false;
std::vector<size_t> host_shape;
(void)std::transform(shape.begin(), shape.end(), std::back_inserter(host_shape), LongToSize);
@ -416,6 +440,7 @@ bool AscendDeviceAddress::SyncDeviceToDevice(const ShapeVector &, size_t size, T
if (type_id_ > kMonadTypeBegin && type_id_ < kMonadTypeEnd) {
return true;
}
BindDevice();
bool sync_ok = false;
if (format_ == format && type_id_ == type) {
if (!DataSync(ptr_, src_ptr, size)) {

View File

@ -36,11 +36,14 @@ namespace ascend {
class AscendDeviceAddress : public DeviceAddress {
public:
explicit AscendDeviceAddress(void *ptr, size_t size) : DeviceAddress(ptr, size) {}
explicit AscendDeviceAddress(void *ptr, size_t size, const std::string &format, TypeId type_id)
: DeviceAddress(ptr, size, format, type_id) {}
explicit AscendDeviceAddress(void *ptr, size_t size, const std::string &device_name, uint32_t device_id)
: DeviceAddress(ptr, size, device_name, device_id) {}
explicit AscendDeviceAddress(void *ptr, size_t size, const std::string &format, TypeId type_id,
const KernelWithIndex &node_index)
: DeviceAddress(ptr, size, format, type_id, node_index) {}
const std::string &device_name, uint32_t device_id)
: DeviceAddress(ptr, size, format, type_id, device_name, device_id) {}
explicit AscendDeviceAddress(void *ptr, size_t size, const std::string &format, TypeId type_id,
const KernelWithIndex &node_index, const std::string &device_name, uint32_t device_id)
: DeviceAddress(ptr, size, format, type_id, node_index, device_name, device_id) {}
~AscendDeviceAddress() override;
bool SyncDeviceToHost(size_t size, void *const host_ptr) const override;
bool SyncHostToDevice(size_t size, const void *host_ptr) const override;
@ -71,6 +74,7 @@ class AscendDeviceAddress : public DeviceAddress {
const std::string &ori_format,
const std::string &dst_format) const;
mutable std::shared_ptr<LaunchKernel> launch_transdata_{nullptr};
void BindDevice() const;
};
using AscendDeviceAddressPtr = std::shared_ptr<AscendDeviceAddress>;
} // namespace ascend

View File

@ -404,12 +404,19 @@ bool AscendKernelRuntime::KernelMemNotReuse(const AnfNodePtr &node) {
DeviceAddressPtr AscendKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
TypeId type_id) const {
return std::make_shared<AscendDeviceAddress>(device_ptr, device_size, format, type_id);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
return std::make_shared<AscendDeviceAddress>(device_ptr, device_size, format, type_id, kAscendDevice, device_id);
}
DeviceAddressPtr AscendKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
TypeId type_id, const KernelWithIndex &node_index) const {
return std::make_shared<AscendDeviceAddress>(device_ptr, device_size, format, type_id, node_index);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
return std::make_shared<AscendDeviceAddress>(device_ptr, device_size, format, type_id, node_index, kAscendDevice,
device_id);
}
bool AscendKernelRuntime::Load(const session::KernelGraph &graph, bool is_task_sink) {

View File

@ -71,6 +71,9 @@ class AscendKernelRuntime : public KernelRuntime {
void *compute_stream() const override { return stream_; }
void *communication_stream() const override { return communication_stream_; }
void *GetModelStream(uint32_t graph_id) const override;
// add for MindRT
void ReleaseDeviceRes() override;
void SetCurrentContext();
protected:
DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
@ -87,10 +90,8 @@ class AscendKernelRuntime : public KernelRuntime {
static bool HcclInit();
static bool NeedDestroyHccl();
static bool DestroyHccl();
void SetCurrentContext();
void ClearGraphModelMap();
void ReleaseDeviceRes() override;
bool GraphWithEmptyTaskList(const session::KernelGraph &graph) const;
bool CheckGraphIdValid(GraphId graph_id) const;
#ifndef ENABLE_SECURITY

View File

@ -147,7 +147,7 @@ void AiCoreDynamicKernel::AllocateWorkspace() {
workspace_addr_.clear();
for (auto size : workspaces_size_) {
auto device_address_ptr = std::make_shared<AscendDeviceAddress>(nullptr, size);
auto device_address_ptr = std::make_shared<AscendDeviceAddress>(nullptr, size, kAscendDevice, device_id);
auto device_ptr = runtime_instance->MallocMem(MemType::kDynamicMem, size, device_address_ptr);
if (device_ptr == nullptr) {
MS_LOG(EXCEPTION) << "MallocMem from memory pool failed. Node info :" << cnode->fullname_with_scope();

View File

@ -338,7 +338,6 @@ void ProcessAtomicFusion(const std::vector<CNodePtr> &kernels, CleanOpsMap *clea
InsertFusionAtomicOp(first_node, fusion_clear_inputs, clean_size_list, clean_ops);
}
}
} // namespace
void InsertAtomicOps(const std::vector<CNodePtr> &kernels, CleanOpsMap *clean_ops) {
// fusion
@ -358,9 +357,9 @@ void InsertAtomicOps(const std::vector<CNodePtr> &kernels, CleanOpsMap *clean_op
}
}
std::map<AnfNodePtr, std::vector<size_t>> GetCommunicationOpInputInfo(const std::vector<CNodePtr> &exe_orders) {
std::map<AnfNodePtr, std::vector<size_t>> GetCommunicationOpInputInfo(const std::vector<CNodePtr> &kernels) {
std::map<AnfNodePtr, std::vector<size_t>> comm_input_info_map;
for (auto &kernel : exe_orders) {
for (auto &kernel : kernels) {
MS_EXCEPTION_IF_NULL(kernel);
auto input_num = AnfAlgo::GetInputTensorNum(kernel);
if (mindspore::session::AnfRuntimeAlgorithm::IsCommunicationOp(kernel)) {
@ -401,12 +400,12 @@ std::map<AnfNodePtr, std::vector<size_t>> GetCommunicationOpInputInfo(const std:
return comm_input_info_map;
}
void AddNeedInsertAtomicAttrForAllOps(const std::vector<CNodePtr> &exe_orders) {
if (exe_orders.empty()) {
void TagNeedInsertAtomicAttr(const std::vector<CNodePtr> &nodes) {
if (nodes.empty()) {
return;
}
std::map<AnfNodePtr, std::vector<size_t>> comm_input_info_map = GetCommunicationOpInputInfo(exe_orders);
for (const auto &anf_node : exe_orders) {
std::map<AnfNodePtr, std::vector<size_t>> comm_input_info_map = GetCommunicationOpInputInfo(nodes);
for (const auto &anf_node : nodes) {
if (comm_input_info_map.find(anf_node) != comm_input_info_map.end()) {
auto indexes = comm_input_info_map[anf_node];
if (AnfAlgo::HasNodeAttr(kAttrAtomicOutputIndexs, anf_node)) {
@ -433,23 +432,24 @@ std::vector<CNodePtr> GatherAllAtomicOps(const CleanOpsMap &node_maps) {
}
return all_atomics;
}
} // namespace
void InsertAtomicCleanOpForMindRT(const std::vector<CNodePtr> &exe_orders, CleanOpsMap *maps) {
void InsertAtomicCleanOps(const std::vector<CNodePtr> &nodes, CleanOpsMap *maps) {
MS_EXCEPTION_IF_NULL(maps);
// assign attr
AddNeedInsertAtomicAttrForAllOps(exe_orders);
TagNeedInsertAtomicAttr(nodes);
// insert atomic
InsertAtomicOps(exe_orders, maps);
InsertAtomicOps(nodes, maps);
std::vector<CNodePtr> all_atomics = GatherAllAtomicOps(*maps);
// build atomic
KernelBuild(all_atomics);
}
void InsertAtomicCleanOp(const KernelGraphPtr &kernel_graph) {
void InsertAtomicCleanOps(const KernelGraphPtr &kernel_graph) {
MS_EXCEPTION_IF_NULL(kernel_graph);
const auto &exe_orders = kernel_graph->execution_order();
// assign attr
AddNeedInsertAtomicAttrForAllOps(exe_orders);
TagNeedInsertAtomicAttr(exe_orders);
// insert atomic
CleanOpsMap node_to_cleans;
InsertAtomicOps(exe_orders, &node_to_cleans);

View File

@ -32,36 +32,15 @@ using CleanOpsMap = std::map<CNodePtr, std::vector<CNodePtr>>;
*/
bool KernelBuild(const std::vector<CNodePtr> &kernels);
/**
* @brief Preprocess of kernel build for Ascend, e.g. inserting clear_zero nodes for max_pool and bn.
* These changes must be made just before kernel build, after all other optimizations on the AnfGraph.
*/
void InsertAtomicCleanOp(const KernelGraphPtr &kernel_graph);
/**
* @brief preprocess for mind rt
* */
void InsertAtomicCleanOpForMindRT(const std::vector<CNodePtr> &exe_orders, CleanOpsMap *maps);
/**
* @brief communication op input info.
* */
CommOpInputInfo GetCommunicationOpInputInfo(const std::vector<CNodePtr> &exe_orders);
/**
* @brief insert atomic
* */
void InsertAtomicOps(const std::vector<CNodePtr> &exe_orders, CleanOpsMap *clean_ops);
*/
void InsertAtomicCleanOps(const KernelGraphPtr &kernel_graph);
/**
* @brief gather all atomics
* @brief insert atomic for mind rt
* */
std::vector<CNodePtr> GatherAllAtomicOps(const CleanOpsMap &node_maps);
/**
* @brief add attr for op if need insert atomic
* */
void AddNeedInsertAtomicAttrForAllOps(const std::vector<CNodePtr> &exe_orders);
void InsertAtomicCleanOps(const std::vector<CNodePtr> &nodes, CleanOpsMap *maps);
} // namespace ascend
} // namespace device
} // namespace mindspore
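Both renamed entry points are exercised elsewhere in this commit: the session path passes the whole graph, while the MindRT device context collects the atomic ops into a CleanOpsMap and splices them into the execution order itself (see AscendDeviceContext::UpdateExecOrder later in this diff). A sketch of the two call patterns; the wrapper names are hypothetical:

// Task-sink / session path: the helper rewrites the graph's execution order directly.
void BuildAtomicsForSession(const KernelGraphPtr &kernel_graph) {
  device::ascend::InsertAtomicCleanOps(kernel_graph);
}

// MindRT path: keep the atomic ops in a map keyed by the owning node, then interleave
// them in front of their owners when rebuilding the execution order.
void BuildAtomicsForMindRT(const KernelGraphPtr &graph) {
  device::ascend::CleanOpsMap node_atomics;
  device::ascend::InsertAtomicCleanOps(graph->execution_order(), &node_atomics);
  std::vector<CNodePtr> new_order;
  for (const auto &node : graph->execution_order()) {
    auto iter = node_atomics.find(node);
    if (iter != node_atomics.end()) {
      (void)std::copy(iter->second.begin(), iter->second.end(), std::back_inserter(new_order));
    }
    new_order.push_back(node);
  }
  graph->set_execution_order(new_order);
}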

View File

@ -38,6 +38,7 @@ class CPUDeviceContext;
namespace ascend {
class AscendKernelRuntime;
class AscendMemoryManager;
class AscendDeviceContext;
#ifndef ENABLE_SECURITY
class DataDumper;
#endif
@ -71,9 +72,21 @@ class DeviceAddress : public mindspore::DeviceSync {
explicit DeviceAddress(void *ptr, size_t size, const std::string &format, TypeId type_id,
const KernelWithIndex &node_index)
: ptr_(ptr), size_(size), format_(format), type_id_(type_id), node_index_(node_index) {}
explicit DeviceAddress(void *ptr, size_t size, const std::string &format, TypeId type_id,
const std::string &device_name, uint32_t device_id)
explicit DeviceAddress(void *ptr, size_t size, const std::string &device_name, uint32_t device_id)
: ptr_(ptr), size_(size), device_name_(device_name), device_id_(device_id) {}
explicit DeviceAddress(void *ptr, size_t size, const string &format, TypeId type_id, const std::string &device_name,
uint32_t device_id)
: ptr_(ptr), size_(size), format_(format), type_id_(type_id), device_name_(device_name), device_id_(device_id) {}
explicit DeviceAddress(void *ptr, size_t size, const std::string &format, TypeId type_id,
const KernelWithIndex &node_index, const std::string &device_name, uint32_t device_id)
: ptr_(ptr),
size_(size),
format_(format),
type_id_(type_id),
node_index_(node_index),
device_name_(device_name),
device_id_(device_id) {}
virtual ~DeviceAddress() { ptr_ = nullptr; }
const void *GetPtr() const { return ptr_; }
@ -133,6 +146,7 @@ class DeviceAddress : public mindspore::DeviceSync {
friend class mindspore::device::gpu::GPUDeviceContext;
friend class mindspore::device::ascend::AscendKernelRuntime;
friend class mindspore::device::ascend::AscendMemoryManager;
friend class mindspore::device::ascend::AscendDeviceContext;
#ifndef ENABLE_SECURITY
friend class mindspore::device::ascend::DataDumper;
#endif

View File

@ -989,7 +989,12 @@ void KernelAdjust::AssignLoopCtrlTensorMem(const session::KernelGraph &kernel_gr
auto format = AnfAlgo::GetOutputFormat(param, 0);
auto type_id = AnfAlgo::GetOutputDeviceDataType(param, 0);
device_address = std::make_shared<device::ascend::AscendDeviceAddress>(nullptr, size, format, type_id);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
device_address =
std::make_shared<device::ascend::AscendDeviceAddress>(nullptr, size, format, type_id, kAscendDevice, device_id);
if (runtime_instance->MallocMem(kStaticMem, size, device_address) == nullptr) {
MS_LOG(EXCEPTION) << "Cannot alloc static memory for device loop control parameter " << name
<< " , tensor size is : " << size;

View File

@ -119,6 +119,11 @@ class KernelRuntime {
virtual DeviceAddressPtr AssignExtraStaticMem(const TensorPtr &tensor, const AnfNodePtr &node, size_t index);
virtual void *GetModelStream(uint32_t graph_id) const { return nullptr; }
// add for MindRT
std::shared_ptr<MemoryManager> GetMemoryManager() { return mem_manager_; }
void AssignStaticMemoryOutput(const session::KernelGraph &graph);
void AssignDynamicMemory(const session::KernelGraph &graph);
protected:
virtual DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
TypeId type_id) const = 0;
@ -128,7 +133,6 @@ class KernelRuntime {
virtual bool KernelMemNotReuse(const AnfNodePtr &node);
void AssignStaticMemory(const session::KernelGraph &graph);
void AssignDynamicMemory(const session::KernelGraph &graph);
void AssignNodeOutputMem(MemType type, const AnfNodePtr &node, int index);
void AssignWorkSpaceMem(MemType type, const AnfNodePtr &node);
@ -154,7 +158,6 @@ class KernelRuntime {
const AnfNodePtr &kernel, bool mock);
void AssignCommunicationMem(const session::KernelGraph &graph);
void AssignStaticMemoryOutput(const session::KernelGraph &graph);
bool LaunchKernelMod(const session::KernelGraph &graph, bool mock = false);
void LaunchKernelEvent(const std::vector<std::vector<std::function<void()>>> &run_events, size_t index) const;
void DebugStreamSync(const CNodePtr &kernel);

View File

@ -1,6 +1,10 @@
file(GLOB_RECURSE HARDWARE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
"device_context_manager.cc")
if(ENABLE_D)
file(GLOB_RECURSE HARDWARE_D_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "ascend/*.cc")
endif()
if(ENABLE_GPU)
file(GLOB_RECURSE HARDWARE_GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "gpu/*.cc")
endif()

View File

@ -0,0 +1,346 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "runtime/hardware/ascend/ascend_device_context.h"
#include <algorithm>
#include <set>
#include "backend/optimizer/ascend/ascend_backend_optimization.h"
#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
#include "backend/session/ascend_auto_monad.h"
#include "utils/context/graph_kernel_flags.h"
#include "runtime/device/ascend/kernel_select_ascend.h"
#include "runtime/device/kernel_adjust.h"
#include "runtime/device/ascend/ascend_stream_assign.h"
#include "runtime/device/ascend/kernel_build_ascend.h"
#include "runtime/hardware/ascend/ascend_graph_optimization.h"
#ifndef ENABLE_SECURITY
#include "debug/data_dump/dump_json_parser.h"
#include "toolchain/adx_datadump_server.h"
#include "debug/anf_ir_dump.h"
#include "debug/dump_proto.h"
#include "debug/data_dump/e2e_dump.h"
#endif
namespace mindspore {
namespace device {
namespace ascend {
using KernelGraph = mindspore::session::KernelGraph;
#ifndef ENABLE_SECURITY
void DumpInit(uint32_t device_id) {
auto &json_parser = DumpJsonParser::GetInstance();
json_parser.Parse();
json_parser.CopyDumpJsonToDir(device_id);
json_parser.CopyHcclJsonToDir(device_id);
json_parser.CopyMSCfgJsonToDir(device_id);
if (json_parser.async_dump_enabled()) {
if (AdxDataDumpServerInit() != 0) {
MS_LOG(EXCEPTION) << "Adx data dump server init failed";
}
}
}
void DumpSetup(const KernelGraphPtr &graph) {
MS_LOG(DEBUG) << "Start!";
MS_EXCEPTION_IF_NULL(graph);
E2eDump::DumpSetup(graph.get());
MS_LOG(DEBUG) << "Finish!";
}
void Dump(const KernelGraphPtr &graph, uint32_t rank_id) {
MS_LOG(DEBUG) << "Start!";
MS_EXCEPTION_IF_NULL(graph);
E2eDump::DumpData(graph.get(), rank_id);
MS_LOG(DEBUG) << "Finish!";
}
#endif
void AscendDeviceContext::Initialize() {
MS_LOG(INFO) << "Status record: Enter Initialize...";
if (initialized_) {
MS_EXCEPTION_IF_NULL(runtime_instance_);
runtime_instance_->SetCurrentContext();
return;
}
MS_LOG(INFO) << "Status record: Initialize start...";
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
runtime_instance_ = dynamic_cast<AscendKernelRuntime *>(
device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id));
MS_EXCEPTION_IF_NULL(runtime_instance_);
if (!runtime_instance_->Init()) {
MS_LOG(EXCEPTION) << "Kernel runtime init error.";
}
mem_manager_ = runtime_instance_->GetMemoryManager();
MS_EXCEPTION_IF_NULL(mem_manager_);
auto env_rank_id = common::GetEnv("RANK_ID");
if (ms_context->get_param<bool>(MS_CTX_ENABLE_HCCL) && !env_rank_id.empty()) {
// get actual rank id if it's distribution training case.
rank_id_ = GetRankId();
}
#ifndef ENABLE_SECURITY
DumpInit(rank_id_);
#endif
initialized_ = true;
MS_LOG(INFO) << "Status record: Initialize success.";
}
void AscendDeviceContext::Destroy() {
MS_LOG(INFO) << "Status record: Enter Destroy...";
if (!initialized_) {
return;
}
MS_LOG(INFO) << "Status record: Destroy start...";
rank_id_ = 0;
if (runtime_instance_ != nullptr) {
runtime_instance_->ReleaseDeviceRes();
runtime_instance_ = nullptr;
}
initialized_ = false;
MS_LOG(INFO) << "Status record: Destroy success.";
}
std::vector<GraphSegmentPtr> AscendDeviceContext::PartitionGraph(
const FuncGraphPtr &func_graph, const std::vector<GraphSegmentPtr> &default_partition_segments) {
return std::vector<GraphSegmentPtr>();
}
void AscendDeviceContext::UnifyMindIR(const KernelGraphPtr &graph) const {
MS_EXCEPTION_IF_NULL(graph);
AscendGraphOptimization::GetInstance().UnifyMindIR(graph);
}
void AscendDeviceContext::OptimizeGraph(const KernelGraphPtr &graph) const {
MS_EXCEPTION_IF_NULL(graph);
AscendGraphOptimization::GetInstance().OptimizeGraph(graph);
}
void AscendDeviceContext::SetOperatorInfo(const std::vector<CNodePtr> &nodes) const {
AscendGraphOptimization::GetInstance().SetOperatorInfo(nodes);
}
void AscendDeviceContext::CreateKernel(const std::vector<CNodePtr> &nodes) const {
MS_LOG(INFO) << "CreateKernel Start...";
struct timeval start_time, end_time;
(void)gettimeofday(&start_time, nullptr);
auto ret = device::ascend::KernelBuild(nodes);
if (!ret) {
MS_LOG(EXCEPTION) << "Kernel build error.";
}
(void)gettimeofday(&end_time, nullptr);
const uint64_t kUSecondInSecond = 1000000;
uint64_t cost = kUSecondInSecond * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec);
cost += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec);
MS_LOG(INFO) << "CreateKernel finish run in " << PRIu64 << " us " << cost;
}
void AscendDeviceContext::UpdateExecOrder(const KernelGraphPtr &graph) const {
MS_EXCEPTION_IF_NULL(graph);
std::vector<CNodePtr> new_orders;
auto nodes = graph->execution_order();
for (const auto &node : nodes) {
if (node_atomics_.find(node) != node_atomics_.end()) {
auto atomics = node_atomics_[node];
(void)std::copy(atomics.begin(), atomics.end(), std::back_inserter(new_orders));
}
new_orders.push_back(node);
}
graph->set_execution_order(new_orders);
node_atomics_.clear();
}
void AscendDeviceContext::PreprocessBeforeRunGraph(const KernelGraphPtr &graph) const {
MS_EXCEPTION_IF_NULL(graph);
MS_LOG(INFO) << "PreprocessBeforeRunGraph Start for graph " << graph->graph_id();
device::ascend::InsertAtomicCleanOps(graph->execution_order(), &node_atomics_);
if (graph->is_executing_sink()) {
UpdateExecOrder(graph);
device::KernelAdjust::GetInstance().InsertDeviceLoopCtrl(graph);
device::KernelAdjust::GetInstance().ProcessLoopSink(graph);
AscendStreamAssign::GetInstance().AssignStream(NOT_NULL(graph));
CreateKernel(graph->execution_order());
AllocateGraphMemory(NOT_NULL(graph));
LoadModel(NOT_NULL(graph));
MS_LOG(INFO) << "PreprocessBeforeRunGraph success.";
return;
}
MS_LOG(INFO) << "PreprocessBeforeRunGraph success.";
}
void AscendDeviceContext::AllocateGraphMemory(const NotNull<KernelGraphPtr> &root_graph) const {
MS_EXCEPTION_IF_NULL(runtime_instance_);
runtime_instance_->ClearGlobalIdleMem();
memo_.clear();
AssignInputMemory(root_graph, NOT_NULL(&memo_));
device::KernelAdjust::GetInstance().AssignLoopCtrlMemory(*root_graph.get());
runtime_instance_->AssignStaticMemoryOutput(*root_graph.get());
mem_manager_->ResetDynamicMemory();
runtime_instance_->AssignDynamicMemory(*root_graph.get());
runtime_instance_->UpdateRefNodeOutputMem(*root_graph.get());
}
void AscendDeviceContext::AssignInputMemory(const NotNull<KernelGraphPtr> &graph,
NotNull<std::set<KernelGraphPtr> *> const memo) const {
if (memo->find(graph) != memo->end()) {
return;
}
memo->insert(graph.get());
MS_LOG(INFO) << "Start to assign static memory for Parameter and Value node in graph: " << graph->graph_id();
runtime_instance_->AssignStaticMemoryInput(*graph.get());
runtime_instance_->AssignStaticMemoryValueNode(*graph.get());
for (auto &child_graph : graph->child_graph_order()) {
AssignInputMemory(NOT_NULL(child_graph.lock()), memo);
}
MS_LOG(INFO) << "Finish assigning static memory for Parameter and Value node in graph: " << graph->graph_id();
}
void AscendDeviceContext::LoadModel(const NotNull<KernelGraphPtr> &root_graph) const {
MS_LOG(INFO) << "Start LoadModel for graph " << root_graph->graph_id();
MS_EXCEPTION_IF_NULL(runtime_instance_);
bool ret_ok = runtime_instance_->Load(*root_graph.get(), true);
if (!ret_ok) {
MS_LOG(EXCEPTION) << "Load task error!";
}
MS_LOG(INFO) << "Finish!";
}
bool AscendDeviceContext::AllocateMemory(DeviceAddress *const &address, size_t size) const {
MS_EXCEPTION_IF_NULL(address);
MS_EXCEPTION_IF_NULL(runtime_instance_);
runtime_instance_->SetCurrentContext();
auto device_ptr = mem_manager_->MallocMemFromMemPool(size);
if (!device_ptr) {
return false;
}
address->ptr_ = device_ptr;
address->size_ = size;
address->from_mem_pool_ = true;
return true;
}
void AscendDeviceContext::FreeMemory(DeviceAddress *const &address) const {
MS_EXCEPTION_IF_NULL(address);
MS_EXCEPTION_IF_NULL(address->ptr_);
if (!address->from_mem_pool()) {
return;
}
mem_manager_->FreeMemFromMemPool(address->ptr_);
address->ptr_ = nullptr;
}
bool AscendDeviceContext::AllocateContinuousMemory(const std::vector<DeviceAddressPtr> &addr_list, size_t total_size,
const std::vector<size_t> &size_list) const {
MS_EXCEPTION_IF_NULL(runtime_instance_);
runtime_instance_->SetCurrentContext();
return mem_manager_->MallocContinuousMemFromMemPool(addr_list, total_size, size_list);
}
bool AscendDeviceContext::ExecuteGraph(const KernelGraphPtr &graph) const {
MS_EXCEPTION_IF_NULL(graph);
const uint64_t kUSecondInSecond = 1000000;
bool ret = false;
if (graph->is_executing_sink()) {
#if defined(_WIN32) || defined(_WIN64)
auto start_time = std::chrono::steady_clock::now();
#else
struct timeval start_time {};
struct timeval end_time {};
(void)gettimeofday(&start_time, nullptr);
#endif
MS_EXCEPTION_IF_NULL(runtime_instance_);
#ifndef ENABLE_SECURITY
DumpSetup(graph);
#endif
{
std::lock_guard<std::mutex> locker(launch_mutex_);
ret = runtime_instance_->RunTask(*graph);
}
#ifndef ENABLE_SECURITY
Dump(graph, GetRankID());
#endif
#if defined(_WIN32) || defined(_WIN64)
auto end_time = std::chrono::steady_clock::now();
std::chrono::duration<double, std::ratio<1, kUSecondInSecond>> cost = end_time - start_time;
MS_LOG(INFO) << "Call MS Run Success in " << cost.count() << " us";
#else
(void)gettimeofday(&end_time, nullptr);
uint64_t cost = kUSecondInSecond * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec);
cost += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec);
MS_LOG(INFO) << "Call MS Run Success in " << cost << " us";
#endif
} else {
MS_LOG(EXCEPTION) << graph->ToString() << " does not sink, should launch kernels";
}
return ret;
}
bool AscendDeviceContext::LaunchGraph(const KernelGraphPtr &graph) const {
MS_LOG(INFO) << "Status record: start launch graph. graph id: " << graph->graph_id();
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(runtime_instance_);
runtime_instance_->SetCurrentContext();
device::KernelAdjust::GetInstance().LoadDeviceLoopCtrlParameters(graph);
auto ret = ExecuteGraph(graph);
MS_LOG(INFO) << "Status record: end launch graph. graph id: " << graph->graph_id();
return ret;
}
bool AscendDeviceContext::SyncStream(size_t stream_id) const {
MS_EXCEPTION_IF_NULL(runtime_instance_);
return runtime_instance_->SyncStream();
}
bool AscendDeviceContext::IsExecutingSink(const KernelGraphPtr &graph) const { return true; }
bool AscendDeviceContext::IsLoopCountSink(const KernelGraphPtr &graph) const { return true; }
// kernel by kernel mode interface
void AscendDeviceContext::OptimizeSingleOpGraph(const KernelGraphPtr &graph) const {
MS_LOG(ERROR) << "!!! Ascend with MindRT not support kernel by kernel mode. !!! ";
}
void AscendDeviceContext::PreprocessBeforeRunSingleOpGraph(const KernelGraphPtr &graph) const {
MS_LOG(ERROR) << "!!! Ascend with MindRT not support kernel by kernel mode. !!! ";
}
void AscendDeviceContext::UpdateDynamicShape(const CNodePtr &kernel) const {
MS_LOG(ERROR) << "!!! Ascend with MindRT not support function UpdateDynamicShape. !!! ";
}
std::shared_ptr<Bucket> AscendDeviceContext::CreateBucket(uint32_t bucket_id, uint32_t bucket_size) const {
MS_LOG(ERROR) << "!!! Ascend with MindRT not support function CreateBucket. !!! ";
return DeviceContext::CreateBucket(bucket_id, bucket_size);
}
bool AscendDeviceContext::LaunchKernel(const CNodePtr &kernel, const vector<AddressPtr> &inputs,
const vector<AddressPtr> &workspace, const vector<AddressPtr> &outputs,
bool is_dynamic_shape) const {
MS_LOG(ERROR) << "!!! Ascend with MindRT not support kernel by kernel mode. !!! ";
return true;
}
bool AscendDeviceContext::BindDeviceToCurrentThread() const {
runtime_instance_->SetCurrentContext();
return true;
}
MS_REGISTER_DEVICE(kAscendDevice, AscendDeviceContext);
} // namespace ascend
} // namespace device
} // namespace mindspore
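Taken together, the graph-sink flow this new device context supports is roughly the following; the RunGraphSink driver is hypothetical and omits the actor-runtime glue:

// Hypothetical driver showing the order in which the overrides above are used.
void RunGraphSink(device::ascend::AscendDeviceContext *context, const KernelGraphPtr &graph) {
  context->Initialize();                     // binds AscendKernelRuntime and the memory manager
  context->UnifyMindIR(graph);               // common + Ascend MindIR unification
  context->OptimizeGraph(graph);             // delegated to AscendGraphOptimization, includes kernel selection
  context->PreprocessBeforeRunGraph(graph);  // atomic ops, loop ctrl, streams, kernel build, memory, LoadModel
  if (!context->LaunchGraph(graph)) {        // task-sink execution via AscendKernelRuntime::RunTask
    MS_LOG(EXCEPTION) << "Launch graph failed. graph id: " << graph->graph_id();
  }
  context->Destroy();
}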

View File

@ -0,0 +1,158 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_ASCEND_DEVICE_CONTEXT_H_
#define MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_ASCEND_DEVICE_CONTEXT_H_
#include <vector>
#include <memory>
#include <string>
#include <set>
#include <map>
#include "runtime/hardware/device_context.h"
#include "runtime/hardware/device_context_manager.h"
#include "runtime/device/memory_manager.h"
#include "runtime/device/ascend/ascend_kernel_runtime.h"
#include "runtime/device/ascend/ascend_device_address.h"
namespace mindspore {
namespace device {
namespace ascend {
class AscendDeviceContext : public DeviceContext {
public:
explicit AscendDeviceContext(const DeviceContextKey &device_context_key)
: DeviceContext(device_context_key), mem_manager_(nullptr), initialized_(false) {}
~AscendDeviceContext() override = default;
// Initialize the device context.
void Initialize() override;
// Destroy device context and release device resource.
void Destroy() override;
// Get rank id for distributed training.
uint32_t GetRankID() const override { return rank_id_; }
// Partition the function graph through the device capability and return the partition segments.
// The second parameter is the default partition segments which are provided by the framework.
// Device can reprocess the default partition segments to new segments, also can partition the function graph again.
// If Device can launch the whole graph and not expect partitioning the function graph, then return the empty
// segments. The default behavior is return the default partition segments.
std::vector<GraphSegmentPtr> PartitionGraph(const FuncGraphPtr &func_graph,
const std::vector<GraphSegmentPtr> &default_partition_segments) override;
// Optimize the kernel graph for graph mode.
void OptimizeGraph(const KernelGraphPtr &graph) const override;
// Optimize the single operator graph for PyNative mode.
void OptimizeSingleOpGraph(const KernelGraphPtr &graph) const override;
// Select the matching backend kernels according to the data type and format of input and output for all
// execution operators, and set final device data type and format information for backend kernels, device
// data type and format which replace original data type and format will use for executing kernels.
void SetOperatorInfo(const std::vector<CNodePtr> &nodes) const override;
// Generate 'KernelMod' for all kernels and set 'KernelMod' into kernel,
// 'KernelMod' is real executive object of kernel.
void CreateKernel(const std::vector<CNodePtr> &nodes) const override;
// Adjust kernel graph before run graph, used in Graph Mode.
void PreprocessBeforeRunGraph(const KernelGraphPtr &graph) const override;
// Adjust single op kernel graph before run graph, used in PyNative Mode.
void PreprocessBeforeRunSingleOpGraph(const KernelGraphPtr &graph) const override;
// Infer kernel shape and update abstract info for dynamic shape kernel.
void UpdateDynamicShape(const CNodePtr &kernel) const override;
// Relevant function to allocate and free device memory.
bool AllocateMemory(DeviceAddress *const &address, size_t size) const override;
void FreeMemory(DeviceAddress *const &address) const override;
// Allocate continuous device memory end to end into 'addr_list'.
// Communication operators may need continuous memory for input and output
// to optimize the communication performance.
bool AllocateContinuousMemory(const std::vector<DeviceAddressPtr> &addr_list, size_t total_size,
const std::vector<size_t> &size_list) const override;
// Create concrete device address according different device type.
DeviceAddressPtr CreateDeviceAddress(void *const device_ptr, size_t device_size, const string &format,
TypeId type_id) const override {
return std::make_shared<AscendDeviceAddress>(device_ptr, device_size, format, type_id,
device_context_key_.device_name_, device_context_key_.device_id_);
}
// Get device address type according different device type, such GPU, Ascend.
DeviceAddressType GetDeviceAddressType() const override { return DeviceAddressType::kAscend; }
// Launch graph, device such as Ascend support the whole graph sink to the device executing.
bool LaunchGraph(const KernelGraphPtr &graph) const override;
// Launch a kernel via 'KernelMod' of the kernel.
bool LaunchKernel(const CNodePtr &kernel, const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace, const std::vector<AddressPtr> &outputs,
bool is_dynamic_shape = false) const override;
// Synchronize stream, device such as GPU and Ascend need stream to launch kernel asynchronously,
// using 'SyncStream' to block thread and wait for completing all tasks in stream.
// Devices that do not need stream could ignore the implementation of this function.
bool SyncStream(size_t stream_id = 0) const override;
// Create and initialize bucket for every allreduce operator. Bucket is used in PyNative distributed training mode,
// one bucket handles all resource to launch and sync allreduce operator.
std::shared_ptr<Bucket> CreateBucket(uint32_t bucket_id, uint32_t bucket_size) const override;
// Unify the MindIR, the default behavior uses the common unified MindIR.
void UnifyMindIR(const KernelGraphPtr &graph) const override;
// Whether the graph is executed through device graph sink; the default behavior is not to sink and to return false.
bool IsExecutingSink(const KernelGraphPtr &graph) const override;
// Whether the graph is executed with loop sink through the device capability; the default behavior is not to
// loop sink and to return false.
bool IsLoopCountSink(const KernelGraphPtr &graph) const override;
// set rt_context_ to this thread to control device
bool BindDeviceToCurrentThread() const;
private:
// Graph loader interface
void AllocateGraphMemory(const NotNull<KernelGraphPtr> &root_graph) const;
void AssignInputMemory(const NotNull<KernelGraphPtr> &graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
void LoadModel(const NotNull<KernelGraphPtr> &root_graph) const;
void UpdateExecOrder(const KernelGraphPtr &graph) const;
// Kernel Runtime --- only for task sink
AscendKernelRuntime *runtime_instance_{nullptr};
std::shared_ptr<MemoryManager> mem_manager_{nullptr};
// rank id of physical device
uint32_t rank_id_{0};
bool initialized_{false};
// LaunchGraph interface
bool ExecuteGraph(const KernelGraphPtr &graph) const;
// ExecuteGraph itself is not thread-safe; multiple threads should not call it on the same graph at the same time,
// so a launch mutex is needed when multiple threads launch graphs.
mutable std::mutex launch_mutex_;
// Graphs that have already been visited during the recursive traversal by graph id.
// Note: please clear the set before each use.
mutable std::set<KernelGraphPtr> memo_;
// Maps a node to its atomic clean ops.
mutable std::map<CNodePtr, std::vector<CNodePtr>> node_atomics_;
};
} // namespace ascend
} // namespace device
} // namespace mindspore
#endif // MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_ASCEND_DEVICE_CONTEXT_H_
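Given the MS_REGISTER_DEVICE(kAscendDevice, AscendDeviceContext) registration in the source file above, other components reach this context through the device context manager, the same lookup AscendDeviceAddress::BindDevice uses earlier in this commit. A minimal sketch; the BindAscendThread wrapper is hypothetical:

// Hypothetical wrapper: fetch the registered Ascend context and bind the calling thread to its device.
void BindAscendThread(uint32_t device_id) {
  auto device_context =
      device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({kAscendDevice, device_id});
  auto ascend_context = dynamic_cast<device::ascend::AscendDeviceContext *>(device_context);
  MS_EXCEPTION_IF_NULL(ascend_context);
  if (!ascend_context->BindDeviceToCurrentThread()) {
    MS_LOG(EXCEPTION) << "BindDeviceToCurrentThread failed.";
  }
}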

View File

@ -0,0 +1,288 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "runtime/hardware/ascend/ascend_graph_optimization.h"
#include <set>
#include "backend/optimizer/common/common_backend_optimization.h"
#include "backend/optimizer/ascend/ascend_backend_optimization.h"
#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
#include "backend/session/ascend_auto_monad.h"
#include "utils/context/graph_kernel_flags.h"
#include "runtime/device/ascend/kernel_select_ascend.h"
#include "runtime/device/kernel_adjust.h"
#ifndef ENABLE_SECURITY
#include "debug/anf_ir_dump.h"
#include "debug/dump_proto.h"
#endif
namespace mindspore {
namespace device {
namespace ascend {
using AscendAutoMonad = mindspore::session::AscendAutoMonad;
void AscendGraphOptimization::OptimizeGraph(const KernelGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(graph);
MS_LOG(INFO) << "Status record: start optimize graph. graph id: " << graph->graph_id();
// An empty graph does not enter the backend.
if (graph->execution_order().empty()) {
MS_LOG(INFO) << graph->ToString() << " is empty graph.";
AnfAlgo::InsertMakeTupleForOutput(NOT_NULL(graph));
graph->set_executable(false);
MS_LOG(INFO) << "Status record: end optimize graph. graph id: " << graph->graph_id();
}
OptimizeGraphWithoutDeviceInfo(graph);
SelectKernel(graph);
OptimizeGraphWithDeviceInfo(graph);
OptimizeExecutionOrder(graph);
PostOptimization(graph);
MS_LOG(INFO) << "Status record: end optimize graph. graph id: " << graph->graph_id();
}
void AscendGraphOptimization::OptimizeGraphWithoutDeviceInfo(const KernelGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(graph);
HandleControlFlow(NOT_NULL(graph));
// Add all graphs to the manager first, so that new managers do not have to be created in later passes.
auto manager = Manage(graph, true);
memo_.clear();
AddGraphToManager(NOT_NULL(graph), NOT_NULL(manager));
memo_.clear();
IRFusionOptimization(graph);
}
void AscendGraphOptimization::OptimizeGraphWithDeviceInfo(const KernelGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(graph);
memo_.clear();
HardWareOptimization(graph);
AnfAlgo::InsertMakeTupleForOutput(NOT_NULL(graph));
}
void AscendGraphOptimization::OptimizeExecutionOrder(const KernelGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(graph);
MS_LOG(INFO) << "Status record: start optimize execution order. graph id: " << graph->graph_id();
// Validate the root graph, including generating the execution order and so on.
RootGraphExecutorValidate(NOT_NULL(graph));
#ifdef ENABLE_DUMP_IR
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
if (save_graphs) {
DumpIRProto(graph, "before_removeNop_" + std::to_string(graph->graph_id()));
}
#endif
opt::HideNopNode(graph.get());
auto execution_order = graph->execution_order();
AnfAlgo::ReorderExecList(NOT_NULL(&execution_order));
graph->set_execution_order(execution_order);
#ifndef ENABLE_SECURITY
// insert profiling point
device::KernelAdjust::GetInstance().Profiling(NOT_NULL(graph.get()));
#endif
device::KernelAdjust::GetInstance().InsertOverflowCheckOperations(NOT_NULL(graph));
#ifdef ENABLE_DUMP_IR
if (save_graphs) {
DumpIR("after_adjust_kernel.ir", graph);
}
#endif
MS_LOG(INFO) << "Status record: end optimize execution order. graph id: " << graph->graph_id();
}
void AscendGraphOptimization::PostOptimization(const KernelGraphPtr &graph) {
MS_LOG(INFO) << "Status record: start post optimization. graph id: " << graph->graph_id();
// copy child graph ref output map to father graph ref output map
memo_.clear();
UpdateRefOutputMap(graph);
graph->SetInputNodes();
graph->SetOptimizerFlag();
MS_LOG(INFO) << "Status record: end post optimization. graph id: " << graph->graph_id();
}
void AscendGraphOptimization::HardWareOptimization(const KernelGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(graph);
MS_LOG(INFO) << "Status record: start hardware optimize. graph id: " << graph->graph_id();
if (memo_.find(graph) != memo_.end()) {
return;
}
memo_.insert(graph);
opt::AscendBackendOptimization(graph);
opt::CommonFinalOptimization(graph);
if (graphkernel::GraphKernelFlags::GetInstance().IsEnableGraphKernel()) {
graphkernel::GraphKernelOptimize(graph);
graph->SetExecOrderByDefault();
}
MS_LOG(INFO) << "Status record: end hardware optimize. graph id: " << graph->graph_id();
for (auto &child_graph : graph->child_graph_order()) {
HardWareOptimization(child_graph.lock());
}
}
void AscendGraphOptimization::AddGraphToManager(const NotNull<KernelGraphPtr> graph,
NotNull<FuncGraphManagerPtr> manager) {
if (memo_.find(graph) != memo_.end()) {
return;
}
memo_.insert(graph.get());
manager->AddFuncGraph(graph.get(), false);
for (auto &child_graph : graph->child_graph_order()) {
AddGraphToManager(NOT_NULL(child_graph.lock()), manager);
}
}
void AscendGraphOptimization::IRFusionOptimization(const KernelGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(graph);
if (memo_.find(graph) != memo_.end()) {
return;
}
memo_.insert(graph);
opt::AscendBackendIRFusionOptimization(graph);
#ifdef ENABLE_DUMP_IR
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
if (save_graphs) {
std::string file_name = "select_kernel_before_graph_" + std::to_string(graph->graph_id()) + ".ir";
DumpIR(file_name, graph);
}
#endif
for (auto &child_graph : graph->child_graph_order()) {
IRFusionOptimization(NOT_NULL(child_graph.lock()));
}
}
void AscendGraphOptimization::HandleControlFlow(const NotNull<KernelGraphPtr> graph) {
MS_LOG(INFO) << "Status record: start handle control flow. graph id: " << graph->graph_id();
AscendAutoMonad auto_monad(graph);
auto_monad.Run();
MS_LOG(INFO) << "Status record: end handle control flow. graph id: " << graph->graph_id();
}
void AscendGraphOptimization::RootGraphExecutorValidate(NotNull<KernelGraphPtr> graph) {
MS_LOG(INFO) << "Status record: start graph executor validate. graph id: " << graph->graph_id();
AscendAutoMonad auto_monad(graph);
auto_monad.GenerateExecuteOrder();
MS_LOG(INFO) << "Status record: end graph executor validate. graph id: " << graph->graph_id();
}
void AscendGraphOptimization::RecurseSelectKernelInfo(const KernelGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(graph);
if (memo_.find(graph) != memo_.end()) {
return;
}
memo_.insert(graph);
MS_LOG(INFO) << "Status record: start select kernel info. graph id: " << graph->graph_id();
SetOperatorInfo(graph->execution_order());
MS_LOG(INFO) << "Status record: end select kernel info. graph id: " << graph->graph_id();
#ifdef ENABLE_DUMP_IR
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
if (save_graphs) {
std::string file_name = "select_kernel_after_graph_" + std::to_string(graph->graph_id()) + ".ir";
DumpIR(file_name, graph);
}
#endif
for (auto &child_graph : graph->child_graph_order()) {
RecurseSelectKernelInfo(child_graph.lock());
}
}
void AscendGraphOptimization::SelectKernel(const KernelGraphPtr &graph) {
MS_LOG(INFO) << "Status record: start select kernel info. graph id: " << graph->graph_id();
raise_precision_count_ = 0;
reduce_precision_count_ = 0;
memo_.clear();
RecurseSelectKernelInfo(graph);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kGraphMode) {
if (raise_precision_count_ > 0) {
MS_LOG(WARNING) << "There are " << raise_precision_count_
<< " node/nodes used raise precision to selected the kernel!";
}
if (reduce_precision_count_ > 0) {
MS_LOG(WARNING) << "There are " << reduce_precision_count_
<< " node/nodes used reduce precision to selected the kernel!";
}
}
MS_LOG(INFO) << "Status record: end select kernel info. graph id: " << graph->graph_id();
}
void AscendGraphOptimization::UpdateRefOutputMap(const KernelGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(graph);
if (memo_.find(graph) != memo_.end()) {
return;
}
memo_.insert(graph);
for (auto &child_graph : graph->child_graph_order()) {
auto child_graph_ptr = child_graph.lock();
MS_EXCEPTION_IF_NULL(child_graph_ptr);
UpdateRefOutputMap(NOT_NULL(child_graph_ptr));
// copy ref map to final graph
auto child_ref_map = child_graph_ptr->GetRefMap();
for (auto &item : child_ref_map) {
if (graph->IsInRefOutputMap(item.first)) {
MS_LOG(WARNING) << "The ref pair <" << item.first.first->DebugString() << ", " << item.first.second
<< "> is already in " << graph->ToString();
continue;
}
graph->AddRefCorrespondPairs(item.first, item.second);
}
}
}
void AscendGraphOptimization::UnifyMindIR(const KernelGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(graph);
MS_LOG(INFO) << "Status record: start unify mindir. graph id: " << graph->graph_id();
opt::CommonUnifyMindIR(graph);
opt::AscendUnifyMindIR(graph);
MS_LOG(INFO) << "Status record: end unify mindir. graph id: " << graph->graph_id();
}
void AscendGraphOptimization::SetOperatorInfo(const std::vector<CNodePtr> &nodes) {
for (const auto &node : nodes) {
auto status = device::ascend::SelectKernelInfo(node);
AnfAlgo::EraseNodeAttr(kAttrPynativeNextOpName, node);
AnfAlgo::EraseNodeAttr(kAttrPynativeNextIndex, node);
if (status == device::ascend::kStatusRaisePrecision) {
raise_precision_count_++;
} else if (status == device::ascend::kStatusReducePrecision) {
reduce_precision_count_++;
}
MS_LOG(DEBUG) << "Select ApplyKernel: " << node->DebugString();
}
}
} // namespace ascend
} // namespace device
} // namespace mindspore

View File

@ -0,0 +1,77 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_MINDSPORE_ASCEND_GRAPH_OPTIMIZATION_H
#define MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_MINDSPORE_ASCEND_GRAPH_OPTIMIZATION_H
#include <vector>
#include <memory>
#include <string>
#include <set>
#include <map>
#include "runtime/hardware/device_context.h"
#include "runtime/hardware/device_context_manager.h"
#include "runtime/device/memory_manager.h"
#include "runtime/device/ascend/ascend_kernel_runtime.h"
#include "runtime/device/ascend/ascend_device_address.h"
namespace mindspore {
namespace device {
namespace ascend {
class AscendGraphOptimization {
public:
static AscendGraphOptimization &GetInstance() {
static AscendGraphOptimization instance;
return instance;
}
AscendGraphOptimization() = default;
~AscendGraphOptimization() = default;
AscendGraphOptimization(const AscendGraphOptimization &) = delete;
AscendGraphOptimization &operator=(const AscendGraphOptimization &) = delete;
void OptimizeGraph(const KernelGraphPtr &graph);
void SetOperatorInfo(const std::vector<CNodePtr> &nodes);
void UnifyMindIR(const KernelGraphPtr &graph);
private:
// Graph Optimized level-2 interface
void OptimizeGraphWithoutDeviceInfo(const KernelGraphPtr &graph);
void OptimizeGraphWithDeviceInfo(const KernelGraphPtr &graph);
void OptimizeExecutionOrder(const KernelGraphPtr &graph);
void PostOptimization(const KernelGraphPtr &graph);
// Graph Optimized level-3 interface
void IRFusionOptimization(const KernelGraphPtr &graph);
void UpdateRefOutputMap(const KernelGraphPtr &graph);
void AddGraphToManager(const NotNull<KernelGraphPtr> graph, NotNull<FuncGraphManagerPtr> manager);
void SelectKernel(const KernelGraphPtr &graph);
void RecurseSelectKernelInfo(const KernelGraphPtr &graph);
void HardWareOptimization(const KernelGraphPtr &graph);
void HandleControlFlow(const NotNull<KernelGraphPtr> graph);
void RootGraphExecutorValidate(NotNull<KernelGraphPtr> graph);
// Number of operators whose precision changes after select kernel
size_t raise_precision_count_{0};
size_t reduce_precision_count_{0};
// Graphs that have already been visited during the recursive traversal by graph id.
// Note: please clear the set before each use.
std::set<KernelGraphPtr> memo_;
};
} // namespace ascend
} // namespace device
} // namespace mindspore
#endif // MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_MINDSPORE_ASCEND_GRAPH_OPTIMIZATION_H
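As used from AscendDeviceContext earlier in this commit, the optimizer is a process-wide singleton; a minimal usage sketch with a hypothetical OptimizeForAscend wrapper:

// Mirrors the calls made from AscendDeviceContext::UnifyMindIR and AscendDeviceContext::OptimizeGraph.
void OptimizeForAscend(const KernelGraphPtr &graph) {
  auto &graph_opt = device::ascend::AscendGraphOptimization::GetInstance();
  graph_opt.UnifyMindIR(graph);    // common + Ascend-specific unify-MindIR passes
  graph_opt.OptimizeGraph(graph);  // IR fusion, kernel selection, hardware optimization, execution order
}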

View File

@ -136,6 +136,8 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
"../../../mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc"
"../../../mindspore/ccsrc/runtime/device/ascend/ascend_memory_pool.cc"
"../../../mindspore/ccsrc/runtime/device/ascend/lic_manager.cc"
"../../../mindspore/ccsrc/runtime/hardware/ascend/ascend_device_context.cc"
"../../../mindspore/ccsrc/runtime/hardware/ascend/ascend_graph_optimization.cc"
"../../../mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.cc"
"../../../mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel_factory.cc"
"../../../mindspore/ccsrc/backend/kernel_compiler/cpu/sparse_apply_adam_cpu_kernel.cc"