diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_kernel_executor.cc b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_kernel_executor.cc
index ebf766f42a9..65c5a0aaf1e 100644
--- a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_kernel_executor.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_kernel_executor.cc
@@ -375,7 +375,6 @@ bool AscendKernelExecutor::LaunchKernel(const CNodePtr &kernel, const vector<AddressPtr> &inputs,
   auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
   KernelType kernel_type = AnfAlgo::GetKernelType(kernel);
   MS_EXCEPTION_IF_NULL(kernel);
-  MS_LOG(DEBUG) << "Launch kernel: " << kernel->fullname_with_scope();
   (void)res_manager_->BindDeviceToCurrentThread();
 
   std::vector<AddressPtr> real_inputs;
@@ -407,7 +406,6 @@ bool AscendKernelExecutor::LaunchKernel(const CNodePtr &kernel, const vector<AddressPtr> &inputs,
-  MS_LOG(DEBUG) << "Launch kernel: " << kernel->fullname_with_scope();
 #ifndef ENABLE_SECURITY
   auto profiler_inst = profiler::ascend::PynativeProfiler::GetInstance();
   MS_EXCEPTION_IF_NULL(profiler_inst);
@@ -415,7 +413,9 @@ bool AscendKernelExecutor::LaunchKernel(const CNodePtr &kernel, const vector<AddressPtr> &inputs,
   profiler_inst->OpDataProducerBegin(res_manager_->runtime_instance_, stream, t_id, kernel->fullname_with_scope(),
                                      is_dynamic_shape);
 #endif
+  MS_LOG(DEBUG) << "Begin launch kernel: " << kernel->fullname_with_scope();
   ret = kernel_mod->Launch(real_inputs, workspace, outputs, stream);
+  MS_LOG(DEBUG) << "End launch kernel: " << kernel->fullname_with_scope();
 #ifndef ENABLE_SECURITY
   profiler_inst->OpDataProducerEnd(t_id, is_dynamic_shape);
 #endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_device_context.cc b/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_device_context.cc
index 21656d826f6..a04f6461f98 100644
--- a/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_device_context.cc
+++ b/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_device_context.cc
@@ -340,7 +340,6 @@ bool CPUKernelExecutor::LaunchKernel(const CNodePtr &kernel, const std::vector<AddressPtr> &inputs,
                                      const std::vector<AddressPtr> &workspace, const std::vector<AddressPtr> &outputs,
                                      size_t /* stream_id */) const {
   MS_EXCEPTION_IF_NULL(kernel);
-  MS_LOG(DEBUG) << "Launch kernel: " << kernel->fullname_with_scope();
   auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
   MS_EXCEPTION_IF_NULL(kernel_mod);
@@ -355,10 +354,16 @@ bool CPUKernelExecutor::LaunchKernel(const CNodePtr &kernel, const std::vector<AddressPtr> &inputs,
   if (profiler_inst->GetEnableFlag()) {
-    return LaunchKernelWithProfiling(kernel, inputs, workspace, outputs);
+    MS_LOG(DEBUG) << "Begin launch kernel: " << kernel->fullname_with_scope();
+    auto ret = LaunchKernelWithProfiling(kernel, inputs, workspace, outputs);
+    MS_LOG(DEBUG) << "End launch kernel: " << kernel->fullname_with_scope();
+    return ret;
   }
 #endif
-  return DoLaunchKernel(kernel_mod, inputs, workspace, outputs);
+  MS_LOG(DEBUG) << "Begin launch kernel: " << kernel->fullname_with_scope();
+  auto ret = DoLaunchKernel(kernel_mod, inputs, workspace, outputs);
+  MS_LOG(DEBUG) << "End launch kernel: " << kernel->fullname_with_scope();
+  return ret;
 }
 
 bool CPUDeviceResManager::LoadCollectiveCommLib() {
diff --git a/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc
index aa7501cdc43..c88de549986 100644
--- a/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc
@@ -599,13 +599,15 @@ bool GPUKernelExecutor::LaunchKernel(const CNodePtr &kernel, const std::vector<AddressPtr> &inputs,
   if (!profiler_inst->GetEnableFlag()) {
 #endif
     auto lock = LockLaunchKernel(stream);
-    MS_LOG(DEBUG) << "Launch kernel: " << kernel->fullname_with_scope();
+    MS_LOG(DEBUG) << "Begin launch kernel: " << kernel->fullname_with_scope();
     ret = DoLaunchKernel(kernel, inputs, workspace, outputs, stream);
+    MS_LOG(DEBUG) << "End launch kernel: " << kernel->fullname_with_scope();
 #ifndef ENABLE_SECURITY
   } else {
     auto lock = LockLaunchKernel(stream);
-    MS_LOG(DEBUG) << "Launch kernel: " << kernel->fullname_with_scope();
+    MS_LOG(DEBUG) << "Begin launch kernel: " << kernel->fullname_with_scope();
     ret = LaunchKernelWithProfiling(kernel, inputs, workspace, outputs, stream);
+    MS_LOG(DEBUG) << "End launch kernel: " << kernel->fullname_with_scope();
   }
 #endif
   if (!ret) {
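
Reviewer note: all three device executors (Ascend, CPU, GPU) replace the single "Launch kernel" debug line with a Begin/End pair wrapped tightly around the actual launch call, so a device hang or crash can be attributed to the one kernel whose Begin line has no matching End. Below is a minimal standalone sketch of the same bracketing idea, using only the standard library and a hypothetical ScopedLaunchLog guard; the patch itself logs explicitly rather than via RAII.

    #include <iostream>
    #include <string>
    #include <utility>

    // Hypothetical RAII helper: emits the same Begin/End bracket on every path out of scope.
    class ScopedLaunchLog {
     public:
      explicit ScopedLaunchLog(std::string name) : name_(std::move(name)) {
        std::clog << "Begin launch kernel: " << name_ << '\n';
      }
      ~ScopedLaunchLog() { std::clog << "End launch kernel: " << name_ << '\n'; }

     private:
      std::string name_;
    };

    bool LaunchWithBracket() {
      ScopedLaunchLog bracket("Default/Add-op1");  // illustrative kernel scope name
      // ... the real code would call kernel_mod->Launch(...) here ...
      return true;
    }

    int main() { return LaunchWithBracket() ? 0 : 1; }
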
diff --git a/mindspore/ccsrc/runtime/device/common_somas_allocator.cc b/mindspore/ccsrc/runtime/device/common_somas_allocator.cc
index 0f8b62da569..bdd0dce2823 100644
--- a/mindspore/ccsrc/runtime/device/common_somas_allocator.cc
+++ b/mindspore/ccsrc/runtime/device/common_somas_allocator.cc
@@ -53,11 +53,11 @@ uint8_t *CommonSomasAllocator::GetNodeOutputPtr(const AnfNodePtr &node, size_t index) {
   MS_EXCEPTION_IF_NULL(node);
   auto kernel_info = dynamic_cast<device::KernelInfo *>(node->kernel_info());
   MS_EXCEPTION_IF_NULL(kernel_info);
-  if (index >= kernel_info->somas_output_offset_aligned_size_list().size()) {
+  if (index >= kernel_info->somas_output_result().size()) {
     MS_LOG(EXCEPTION) << "index:[" << index << "] is larger than it's output size:["
-                      << kernel_info->somas_output_offset_aligned_size_list().size() << "]";
+                      << kernel_info->somas_output_result().size() << "]";
   }
-  auto somas_offset_aligned_size = kernel_info->somas_output_offset_aligned_size_list()[index];
+  auto somas_offset_aligned_size = kernel_info->somas_output_result()[index];
   if (somas_offset_aligned_size.second == 0) {
     return nullptr;
   }
@@ -70,11 +70,11 @@ uint8_t *CommonSomasAllocator::GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) {
   MS_EXCEPTION_IF_NULL(node);
   auto kernel_info = dynamic_cast<device::KernelInfo *>(node->kernel_info());
   MS_EXCEPTION_IF_NULL(kernel_info);
-  if (index >= kernel_info->somas_workspace_offset_aligned_size_list().size()) {
+  if (index >= kernel_info->somas_workspace_result().size()) {
     MS_LOG(EXCEPTION) << "index:[" << index << "] is larger than it's output size:["
-                      << kernel_info->somas_workspace_offset_aligned_size_list().size() << "]";
+                      << kernel_info->somas_workspace_result().size() << "]";
   }
-  auto somas_offset_aligned_size = kernel_info->somas_workspace_offset_aligned_size_list()[index];
+  auto somas_offset_aligned_size = kernel_info->somas_workspace_result()[index];
   if (somas_offset_aligned_size.second == 0) {
     return nullptr;
   }
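
Reviewer note: the renamed accessors return the somas plan as (offset, aligned size) pairs, and a zero aligned size is the sentinel for "this tensor is not somas-managed", which is why both getters above return nullptr in that case. A standalone sketch of that lookup, with illustrative names only:

    #include <cstddef>
    #include <cstdint>
    #include <utility>
    #include <vector>

    // Resolve a somas record to a concrete pointer inside the somas base block.
    // A zero aligned size means the tensor was not planned by somas.
    uint8_t *ResolveSomasPtr(uint8_t *base_address, const std::vector<std::pair<size_t, size_t>> &somas_result,
                             size_t index) {
      const auto &offset_aligned_size = somas_result.at(index);  // throws on out-of-range, like the MS_LOG(EXCEPTION) path
      if (offset_aligned_size.second == 0) {
        return nullptr;
      }
      return base_address + offset_aligned_size.first;
    }
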
diff --git a/mindspore/ccsrc/runtime/device/kernel_info.cc b/mindspore/ccsrc/runtime/device/kernel_info.cc
index 0e9058984c7..a08ac6ec670 100644
--- a/mindspore/ccsrc/runtime/device/kernel_info.cc
+++ b/mindspore/ccsrc/runtime/device/kernel_info.cc
@@ -16,6 +16,7 @@
 
 #include "runtime/device/kernel_info.h"
 #include <utility>
+#include "utils/ms_context.h"
 
 namespace mindspore {
 namespace device {
@@ -116,6 +117,25 @@ bool KernelInfo::SetSomasResult(std::vector<std::pair<size_t, size_t>> &&output_somas_result,
   return true;
 }
 
+bool KernelInfo::IsTensorEnableSomas(const std::vector<std::pair<size_t, size_t>> &somas_result,
+                                     size_t tensor_index) const {
+  // Somas is currently disabled in the runtime, so return false for every memory optimize level.
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  if ((ms_context->get_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL) == kOptimizeO0) ||
+      (ms_context->get_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL) == kOptimizeO1)) {
+    return false;
+  }
+
+  if (somas_result.empty()) {
+    return false;
+  }
+  if (tensor_index >= somas_result.size()) {
+    MS_LOG(EXCEPTION) << "The tensor index:" << tensor_index << " is out of range:" << somas_result.size();
+  }
+  return (somas_result[tensor_index].second != 0);
+}
+
 void KernelInfo::set_kernel_mod(const kernel::KernelModPtr &kernel_mod) { kernel_mod_ = kernel_mod; }
 
 kernel::KernelMod *KernelInfo::MutableKernelMod() const { return kernel_mod_.get(); }
diff --git a/mindspore/ccsrc/runtime/device/kernel_info.h b/mindspore/ccsrc/runtime/device/kernel_info.h
index 9c8dbf5dc12..83c81c4a943 100644
--- a/mindspore/ccsrc/runtime/device/kernel_info.h
+++ b/mindspore/ccsrc/runtime/device/kernel_info.h
@@ -58,8 +58,6 @@ class KernelInfo : public KernelInfoDevice {
   DeviceAddressPtr GetMutableWorkspaceAddr(size_t index) const;
   bool WorkspaceAddrExist(size_t index) const;
   bool SetWorkspaceAddr(const DeviceAddressPtr &output_address, size_t index);
-  bool SetSomasResult(std::vector<std::pair<size_t, size_t>> &&output_somas_result,
-                      std::vector<std::pair<size_t, size_t>> &&workspace_somas_result);
   void set_kernel_mod(const kernel::KernelModPtr &kernel_mod);
   kernel::KernelMod *MutableKernelMod() const;
   const kernel::KernelMod *kernel_mod() const;
@@ -73,12 +71,7 @@ class KernelInfo : public KernelInfoDevice {
   uint32_t graph_id() const { return graph_id_; }
   bool operator==(const KernelInfo &other) const;
   bool is_feature_map() const { return is_feature_map_; }
-  const std::vector<std::pair<size_t, size_t>> &somas_output_offset_aligned_size_list() const {
-    return somas_output_result_;
-  }
-  const std::vector<std::pair<size_t, size_t>> &somas_workspace_offset_aligned_size_list() const {
-    return somas_workspace_result_;
-  }
+
   const std::vector<DeviceAddressPtr> &output_address_list() const { return output_address_list_; }
   const std::vector<DeviceAddressPtr> &workspace_address_list() const { return workspace_address_list_; }
@@ -87,6 +80,13 @@ class KernelInfo : public KernelInfoDevice {
   void set_ref_map(const bool &all_ref, const OutputInputRefMap &ref_map);
   const OutputInputRefMap &out_in_ref_map() const { return out_in_ref_map_; }
 
+  // The somas interface.
+  bool SetSomasResult(std::vector<std::pair<size_t, size_t>> &&output_somas_result,
+                      std::vector<std::pair<size_t, size_t>> &&workspace_somas_result);
+  bool IsTensorEnableSomas(const std::vector<std::pair<size_t, size_t>> &somas_result, size_t tensor_index) const;
+  const std::vector<std::pair<size_t, size_t>> &somas_output_result() const { return somas_output_result_; }
+  const std::vector<std::pair<size_t, size_t>> &somas_workspace_result() const { return somas_workspace_result_; }
+
  private:
   bool is_feature_map_;
   kernel::KernelBuildInfoPtr select_kernel_build_info_;
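
Reviewer note: IsTensorEnableSomas is the single predicate the rest of the patch keys on: a tensor is somas-managed only if somas is enabled globally and the plan recorded a non-zero aligned size for it. A standalone sketch of the same decision, with the MS_CTX_MEMORY_OPTIMIZE_LEVEL lookup reduced to a plain bool parameter:

    #include <cstddef>
    #include <stdexcept>
    #include <utility>
    #include <vector>

    bool IsTensorEnableSomas(const std::vector<std::pair<size_t, size_t>> &somas_result, size_t tensor_index,
                             bool somas_globally_enabled) {
      if (!somas_globally_enabled || somas_result.empty()) {
        return false;
      }
      if (tensor_index >= somas_result.size()) {
        throw std::out_of_range("tensor index out of range");
      }
      // A non-zero aligned size means the tensor is covered by the somas plan.
      return somas_result[tensor_index].second != 0;
    }
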
"data_arrow" : "control_arrow"; + ofs << "\t\tmemory_free_insert_position:" << actor->memory_free_insert_position().first + << "\tinsert_arrow_type:" << arrow_type << "\n"; + } ofs << "\n"; } diff --git a/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_actor.cc b/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_actor.cc index 18276cbe11c..cb3255d7f1a 100644 --- a/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_actor.cc +++ b/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_actor.cc @@ -27,6 +27,20 @@ namespace mindspore { namespace runtime { +namespace { +bool IsSomasEnable(const SomasInfo *somas_info) { + // Somas is currently closed in the runtime. + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + if ((ms_context->get_param(MS_CTX_MEMORY_OPTIMIZE_LEVEL) == kOptimizeO0) || + (ms_context->get_param(MS_CTX_MEMORY_OPTIMIZE_LEVEL) == kOptimizeO1)) { + return false; + } + + return ((somas_info != nullptr) && (somas_info->whole_block_size_ != 0)); +} +} // namespace + using distributed::collective::CollectiveManager; using distributed::recovery::RecoveryContext; @@ -42,46 +56,18 @@ void KernelActor::Init() { MS_EXCEPTION_IF_NULL(kernel_); real_input_num_ = common::AnfAlgo::GetInputTensorNum(kernel_); kernel_info_ = dynamic_cast(kernel_->kernel_info()); + MS_EXCEPTION_IF_NULL(kernel_info_); is_dynamic_shape_ = common::AnfAlgo::IsDynamicShape(kernel_); - - for (size_t i = 0; i < real_input_num_; ++i) { - const auto &input_device_tensor = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel_, i, false); - MS_EXCEPTION_IF_NULL(input_device_tensor); - (void)real_input_data_infos_.emplace_back( - std::make_shared(input_device_tensor->format(), input_device_tensor->host_shape(), - input_device_tensor->GetSize(), input_device_tensor->type_id())); + if (is_dynamic_shape_ && IsSomasEnable(somas_info_)) { + MS_LOG(EXCEPTION) << "Not support the somas for the dynamic shape: " << GetAID().Name(); } // Init the device tensors and kernel launch info. - copy_input_device_tensors_.resize(real_input_num_); - input_device_tensors_.resize(real_input_num_); - for (auto &input_address : input_device_tensors_) { - (void)memory_free_list_.emplace_back(input_address); - (void)launch_info_.inputs_.emplace_back(std::make_shared
diff --git a/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_actor.cc b/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_actor.cc
index 18276cbe11c..cb3255d7f1a 100644
--- a/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_actor.cc
+++ b/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_actor.cc
@@ -27,6 +27,20 @@
 namespace mindspore {
 namespace runtime {
+namespace {
+bool IsSomasEnable(const SomasInfo *somas_info) {
+  // Somas is currently disabled in the runtime, so return false for every memory optimize level.
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  if ((ms_context->get_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL) == kOptimizeO0) ||
+      (ms_context->get_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL) == kOptimizeO1)) {
+    return false;
+  }
+
+  return ((somas_info != nullptr) && (somas_info->whole_block_size_ != 0));
+}
+}  // namespace
+
 using distributed::collective::CollectiveManager;
 using distributed::recovery::RecoveryContext;
@@ -42,46 +56,18 @@ void KernelActor::Init() {
   MS_EXCEPTION_IF_NULL(kernel_);
   real_input_num_ = common::AnfAlgo::GetInputTensorNum(kernel_);
   kernel_info_ = dynamic_cast<KernelInfo *>(kernel_->kernel_info());
+  MS_EXCEPTION_IF_NULL(kernel_info_);
   is_dynamic_shape_ = common::AnfAlgo::IsDynamicShape(kernel_);
-
-  for (size_t i = 0; i < real_input_num_; ++i) {
-    const auto &input_device_tensor = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel_, i, false);
-    MS_EXCEPTION_IF_NULL(input_device_tensor);
-    (void)real_input_data_infos_.emplace_back(
-      std::make_shared<InputDataInfo>(input_device_tensor->format(), input_device_tensor->host_shape(),
-                                      input_device_tensor->GetSize(), input_device_tensor->type_id()));
+  if (is_dynamic_shape_ && IsSomasEnable(somas_info_)) {
+    MS_LOG(EXCEPTION) << "Somas does not support dynamic shape: " << GetAID().Name();
   }
 
   // Init the device tensors and kernel launch info.
-  copy_input_device_tensors_.resize(real_input_num_);
-  input_device_tensors_.resize(real_input_num_);
-  for (auto &input_address : input_device_tensors_) {
-    (void)memory_free_list_.emplace_back(input_address);
-    (void)launch_info_.inputs_.emplace_back(std::make_shared<Address>());
-  }
-  MS_EXCEPTION_IF_NULL(kernel_info_);
-  for (auto &output_address : kernel_info_->output_address_list()) {
-    MS_EXCEPTION_IF_NULL(output_address);
-    (void)output_device_tensors_.emplace_back(output_address.get());
-    (void)memory_alloc_list_.emplace_back(output_address.get());
-    (void)memory_free_list_.emplace_back(output_address.get());
-    (void)launch_info_.outputs_.emplace_back(std::make_shared<Address>());
-  }
-  for (auto &external_reference_tensor : external_reference_tensors_) {
-    (void)memory_free_list_.emplace_back(external_reference_tensor);
-  }
-  // The size of workspace may be changed in dynamic shape, so put workspace_address at the end of memory_alloc_list_
-  // and memory_free_list_, for the dynamic shape handling in FetchWorkspaceDeviceTensor.
-  for (auto &workspace_address : kernel_info_->workspace_address_list()) {
-    MS_EXCEPTION_IF_NULL(workspace_address);
-    (void)workspace_device_tensors_.emplace_back(workspace_address.get());
-    (void)memory_alloc_list_.emplace_back(workspace_address.get());
-    (void)memory_free_list_.emplace_back(workspace_address.get());
-    (void)launch_info_.workspaces_.emplace_back(std::make_shared<Address>());
-  }
+  InitInputInfo();
+  InitOutputInfo();
+  InitWorkspaceInfo();
 
   // Init the output data.
-  output_data_by_output_index_.resize(output_device_tensors_.size());
   InitOutputData();
   if (output_data_.size() != output_data_arrows_.size()) {
     MS_LOG(EXCEPTION) << "The output data size is wrong: " << GetAID().Name();
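
Reviewer note: Init() is now a thin driver over InitInputInfo/InitOutputInfo/InitWorkspaceInfo, and it rejects the somas + dynamic-shape combination up front. The call order is load-bearing: inputs must be appended to memory_free_list_ first so that slot i still corresponds to input i when FetchInputDeviceTensor later overwrites memory_free_list_[input_index]. The IsSomasEnable helper also shows that somas stays hard-disabled for now, since both defined optimize levels return false. A standalone sketch of that gate, with MindSpore types reduced to plain declarations:

    #include <cstddef>

    enum MemoryOptimizeLevel { kOptimizeO0, kOptimizeO1 };  // mirrors the two context levels checked above

    struct SomasInfo {
      size_t whole_block_size_ = 0;  // size of the fused memory block planned by somas
    };

    bool IsSomasEnable(const SomasInfo *somas_info, MemoryOptimizeLevel level) {
      if (level == kOptimizeO0 || level == kOptimizeO1) {
        return false;  // somas is currently disabled in the runtime for every level
      }
      return somas_info != nullptr && somas_info->whole_block_size_ != 0;
    }
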
@@ -95,12 +81,88 @@ void KernelActor::Init() {
       MS_LOG(EXCEPTION) << "The output index is out of range: " << GetAID().Name();
     }
     data->data_ = output_device_tensors_[IntToSize(data_arrow->from_output_index_)];
-    (void)output_data_by_output_index_[IntToSize(data_arrow->from_output_index_)].emplace_back(data);
-
     ++output_data_index;
   }
 }
 
+void KernelActor::InitInputInfo() {
+  for (size_t i = 0; i < real_input_num_; ++i) {
+    const auto &input_device_tensor = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel_, i, false);
+    MS_EXCEPTION_IF_NULL(input_device_tensor);
+    (void)real_input_data_infos_.emplace_back(
+      std::make_shared<InputDataInfo>(input_device_tensor->format(), input_device_tensor->host_shape(),
+                                      input_device_tensor->GetSize(), input_device_tensor->type_id()));
+  }
+
+  copy_input_device_tensors_.resize(real_input_num_);
+  input_device_tensors_.resize(real_input_num_);
+  for (auto &input_address : input_device_tensors_) {
+    (void)memory_free_list_.emplace_back(input_address);
+    (void)launch_info_.inputs_.emplace_back(std::make_shared<Address>());
+  }
+}
+
+void KernelActor::InitOutputInfo() {
+  MS_EXCEPTION_IF_NULL(kernel_info_);
+  const auto &output_addresses = kernel_info_->output_address_list();
+  const auto &somas_outputs = kernel_info_->somas_output_result();
+  bool output_need_somas = false;
+  for (size_t i = 0; i < output_addresses.size(); ++i) {
+    auto &output_address = output_addresses[i];
+    MS_EXCEPTION_IF_NULL(output_address);
+    (void)output_device_tensors_.emplace_back(output_address.get());
+    (void)launch_info_.outputs_.emplace_back(std::make_shared<Address>());
+
+    // An output taken over by somas does not need to allocate memory.
+    if (kernel_info_->IsTensorEnableSomas(somas_outputs, i)) {
+      MS_EXCEPTION_IF_CHECK_FAIL((somas_outputs[i].second >= output_address->GetSize()), "The somas size is wrong.");
+      UpdateRefCount(output_address.get(), true);
+      output_need_somas = true;
+    } else {
+      (void)memory_alloc_list_.emplace_back(output_address.get());
+      (void)memory_free_list_.emplace_back(output_address.get());
+    }
+  }
+
+  if (output_need_somas && (!IsSomasEnable(somas_info_))) {
+    MS_LOG(EXCEPTION) << "Somas is not enabled for: " << GetAID().Name();
+  }
+
+  for (auto &external_reference_tensor : external_reference_tensors_) {
+    (void)memory_free_list_.emplace_back(external_reference_tensor);
+  }
+}
+
+void KernelActor::InitWorkspaceInfo() {
+  MS_EXCEPTION_IF_NULL(kernel_info_);
+  // The size of workspace may be changed in dynamic shape, so put the workspace addresses at the end of
+  // memory_alloc_list_ and memory_free_list_ for the dynamic shape handling in FetchWorkspaceDeviceTensor.
+  const auto &workspace_addresses = kernel_info_->workspace_address_list();
+  const auto &somas_workspace = kernel_info_->somas_workspace_result();
+  bool workspace_need_somas = false;
+  for (size_t i = 0; i < workspace_addresses.size(); ++i) {
+    auto &workspace_address = workspace_addresses[i];
+    MS_EXCEPTION_IF_NULL(workspace_address);
+    (void)workspace_device_tensors_.emplace_back(workspace_address.get());
+    (void)launch_info_.workspaces_.emplace_back(std::make_shared<Address>());
+
+    // A workspace taken over by somas does not need to allocate memory.
+    if (kernel_info_->IsTensorEnableSomas(somas_workspace, i)) {
+      MS_EXCEPTION_IF_CHECK_FAIL((somas_workspace[i].second >= workspace_address->GetSize()),
+                                 "The somas size is wrong.");
+      UpdateRefCount(workspace_address.get(), true);
+      workspace_need_somas = true;
+    } else {
+      (void)memory_alloc_list_.emplace_back(workspace_address.get());
+      (void)memory_free_list_.emplace_back(workspace_address.get());
+    }
+  }
+
+  if (workspace_need_somas && (!IsSomasEnable(somas_info_))) {
+    MS_LOG(EXCEPTION) << "Somas is not enabled for: " << GetAID().Name();
+  }
+}
+
 void KernelActor::Run(OpContext<DeviceTensor> *const context) {
   MS_EXCEPTION_IF_NULL(context);
   MS_EXCEPTION_IF_NULL(device_contexts_[0]);
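
Reviewer note: the two Init*Info loops partition each tensor list: somas-managed entries are pinned via UpdateRefCount(addr, true), so the normal free path never releases memory that belongs to the shared somas block, and they are excluded from memory_alloc_list_/memory_free_list_; only the remaining tensors keep going through per-kernel allocate/free. A standalone sketch of that partitioning, with MindSpore types reduced to plain structs:

    #include <cstddef>
    #include <utility>
    #include <vector>

    struct DeviceTensor {
      bool ref_count_pinned = false;  // stands in for UpdateRefCount(addr, true)
    };

    void PartitionBySomas(std::vector<DeviceTensor> &tensors,
                          const std::vector<std::pair<size_t, size_t>> &somas_result,
                          std::vector<DeviceTensor *> *alloc_list, std::vector<DeviceTensor *> *free_list) {
      for (size_t i = 0; i < tensors.size(); ++i) {
        const bool somas_managed = (i < somas_result.size()) && (somas_result[i].second != 0);
        if (somas_managed) {
          tensors[i].ref_count_pinned = true;  // memory lives in the somas block, never freed per kernel
        } else {
          alloc_list->push_back(&tensors[i]);
          free_list->push_back(&tensors[i]);
        }
      }
    }
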
@@ -129,9 +191,12 @@ void KernelActor::FetchWorkspaceDeviceTensor() {
   if (launch_info_.workspaces_.size() > workspace_sizes.size()) {
     size_t size = launch_info_.workspaces_.size() - workspace_sizes.size();
     (void)workspace_device_tensors_.erase(workspace_device_tensors_.end() - size, workspace_device_tensors_.end());
+    (void)launch_info_.workspaces_.erase(launch_info_.workspaces_.end() - size, launch_info_.workspaces_.end());
+
+    MS_EXCEPTION_IF_CHECK_FAIL((memory_alloc_list_.size() >= size), "The memory alloc list size is wrong.");
+    MS_EXCEPTION_IF_CHECK_FAIL((memory_free_list_.size() >= size), "The memory free list size is wrong.");
     (void)memory_alloc_list_.erase(memory_alloc_list_.end() - size, memory_alloc_list_.end());
     (void)memory_free_list_.erase(memory_free_list_.end() - size, memory_free_list_.end());
-    (void)launch_info_.workspaces_.erase(launch_info_.workspaces_.end() - size, launch_info_.workspaces_.end());
   } else if (launch_info_.workspaces_.size() < workspace_sizes.size()) {
     for (size_t i = launch_info_.workspaces_.size(); i < workspace_sizes.size(); ++i) {
       auto device_address = device_contexts_[0]->device_res_manager_->CreateDeviceAddress(
@@ -141,9 +206,9 @@ void KernelActor::FetchWorkspaceDeviceTensor() {
       AnfAlgo::SetWorkspaceAddr(device_address, i, kernel_.get());  // set to kernel_info
       MS_EXCEPTION_IF_NULL(device_address);
       (void)workspace_device_tensors_.emplace_back(device_address.get());
+      (void)launch_info_.workspaces_.emplace_back(std::make_shared<Address>());
       (void)memory_alloc_list_.emplace_back(device_address.get());
       (void)memory_free_list_.emplace_back(device_address.get());
-      (void)launch_info_.workspaces_.emplace_back(std::make_shared<Address>());
     }
   }
   // Set workspace address new size
@@ -191,8 +256,47 @@ void FreeMemory(const std::vector<DeviceTensor *> &free_list, const DeviceContext *device_context) {
   }
 }  // namespace
 
+void KernelActor::SetSomasMemory(OpContext<DeviceTensor> *const context) {
+  MS_EXCEPTION_IF_NULL(context);
+  MS_EXCEPTION_IF_NULL(kernel_info_);
+  if (!IsSomasEnable(somas_info_)) {
+    return;
+  }
+  if (somas_info_->base_address_ == nullptr) {
+    std::string error_info = "The somas base address isn't allocated when running " + GetAID().Name();
+    SET_OPCONTEXT_FAIL_RET_WITH_ERROR(*context, error_info);
+  }
+
+  // Set the memory address for the output tensors which use the somas.
+  const auto &somas_outputs = kernel_info_->somas_output_result();
+  MS_EXCEPTION_IF_CHECK_FAIL((output_device_tensors_.size() >= somas_outputs.size()), "The output num is wrong.");
+  for (size_t i = 0; i < somas_outputs.size(); ++i) {
+    if (somas_outputs[i].second > 0) {
+      // Init() has already ensured that these pointers are not nullptr, so for performance
+      // the pointer validity is not rechecked here.
+      output_device_tensors_[i]->set_ptr(AddressOffset(somas_info_->base_address_, somas_outputs[i].first));
+    }
+  }
+
+  // Set the memory address for the workspace tensors which use the somas.
+  const auto &somas_workspace = kernel_info_->somas_workspace_result();
+  MS_EXCEPTION_IF_CHECK_FAIL((workspace_device_tensors_.size() >= somas_workspace.size()),
+                             "The workspace num is wrong.");
+  for (size_t i = 0; i < somas_workspace.size(); ++i) {
+    if (somas_workspace[i].second > 0) {
+      // Init() has already ensured that these pointers are not nullptr, so for performance
+      // the pointer validity is not rechecked here.
+      workspace_device_tensors_[i]->set_ptr(AddressOffset(somas_info_->base_address_, somas_workspace[i].first));
+    }
+  }
+}
+
 void KernelActor::SendMemoryAllocReq(OpContext<DeviceTensor> *const context) {
   running_dependent_msg_num_ = 1;
+
+  // Set the memory address for the tensors which use the somas.
+  SetSomasMemory(context);
+
+  // Allocate the memory address for the other tensors which don't use the somas.
   if (strategy_ == GraphExecutionStrategy::kPipeline) {
     if (ActorDispatcher::is_memory_allocation_sync()) {
       ActorDispatcher::SendSync(memory_manager_aid_, &MemoryManagerActor::AllocateMemory, &memory_alloc_list_,
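
Reviewer note: SetSomasMemory runs before the regular allocation request, so by the time MemoryManagerActor::AllocateMemory is sent, every somas-managed tensor already points into the pre-planned block and only the leftover tensors remain in memory_alloc_list_. The pointer arithmetic is just base plus planned offset; assuming AddressOffset is the usual byte-offset helper, it is equivalent to:

    #include <cstddef>
    #include <cstdint>

    // Byte-offset helper: step `offset` bytes into the somas base block.
    inline void *AddressOffset(void *base_address, size_t offset) {
      return static_cast<void *>(static_cast<uint8_t *>(base_address) + offset);
    }
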
@@ -361,13 +465,14 @@ void KernelActor::FetchInputDeviceTensor(OpContext<DeviceTensor> *const context) {
   if (data_iter != input_op_datas_.end()) {
     for (auto &input_data : data_iter->second) {
       MS_EXCEPTION_IF_NULL(input_data);
-      if (IntToSize(input_data->index_) >= input_device_tensors_.size()) {
+      size_t input_index = IntToSize(input_data->index_);
+      if (input_index >= input_device_tensors_.size()) {
         SET_OPCONTEXT_FAIL_RET_WITH_ERROR_BY_STRATEGY(strategy_, (*context), "The input index is out of range.");
       }
 
-      if (input_device_tensors_[IntToSize(input_data->index_)] != input_data->data_) {
-        input_device_tensors_[IntToSize(input_data->index_)] = input_data->data_;
-        memory_free_list_[IntToSize(input_data->index_)] = input_data->data_;
+      if (input_device_tensors_[input_index] != input_data->data_) {
+        input_device_tensors_[input_index] = input_data->data_;
+        memory_free_list_[input_index] = input_data->data_;
       }
       CopyInputDeviceTensor(input_data, context);
     }
   }
@@ -407,30 +512,36 @@ void KernelActor::FetchOutputDeviceTensor(OpContext<DeviceTensor> *const context) {
     SET_OPCONTEXT_FAIL_RET_WITH_ERROR_BY_STRATEGY(strategy_, (*context), error_info);
   }
 
+  const auto &somas_outputs = kernel_info_->somas_output_result();
+
   // Update the size of output device tensor.
   for (size_t i = 0; i < output_addresses.size(); ++i) {
     auto output_address = output_addresses[i].get();
     MS_EXCEPTION_IF_NULL(output_address);
-    if (output_size_list[i] != output_address->GetSize()) {
-      // 1. The size of output address may be changed in dynamic shape scenario.
-      // 2. If the format of the DeviceAddress is different, then the size is originally different.
-      //    Such as NCHW(1,1,1,3) and NC1HWC0(1,1,1,1,16). So we don't need to update the size.
-      // 3. For example, we need to call cudnnGetRNNTrainingReserveSize to get real output size in LstmGpuKernelMod!
-      if (AnfAlgo::GetOutputFormat(kernel_, i) == output_address->format()) {
-        output_address->SetSize(output_size_list[i]);
-      }
+    // The output device tensor can't be changed.
+    if (output_device_tensors_[i] != output_address) {
+      std::string error_info =
+        "The device tensor can't be changed of " + GetAID().Name() + " with output index " + std::to_string(i);
+      SET_OPCONTEXT_FAIL_RET_WITH_ERROR_BY_STRATEGY(strategy_, (*context), error_info);
     }
 
-    // When the tensor is the output of graph or in dynamic shape scenario, the output tensor may be changed.
-    if (output_device_tensors_[i] != output_address) {
-      output_device_tensors_[i] = output_address;
-      memory_alloc_list_[i] = output_address;
-      memory_free_list_[real_input_num_ + i] = output_address;
+    if (output_size_list[i] == output_address->GetSize()) {
+      continue;
+    }
 
-      // Update output data.
-      for (auto &output_data : output_data_by_output_index_[i]) {
-        MS_EXCEPTION_IF_NULL(output_data);
-        output_data->data_ = output_address;
-      }
+    // Somas doesn't support a variable size.
+    if (kernel_info_->IsTensorEnableSomas(somas_outputs, i) && (somas_outputs[i].second < output_size_list[i])) {
+      std::string error_info =
+        "Somas doesn't support variable size of " + GetAID().Name() + " with output index " + std::to_string(i) +
+        ". Suggest turning off memory optimization by setting the context 'memory_optimize_level' to 'O0'.";
+      SET_OPCONTEXT_FAIL_RET_WITH_ERROR_BY_STRATEGY(strategy_, (*context), error_info);
+    }
+
+    // 1. The size of output address may be changed in dynamic shape scenario.
+    // 2. If the format of the DeviceAddress is different, then the size is originally different.
+    //    Such as NCHW(1,1,1,3) and NC1HWC0(1,1,1,1,16). So we don't need to update the size.
+    // 3. For example, we need to call cudnnGetRNNTrainingReserveSize to get real output size in LstmGpuKernelMod!
+    if (AnfAlgo::GetOutputFormat(kernel_, i) == output_address->format()) {
+      output_address->SetSize(output_size_list[i]);
+    }
   }
 }
@@ -474,8 +585,11 @@ bool KernelActor::LaunchKernel(OpContext<DeviceTensor> *const) {
   }
 
   MS_EXCEPTION_IF_NULL(device_contexts_[0]);
-  return device_contexts_[0]->kernel_executor_->LaunchKernel(kernel_, launch_info_.inputs_, launch_info_.workspaces_,
-                                                             launch_info_.outputs_, kernel_info_->stream_id());
+  MS_LOG(DEBUG) << "Begin launch kernel of actor: " << GetAID().Name();
+  auto ret = device_contexts_[0]->kernel_executor_->LaunchKernel(
+    kernel_, launch_info_.inputs_, launch_info_.workspaces_, launch_info_.outputs_, kernel_info_->stream_id());
+  MS_LOG(DEBUG) << "End launch kernel of actor: " << GetAID().Name();
+  return ret;
 }
 
 void KernelActor::PostLaunchKernel(OpContext<DeviceTensor> *const context) {
diff --git a/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_actor.h b/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_actor.h
index 9287edac4b0..ca3ceb2141f 100644
--- a/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_actor.h
+++ b/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_actor.h
@@ -136,6 +136,11 @@ class KernelActor : public DebugAwareActor {
   friend class RpcNodeScheduler;
 #endif
 
+  // Init the device tensors and kernel launch info.
+  void InitInputInfo();
+  void InitOutputInfo();
+  void InitWorkspaceInfo();
+
   // Fetch the device tensor for launch.
   void FetchInputDeviceTensor(OpContext<DeviceTensor> *const context);
   void FetchOutputDeviceTensor(OpContext<DeviceTensor> *const context);
@@ -152,6 +157,9 @@ class KernelActor : public DebugAwareActor {
   // Back refresh the dynamic device tensor stores that have been triggered copy.
   void RefreshDeviceTensorCopyStore(OpContext<DeviceTensor> *const context);
 
+  // Set the memory address for the tensors which use the somas.
+  void SetSomasMemory(OpContext<DeviceTensor> *const context);
+
   // The real input number of kernel launch.
   size_t real_input_num_;
@@ -164,9 +172,6 @@ class KernelActor : public DebugAwareActor {
   std::set<size_t> modifiable_ref_input_indexes_;
   std::set<size_t> modifiable_ref_output_indexes_;
 
-  // Cache output data by output index to modify the output data effectively.
-  std::vector<std::vector<OpData<DeviceTensor> *>> output_data_by_output_index_;
-
   // Whether skip the kernel launch.
   bool is_launch_skipped_;
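
Reviewer note: FetchOutputDeviceTensor now treats a changed output device tensor as a hard error and only resizes in place, and the somas path additionally rejects any resize beyond the planned aligned size, since an offset inside a fused block cannot grow without overrunning the next tensor's slot. A standalone sketch of that compatibility check; the user-side escape hatch is the 'memory_optimize_level' = 'O0' context option named in the error string:

    #include <cstddef>

    // A somas-managed output may shrink within its planned slot but must never grow,
    // because the next somas tensor starts right after the planned aligned size.
    bool SomasSizeIsCompatible(size_t planned_aligned_size, size_t new_size) {
      return new_size <= planned_aligned_size;
    }
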