diff --git a/mindspore/ccsrc/debug/CMakeLists.txt b/mindspore/ccsrc/debug/CMakeLists.txt index 921bbbf0ddb..091c0b4041b 100644 --- a/mindspore/ccsrc/debug/CMakeLists.txt +++ b/mindspore/ccsrc/debug/CMakeLists.txt @@ -6,7 +6,6 @@ set(_DEBUG_SRC_LIST "${CMAKE_CURRENT_SOURCE_DIR}/anf_ir_utils.cc" "${CMAKE_CURRENT_SOURCE_DIR}/draw.cc" "${CMAKE_CURRENT_SOURCE_DIR}/dump_proto.cc" - "${CMAKE_CURRENT_SOURCE_DIR}/dump_utils.cc" "${CMAKE_CURRENT_SOURCE_DIR}/trace.cc" "${CMAKE_CURRENT_SOURCE_DIR}/common.cc" "${CMAKE_CURRENT_SOURCE_DIR}/env_config_parser.cc" @@ -53,7 +52,6 @@ if(NOT ENABLE_SECURITY) if(NOT CMAKE_SYSTEM_NAME MATCHES "Windows") list(APPEND _DEBUG_SRC_LIST "${CMAKE_CURRENT_SOURCE_DIR}/common.cc" - "${CMAKE_CURRENT_SOURCE_DIR}/dump_utils.cc" "${CMAKE_CURRENT_SOURCE_DIR}/data_dump/e2e_dump.cc" ) endif() diff --git a/mindspore/ccsrc/debug/dump_utils.cc b/mindspore/ccsrc/debug/dump_utils.cc deleted file mode 100644 index 32a6f2c5dd0..00000000000 --- a/mindspore/ccsrc/debug/dump_utils.cc +++ /dev/null @@ -1,54 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "debug/dump_utils.h" - -#include -#include "utils/log_adapter.h" -#include "utils/ms_context.h" -#include "utils/comm_manager.h" -#include "frontend/parallel/context.h" - -namespace mindspore { -uint32_t DumpUtils::GetRankId() { - uint32_t rank_id = 0; - auto parallel_context = parallel::ParallelContext::GetInstance(); - MS_EXCEPTION_IF_NULL(parallel_context); - auto parallel_mode = parallel_context->parallel_mode(); - if (parallel_mode == parallel::STAND_ALONE) { - MS_LOG(INFO) << "parallel_mode is stand_alone, use 0 as default rank id."; - return rank_id; - } - - auto ms_context = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(ms_context); - std::string world_group; - std::string backend = ms_context->get_param(MS_CTX_DEVICE_TARGET); - if (backend == kAscendDevice) { - world_group = kHcclWorldGroup; - } else if (backend == kGPUDevice) { - world_group = kNcclWorldGroup; - } else { - return rank_id; - } - - if (!CommManager::GetInstance().GetRankID(world_group, &rank_id)) { - MS_LOG(WARNING) << "Failed to get rank id."; - } - - return rank_id; -} -} // namespace mindspore diff --git a/mindspore/ccsrc/debug/dump_utils.h b/mindspore/ccsrc/debug/dump_utils.h deleted file mode 100644 index beda5a83d94..00000000000 --- a/mindspore/ccsrc/debug/dump_utils.h +++ /dev/null @@ -1,29 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_CCSRC_DEBUG_DUMP_UTILS_H_ -#define MINDSPORE_CCSRC_DEBUG_DUMP_UTILS_H_ -#include - -namespace mindspore { -class DumpUtils { - public: - DumpUtils() = default; - ~DumpUtils() = default; - static uint32_t GetRankId(); -}; -} // namespace mindspore -#endif // MINDSPORE_CCSRC_DEBUG_DUMP_UTILS_H_ diff --git a/mindspore/ccsrc/debug/env_config_parser.cc b/mindspore/ccsrc/debug/env_config_parser.cc index 377836d29bf..8a3ff8e3f9a 100644 --- a/mindspore/ccsrc/debug/env_config_parser.cc +++ b/mindspore/ccsrc/debug/env_config_parser.cc @@ -19,7 +19,6 @@ #include "nlohmann/json.hpp" #include "utils/log_adapter.h" #include "debug/common.h" -#include "debug/dump_utils.h" #include "utils/ms_context.h" #include "utils/convert_utils_base.h" @@ -103,19 +102,17 @@ void EnvConfigParser::ParseFromEnv() { has_rdr_setting_ = true; rdr_enabled_ = rdr_enable_env.value(); } - std::string path = ""; auto path_env = GetRdrPathFromEnv(); if (path_env.has_value()) { has_rdr_setting_ = true; - path = path_env.value(); + std::string path = path_env.value(); if (!path.empty()) { if (path.back() != '/') { path += '/'; } + rdr_path_ = path; } } - uint32_t rank_id = DumpUtils::GetRankId(); - rdr_path_ = path + "rank_" + std::to_string(rank_id) + "/rdr/"; #endif } diff --git a/mindspore/ccsrc/debug/rdr/base_recorder.cc b/mindspore/ccsrc/debug/rdr/base_recorder.cc index b9cd0ace0ea..11a4fc08736 100644 --- a/mindspore/ccsrc/debug/rdr/base_recorder.cc +++ b/mindspore/ccsrc/debug/rdr/base_recorder.cc @@ -18,25 +18,10 @@ #include #include "debug/common.h" #include "utils/utils.h" +#include "utils/ms_context.h" +#include "utils/comm_manager.h" namespace mindspore { -void BaseRecorder::SetDirectory(const std::string &directory) { - std::string error_message = module_ + ":" + name_ + " set directory failed."; - if (Common::IsPathValid(directory, MAX_DIRECTORY_LENGTH, error_message)) { - directory_ = directory; - if (directory_.back() != '/') { - directory_ += "/"; - } - } -} - -void BaseRecorder::SetFilename(const std::string &filename) { - std::string error_message = module_ + ":" + name_ + " set filename failed."; - if (Common::IsFilenameValid(filename, MAX_DIRECTORY_LENGTH, error_message)) { - filename_ = filename; - } -} - std::optional BaseRecorder::GetFileRealPath(const std::string &suffix) const { std::string filename; if (filename_.empty()) { @@ -52,6 +37,12 @@ std::optional BaseRecorder::GetFileRealPath(const std::string &suff } } std::string file_path = directory_ + filename; + auto context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context); + auto config_file = context->get_param(MS_CTX_ENV_CONFIG_PATH); + if (config_file.empty()) { + file_path = directory_ + "rank_" + std::to_string(GetRank()) + "/rdr/" + filename; + } auto realpath = Common::GetRealPath(file_path); if (!realpath.has_value()) { MS_LOG(ERROR) << "Get real path failed. " diff --git a/mindspore/ccsrc/debug/rdr/base_recorder.h b/mindspore/ccsrc/debug/rdr/base_recorder.h index 70ed7f2eb84..0e3f526a2f7 100644 --- a/mindspore/ccsrc/debug/rdr/base_recorder.h +++ b/mindspore/ccsrc/debug/rdr/base_recorder.h @@ -59,9 +59,6 @@ class BaseRecorder { std::string GetTimeStamp() const { return timestamp_; } std::optional GetFileRealPath(const std::string &suffix = "") const; - void SetDirectory(const std::string &directory); - void SetFilename(const std::string &filename); - void SetModule(const std::string &module) { module_ = module; } virtual void Export() {} virtual void UpdateInfo(const BaseRecorder &recorder) {} diff --git a/mindspore/ccsrc/debug/rdr/mem_address_recorder.cc b/mindspore/ccsrc/debug/rdr/mem_address_recorder.cc index d6ed9845281..b2576dafb25 100644 --- a/mindspore/ccsrc/debug/rdr/mem_address_recorder.cc +++ b/mindspore/ccsrc/debug/rdr/mem_address_recorder.cc @@ -33,7 +33,7 @@ std::string MemInfo2String(const std::string &label, const AddressPtrList &info) } } // namespace -void GPUMemAddressRecorder::SaveMemInfo(const std::string &op_name, const GPUMemInfo &mem_info, size_t id) { +void MemAddressRecorder::SaveMemInfo(const std::string &op_name, const MemInfo &mem_info, size_t id) { if (op_names_.size() <= id) { return; } @@ -44,10 +44,10 @@ void GPUMemAddressRecorder::SaveMemInfo(const std::string &op_name, const GPUMem mem_info_outputs_[id] = *(mem_info.outputs_); } -void GPUMemAddressRecorder::SaveMemInfo(const std::string &op_name, const kernel::KernelLaunchInfo *mem_info) { +void MemAddressRecorder::SaveMemInfo(const std::string &op_name, const kernel::KernelLaunchInfo *mem_info) { std::lock_guard lock(mtx_); if (!printed) { - MS_LOG(INFO) << "RDR update gpu mem info."; + MS_LOG(INFO) << "RDR update mem info."; printed = true; } op_names_.emplace_back(op_name); @@ -56,7 +56,7 @@ void GPUMemAddressRecorder::SaveMemInfo(const std::string &op_name, const kernel mem_info_outputs_.emplace_back(mem_info->outputs_); } -void GPUMemAddressRecorder::Export() { +void MemAddressRecorder::Export() { auto realpath = GetFileRealPath(); if (!realpath.has_value()) { return; @@ -66,10 +66,10 @@ void GPUMemAddressRecorder::Export() { ChangeFileMode(file_path, S_IRWXU); std::ofstream fout(file_path); if (!fout.is_open()) { - MS_LOG(WARNING) << "Open file for saving gpu memory information failed. File path: '" << file_path << "'."; + MS_LOG(WARNING) << "Open file for saving memory information failed. File path: '" << file_path << "'."; return; } - MS_LOG(INFO) << "RDR export gpu mem info."; + MS_LOG(INFO) << "RDR export mem info."; std::ostringstream mem_info_stream; for (size_t i = 0; i < op_names_.size(); i++) { mem_info_stream << op_names_[i] << std::endl; @@ -86,9 +86,9 @@ void GPUMemAddressRecorder::Export() { ChangeFileMode(file_path, S_IRUSR); } -void GPUMemAddressRecorder::CleanUp() { +void MemAddressRecorder::CleanUp() { std::lock_guard lock(mtx_); - MS_LOG(INFO) << "RDR clean up gpu mem info, kernel size equals " << op_names_.size(); + MS_LOG(INFO) << "RDR clean up mem info, kernel size equals " << op_names_.size(); op_names_.clear(); mem_info_inputs_.clear(); mem_info_workspaces_.clear(); diff --git a/mindspore/ccsrc/debug/rdr/mem_address_recorder.h b/mindspore/ccsrc/debug/rdr/mem_address_recorder.h index 07dbd5234c1..dcb582c2ba7 100644 --- a/mindspore/ccsrc/debug/rdr/mem_address_recorder.h +++ b/mindspore/ccsrc/debug/rdr/mem_address_recorder.h @@ -30,19 +30,19 @@ struct KernelLaunchInfo; using AddressPtr = std::shared_ptr
; } // namespace kernel using AddressPtrList = std::vector; -struct GPUMemInfo { +struct MemInfo { AddressPtrList *inputs_; AddressPtrList *workspaces_; AddressPtrList *outputs_; }; -class GPUMemAddressRecorder : public BaseRecorder { +class MemAddressRecorder : public BaseRecorder { public: - GPUMemAddressRecorder() {} - GPUMemAddressRecorder(const std::string &module, const std::string &name) : BaseRecorder(module, name) {} - ~GPUMemAddressRecorder() {} + MemAddressRecorder() {} + MemAddressRecorder(const std::string &module, const std::string &name) : BaseRecorder(module, name) {} + ~MemAddressRecorder() {} virtual void Export(); - void SaveMemInfo(const std::string &op_name, const GPUMemInfo &mem_info, size_t id); + void SaveMemInfo(const std::string &op_name, const MemInfo &mem_info, size_t id); void SaveMemInfo(const std::string &op_name, const kernel::KernelLaunchInfo *mem_info); void Reset(size_t nsize) { @@ -61,6 +61,6 @@ class GPUMemAddressRecorder : public BaseRecorder { std::vector mem_info_workspaces_; std::vector mem_info_outputs_; }; -using GPUMemAddressRecorderPtr = std::shared_ptr; +using MemAddressRecorderPtr = std::shared_ptr; } // namespace mindspore #endif // MINDSPORE_CCSRC_DEBUG_RDR_MEM_ADDRESS_RECORDER_H_ diff --git a/mindspore/ccsrc/debug/rdr/recorder_manager.cc b/mindspore/ccsrc/debug/rdr/recorder_manager.cc index 8e26750b009..696c909751d 100644 --- a/mindspore/ccsrc/debug/rdr/recorder_manager.cc +++ b/mindspore/ccsrc/debug/rdr/recorder_manager.cc @@ -76,7 +76,7 @@ bool RecorderManager::RdrEnable() const { return rdr_enable_; } -bool RecorderManager::CheckRdrGPUMemIsRecord() const { +bool RecorderManager::CheckRdrMemIsRecord() const { if (!rdr_enable_) { return false; } @@ -84,7 +84,7 @@ bool RecorderManager::CheckRdrGPUMemIsRecord() const { return rdr_has_record_mem_; } -void RecorderManager::SetRdrGPUMemIsRecord(bool is_enable) { +void RecorderManager::SetRdrMemIsRecord(bool is_enable) { if (!rdr_enable_) { return; } diff --git a/mindspore/ccsrc/debug/rdr/recorder_manager.h b/mindspore/ccsrc/debug/rdr/recorder_manager.h index 993bb6fde1d..27c2d448eea 100644 --- a/mindspore/ccsrc/debug/rdr/recorder_manager.h +++ b/mindspore/ccsrc/debug/rdr/recorder_manager.h @@ -62,8 +62,8 @@ class RecorderManager { void UpdateRdrEnable(); bool RdrEnable() const; - bool CheckRdrGPUMemIsRecord() const; - void SetRdrGPUMemIsRecord(bool is_enable = true); + bool CheckRdrMemIsRecord() const; + void SetRdrMemIsRecord(bool is_enable = true); bool RecordObject(const BaseRecorderPtr &recorder); BaseRecorderPtr GetRecorder(std::string module, std::string name); diff --git a/mindspore/ccsrc/debug/rdr/running_data_recorder.cc b/mindspore/ccsrc/debug/rdr/running_data_recorder.cc index 4233dde47ec..03c5f191918 100644 --- a/mindspore/ccsrc/debug/rdr/running_data_recorder.cc +++ b/mindspore/ccsrc/debug/rdr/running_data_recorder.cc @@ -89,19 +89,19 @@ bool RecordStreamExecOrder(const SubModuleId module, const std::string &name, co return ans; } -bool RecordGPUMemAddressInfo(const SubModuleId module, const std::string &name, size_t nsize) { +bool RecordMemAddressInfo(const SubModuleId module, const std::string &name, size_t nsize) { if (!mindspore::RecorderManager::Instance().RdrEnable()) { return false; } std::string submodule_name = std::string(GetSubModuleName(module)); - GPUMemAddressRecorderPtr mem_info_recorder = std::make_shared(submodule_name, name); + MemAddressRecorderPtr mem_info_recorder = std::make_shared(submodule_name, name); mem_info_recorder->Reset(nsize); bool ans = mindspore::RecorderManager::Instance().RecordObject(std::move(mem_info_recorder)); return ans; } -bool UpdateGPUMemAddressInfo(const SubModuleId module, const std::string &name, const std::string &op_name, - const GPUMemInfo &mem_info, size_t id) { +bool UpdateMemAddress(const SubModuleId module, const std::string &name, const std::string &op_name, + const MemInfo &mem_info, size_t id) { if (!mindspore::RecorderManager::Instance().RdrEnable()) { return false; } @@ -109,7 +109,7 @@ bool UpdateGPUMemAddressInfo(const SubModuleId module, const std::string &name, auto recorder = mindspore::RecorderManager::Instance().GetRecorder(submodule_name, name); bool ans = false; if (recorder != nullptr) { - auto mem_recorder = std::dynamic_pointer_cast(recorder); + auto mem_recorder = std::dynamic_pointer_cast(recorder); mem_recorder->SaveMemInfo(op_name, mem_info, id); ans = true; } @@ -120,16 +120,16 @@ void TriggerAll() { mindspore::RecorderManager::Instance().TriggerAll(); } void ResetRecorder() { mindspore::RecorderManager::Instance().ClearAll(); } -void ClearGPUMemAddressInfo() { +void ClearMemAddressInfo() { if (!mindspore::RecorderManager::Instance().RdrEnable()) { return; } - if (RecorderManager::Instance().CheckRdrGPUMemIsRecord()) { + if (RecorderManager::Instance().CheckRdrMemIsRecord()) { std::string name = "mem_address_list"; std::string submodule_name = "KERNEL"; auto recorder = RecorderManager::Instance().GetRecorder(submodule_name, name); if (recorder != nullptr) { - auto mem_recorder = std::dynamic_pointer_cast(recorder); + auto mem_recorder = std::dynamic_pointer_cast(recorder); mem_recorder->CleanUp(); } } diff --git a/mindspore/ccsrc/debug/rdr/running_data_recorder.h b/mindspore/ccsrc/debug/rdr/running_data_recorder.h index c6ecf65e9ac..29e1d1f9787 100644 --- a/mindspore/ccsrc/debug/rdr/running_data_recorder.h +++ b/mindspore/ccsrc/debug/rdr/running_data_recorder.h @@ -33,7 +33,7 @@ class Address; using AddressPtr = std::shared_ptr
; } // namespace kernel using AddressPtrList = std::vector; -struct GPUMemInfo; +struct MemInfo; #ifdef ENABLE_D namespace device { namespace ascend { @@ -52,16 +52,16 @@ bool RecordGraphExecOrder(const SubModuleId module, const std::string &name, const std::vector &final_exec_order); bool RecordString(SubModuleId module, const std::string &name, const std::string &data); bool RecordStreamExecOrder(const SubModuleId module, const std::string &name, const std::vector &exec_order); -bool RecordGPUMemAddressInfo(const SubModuleId module, const std::string &name, size_t nsize); -bool UpdateGPUMemAddressInfo(const SubModuleId module, const std::string &name, const std::string &op_name, - const GPUMemInfo &mem_info, size_t id); +bool RecordMemAddressInfo(const SubModuleId module, const std::string &name, size_t nsize); +bool UpdateMemAddress(const SubModuleId module, const std::string &name, const std::string &op_name, + const MemInfo &mem_info, size_t id); #ifdef ENABLE_D bool RecordTaskDebugInfo(SubModuleId module, const std::string &name, const std::vector &task_debug_info_list); #endif // ENABLE_D void TriggerAll(); void ResetRecorder(); -void ClearGPUMemAddressInfo(); +void ClearMemAddressInfo(); } // namespace RDR } // namespace mindspore #endif // MINDSPORE_CCSRC_DEBUG_RDR_RUNNING_DATA_RECORDER_H_ diff --git a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc index ac9fb079b5a..3a00045ca14 100644 --- a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc @@ -41,6 +41,8 @@ #endif #ifdef ENABLE_DUMP_IR #include "debug/rdr/running_data_recorder.h" +#include "debug/rdr/recorder_manager.h" +#include "debug/rdr/mem_address_recorder.h" #endif namespace mindspore { @@ -410,7 +412,11 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool) { bool iter_dump_flag = dump_json_parser.GetIterDumpFlag(); uint32_t graph_id = kernel_graph->graph_id(); #endif - +#ifdef ENABLE_DUMP_IR + std::string name = "mem_address_list"; + (void)mindspore::RDR::RecordMemAddressInfo(SubModuleId::SM_KERNEL, name, kernels.size()); + size_t id = 0; +#endif for (const auto &kernel : kernels) { #ifdef ENABLE_PROFILE double start_time = GetTime(); @@ -445,6 +451,11 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool) { uint32_t pid = getpid(); profiler_inst->OpDataProducerBegin(kernel->fullname_with_scope(), pid); } +#ifdef ENABLE_DUMP_IR + MemInfo mem_info = {&kernel_inputs, &kernel_workspaces, &kernel_outputs}; + std::string op_name = kernel->fullname_with_scope(); + (void)mindspore::RDR::UpdateMemAddress(SubModuleId::SM_KERNEL, name, op_name, mem_info, id++); +#endif try { ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, 0); } catch (std::exception &e) { diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc index 07880c724d4..7954be12425 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc @@ -731,7 +731,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo int exec_order = 1; #ifdef ENABLE_DUMP_IR std::string name = "mem_address_list"; - (void)mindspore::RDR::RecordGPUMemAddressInfo(SubModuleId::SM_KERNEL, name, kernels.size()); + (void)mindspore::RDR::RecordMemAddressInfo(SubModuleId::SM_KERNEL, name, kernels.size()); size_t id = 0; #endif CNodePtr last_kernel = GetLastKernel(graph); @@ -769,9 +769,9 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo return false; } #ifdef ENABLE_DUMP_IR - GPUMemInfo mem_info = {&kernel_inputs, &kernel_workspaces, &kernel_outputs}; + MemInfo mem_info = {&kernel_inputs, &kernel_workspaces, &kernel_outputs}; std::string op_name = kernel->fullname_with_scope(); - (void)mindspore::RDR::UpdateGPUMemAddressInfo(SubModuleId::SM_KERNEL, name, op_name, mem_info, id++); + (void)mindspore::RDR::UpdateMemAddress(SubModuleId::SM_KERNEL, name, op_name, mem_info, id++); #endif if (!mock) { LaunchKernelWithoutMock(graph, kernel, kernel_inputs, kernel_workspaces, kernel_outputs, profiling); @@ -830,6 +830,9 @@ void GPUKernelRuntime::LaunchKernelWithoutMock(const session::KernelGraph *graph auto kernel_mod = AnfAlgo::GetKernelMod(kernel); MS_EXCEPTION_IF_NULL(kernel_mod); if (!kernel_mod->Launch(inputs, workspaces, outputs, stream_)) { +#ifdef ENABLE_DUMP_IR + mindspore::RDR::TriggerAll(); +#endif MS_LOG(EXCEPTION) << "Launch kernel failed: " << kernel->fullname_with_scope(); } if (profiler_inst->GetEnableFlag()) { diff --git a/mindspore/ccsrc/runtime/framework/actor/data_source_actor.cc b/mindspore/ccsrc/runtime/framework/actor/data_source_actor.cc index e46437d9266..6ac34211b73 100644 --- a/mindspore/ccsrc/runtime/framework/actor/data_source_actor.cc +++ b/mindspore/ccsrc/runtime/framework/actor/data_source_actor.cc @@ -23,6 +23,9 @@ #include "mindrt/include/async/async.h" #include "common/trans.h" #include "utils/log_adapter.h" +#ifdef ENABLE_DUMP_IR +#include "debug/rdr/running_data_recorder.h" +#endif namespace mindspore { namespace runtime { @@ -165,10 +168,16 @@ void DeviceQueueDataSourceActor::OnMemoryAllocFinish(OpContext *co auto ret = device_contexts_[0]->LaunchKernel(data_kernel_, launch_info_.inputs_, launch_info_.workspaces_, launch_info_.outputs_); if (!ret) { +#ifdef ENABLE_DUMP_IR + mindspore::RDR::TriggerAll(); +#endif std::string error_info = "Launch kernel failed: " + data_kernel_->fullname_with_scope(); SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), error_info); } } catch (const std::exception &e) { +#ifdef ENABLE_DUMP_IR + mindspore::RDR::TriggerAll(); +#endif MsException::Instance().SetException(); std::string error_info = "Launch kernel exception: " + data_kernel_->fullname_with_scope(); SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), error_info); diff --git a/mindspore/ccsrc/runtime/framework/actor/recorder_actor.cc b/mindspore/ccsrc/runtime/framework/actor/recorder_actor.cc index 9ce7d926652..f38187b7747 100644 --- a/mindspore/ccsrc/runtime/framework/actor/recorder_actor.cc +++ b/mindspore/ccsrc/runtime/framework/actor/recorder_actor.cc @@ -41,23 +41,23 @@ void RecorderActor::RecordInfo(const std::string op_name, const KernelLaunchInfo return; } std::string name = "mem_address_list"; - if (!RecorderManager::Instance().CheckRdrGPUMemIsRecord()) { + if (!RecorderManager::Instance().CheckRdrMemIsRecord()) { std::string submodule_name = "KERNEL"; - auto mem_info_recorder = std::make_shared(submodule_name, name); + auto mem_info_recorder = std::make_shared(submodule_name, name); if (mem_info_recorder == nullptr) { - MS_LOG(ERROR) << "Make GPUMemAddressRecorder shared pointer failed."; + MS_LOG(ERROR) << "Make MemAddressRecorder shared pointer failed."; return; } mem_info_recorder->SaveMemInfo(op_name, launch_info_); bool result = RecorderManager::Instance().RecordObject(std::move(mem_info_recorder)); if (result) { - RecorderManager::Instance().SetRdrGPUMemIsRecord(true); + RecorderManager::Instance().SetRdrMemIsRecord(true); } } else { std::string submodule_name = "KERNEL"; auto recorder = RecorderManager::Instance().GetRecorder(submodule_name, name); if (recorder != nullptr) { - auto mem_recorder = std::dynamic_pointer_cast(recorder); + auto mem_recorder = std::dynamic_pointer_cast(recorder); mem_recorder->SaveMemInfo(op_name, launch_info_); } } diff --git a/mindspore/ccsrc/runtime/hardware/gpu/gpu_device_context.cc b/mindspore/ccsrc/runtime/hardware/gpu/gpu_device_context.cc index 1a424c24b86..e2b6160be5d 100644 --- a/mindspore/ccsrc/runtime/hardware/gpu/gpu_device_context.cc +++ b/mindspore/ccsrc/runtime/hardware/gpu/gpu_device_context.cc @@ -464,7 +464,7 @@ bool GPUDeviceContext::SyncStream(size_t stream_id) const { mindspore::RDR::TriggerAll(); } // clear RDR gpu memory info - mindspore::RDR::ClearGPUMemAddressInfo(); + mindspore::RDR::ClearMemAddressInfo(); #endif return result; }