!22365 RDR adapts for CPU dynamic memory allocation

Merge pull request !22365 from liangyongxiong/fix
This commit is contained in:
i-robot 2021-08-26 07:22:35 +00:00 committed by Gitee
commit 438169e0b9
17 changed files with 75 additions and 152 deletions

View File

@ -6,7 +6,6 @@ set(_DEBUG_SRC_LIST
"${CMAKE_CURRENT_SOURCE_DIR}/anf_ir_utils.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/draw.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/dump_proto.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/dump_utils.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/trace.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/common.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/env_config_parser.cc"
@ -53,7 +52,6 @@ if(NOT ENABLE_SECURITY)
if(NOT CMAKE_SYSTEM_NAME MATCHES "Windows")
list(APPEND _DEBUG_SRC_LIST
"${CMAKE_CURRENT_SOURCE_DIR}/common.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/dump_utils.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/data_dump/e2e_dump.cc"
)
endif()

View File

@ -1,54 +0,0 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "debug/dump_utils.h"
#include <string>
#include "utils/log_adapter.h"
#include "utils/ms_context.h"
#include "utils/comm_manager.h"
#include "frontend/parallel/context.h"
namespace mindspore {
// Resolves the rank id used by the dump utilities.
//
// Returns 0 when the parallel context reports parallel::STAND_ALONE, or when the
// configured device target matches neither kAscendDevice nor kGPUDevice. Otherwise
// asks CommManager for the rank inside the backend's world group; a failed query is
// only logged and whatever value is held in the local rank variable is returned.
// Throws (via MS_EXCEPTION_IF_NULL) if either context singleton is null.
uint32_t DumpUtils::GetRankId() {
  uint32_t id = 0;

  auto parallel_ctx = parallel::ParallelContext::GetInstance();
  MS_EXCEPTION_IF_NULL(parallel_ctx);
  if (parallel_ctx->parallel_mode() == parallel::STAND_ALONE) {
    MS_LOG(INFO) << "parallel_mode is stand_alone, use 0 as default rank id.";
    return id;
  }

  auto context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context);
  std::string target = context->get_param<std::string>(MS_CTX_DEVICE_TARGET);

  // Pick the communication group that belongs to the active backend.
  std::string group;
  if (target == kGPUDevice) {
    group = kNcclWorldGroup;
  } else if (target == kAscendDevice) {
    group = kHcclWorldGroup;
  } else {
    // No world group is selected for any other target; keep the default id.
    return id;
  }

  const bool fetched = CommManager::GetInstance().GetRankID(group, &id);
  if (!fetched) {
    MS_LOG(WARNING) << "Failed to get rank id.";
  }
  return id;
}
} // namespace mindspore

View File

@ -1,29 +0,0 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_DEBUG_DUMP_UTILS_H_
#define MINDSPORE_CCSRC_DEBUG_DUMP_UTILS_H_
#include <stdint.h>
namespace mindspore {
// Holder for static helper routines used by the dump/debug code.
// The class carries no data members; its only operation is a static query.
class DumpUtils {
 public:
  // Returns the rank id of the current process as an unsigned 32-bit value.
  static uint32_t GetRankId();

  // Construction and destruction are trivial and left to the compiler.
  DumpUtils() = default;
  ~DumpUtils() = default;
};
} // namespace mindspore
#endif // MINDSPORE_CCSRC_DEBUG_DUMP_UTILS_H_

View File

@ -19,7 +19,6 @@
#include "nlohmann/json.hpp"
#include "utils/log_adapter.h"
#include "debug/common.h"
#include "debug/dump_utils.h"
#include "utils/ms_context.h"
#include "utils/convert_utils_base.h"
@ -103,19 +102,17 @@ void EnvConfigParser::ParseFromEnv() {
has_rdr_setting_ = true;
rdr_enabled_ = rdr_enable_env.value();
}
std::string path = "";
auto path_env = GetRdrPathFromEnv();
if (path_env.has_value()) {
has_rdr_setting_ = true;
path = path_env.value();
std::string path = path_env.value();
if (!path.empty()) {
if (path.back() != '/') {
path += '/';
}
rdr_path_ = path;
}
}
uint32_t rank_id = DumpUtils::GetRankId();
rdr_path_ = path + "rank_" + std::to_string(rank_id) + "/rdr/";
#endif
}

View File

@ -18,25 +18,10 @@
#include <fstream>
#include "debug/common.h"
#include "utils/utils.h"
#include "utils/ms_context.h"
#include "utils/comm_manager.h"
namespace mindspore {
// Sets the directory this recorder writes into.
//
// The path is accepted only if Common::IsPathValid approves it (checked against
// MAX_DIRECTORY_LENGTH); otherwise directory_ is left untouched. An accepted path
// is stored with a trailing '/' appended when it does not already end with one.
void BaseRecorder::SetDirectory(const std::string &directory) {
  std::string failure_hint = module_ + ":" + name_ + " set directory failed.";
  if (!Common::IsPathValid(directory, MAX_DIRECTORY_LENGTH, failure_hint)) {
    return;
  }

  directory_ = directory;
  // Normalise so that later concatenation with a file name needs no separator.
  const bool has_trailing_slash = (directory_.back() == '/');
  if (!has_trailing_slash) {
    directory_.push_back('/');
  }
}
// Sets the file name this recorder exports to.
//
// The name is stored only when Common::IsFilenameValid accepts it; an invalid name
// leaves filename_ unchanged.
// NOTE(review): the length limit passed here is MAX_DIRECTORY_LENGTH, the same
// constant used for directories — confirm that this is the intended bound for
// file names.
void BaseRecorder::SetFilename(const std::string &filename) {
  std::string failure_hint = module_ + ":" + name_ + " set filename failed.";
  const bool accepted = Common::IsFilenameValid(filename, MAX_DIRECTORY_LENGTH, failure_hint);
  if (!accepted) {
    return;
  }
  filename_ = filename;
}
std::optional<std::string> BaseRecorder::GetFileRealPath(const std::string &suffix) const {
std::string filename;
if (filename_.empty()) {
@ -52,6 +37,12 @@ std::optional<std::string> BaseRecorder::GetFileRealPath(const std::string &suff
}
}
std::string file_path = directory_ + filename;
auto context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context);
auto config_file = context->get_param<std::string>(MS_CTX_ENV_CONFIG_PATH);
if (config_file.empty()) {
file_path = directory_ + "rank_" + std::to_string(GetRank()) + "/rdr/" + filename;
}
auto realpath = Common::GetRealPath(file_path);
if (!realpath.has_value()) {
MS_LOG(ERROR) << "Get real path failed. "

View File

@ -59,9 +59,6 @@ class BaseRecorder {
std::string GetTimeStamp() const { return timestamp_; }
std::optional<std::string> GetFileRealPath(const std::string &suffix = "") const;
void SetDirectory(const std::string &directory);
void SetFilename(const std::string &filename);
void SetModule(const std::string &module) { module_ = module; }
virtual void Export() {}
virtual void UpdateInfo(const BaseRecorder &recorder) {}

View File

@ -33,7 +33,7 @@ std::string MemInfo2String(const std::string &label, const AddressPtrList &info)
}
} // namespace
void GPUMemAddressRecorder::SaveMemInfo(const std::string &op_name, const GPUMemInfo &mem_info, size_t id) {
void MemAddressRecorder::SaveMemInfo(const std::string &op_name, const MemInfo &mem_info, size_t id) {
if (op_names_.size() <= id) {
return;
}
@ -44,10 +44,10 @@ void GPUMemAddressRecorder::SaveMemInfo(const std::string &op_name, const GPUMem
mem_info_outputs_[id] = *(mem_info.outputs_);
}
void GPUMemAddressRecorder::SaveMemInfo(const std::string &op_name, const kernel::KernelLaunchInfo *mem_info) {
void MemAddressRecorder::SaveMemInfo(const std::string &op_name, const kernel::KernelLaunchInfo *mem_info) {
std::lock_guard<std::mutex> lock(mtx_);
if (!printed) {
MS_LOG(INFO) << "RDR update gpu mem info.";
MS_LOG(INFO) << "RDR update mem info.";
printed = true;
}
op_names_.emplace_back(op_name);
@ -56,7 +56,7 @@ void GPUMemAddressRecorder::SaveMemInfo(const std::string &op_name, const kernel
mem_info_outputs_.emplace_back(mem_info->outputs_);
}
void GPUMemAddressRecorder::Export() {
void MemAddressRecorder::Export() {
auto realpath = GetFileRealPath();
if (!realpath.has_value()) {
return;
@ -66,10 +66,10 @@ void GPUMemAddressRecorder::Export() {
ChangeFileMode(file_path, S_IRWXU);
std::ofstream fout(file_path);
if (!fout.is_open()) {
MS_LOG(WARNING) << "Open file for saving gpu memory information failed. File path: '" << file_path << "'.";
MS_LOG(WARNING) << "Open file for saving memory information failed. File path: '" << file_path << "'.";
return;
}
MS_LOG(INFO) << "RDR export gpu mem info.";
MS_LOG(INFO) << "RDR export mem info.";
std::ostringstream mem_info_stream;
for (size_t i = 0; i < op_names_.size(); i++) {
mem_info_stream << op_names_[i] << std::endl;
@ -86,9 +86,9 @@ void GPUMemAddressRecorder::Export() {
ChangeFileMode(file_path, S_IRUSR);
}
void GPUMemAddressRecorder::CleanUp() {
void MemAddressRecorder::CleanUp() {
std::lock_guard<std::mutex> lock(mtx_);
MS_LOG(INFO) << "RDR clean up gpu mem info, kernel size equals " << op_names_.size();
MS_LOG(INFO) << "RDR clean up mem info, kernel size equals " << op_names_.size();
op_names_.clear();
mem_info_inputs_.clear();
mem_info_workspaces_.clear();

View File

@ -30,19 +30,19 @@ struct KernelLaunchInfo;
using AddressPtr = std::shared_ptr<Address>;
} // namespace kernel
using AddressPtrList = std::vector<kernel::AddressPtr>;
struct GPUMemInfo {
struct MemInfo {
AddressPtrList *inputs_;
AddressPtrList *workspaces_;
AddressPtrList *outputs_;
};
class GPUMemAddressRecorder : public BaseRecorder {
class MemAddressRecorder : public BaseRecorder {
public:
GPUMemAddressRecorder() {}
GPUMemAddressRecorder(const std::string &module, const std::string &name) : BaseRecorder(module, name) {}
~GPUMemAddressRecorder() {}
MemAddressRecorder() {}
MemAddressRecorder(const std::string &module, const std::string &name) : BaseRecorder(module, name) {}
~MemAddressRecorder() {}
virtual void Export();
void SaveMemInfo(const std::string &op_name, const GPUMemInfo &mem_info, size_t id);
void SaveMemInfo(const std::string &op_name, const MemInfo &mem_info, size_t id);
void SaveMemInfo(const std::string &op_name, const kernel::KernelLaunchInfo *mem_info);
void Reset(size_t nsize) {
@ -61,6 +61,6 @@ class GPUMemAddressRecorder : public BaseRecorder {
std::vector<AddressPtrList> mem_info_workspaces_;
std::vector<AddressPtrList> mem_info_outputs_;
};
using GPUMemAddressRecorderPtr = std::shared_ptr<GPUMemAddressRecorder>;
using MemAddressRecorderPtr = std::shared_ptr<MemAddressRecorder>;
} // namespace mindspore
#endif // MINDSPORE_CCSRC_DEBUG_RDR_MEM_ADDRESS_RECORDER_H_

View File

@ -76,7 +76,7 @@ bool RecorderManager::RdrEnable() const {
return rdr_enable_;
}
bool RecorderManager::CheckRdrGPUMemIsRecord() const {
bool RecorderManager::CheckRdrMemIsRecord() const {
if (!rdr_enable_) {
return false;
}
@ -84,7 +84,7 @@ bool RecorderManager::CheckRdrGPUMemIsRecord() const {
return rdr_has_record_mem_;
}
void RecorderManager::SetRdrGPUMemIsRecord(bool is_enable) {
void RecorderManager::SetRdrMemIsRecord(bool is_enable) {
if (!rdr_enable_) {
return;
}

View File

@ -62,8 +62,8 @@ class RecorderManager {
void UpdateRdrEnable();
bool RdrEnable() const;
bool CheckRdrGPUMemIsRecord() const;
void SetRdrGPUMemIsRecord(bool is_enable = true);
bool CheckRdrMemIsRecord() const;
void SetRdrMemIsRecord(bool is_enable = true);
bool RecordObject(const BaseRecorderPtr &recorder);
BaseRecorderPtr GetRecorder(std::string module, std::string name);

View File

@ -89,19 +89,19 @@ bool RecordStreamExecOrder(const SubModuleId module, const std::string &name, co
return ans;
}
bool RecordGPUMemAddressInfo(const SubModuleId module, const std::string &name, size_t nsize) {
bool RecordMemAddressInfo(const SubModuleId module, const std::string &name, size_t nsize) {
if (!mindspore::RecorderManager::Instance().RdrEnable()) {
return false;
}
std::string submodule_name = std::string(GetSubModuleName(module));
GPUMemAddressRecorderPtr mem_info_recorder = std::make_shared<GPUMemAddressRecorder>(submodule_name, name);
MemAddressRecorderPtr mem_info_recorder = std::make_shared<MemAddressRecorder>(submodule_name, name);
mem_info_recorder->Reset(nsize);
bool ans = mindspore::RecorderManager::Instance().RecordObject(std::move(mem_info_recorder));
return ans;
}
bool UpdateGPUMemAddressInfo(const SubModuleId module, const std::string &name, const std::string &op_name,
const GPUMemInfo &mem_info, size_t id) {
bool UpdateMemAddress(const SubModuleId module, const std::string &name, const std::string &op_name,
const MemInfo &mem_info, size_t id) {
if (!mindspore::RecorderManager::Instance().RdrEnable()) {
return false;
}
@ -109,7 +109,7 @@ bool UpdateGPUMemAddressInfo(const SubModuleId module, const std::string &name,
auto recorder = mindspore::RecorderManager::Instance().GetRecorder(submodule_name, name);
bool ans = false;
if (recorder != nullptr) {
auto mem_recorder = std::dynamic_pointer_cast<GPUMemAddressRecorder>(recorder);
auto mem_recorder = std::dynamic_pointer_cast<MemAddressRecorder>(recorder);
mem_recorder->SaveMemInfo(op_name, mem_info, id);
ans = true;
}
@ -120,16 +120,16 @@ void TriggerAll() { mindspore::RecorderManager::Instance().TriggerAll(); }
void ResetRecorder() { mindspore::RecorderManager::Instance().ClearAll(); }
void ClearGPUMemAddressInfo() {
void ClearMemAddressInfo() {
if (!mindspore::RecorderManager::Instance().RdrEnable()) {
return;
}
if (RecorderManager::Instance().CheckRdrGPUMemIsRecord()) {
if (RecorderManager::Instance().CheckRdrMemIsRecord()) {
std::string name = "mem_address_list";
std::string submodule_name = "KERNEL";
auto recorder = RecorderManager::Instance().GetRecorder(submodule_name, name);
if (recorder != nullptr) {
auto mem_recorder = std::dynamic_pointer_cast<GPUMemAddressRecorder>(recorder);
auto mem_recorder = std::dynamic_pointer_cast<MemAddressRecorder>(recorder);
mem_recorder->CleanUp();
}
}

View File

@ -33,7 +33,7 @@ class Address;
using AddressPtr = std::shared_ptr<Address>;
} // namespace kernel
using AddressPtrList = std::vector<kernel::AddressPtr>;
struct GPUMemInfo;
struct MemInfo;
#ifdef ENABLE_D
namespace device {
namespace ascend {
@ -52,16 +52,16 @@ bool RecordGraphExecOrder(const SubModuleId module, const std::string &name,
const std::vector<CNodePtr> &final_exec_order);
bool RecordString(SubModuleId module, const std::string &name, const std::string &data);
bool RecordStreamExecOrder(const SubModuleId module, const std::string &name, const std::vector<CNodePtr> &exec_order);
bool RecordGPUMemAddressInfo(const SubModuleId module, const std::string &name, size_t nsize);
bool UpdateGPUMemAddressInfo(const SubModuleId module, const std::string &name, const std::string &op_name,
const GPUMemInfo &mem_info, size_t id);
bool RecordMemAddressInfo(const SubModuleId module, const std::string &name, size_t nsize);
bool UpdateMemAddress(const SubModuleId module, const std::string &name, const std::string &op_name,
const MemInfo &mem_info, size_t id);
#ifdef ENABLE_D
bool RecordTaskDebugInfo(SubModuleId module, const std::string &name,
const std::vector<TaskDebugInfoPtr> &task_debug_info_list);
#endif // ENABLE_D
void TriggerAll();
void ResetRecorder();
void ClearGPUMemAddressInfo();
void ClearMemAddressInfo();
} // namespace RDR
} // namespace mindspore
#endif // MINDSPORE_CCSRC_DEBUG_RDR_RUNNING_DATA_RECORDER_H_

View File

@ -41,6 +41,8 @@
#endif
#ifdef ENABLE_DUMP_IR
#include "debug/rdr/running_data_recorder.h"
#include "debug/rdr/recorder_manager.h"
#include "debug/rdr/mem_address_recorder.h"
#endif
namespace mindspore {
@ -410,7 +412,11 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool) {
bool iter_dump_flag = dump_json_parser.GetIterDumpFlag();
uint32_t graph_id = kernel_graph->graph_id();
#endif
#ifdef ENABLE_DUMP_IR
std::string name = "mem_address_list";
(void)mindspore::RDR::RecordMemAddressInfo(SubModuleId::SM_KERNEL, name, kernels.size());
size_t id = 0;
#endif
for (const auto &kernel : kernels) {
#ifdef ENABLE_PROFILE
double start_time = GetTime();
@ -445,6 +451,11 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool) {
uint32_t pid = getpid();
profiler_inst->OpDataProducerBegin(kernel->fullname_with_scope(), pid);
}
#ifdef ENABLE_DUMP_IR
MemInfo mem_info = {&kernel_inputs, &kernel_workspaces, &kernel_outputs};
std::string op_name = kernel->fullname_with_scope();
(void)mindspore::RDR::UpdateMemAddress(SubModuleId::SM_KERNEL, name, op_name, mem_info, id++);
#endif
try {
ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, 0);
} catch (std::exception &e) {

View File

@ -731,7 +731,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
int exec_order = 1;
#ifdef ENABLE_DUMP_IR
std::string name = "mem_address_list";
(void)mindspore::RDR::RecordGPUMemAddressInfo(SubModuleId::SM_KERNEL, name, kernels.size());
(void)mindspore::RDR::RecordMemAddressInfo(SubModuleId::SM_KERNEL, name, kernels.size());
size_t id = 0;
#endif
CNodePtr last_kernel = GetLastKernel(graph);
@ -769,9 +769,9 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
return false;
}
#ifdef ENABLE_DUMP_IR
GPUMemInfo mem_info = {&kernel_inputs, &kernel_workspaces, &kernel_outputs};
MemInfo mem_info = {&kernel_inputs, &kernel_workspaces, &kernel_outputs};
std::string op_name = kernel->fullname_with_scope();
(void)mindspore::RDR::UpdateGPUMemAddressInfo(SubModuleId::SM_KERNEL, name, op_name, mem_info, id++);
(void)mindspore::RDR::UpdateMemAddress(SubModuleId::SM_KERNEL, name, op_name, mem_info, id++);
#endif
if (!mock) {
LaunchKernelWithoutMock(graph, kernel, kernel_inputs, kernel_workspaces, kernel_outputs, profiling);
@ -830,6 +830,9 @@ void GPUKernelRuntime::LaunchKernelWithoutMock(const session::KernelGraph *graph
auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
MS_EXCEPTION_IF_NULL(kernel_mod);
if (!kernel_mod->Launch(inputs, workspaces, outputs, stream_)) {
#ifdef ENABLE_DUMP_IR
mindspore::RDR::TriggerAll();
#endif
MS_LOG(EXCEPTION) << "Launch kernel failed: " << kernel->fullname_with_scope();
}
if (profiler_inst->GetEnableFlag()) {

View File

@ -23,6 +23,9 @@
#include "mindrt/include/async/async.h"
#include "common/trans.h"
#include "utils/log_adapter.h"
#ifdef ENABLE_DUMP_IR
#include "debug/rdr/running_data_recorder.h"
#endif
namespace mindspore {
namespace runtime {
@ -165,10 +168,16 @@ void DeviceQueueDataSourceActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *co
auto ret = device_contexts_[0]->LaunchKernel(data_kernel_, launch_info_.inputs_, launch_info_.workspaces_,
launch_info_.outputs_);
if (!ret) {
#ifdef ENABLE_DUMP_IR
mindspore::RDR::TriggerAll();
#endif
std::string error_info = "Launch kernel failed: " + data_kernel_->fullname_with_scope();
SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), error_info);
}
} catch (const std::exception &e) {
#ifdef ENABLE_DUMP_IR
mindspore::RDR::TriggerAll();
#endif
MsException::Instance().SetException();
std::string error_info = "Launch kernel exception: " + data_kernel_->fullname_with_scope();
SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), error_info);

View File

@ -41,23 +41,23 @@ void RecorderActor::RecordInfo(const std::string op_name, const KernelLaunchInfo
return;
}
std::string name = "mem_address_list";
if (!RecorderManager::Instance().CheckRdrGPUMemIsRecord()) {
if (!RecorderManager::Instance().CheckRdrMemIsRecord()) {
std::string submodule_name = "KERNEL";
auto mem_info_recorder = std::make_shared<GPUMemAddressRecorder>(submodule_name, name);
auto mem_info_recorder = std::make_shared<MemAddressRecorder>(submodule_name, name);
if (mem_info_recorder == nullptr) {
MS_LOG(ERROR) << "Make GPUMemAddressRecorder shared pointer failed.";
MS_LOG(ERROR) << "Make MemAddressRecorder shared pointer failed.";
return;
}
mem_info_recorder->SaveMemInfo(op_name, launch_info_);
bool result = RecorderManager::Instance().RecordObject(std::move(mem_info_recorder));
if (result) {
RecorderManager::Instance().SetRdrGPUMemIsRecord(true);
RecorderManager::Instance().SetRdrMemIsRecord(true);
}
} else {
std::string submodule_name = "KERNEL";
auto recorder = RecorderManager::Instance().GetRecorder(submodule_name, name);
if (recorder != nullptr) {
auto mem_recorder = std::dynamic_pointer_cast<GPUMemAddressRecorder>(recorder);
auto mem_recorder = std::dynamic_pointer_cast<MemAddressRecorder>(recorder);
mem_recorder->SaveMemInfo(op_name, launch_info_);
}
}

View File

@ -464,7 +464,7 @@ bool GPUDeviceContext::SyncStream(size_t stream_id) const {
mindspore::RDR::TriggerAll();
}
// clear RDR gpu memory info
mindspore::RDR::ClearGPUMemAddressInfo();
mindspore::RDR::ClearMemAddressInfo();
#endif
return result;
}