!22302 [MS][RDR] Fix memory allocation bug when recording GPU memory information in RDR

Merge pull request !22302 from louie5/gpu_memory
This commit is contained in:
i-robot 2021-09-08 06:45:34 +00:00 committed by Gitee
commit c7ab3cc3f8
7 changed files with 43 additions and 83 deletions

View File

@ -33,27 +33,22 @@ std::string MemInfo2String(const std::string &label, const AddressPtrList &info)
}
} // namespace
void MemAddressRecorder::SaveMemInfo(const std::string &op_name, const MemInfo &mem_info, size_t id) {
if (op_names_.size() <= id) {
return;
}
void MemAddressRecorder::SaveMemInfo(const std::string &op_name, const kernel::KernelLaunchInfo &mem_info) {
std::lock_guard<std::mutex> lock(mtx_);
op_names_[id] = op_name;
mem_info_inputs_[id] = *(mem_info.inputs_);
mem_info_workspaces_[id] = *(mem_info.workspaces_);
mem_info_outputs_[id] = *(mem_info.outputs_);
}
void MemAddressRecorder::SaveMemInfo(const std::string &op_name, const kernel::KernelLaunchInfo *mem_info) {
std::lock_guard<std::mutex> lock(mtx_);
if (!printed) {
if (!printed_) {
MS_LOG(INFO) << "RDR update mem info.";
printed = true;
printed_ = true;
}
op_names_.emplace_back(op_name);
mem_info_inputs_.emplace_back(mem_info->inputs_);
mem_info_workspaces_.emplace_back(mem_info->workspaces_);
mem_info_outputs_.emplace_back(mem_info->outputs_);
if (op_names_.count(op_name) != 0) {
op_names_.clear();
mem_info_stream_.str("");
}
op_names_.insert(op_name);
mem_info_stream_ << op_name << std::endl;
mem_info_stream_ << MemInfo2String("kernel_inputs", mem_info.inputs_);
mem_info_stream_ << MemInfo2String("kernel_workspaces", mem_info.workspaces_);
mem_info_stream_ << MemInfo2String("kernel_outputs", mem_info.outputs_);
mem_info_stream_ << std::endl;
}
void MemAddressRecorder::Export() {
@ -69,19 +64,8 @@ void MemAddressRecorder::Export() {
MS_LOG(WARNING) << "Open file for saving memory information failed. File path: '" << file_path << "'.";
return;
}
MS_LOG(INFO) << "RDR export mem info.";
std::ostringstream mem_info_stream;
for (size_t i = 0; i < op_names_.size(); i++) {
mem_info_stream << op_names_[i] << std::endl;
auto inputs = mem_info_inputs_[i];
mem_info_stream << MemInfo2String("kernel_inputs", inputs);
auto workspaces = mem_info_workspaces_[i];
mem_info_stream << MemInfo2String("kernel_workspaces", workspaces);
auto outputs = mem_info_outputs_[i];
mem_info_stream << MemInfo2String("kernel_outputs", outputs);
mem_info_stream << std::endl;
}
fout << mem_info_stream.str();
MS_LOG(INFO) << "RDR export device memory information.";
fout << mem_info_stream_.str();
fout.close();
ChangeFileMode(file_path, S_IRUSR);
}
@ -90,9 +74,7 @@ void MemAddressRecorder::CleanUp() {
std::lock_guard<std::mutex> lock(mtx_);
MS_LOG(INFO) << "RDR clean up mem info, kernel size equals " << op_names_.size();
op_names_.clear();
mem_info_inputs_.clear();
mem_info_workspaces_.clear();
mem_info_outputs_.clear();
printed = false;
mem_info_stream_.str("");
printed_ = false;
}
} // namespace mindspore

View File

@ -18,6 +18,7 @@
#include <vector>
#include <string>
#include <map>
#include <set>
#include <memory>
#include <mutex>
@ -42,24 +43,20 @@ class MemAddressRecorder : public BaseRecorder {
~MemAddressRecorder() {}
virtual void Export();
void SaveMemInfo(const std::string &op_name, const MemInfo &mem_info, size_t id);
void SaveMemInfo(const std::string &op_name, const kernel::KernelLaunchInfo *mem_info);
void SaveMemInfo(const std::string &op_name, const kernel::KernelLaunchInfo &mem_info);
void Reset(size_t nsize) {
op_names_.resize(nsize);
mem_info_inputs_.resize(nsize);
mem_info_workspaces_.resize(nsize);
mem_info_outputs_.resize(nsize);
void Reset() {
op_names_.clear();
mem_info_stream_.str("");
}
void CleanUp();
private:
mutable std::mutex mtx_;
bool printed{false};
std::vector<std::string> op_names_;
std::vector<AddressPtrList> mem_info_inputs_;
std::vector<AddressPtrList> mem_info_workspaces_;
std::vector<AddressPtrList> mem_info_outputs_;
bool printed_{false};
std::set<std::string> op_names_;
std::ostringstream mem_info_stream_;
};
using MemAddressRecorderPtr = std::shared_ptr<MemAddressRecorder>;
} // namespace mindspore

View File

@ -89,19 +89,19 @@ bool RecordStreamExecOrder(const SubModuleId module, const std::string &name, co
return ans;
}
bool RecordMemAddressInfo(const SubModuleId module, const std::string &name, size_t nsize) {
bool RecordMemAddressInfo(const SubModuleId module, const std::string &name) {
if (!mindspore::RecorderManager::Instance().RdrEnable()) {
return false;
}
std::string submodule_name = std::string(GetSubModuleName(module));
MemAddressRecorderPtr mem_info_recorder = std::make_shared<MemAddressRecorder>(submodule_name, name);
mem_info_recorder->Reset(nsize);
mem_info_recorder->Reset();
bool ans = mindspore::RecorderManager::Instance().RecordObject(std::move(mem_info_recorder));
return ans;
}
bool UpdateMemAddress(const SubModuleId module, const std::string &name, const std::string &op_name,
const MemInfo &mem_info, size_t id) {
const kernel::KernelLaunchInfo &mem_info) {
if (!mindspore::RecorderManager::Instance().RdrEnable()) {
return false;
}
@ -110,7 +110,7 @@ bool UpdateMemAddress(const SubModuleId module, const std::string &name, const s
bool ans = false;
if (recorder != nullptr) {
auto mem_recorder = std::dynamic_pointer_cast<MemAddressRecorder>(recorder);
mem_recorder->SaveMemInfo(op_name, mem_info, id);
mem_recorder->SaveMemInfo(op_name, mem_info);
ans = true;
}
return ans;

View File

@ -30,6 +30,7 @@ using CNodePtr = std::shared_ptr<CNode>;
namespace kernel {
class Address;
struct KernelLaunchInfo;
using AddressPtr = std::shared_ptr<Address>;
} // namespace kernel
using AddressPtrList = std::vector<kernel::AddressPtr>;
@ -52,9 +53,9 @@ bool RecordGraphExecOrder(const SubModuleId module, const std::string &name,
const std::vector<CNodePtr> &final_exec_order);
bool RecordString(SubModuleId module, const std::string &name, const std::string &data);
bool RecordStreamExecOrder(const SubModuleId module, const std::string &name, const std::vector<CNodePtr> &exec_order);
bool RecordMemAddressInfo(const SubModuleId module, const std::string &name, size_t nsize);
bool RecordMemAddressInfo(const SubModuleId module, const std::string &name);
bool UpdateMemAddress(const SubModuleId module, const std::string &name, const std::string &op_name,
const MemInfo &mem_info, size_t id);
const kernel::KernelLaunchInfo &mem_info);
#ifdef ENABLE_D
bool RecordTaskDebugInfo(SubModuleId module, const std::string &name,
const std::vector<TaskDebugInfoPtr> &task_debug_info_list);

View File

@ -420,8 +420,7 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool) {
#endif
#ifdef ENABLE_DUMP_IR
std::string name = "mem_address_list";
(void)mindspore::RDR::RecordMemAddressInfo(SubModuleId::SM_KERNEL, name, kernels.size());
size_t id = 0;
(void)mindspore::RDR::RecordMemAddressInfo(SubModuleId::SM_KERNEL, name);
#endif
for (const auto &kernel : kernels) {
#ifdef ENABLE_PROFILE
@ -458,9 +457,9 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool) {
profiler_inst->OpDataProducerBegin(kernel->fullname_with_scope(), pid);
}
#ifdef ENABLE_DUMP_IR
MemInfo mem_info = {&kernel_inputs, &kernel_workspaces, &kernel_outputs};
kernel::KernelLaunchInfo mem_info = {kernel_inputs, kernel_workspaces, kernel_outputs};
std::string op_name = kernel->fullname_with_scope();
(void)mindspore::RDR::UpdateMemAddress(SubModuleId::SM_KERNEL, name, op_name, mem_info, id++);
(void)mindspore::RDR::UpdateMemAddress(SubModuleId::SM_KERNEL, name, op_name, mem_info);
#endif
try {
ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, 0);

View File

@ -731,8 +731,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
int exec_order = 1;
#ifdef ENABLE_DUMP_IR
std::string name = "mem_address_list";
(void)mindspore::RDR::RecordMemAddressInfo(SubModuleId::SM_KERNEL, name, kernels.size());
size_t id = 0;
(void)mindspore::RDR::RecordMemAddressInfo(SubModuleId::SM_KERNEL, name);
#endif
CNodePtr last_kernel = GetLastKernel(graph);
for (const auto &kernel : kernels) {
@ -769,9 +768,9 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
return false;
}
#ifdef ENABLE_DUMP_IR
MemInfo mem_info = {&kernel_inputs, &kernel_workspaces, &kernel_outputs};
kernel::KernelLaunchInfo mem_info = {kernel_inputs, kernel_workspaces, kernel_outputs};
std::string op_name = kernel->fullname_with_scope();
(void)mindspore::RDR::UpdateMemAddress(SubModuleId::SM_KERNEL, name, op_name, mem_info, id++);
(void)mindspore::RDR::UpdateMemAddress(SubModuleId::SM_KERNEL, name, op_name, mem_info);
#endif
if (!mock) {
LaunchKernelWithoutMock(graph, kernel, kernel_inputs, kernel_workspaces, kernel_outputs, profiling);

View File

@ -19,7 +19,7 @@
#include <utility>
#ifdef ENABLE_DUMP_IR
#include "debug/rdr/recorder_manager.h"
#include "debug/rdr/mem_address_recorder.h"
#include "debug/rdr/running_data_recorder.h"
#endif
#include "utils/log_adapter.h"
@ -36,30 +36,12 @@ void RecorderActor::RecordInfo(const std::string op_name, const KernelLaunchInfo
MS_LOG(WARNING) << "GPU kernel's op_name is empty, do not record its memory address in RDR.";
return;
}
// record GPU memory address info
if (!RecorderManager::Instance().RdrEnable()) {
return;
}
std::string name = "mem_address_list";
if (!RecorderManager::Instance().CheckRdrMemIsRecord()) {
std::string submodule_name = "KERNEL";
auto mem_info_recorder = std::make_shared<MemAddressRecorder>(submodule_name, name);
if (mem_info_recorder == nullptr) {
MS_LOG(ERROR) << "Make MemAddressRecorder shared pointer failed.";
return;
}
mem_info_recorder->SaveMemInfo(op_name, launch_info_);
bool result = RecorderManager::Instance().RecordObject(std::move(mem_info_recorder));
if (result) {
RecorderManager::Instance().SetRdrMemIsRecord(true);
}
RDR::RecordMemAddressInfo(SUBMODULE_ID, name);
RecorderManager::Instance().SetRdrMemIsRecord(true);
} else {
std::string submodule_name = "KERNEL";
auto recorder = RecorderManager::Instance().GetRecorder(submodule_name, name);
if (recorder != nullptr) {
auto mem_recorder = std::dynamic_pointer_cast<MemAddressRecorder>(recorder);
mem_recorder->SaveMemInfo(op_name, launch_info_);
}
RDR::UpdateMemAddress(SUBMODULE_ID, name, op_name, *launch_info_);
}
#endif
}