!22302 [MS][RDR] Fix memory allocation bug when recording GPU memory information in RDR
Merge pull request !22302 from louie5/gpu_memory
This commit is contained in:
commit
c7ab3cc3f8
|
@ -33,27 +33,22 @@ std::string MemInfo2String(const std::string &label, const AddressPtrList &info)
|
|||
}
|
||||
} // namespace
|
||||
|
||||
void MemAddressRecorder::SaveMemInfo(const std::string &op_name, const MemInfo &mem_info, size_t id) {
|
||||
if (op_names_.size() <= id) {
|
||||
return;
|
||||
}
|
||||
void MemAddressRecorder::SaveMemInfo(const std::string &op_name, const kernel::KernelLaunchInfo &mem_info) {
|
||||
std::lock_guard<std::mutex> lock(mtx_);
|
||||
op_names_[id] = op_name;
|
||||
mem_info_inputs_[id] = *(mem_info.inputs_);
|
||||
mem_info_workspaces_[id] = *(mem_info.workspaces_);
|
||||
mem_info_outputs_[id] = *(mem_info.outputs_);
|
||||
}
|
||||
|
||||
void MemAddressRecorder::SaveMemInfo(const std::string &op_name, const kernel::KernelLaunchInfo *mem_info) {
|
||||
std::lock_guard<std::mutex> lock(mtx_);
|
||||
if (!printed) {
|
||||
if (!printed_) {
|
||||
MS_LOG(INFO) << "RDR update mem info.";
|
||||
printed = true;
|
||||
printed_ = true;
|
||||
}
|
||||
op_names_.emplace_back(op_name);
|
||||
mem_info_inputs_.emplace_back(mem_info->inputs_);
|
||||
mem_info_workspaces_.emplace_back(mem_info->workspaces_);
|
||||
mem_info_outputs_.emplace_back(mem_info->outputs_);
|
||||
if (op_names_.count(op_name) != 0) {
|
||||
op_names_.clear();
|
||||
mem_info_stream_.str("");
|
||||
}
|
||||
op_names_.insert(op_name);
|
||||
mem_info_stream_ << op_name << std::endl;
|
||||
mem_info_stream_ << MemInfo2String("kernel_inputs", mem_info.inputs_);
|
||||
mem_info_stream_ << MemInfo2String("kernel_workspaces", mem_info.workspaces_);
|
||||
mem_info_stream_ << MemInfo2String("kernel_outputs", mem_info.outputs_);
|
||||
mem_info_stream_ << std::endl;
|
||||
}
|
||||
|
||||
void MemAddressRecorder::Export() {
|
||||
|
@ -69,19 +64,8 @@ void MemAddressRecorder::Export() {
|
|||
MS_LOG(WARNING) << "Open file for saving memory information failed. File path: '" << file_path << "'.";
|
||||
return;
|
||||
}
|
||||
MS_LOG(INFO) << "RDR export mem info.";
|
||||
std::ostringstream mem_info_stream;
|
||||
for (size_t i = 0; i < op_names_.size(); i++) {
|
||||
mem_info_stream << op_names_[i] << std::endl;
|
||||
auto inputs = mem_info_inputs_[i];
|
||||
mem_info_stream << MemInfo2String("kernel_inputs", inputs);
|
||||
auto workspaces = mem_info_workspaces_[i];
|
||||
mem_info_stream << MemInfo2String("kernel_workspaces", workspaces);
|
||||
auto outputs = mem_info_outputs_[i];
|
||||
mem_info_stream << MemInfo2String("kernel_outputs", outputs);
|
||||
mem_info_stream << std::endl;
|
||||
}
|
||||
fout << mem_info_stream.str();
|
||||
MS_LOG(INFO) << "RDR export device memory information.";
|
||||
fout << mem_info_stream_.str();
|
||||
fout.close();
|
||||
ChangeFileMode(file_path, S_IRUSR);
|
||||
}
|
||||
|
@ -90,9 +74,7 @@ void MemAddressRecorder::CleanUp() {
|
|||
std::lock_guard<std::mutex> lock(mtx_);
|
||||
MS_LOG(INFO) << "RDR clean up mem info, kernel size equals " << op_names_.size();
|
||||
op_names_.clear();
|
||||
mem_info_inputs_.clear();
|
||||
mem_info_workspaces_.clear();
|
||||
mem_info_outputs_.clear();
|
||||
printed = false;
|
||||
mem_info_stream_.str("");
|
||||
printed_ = false;
|
||||
}
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#include <vector>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
|
||||
|
@ -42,24 +43,20 @@ class MemAddressRecorder : public BaseRecorder {
|
|||
~MemAddressRecorder() {}
|
||||
|
||||
virtual void Export();
|
||||
void SaveMemInfo(const std::string &op_name, const MemInfo &mem_info, size_t id);
|
||||
void SaveMemInfo(const std::string &op_name, const kernel::KernelLaunchInfo *mem_info);
|
||||
void SaveMemInfo(const std::string &op_name, const kernel::KernelLaunchInfo &mem_info);
|
||||
|
||||
void Reset(size_t nsize) {
|
||||
op_names_.resize(nsize);
|
||||
mem_info_inputs_.resize(nsize);
|
||||
mem_info_workspaces_.resize(nsize);
|
||||
mem_info_outputs_.resize(nsize);
|
||||
void Reset() {
|
||||
op_names_.clear();
|
||||
mem_info_stream_.str("");
|
||||
}
|
||||
void CleanUp();
|
||||
|
||||
private:
|
||||
mutable std::mutex mtx_;
|
||||
bool printed{false};
|
||||
std::vector<std::string> op_names_;
|
||||
std::vector<AddressPtrList> mem_info_inputs_;
|
||||
std::vector<AddressPtrList> mem_info_workspaces_;
|
||||
std::vector<AddressPtrList> mem_info_outputs_;
|
||||
bool printed_{false};
|
||||
|
||||
std::set<std::string> op_names_;
|
||||
std::ostringstream mem_info_stream_;
|
||||
};
|
||||
using MemAddressRecorderPtr = std::shared_ptr<MemAddressRecorder>;
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -89,19 +89,19 @@ bool RecordStreamExecOrder(const SubModuleId module, const std::string &name, co
|
|||
return ans;
|
||||
}
|
||||
|
||||
bool RecordMemAddressInfo(const SubModuleId module, const std::string &name, size_t nsize) {
|
||||
bool RecordMemAddressInfo(const SubModuleId module, const std::string &name) {
|
||||
if (!mindspore::RecorderManager::Instance().RdrEnable()) {
|
||||
return false;
|
||||
}
|
||||
std::string submodule_name = std::string(GetSubModuleName(module));
|
||||
MemAddressRecorderPtr mem_info_recorder = std::make_shared<MemAddressRecorder>(submodule_name, name);
|
||||
mem_info_recorder->Reset(nsize);
|
||||
mem_info_recorder->Reset();
|
||||
bool ans = mindspore::RecorderManager::Instance().RecordObject(std::move(mem_info_recorder));
|
||||
return ans;
|
||||
}
|
||||
|
||||
bool UpdateMemAddress(const SubModuleId module, const std::string &name, const std::string &op_name,
|
||||
const MemInfo &mem_info, size_t id) {
|
||||
const kernel::KernelLaunchInfo &mem_info) {
|
||||
if (!mindspore::RecorderManager::Instance().RdrEnable()) {
|
||||
return false;
|
||||
}
|
||||
|
@ -110,7 +110,7 @@ bool UpdateMemAddress(const SubModuleId module, const std::string &name, const s
|
|||
bool ans = false;
|
||||
if (recorder != nullptr) {
|
||||
auto mem_recorder = std::dynamic_pointer_cast<MemAddressRecorder>(recorder);
|
||||
mem_recorder->SaveMemInfo(op_name, mem_info, id);
|
||||
mem_recorder->SaveMemInfo(op_name, mem_info);
|
||||
ans = true;
|
||||
}
|
||||
return ans;
|
||||
|
|
|
@ -30,6 +30,7 @@ using CNodePtr = std::shared_ptr<CNode>;
|
|||
|
||||
namespace kernel {
|
||||
class Address;
|
||||
struct KernelLaunchInfo;
|
||||
using AddressPtr = std::shared_ptr<Address>;
|
||||
} // namespace kernel
|
||||
using AddressPtrList = std::vector<kernel::AddressPtr>;
|
||||
|
@ -52,9 +53,9 @@ bool RecordGraphExecOrder(const SubModuleId module, const std::string &name,
|
|||
const std::vector<CNodePtr> &final_exec_order);
|
||||
bool RecordString(SubModuleId module, const std::string &name, const std::string &data);
|
||||
bool RecordStreamExecOrder(const SubModuleId module, const std::string &name, const std::vector<CNodePtr> &exec_order);
|
||||
bool RecordMemAddressInfo(const SubModuleId module, const std::string &name, size_t nsize);
|
||||
bool RecordMemAddressInfo(const SubModuleId module, const std::string &name);
|
||||
bool UpdateMemAddress(const SubModuleId module, const std::string &name, const std::string &op_name,
|
||||
const MemInfo &mem_info, size_t id);
|
||||
const kernel::KernelLaunchInfo &mem_info);
|
||||
#ifdef ENABLE_D
|
||||
bool RecordTaskDebugInfo(SubModuleId module, const std::string &name,
|
||||
const std::vector<TaskDebugInfoPtr> &task_debug_info_list);
|
||||
|
|
|
@ -420,8 +420,7 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool) {
|
|||
#endif
|
||||
#ifdef ENABLE_DUMP_IR
|
||||
std::string name = "mem_address_list";
|
||||
(void)mindspore::RDR::RecordMemAddressInfo(SubModuleId::SM_KERNEL, name, kernels.size());
|
||||
size_t id = 0;
|
||||
(void)mindspore::RDR::RecordMemAddressInfo(SubModuleId::SM_KERNEL, name);
|
||||
#endif
|
||||
for (const auto &kernel : kernels) {
|
||||
#ifdef ENABLE_PROFILE
|
||||
|
@ -458,9 +457,9 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool) {
|
|||
profiler_inst->OpDataProducerBegin(kernel->fullname_with_scope(), pid);
|
||||
}
|
||||
#ifdef ENABLE_DUMP_IR
|
||||
MemInfo mem_info = {&kernel_inputs, &kernel_workspaces, &kernel_outputs};
|
||||
kernel::KernelLaunchInfo mem_info = {kernel_inputs, kernel_workspaces, kernel_outputs};
|
||||
std::string op_name = kernel->fullname_with_scope();
|
||||
(void)mindspore::RDR::UpdateMemAddress(SubModuleId::SM_KERNEL, name, op_name, mem_info, id++);
|
||||
(void)mindspore::RDR::UpdateMemAddress(SubModuleId::SM_KERNEL, name, op_name, mem_info);
|
||||
#endif
|
||||
try {
|
||||
ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, 0);
|
||||
|
|
|
@ -731,8 +731,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
|
|||
int exec_order = 1;
|
||||
#ifdef ENABLE_DUMP_IR
|
||||
std::string name = "mem_address_list";
|
||||
(void)mindspore::RDR::RecordMemAddressInfo(SubModuleId::SM_KERNEL, name, kernels.size());
|
||||
size_t id = 0;
|
||||
(void)mindspore::RDR::RecordMemAddressInfo(SubModuleId::SM_KERNEL, name);
|
||||
#endif
|
||||
CNodePtr last_kernel = GetLastKernel(graph);
|
||||
for (const auto &kernel : kernels) {
|
||||
|
@ -769,9 +768,9 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
|
|||
return false;
|
||||
}
|
||||
#ifdef ENABLE_DUMP_IR
|
||||
MemInfo mem_info = {&kernel_inputs, &kernel_workspaces, &kernel_outputs};
|
||||
kernel::KernelLaunchInfo mem_info = {kernel_inputs, kernel_workspaces, kernel_outputs};
|
||||
std::string op_name = kernel->fullname_with_scope();
|
||||
(void)mindspore::RDR::UpdateMemAddress(SubModuleId::SM_KERNEL, name, op_name, mem_info, id++);
|
||||
(void)mindspore::RDR::UpdateMemAddress(SubModuleId::SM_KERNEL, name, op_name, mem_info);
|
||||
#endif
|
||||
if (!mock) {
|
||||
LaunchKernelWithoutMock(graph, kernel, kernel_inputs, kernel_workspaces, kernel_outputs, profiling);
|
||||
|
|
|
@ -19,7 +19,7 @@
|
|||
#include <utility>
|
||||
#ifdef ENABLE_DUMP_IR
|
||||
#include "debug/rdr/recorder_manager.h"
|
||||
#include "debug/rdr/mem_address_recorder.h"
|
||||
#include "debug/rdr/running_data_recorder.h"
|
||||
#endif
|
||||
#include "utils/log_adapter.h"
|
||||
|
||||
|
@ -36,30 +36,12 @@ void RecorderActor::RecordInfo(const std::string op_name, const KernelLaunchInfo
|
|||
MS_LOG(WARNING) << "GPU kernel's op_name is empty, do not record its memory address in RDR.";
|
||||
return;
|
||||
}
|
||||
// record GPU memory address info
|
||||
if (!RecorderManager::Instance().RdrEnable()) {
|
||||
return;
|
||||
}
|
||||
std::string name = "mem_address_list";
|
||||
if (!RecorderManager::Instance().CheckRdrMemIsRecord()) {
|
||||
std::string submodule_name = "KERNEL";
|
||||
auto mem_info_recorder = std::make_shared<MemAddressRecorder>(submodule_name, name);
|
||||
if (mem_info_recorder == nullptr) {
|
||||
MS_LOG(ERROR) << "Make MemAddressRecorder shared pointer failed.";
|
||||
return;
|
||||
}
|
||||
mem_info_recorder->SaveMemInfo(op_name, launch_info_);
|
||||
bool result = RecorderManager::Instance().RecordObject(std::move(mem_info_recorder));
|
||||
if (result) {
|
||||
RecorderManager::Instance().SetRdrMemIsRecord(true);
|
||||
}
|
||||
RDR::RecordMemAddressInfo(SUBMODULE_ID, name);
|
||||
RecorderManager::Instance().SetRdrMemIsRecord(true);
|
||||
} else {
|
||||
std::string submodule_name = "KERNEL";
|
||||
auto recorder = RecorderManager::Instance().GetRecorder(submodule_name, name);
|
||||
if (recorder != nullptr) {
|
||||
auto mem_recorder = std::dynamic_pointer_cast<MemAddressRecorder>(recorder);
|
||||
mem_recorder->SaveMemInfo(op_name, launch_info_);
|
||||
}
|
||||
RDR::UpdateMemAddress(SUBMODULE_ID, name, op_name, *launch_info_);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue