!17700 [MS][RDR] add gpu memory info recording method in actor framework

From: @louie5
Reviewed-by: 
Signed-off-by:
This commit is contained in:
mindspore-ci-bot 2021-06-07 09:26:03 +08:00 committed by Gitee
commit bb17250c97
10 changed files with 133 additions and 18 deletions

View File

@ -37,7 +37,6 @@ void GPUMemAddressRecorder::SaveMemInfo(const std::string &op_name, const GPUMem
if (op_names_.size() <= id) {
return;
}
std::lock_guard<std::mutex> lock(mtx_);
op_names_[id] = op_name;
mem_info_inputs_[id] = *(mem_info.inputs_);
@ -45,6 +44,18 @@ void GPUMemAddressRecorder::SaveMemInfo(const std::string &op_name, const GPUMem
mem_info_outputs_[id] = *(mem_info.outputs_);
}
// Appends one kernel's launch memory info to the recorder.
//
// op_name:  name stored alongside the address lists for this entry.
// mem_info: launch info whose inputs_, workspaces_ and outputs_ lists are copied into the
//           recorder. A null pointer is rejected with a warning and nothing is recorded.
//
// The four member vectors are appended to together so that index i refers to the same kernel
// in each of them. The first successful call after construction or CleanUp() logs one INFO line.
void GPUMemAddressRecorder::SaveMemInfo(const std::string &op_name, const kernel::KernelLaunchInfo *mem_info) {
  // mem_info is dereferenced below; refuse a null pointer instead of crashing the recorder.
  if (mem_info == nullptr) {
    MS_LOG(WARNING) << "GPU kernel launch info is null, skip recording memory info of op '" << op_name << "'.";
    return;
  }
  std::lock_guard<std::mutex> lock(mtx_);
  if (!printed) {
    MS_LOG(INFO) << "RDR update gpu mem info.";
    printed = true;
  }
  op_names_.emplace_back(op_name);
  mem_info_inputs_.emplace_back(mem_info->inputs_);
  mem_info_workspaces_.emplace_back(mem_info->workspaces_);
  mem_info_outputs_.emplace_back(mem_info->outputs_);
}
void GPUMemAddressRecorder::Export() {
auto realpath = GetFileRealPath();
if (!realpath.has_value()) {
@ -58,6 +69,7 @@ void GPUMemAddressRecorder::Export() {
MS_LOG(WARNING) << "Open file for saving gpu memory information failed. File path: '" << file_path << "'.";
return;
}
MS_LOG(INFO) << "RDR export gpu mem info.";
std::ostringstream mem_info_stream;
for (size_t i = 0; i < op_names_.size(); i++) {
mem_info_stream << op_names_[i] << std::endl;
@ -73,4 +85,14 @@ void GPUMemAddressRecorder::Export() {
fout.close();
ChangeFileMode(file_path, S_IRUSR);
}
// Discards every recorded kernel name and memory address list while holding mtx_, logs how many
// kernels were recorded, and resets the `printed` flag.
void GPUMemAddressRecorder::CleanUp() {
  std::lock_guard<std::mutex> guard(mtx_);
  // Read the count before the containers are emptied so the log reports what was dropped.
  const auto recorded_kernels = op_names_.size();
  MS_LOG(INFO) << "RDR clean up gpu mem info, kernel size equals " << recorded_kernels;
  printed = false;
  mem_info_outputs_.clear();
  mem_info_workspaces_.clear();
  mem_info_inputs_.clear();
  op_names_.clear();
}
} // namespace mindspore

View File

@ -26,6 +26,7 @@
namespace mindspore {
namespace kernel {
class Address;
struct KernelLaunchInfo;
using AddressPtr = std::shared_ptr<Address>;
} // namespace kernel
using AddressPtrList = std::vector<kernel::AddressPtr>;
@ -42,15 +43,19 @@ class GPUMemAddressRecorder : public BaseRecorder {
virtual void Export();
void SaveMemInfo(const std::string &op_name, const GPUMemInfo &mem_info, size_t id);
void SaveMemInfo(const std::string &op_name, const kernel::KernelLaunchInfo *mem_info);
// Resizes the four parallel record vectors to `nsize` entries so they can be filled by index.
// NOTE(review): this method does not take mtx_ itself — confirm callers do not race with SaveMemInfo.
void Reset(size_t nsize) {
  mem_info_outputs_.resize(nsize);
  mem_info_workspaces_.resize(nsize);
  mem_info_inputs_.resize(nsize);
  op_names_.resize(nsize);
}
void CleanUp();
private:
mutable std::mutex mtx_;
bool printed{false};
std::vector<std::string> op_names_;
std::vector<AddressPtrList> mem_info_inputs_;
std::vector<AddressPtrList> mem_info_workspaces_;

View File

@ -46,7 +46,6 @@ bool RecorderManager::RecordObject(const BaseRecorderPtr &recorder) {
if (!rdr_enable_) {
return false;
}
if (recorder == nullptr) {
MS_LOG(ERROR) << "Register recorder module with nullptr.";
return false;
@ -56,10 +55,15 @@ bool RecorderManager::RecordObject(const BaseRecorderPtr &recorder) {
std::pair<std::string, std::string> recorder_key(module, name);
std::lock_guard<std::mutex> lock(mtx_);
recorder_container_[recorder_key] = recorder;
MS_LOG(INFO) << "RDR record object " << name << " in module \"" << module << "\".";
return true;
}
BaseRecorderPtr RecorderManager::GetRecorder(std::string module, std::string name) {
if (!rdr_enable_) {
return nullptr;
}
std::lock_guard<std::mutex> lock(mtx_);
std::pair<std::string, std::string> recorder_key(module, name);
auto item = recorder_container_.find(recorder_key);
if (item != recorder_container_.end()) {
@ -68,11 +72,31 @@ BaseRecorderPtr RecorderManager::GetRecorder(std::string module, std::string nam
return nullptr;
}
// Returns the current value of rdr_enable_, read while holding mtx_.
bool RecorderManager::RdrEnable() const {
  std::lock_guard<std::mutex> guard(mtx_);
  const bool enabled = rdr_enable_;
  return enabled;
}
// Reports whether a GPU memory recorder has been flagged as registered.
//
// Returns false when RDR is disabled, otherwise the value of rdr_has_record_mem_.
// Both flags are read under mtx_, matching RdrEnable(), which also reads rdr_enable_ under the lock.
bool RecorderManager::CheckRdrGPUMemIsRecord() const {
  std::lock_guard<std::mutex> lock(mtx_);
  if (!rdr_enable_) {
    return false;
  }
  return rdr_has_record_mem_;
}
// Sets the flag that marks the GPU memory recorder as registered.
//
// is_enable: new value for rdr_has_record_mem_.
// The call is a no-op when RDR is disabled. rdr_enable_ is checked under mtx_, matching
// RdrEnable(), which also reads that flag under the lock.
void RecorderManager::SetRdrGPUMemIsRecord(bool is_enable) {
  std::lock_guard<std::mutex> lock(mtx_);
  if (!rdr_enable_) {
    return;
  }
  rdr_has_record_mem_ = is_enable;
}
void RecorderManager::TriggerAll() {
if (!rdr_enable_) {
return;
}
bool trigger = false;
std::lock_guard<std::mutex> lock(mtx_);
for (auto iter = recorder_container_.begin(); iter != recorder_container_.end(); ++iter) {
@ -81,11 +105,18 @@ void RecorderManager::TriggerAll() {
}
if (!trigger) {
MS_LOG(WARNING) << "There is no recorder to export.";
} else {
MS_LOG(INFO) << "RDR export all recorders' info.";
}
}
// Removes every registered recorder and resets the GPU-memory-recorded flag.
//
// The call is a no-op when RDR is disabled. rdr_enable_ is checked under mtx_, matching
// RdrEnable(), which also reads that flag under the lock.
void RecorderManager::ClearAll() {
  std::lock_guard<std::mutex> lock(mtx_);
  if (!rdr_enable_) {
    return;
  }
  recorder_container_.clear();
  rdr_has_record_mem_ = false;
  MS_LOG(INFO) << "RDR clear all recorders' info.";
}
} // namespace mindspore

View File

@ -61,7 +61,10 @@ class RecorderManager {
}
void UpdateRdrEnable();
bool RdrEnable() const { return rdr_enable_; }
bool RdrEnable() const;
bool CheckRdrGPUMemIsRecord() const;
void SetRdrGPUMemIsRecord(bool is_enable = true);
bool RecordObject(const BaseRecorderPtr &recorder);
BaseRecorderPtr GetRecorder(std::string module, std::string name);
void TriggerAll();
@ -72,6 +75,7 @@ class RecorderManager {
~RecorderManager() {}
bool rdr_enable_{false};
bool rdr_has_record_mem_{false};
mutable std::mutex mtx_;
// <module, name>, BaseRecorderPtr

View File

@ -180,8 +180,8 @@ void DeviceQueueDataSourceActor::SendResult(OpContext<DeviceTensor> *context) {
void DeviceQueueDataSourceActor::SendRecorderInfo(OpContext<DeviceTensor> *context) {
if (recorder_aid_ != nullptr) {
Async(*recorder_aid_, &RecorderActor::RecordMemAddressInfo, data_kernel_.get(), &launch_info_, device_context_,
context);
Async(*recorder_aid_, &RecorderActor::RecordInfo, data_kernel_->fullname_with_scope(), &launch_info_,
device_context_, context);
}
}

View File

@ -315,7 +315,8 @@ void KernelActor::SendOutput(OpContext<DeviceTensor> *context) const {
// Send recorder info.
if (recorder_aid_ != nullptr) {
Async(*recorder_aid_, &RecorderActor::RecordMemAddressInfo, kernel_.get(), &launch_info_, device_context_, context);
Async(*recorder_aid_, &RecorderActor::RecordInfo, kernel_->fullname_with_scope(), &launch_info_, device_context_,
context);
}
// No output.

View File

@ -79,7 +79,7 @@ void LoopCountActor::Execute(OpContext<DeviceTensor> *context) {
void LoopCountActor::SendOutput(OpContext<DeviceTensor> *context) {
// Send recorder info.
if (recorder_aid_ != nullptr) {
Async(*recorder_aid_, &RecorderActor::ClearMemAddressInfo, context);
Async(*recorder_aid_, &RecorderActor::RecordOnStepEnd, context);
}
// Send loop count to output actor.

View File

@ -15,20 +15,54 @@
*/
#include "runtime/framework/actor/recorder_actor.h"
#include <string>
#include <utility>
#include "debug/rdr/recorder_manager.h"
#include "debug/rdr/mem_address_recorder.h"
#include "utils/log_adapter.h"
namespace mindspore {
namespace runtime {
void RecorderActor::RecordMemAddressInfo(const AnfNode *node, const KernelLaunchInfo *launch_info_,
const DeviceContext *device_context, OpContext<DeviceTensor> *op_context) {
MS_EXCEPTION_IF_NULL(node);
// Records the launch memory info of one kernel into the RDR recorder registered under
// module "KERNEL" and name "mem_address_list".
//
// op_name:        kernel name stored with the record; an empty name is skipped with a warning.
// launch_info_:   launch info handed to GPUMemAddressRecorder::SaveMemInfo; must not be null.
// device_context: must not be null (validated only).
// op_context:     must not be null (validated only).
//
// Recording is compiled in only when ENABLE_DUMP_IR is defined and runs only when RDR is enabled.
// The first record creates and registers the recorder; later records append to the registered one.
void RecorderActor::RecordInfo(const std::string op_name, const KernelLaunchInfo *launch_info_,
                               const DeviceContext *device_context, OpContext<DeviceTensor> *op_context) {
  MS_EXCEPTION_IF_NULL(launch_info_);
  MS_EXCEPTION_IF_NULL(device_context);
  MS_EXCEPTION_IF_NULL(op_context);
#ifdef ENABLE_DUMP_IR
  if (op_name.empty()) {
    MS_LOG(WARNING) << "GPU kernel's op_name is empty, do not record its memory address in RDR.";
    return;
  }
  // record GPU memory address info
  if (!RecorderManager::Instance().RdrEnable()) {
    return;
  }
  std::string submodule_name = "KERNEL";
  std::string name = "mem_address_list";
  if (!RecorderManager::Instance().CheckRdrGPUMemIsRecord()) {
    // No recorder registered yet: create one, store this kernel, then register it.
    auto mem_info_recorder = std::make_shared<GPUMemAddressRecorder>(submodule_name, name);
    if (mem_info_recorder == nullptr) {
      MS_LOG(ERROR) << "Make GPUMemAddressRecorder shared pointer failed.";
      return;
    }
    mem_info_recorder->SaveMemInfo(op_name, launch_info_);
    bool result = RecorderManager::Instance().RecordObject(std::move(mem_info_recorder));
    if (result) {
      RecorderManager::Instance().SetRdrGPUMemIsRecord(true);
    }
  } else {
    auto recorder = RecorderManager::Instance().GetRecorder(submodule_name, name);
    if (recorder != nullptr) {
      // The downcast yields nullptr if the registered object is not a GPUMemAddressRecorder;
      // check it before use instead of dereferencing a null pointer.
      auto mem_recorder = std::dynamic_pointer_cast<GPUMemAddressRecorder>(recorder);
      if (mem_recorder == nullptr) {
        MS_LOG(ERROR) << "The recorder '" << name << "' in module \"" << submodule_name
                      << "\" is not a GPUMemAddressRecorder, skip recording op '" << op_name << "'.";
        return;
      }
      mem_recorder->SaveMemInfo(op_name, launch_info_);
    }
  }
#endif
}
void RecorderActor::ClearMemAddressInfo(OpContext<DeviceTensor> *op_context) {
// Step-end hook of the recorder actor.
// Currently it only validates that op_context is non-null; the clearing work is still a todo.
void RecorderActor::RecordOnStepEnd(OpContext<DeviceTensor> *op_context) {
MS_EXCEPTION_IF_NULL(op_context);
// todo clear
}

View File

@ -18,6 +18,7 @@
#define MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_RECORDER_ACTOR_H_
#include <memory>
#include <string>
#include "runtime/framework/actor/actor_common.h"
#include "runtime/framework/device_tensor_store.h"
#include "runtime/hardware/device_context.h"
@ -34,11 +35,11 @@ class RecorderActor : public ActorBase {
~RecorderActor() override = default;
// The memory recorder of each node.
void RecordMemAddressInfo(const AnfNode *node, const KernelLaunchInfo *launch_info_,
const DeviceContext *device_context, OpContext<DeviceTensor> *op_context);
void RecordInfo(const std::string op_name, const KernelLaunchInfo *launch_info_, const DeviceContext *device_context,
OpContext<DeviceTensor> *op_context);
// Clear memory recorder at the step end.
void ClearMemAddressInfo(OpContext<DeviceTensor> *op_context);
void RecordOnStepEnd(OpContext<DeviceTensor> *op_context);
};
} // namespace runtime
} // namespace mindspore

View File

@ -35,6 +35,8 @@
#include "profiler/device/gpu/gpu_profiling_utils.h"
#include "backend/session/kernel_graph.h"
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "debug/rdr/recorder_manager.h"
#include "debug/rdr/mem_address_recorder.h"
namespace mindspore {
namespace device {
@ -364,7 +366,6 @@ bool GPUDeviceContext::LaunchKernelWithProfiling(const CNodePtr &kernel, const s
if (profiler_inst->GetSyncEnableFlag()) {
CHECK_RET_WITH_RETURN_ERROR(SyncStream(), "Profiler SyncStream failed.");
}
return ret;
}
@ -380,7 +381,23 @@ bool GPUDeviceContext::SyncStream(size_t stream_id) const {
if (stream_id >= streams_.size()) {
MS_LOG(EXCEPTION) << "The stream_id: " << stream_id << " is greater than stream array size: " << streams_.size();
}
return GPUDeviceManager::GetInstance().SyncStream(streams_[stream_id]);
bool result = GPUDeviceManager::GetInstance().SyncStream(streams_[stream_id]);
#ifdef ENABLE_DUMP_IR
if (!result) {
RecorderManager::Instance().TriggerAll();
}
// clear RDR gpu memory info
if (RecorderManager::Instance().CheckRdrGPUMemIsRecord()) {
std::string name = "mem_address_list";
std::string submodule_name = "KERNEL";
auto recorder = RecorderManager::Instance().GetRecorder(submodule_name, name);
if (recorder != nullptr) {
auto mem_recorder = std::dynamic_pointer_cast<GPUMemAddressRecorder>(recorder);
mem_recorder->CleanUp();
}
}
#endif
return result;
}
std::shared_ptr<Bucket> GPUDeviceContext::CreateBucket(uint32_t bucket_id, uint32_t bucket_size) const {