Support recording GPU memory information and graph execution order

This commit is contained in:
louei5 2021-02-25 19:03:39 +08:00
parent 50542793c8
commit 99203038a5
9 changed files with 189 additions and 22 deletions

View File

@ -9,18 +9,10 @@ set(_DEBUG_SRC_LIST
)
if(ENABLE_DUMP_IR)
if(ENABLE_D)
list(APPEND _DEBUG_SRC_LIST "${CMAKE_CURRENT_SOURCE_DIR}/rdr/task_debug_info_recorder.cc")
file(GLOB_RECURSE _RDR_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "rdr/*.cc")
if(NOT ENABLE_D)
list(REMOVE_ITEM _RDR_SRC_LIST "rdr/task_debug_info_recorder.cc")
endif()
list(APPEND _DEBUG_SRC_LIST
"${CMAKE_CURRENT_SOURCE_DIR}/rdr/base_recorder.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/rdr/graph_exec_order_recorder.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/rdr/graph_recorder.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/rdr/stream_exec_order_recorder.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/rdr/string_recorder.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/rdr/recorder_manager.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/rdr/running_data_recorder.cc"
)
endif()
if(ENABLE_DEBUGGER)
@ -38,5 +30,6 @@ if(NOT CMAKE_SYSTEM_NAME MATCHES "Windows")
list(APPEND _DEBUG_SRC_LIST "data_dump/e2e_dump_util.cc")
endif()
set_property(SOURCE ${_DEBUG_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_DEBUG)
add_library(_mindspore_debug_obj OBJECT ${_DEBUG_SRC_LIST})
set_property(SOURCE ${_DEBUG_SRC_LIST} ${_RDR_SRC_LIST} PROPERTY COMPILE_DEFINITIONS
SUBMODULE_ID=mindspore::SubModuleId::SM_DEBUG)
add_library(_mindspore_debug_obj OBJECT ${_DEBUG_SRC_LIST} ${_RDR_SRC_LIST})

View File

@ -38,16 +38,20 @@ void BaseRecorder::SetFilename(const std::string &filename) {
}
std::optional<std::string> BaseRecorder::GetFileRealPath(const std::string &suffix) {
std::string filename;
if (filename_.empty()) {
filename_ = module_ + delimiter_ + tag_;
filename = module_ + delimiter_ + tag_;
if (!suffix.empty()) {
filename_ += delimiter_ + suffix;
filename += delimiter_ + suffix;
}
filename += delimiter_ + timestamp_;
} else {
filename = filename_;
if (!suffix.empty()) {
filename = filename_ + delimiter_ + suffix;
}
filename_ += delimiter_ + timestamp_;
} else if (!suffix.empty()) {
filename_ += delimiter_ + suffix;
}
std::string file_path = directory_ + filename_;
std::string file_path = directory_ + filename;
auto realpath = Common::GetRealPath(file_path);
if (!realpath.has_value()) {
MS_LOG(ERROR) << "Get real path failed. "

View File

@ -19,10 +19,12 @@
#include "mindspore/core/ir/anf.h"
#include "mindspore/core/utils/log_adapter.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "utils/utils.h"
namespace mindspore {
namespace {
bool DumpGraphExeOrder(const std::string &filename, const std::vector<CNodePtr> &execution_order) {
ChangeFileMode(filename, S_IRWXU);
std::ofstream fout(filename, std::ofstream::app);
if (!fout.is_open()) {
MS_LOG(WARNING) << "Open file for saving graph exec order failed.";
@ -41,6 +43,7 @@ bool DumpGraphExeOrder(const std::string &filename, const std::vector<CNodePtr>
}
fout << "================== execution order ==================\n";
fout.close();
ChangeFileMode(filename, S_IRUSR);
return true;
}
} // namespace

View File

@ -0,0 +1,69 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "debug/rdr/mem_address_recorder.h"
#include <fstream>
#include <sstream>
#include <utility>
#include "backend/kernel_compiler/kernel.h"
namespace mindspore {
namespace {
std::string MemInfo2String(const std::string &label, const AddressPtrList &info) {
std::ostringstream ss;
ss << label << " " << info.size() << std::endl;
for (size_t i = 0; i < info.size(); i++) {
ss << "&" << info[i]->addr << " #" << info[i]->size << std::endl;
}
return ss.str();
}
} // namespace
// Returns the single shared recorder; the object is a function-local static, so it is
// constructed the first time this function is called.
MemAddressRecorder &MemAddressRecorder::Instance() {
  static MemAddressRecorder instance;
  return instance;
}
void MemAddressRecorder::SaveMemInfo(const std::string &op_name, const GPUMemInfo &mem_info) {
std::lock_guard<std::mutex> lock(mtx_);
std::ostringstream mem_info_stream;
auto inputs = mem_info.inputs_;
mem_info_stream << op_name << std::endl;
mem_info_stream << MemInfo2String("kernel_inputs", *inputs);
auto workspaces = mem_info.workspaces_;
mem_info_stream << MemInfo2String("kernel_workspaces", *workspaces);
auto outputs = mem_info.outputs_;
mem_info_stream << MemInfo2String("kernel_outputs", *outputs);
mem_info_stream << std::endl;
mem_info_str_ += mem_info_stream.str();
}
// Writes the accumulated memory text to "<real path>.txt".
// Does nothing when GetFileRealPath() yields no path. The file is made owner-accessible
// (S_IRWXU) before it is opened and switched to owner-read-only (S_IRUSR) after writing.
void MemAddressRecorder::Export() {
  const auto resolved = GetFileRealPath();
  if (!resolved.has_value()) {
    return;
  }

  std::lock_guard<std::mutex> guard(mtx_);
  const std::string file_path = resolved.value() + ".txt";
  ChangeFileMode(file_path, S_IRWXU);

  std::ofstream out_file(file_path);
  if (!out_file.is_open()) {
    MS_LOG(WARNING) << "Open file for saving gpu memory information failed. File path: '" << file_path << "'.";
    return;
  }
  out_file << mem_info_str_;
  out_file.close();

  ChangeFileMode(file_path, S_IRUSR);
}
} // namespace mindspore

View File

@ -0,0 +1,53 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_DEBUG_RDR_MEM_ADDRESS_RECORDER_H_
#define MINDSPORE_CCSRC_DEBUG_RDR_MEM_ADDRESS_RECORDER_H_
#include <vector>
#include <string>
#include <memory>
#include <mutex>
#include "debug/rdr/base_recorder.h"
namespace mindspore {
namespace kernel {
// Forward declaration; only pointers to Address are needed in this header.
class Address;
using AddressPtr = std::shared_ptr<Address>;
}  // namespace kernel
using AddressPtrList = std::vector<kernel::AddressPtr>;

// Bundle of the three address lists of one kernel launch.
// The members are raw pointers to lists held elsewhere; this struct does not manage their
// lifetime. They default to nullptr so that a default-constructed object never carries
// indeterminate pointer values.
struct GPUMemInfo {
  AddressPtrList *inputs_{nullptr};
  AddressPtrList *workspaces_{nullptr};
  AddressPtrList *outputs_{nullptr};
};
// Recorder that accumulates per-kernel memory address text (SaveMemInfo) and writes it out
// on Export(). It is reached through Instance(); construction is private and copying is
// disabled.
class MemAddressRecorder : public BaseRecorder {
 public:
  // Returns the shared recorder instance.
  static MemAddressRecorder &Instance();
  // Writes the accumulated text to a file.
  // NOTE(review): presumably overrides BaseRecorder::Export - confirm and mark `override` if so.
  virtual void Export();
  // Appends the address lists of the kernel named `op_name` to the accumulated text.
  void SaveMemInfo(const std::string &op_name, const GPUMemInfo &mem_info);
  // Stores `tag` in tag_.
  void SetTag(const std::string &tag) { tag_ = tag; }

 private:
  MemAddressRecorder() {}
  // Copying is disabled. `= delete` reports an accidental copy at compile time, whereas a
  // declared-but-undefined private member only fails at link time when used inside the class.
  MemAddressRecorder(const MemAddressRecorder &recorder) = delete;
  MemAddressRecorder &operator=(const MemAddressRecorder &recorder) = delete;

  mutable std::mutex mtx_;    // Held while mem_info_str_ is appended to or written out.
  std::string mem_info_str_;  // Text accumulated by SaveMemInfo().
};
using MemAddressRecorderPtr = std::shared_ptr<MemAddressRecorder>;
} // namespace mindspore
#endif // MINDSPORE_CCSRC_DEBUG_RDR_MEM_ADDRESS_RECORDER_H_

View File

@ -78,5 +78,8 @@ void RecorderManager::TriggerAll() {
}
}
void RecorderManager::ClearAll() { recorder_container_.clear(); }
// Empties recorder_container_ while holding mtx_.
void RecorderManager::ClearAll() {
  std::lock_guard<std::mutex> guard(mtx_);
  recorder_container_.clear();
}
} // namespace mindspore

View File

@ -20,8 +20,10 @@
#include "debug/rdr/recorder_manager.h"
#include "debug/rdr/string_recorder.h"
#include "debug/rdr/stream_exec_order_recorder.h"
#include "debug/rdr/mem_address_recorder.h"
#include "mindspore/core/ir/func_graph.h"
#include "mindspore/core/ir/anf.h"
#include "backend/kernel_compiler/kernel.h"
#ifdef ENABLE_D
#include "runtime/device/ascend/tasksink/task_generator.h"
#include "debug/rdr/task_debug_info_recorder.h"
@ -123,7 +125,21 @@ bool RecordStreamExecOrder(const SubModuleId module, const std::string &tag, con
return ans;
}
void TriggerAll() { mindspore::RecorderManager::Instance().TriggerAll(); }
bool RecordMemAddressInfo(const SubModuleId module, const std::string &tag, const std::string &op_name,
const GPUMemInfo &mem_info) {
if (!mindspore::RecorderManager::Instance().RdrEnable()) {
return false;
}
std::string submodule_name = std::string(GetSubModuleName(module));
MemAddressRecorder::Instance().SetModule(submodule_name);
MemAddressRecorder::Instance().SetTag(tag);
MemAddressRecorder::Instance().SaveMemInfo(op_name, mem_info);
return true;
}
void TriggerAll() {
mindspore::RecorderManager::Instance().TriggerAll();
MemAddressRecorder::Instance().Export();
}
void ClearAll() { mindspore::RecorderManager::Instance().ClearAll(); }

View File

@ -25,6 +25,13 @@ class FuncGraph;
class CNode;
using FuncGraphPtr = std::shared_ptr<FuncGraph>;
using CNodePtr = std::shared_ptr<CNode>;
namespace kernel {
class Address;
using AddressPtr = std::shared_ptr<Address>;
} // namespace kernel
using AddressPtrList = std::vector<kernel::AddressPtr>;
struct GPUMemInfo;
#ifdef ENABLE_D
namespace device {
namespace ascend {
@ -35,6 +42,7 @@ class TaskDebugInfo;
} // namespace device
using TaskDebugInfoPtr = std::shared_ptr<device::ascend::tasksink::TaskDebugInfo>;
#endif // ENABLE_D
namespace RDR {
bool RecordAnfGraph(const SubModuleId module, const std::string &tag, const FuncGraphPtr &graph, bool full_name,
const std::string &file_type = ".ir;.pb;.dat");
@ -44,6 +52,8 @@ bool RecordString(SubModuleId module, const std::string &tag, const std::string
const std::string &filename = "");
bool RecordStreamExecOrder(const SubModuleId module, const std::string &tag, const int &graph_id,
const std::vector<CNodePtr> &exec_order);
bool RecordMemAddressInfo(const SubModuleId module, const std::string &tag, const std::string &op_name,
const GPUMemInfo &mem_info);
#ifdef ENABLE_D
bool RecordTaskDebugInfo(SubModuleId module, const std::string &tag,
const std::vector<TaskDebugInfoPtr> &task_debug_info_list, int graph_id = 0);

View File

@ -40,6 +40,10 @@
#ifdef ENABLE_DEBUGGER
#include "debug/debug_services.h"
#endif
#ifdef ENABLE_DUMP_IR
#include "debug/rdr/running_data_recorder.h"
#include "debug/rdr/mem_address_recorder.h"
#endif
namespace mindspore {
namespace device {
@ -51,6 +55,9 @@ static const size_t PARAMETER_OUTPUT_INDEX = 0;
bool GPUKernelRuntime::SyncStream() {
if (!GPUDeviceManager::GetInstance().SyncStream(stream_)) {
#ifdef ENABLE_DUMP_IR
mindspore::RDR::TriggerAll();
#endif
MS_LOG(ERROR) << "Call SyncStream error.";
return false;
}
@ -637,7 +644,10 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
}
auto &kernels = graph->execution_order();
int exec_order = 1;
#ifdef ENABLE_DUMP_IR
std::string exec_order_tag = "graph_exec_order";
mindspore::RDR::RecordGraphExecOrder(SubModuleId::SM_KERNEL, exec_order_tag, kernels, graph->graph_id());
#endif
auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance();
MS_EXCEPTION_IF_NULL(profiler_inst);
@ -678,6 +688,12 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
}
return false;
}
#ifdef ENABLE_DUMP_IR
GPUMemInfo mem_info = {&kernel_inputs, &kernel_workspaces, &kernel_outputs};
std::string tag = "mem_address_list";
std::string op_name = kernel->fullname_with_scope();
mindspore::RDR::RecordMemAddressInfo(SubModuleId::SM_KERNEL, tag, op_name, mem_info);
#endif
if (!mock) {
if (!profiling) {
if (profiler_inst->GetEnableFlag()) {