!48325 [Kernel Dump] Delete dump files of non-overflow operators in kernel-by-kernel mode

Merge pull request !48325 from maoyaomin/mym_debugger_kernel_dumper
This commit is contained in:
i-robot 2023-02-07 01:36:56 +00:00 committed by Gitee
commit 6f24f09301
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
10 changed files with 147 additions and 35 deletions

View File

@@ -14,6 +14,7 @@
* limitations under the License.
*/
#include "debug/data_dump/dump_utils.h"
#include <dirent.h>
#include <map>
#include <vector>
#include <algorithm>
@@ -174,4 +175,81 @@ void DumpToFile(const std::string &file_name, const std::string &dump_str) {
file.close();
ChangeFileMode(real_path_str, S_IRUSR);
}
void RemoveEmptyDir(const std::string &dir_path) {
uint32_t dir_count = 0;
DIR *d = opendir(dir_path.c_str());
if (d == nullptr) {
MS_LOG(WARNING) << "Open dir failed, dir path is:" << dir_path;
return;
}
struct dirent *dir = nullptr;
while ((dir = readdir(d)) != nullptr) {
std::string name = dir->d_name;
if (name == "." || name == "..") {
continue;
} else {
dir_count++;
}
}
(void)closedir(d);
if (dir_count == 0) {
auto ret = remove(dir_path.c_str());
if (ret == 0) {
MS_LOG(INFO) << "Delete empty dir successfully, dir path is:" << dir_path;
}
}
}
void SaveOverflowOperator(const std::string &iterator, const std::string &dump_rank_path) {
const std::string overflow_dump_dir = "debug_files";
const std::string overflow_file_prefix = "Opdebug.Node_OpDebug.";
const std::string cur_step_overflow_path = dump_rank_path + "/" + overflow_dump_dir + "/" + iterator;
DIR *d = opendir(cur_step_overflow_path.c_str());
overflowOperators.clear();
if (d == nullptr) {
MS_LOG(WARNING) << "Overflow file directory does not exist!";
} else {
struct dirent *dir = nullptr;
while ((dir = readdir(d)) != nullptr) {
std::string filename = dir->d_name;
if (filename.find(overflow_file_prefix) != std::string::npos) {
uint32_t pos_start = overflow_file_prefix.size();
uint32_t n = filename.rfind(".") - pos_start + 2;
std::string stream_task_name = filename.substr(pos_start - 1, n);
overflowOperators.emplace_back(stream_task_name);
}
}
(void)closedir(d);
}
}
void DeleteNoOverflowFile(uint32_t rank_id, uint32_t graph_id) {
auto &json_parser = DumpJsonParser::GetInstance();
if (!(json_parser.async_dump_enabled() || json_parser.e2e_dump_enabled())) {
return;
}
std::string cur_dump_path = json_parser.path() + "/rank_" + std::to_string(rank_id);
std::string net_name_ = json_parser.net_name();
std::string iterator = std::to_string(json_parser.cur_dump_iter());
SaveOverflowOperator(iterator, cur_dump_path);
std::string overflow_operator_dump_path =
cur_dump_path + "/" + net_name_ + "/" + std::to_string(graph_id) + "/" + iterator;
DIR *d = opendir(overflow_operator_dump_path.c_str());
if (d == nullptr) {
MS_LOG(WARNING) << "Overflow iterator file directory does not exist!";
} else {
struct dirent *dir = nullptr;
while ((dir = readdir(d)) != nullptr) {
std::string filename = dir->d_name;
bool is_exist =
std::any_of(std::begin(overflowOperators), std::end(overflowOperators),
[&](std::string stream_task_str) { return filename.find(stream_task_str) != std::string::npos; });
if (!is_exist) {
auto ret = remove((overflow_operator_dump_path + "/" + filename).c_str());
if (ret == 0) {
MS_LOG(INFO) << "Delete file successfully, filename is:" << filename.c_str();
}
}
}
(void)closedir(d);
RemoveEmptyDir(overflow_operator_dump_path);
}
}
} // namespace mindspore
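
The deletion above keys on a substring match: SaveOverflowOperator keeps the ".<task_id>.<stream_id>." slice of each overflow record name, and DeleteNoOverflowFile removes every dump file whose name contains none of those slices. Below is a minimal, self-contained sketch of that rule; the example file names are assumptions for illustration only, the real names come from the Ascend dump output.

#include <iostream>
#include <string>
#include <vector>

int main() {
  const std::string prefix = "Opdebug.Node_OpDebug.";
  // Assumed overflow record name: prefix + "<task_id>.<stream_id>.<timestamp>".
  const std::string overflow_file = "Opdebug.Node_OpDebug.268.75.1675760189745600";

  // Same arithmetic as SaveOverflowOperator: take the ".<task_id>.<stream_id>."
  // slice, dots included, so it can only match that exact task/stream pair.
  const size_t pos_start = prefix.size();
  const size_t n = overflow_file.rfind('.') - pos_start + 2;
  const std::string stream_task = overflow_file.substr(pos_start - 1, n);  // ".268.75."

  // Assumed operator dump names: "<op_type>.<op_name>.<task_id>.<stream_id>.<timestamp>".
  const std::vector<std::string> dump_files = {
      "Add.Default_Add-op1.268.75.1675760189745601",         // overflowed -> kept
      "MatMul.Default_MatMul-op2.269.75.1675760189745602"};  // no overflow -> deleted

  for (const auto &file : dump_files) {
    const bool keep = file.find(stream_task) != std::string::npos;
    std::cout << file << (keep ? "  -> keep" : "  -> delete") << std::endl;
  }
  return 0;
}

Keeping the dots on both ends of the slice avoids partial matches, e.g. a record for task 268 would otherwise also match files of task 26 on the same stream.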

View File

@@ -18,6 +18,7 @@
#define MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_DUMP_UTILS_H_
#include <map>
#include <vector>
#include <string>
#include <memory>
@@ -30,6 +31,7 @@ using DeviceTensorPtr = std::shared_ptr<DeviceTensor>;
namespace mindspore {
static const size_t PARAMETER_OUTPUT_INDEX = 0;
static const size_t VALUE_NODE_OUTPUT_INDEX = 0;
static std::vector<std::string> overflowOperators;
std::string GenerateDumpPath(uint32_t graph_id, uint32_t rank_id = 0, bool is_cst = false);
@@ -43,6 +45,12 @@ const DeviceTensorPtr GetParameterInfo(const AnfNodePtr &node, NotNull<ShapeVect
void DumpMemToFile(const std::string &file_path, const device::DeviceAddress &addr, const ShapeVector &int_shapes,
const TypeId &type, bool trans_flag = false);
void RemoveEmptyDir(const std::string &dir_path);
void SaveOverflowOperator(const std::string &iterator, const std::string &dump_rank_path);
void DeleteNoOverflowFile(uint32_t rank_id, uint32_t graph_id);
BACKEND_EXPORT std::string GetOpNameWithoutScope(const std::string &fullname_with_scope,
const std::string &separator = "--");

View File

@@ -658,7 +658,8 @@ bool E2eDump::DumpSingleNodeData(const CNodePtr &node, uint32_t graph_id, uint32
void E2eDump::DumpParametersData(uint32_t rank_id, const Debugger *debugger) {
uint32_t root_graph_id = debugger->GetCurrentRootGraphId();
auto &dump_json_parser = DumpJsonParser::GetInstance();
if (dump_json_parser.async_dump_enabled() && !debugger->GetAscendKernelByKernelFlag()) {
if ((dump_json_parser.async_dump_enabled() && !debugger->GetAscendKernelByKernelFlag()) ||
(dump_json_parser.async_dump_enabled() && dump_json_parser.op_debug_mode() > 0)) {
// Dump parameters for mindRT in async dump only for kernel by kernel mode.
return;
}
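
Since async_dump_enabled() appears on both sides of the widened condition, the guard reads as a single predicate: skip the parameter dump when async dump runs either outside kernel-by-kernel mode or with an overflow op_debug_mode. A hedged restating as a free function, with illustrative parameter names:

#include <cstdint>

// Sketch only: the early-return guard above with the common factor pulled out.
// The arguments mirror async_dump_enabled(), GetAscendKernelByKernelFlag() and
// op_debug_mode() from the surrounding code.
bool SkipParameterDump(bool async_dump_enabled, bool kernel_by_kernel, uint32_t op_debug_mode) {
  return async_dump_enabled && (!kernel_by_kernel || op_debug_mode > 0);
}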

View File

@@ -28,6 +28,8 @@ std::map<std::string, std::shared_ptr<OverflowDumper>> &OverflowDumper::GetInsta
return instance_map;
}
void OverflowDumper::Clear() { GetInstanceMap().clear(); }
std::shared_ptr<OverflowDumper> OverflowDumper::GetInstance(const std::string &name) noexcept {
if (auto iter = GetInstanceMap().find(name); iter != GetInstanceMap().end()) {
return iter->second;

View File

@@ -34,6 +34,7 @@ class BACKEND_EXPORT OverflowDumper {
static std::shared_ptr<OverflowDumper> GetInstance(const std::string &name) noexcept;
static bool Register(const std::string &name, const std::shared_ptr<OverflowDumper> &instance);
static void Clear();
virtual void OpLoadDumpInfo(const CNodePtr &kernel) = 0;
virtual void Init() = 0;
virtual void OpDebugRegisterForStream(const CNodePtr &kernel) = 0;

View File

@@ -200,8 +200,10 @@ void AscendKernelRuntime::ClearGraphModelMap() {
graph_data_dumper_.clear();
// tell users which dump kernel name not used
DumpJsonParser::GetInstance().PrintUnusedKernel();
KernelDumper kernel_dumper;
kernel_dumper.OpDebugUnregisterForStream();
if (DumpJsonParser::GetInstance().async_dump_enabled()) {
KernelDumper kernel_dumper;
kernel_dumper.OpDebugUnregisterForStream();
}
#endif
graph_kernel_events_map_.clear();

View File

@@ -436,18 +436,14 @@ void KernelDumper::OpDebugRegisterForStream(const CNodePtr &kernel) {
}
void KernelDumper::OpDebugUnregisterForStream() {
uint32_t op_debug_mode = DumpJsonParser::GetInstance().op_debug_mode();
if (op_debug_mode == kNoOverflow) {
return;
}
for (auto iter = KernelDumper::op_debug_tasks.begin(); iter != KernelDumper::op_debug_tasks.end(); iter++) {
rtError_t rt_ret = rtDebugUnRegisterForStream(iter->first);
if (rt_ret != RT_ERROR_NONE) {
MS_LOG(EXCEPTION) << "[KernelDumperr] Call rtDebugUnRegisterForStream failed, ret = " << rt_ret;
MS_LOG(EXCEPTION) << "[KernelDumper] Call rtDebugUnRegisterForStream failed, ret = " << rt_ret;
}
}
KernelDumper::op_debug_tasks.clear();
OverflowDumper::Clear();
}
#endif
} // namespace ascend

View File

@@ -21,7 +21,6 @@
#include <set>
#include <memory>
#include <string>
#include <vector>
#include <functional>
#include "runtime/dev.h"
#include "runtime/mem.h"
@@ -88,7 +87,7 @@ class KernelDumper : public debug::OverflowDumper {
bool is_op_debug_;
uint32_t op_debug_mode_;
void *dev_load_mem_;
void *dev_load_mem_ = nullptr;
void *proto_dev_mem_ = nullptr;
void *proto_size_dev_mem_ = nullptr;
std::mutex register_mutex_;

View File

@@ -176,31 +176,42 @@ void DebugActor::DebugOnStepBegin(const std::vector<KernelGraphPtr> &graphs,
}
}
if (DumpJsonParser::GetInstance().async_dump_enabled()) {
auto kCurLoopCountName = "current_loop_count";
for (size_t i = 0; i < graphs.size(); i++) {
const auto &graph_ = graphs[i];
if (device_contexts[i]->GetDeviceType() != device::DeviceType::kAscend) {
continue;
}
auto device_loop_control_tensors = graph_->device_loop_control_tensors();
if (device_loop_control_tensors.count(kCurLoopCountName) == 0) {
MS_LOG(WARNING) << "Can't find Device Loop Control Tensor " << kCurLoopCountName;
return;
}
auto tensor = device_loop_control_tensors.at(kCurLoopCountName);
MS_EXCEPTION_IF_NULL(tensor);
auto *cur_val = static_cast<int32_t *>(tensor->data_c());
MS_EXCEPTION_IF_NULL(cur_val);
*cur_val = current_step;
tensor->set_sync_status(kNeedSyncHostToDevice);
auto device_address = tensor->device_address();
MS_EXCEPTION_IF_NULL(device_address);
if (!device_address->SyncHostToDevice(tensor->shape(), LongToSize(tensor->data().nbytes()), tensor->data_type(),
tensor->data_c(), tensor->device_info().host_format_)) {
MS_LOG(EXCEPTION) << "SyncHostToDevice failed for device loop control parameter " << kCurLoopCountName;
}
bool is_data_map_ = false;
if (graphs.size() == 1) {
const auto &graph_ = graphs[0];
KernelGraphPtr kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(graph_);
const auto kernels = kernel_graph->execution_order();
is_data_map_ = std::any_of(kernels.cbegin(), kernels.cend(), [](const auto &kernel) {
return kernel->fullname_with_scope().find("InitDataSetQueue") != std::string::npos;
});
}
if (!is_data_map_) {
auto kCurLoopCountName = "current_loop_count";
for (size_t i = 0; i < graphs.size(); i++) {
const auto &graph_ = graphs[i];
if (device_contexts[i]->GetDeviceType() != device::DeviceType::kAscend) {
continue;
}
auto device_loop_control_tensors = graph_->device_loop_control_tensors();
if (device_loop_control_tensors.count(kCurLoopCountName) == 0) {
MS_LOG(WARNING) << "Can't find Device Loop Control Tensor " << kCurLoopCountName;
return;
}
auto tensor = device_loop_control_tensors.at(kCurLoopCountName);
MS_EXCEPTION_IF_NULL(tensor);
auto *cur_val = static_cast<int32_t *>(tensor->data_c());
MS_EXCEPTION_IF_NULL(cur_val);
*cur_val = current_step;
tensor->set_sync_status(kNeedSyncHostToDevice);
auto device_address = tensor->device_address();
MS_EXCEPTION_IF_NULL(device_address);
if (!device_address->SyncHostToDevice(tensor->shape(), LongToSize(tensor->data().nbytes()), tensor->data_type(),
tensor->data_c(), tensor->device_info().host_format_)) {
MS_LOG(EXCEPTION) << "SyncHostToDevice failed for device loop control parameter " << kCurLoopCountName;
}
}
current_step++;
}
current_step++;
}
#endif
}
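
The new is_data_map_ flag skips the loop-count update when the single graph only initializes the data queue; the detection is a name scan over the graph's execution order. A self-contained sketch of that predicate, with plain strings standing in for the kernel nodes (an assumption for illustration):

#include <algorithm>
#include <string>
#include <vector>

// Sketch: mirrors the std::any_of scan above; each entry stands in for
// kernel->fullname_with_scope().
bool IsDataMapGraph(const std::vector<std::string> &kernel_full_names) {
  return std::any_of(kernel_full_names.cbegin(), kernel_full_names.cend(), [](const std::string &name) {
    return name.find("InitDataSetQueue") != std::string::npos;
  });
}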
@@ -224,6 +235,17 @@ void DebugActor::DebugOnStepEnd(OpContext<DeviceTensor> *const op_context, const
}
#endif
#ifdef ENABLE_DEBUGGER
#ifndef ENABLE_SECURITY
if (DumpJsonParser::GetInstance().async_dump_enabled() && DumpJsonParser::GetInstance().op_debug_mode() > 0 &&
Debugger::GetInstance()->GetAscendKernelByKernelFlag()) {
uint32_t rank_id = Debugger::GetRankID();
uint32_t graph_id = Debugger::GetInstance()->GetCurrentRootGraphId();
DeleteNoOverflowFile(rank_id, graph_id);
}
#endif
#endif
#ifdef ENABLE_DEBUGGER
auto debugger = Debugger::GetInstance();
if (debugger != nullptr) {

View File

@@ -22,6 +22,9 @@
#include "runtime/graph_scheduler/actor/actor_common.h"
#include "runtime/graph_scheduler/device_tensor_store.h"
#include "runtime/hardware/device_context.h"
#ifdef ENABLE_DEBUGGER
#include "debug/data_dump/dump_utils.h"
#endif
namespace mindspore {
namespace runtime {