!23405 Add remaining isolation for dump

Merge pull request !23405 from sabrinasun_59ee/macro
Authored by i-robot on 2021-09-15 14:16:02 +00:00; committed via Gitee
commit 3dcea69172
15 changed files with 79 additions and 9 deletions
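
The change applies one pattern across all 15 files: every dump-related include, helper definition, and call site is wrapped in #ifndef ENABLE_SECURITY, so builds configured with ENABLE_SECURITY compile the dump path out entirely while normal builds keep the existing behaviour. The following minimal sketch is not part of the diff; InitDumpForDemo and RunGraphForDemo are hypothetical stand-ins for the guarded MindSpore functions. It shows the guard applied consistently to both a definition and its call site:

#include <cstdint>
#include <iostream>

#ifndef ENABLE_SECURITY
// Only declared and defined when security mode is off at build time.
void InitDumpForDemo(uint32_t rank_id) {
  std::cout << "dump initialized for rank " << rank_id << std::endl;
}
#endif

void RunGraphForDemo(uint32_t rank_id) {
#ifndef ENABLE_SECURITY
  // Call sites carry the same guard, so nothing references the
  // compiled-out definition when ENABLE_SECURITY is defined.
  InitDumpForDemo(rank_id);
#endif
  std::cout << "graph executed for rank " << rank_id << std::endl;
}

int main() {
  RunGraphForDemo(0);
  return 0;
}

Compiling this sketch with -DENABLE_SECURITY removes both the helper and its call; compiling without the macro keeps the dump path, which mirrors how the guards below isolate DumpJsonParser, E2eDump, and the session/runtime dump hooks.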


@ -21,7 +21,9 @@
#include <memory>
#include "runtime/device/ascend/ge_runtime/task_info.h"
#include "backend/kernel_compiler/kernel.h"
#ifndef ENABLE_SECURITY
#include "debug/data_dump/dump_json_parser.h"
#endif
using TaskInfoPtr = std::shared_ptr<mindspore::ge::model_runner::TaskInfo>;
namespace mindspore {


@ -55,8 +55,10 @@
#include "backend/optimizer/common/helper.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "utils/config_manager.h"
#ifndef ENABLE_SECURITY
#include "debug/data_dump/dump_json_parser.h"
#include "debug/data_dump/e2e_dump.h"
#endif
#include "debug/anf_ir_utils.h"
#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
#include "backend/session/ascend_auto_monad.h"
@ -99,6 +101,7 @@ constexpr char SR_TAG[] = "sr_tag";
constexpr char BACKWARD[] = "backward";
constexpr auto kUnknowErrorString = "Unknown error occurred";
namespace {
#ifndef ENABLE_SECURITY
void DumpGraphExeOrder(const std::vector<CNodePtr> &execution_order, const std::string &tag = "") {
MS_LOG(INFO) << "Dump execution_order size " << execution_order.size();
MS_LOG(INFO) << "[index][stream_label][graph_id][node string]";
@ -129,6 +132,7 @@ void DumpGraphExeOrder(const std::vector<CNodePtr> &execution_order, const std::
}
buf << "================== execution order ==================\n";
}
#endif
bool IsVMGraphTaskSink() {
auto ms_context = MsContext::GetInstance();
@ -1153,6 +1157,7 @@ void AscendSession::SelectKernel(const KernelGraph &kernel_graph) const {
MS_LOG(INFO) << "Finish!";
}
#ifndef ENABLE_SECURITY
void DumpInit(uint32_t device_id) {
auto &json_parser = DumpJsonParser::GetInstance();
json_parser.Parse();
@ -1165,6 +1170,7 @@ void DumpInit(uint32_t device_id) {
}
}
}
#endif
void AscendSession::InitRuntimeResource() {
MS_LOG(INFO) << "Start!";
@ -1180,7 +1186,9 @@ void AscendSession::InitRuntimeResource() {
// get actual rank id if it's distribution training case.
rank_id_ = GetRankId();
}
#ifndef ENABLE_SECURITY
DumpInit(rank_id_);
#endif
MS_LOG(INFO) << "Finish!";
}
@ -1470,11 +1478,15 @@ void AscendSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bo
auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
if (is_task && is_task_sink) {
#ifndef ENABLE_SECURITY
DumpSetup(kernel_graph);
#endif
}
bool ret_ok = runtime_instance->Run(kernel_graph.get(), is_task_sink);
if (is_task && is_task_sink) {
#ifndef ENABLE_SECURITY
Dump(kernel_graph);
#endif
}
if (!ret_ok) {
#ifdef ENABLE_DUMP_IR
@ -1485,6 +1497,7 @@ void AscendSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bo
MS_LOG(DEBUG) << "Finish!";
}
#ifndef ENABLE_SECURITY
void AscendSession::DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph) const {
MS_LOG(DEBUG) << "Start!";
MS_EXCEPTION_IF_NULL(kernel_graph);
@ -1498,6 +1511,7 @@ void AscendSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const
E2eDump::DumpData(kernel_graph.get(), rank_id_);
MS_LOG(DEBUG) << "Finish!";
}
#endif
void AscendSession::DumpAllGraphs(const std::vector<KernelGraphPtr> &all_graphs) {
#ifdef ENABLE_DUMP_IR
@ -1636,7 +1650,9 @@ void AscendSession::MergeGraphExecOrder() {
}
// set final_exec_order into final graph
MS_EXCEPTION_IF_NULL(final_graph);
#ifndef ENABLE_SECURITY
DumpGraphExeOrder(final_exec_order);
#endif
final_graph->set_execution_order(final_exec_order);
}


@ -109,8 +109,10 @@ class AscendSession : public SessionBasic {
void RunOpGenKernelEvent(const KernelGraph *graph) const;
void Load(const std::shared_ptr<KernelGraph> &kernel_graph) const;
void Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bool is_task) const;
#ifndef ENABLE_SECURITY
void Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const;
void DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph) const;
#endif
void DumpAllGraphs(const std::vector<KernelGraphPtr> &all_graphs);
void LoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const;
// below functions are used for run op


@ -33,7 +33,9 @@
#include "backend/optimizer/pass/erase_visit_attr.h"
#include "debug/anf_ir_dump.h"
#include "debug/dump_proto.h"
#ifndef ENABLE_SECURITY
#include "debug/data_dump/dump_json_parser.h"
#endif
#if ((defined ENABLE_CPU) && (!defined _WIN32))
#include "ps/util.h"
#include "ps/ps_context.h"


@ -31,7 +31,9 @@
#include "abstract/utils.h"
#include "utils/utils.h"
#include "common/trans.h"
#ifndef ENABLE_SECURITY
#include "debug/data_dump/dump_json_parser.h"
#endif
#ifdef ENABLE_DEBUGGER
#include "debug/tensor_load.h"
#endif
@ -507,7 +509,9 @@ bool AscendDeviceAddress::DumpMemToFile(const std::string &filepath, const std::
MS_LOG(ERROR) << "Copy device mem to host failed";
return ret;
}
#ifndef ENABLE_SECURITY
ret = DumpJsonParser::DumpToFile(path, out_tensor->data_c(), host_size, host_shape, host_type);
#endif
} else {
auto host_tmp = std::vector<uint8_t>(size_);
auto ret_rt_memcpy = rtMemcpy(host_tmp.data(), size_, ptr_, size_, RT_MEMCPY_DEVICE_TO_HOST);
@ -516,7 +520,9 @@ bool AscendDeviceAddress::DumpMemToFile(const std::string &filepath, const std::
}
std::string path = filepath + '.' + format_;
MS_LOG(INFO) << "E2E Dump path is " << path;
#ifndef ENABLE_SECURITY
ret = DumpJsonParser::DumpToFile(path, host_tmp.data(), size_, host_shape, type_id_);
#endif
}
return ret;


@ -20,7 +20,6 @@
#include <utility>
#include <algorithm>
#include "utils/signal_util.h"
#include "debug/data_dump/e2e_dump.h"
#include "runtime/device/ascend/ascend_device_address.h"
#include "runtime/device/ascend/distribute/ascend_collective.h"
#include "utils/ms_context.h"
@ -39,7 +38,10 @@
#endif
#include "runtime/device/ascend/ascend_memory_manager.h"
#include "runtime/device/ascend/ascend_event.h"
#ifndef ENABLE_SECURITY
#include "debug/data_dump/dump_json_parser.h"
#include "debug/data_dump/e2e_dump.h"
#endif
#include "toolchain/adx_datadump_server.h"
#include "utils/trace_base.h"
#include "graphengine/inc/external/acl/error_codes/rt_error_codes.h"
@ -226,6 +228,7 @@ bool AscendKernelRuntime::NeedDestroyHccl() {
return true;
}
#ifndef ENABLE_SECURITY
void AsyncDataDumpUninit() {
if (DumpJsonParser::GetInstance().async_dump_enabled()) {
if (AdxDataDumpServerUnInit() != 0) {
@ -234,7 +237,6 @@ void AsyncDataDumpUninit() {
}
}
#ifndef ENABLE_SECURITY
void AscendKernelRuntime::ReportProfilingData() {
auto context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context);
@ -268,7 +270,9 @@ void AscendKernelRuntime::ReleaseDeviceRes() {
// release ge runtime
ClearGraphModelMap();
#ifndef ENABLE_SECURITY
AsyncDataDumpUninit();
#endif
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
@ -382,6 +386,7 @@ bool AscendKernelRuntime::LoadData(mindspore::session::KernelGraph *graph) {
bool AscendKernelRuntime::KernelMemNotReuse(const AnfNodePtr &node) {
MS_EXCEPTION_IF_NULL(node);
bool need_dump = false;
#ifndef ENABLE_SECURITY
auto &dump_json_parser = DumpJsonParser::GetInstance();
if (dump_json_parser.e2e_dump_enabled() && dump_json_parser.dump_mode() == 1) {
auto op_name = node->fullname_with_scope();
@ -389,6 +394,7 @@ bool AscendKernelRuntime::KernelMemNotReuse(const AnfNodePtr &node) {
need_dump = true;
}
}
#endif
return need_dump;
}
@ -447,14 +453,18 @@ bool AscendKernelRuntime::GenTask(const session::KernelGraph *graph) {
if (ConfigManager::GetInstance().dataset_mode() == DS_SINK_MODE && (ConfigManager::GetInstance().iter_num() > 1)) {
MS_LOG(EXCEPTION) << "Dynamic shape is not supported with dataset_sink_mode.";
}
#ifndef ENABLE_SECURITY
if (DumpJsonParser::GetInstance().async_dump_enabled()) {
MS_LOG(EXCEPTION) << "Dynamic shape is not supported with Asynchronous Dump. Please use Synchronous Dump.";
}
#endif
MS_LOG(INFO) << "Dynamic Shape Graph Generate Dynamic kernel";
return GenDynamicKernel(graph);
}
MS_LOG(INFO) << "GenTask start. GraphId:" << graph->graph_id();
#ifndef ENABLE_SECURITY
DumpJsonParser::GetInstance().UpdateNeedDumpKernels(NOT_NULL(graph));
#endif
#ifdef MEM_REUSE_DEBUG
if (!EnvConfigParser::GetInstance().GetSysMemreuse()) {
// Get normal graph ir for memreuse
@ -656,6 +666,7 @@ std::string AscendKernelRuntime::GetDumpPath() {
return path;
}
#ifndef ENABLE_SECURITY
void AscendKernelRuntime::DumpTaskExceptionInfo(const session::KernelGraph *graph) {
MS_EXCEPTION_IF_NULL(graph);
const std::string path = GetDumpPath();
@ -680,6 +691,7 @@ void AscendKernelRuntime::DumpTaskExceptionInfo(const session::KernelGraph *grap
E2eDump::DumpOutputImpl(node, false, path, &full_scope_name, nullptr);
}
}
#endif
bool AscendKernelRuntime::Run(session::KernelGraph *const graph, bool is_task_sink) {
const uint64_t kUSecondInSecond = 1000000;
@ -947,7 +959,9 @@ bool AscendKernelRuntime::RunTask(const session::KernelGraph *graph) {
try {
ModelRunner::Instance().RunModel(graph->graph_id());
} catch (const std::exception &) {
#ifndef ENABLE_SECURITY
DumpTaskExceptionInfo(graph);
#endif
#ifdef ENABLE_TDTQUE
// Run task error, we should call TdtHostDestroy to release tdt to avoid DeviceQueueOp hostPush hung
// case1: cpu usage 100% cause thread/process exit, but some tdt thread remain in backend


@ -102,7 +102,9 @@ class AscendKernelRuntime : public KernelRuntime {
#endif
static CNodePtr GetErrorNodeName(uint32_t streamid, uint32_t taskid);
static std::string GetDumpPath();
#ifndef ENABLE_SECURITY
static void DumpTaskExceptionInfo(const session::KernelGraph *graph);
#endif
static void TaskFailCallback(rtExceptionInfo *task_fail_info);
static bool DeleteDumpDir(const std::string &path);
static int DeleteDumpFile(std::string path);


@ -29,7 +29,9 @@
#include "proto/op_mapping_info.pb.h"
#include "utils/comm_manager.h"
#include "utils/ms_context.h"
#ifndef ENABLE_SECURITY
#include "debug/data_dump/dump_json_parser.h"
#endif
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
@ -67,6 +69,7 @@ DataDumper::~DataDumper() {
ReleaseDevMem(&op_debug_dump_args_);
}
#ifndef ENABLE_SECURITY
void DataDumper::GetNeedDumpKernelList(NotNull<std::map<std::string, CNodePtr> *> kernel_map) const {
for (const auto &kernel : kernel_graph_->execution_order()) {
if (AnfAlgo::GetKernelType(kernel) == HCCL_KERNEL &&
@ -196,6 +199,7 @@ bool DataDumper::KernelNeedDump(const CNodePtr &kernel) const {
// dump all kernel if mode is set 0 in data_dump.json
return DumpJsonParser::GetInstance().NeedDump(kernel->fullname_with_scope());
}
#endif
void DataDumper::UnloadDumpInfo() {
if (!load_flag_) {
@ -246,7 +250,9 @@ void DataDumper::ConstructDumpTask(NotNull<const CNodePtr &> kernel, NotNull<aic
MS_EXCEPTION_IF_NULL(iter->second);
auto task_id = std::get<kTupleTaskId>(*iter->second);
auto stream_id = std::get<kTupleStreamId>(*iter->second);
#ifndef ENABLE_SECURITY
auto args = std::get<kTupleArgs>(*iter->second);
#endif
MS_LOG(INFO) << "[DataDump] Get runtime info task_id:" << task_id << " stream_id:" << stream_id;
dump_task->set_task_id(task_id);
@ -255,8 +261,10 @@ void DataDumper::ConstructDumpTask(NotNull<const CNodePtr &> kernel, NotNull<aic
dump_task->mutable_op()->set_op_name(kernel->fullname_with_scope());
dump_task->mutable_op()->set_op_type(AnfAlgo::GetCNodeName(kernel.get()));
#ifndef ENABLE_SECURITY
DumpKernelOutput(kernel, args, dump_task);
DumpKernelInput(kernel, args, dump_task);
#endif
}
void DataDumper::SetOpDebugMappingInfo(const NotNull<aicpu::dump::OpMappingInfo *> dump_info) const {
@ -287,6 +295,7 @@ void DataDumper::SetOpDebugMappingInfo(const NotNull<aicpu::dump::OpMappingInfo
dump_info->mutable_task()->Add(std::move(task));
}
#ifndef ENABLE_SECURITY
void DataDumper::OpDebugRegister() {
uint32_t op_debug_mode = DumpJsonParser::GetInstance().op_debug_mode();
auto iter = kOverflowModeStr.find(op_debug_mode);
@ -336,6 +345,7 @@ void DataDumper::OpDebugUnregister() {
MS_LOG(EXCEPTION) << "[DataDump] Call rtDebugUnRegister failed, ret = " << rt_ret;
}
}
#endif
void DataDumper::RtLoadDumpData(const aicpu::dump::OpMappingInfo &dump_info, void **ptr) {
std::string proto_str;
@ -372,6 +382,7 @@ void SetDumpShape(const std::vector<size_t> &ms_shape, NotNull<aicpu::dump::Shap
}
}
#ifndef ENABLE_SECURITY
void DataDumper::DumpKernelOutput(const CNodePtr &kernel, void *args, NotNull<aicpu::dump::Task *> task) {
if (!DumpJsonParser::GetInstance().OutputNeedDump()) {
MS_LOG(INFO) << "Skip dump output";
@ -452,6 +463,7 @@ void DataDumper::DumpKernelInput(const CNodePtr &kernel, void *args, NotNull<aic
offset += sizeof(void *);
}
}
#endif
std::string DataDumper::StripUniqueId(const std::string node_name) {
size_t last_underscore = node_name.find_last_of('_');


@ -53,20 +53,26 @@ class DataDumper {
void set_runtime_info(const std::map<std::string, std::shared_ptr<RuntimeInfo>> &runtime_info) {
runtime_info_map_ = runtime_info;
}
#ifndef ENABLE_SECURITY
void LoadDumpInfo();
void UnloadDumpInfo();
void OpDebugRegister();
void OpDebugUnregister();
#endif
void UnloadDumpInfo();
private:
void ReleaseDevMem(void **ptr) const noexcept;
#ifndef ENABLE_SECURITY
bool KernelNeedDump(const CNodePtr &kernel) const;
void SetOpMappingInfo(NotNull<aicpu::dump::OpMappingInfo *> dump_info) const;
#endif
void SetOpDebugMappingInfo(const NotNull<aicpu::dump::OpMappingInfo *> dump_info) const;
void ConstructDumpTask(NotNull<const CNodePtr &> kernel, NotNull<aicpu::dump::Task *> dump_task) const;
#ifndef ENABLE_SECURITY
void GetNeedDumpKernelList(NotNull<std::map<std::string, CNodePtr> *> kernel_map) const;
static void DumpKernelOutput(const CNodePtr &kernel, void *args, NotNull<aicpu::dump::Task *> task);
static void DumpKernelInput(const CNodePtr &kernel, void *args, NotNull<aicpu::dump::Task *> task);
#endif
static std::string StripUniqueId(const std::string node_name);
static void RtLoadDumpData(const aicpu::dump::OpMappingInfo &dump_info, void **ptr);


@ -18,7 +18,9 @@
#include <memory>
#include "runtime/device/convert_tensor_utils.h"
#include "runtime/hardware/cpu/cpu_memory_pool.h"
#ifndef ENABLE_SECURITY
#include "debug/data_dump/dump_json_parser.h"
#endif
namespace mindspore {
namespace device {


@ -33,7 +33,9 @@
#include "debug/anf_ir_dump.h"
#include "debug/rdr/running_data_recorder.h"
#endif
#ifndef ENABLE_SECURITY
#include "debug/data_dump/dump_json_parser.h"
#endif
namespace mindspore {
namespace runtime {


@ -30,7 +30,9 @@
#if !defined(_WIN32) && !defined(_WIN64)
#include "utils/signal_util.h"
#endif
#ifndef ENABLE_SECURITY
#include "debug/data_dump/dump_json_parser.h"
#endif
#ifdef ENABLE_DUMP_IR
#include "debug/rdr/recorder_manager.h"
#endif
@ -246,12 +248,15 @@ void GraphScheduler::BuildAndScheduleGlobalActor() {
(void)actorMgr->Spawn(base_recorder_actor, true);
// Create and schedule debug actor.
#ifndef ENABLE_SECURITY
bool debugger_actor_need = DumpJsonParser::GetInstance().e2e_dump_enabled();
#endif
#ifdef ENABLE_DEBUGGER
if (Debugger::GetInstance()->DebuggerBackendEnabled()) {
debugger_actor_need = true;
}
#endif
#ifndef ENABLE_SECURITY
if (debugger_actor_need) {
auto debug_actor = std::make_shared<DebugActor>();
MS_EXCEPTION_IF_NULL(debug_actor);
@ -259,6 +264,7 @@ void GraphScheduler::BuildAndScheduleGlobalActor() {
auto base_debug_actor = static_cast<ActorReference>(debug_actor);
(void)actorMgr->Spawn(base_debug_actor, true);
}
#endif
}
ActorSet *GraphScheduler::Transform(const GraphCompilerInfo &graph_compiler_info) {


@ -30,7 +30,9 @@
#include "backend/optimizer/pass/replace_node_by_proxy.h"
#include "backend/optimizer/pass/erase_visit_attr.h"
#include "profiler/device/cpu/cpu_profiling.h"
#ifndef ENABLE_SECURITY
#include "debug/data_dump/dump_json_parser.h"
#endif
namespace mindspore {
namespace device {


@ -16,11 +16,6 @@ This module provides APIs to load and process dump data, i.e. read tensors, chec
for watchpoints and other debugging services.
"""
from mindspore._c_expression import security
from . import dbg_services
from . import mi_validator_helpers
from . import mi_validators
if security.enable_security():
raise ModuleNotFoundError("Offline debugger is not supported in security mode."\
"Please recompile mindspore without `-s on`.")


@ -17,12 +17,13 @@ The module DbgServices provides offline debugger APIs.
"""
from mindspore._c_expression import security
import mindspore._mindspore_offline_debug as cds
from mindspore.offline_debug.mi_validators import check_init, check_initialize, check_add_watchpoint,\
check_remove_watchpoint, check_check_watchpoints, check_read_tensor_info, check_initialize_done, \
check_tensor_info_init, check_tensor_data_init, check_tensor_base_data_init, check_tensor_stat_data_init,\
check_watchpoint_hit_init, check_parameter_init
from mindspore.offline_debug.mi_validator_helpers import replace_minus_one
if not security.enable_security():
import mindspore._mindspore_offline_debug as cds
def get_version():
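
The two Python modules above isolate dump support with a runtime check rather than a preprocessor guard: mindspore.offline_debug refuses to load when MindSpore was built in security mode (-s on), and dbg_services only imports the offline-debug bindings when security is off. A rough sketch of how calling code might probe this before using the offline debugger (import paths as shown above; the fallback handling is illustrative only, not part of this change):

from mindspore._c_expression import security

if security.enable_security():
    # Built with `-s on`: dump and offline-debug services are unavailable,
    # and importing mindspore.offline_debug would raise ModuleNotFoundError.
    dbg_services = None
else:
    # Non-security build: the offline debugger package loads normally.
    from mindspore.offline_debug import dbg_services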