!6907 Load input tensors in debugger before suspending execution

Merge pull request !6907 from Harshvardhan Gupta/load-input-dbg
mindspore-ci-bot 2020-10-08 21:50:45 +08:00 committed by Gitee
commit 9c79b9d712
13 changed files with 107 additions and 140 deletions
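In short, the change moves the debugger's suspension point: `PreExecute` is now invoked in the session `RunGraph` paths (Ascend and GPU) after `LoadInputData`, so user input tensors are already on the device when execution suspends, and parameter loading is centralized in a new `Debugger::LoadParameters()` that runs before the graph is sent to the client. Below is a minimal standalone sketch of that ordering; all types here are mocks for illustration only, not MindSpore code.

```cpp
// Mock sketch of the call order this change establishes: copy user input to
// the device first, then let the debugger load tensors and suspend, so input
// values are visible to the client at the first breakpoint.
#include <iostream>
#include <vector>

struct MockGraph { std::vector<int> inputs; };

struct MockDebugger {
  // Stand-in for Debugger::PreExecute: load tensors, then suspend (sketch only).
  void PreExecute(const MockGraph &g) {
    std::cout << "debugger: loaded " << g.inputs.size()
              << " input tensor(s), suspending for client\n";
  }
};

void LoadInputData(MockGraph *g, const std::vector<int> &user_inputs) {
  g->inputs = user_inputs;  // stand-in for the host-to-device copy
}

void RunGraph(MockGraph *g, MockDebugger *debugger, const std::vector<int> &inputs) {
  LoadInputData(g, inputs);                 // inputs reach the device first
  if (debugger) debugger->PreExecute(*g);   // only then does execution suspend
}

int main() {
  MockGraph graph;
  MockDebugger dbg;
  RunGraph(&graph, &dbg, {1, 2, 3});
}
```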

View File

@ -171,7 +171,7 @@ GraphId AscendSession::CompileGraph(NotNull<FuncGraphPtr> func_graph) {
device::KernelAdjust::GetInstance().Profiling(NOT_NULL(root_graph.get()));
// build kernel
BuildKernel(root_graph);
if (debugger_) {
if (debugger_ && debugger_->partial_memory()) {
debugger_->PreExecute(root_graph);
}
SetSummaryNodes(root_graph.get());
@ -248,7 +248,7 @@ void AscendSession::BuildGraph(GraphId graph_id) {
BuildKernel(graph);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
if (debugger_) {
if (debugger_ && debugger_->partial_memory()) {
debugger_->PreExecute(graph);
}
if (ms_context->get_param<bool>(MS_CTX_PRECOMPILE_ONLY)) {
@ -312,6 +312,9 @@ void AscendSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::
}
// load input data from user input
LoadInputData(kernel_graph, inputs);
if (debugger_) {
debugger_->PreExecute(kernel_graph);
}
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
// Initialize parameter server
InitPSParamAndOptim(kernel_graph, inputs);

View File

@ -278,9 +278,9 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList
void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
auto &kernel_graph = graphs_[graph_id];
PreIterationDbg(kernel_graph);
// Load input data from user input
LoadInputData(kernel_graph, inputs);
PreIterationDbg(kernel_graph);
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
// Initialize parameter server
InitPSParamAndOptim(kernel_graph, inputs);

View File

@ -22,7 +22,6 @@
#include <utility>
#include <memory>
#include <map>
#include "backend/session/session_context.h"
#include "backend/session/kernel_graph.h"
#include "backend/session/anf_runtime_algorithm.h"

View File

@ -30,6 +30,7 @@
#include "pipeline/jit/pipeline.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "runtime/device/kernel_runtime.h"
using debugger::EventReply;
using debugger::GraphProto;
@ -47,6 +48,7 @@ namespace mindspore {
DebuggerPtr Debugger::debugger_ = nullptr;
std::mutex Debugger::instance_lock_;
static const size_t PRAMATER_OUTPUT_INDEX = 0;
Debugger::Debugger()
: grpc_client_(nullptr),
@ -62,7 +64,26 @@ Debugger::Debugger()
is_dataset_graph_(false),
partial_memory_(false),
last_overflow_bin_(0),
overflow_bin_path_("") {}
overflow_bin_path_("") {
if (CheckDebuggerEnabled()) {
// configure partial memory reuse
partial_memory_ = CheckDebuggerPartialMemoryEnabled();
// switch memory reuse on or off
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
context_ptr->set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, partial_memory_);
// print some message about memory reuse to user
if (partial_memory_) {
MS_LOG(WARNING)
<< "Partial Memory Reuse is enabled. Note: 1. Please only set watchpoints before running the first "
"step. 2. Tensor values are only available for nodes that are watched by any watchpoint.";
} else {
MS_LOG(INFO) << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory "
"usage for large models.";
}
}
}
void Debugger::Init(const uint32_t device_id, const std::string device_target) {
// access lock for public method
@ -133,27 +154,6 @@ void Debugger::EnableDebugger() {
MS_LOG(INFO) << "Environment variable MS_DEBUGGER_PORT doesn't exist. Using default debugger port: 50051";
port = "50051";
}
// configure partial memory reuse
const char *env_partial_mem_str = std::getenv("MS_DEBUGGER_PARTIAL_MEM");
if (env_partial_mem_str != nullptr) {
MS_LOG(INFO) << "Getenv MS_DEBUGGER_PARTIAL_MEM: " << env_partial_mem_str;
if (std::strcmp(env_partial_mem_str, "1") == 0) {
partial_memory_ = true;
}
}
// switch memory reuse on or off
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
context_ptr->set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, partial_memory_);
// print some message about memory reuse to user
if (partial_memory_) {
MS_LOG(WARNING) << "Partial Memory Reuse is enabled. Note: 1. Please only set watchpoints before running the first "
"step. 2. Tensor values are only available for nodes that are watched by any watchpoint.";
} else {
MS_LOG(INFO) << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory "
"usage for large models.";
}
#ifdef ENABLE_D
// set operation overflow info
overflow_bin_path_ = DumpJsonParser::GetInstance().GetOpOverflowBinPath(graph_ptr_->graph_id(), device_id_);
@ -195,9 +195,7 @@ void Debugger::EnableDebugger() {
bool Debugger::CheckDebuggerDumpEnabled() {
// see if dump is enabled
if (device_target_ == kGPUDevice) {
auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
return runtime_instance->DumpDataEnabled();
return device::KernelRuntime::DumpDataEnabled();
}
return false;
}
@ -213,6 +211,17 @@ bool Debugger::CheckDebuggerEnabled() {
return false;
}
bool Debugger::CheckDebuggerPartialMemoryEnabled() {
const char *env_partial_mem_str = std::getenv("MS_DEBUGGER_PARTIAL_MEM");
if (env_partial_mem_str != nullptr) {
MS_LOG(INFO) << "Getenv MS_DEBUGGER_PARTIAL_MEM: " << env_partial_mem_str;
if (std::strcmp(env_partial_mem_str, "1") == 0) {
return true;
}
}
return false;
}
bool Debugger::DebuggerBackendEnabled() { return CheckDebuggerDumpEnabled() || CheckDebuggerEnabled(); }
void Debugger::Reset() {
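The hunks above factor the `MS_DEBUGGER_PARTIAL_MEM` environment-variable check out of `EnableDebugger()` into the constructor via the new `CheckDebuggerPartialMemoryEnabled()` helper, so memory reuse is decided before graph compilation. A standalone sketch of that check (simplified; the real helper also logs through `MS_LOG`):

```cpp
// Minimal sketch of the MS_DEBUGGER_PARTIAL_MEM check added above.
#include <cstdlib>
#include <cstring>
#include <iostream>

bool PartialMemoryRequested() {
  const char *env = std::getenv("MS_DEBUGGER_PARTIAL_MEM");
  return env != nullptr && std::strcmp(env, "1") == 0;
}

int main() {
  std::cout << "partial memory reuse: "
            << (PartialMemoryRequested() ? "enabled" : "disabled") << '\n';
}
```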
@ -324,6 +333,7 @@ void Debugger::CheckGraphPtr(const KernelGraphPtr &graph_ptr) {
// only try to enable debugger if it is not a dataset graph
EnableDebugger();
if (debugger_enabled_) {
LoadParameters();
// get graph proto and send to mindinsight
SendGraphAndSuspend(GetGraphProto());
}
@ -839,4 +849,34 @@ bool Debugger::CheckPort(const char *port) {
return true;
}
void Debugger::LoadParameters() {
if (!(debugger_enabled_ || CheckDebuggerDumpEnabled())) return;
if (!(num_step_ == 0 || device_target_ == kAscendDevice ||
(device_target_ == kGPUDevice && device::KernelRuntime::DumpDataEnabledIteration())))
return;
MS_EXCEPTION_IF_NULL(graph_ptr_);
const auto &parameters = graph_ptr_->inputs();
// for parameters, set its execution order to be 0;
int exec_order = 0;
for (auto &item : parameters) {
if (!item->isa<Parameter>()) {
continue;
}
std::string parameter_name = item->fullname_with_scope();
auto addr = AnfAlgo::GetOutputAddr(item, PRAMATER_OUTPUT_INDEX);
auto type = AnfAlgo::GetOutputInferDataType(item, PRAMATER_OUTPUT_INDEX);
auto format = kOpFormat_DEFAULT;
string tensor_name = parameter_name + ':' + "0";
ShapeVector int_shapes;
auto shape = AnfAlgo::GetOutputDeviceShape(item, PRAMATER_OUTPUT_INDEX);
(void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
[](size_t inner_item) { return SizeToInt(inner_item); });
bool ret = addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, true);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
}
}
}
} // namespace mindspore
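The new `Debugger::LoadParameters()` above walks the graph inputs, publishes each parameter under a `"<fullname>:0"` tensor name, converts the device shape to signed integers, and hands the tensor to `LoadMemToHost`. A standalone sketch of the naming and shape-conversion pattern (hypothetical parameter name and shape, not MindSpore code):

```cpp
// Sketch of the "<name>:0" naming and size_t-to-int shape conversion used in
// Debugger::LoadParameters. The parameter name and shape are made up.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

int main() {
  std::string parameter_name = "conv1.weight";              // hypothetical parameter
  std::vector<std::size_t> device_shape = {64, 3, 7, 7};    // hypothetical device shape

  // Parameters are published under "<fullname>:0" (output slot 0).
  std::string tensor_name = parameter_name + ":0";

  // Device shapes are size_t; the debugger side expects signed ints.
  std::vector<int> int_shapes;
  std::transform(device_shape.begin(), device_shape.end(), std::back_inserter(int_shapes),
                 [](std::size_t dim) { return static_cast<int>(dim); });

  std::cout << tensor_name << " rank=" << int_shapes.size() << '\n';
}
```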

View File

@ -103,6 +103,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
void SendMetadata();
void LoadParameters();
private:
// private constructor for singleton
Debugger();
@ -118,6 +120,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
// check if debugger enabled
bool CheckDebuggerEnabled();
bool CheckDebuggerPartialMemoryEnabled();
// check and save graph pointer
void CheckGraphPtr(const KernelGraphPtr &graph_ptr);

View File

@ -663,39 +663,25 @@ bool AscendDeviceAddress::DumpMemToFile(bool trans_flag, const std::string &file
}
#ifdef ENABLE_DEBUGGER
bool AscendDeviceAddress::LoadMemToHost(bool trans_flag, const std::string &tensor_name, int execution_order,
bool AscendDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order,
const std::string &host_fmt, const ShapeVector &host_shape, TypeId host_type,
size_t slot, Debugger *debugger, bool keep_prev) const {
size_t slot, bool keep_prev) const {
bool ret = false;
DebugServices *debug_services = debugger->debug_services();
MS_EXCEPTION_IF_NULL(debug_services);
TensorLoader *tensor_loader = debug_services->tensor_loader();
TensorLoader *tensor_loader = Debugger::GetInstance()->debug_services()->tensor_loader();
MS_EXCEPTION_IF_NULL(tensor_loader);
// TensorData is freed up in AscendSession class
auto tensor_data = std::make_shared<mindspore::TensorData>();
tensor_data->SetName(tensor_name);
tensor_data->SetExecutionOrder(execution_order);
tensor_data->SetSlot(slot);
if (trans_flag) {
MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(host_type, host_shape);
size_t host_size = out_tensor->data().nbytes();
ret = SyncDeviceToHost(host_shape, host_size, host_type, out_tensor->data_c());
if (!ret) {
MS_LOG(ERROR) << "Copy device mem to host failed";
return ret;
}
tensor_data->SetTensor(out_tensor);
} else {
mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(type_id_, host_shape);
size_t host_size = out_tensor->data().nbytes();
auto ret_rt_memcpy = rtMemcpy(out_tensor->data_c(), host_size, ptr_, host_size, RT_MEMCPY_DEVICE_TO_HOST);
if (ret_rt_memcpy != RT_ERROR_NONE) {
MS_LOG(ERROR) << "SyncDeviceToHost: rtMemcpy mem size[" << size_ << "] fail, ret[" << ret_rt_memcpy << "]";
}
MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
tensor_data->SetTensor(out_tensor);
mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(type_id_, host_shape);
size_t host_size = out_tensor->data().nbytes();
auto ret_rt_memcpy = rtMemcpy(out_tensor->data_c(), host_size, ptr_, host_size, RT_MEMCPY_DEVICE_TO_HOST);
if (ret_rt_memcpy != RT_ERROR_NONE) {
MS_LOG(ERROR) << "SyncDeviceToHost: rtMemcpy mem size[" << size_ << "] fail, ret[" << ret_rt_memcpy << "]";
}
MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
tensor_data->SetTensor(out_tensor);
ret = tensor_loader->LoadNewTensor(tensor_data, keep_prev);
return ret;
}
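The hunk above drops both the `trans_flag` branch and the `Debugger *debugger` parameter from `AscendDeviceAddress::LoadMemToHost`; the tensor loader is now reached through the `Debugger` singleton instead of a pointer threaded through every call site. A standalone sketch of that access pattern (mock classes for illustration, not the MindSpore implementation):

```cpp
// Mock sketch: callers reach the shared TensorLoader through a Debugger
// singleton rather than receiving a Debugger* argument.
#include <iostream>
#include <memory>
#include <mutex>
#include <string>

class TensorLoader {
 public:
  bool LoadNewTensor(const std::string &name) {
    std::cout << "loaded " << name << '\n';
    return true;
  }
};

class Debugger {
 public:
  static std::shared_ptr<Debugger> GetInstance() {
    std::lock_guard<std::mutex> lock(instance_lock_);
    if (!instance_) instance_ = std::shared_ptr<Debugger>(new Debugger());
    return instance_;
  }
  TensorLoader *tensor_loader() { return &loader_; }

 private:
  Debugger() = default;
  TensorLoader loader_;
  static std::shared_ptr<Debugger> instance_;
  static std::mutex instance_lock_;
};

std::shared_ptr<Debugger> Debugger::instance_ = nullptr;
std::mutex Debugger::instance_lock_;

bool LoadMemToHost(const std::string &tensor_name /* no Debugger* parameter */) {
  TensorLoader *loader = Debugger::GetInstance()->tensor_loader();
  return loader->LoadNewTensor(tensor_name);
}

int main() { LoadMemToHost("conv1.weight:0"); }
```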

View File

@ -45,9 +45,8 @@ class AscendDeviceAddress : public DeviceAddress {
bool DumpMemToFile(bool dump_mode, const std::string &filepath, const std::string &host_fmt,
const ShapeVector &host_shape, TypeId host_type) const override;
#ifdef ENABLE_DEBUGGER
bool LoadMemToHost(bool dump_mode, const std::string &tensor_name, int execution_order, const std::string &host_fmt,
const ShapeVector &host_shape, TypeId host_type, size_t slot, Debugger *debugger,
bool keep_prev) const;
bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev) const override;
#endif
private:

View File

@ -254,15 +254,10 @@ void LoadOutput(mindspore::session::KernelGraph *graph, Debugger *debugger) {
auto ascend_addr = dynamic_cast<const mindspore::device::ascend::AscendDeviceAddress *>(addr);
MS_EXCEPTION_IF_NULL(ascend_addr);
ShapeVector int_shapes;
if (trans_flag) {
int_shapes = trans::GetRuntimePaddingShape(node, j);
} else {
auto shape = AnfAlgo::GetOutputDeviceShape(node, j);
(void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
[](size_t inner_item) { return SizeToInt(inner_item); });
}
auto ret =
ascend_addr->LoadMemToHost(trans_flag, tensor_name, exec_order, format, int_shapes, type, j, debugger, false);
auto shape = AnfAlgo::GetOutputDeviceShape(node, j);
(void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
[](size_t inner_item) { return SizeToInt(inner_item); });
auto ret = ascend_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost: flag:" << trans_flag << ", tensor_name:" << tensor_name
<< ", host_format:" << format << ".!";
@ -272,40 +267,6 @@ void LoadOutput(mindspore::session::KernelGraph *graph, Debugger *debugger) {
}
}
void LoadParameters(mindspore::session::KernelGraph *graph, Debugger *debugger) {
MS_EXCEPTION_IF_NULL(graph);
// trans_flag: "true" means tensor values will be transfered to host format, otherwise not.
bool trans_flag = false;
const auto &parameters = graph->inputs();
// for parameters, set its execution order to be 0;
int exec_order = 0;
for (auto &item : parameters) {
if (!item->isa<Parameter>()) {
continue;
}
std::string parameter_name = item->fullname_with_scope();
auto addr = AnfAlgo::GetOutputAddr(item, PRAMATER_OUTPUT_INDEX);
auto type = AnfAlgo::GetOutputInferDataType(item, PRAMATER_OUTPUT_INDEX);
auto format = kOpFormat_DEFAULT;
string tensor_name = parameter_name + ':' + "0";
auto ascend_addr = dynamic_cast<const mindspore::device::ascend::AscendDeviceAddress *>(addr);
MS_EXCEPTION_IF_NULL(ascend_addr);
ShapeVector int_shapes;
if (trans_flag) {
int_shapes = trans::GetRuntimePaddingShape(item, PRAMATER_OUTPUT_INDEX);
} else {
auto shape = AnfAlgo::GetOutputDeviceShape(item, PRAMATER_OUTPUT_INDEX);
(void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
[](size_t inner_item) { return SizeToInt(inner_item); });
}
auto ret =
ascend_addr->LoadMemToHost(trans_flag, tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost Failed: flag:" << trans_flag << ", path:" << tensor_name
<< ", host_format:" << format << ".!";
}
}
}
} // namespace
#endif
@ -319,7 +280,7 @@ bool AscendKernelRuntime::LoadData(mindspore::session::KernelGraph *graph, Debug
// load output
LoadOutput(graph, debugger);
// load parameters
LoadParameters(graph, debugger);
if (debugger) debugger->LoadParameters();
#endif
return true;
}

View File

@ -70,6 +70,12 @@ class DeviceAddress : public mindspore::DeviceSync {
const ShapeVector &host_shape, TypeId host_type) const {
return true;
}
#ifdef ENABLE_DEBUGGER
virtual bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev) const {
return true;
}
#endif
protected:
const void *ptr() const { return ptr_; }
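The hunk above gives the `DeviceAddress` base class a virtual `LoadMemToHost` with a harmless default (returns true), so the Ascend and GPU address types override it and callers such as `Debugger::LoadParameters` no longer need backend-specific casts. A standalone sketch of that pattern (mock types, not MindSpore code):

```cpp
// Mock sketch: base class with a default-success virtual hook, overridden per backend.
#include <iostream>
#include <memory>
#include <string>

class DeviceAddress {
 public:
  virtual ~DeviceAddress() = default;
  // Default: nothing to load, report success.
  virtual bool LoadMemToHost(const std::string &tensor_name) const { return true; }
};

class GpuDeviceAddress : public DeviceAddress {
 public:
  bool LoadMemToHost(const std::string &tensor_name) const override {
    std::cout << "GPU copy to host for " << tensor_name << '\n';
    return true;
  }
};

int main() {
  std::unique_ptr<DeviceAddress> addr = std::make_unique<GpuDeviceAddress>();
  addr->LoadMemToHost("conv1.weight:0");  // dispatches to the GPU override
}
```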

View File

@ -80,14 +80,14 @@ GPUDeviceAddress::~GPUDeviceAddress() {
}
#ifdef ENABLE_DEBUGGER
bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
const ShapeVector &host_shape, TypeId host_type, size_t slot, Debugger *debugger,
const ShapeVector &host_shape, TypeId host_type, size_t slot,
bool keep_prev) const {
bool ret = false;
if (size_ == 0) {
return true;
}
DebugServices *debug_services = debugger->debug_services();
TensorLoader *tensor_loader = debug_services->tensor_loader();
TensorLoader *tensor_loader = Debugger::GetInstance()->debug_services()->tensor_loader();
mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(type_id_, host_shape);
size_t host_size = out_tensor->data().nbytes();

View File

@ -44,8 +44,7 @@ class GPUDeviceAddress : public DeviceAddress {
#ifdef ENABLE_DEBUGGER
bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
const ShapeVector &host_shape, TypeId host_type, size_t slot, Debugger *debugger,
bool keep_prev) const;
const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev) const override;
#endif
private:
DeviceAddressStatus status_{DeviceAddressStatus::kInDevice};

View File

@ -111,7 +111,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
auto shape = AnfAlgo::GetOutputDeviceShape(input_kernel, PARAMETER_OUTPUT_INDEX);
(void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
[](size_t inner_item) { return SizeToInt(inner_item); });
auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, true);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
@ -130,7 +130,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
auto shape = AnfAlgo::GetOutputDeviceShape(kernel, j);
(void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
[](size_t inner_item) { return SizeToInt(inner_item); });
auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, debugger, false);
auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
@ -148,36 +148,6 @@ void UpdateStepNum(Debugger *debugger, bool dump_enabled) {
}
}
void LoadParameters(const session::KernelGraph *graph, Debugger *debugger, bool dump_enabled) {
MS_EXCEPTION_IF_NULL(graph);
if (!(debugger && dump_enabled)) {
return;
}
const auto &parameters = graph->inputs();
// for parameters, set its execution order to be 0;
int exec_order = 0;
for (auto &item : parameters) {
if (!item->isa<Parameter>()) {
continue;
}
std::string parameter_name = item->fullname_with_scope();
auto addr = AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX);
auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX);
auto format = kOpFormat_DEFAULT;
string tensor_name = parameter_name + ':' + "0";
auto gpu_addr = dynamic_cast<const mindspore::device::gpu::GPUDeviceAddress *>(addr);
ShapeVector int_shapes;
auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX);
(void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
[](size_t inner_item) { return SizeToInt(inner_item); });
auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
}
}
}
void ClearCurrentData(Debugger *debugger, bool dump_enabled) {
if (debugger && (debugger->debugger_enabled() || dump_enabled)) {
DebugServices *debug_services = debugger->debug_services();
@ -601,7 +571,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De
}
if (!mock) {
// collect weights and bias for dump mode
LoadParameters(graph, debugger, dump_enabled);
if (debugger) debugger->LoadParameters();
CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");
}
ClearSwapInfo(mock);

View File

@ -53,8 +53,8 @@ class KernelRuntime {
void RunOpAssignMemory(const ValuePtr &pre_output_value, const std::vector<tensor::TensorPtr> &input_tensors,
session::KernelGraph *graph);
void RunOpClearMemory(const session::KernelGraph *graph);
bool DumpDataEnabled();
bool DumpDataEnabledIteration();
static bool DumpDataEnabled();
static bool DumpDataEnabledIteration();
virtual bool LoadData(session::KernelGraph *graph, Debugger *debugger);
virtual bool Load(session::KernelGraph *graph, bool is_task_sink);
virtual bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) = 0;
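The final hunk makes `DumpDataEnabled` and `DumpDataEnabledIteration` static on `KernelRuntime`, which is what lets `Debugger::CheckDebuggerDumpEnabled` (earlier in this change) drop the `KernelRuntimeManager` lookup. A standalone sketch of the refactor; the classes and the environment-variable stand-in below are mocks, not the actual MindSpore dump check:

```cpp
// Mock sketch: a query that depends only on process-wide configuration can be
// static, so call sites no longer need to fetch a runtime instance first.
#include <cstdlib>
#include <iostream>

class KernelRuntime {
 public:
  // Static stand-in; the real check reads the dump configuration, not this env var.
  static bool DumpDataEnabled() { return std::getenv("MOCK_DUMP_ENABLED") != nullptr; }
};

bool CheckDebuggerDumpEnabled() {
  // Before: KernelRuntimeManager lookup followed by an instance call.
  return KernelRuntime::DumpDataEnabled();
}

int main() { std::cout << std::boolalpha << CheckDebuggerDumpEnabled() << '\n'; }
```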