From b3c0eb61d5e0f7ef510fb9991427c1607fb6cbd1 Mon Sep 17 00:00:00 2001 From: John Tzanakakis Date: Fri, 17 Jul 2020 10:26:35 -0400 Subject: [PATCH] GPU debugger - milestone 1 and GPU dump Additional Authors: Adel Shafiei, Harshvardhan Gupta --- build.sh | 3 + .../ccsrc/backend/session/gpu_session.cc | 78 +++++ mindspore/ccsrc/backend/session/gpu_session.h | 14 + .../ccsrc/backend/session/session_basic.cc | 1 - .../ccsrc/backend/session/session_basic.h | 5 +- mindspore/ccsrc/debug/CMakeLists.txt | 1 + mindspore/ccsrc/debug/debugger/debugger.cc | 23 +- mindspore/ccsrc/debug/debugger/debugger.h | 3 +- mindspore/ccsrc/debug/tensor_load.h | 50 ++++ .../device/ascend/ascend_kernel_runtime.cc | 2 +- .../device/ascend/ascend_kernel_runtime.h | 2 +- .../runtime/device/cpu/cpu_kernel_runtime.cc | 2 +- .../runtime/device/cpu/cpu_kernel_runtime.h | 2 +- .../runtime/device/gpu/gpu_device_address.cc | 37 +++ .../runtime/device/gpu/gpu_device_address.h | 8 + .../runtime/device/gpu/gpu_kernel_runtime.cc | 281 +++++++++++++++++- .../runtime/device/gpu/gpu_kernel_runtime.h | 14 +- .../ccsrc/runtime/device/kernel_runtime.cc | 37 ++- .../ccsrc/runtime/device/kernel_runtime.h | 6 +- 19 files changed, 534 insertions(+), 35 deletions(-) diff --git a/build.sh b/build.sh index adeb099fb7b..146b0de1c51 100755 --- a/build.sh +++ b/build.sh @@ -279,6 +279,9 @@ checkopts() done } checkopts "$@" +if [[ "X$ENABLE_GPU" = "Xon" ]] && [[ "X$ENABLE_DUMPE2E" = "Xon" ]]; then + ENABLE_DEBUGGER="on" +fi echo "---------------- MindSpore: build start ----------------" mkdir -pv "${BUILD_PATH}/package/mindspore/lib" git submodule update --init graphengine diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc index 3a462d9cb9a..4398d9a0375 100644 --- a/mindspore/ccsrc/backend/session/gpu_session.cc +++ b/mindspore/ccsrc/backend/session/gpu_session.cc @@ -37,6 +37,7 @@ #include "common/trans.h" #include "utils/context/ms_context.h" #include 
"utils/base_ref_extends.h" +#include "debug/tensor_load.h" namespace mindspore { namespace session { @@ -164,7 +165,11 @@ void GPUSession::LoadInputData(const std::shared_ptr &kernel_graph, void GPUSession::Execute(const std::shared_ptr &kernel_graph) const { auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_); MS_EXCEPTION_IF_NULL(runtime_instance); +#ifdef ENABLE_DEBUGGER + if (!runtime_instance->Run(kernel_graph.get(), debugger_.get())) { +#else if (!runtime_instance->Run(kernel_graph.get())) { +#endif MS_LOG(EXCEPTION) << "GPU execute graph failed!"; } } @@ -229,6 +234,9 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList void GPUSession::RunGraph(const GraphId &graph_id, const std::vector &inputs, VectorRef *outputs) { auto &kernel_graph = graphs_[graph_id]; +#ifdef ENABLE_DEBUGGER + PreIterationDbg(kernel_graph); +#endif // Load input data from user input LoadInputData(kernel_graph, inputs); #if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU)) @@ -245,6 +253,9 @@ void GPUSession::RunGraph(const GraphId &graph_id, const std::vectorenable_gpu_summary()) { Summary(kernel_graph.get()); } +#ifdef ENABLE_DEBUGGER + PostIterationDbg(kernel_graph); +#endif } void GPUSession::BuildOp(const OpRunInfo &op_run_info, const GraphInfo &graph_info, @@ -296,6 +310,70 @@ py::tuple GPUSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph RunOpClearMemory(kernel_graph.get()); return tuple_tensors; } + +#ifdef ENABLE_DEBUGGER +void GPUSession::Dump(const std::shared_ptr &kernel_graph) const { +#ifdef ENABLE_DUMP_E2E + MS_EXCEPTION_IF_NULL(kernel_graph); + auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_); + MS_EXCEPTION_IF_NULL(runtime_instance); + (void)runtime_instance->DumpData(kernel_graph.get(), debugger_.get()); +#endif +} + +bool GPUSession::DumpDataEnabledIteration() const { + auto runtime_instance = 
device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_); + MS_EXCEPTION_IF_NULL(runtime_instance); + return runtime_instance->DumpDataEnabledIteration(); +} + +void GPUSession::PreIterationDbg(const std::shared_ptr &kernel_graph) const { + if (debugger_) { + debugger_->PreExecute(kernel_graph); + } + PreLoadTensor(kernel_graph); +} + +void GPUSession::PostIterationDbg(const std::shared_ptr &kernel_graph) const { + bool dump_enabled = DumpDataEnabledIteration(); + // debug used for dump + if (debugger_ && dump_enabled) { + Dump(kernel_graph); + } + if (debugger_) { + debugger_->PostExecute(); + } +} + +void GPUSession::PreLoadTensor(const std::shared_ptr &kernel_graph) const { + bool dump_enabled = DumpDataEnabledIteration(); + if (!(debugger_ && (debugger_->debugger_enabled() || dump_enabled))) { + return; + } + MS_EXCEPTION_IF_NULL(kernel_graph); + auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_); + MS_EXCEPTION_IF_NULL(runtime_instance); + DebugServices *debug_services = debugger_->debug_services(); + TensorLoader *tensor_loader = debug_services->tensor_loader(); + tensor_loader->EmptyTensor(); + uint32_t iter_num = tensor_loader->GetIterNum(); + tensor_loader->set_iter_num(++iter_num); +} + +void GPUSession::PostLoadTensor(const std::shared_ptr &kernel_graph) const { + bool dump_enabled = DumpDataEnabledIteration(); + if (!(debugger_ && (debugger_->debugger_enabled() || dump_enabled))) { + return; + } + MS_EXCEPTION_IF_NULL(kernel_graph); + auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_); + MS_EXCEPTION_IF_NULL(runtime_instance); + DebugServices *debug_services = debugger_->debug_services(); + TensorLoader *tensor_loader = debug_services->tensor_loader(); + tensor_loader->EmptyPrevTensor(); +} +#endif + } // namespace gpu } // namespace session } // namespace mindspore diff --git 
a/mindspore/ccsrc/backend/session/gpu_session.h b/mindspore/ccsrc/backend/session/gpu_session.h index 04e5021c02a..3e4e84a29bb 100644 --- a/mindspore/ccsrc/backend/session/gpu_session.h +++ b/mindspore/ccsrc/backend/session/gpu_session.h @@ -67,6 +67,20 @@ class GPUSession : public SessionBasic { const std::vector &inputs_const) const override; void Execute(const std::shared_ptr &kernel_graph) const; + +#ifdef ENABLE_DEBUGGER + void Dump(const std::shared_ptr &kernel_graph) const; + + bool DumpDataEnabledIteration() const; + + void PreIterationDbg(const std::shared_ptr &kernel_graph) const; + + void PostIterationDbg(const std::shared_ptr &kernel_graph) const; + + void PreLoadTensor(const std::shared_ptr &kernel_graph) const; + + void PostLoadTensor(const std::shared_ptr &kernel_graph) const; +#endif }; using GPUSessionPtr = std::shared_ptr; MS_REG_SESSION(kGPUDevice, GPUSession); diff --git a/mindspore/ccsrc/backend/session/session_basic.cc b/mindspore/ccsrc/backend/session/session_basic.cc index ff72716c607..38eecc99299 100644 --- a/mindspore/ccsrc/backend/session/session_basic.cc +++ b/mindspore/ccsrc/backend/session/session_basic.cc @@ -24,7 +24,6 @@ #include "backend/kernel_compiler/common_utils.h" #include "frontend/operator/ops.h" #include "common/trans.h" -#include "utils/context/ms_context.h" #include "utils/config_manager.h" #include "backend/session/anf_runtime_algorithm.h" #include "backend/kernel_compiler/oplib/oplib.h" diff --git a/mindspore/ccsrc/backend/session/session_basic.h b/mindspore/ccsrc/backend/session/session_basic.h index 367b1fe80a9..838a8807aed 100755 --- a/mindspore/ccsrc/backend/session/session_basic.h +++ b/mindspore/ccsrc/backend/session/session_basic.h @@ -32,6 +32,7 @@ #include "utils/contract.h" #include "pipeline/pynative/pynative_execute.h" #include "runtime/device/kernel_info.h" +#include "utils/context/ms_context.h" #ifdef ENABLE_DEBUGGER #include "debug/debugger/debugger.h" #endif @@ -112,7 +113,9 @@ class SessionBasic { // 
set debugger void SetDebugger() { debugger_ = Debugger::GetInstance(); - debugger_->Init(device_id_); + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + debugger_->Init(device_id_, ms_context->device_target()); } #endif diff --git a/mindspore/ccsrc/debug/CMakeLists.txt b/mindspore/ccsrc/debug/CMakeLists.txt index 8be5a0a834b..9ed24ec25b5 100644 --- a/mindspore/ccsrc/debug/CMakeLists.txt +++ b/mindspore/ccsrc/debug/CMakeLists.txt @@ -16,6 +16,7 @@ if (ENABLE_DEBUGGER) "${CMAKE_CURRENT_SOURCE_DIR}/debugger/grpc_client.cc" "${CMAKE_CURRENT_SOURCE_DIR}/debugger/proto_exporter.cc" "${CMAKE_CURRENT_SOURCE_DIR}/debug_services.cc" + "${CMAKE_CURRENT_SOURCE_DIR}/common.cc" ) endif (ENABLE_DEBUGGER) diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc index dd89e17e2db..b9e9238034e 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.cc +++ b/mindspore/ccsrc/debug/debugger/debugger.cc @@ -21,6 +21,7 @@ #include "debug/debugger/debugger.h" #include "pipeline/jit/pipeline.h" #include "backend/session/anf_runtime_algorithm.h" +#include "runtime/device/kernel_runtime_manager.h" using debugger::EventReply; using debugger::GraphProto; @@ -41,17 +42,20 @@ Debugger::Debugger() : grpc_client_(nullptr), debug_services_(nullptr), device_id_(0), + device_target_(""), num_step_(0), debugger_enabled_(false), is_dataset_graph_(false), partial_memory_(false) {} -void Debugger::Init(const uint32_t device_id) { +void Debugger::Init(const uint32_t device_id, const std::string device_target) { // access lock for public method std::lock_guard a_lock(access_lock_); // save device_id MS_LOG(INFO) << "Debugger got device_id: " << device_id; device_id_ = device_id; + MS_LOG(INFO) << "Debugger got device_target: " << device_target; + device_target_ = device_target; } void Debugger::EnableDebugger() { @@ -62,6 +66,14 @@ void Debugger::EnableDebugger() { grpc_client_ = nullptr; debug_services_ = nullptr; + // see if dump 
is enabled + bool dump_enabled = false; + if (device_target_ == kGPUDevice) { + auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_); + MS_EXCEPTION_IF_NULL(runtime_instance); + dump_enabled = runtime_instance->DumpDataEnabled(); + } + // get env variables to configure debugger const char *env_enable_str = std::getenv("ENABLE_MS_DEBUGGER"); if (env_enable_str != nullptr) { @@ -70,7 +82,8 @@ void Debugger::EnableDebugger() { debugger_enabled_ = true; } } - if (!debugger_enabled_) { + + if (!debugger_enabled_ && !dump_enabled) { MS_LOG(WARNING) << "Not enabling debugger. Set environment variable ENABLE_MS_DEBUGGER=1 to enable debugger."; return; } @@ -118,7 +131,10 @@ void Debugger::EnableDebugger() { } // initialize grpc client - grpc_client_ = std::make_unique(host, port); + if (debugger_enabled_) { + grpc_client_ = std::make_unique(host, port); + } + debug_services_ = std::make_unique(); } @@ -127,6 +143,7 @@ void Debugger::Reset() { std::lock_guard a_lock(access_lock_); // reset components device_id_ = 0; + device_target_ = ""; num_step_ = 0; debugger_enabled_ = false; is_dataset_graph_ = false; diff --git a/mindspore/ccsrc/debug/debugger/debugger.h b/mindspore/ccsrc/debug/debugger/debugger.h index 5a3965d7cc9..f72a3e038c7 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.h +++ b/mindspore/ccsrc/debug/debugger/debugger.h @@ -55,7 +55,7 @@ class Debugger : public std::enable_shared_from_this { // init // only save device_id - void Init(const uint32_t device_id); + void Init(const uint32_t device_id, const std::string device_target); // reset debugger void Reset(); @@ -128,6 +128,7 @@ class Debugger : public std::enable_shared_from_this { std::unique_ptr debug_services_; KernelGraphPtr graph_ptr_; uint32_t device_id_; + std::string device_target_; int32_t num_step_; bool debugger_enabled_; bool is_dataset_graph_; diff --git a/mindspore/ccsrc/debug/tensor_load.h b/mindspore/ccsrc/debug/tensor_load.h index 
ae0e89aae27..7215b9a6244 100644 --- a/mindspore/ccsrc/debug/tensor_load.h +++ b/mindspore/ccsrc/debug/tensor_load.h @@ -24,6 +24,10 @@ #include #include #include "debug/tensor_data.h" +#include "ir/dtype.h" +#ifdef ENABLE_DUMP_E2E +#include "debug/e2e_dump.h" +#endif namespace mindspore { class TensorLoader { public: @@ -72,8 +76,54 @@ class TensorLoader { void EmptyPrevTensor() { prev_tensor_list_map.clear(); } + void EmptyCurrentTensor() { + tensor_list_map.clear(); + tensor_list.clear(); + } + void set_iter_num(uint32_t iter_num) { this->iter_num = iter_num; } +#ifdef ENABLE_DUMP_E2E + bool DumpTensorToFile(std::string tensor_name, bool trans_flag, const std::string &filepath, + const std::string &host_fmt, const std::vector &host_shape, TypeId host_type, + TypeId addr_type_id, std::string addr_format, size_t slot) const { + bool ret = false; + if (filepath.empty()) { + MS_LOG(ERROR) << "Dump file path is null!"; + return ret; + } + std::string shape = "shape"; + if (host_shape.size()) { + for (auto &value : host_shape) { + shape = shape + '_' + std::to_string(value); + } + } else { + shape = shape + "_0"; + } + std::string file_extension = ".bin"; + std::string path = ""; + if (trans_flag) { + path = filepath + '_' + shape + '_' + TypeIdLabel(host_type) + '_' + host_fmt + file_extension; + } else { + path = filepath + '_' + shape + '_' + TypeIdToType(addr_type_id)->ToString() + '_' + addr_format + file_extension; + } + + MS_LOG(INFO) << "Dump path is " << path; + + std::string tensor_loader_name = tensor_name + ":" + std::to_string(slot); + auto iter = tensor_list_map.find(tensor_loader_name); + if (iter != tensor_list_map.end()) { + std::shared_ptr node = iter->second; + mindspore::tensor::TensorPtr out_tensor = node->GetTensor(); + size_t host_size = out_tensor->data().nbytes(); + + ret = mindspore::Dump::DumpToFile(path, out_tensor->data_c(), host_size); + } + + return ret; + } +#endif + private: std::vector> tensor_list; std::map> tensor_list_map; diff 
--git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc index aafbf757654..da290dd1c0b 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc @@ -275,7 +275,7 @@ void DumpParameters(mindspore::session::KernelGraph *graph, const string &dump_p } // namespace #endif -bool AscendKernelRuntime::DumpData(mindspore::session::KernelGraph *graph) { +bool AscendKernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *debugger) { MS_EXCEPTION_IF_NULL(graph); #ifdef ENABLE_DUMP_E2E MS_LOG(INFO) << "Start dump step"; diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h index c4d82b05672..33500dc27f3 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h @@ -38,7 +38,7 @@ class AscendKernelRuntime : public KernelRuntime { AscendKernelRuntime() = default; ~AscendKernelRuntime() override; bool Init() override; - bool DumpData(session::KernelGraph *graph) override; + bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr) override; bool LoadData(session::KernelGraph *graph, Debugger *debugger) override; bool GenTask(const session::KernelGraph *graph) override; bool RunTask(const session::KernelGraph *graph) override; diff --git a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc index 8b144b04400..0c3cf9684a9 100644 --- a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc @@ -270,7 +270,7 @@ void CPUKernelRuntime::DecreaseSummaryRefCount(const session::NamedSummaryOutput resource_manager_.DecreaseSummaryRefCount(summary_outputs); } -bool 
CPUKernelRuntime::Run(session::KernelGraph *kernel_graph) { +bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, Debugger *debugger) { MS_EXCEPTION_IF_NULL(kernel_graph); resource_manager_.IncreaseAddressRefCount(kernel_graph); diff --git a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h index dc952f526ec..a486ab1a8b8 100644 --- a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h @@ -36,7 +36,7 @@ class CPUKernelRuntime : public KernelRuntime { ~CPUKernelRuntime() override = default; bool Init() override { return true; } - bool Run(session::KernelGraph *graph) override; + bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) override; void AssignKernelAddress(session::KernelGraph *kernel_graph); void BindInputOutput(const session::KernelGraph *kernel_graph, const std::vector &inputs, VectorRef *outputs, std::vector *need_sync_outputs); diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc index a20a6a9a3c8..35fc90b7e45 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc @@ -16,9 +16,16 @@ #include "runtime/device/gpu/gpu_device_address.h" #include +#include #include "runtime/device/gpu/gpu_device_manager.h" #include "utils/log_adapter.h" #include "runtime/device/gpu/gpu_memory_allocator.h" +#include "ir/tensor.h" +#ifdef ENABLE_DEBUGGER +#include "debug/debug_services.h" +#include "debug/tensor_load.h" +#include "debug/debugger/debugger.h" +#endif namespace mindspore { namespace device { @@ -59,6 +66,36 @@ GPUDeviceAddress::~GPUDeviceAddress() { ptr_ = nullptr; } } +#ifdef ENABLE_DEBUGGER +bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt, + const std::vector &host_shape, TypeId host_type, 
size_t slot, + Debugger *debugger, bool keep_prev) const { + bool ret = false; + if (size_ == 0) { + return true; + } + DebugServices *debug_services = debugger->debug_services(); + TensorLoader *tensor_loader = debug_services->tensor_loader(); + + mindspore::tensor::TensorPtr out_tensor = std::make_shared(type_id_, host_shape); + size_t host_size = out_tensor->data().nbytes(); + auto ret_rt_memcpy = SyncDeviceToHost(host_shape, host_size, host_type, out_tensor->data_c()); + if (!ret_rt_memcpy) { + MS_LOG(ERROR) << "Copy device mem to host failed"; + return ret; + } + auto tensor_data = std::make_shared(); + tensor_data->SetName(tensor_name); + tensor_data->SetExecutionOrder(execution_order); + tensor_data->SetTensor(out_tensor); + tensor_data->SetSlot(slot); + ret = tensor_loader->LoadNewTensor(tensor_data, keep_prev); + + MS_LOG(INFO) << "E2E tensor name is " << tensor_name; + + return ret; +} +#endif } // namespace gpu } // namespace device } // namespace mindspore diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.h b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.h index 8b846bf341a..8a3baccb611 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.h +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.h @@ -22,6 +22,9 @@ #include "runtime/device/device_address.h" namespace mindspore { +#ifdef ENABLE_DEBUGGER +class Debugger; +#endif namespace device { namespace gpu { class GPUDeviceAddress : public DeviceAddress { @@ -37,6 +40,11 @@ class GPUDeviceAddress : public DeviceAddress { DeviceAddressStatus status() const { return status_; } DeviceAddressType DeviceType() const override { return DeviceAddressType::kGPU; } +#ifdef ENABLE_DEBUGGER + bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt, + const std::vector &host_shape, TypeId host_type, size_t slot, Debugger *debugger, + bool keep_prev) const; +#endif private: DeviceAddressStatus 
status_{DeviceAddressStatus::kInDevice}; }; diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc index 3a5d9ca34a8..dbfc80b9ff4 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc @@ -13,8 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #include "runtime/device/gpu/gpu_kernel_runtime.h" +#include #include "runtime/device/gpu/gpu_device_address.h" #include "runtime/device/gpu/cuda_driver.h" #include "runtime/device/gpu/gpu_buffer_mgr.h" @@ -29,6 +29,8 @@ #include "runtime/device/gpu/gpu_memory_manager.h" #include "backend/kernel_compiler/common_utils.h" #include "runtime/device/gpu/gpu_memory_copy_manager.h" +#include "common/trans.h" +#include "ir/dtype.h" namespace mindspore { namespace device { @@ -36,6 +38,7 @@ namespace gpu { using mindspore::device::memswap::MemSwapInfoSet; using mindspore::device::memswap::MemSwapManager; using mindspore::device::memswap::SwapKind; +static const size_t PARAMETER_OUTPUT_INDEX = 0; bool GPUKernelRuntime::SyncStream() { return GPUDeviceManager::GetInstance().SyncStream(stream_); } bool GPUKernelRuntime::Init() { @@ -43,7 +46,15 @@ bool GPUKernelRuntime::Init() { GPUMemoryAllocator::GetInstance().CheckMaxDeviceMemory(); return true; } - auto ret = InitDevice(); + bool ret = false; +#ifdef ENABLE_DUMP_E2E + ret = SetDumpConf(); + if (!ret) { + MS_LOG(INFO) << "No dump conf to set!"; + } +#endif + + ret = InitDevice(); if (!ret) { MS_LOG(ERROR) << "InitDevice error."; return ret; @@ -63,6 +74,216 @@ bool GPUKernelRuntime::Init() { return ret; } +#ifdef ENABLE_DUMP_E2E +namespace { +void DumpOutput(mindspore::session::KernelGraph *graph, const string &dump_path, DumpConfPtr dump_conf, + Debugger *debugger) { + MS_EXCEPTION_IF_NULL(graph); + MS_EXCEPTION_IF_NULL(dump_conf); + bool trans_flag = 
dump_conf->trans_flag(); + const auto &apply_kernels = graph->execution_order(); + for (const auto &node : apply_kernels) { + MS_EXCEPTION_IF_NULL(node); + auto node_name = AnfAlgo::GetCNodeName(node); + std::string kernel_name = node->fullname_with_scope(); + if (!dump_conf->IsKernelNeedDump(kernel_name)) { + continue; + } + const std::string strsrc = "/"; + const std::string strdst = "--"; + std::string::size_type pos = 0; + std::string::size_type srclen = strsrc.size(); + std::string::size_type dstlen = strdst.size(); + while ((pos = kernel_name.find(strsrc, pos)) != std::string::npos) { + kernel_name.replace(pos, srclen, strdst); + pos += dstlen; + } + auto output_size = AnfAlgo::GetOutputTensorNum(node); + for (size_t j = 0; j < output_size; ++j) { + auto addr = AnfAlgo::GetOutputAddr(node, j); + TypeId addr_type_id = addr->type_id(); + std::string addr_format = addr->format(); + std::vector int_shapes; + if (trans_flag) { + int_shapes = trans::GetRuntimePaddingShape(node, j); + } else { + auto shape = AnfAlgo::GetOutputDeviceShape(node, j); + (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes), + [](size_t inner_item) { return SizeToInt(inner_item); }); + } + + auto type = AnfAlgo::GetOutputInferDataType(node, j); + + auto format = kOpFormat_DEFAULT; + string filepath = dump_path + '/' + kernel_name + '_' + "output_" + std::to_string(j); + + DebugServices *debug_services = debugger->debug_services(); + TensorLoader *tensor_loader = debug_services->tensor_loader(); + std::string original_kernel_name = node->fullname_with_scope(); + size_t slot = j; + auto ret = tensor_loader->DumpTensorToFile(original_kernel_name, trans_flag, filepath, format, int_shapes, type, + addr_type_id, addr_format, slot); + + if (!ret) { + std::string error = "DumpTensorToFile Failed: flag:" + std::to_string(trans_flag) + ", path:" + filepath + + ", host_format:" + format + ".!"; + } + } + } +} + +void DumpParameters(mindspore::session::KernelGraph *graph, 
const string &dump_path, DumpConfPtr dump_conf, + Debugger *debugger) { + MS_EXCEPTION_IF_NULL(graph); + MS_EXCEPTION_IF_NULL(dump_conf); + bool trans_flag = dump_conf->trans_flag(); + const auto &parameters = graph->inputs(); + for (auto &item : parameters) { + if (!item->isa()) { + continue; + } + std::string parameter_name = item->fullname_with_scope(); + if (!dump_conf->IsKernelNeedDump(parameter_name)) { + continue; + } + auto addr = AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX); + TypeId addr_type_id = addr->type_id(); + std::string addr_format = addr->format(); + std::vector int_shapes; + if (trans_flag) { + int_shapes = trans::GetRuntimePaddingShape(item, PARAMETER_OUTPUT_INDEX); + } else { + auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX); + (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes), + [](size_t inner_item) { return SizeToInt(inner_item); }); + } + + auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX); + + auto format = kOpFormat_DEFAULT; + string filepath = dump_path + '/' + parameter_name + '_' + "output_0"; + + DebugServices *debug_services = debugger->debug_services(); + TensorLoader *tensor_loader = debug_services->tensor_loader(); + std::string original_kernel_name = parameter_name; + size_t slot = 0; + auto ret = tensor_loader->DumpTensorToFile(original_kernel_name, trans_flag, filepath, format, int_shapes, type, + addr_type_id, addr_format, slot); + + if (!ret) { + std::string error = "DumpTensorToFile Failed: flag:" + std::to_string(trans_flag) + ", path:" + filepath + + ", host_format:" + format + ".!"; + } + } +} +} // namespace + +bool GPUKernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *debugger) { + MS_EXCEPTION_IF_NULL(graph); + MS_LOG(INFO) << "Start dump step"; + DumpConfPtr dump_conf = GetDumpConf(); + MS_EXCEPTION_IF_NULL(dump_conf); + dump_conf->UpdataCurIter(); + bool dump_flag = dump_conf->dump_enable(); + if (!dump_flag) { 
+ MS_LOG(INFO) << "Dump flag is disable, pass dump step"; + return true; + } + uint32_t cur_iter = dump_conf->cur_iter(); + if (dump_conf->dump_iter() != 0) { + if (cur_iter != dump_conf->dump_iter()) { + return true; + } + } + MS_LOG(INFO) << "Cur iter is " << cur_iter; + std::string net_name = dump_conf->dump_net_name(); + std::string iterator = std::to_string(cur_iter); + std::string dump_path = dump_conf->dump_path(); + if (dump_path.back() == '/') { + dump_path = dump_path + net_name + '/' + iterator; + } else { + dump_path = dump_path + '/' + net_name + '/' + iterator; + } + + // dump output + DumpOutput(graph, dump_path, dump_conf, debugger); + // dump parameters + DumpParameters(graph, dump_path, dump_conf, debugger); + + return true; +} +#endif + +#ifdef ENABLE_DEBUGGER +namespace { +void LoadKernelData(Debugger *debugger, const CNodePtr &kernel, + const std::vector &kernel_inputs, + const std::vector &kernel_workspaces, + const std::vector &kernel_outputs, int exec_order, void *stream_ptr, + bool dump_enabled) { + if (!(debugger && (debugger->debugger_enabled() || dump_enabled))) { + return; + } + std::string kernel_name = kernel->fullname_with_scope(); + auto output_size = AnfAlgo::GetOutputTensorNum(kernel); + for (size_t j = 0; j < output_size; ++j) { + auto addr = kernel_outputs[j]; + auto type = AnfAlgo::GetOutputInferDataType(kernel, j); + auto format = kOpFormat_DEFAULT; + auto gpu_addr = std::make_unique(addr->addr, addr->size, format, type); + string tensor_name = kernel_name + ':' + std::to_string(j); + std::vector int_shapes; + auto shape = AnfAlgo::GetOutputDeviceShape(kernel, j); + (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes), + [](size_t inner_item) { return SizeToInt(inner_item); }); + auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, debugger, false); + if (!ret) { + MS_LOG(ERROR) << "LoadMemToHost:" + << ", tensor_name:" << tensor_name << ", host_format:" << format 
<< ".!"; + } + } +} + +void LoadParameters(const session::KernelGraph *graph, Debugger *debugger, bool dump_enabled) { + MS_EXCEPTION_IF_NULL(graph); + if (!(debugger && (debugger->debugger_enabled() || dump_enabled))) { + return; + } + const auto &parameters = graph->inputs(); + // for parameters, set its execution order to be 0; + int exec_order = 0; + for (auto &item : parameters) { + if (!item->isa()) { + continue; + } + std::string parameter_name = item->fullname_with_scope(); + auto addr = AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX); + auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX); + auto format = kOpFormat_DEFAULT; + string tensor_name = parameter_name + ':' + "0"; + auto gpu_addr = dynamic_cast(addr); + std::vector int_shapes; + auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX); + (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes), + [](size_t inner_item) { return SizeToInt(inner_item); }); + auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, debugger, true); + if (!ret) { + MS_LOG(ERROR) << "LoadMemToHost:" + << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!"; + } + } +} + +void ClearCurrentData(Debugger *debugger, bool dump_enabled) { + if (debugger && (debugger->debugger_enabled() || dump_enabled)) { + DebugServices *debug_services = debugger->debug_services(); + TensorLoader *tensor_loader = debug_services->tensor_loader(); + tensor_loader->EmptyCurrentTensor(); + } +} +} // namespace +#endif + DeviceAddressPtr GPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, TypeId type_id) { return std::make_shared(device_ptr, device_size, format, type_id); @@ -147,7 +368,7 @@ void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) { } } -bool GPUKernelRuntime::Run(session::KernelGraph *graph) { +bool GPUKernelRuntime::Run(session::KernelGraph *graph, Debugger 
*debugger) { struct timeval start_time, end_time; (void)gettimeofday(&start_time, nullptr); bool ret = true; @@ -170,7 +391,7 @@ bool GPUKernelRuntime::Run(session::KernelGraph *graph) { mem_reuse_util_ = mem_reuse_iter->second; MS_EXCEPTION_IF_NULL(mem_reuse_util_); - ret = RunOneStep(graph); + ret = RunOneStep(graph, debugger); } else { ret = LaunchKernel(graph); } @@ -182,28 +403,28 @@ bool GPUKernelRuntime::Run(session::KernelGraph *graph) { return ret; } -bool GPUKernelRuntime::RunOneStep(const session::KernelGraph *graph) { +bool GPUKernelRuntime::RunOneStep(const session::KernelGraph *graph, Debugger *debugger) { bool ret = true; auto graph_id = graph->graph_id(); if (!is_first_step_map_[graph_id]) { // Normally run graph - ret = LaunchKernelDynamic(graph); + ret = LaunchKernelDynamic(graph, debugger); } else { // Mock run first step - ret = LaunchKernelDynamic(graph, true, false); + ret = LaunchKernelDynamic(graph, debugger, true, false); if (ret) { // Normally run graph - ret = LaunchKernelDynamic(graph); + ret = LaunchKernelDynamic(graph, debugger); } else { // Trigger memory swap - ret = SearchMemSwapScheme(graph); + ret = SearchMemSwapScheme(graph, debugger); } is_first_step_map_[graph_id] = false; } return ret; } -bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph) { +bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger) { MS_LOG(WARNING) << "Run out of memory and try memory swapping, it may take some time, please wait a moment."; bool ret = false; ClearKernelOldOutputAndWorkspace(graph); @@ -217,7 +438,7 @@ bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph) { if (!mem_swap_manager_->RetreatSwapInfo()) { return false; } - ret = LaunchKernelDynamic(graph, true, false); + ret = LaunchKernelDynamic(graph, debugger, true, false); if (!ret) { ClearKernelOldOutputAndWorkspace(graph); } @@ -225,14 +446,14 @@ bool GPUKernelRuntime::SearchMemSwapScheme(const 
session::KernelGraph *graph) { mem_swap_manager_->AssignHostMemory(); // Time profiling - ret = LaunchKernelDynamic(graph, false, true); + ret = LaunchKernelDynamic(graph, debugger, false, true); if (!ret) { return ret; } - return RefineMemSwapScheme(graph); + return RefineMemSwapScheme(graph, debugger); } -bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph) { +bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger) { MS_LOG(WARNING) << "Refine memory swap scheme, it may take some time, please wait a moment."; auto &kernels = graph->execution_order(); for (const auto &kernel : kernels) { @@ -245,7 +466,7 @@ bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph) { bool ret = false; while (!ret) { mem_swap_manager_->AdjustSwapInPos(kernel, swap_in_task_idx); - ret = LaunchKernelDynamic(graph, true, false); + ret = LaunchKernelDynamic(graph, debugger, true, false); if (!ret) { ClearKernelOldOutputAndWorkspace(graph); ClearSwapInfo(true); @@ -384,14 +605,24 @@ void GPUKernelRuntime::ClearKernelWorkspaceAddress(const session::KernelGraph *g } } -bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bool mock, bool profiling) { +bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger, bool mock, + bool profiling) { MS_EXCEPTION_IF_NULL(graph); MS_EXCEPTION_IF_NULL(mem_reuse_util_); // Reset the reference count. mem_reuse_util_->ResetDynamicUsedRefCount(); // The inputs and outputs memory of communication kernel need be continuous, so separate processing. 
AllocCommunicationOpDynamicRes(graph); + +#ifdef ENABLE_DEBUGGER + bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration(); + if (!mock) { + // collect weights and bias + LoadParameters(graph, debugger, dump_enabled); + } +#endif auto &kernels = graph->execution_order(); + int exec_order = 1; for (const auto &kernel : kernels) { auto kernel_mod = AnfAlgo::GetKernelMod(kernel); MS_EXCEPTION_IF_NULL(kernel_mod); @@ -400,6 +631,12 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo AddressPtrList kernel_outputs; auto ret = AllocKernelDynamicRes(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs, mock); if (!ret) { +#ifdef ENABLE_DEBUGGER + if (!mock) { + // invalidate current data collected by the debugger + ClearCurrentData(debugger, dump_enabled); + } +#endif return false; } if (!mock) { @@ -409,9 +646,21 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo } else { LaunchKernelWithTimeProfiling(kernel, kernel_inputs, kernel_workspaces, kernel_outputs); } +#ifdef ENABLE_DEBUGGER + // called once per kernel to collect the outputs to the kernel (does a SyncDeviceToHost) + LoadKernelData(debugger, kernel, kernel_inputs, kernel_workspaces, kernel_outputs, exec_order, stream_, + dump_enabled); +#endif } + exec_order = exec_order + 1; FreeKernelDynamicRes(kernel); if (!UpdateMemorySwapTask(kernel, mock, profiling)) { +#ifdef ENABLE_DEBUGGER + if (!mock) { + // invalidate current data collected by the debugger + ClearCurrentData(debugger, dump_enabled); + } +#endif return false; } } diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h index 9a210c8e772..8f3cb9cb252 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h @@ -38,7 +38,10 @@ class GPUKernelRuntime : public KernelRuntime { bool Init() override; void ReleaseDeviceRes() 
override; void AssignMemory(session::KernelGraph *graph) override; - bool Run(session::KernelGraph *graph) override; + bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) override; +#ifdef ENABLE_DUMP_E2E + bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr) override; +#endif protected: DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, @@ -61,10 +64,11 @@ class GPUKernelRuntime : public KernelRuntime { void ClearKernelOutputAddress(const session::KernelGraph *graph); void ClearKernelWorkspaceAddress(const session::KernelGraph *graph); void ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph); - bool RunOneStep(const session::KernelGraph *graph); - bool SearchMemSwapScheme(const session::KernelGraph *graph); - bool RefineMemSwapScheme(const session::KernelGraph *graph); - bool LaunchKernelDynamic(const session::KernelGraph *graph, bool mock = false, bool profiling = false); + bool RunOneStep(const session::KernelGraph *graph, Debugger *debugger = nullptr); + bool SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr); + bool RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr); + bool LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger = nullptr, bool mock = false, + bool profiling = false); void LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList &inputs, const AddressPtrList &workspace, const AddressPtrList &outputs); bool AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock); diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.cc b/mindspore/ccsrc/runtime/device/kernel_runtime.cc index 6571bc22d69..28c155cb5ae 100644 --- a/mindspore/ccsrc/runtime/device/kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/kernel_runtime.cc @@ -41,7 +41,7 @@ KernelRuntime::~KernelRuntime() { #endif } -bool 
KernelRuntime::Run(session::KernelGraph *graph) { +bool KernelRuntime::Run(session::KernelGraph *graph, Debugger *debugger) { bool ret = false; auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); @@ -72,7 +72,7 @@ bool KernelRuntime::Run(session::KernelGraph *graph) { } // for D to impl -bool KernelRuntime::DumpData(mindspore::session::KernelGraph *graph) { +bool KernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *debugger) { if (graph != nullptr) { return true; } @@ -190,6 +190,39 @@ void KernelRuntime::RunOpClearMemory(const session::KernelGraph *graph) { } } +bool KernelRuntime::DumpDataEnabled() { + bool ret = false; +#ifdef ENABLE_DUMP_E2E + DumpConfPtr dump_conf = GetDumpConf(); + MS_EXCEPTION_IF_NULL(dump_conf); + bool dump_flag = dump_conf->dump_enable(); + if (!dump_flag) { + return ret; + } + ret = true; +#endif + return ret; +} + +bool KernelRuntime::DumpDataEnabledIteration() { + bool ret = false; +#ifdef ENABLE_DUMP_E2E + if (!DumpDataEnabled()) { + return ret; + } + DumpConfPtr dump_conf = GetDumpConf(); + MS_EXCEPTION_IF_NULL(dump_conf); + uint32_t cur_iter = dump_conf->cur_iter() + 1; + if (dump_conf->dump_iter() != 0) { + if (cur_iter != dump_conf->dump_iter()) { + return ret; + } + } + ret = true; +#endif + return ret; +} + void KernelRuntime::AssignStaticMemory(session::KernelGraph *graph) { AssignStaticMemoryInput(graph); AssignStaticMemoryValueNode(graph); diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.h b/mindspore/ccsrc/runtime/device/kernel_runtime.h index 3b771b0090d..e56c80bca08 100644 --- a/mindspore/ccsrc/runtime/device/kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/kernel_runtime.h @@ -55,8 +55,10 @@ class KernelRuntime { virtual void AssignMemory(session::KernelGraph *graph); void RunOpAssignMemory(const std::vector &input_tensors, session::KernelGraph *graph); void RunOpClearMemory(const session::KernelGraph *graph); - virtual bool Run(session::KernelGraph *graph); - 
virtual bool DumpData(session::KernelGraph *graph); + bool DumpDataEnabled(); + bool DumpDataEnabledIteration(); + virtual bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr); + virtual bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr); virtual bool LoadData(session::KernelGraph *graph, Debugger *debugger); virtual bool RunTask(const session::KernelGraph *graph); virtual bool GenTask(const session::KernelGraph *graph);