From e8d9803258ed7491d633f4b70fbc0152ffd50fa4 Mon Sep 17 00:00:00 2001 From: lvchangquan Date: Wed, 4 Aug 2021 10:22:39 +0800 Subject: [PATCH] add profiling codes based on ascend and gpu in pynative mode --- .../pipeline/pynative/pynative_execute.cc | 9 +++ .../profiler/device/gpu/gpu_profiling.cc | 9 +++ .../ccsrc/profiler/device/gpu/gpu_profiling.h | 1 + mindspore/ccsrc/profiler/device/profiling.h | 5 ++ mindspore/ccsrc/runtime/device/CMakeLists.txt | 2 +- .../runtime/device/ascend/ascend_event.cc | 27 +++++++++ .../runtime/device/ascend/ascend_event.h | 11 +++- .../device/ascend/ascend_kernel_runtime.cc | 13 +++++ .../device/ascend/ascend_kernel_runtime.h | 1 + .../ccsrc/runtime/device/gpu/gpu_event.cc | 14 +++++ .../ccsrc/runtime/device/gpu/gpu_event.h | 2 + .../ccsrc/runtime/device/kernel_runtime.cc | 45 ++++++++++++++- .../ccsrc/runtime/device/kernel_runtime.h | 6 ++ .../runtime/device/pynative_profiling.cc | 49 ++++++++++++++++ .../ccsrc/runtime/device/pynative_profiling.h | 56 +++++++++++++++++++ .../hardware/gpu/gpu_device_context.cc | 7 +++ mindspore/ccsrc/utils/utils.h | 2 + mindspore/core/ir/device_event.h | 2 + tests/ut/cpp/stub/runtime/runtime_stub.cc | 4 ++ 19 files changed, 261 insertions(+), 4 deletions(-) create mode 100644 mindspore/ccsrc/runtime/device/pynative_profiling.cc create mode 100644 mindspore/ccsrc/runtime/device/pynative_profiling.h diff --git a/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc b/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc index 8ffa2642bc6..127a25855a5 100644 --- a/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc +++ b/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc @@ -32,6 +32,7 @@ #include "ir/tensor.h" #include "utils/any.h" #include "utils/utils.h" +#include "utils/profile.h" #include "utils/ms_context.h" #include "utils/check_convert_utils.h" #include "utils/context/context_extends.h" @@ -67,6 +68,7 @@ #include "debug/anf_ir_dump.h" #include "runtime/hardware/device_context_manager.h" +#include "runtime/device/pynative_profiling.h" using mindspore::tensor::TensorPy; @@ -702,6 +704,9 @@ py::object GetDstType(const TypeId &type_id) { } // namespace py::object RealRunOp(const py::args &args) { + auto real_run_op_start = GetTime(); + auto &profiler_inst = device::PynativeProfiler::GetInstance(); + profiler_inst.AddRealRunOpIndex(); CheckPyNativeContext(); auto executor = PynativeExecutor::GetInstance(); MS_EXCEPTION_IF_NULL(executor); @@ -709,6 +714,10 @@ py::object RealRunOp(const py::args &args) { MS_EXCEPTION_IF_NULL(op_exec_info); py::object ret = py::none(); PynativeExecutorTry(executor->forward_executor()->RunOpS, &ret, op_exec_info); + auto real_run_op_end = GetTime(); + profiler_inst.SetRealRunOpName(op_exec_info->op_name); + profiler_inst.SetRealRunOpTime(std::make_pair(real_run_op_start, real_run_op_end)); + profiler_inst.SingleOpProfilingData(); return ret; } diff --git a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc index 3553e18bf01..23b0a78bed6 100644 --- a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc +++ b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc @@ -25,6 +25,7 @@ #include "pybind_api/api_register.h" #include "utils/log_adapter.h" #include "utils/utils.h" +#include "utils/profile.h" #include "utils/ms_context.h" namespace mindspore { @@ -446,6 +447,12 @@ void GPUProfiler::OpDataProducerBegin(const std::string op_name, void *stream) { } } +void GPUProfiler::SingleOpLaunchTimeProcess(float op_time_elapsed) { + auto launch_end_time = GetTime(); + double launch_start_time = launch_end_time - op_time_elapsed / kTimeUnit / kTimeUnit; + SetSingleOpLaunchTime(std::make_pair(launch_start_time, launch_end_time)); +} + void GPUProfiler::OpDataProducerEnd() { float op_time_elapsed = 0; if (sync_enable_flag_) { @@ -459,9 +466,11 @@ void GPUProfiler::OpDataProducerEnd() { CHECK_CUDA_RET_WITH_ERROR(cudaEventDestroy(op_event_stop_), "cudaEventDestroy op event stop failed"); op_time_elapsed = op_time_elapsed * kTimeUnit; op_host_time_stop_ = GetHostTimeStamp(); + SingleOpLaunchTimeProcess(op_time_elapsed); } else { op_host_time_stop_ = GetHostTimeStamp(); op_time_elapsed = (op_host_time_stop_ - op_host_time_start_) / kTimeUnit; + SingleOpLaunchTimeProcess(op_time_elapsed); } MS_LOG(DEBUG) << "Host Time Elapsed(us)," << op_name_ << "," << op_time_elapsed; Profiler::SetRunTimeData(op_name_, op_time_elapsed); diff --git a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h index 17fdd71b93a..ae79e59ebe3 100644 --- a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h +++ b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h @@ -135,6 +135,7 @@ class GPUProfiler : public Profiler { std::string ProfileDataPath() const { return profile_data_path_; } private: + void SingleOpLaunchTimeProcess(float op_time_elapsed); void OpsParser(); void EventLog(const Event &event); void ClearInst() override; diff --git a/mindspore/ccsrc/profiler/device/profiling.h b/mindspore/ccsrc/profiler/device/profiling.h index 6b3dd23676d..95318569ea0 100644 --- a/mindspore/ccsrc/profiler/device/profiling.h +++ b/mindspore/ccsrc/profiler/device/profiling.h @@ -79,6 +79,10 @@ class Profiler { bool GetEnableFlag() const { return enable_flag_; } std::string ProfileDataPath() const { return profile_data_path_; } void RecordOneStepStartEndInfo(std::string op_name); + std::pair GetSingleOpLaunchTime() { return single_op_launch_start_time_end_time_; } + void SetSingleOpLaunchTime(const std::pair &launch_start_end) { + single_op_launch_start_time_end_time_ = launch_start_end; + } protected: void SetRunTimeData(const std::string &op_name, const float time_elapsed); @@ -86,6 +90,7 @@ class Profiler { uint64_t GetHostMonoTimeStamp() const; virtual void SaveProfileData() = 0; virtual void ClearInst() = 0; + std::pair single_op_launch_start_time_end_time_; bool enable_flag_ = false; std::string profile_data_path_; std::unordered_map op_info_map_; diff --git a/mindspore/ccsrc/runtime/device/CMakeLists.txt b/mindspore/ccsrc/runtime/device/CMakeLists.txt index 56cd9fe6275..19a7b9f90cf 100644 --- a/mindspore/ccsrc/runtime/device/CMakeLists.txt +++ b/mindspore/ccsrc/runtime/device/CMakeLists.txt @@ -1,7 +1,7 @@ file(GLOB_RECURSE DEVICE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "common/*.cc" "kernel_info.cc" "executor/dynamic_kernel.cc" "executor/executor_callback.cc" "kernel_runtime.cc" "memory_manager.cc" "kernel_runtime_manager.cc" "convert_tensor_utils.cc" - "bucket.cc" "launch_kernel.cc" "launch_mul.cc" + "bucket.cc" "launch_kernel.cc" "launch_mul.cc" "pynative_profiling.cc" ) if("${ENABLE_HIDDEN}" STREQUAL "OFF") diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_event.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_event.cc index eb063c54c6a..9e98291fd24 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_event.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_event.cc @@ -29,6 +29,14 @@ AscendEvent::AscendEvent() { } } +AscendTimeEvent::AscendTimeEvent() { + auto ret = rtEventCreateWithFlag(&event_, RT_EVENT_TIME_LINE); + if (ret != RT_ERROR_NONE) { + MS_LOG(ERROR) << "rtEventCreate failed, ret:" << ret; + event_ = nullptr; + } +} + AscendEvent::~AscendEvent() { auto ret = rtEventDestroy(event_); if (ret != RT_ERROR_NONE) { @@ -60,5 +68,24 @@ void AscendEvent::WaitEvent() { need_wait_ = false; } +void AscendEvent::SyncEvent() { + MS_EXCEPTION_IF_NULL(event_); + auto ret = rtEventSynchronize(event_); + if (ret != RT_ERROR_NONE) { + MS_LOG(EXCEPTION) << "rtEventSynchronize failed, ret:" << ret; + } +} + +void AscendEvent::ElapsedTime(float *cost_time, DeviceEvent *other) { + MS_EXCEPTION_IF_NULL(event_); + auto ascend_other = static_cast(other); + MS_EXCEPTION_IF_NULL(ascend_other); + MS_EXCEPTION_IF_NULL(ascend_other->event_); + auto ret = rtEventElapsedTime(cost_time, event_, ascend_other->event_); + if (ret != RT_ERROR_NONE) { + MS_LOG(EXCEPTION) << "rtEventElapsedTime failed, ret:" << ret; + } +} + bool AscendEvent::NeedWait() { return need_wait_; } } // namespace mindspore::device::ascend diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_event.h b/mindspore/ccsrc/runtime/device/ascend/ascend_event.h index 059390e8c92..358752cec87 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_event.h +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_event.h @@ -19,6 +19,7 @@ #include "runtime/base.h" #include "ir/device_event.h" + namespace mindspore::device::ascend { class AscendEvent : public DeviceEvent { public: @@ -28,14 +29,22 @@ class AscendEvent : public DeviceEvent { void WaitEvent() override; void RecordEvent() override; bool NeedWait() override; + void SyncEvent() override; + void ElapsedTime(float *cost_time, DeviceEvent *other) override; void set_wait_stream(rtStream_t wait_stream) override { wait_stream_ = wait_stream; } void set_record_stream(rtStream_t record_stream) override { record_stream_ = record_stream; } - private: + protected: rtEvent_t event_{nullptr}; rtStream_t wait_stream_{nullptr}; rtStream_t record_stream_{nullptr}; bool need_wait_{false}; }; + +class AscendTimeEvent : public AscendEvent { + public: + AscendTimeEvent(); + ~AscendTimeEvent() override = default; +}; } // namespace mindspore::device::ascend #endif // MINDSPORE_ASCEND_EVENT_H diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc index 6dad9375810..db031af8bc0 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc @@ -284,6 +284,13 @@ void AscendKernelRuntime::PreInit() { } bool AscendKernelRuntime::Init() { + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + auto execution_mode = ms_context->get_param(MS_CTX_EXECUTION_MODE); + auto profiling_flag = ms_context->get_param(MS_CTX_ENABLE_PROFILING); + if (execution_mode == kPynativeMode && profiling_flag) { + pynative_mode_profiling_flag_ = true; + } if (initialized_) { SetCurrentContext(); return true; @@ -948,6 +955,12 @@ std::shared_ptr AscendKernelRuntime::CreateDeviceEvent() { return ascend_event; } +std::shared_ptr AscendKernelRuntime::CreateDeviceTimeEvent() { + auto ascend_time_event = std::make_shared(); + MS_EXCEPTION_IF_NULL(ascend_time_event); + return ascend_time_event; +} + uint64_t AscendKernelRuntime::GetAvailableMemMaxSize() const { auto ascend_mem_manager = std::dynamic_pointer_cast(mem_manager_); MS_EXCEPTION_IF_NULL(ascend_mem_manager); diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h index 7e24cde8153..ebcc5aed16d 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h @@ -60,6 +60,7 @@ class AscendKernelRuntime : public KernelRuntime { uint64_t GetAvailableMemMaxSize() const override; DeviceAddressType GetTargetDeviceAddressType() const override { return DeviceAddressType::kAscend; }; std::shared_ptr CreateDeviceEvent() override; + std::shared_ptr CreateDeviceTimeEvent() override; void *compute_stream() const override { return stream_; } void *communication_stream() const override { return communication_stream_; } diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_event.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_event.cc index 41ead402a70..d0da14d2e4a 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_event.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_event.cc @@ -42,5 +42,19 @@ void GpuEvent::RecordEvent() { need_wait_ = true; } +void GpuEvent::SyncEvent() { + MS_EXCEPTION_IF_NULL(event_); + CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(cudaEventSynchronize(event_), "cudaEventSynchronize failed"); +} + +void GpuEvent::ElapsedTime(float *cost_time, DeviceEvent *other) { + MS_EXCEPTION_IF_NULL(event_); + auto gpu_event = static_cast(other); + MS_EXCEPTION_IF_NULL(gpu_event); + MS_EXCEPTION_IF_NULL(gpu_event->event_); + CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(cudaEventElapsedTime(cost_time, event_, gpu_event->event_), + "cudaEventElapsedTime failed"); +} + bool GpuEvent::NeedWait() { return need_wait_; } } // namespace mindspore::device::gpu diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_event.h b/mindspore/ccsrc/runtime/device/gpu/gpu_event.h index a5cd50e0be0..443f689054e 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_event.h +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_event.h @@ -29,6 +29,8 @@ class GpuEvent : public DeviceEvent { void WaitEvent() override; void RecordEvent() override; bool NeedWait() override; + void SyncEvent() override; + void ElapsedTime(float *cost_time, DeviceEvent *other) override; void set_wait_stream(void *wait_stream) override { wait_stream_ = static_cast(wait_stream); } void set_record_stream(void *record_stream) override { record_stream_ = static_cast(record_stream); } diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.cc b/mindspore/ccsrc/runtime/device/kernel_runtime.cc index 8b1343d43c8..66d8bb0a094 100644 --- a/mindspore/ccsrc/runtime/device/kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/kernel_runtime.cc @@ -32,6 +32,7 @@ #include "utils/utils.h" #include "frontend/parallel/context.h" #include "debug/env_config_parser.h" +#include "runtime/device/pynative_profiling.h" #if ((defined ENABLE_CPU) && (!defined _WIN32)) #include "ps/ps_cache/ps_cache_manager.h" #endif @@ -966,6 +967,36 @@ void KernelRuntime::LaunchKernelEvent(const std::vector &inputs, + const std::vector &workspace, + const std::vector &outputs, void *stream) { + MS_EXCEPTION_IF_NULL(kernel_mod); + MS_EXCEPTION_IF_NULL(stream); + float cost_time = 0; + auto start = CreateDeviceTimeEvent(); + auto end = CreateDeviceTimeEvent(); + MS_EXCEPTION_IF_NULL(start); + MS_EXCEPTION_IF_NULL(end); + start->set_record_stream(stream); + end->set_record_stream(stream); + start->RecordEvent(); + bool ret = kernel_mod->Launch(inputs, workspace, outputs, stream); + end->RecordEvent(); + start->SyncEvent(); + end->SyncEvent(); + start->ElapsedTime(&cost_time, end.get()); + auto launch_end_time = GetTime(); + auto &profiler_inst = PynativeProfiler::GetInstance(); + double launch_start_time = launch_end_time - cost_time / kBasicTimeTransferUnit; + auto op_launch_start_time_end_time = std::make_pair(launch_start_time, launch_end_time); + profiler_inst.SetOpNameAndLaunchTime(std::make_pair(op_name, op_launch_start_time_end_time)); + if (!ret) { + MS_LOG(EXCEPTION) << "Launch kernel failed, kernel name is : " << op_name; + } + return ret; +} + bool KernelRuntime::LaunchKernelMod(const session::KernelGraph &graph) { const auto &kernels = graph.execution_order(); std::vector dynamic_kernel_list; @@ -1020,9 +1051,19 @@ bool KernelRuntime::LaunchKernelMod(const session::KernelGraph &graph) { GenLaunchArgs(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs); bool ret; if (AnfAlgo::IsCommunicationOp(kernel)) { - ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, communication_stream_); + if (pynative_mode_profiling_flag_) { + ret = LaunchKernelWithPynativeProfiling(kernel_mod, kernel->fullname_with_scope(), kernel_inputs, + kernel_workspaces, kernel_outputs, communication_stream_); + } else { + ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, communication_stream_); + } } else { - ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_); + if (pynative_mode_profiling_flag_) { + ret = LaunchKernelWithPynativeProfiling(kernel_mod, kernel->fullname_with_scope(), kernel_inputs, + kernel_workspaces, kernel_outputs, stream_); + } else { + ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_); + } } if (!ret) { MS_LOG(ERROR) << "Launch kernel failed."; diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.h b/mindspore/ccsrc/runtime/device/kernel_runtime.h index d3c7d2b1d1e..30ab39b1c5a 100644 --- a/mindspore/ccsrc/runtime/device/kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/kernel_runtime.h @@ -103,6 +103,7 @@ class KernelRuntime { virtual uint64_t GetAvailableMemMaxSize() const { return 0; } void GenKernelEvents(const session::KernelGraph *graph); virtual std::shared_ptr CreateDeviceEvent() { return nullptr; } + virtual std::shared_ptr CreateDeviceTimeEvent() { return nullptr; } virtual DeviceAddressType GetTargetDeviceAddressType() const = 0; virtual void *compute_stream() const { return nullptr; } virtual void *communication_stream() const { return nullptr; } @@ -139,6 +140,10 @@ class KernelRuntime { void RunOpAssignOutputNodeMemory(const ValuePtr &pre_output_value, session::KernelGraph *graph); void AssignValueNodeTensor(const ValueNodePtr &value_node, const ValuePtr &node_value, size_t output_idx); DeviceAddressPtr PreAssignCNodeMemory(const AnfNodePtr &anf_node, size_t index); + bool LaunchKernelWithPynativeProfiling(kernel::KernelMod *kernel_mod, const std::string &op_name, + const std::vector &inputs, + const std::vector &workspace, + const std::vector &outputs, void *stream); #if (ENABLE_CPU && !_WIN32) void GetFirstPSEmbeddingCache(const session::KernelGraph *graph, AnfNodePtr *const first_cache_input_index, size_t *const first_cache_size); @@ -148,6 +153,7 @@ class KernelRuntime { protected: uint32_t device_id_{0}; + bool pynative_mode_profiling_flag_{false}; #if !defined(_WIN32) && !defined(_WIN64) std::shared_ptr debugger_; #endif diff --git a/mindspore/ccsrc/runtime/device/pynative_profiling.cc b/mindspore/ccsrc/runtime/device/pynative_profiling.cc new file mode 100644 index 00000000000..9e16be07ce1 --- /dev/null +++ b/mindspore/ccsrc/runtime/device/pynative_profiling.cc @@ -0,0 +1,49 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "runtime/device/pynative_profiling.h" +#include +#include +#include +#include +#include +#include "utils/profile.h" +#include "utils/utils.h" + +namespace mindspore { +namespace device { +void PynativeProfiler::SingleOpProfilingData() { + static std::ofstream of("pynative_forward_profiling_data.csv"); + of.setf(std::ios::fixed, std::ios::floatfield); + if (real_run_op_index_ == 1) { + of << "RealRunOpIndex" << ',' << "RealRunOpName" << ',' << "OpName" << ',' << "RealRunOpStartTime(s)" << ',' + << "OpDeviceStartTime(s)" << ',' << "OpDeviceEndTime(s)" << ',' << "RealRunOpEndTime(s)" << std::endl; + } + if (op_name_launch_start_time_end_time_vec_.empty()) { + of << real_run_op_index_ << ',' << real_run_op_name_ << ',' << "nopnode" << ',' + << real_run_op_start_time_end_time_.first << ',' << "nopnode" << ',' << "nopnode" << ',' + << real_run_op_start_time_end_time_.second << std::endl; + return; + } + for (const auto &i : op_name_launch_start_time_end_time_vec_) { + of << real_run_op_index_ << ',' << real_run_op_name_ << ',' << i.first << ',' + << real_run_op_start_time_end_time_.first << ',' << i.second.first << ',' << i.second.second << ',' + << real_run_op_start_time_end_time_.second << std::endl; + } + op_name_launch_start_time_end_time_vec_.clear(); +} +} // namespace device +} // namespace mindspore diff --git a/mindspore/ccsrc/runtime/device/pynative_profiling.h b/mindspore/ccsrc/runtime/device/pynative_profiling.h new file mode 100644 index 00000000000..6a910a6433d --- /dev/null +++ b/mindspore/ccsrc/runtime/device/pynative_profiling.h @@ -0,0 +1,56 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_PYNATIVE_PROFILING_H_ +#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_PYNATIVE_PROFILING_H_ + +#include +#include +#include +#include +#include + +namespace mindspore { +namespace device { +class PynativeProfiler { + public: + static PynativeProfiler &GetInstance() { + static PynativeProfiler instance; + return instance; + } + + void AddRealRunOpIndex() { ++real_run_op_index_; } + void SetRealRunOpName(const std::string &name) { real_run_op_name_ = name; } + void SetRealRunOpTime(const std::pair &start_end) { real_run_op_start_time_end_time_ = start_end; } + void SetOpNameAndLaunchTime(const std::pair> &name_start_end) { + op_name_launch_start_time_end_time_vec_.push_back(name_start_end); + } + void SingleOpProfilingData(); + + private: + PynativeProfiler() = default; + ~PynativeProfiler() = default; + PynativeProfiler(const PynativeProfiler &) = delete; + PynativeProfiler &operator=(const PynativeProfiler &) = delete; + int real_run_op_index_ = 0; + std::string real_run_op_name_; + std::pair real_run_op_start_time_end_time_; + std::vector>> op_name_launch_start_time_end_time_vec_; +}; +} // namespace device +} // namespace mindspore + +#endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_PYNATIVE_PROFILING_H_ diff --git a/mindspore/ccsrc/runtime/hardware/gpu/gpu_device_context.cc b/mindspore/ccsrc/runtime/hardware/gpu/gpu_device_context.cc index fa92a5aac3f..167c341108c 100644 --- a/mindspore/ccsrc/runtime/hardware/gpu/gpu_device_context.cc +++ b/mindspore/ccsrc/runtime/hardware/gpu/gpu_device_context.cc @@ -16,6 +16,8 @@ #include "runtime/hardware/gpu/gpu_device_context.h" #include +#include +#include "runtime/device/pynative_profiling.h" #include "runtime/device/gpu/kernel_info_setter.h" #include "runtime/device/gpu/gpu_kernel_build.h" #include "runtime/device/gpu/gpu_device_address.h" @@ -432,6 +434,11 @@ bool GPUDeviceContext::LaunchKernelWithProfiling(const CNodePtr &kernel, const s bool ret = DoLaunchKernel(kernel_mod, inputs, workspace, outputs); profiler_inst->OpDataProducerEnd(); + auto op_launch_start_end_time = profiler_inst->GetSingleOpLaunchTime(); + auto &pynative_profiler = PynativeProfiler::GetInstance(); + std::string op_name = kernel->fullname_with_scope(); + pynative_profiler.SetOpNameAndLaunchTime(std::make_pair(op_name, op_launch_start_end_time)); + if (profiler_inst->GetSyncEnableFlag()) { CHECK_RET_WITH_RETURN_ERROR(SyncStream(), "Profiler SyncStream failed."); } diff --git a/mindspore/ccsrc/utils/utils.h b/mindspore/ccsrc/utils/utils.h index a4983b310a4..4c576e2624b 100644 --- a/mindspore/ccsrc/utils/utils.h +++ b/mindspore/ccsrc/utils/utils.h @@ -507,6 +507,8 @@ constexpr auto kUpdateStateRealInput = 2; // index define of Load constexpr auto kLoadRealInput = 1; constexpr auto kLoadStateInput = 2; +// time transfer unit +constexpr int kBasicTimeTransferUnit = 1000; // index of input or output enum Index : size_t { kIndex0 = 0, diff --git a/mindspore/core/ir/device_event.h b/mindspore/core/ir/device_event.h index 8309d2b4e37..5c855bbf3a4 100644 --- a/mindspore/core/ir/device_event.h +++ b/mindspore/core/ir/device_event.h @@ -24,6 +24,8 @@ class DeviceEvent { virtual void WaitEvent() = 0; virtual void RecordEvent() = 0; virtual bool NeedWait() = 0; + virtual void SyncEvent() = 0; + virtual void ElapsedTime(float *cost_time, DeviceEvent *other) = 0; virtual void set_wait_stream(void *stream) = 0; virtual void set_record_stream(void *stream) = 0; }; diff --git a/tests/ut/cpp/stub/runtime/runtime_stub.cc b/tests/ut/cpp/stub/runtime/runtime_stub.cc index 0682ce3e7f8..5579a495b74 100644 --- a/tests/ut/cpp/stub/runtime/runtime_stub.cc +++ b/tests/ut/cpp/stub/runtime/runtime_stub.cc @@ -25,6 +25,10 @@ rtError_t rtEventSynchronize(rtEvent_t event) { return RT_ERROR_NONE; } +rtError_t rtEventCreateWithFlag(rtEvent_t *event, uint32_t flag) { return RT_ERROR_NONE; } + +rtError_t rtEventElapsedTime(float *time, rtEvent_t start, rtEvent_t end) { return RT_ERROR_NONE; } + rtError_t rtMalloc(void **devPtr, uint64_t size, rtMemType_t type) { return RT_ERROR_NONE; } rtError_t rtMemcpy(void *dst, uint64_t destMax, const void *src, uint64_t count, rtMemcpyKind_t kind) {