!21362 Add PyNative profiling code for Ascend and GPU back ends

Merge pull request !21362 from lvchangquan/profiling_refactor
This commit is contained in:
i-robot 2021-08-13 06:18:31 +00:00 committed by Gitee
commit 6afcd815d2
19 changed files with 261 additions and 4 deletions

View File

@ -32,6 +32,7 @@
#include "ir/tensor.h"
#include "utils/any.h"
#include "utils/utils.h"
#include "utils/profile.h"
#include "utils/ms_context.h"
#include "utils/check_convert_utils.h"
#include "utils/context/context_extends.h"
@ -67,6 +68,7 @@
#include "debug/anf_ir_dump.h"
#include "runtime/hardware/device_context_manager.h"
#include "runtime/device/pynative_profiling.h"
using mindspore::tensor::TensorPy;
@ -702,6 +704,9 @@ py::object GetDstType(const TypeId &type_id) {
} // namespace
// Entry point for executing a single op in PyNative mode. Wraps the real
// execution with host-side timing and reports the data to PynativeProfiler.
py::object RealRunOp(const py::args &args) {
  // Timestamp taken before any dispatch work so the profiled window covers
  // the whole RealRunOp call.
  auto real_run_op_start = GetTime();
  auto &profiler_inst = device::PynativeProfiler::GetInstance();
  profiler_inst.AddRealRunOpIndex();
  CheckPyNativeContext();
  auto executor = PynativeExecutor::GetInstance();
  MS_EXCEPTION_IF_NULL(executor);
@ -709,6 +714,10 @@ py::object RealRunOp(const py::args &args) {
  MS_EXCEPTION_IF_NULL(op_exec_info);
  py::object ret = py::none();
  PynativeExecutorTry(executor->forward_executor()->RunOpS, &ret, op_exec_info);
  // Record the host-side window and op name, then flush one profiling record
  // for this op (writes a CSV row; see PynativeProfiler::SingleOpProfilingData).
  auto real_run_op_end = GetTime();
  profiler_inst.SetRealRunOpName(op_exec_info->op_name);
  profiler_inst.SetRealRunOpTime(std::make_pair(real_run_op_start, real_run_op_end));
  profiler_inst.SingleOpProfilingData();
  return ret;
}

View File

@ -25,6 +25,7 @@
#include "pybind_api/api_register.h"
#include "utils/log_adapter.h"
#include "utils/utils.h"
#include "utils/profile.h"
#include "utils/ms_context.h"
namespace mindspore {
@ -446,6 +447,12 @@ void GPUProfiler::OpDataProducerBegin(const std::string op_name, void *stream) {
}
}
// Record the (start, end) host-clock window of the last kernel launch.
// op_time_elapsed is the measured device duration; dividing by kTimeUnit twice
// converts it to seconds (presumably us -> s given kTimeUnit == 1000 — confirm),
// and the start is back-computed from the end-of-launch timestamp.
void GPUProfiler::SingleOpLaunchTimeProcess(float op_time_elapsed) {
  const auto end_ts = GetTime();
  const double start_ts = end_ts - op_time_elapsed / kTimeUnit / kTimeUnit;
  SetSingleOpLaunchTime(std::make_pair(start_ts, end_ts));
}
void GPUProfiler::OpDataProducerEnd() {
float op_time_elapsed = 0;
if (sync_enable_flag_) {
@ -459,9 +466,11 @@ void GPUProfiler::OpDataProducerEnd() {
CHECK_CUDA_RET_WITH_ERROR(cudaEventDestroy(op_event_stop_), "cudaEventDestroy op event stop failed");
op_time_elapsed = op_time_elapsed * kTimeUnit;
op_host_time_stop_ = GetHostTimeStamp();
SingleOpLaunchTimeProcess(op_time_elapsed);
} else {
op_host_time_stop_ = GetHostTimeStamp();
op_time_elapsed = (op_host_time_stop_ - op_host_time_start_) / kTimeUnit;
SingleOpLaunchTimeProcess(op_time_elapsed);
}
MS_LOG(DEBUG) << "Host Time Elapsed(us)," << op_name_ << "," << op_time_elapsed;
Profiler::SetRunTimeData(op_name_, op_time_elapsed);

View File

@ -135,6 +135,7 @@ class GPUProfiler : public Profiler {
std::string ProfileDataPath() const { return profile_data_path_; }
private:
void SingleOpLaunchTimeProcess(float op_time_elapsed);
void OpsParser();
void EventLog(const Event &event);
void ClearInst() override;

View File

@ -79,6 +79,10 @@ class Profiler {
bool GetEnableFlag() const { return enable_flag_; }
std::string ProfileDataPath() const { return profile_data_path_; }
void RecordOneStepStartEndInfo(std::string op_name);
std::pair<double, double> GetSingleOpLaunchTime() { return single_op_launch_start_time_end_time_; }
void SetSingleOpLaunchTime(const std::pair<double, double> &launch_start_end) {
single_op_launch_start_time_end_time_ = launch_start_end;
}
protected:
void SetRunTimeData(const std::string &op_name, const float time_elapsed);
@ -86,6 +90,7 @@ class Profiler {
uint64_t GetHostMonoTimeStamp() const;
virtual void SaveProfileData() = 0;
virtual void ClearInst() = 0;
std::pair<double, double> single_op_launch_start_time_end_time_;
bool enable_flag_ = false;
std::string profile_data_path_;
std::unordered_map<std::string, OpInfo> op_info_map_;

View File

@ -1,7 +1,7 @@
# Source files compiled into the common device runtime library;
# pynative_profiling.cc is included for PyNative single-op profiling.
file(GLOB_RECURSE DEVICE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "common/*.cc"
    "kernel_info.cc" "executor/dynamic_kernel.cc" "executor/executor_callback.cc" "kernel_runtime.cc"
    "memory_manager.cc" "kernel_runtime_manager.cc" "convert_tensor_utils.cc"
    "bucket.cc" "launch_kernel.cc" "launch_mul.cc" "pynative_profiling.cc"
)
if("${ENABLE_HIDDEN}" STREQUAL "OFF")

View File

@ -29,6 +29,14 @@ AscendEvent::AscendEvent() {
}
}
// Create a timeline-capable runtime event (RT_EVENT_TIME_LINE) so that
// rtEventElapsedTime can later measure the interval between two events.
AscendTimeEvent::AscendTimeEvent() {
  auto ret = rtEventCreateWithFlag(&event_, RT_EVENT_TIME_LINE);
  if (ret != RT_ERROR_NONE) {
    // Name the actual failing API in the log (was "rtEventCreate", which is a
    // different call) and leave event_ null so users can detect the failure.
    MS_LOG(ERROR) << "rtEventCreateWithFlag failed, ret:" << ret;
    event_ = nullptr;
  }
}
AscendEvent::~AscendEvent() {
auto ret = rtEventDestroy(event_);
if (ret != RT_ERROR_NONE) {
@ -60,5 +68,24 @@ void AscendEvent::WaitEvent() {
need_wait_ = false;
}
// Block the host thread until the recorded event completes on the device.
void AscendEvent::SyncEvent() {
  MS_EXCEPTION_IF_NULL(event_);
  const auto status = rtEventSynchronize(event_);
  if (status == RT_ERROR_NONE) {
    return;
  }
  MS_LOG(EXCEPTION) << "rtEventSynchronize failed, ret:" << status;
}
// Write to *cost_time the device-side time elapsed between this event and
// `other` (both must already have been recorded on a stream).
void AscendEvent::ElapsedTime(float *cost_time, DeviceEvent *other) {
  MS_EXCEPTION_IF_NULL(event_);
  auto *end_event = static_cast<AscendEvent *>(other);
  MS_EXCEPTION_IF_NULL(end_event);
  MS_EXCEPTION_IF_NULL(end_event->event_);
  const auto status = rtEventElapsedTime(cost_time, event_, end_event->event_);
  if (status != RT_ERROR_NONE) {
    MS_LOG(EXCEPTION) << "rtEventElapsedTime failed, ret:" << status;
  }
}
// True while a recorded event has not yet been waited on.
bool AscendEvent::NeedWait() {
  return need_wait_;
}
} // namespace mindspore::device::ascend

View File

@ -19,6 +19,7 @@
#include "runtime/base.h"
#include "ir/device_event.h"
namespace mindspore::device::ascend {
class AscendEvent : public DeviceEvent {
public:
@ -28,14 +29,22 @@ class AscendEvent : public DeviceEvent {
void WaitEvent() override;
void RecordEvent() override;
bool NeedWait() override;
void SyncEvent() override;
void ElapsedTime(float *cost_time, DeviceEvent *other) override;
void set_wait_stream(rtStream_t wait_stream) override { wait_stream_ = wait_stream; }
void set_record_stream(rtStream_t record_stream) override { record_stream_ = record_stream; }
private:
protected:
rtEvent_t event_{nullptr};
rtStream_t wait_stream_{nullptr};
rtStream_t record_stream_{nullptr};
bool need_wait_{false};
};
// AscendEvent variant whose constructor creates the underlying runtime event
// with RT_EVENT_TIME_LINE, enabling elapsed-time queries between two events
// (used by PyNative kernel-launch profiling).
class AscendTimeEvent : public AscendEvent {
 public:
  AscendTimeEvent();
  ~AscendTimeEvent() override = default;
};
} // namespace mindspore::device::ascend
#endif // MINDSPORE_ASCEND_EVENT_H

View File

@ -282,6 +282,13 @@ void AscendKernelRuntime::PreInit() {
}
bool AscendKernelRuntime::Init() {
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto execution_mode = ms_context->get_param<int>(MS_CTX_EXECUTION_MODE);
auto profiling_flag = ms_context->get_param<bool>(MS_CTX_ENABLE_PROFILING);
if (execution_mode == kPynativeMode && profiling_flag) {
pynative_mode_profiling_flag_ = true;
}
if (initialized_) {
SetCurrentContext();
return true;
@ -946,6 +953,12 @@ std::shared_ptr<DeviceEvent> AscendKernelRuntime::CreateDeviceEvent() {
return ascend_event;
}
std::shared_ptr<DeviceEvent> AscendKernelRuntime::CreateDeviceTimeEvent() {
auto ascend_time_event = std::make_shared<AscendTimeEvent>();
MS_EXCEPTION_IF_NULL(ascend_time_event);
return ascend_time_event;
}
uint64_t AscendKernelRuntime::GetAvailableMemMaxSize() const {
auto ascend_mem_manager = std::dynamic_pointer_cast<AscendMemoryManager>(mem_manager_);
MS_EXCEPTION_IF_NULL(ascend_mem_manager);

View File

@ -58,6 +58,7 @@ class AscendKernelRuntime : public KernelRuntime {
uint64_t GetAvailableMemMaxSize() const override;
DeviceAddressType GetTargetDeviceAddressType() const override { return DeviceAddressType::kAscend; };
std::shared_ptr<DeviceEvent> CreateDeviceEvent() override;
std::shared_ptr<DeviceEvent> CreateDeviceTimeEvent() override;
void *compute_stream() const override { return stream_; }
void *communication_stream() const override { return communication_stream_; }

View File

@ -42,5 +42,19 @@ void GpuEvent::RecordEvent() {
need_wait_ = true;
}
// Block the host thread until all work captured by event_ has completed.
void GpuEvent::SyncEvent() {
  MS_EXCEPTION_IF_NULL(event_);
  CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(cudaEventSynchronize(event_), "cudaEventSynchronize failed");
}
// Write to *cost_time the elapsed time (milliseconds, per cudaEventElapsedTime)
// between this event and `other`; both events must have been recorded.
void GpuEvent::ElapsedTime(float *cost_time, DeviceEvent *other) {
  MS_EXCEPTION_IF_NULL(event_);
  auto *end_event = static_cast<GpuEvent *>(other);
  MS_EXCEPTION_IF_NULL(end_event);
  MS_EXCEPTION_IF_NULL(end_event->event_);
  CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(cudaEventElapsedTime(cost_time, event_, end_event->event_),
                                     "cudaEventElapsedTime failed");
}
// True while a recorded event has not yet been waited on.
bool GpuEvent::NeedWait() {
  return need_wait_;
}
} // namespace mindspore::device::gpu

View File

@ -29,6 +29,8 @@ class GpuEvent : public DeviceEvent {
void WaitEvent() override;
void RecordEvent() override;
bool NeedWait() override;
void SyncEvent() override;
void ElapsedTime(float *cost_time, DeviceEvent *other) override;
void set_wait_stream(void *wait_stream) override { wait_stream_ = static_cast<cudaStream_t>(wait_stream); }
void set_record_stream(void *record_stream) override { record_stream_ = static_cast<cudaStream_t>(record_stream); }

View File

@ -32,6 +32,7 @@
#include "utils/utils.h"
#include "frontend/parallel/context.h"
#include "debug/env_config_parser.h"
#include "runtime/device/pynative_profiling.h"
#if ((defined ENABLE_CPU) && (!defined _WIN32))
#include "ps/ps_cache/ps_cache_manager.h"
#endif
@ -966,6 +967,36 @@ void KernelRuntime::LaunchKernelEvent(const std::vector<std::vector<std::functio
}
}
// Launch a kernel bracketed by two device time events so the actual device-side
// execution window can be reported to the pynative profiler.
// kernel_mod/stream must be non-null; throws if the launch fails.
bool KernelRuntime::LaunchKernelWithPynativeProfiling(kernel::KernelMod *kernel_mod, const std::string &op_name,
                                                      const std::vector<AddressPtr> &inputs,
                                                      const std::vector<AddressPtr> &workspace,
                                                      const std::vector<AddressPtr> &outputs, void *stream) {
  MS_EXCEPTION_IF_NULL(kernel_mod);
  MS_EXCEPTION_IF_NULL(stream);
  auto start = CreateDeviceTimeEvent();
  auto end = CreateDeviceTimeEvent();
  MS_EXCEPTION_IF_NULL(start);
  MS_EXCEPTION_IF_NULL(end);
  start->set_record_stream(stream);
  end->set_record_stream(stream);
  start->RecordEvent();
  bool ret = kernel_mod->Launch(inputs, workspace, outputs, stream);
  end->RecordEvent();
  if (!ret) {
    // Fail fast: don't block on event synchronization for a launch that has
    // already reported failure (syncing a bad launch may hang or mask the
    // original error).
    MS_LOG(EXCEPTION) << "Launch kernel failed, kernel name is : " << op_name;
  }
  start->SyncEvent();
  end->SyncEvent();
  float cost_time = 0;
  start->ElapsedTime(&cost_time, end.get());
  // cost_time is the device interval; convert with kBasicTimeTransferUnit and
  // anchor the window to the host timestamp taken right after the sync.
  auto launch_end_time = GetTime();
  double launch_start_time = launch_end_time - cost_time / kBasicTimeTransferUnit;
  auto op_launch_start_time_end_time = std::make_pair(launch_start_time, launch_end_time);
  auto &profiler_inst = PynativeProfiler::GetInstance();
  profiler_inst.SetOpNameAndLaunchTime(std::make_pair(op_name, op_launch_start_time_end_time));
  return ret;
}
bool KernelRuntime::LaunchKernelMod(const session::KernelGraph &graph) {
const auto &kernels = graph.execution_order();
std::vector<DynamicKernelPtr> dynamic_kernel_list;
@ -1020,9 +1051,19 @@ bool KernelRuntime::LaunchKernelMod(const session::KernelGraph &graph) {
GenLaunchArgs(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs);
bool ret;
if (AnfAlgo::IsCommunicationOp(kernel)) {
ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, communication_stream_);
if (pynative_mode_profiling_flag_) {
ret = LaunchKernelWithPynativeProfiling(kernel_mod, kernel->fullname_with_scope(), kernel_inputs,
kernel_workspaces, kernel_outputs, communication_stream_);
} else {
ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, communication_stream_);
}
} else {
ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_);
if (pynative_mode_profiling_flag_) {
ret = LaunchKernelWithPynativeProfiling(kernel_mod, kernel->fullname_with_scope(), kernel_inputs,
kernel_workspaces, kernel_outputs, stream_);
} else {
ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_);
}
}
if (!ret) {
MS_LOG(ERROR) << "Launch kernel failed.";

View File

@ -98,6 +98,7 @@ class KernelRuntime {
virtual uint64_t GetAvailableMemMaxSize() const { return 0; }
void GenKernelEvents(const session::KernelGraph *graph);
virtual std::shared_ptr<DeviceEvent> CreateDeviceEvent() { return nullptr; }
virtual std::shared_ptr<DeviceEvent> CreateDeviceTimeEvent() { return nullptr; }
virtual DeviceAddressType GetTargetDeviceAddressType() const = 0;
virtual void *compute_stream() const { return nullptr; }
virtual void *communication_stream() const { return nullptr; }
@ -134,6 +135,10 @@ class KernelRuntime {
void RunOpAssignOutputNodeMemory(const ValuePtr &pre_output_value, session::KernelGraph *graph);
void AssignValueNodeTensor(const ValueNodePtr &value_node, const ValuePtr &node_value, size_t output_idx);
DeviceAddressPtr PreAssignCNodeMemory(const AnfNodePtr &anf_node, size_t index);
bool LaunchKernelWithPynativeProfiling(kernel::KernelMod *kernel_mod, const std::string &op_name,
const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream);
#if (ENABLE_CPU && !_WIN32)
void GetFirstPSEmbeddingCache(const session::KernelGraph *graph, AnfNodePtr *const first_cache_input_index,
size_t *const first_cache_size);
@ -143,6 +148,7 @@ class KernelRuntime {
protected:
uint32_t device_id_{0};
bool pynative_mode_profiling_flag_{false};
#if !defined(_WIN32) && !defined(_WIN64)
std::shared_ptr<Debugger> debugger_;
#endif

View File

@ -0,0 +1,49 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "runtime/device/pynative_profiling.h"
#include <iostream>
#include <fstream>
#include <utility>
#include <memory>
#include <string>
#include "utils/profile.h"
#include "utils/utils.h"
namespace mindspore {
namespace device {
// Append the profiling data of the current RealRunOp call to a CSV file:
// one row per launched kernel, or a single "nopnode" row when the op did not
// launch any kernel. The file is opened once per process (static ofstream).
void PynativeProfiler::SingleOpProfilingData() {
  static std::ofstream of("pynative_forward_profiling_data.csv");
  if (!of.is_open()) {
    // Opening can fail (e.g. read-only working directory); skip profiling
    // output rather than writing to a bad stream on the op execution path.
    return;
  }
  of.setf(std::ios::fixed, std::ios::floatfield);
  if (real_run_op_index_ == 1) {
    // First op of the process: emit the CSV header once.
    of << "RealRunOpIndex" << ',' << "RealRunOpName" << ',' << "OpName" << ',' << "RealRunOpStartTime(s)" << ','
       << "OpDeviceStartTime(s)" << ',' << "OpDeviceEndTime(s)" << ',' << "RealRunOpEndTime(s)" << std::endl;
  }
  if (op_name_launch_start_time_end_time_vec_.empty()) {
    // No kernel launch was recorded for this op; mark device columns "nopnode".
    of << real_run_op_index_ << ',' << real_run_op_name_ << ',' << "nopnode" << ','
       << real_run_op_start_time_end_time_.first << ',' << "nopnode" << ',' << "nopnode" << ','
       << real_run_op_start_time_end_time_.second << std::endl;
    return;
  }
  for (const auto &name_and_window : op_name_launch_start_time_end_time_vec_) {
    of << real_run_op_index_ << ',' << real_run_op_name_ << ',' << name_and_window.first << ','
       << real_run_op_start_time_end_time_.first << ',' << name_and_window.second.first << ','
       << name_and_window.second.second << ',' << real_run_op_start_time_end_time_.second << std::endl;
  }
  // Launch records are per-op; clear them for the next RealRunOp call.
  op_name_launch_start_time_end_time_vec_.clear();
}
} // namespace device
} // namespace mindspore

View File

@ -0,0 +1,56 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_PYNATIVE_PROFILING_H_
#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_PYNATIVE_PROFILING_H_
#include <memory>
#include <string>
#include <vector>
#include <utility>
#include <unordered_map>
namespace mindspore {
namespace device {
// Process-wide collector for PyNative single-op profiling data.
// RealRunOp records the per-op host-side window and op name; kernel launch
// paths append per-kernel device windows; SingleOpProfilingData flushes one
// CSV row per launched kernel (see pynative_profiling.cc).
// NOTE(review): not synchronized — appears to assume single-threaded pynative
// dispatch; confirm before calling from multiple threads.
class PynativeProfiler {
 public:
  // Meyers-singleton accessor.
  static PynativeProfiler &GetInstance() {
    static PynativeProfiler instance;
    return instance;
  }
  // Advance the 1-based index of the current RealRunOp call (the CSV header
  // is written when this index is 1).
  void AddRealRunOpIndex() { ++real_run_op_index_; }
  void SetRealRunOpName(const std::string &name) { real_run_op_name_ = name; }
  // Host-side (start, end) timestamps of the whole RealRunOp call.
  void SetRealRunOpTime(const std::pair<double, double> &start_end) { real_run_op_start_time_end_time_ = start_end; }
  // Append one launched kernel: (op name, (launch start, launch end)).
  void SetOpNameAndLaunchTime(const std::pair<std::string, std::pair<double, double>> &name_start_end) {
    op_name_launch_start_time_end_time_vec_.push_back(name_start_end);
  }
  // Write the accumulated data of the current op to the CSV file and clear
  // the per-op kernel list.
  void SingleOpProfilingData();

 private:
  PynativeProfiler() = default;
  ~PynativeProfiler() = default;
  PynativeProfiler(const PynativeProfiler &) = delete;
  PynativeProfiler &operator=(const PynativeProfiler &) = delete;
  int real_run_op_index_ = 0;
  std::string real_run_op_name_;
  std::pair<double, double> real_run_op_start_time_end_time_;
  std::vector<std::pair<std::string, std::pair<double, double>>> op_name_launch_start_time_end_time_vec_;
};
} // namespace device
} // namespace mindspore
#endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_PYNATIVE_PROFILING_H_

View File

@ -16,6 +16,8 @@
#include "runtime/hardware/gpu/gpu_device_context.h"
#include <dlfcn.h>
#include <utility>
#include "runtime/device/pynative_profiling.h"
#include "runtime/device/gpu/kernel_info_setter.h"
#include "runtime/device/gpu/gpu_kernel_build.h"
#include "runtime/device/gpu/gpu_device_address.h"
@ -432,6 +434,11 @@ bool GPUDeviceContext::LaunchKernelWithProfiling(const CNodePtr &kernel, const s
bool ret = DoLaunchKernel(kernel_mod, inputs, workspace, outputs);
profiler_inst->OpDataProducerEnd();
auto op_launch_start_end_time = profiler_inst->GetSingleOpLaunchTime();
auto &pynative_profiler = PynativeProfiler::GetInstance();
std::string op_name = kernel->fullname_with_scope();
pynative_profiler.SetOpNameAndLaunchTime(std::make_pair(op_name, op_launch_start_end_time));
if (profiler_inst->GetSyncEnableFlag()) {
CHECK_RET_WITH_RETURN_ERROR(SyncStream(), "Profiler SyncStream failed.");
}

View File

@ -507,6 +507,8 @@ constexpr auto kUpdateStateRealInput = 2;
// index define of Load
constexpr auto kLoadRealInput = 1;
constexpr auto kLoadStateInput = 2;
// time transfer unit
constexpr int kBasicTimeTransferUnit = 1000;
// index of input or output
enum Index : size_t {
kIndex0 = 0,

View File

@ -24,6 +24,8 @@ class DeviceEvent {
virtual void WaitEvent() = 0;
virtual void RecordEvent() = 0;
virtual bool NeedWait() = 0;
virtual void SyncEvent() = 0;
virtual void ElapsedTime(float *cost_time, DeviceEvent *other) = 0;
virtual void set_wait_stream(void *stream) = 0;
virtual void set_record_stream(void *stream) = 0;
};

View File

@ -25,6 +25,10 @@
// Stub implementations of Ascend runtime APIs for builds/tests without the
// real runtime: every call is a no-op that reports success.
rtError_t rtEventSynchronize(rtEvent_t event) { return RT_ERROR_NONE; }
rtError_t rtEventCreateWithFlag(rtEvent_t *event, uint32_t flag) { return RT_ERROR_NONE; }
rtError_t rtEventElapsedTime(float *time, rtEvent_t start, rtEvent_t end) { return RT_ERROR_NONE; }
rtError_t rtMalloc(void **devPtr, uint64_t size, rtMemType_t type) { return RT_ERROR_NONE; }
rtError_t rtMemcpy(void *dst, uint64_t destMax, const void *src, uint64_t count, rtMemcpyKind_t kind) {