From 9770be4bab9ba2eea0dd0e8e9bb1e84da0813e81 Mon Sep 17 00:00:00 2001
From: yanghaitao <yanghaitao1@huawei.com>
Date: Mon, 13 Jul 2020 14:12:44 +0800
Subject: [PATCH] gpu profiling

---
 .../gpu/data/dataset_iterator_kernel.cc       | 35 +++++++++-
 .../gpu/data/dataset_iterator_kernel.h        |  4 ++
 .../gpu/data/dataset_profiling.cc             | 70 +++++++++++++++++++
 .../gpu/data/dataset_profiling.h              | 50 +++++++++++++
 .../engine/datasetops/device_queue_op.cc      | 68 +++++++++++++++---
 .../engine/datasetops/device_queue_op.h       |  3 +-
 .../profiler/device/gpu/gpu_profiling.cc      | 25 +++++--
 .../ccsrc/profiler/device/gpu/gpu_profiling.h | 29 ++++++--
 8 files changed, 259 insertions(+), 25 deletions(-)
 create mode 100644 mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_profiling.cc
 create mode 100644 mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_profiling.h
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.cc
index c8de6b349e4..562f0b21019 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.cc
@@ -13,21 +13,24 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #include "backend/kernel_compiler/gpu/data/dataset_iterator_kernel.h"
+
 #include <cuda_runtime_api.h>
+#include <memory>
 #include <string>
 #include <vector>
+#include "backend/kernel_compiler/gpu/data/dataset_utils.h"
+#include "profiler/device/gpu/gpu_profiling.h"
 #include "runtime/device/gpu/gpu_buffer_mgr.h"
 #include "runtime/device/gpu/gpu_common.h"
-#include "backend/kernel_compiler/gpu/data/dataset_utils.h"
 
 namespace mindspore {
 namespace kernel {
 using mindspore::device::GpuBufferMgr;
 using mindspore::device::HandleMgr;
 
-DatasetIteratorKernel::DatasetIteratorKernel() : handle_(HandleMgr::INVALID_HANDLE), total_bytes_(0) {}
+DatasetIteratorKernel::DatasetIteratorKernel()  // No valid handle yet; profiling is off until Init() reads the flag.
+    : handle_(HandleMgr::INVALID_HANDLE), total_bytes_(0U), profiling_enable_(false), profiling_op_() {}
 
 DatasetIteratorKernel::~DatasetIteratorKernel() { GpuBufferMgr::GetInstance().Close(handle_); }
 
@@ -60,6 +63,14 @@ bool DatasetIteratorKernel::Init(const CNodePtr &kernel_node) {
     MS_LOG(EXCEPTION) << "Gpu Queue(" << queue_name_ << ") Open Failed";
   }
 
+  auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance();
+  MS_EXCEPTION_IF_NULL(profiler_inst);
+  profiling_enable_ = profiler_inst->GetEnableFlag();
+  if (profiling_enable_) {
+    std::string path = profiler_inst->ProfileDataPath();
+    profiling_op_ = std::make_shared<GetNextProfiling>(path);
+    profiler_inst->RegisterProfilingOp(profiling_op_);
+  }
   return true;
 }
 
@@ -69,11 +80,21 @@ bool DatasetIteratorKernel::Launch(const std::vector<AddressPtr> &, const std::v
                                    const std::vector<AddressPtr> &outputs, void *stream) {
   void *addr = nullptr;
   size_t len = 0;
+  uint64_t start_time_stamp = 0;
+  uint32_t queue_size = 0;
 
   int repeat = 0;
   while (true) {
+    if (profiling_enable_) {
+      start_time_stamp = profiling_op_->GetTimeStamp();
+      queue_size = GpuBufferMgr::GetInstance().Size(handle_);
+    }
     auto ret = GpuBufferMgr::GetInstance().Front(handle_, &addr, &len);
     if (ret == device::SUCCESS) {
+      if (profiling_enable_) {
+        uint64_t end_time_stamp = profiling_op_->GetTimeStamp();
+        profiling_op_->RecordData(queue_size, start_time_stamp, end_time_stamp);
+      }
       break;
     }
 
@@ -84,10 +105,18 @@ bool DatasetIteratorKernel::Launch(const std::vector<AddressPtr> &, const std::v
         continue;
       } else {
         MS_LOG(ERROR) << "Get data timeout";
+        if (profiling_enable_) {
+          uint64_t end_time_stamp = profiling_op_->GetTimeStamp();
+          profiling_op_->RecordData(queue_size, start_time_stamp, end_time_stamp);
+        }
         return false;
       }
     }
 
+    if (profiling_enable_) {
+      uint64_t end_time_stamp = profiling_op_->GetTimeStamp();
+      profiling_op_->RecordData(queue_size, start_time_stamp, end_time_stamp);
+    }
     MS_LOG(ERROR) << "Get data failed, errcode " << ret;
     return false;
   }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.h
index b20df721a62..2aa62880f7a 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.h
@@ -17,8 +17,10 @@
 #ifndef MINDSPORE_GET_NEXT_KERNEL_H
 #define MINDSPORE_GET_NEXT_KERNEL_H
 
+#include <memory>
 #include <string>
 #include <vector>
+#include "backend/kernel_compiler/gpu/data/dataset_profiling.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
 
@@ -44,6 +46,8 @@ class DatasetIteratorKernel : public GpuKernel {
   std::string queue_name_;
   unsigned int handle_;
   size_t total_bytes_;
+  bool profiling_enable_;
+  std::shared_ptr<GetNextProfiling> profiling_op_;
 
   std::vector<size_t> input_size_list_;
   std::vector<size_t> output_size_list_;
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_profiling.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_profiling.cc
new file mode 100644
index 00000000000..f16146685be
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_profiling.cc
@@ -0,0 +1,70 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "backend/kernel_compiler/gpu/data/dataset_profiling.h"
+
+#include <fstream>
+#include <memory>
+#include <string>
+#include <utility>
+#include "utils/log_adapter.h"
+#include "utils/ms_utils.h"
+#include "utils/utils.h"
+
+namespace mindspore {
+namespace kernel {
+
+GetNextProfiling::GetNextProfiling(const std::string &path) : profiling_path_{path} {}  // Init() builds file_name_
+
+// Caches the value of the DEVICE_ID environment variable in device_id_.
+// When common::GetEnv() yields an empty string the id defaults to "0".
+void GetNextProfiling::GetDeviceId() {
+  const std::string env_device_id = common::GetEnv("DEVICE_ID");
+  const bool has_device_id = !env_device_id.empty();
+  device_id_ = has_device_id ? env_device_id : "0";
+}
+
+void GetNextProfiling::Init() {  // Sets op_name_, device_id_ and file_name_ for this profiling op.
+  op_name_ = kGetNextOpName;
+  GetDeviceId();  // device_id_ has to be resolved first: it is embedded in the file name below
+  file_name_ = profiling_path_ + ("/minddata_getnext_profiling_" + device_id_ + ".txt");
+}
+
+// Writes every recorded sample to file_name_ as "<op name> <start> <end> <queue size>", one sample per line.
+void GetNextProfiling::SaveProfilingData() {
+  std::ofstream handle(file_name_, std::ios::trunc);
+  if (!handle.is_open()) {
+    MS_LOG(ERROR) << "Open get-next profiling file failed.";
+    return;
+  }
+  for (size_t index = 0; index < queue_size_.size(); ++index) {
+    const auto &stamp = time_stamp_[index];  // '\n' below instead of std::endl: no stream flush per record
+    handle << Name() << " " << stamp.first << " " << stamp.second << " " << queue_size_[index] << "\n";
+  }
+}  // `handle` is flushed and closed by its destructor
+
+// Appends one sample; queue_size_[i] and time_stamp_[i] (start, end) always describe the same sample.
+void GetNextProfiling::RecordData(uint32_t queue_size, uint64_t start_time_stamp, uint64_t end_time_stamp) {
+  queue_size_.push_back(queue_size);
+  time_stamp_.emplace_back(start_time_stamp, end_time_stamp);
+}
+
+// Returns the current std::chrono::system_clock time as nanoseconds since the clock's epoch.
+uint64_t GetNextProfiling::GetTimeStamp() const {
+  const auto since_epoch = std::chrono::system_clock::now().time_since_epoch();
+  return static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch).count());
+}
+}  // namespace kernel
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_profiling.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_profiling.h
new file mode 100644
index 00000000000..35ecbccd266
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_profiling.h
@@ -0,0 +1,50 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_DATASET_DATASET_PROFILING_H_
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_DATASET_DATASET_PROFILING_H_
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+#include "profiler/device/gpu/gpu_profiling.h"
+
+using mindspore::profiler::gpu::ProfilingOp;
+
+namespace mindspore {
+namespace kernel {
+// Profiling op used by DatasetIteratorKernel: stores (queue size, start, stop) samples and dumps them to a file.
+class GetNextProfiling : public ProfilingOp {
+ public:
+  explicit GetNextProfiling(const std::string &path);  // path: directory that receives the profiling file
+  ~GetNextProfiling() override = default;
+  void SaveProfilingData() override;  // dump every recorded sample to file_name_
+  void GetDeviceId();                 // read DEVICE_ID into device_id_ ("0" when empty)
+  uint64_t GetTimeStamp() const;      // system_clock "now" in nanoseconds since epoch
+  void RecordData(uint32_t queue_size, uint64_t start_time_stamp, uint64_t end_time_stamp);
+  void Init() override;  // set op_name_, device_id_ and file_name_
+
+ private:
+  std::string profiling_path_;
+  std::string file_name_;
+  std::vector<uint32_t> queue_size_;
+  std::vector<std::pair<uint64_t, uint64_t>> time_stamp_;  // {start, stop} time stamp of each sample
+  std::string device_id_;
+};
+}  // namespace kernel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_DATASET_DATASET_PROFILING_H_
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc
index 38efaaf5d99..69ed55309cf 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc
@@ -14,18 +14,19 @@
  * limitations under the License.
  */
 
+#include "minddata/dataset/engine/datasetops/device_queue_op.h"
+
 #include <iomanip>
 #include <iostream>
 #include <memory>
 #include "minddata/dataset/core/config_manager.h"
 #include "minddata/dataset/core/global_context.h"
-#include "minddata/dataset/engine/datasetops/device_queue_op.h"
 #include "minddata/dataset/engine/data_buffer.h"
 #include "minddata/dataset/engine/dataset_iterator.h"
-#include "minddata/dataset/engine/opt/pass.h"
-#include "minddata/dataset/engine/perf/profiling.h"
-#include "minddata/dataset/engine/perf/device_queue_tracing.h"
 #include "minddata/dataset/engine/datasetops/epoch_ctrl_op.h"
+#include "minddata/dataset/engine/opt/pass.h"
+#include "minddata/dataset/engine/perf/device_queue_tracing.h"
+#include "minddata/dataset/engine/perf/profiling.h"
 #include "minddata/dataset/util/status.h"
 #include "minddata/dataset/util/task_manager.h"
 
@@ -197,6 +198,19 @@ Status DeviceQueueOp::SendDataToGPU() {
   bool is_open = false;
   uint32_t handle = INVALID_HANDLE;
   auto release_function = std::bind(&DeviceQueueOp::ReleaseData, this, std::placeholders::_1);
+  double batch_start_time, end_time;
+  int32_t batch_cost, push_cost;
+  int32_t connector_size = 0;
+  int32_t connector_capacity;
+  std::shared_ptr<DeviceQueueTracing> profiling_node;
+  bool isProfilingEnable = tree_->GetProfilingManager()->IsProfilingEnable();
+  if (isProfilingEnable) {
+    std::shared_ptr<Tracing> node;
+    RETURN_IF_NOT_OK(tree_->GetProfilingManager()->GetTracingNode(kDeviceQueueTracingName, &node));
+    profiling_node = std::dynamic_pointer_cast<DeviceQueueTracing>(node);
+    batch_start_time = ProfilingTime::GetCurMilliSecond();
+    connector_capacity = ChildOpConnectorCapacity();
+  }
 
   std::unique_ptr<DataBuffer> current_buffer;
   RETURN_IF_NOT_OK(GetNextInput(&current_buffer));
@@ -220,20 +234,44 @@ Status DeviceQueueOp::SendDataToGPU() {
           }
           is_open = true;
         }
-        RETURN_IF_NOT_OK(RetryPushGPUData(data_size, curr_row, handle));
+        RETURN_IF_NOT_OK(RetryPushGPUData(data_size, curr_row, handle, isProfilingEnable, &push_cost));
         total_batch++;
+        if (isProfilingEnable) {
+          end_time = ProfilingTime::GetCurMilliSecond();
+          // record push data time
+          profiling_node->Record(TIME, TDT_PUSH_TIME, total_batch, push_cost);
+          batch_cost = (int32_t)(end_time - batch_start_time);
+          // record batch time
+          profiling_node->Record(TIME, BATCH_TIME, total_batch, batch_cost);
+          // record pipeline time
+          profiling_node->Record(TIME, PIPELINE_TIME, total_batch, batch_cost - push_cost);
+          batch_start_time = end_time;
+          // record connector depth
+          profiling_node->Record(CONNECTOR_DEPTH, connector_capacity, total_batch, connector_size);
+        }
       }
-      if (!TaskManager::FindMe()->Interrupted() && !GpuBufferMgr::GetInstance().IsClosed())
+      if (!TaskManager::FindMe()->Interrupted() && !GpuBufferMgr::GetInstance().IsClosed()) {
+        if (isProfilingEnable) {
+          connector_size = ChildOpConnectorSize();
+          connector_capacity = ChildOpConnectorCapacity();
+        }
         RETURN_IF_NOT_OK(GetNextInput(&current_buffer));
-      else
+      } else {
         is_break_loop = true;
+      }
     }
-    if (!TaskManager::FindMe()->Interrupted() && !GpuBufferMgr::GetInstance().IsClosed())
+    if (!TaskManager::FindMe()->Interrupted() && !GpuBufferMgr::GetInstance().IsClosed()) {
+      if (isProfilingEnable) {
+        connector_size = ChildOpConnectorSize();
+        connector_capacity = ChildOpConnectorCapacity();
+      }
       RETURN_IF_NOT_OK(GetNextInput(&current_buffer));
-    else
+    } else {
       is_break_loop = true;
+    }
   }
 
+  tree_->SetFinished();
   MS_LOG(INFO) << "Device queue total batch is " << total_batch << ".";
 
   GpuBufferMgr::GetInstance().Close(handle);
@@ -241,9 +279,10 @@ Status DeviceQueueOp::SendDataToGPU() {
   return Status::OK();
 }
 
-Status DeviceQueueOp::RetryPushGPUData(const std::vector<size_t> &data_size, const TensorRow &curr_row,
-                                       uint32_t handle) {
+Status DeviceQueueOp::RetryPushGPUData(const std::vector<size_t> &data_size, const TensorRow &curr_row, uint32_t handle,
+                                       bool profiling, int32_t *push_time) {
   std::vector<device::DataItemGpu> items;
+  double start_time;
   for (int i = 0; i < data_size.size(); i++) {
     device::DataItemGpu data_item;
     data_item.data_len_ = data_size[i];
@@ -253,7 +292,14 @@ Status DeviceQueueOp::RetryPushGPUData(const std::vector<size_t> &data_size, con
 
   while (!GpuBufferMgr::GetInstance().IsClosed() && !TaskManager::FindMe()->Interrupted()) {
     RETURN_IF_NOT_OK(MallocForGPUData(&items, curr_row));
+    if (profiling) {
+      start_time = ProfilingTime::GetCurMilliSecond();
+    }
     BlockQueueStatus_T ret = GpuBufferMgr::GetInstance().Push(handle, items, WAIT_TIME);
+    if (profiling) {
+      double end_time = ProfilingTime::GetCurMilliSecond();
+      *push_time = (int32_t)(end_time - start_time);
+    }
     if (ret) {
       for (int i = 0; i < items.size(); i++) {
         ReleaseData(items[i].data_ptr_);
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.h b/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.h
index dc24380f0db..7dc999dfa5b 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.h
@@ -168,7 +168,8 @@ class DeviceQueueOp : public PipelineOp {
 
 #ifdef ENABLE_GPUQUE
   Status SendDataToGPU();
-  Status RetryPushGPUData(const std::vector<size_t> &data_size, const TensorRow &curr_row, uint32_t handle);
+  Status RetryPushGPUData(const std::vector<size_t> &data_size, const TensorRow &curr_row, uint32_t handle,
+                          bool profiling, int32_t *push_time);
   Status MallocForGPUData(std::vector<device::DataItemGpu> *items, const TensorRow &curr_row);
   void ReleaseData(void *addr);
 
diff --git a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc
index fbdb7459f3b..85b39369167 100644
--- a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc
+++ b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc
@@ -14,14 +14,16 @@
  * limitations under the License.
  */
 
-#include <cxxabi.h>
-#include <cmath>
-#include <chrono>
 #include "profiler/device/gpu/gpu_profiling.h"
+
+#include <cxxabi.h>
+#include <chrono>
+#include <cmath>
 #include "profiler/device/gpu/cupti_interface.h"
 #include "profiler/device/gpu/data_saver.h"
-#include "utils/log_adapter.h"
 #include "pybind_api/api_register.h"
+#include "utils/log_adapter.h"
+#include "utils/utils.h"
 
 namespace mindspore {
 namespace profiler {
@@ -456,6 +458,13 @@ void GPUProfiler::Stop() {
   ClearInst();
 }
 
+void GPUProfiler::SaveExtraProfileData() {  // Lets every registered ProfilingOp persist its own data.
+  for (const auto &op : profiling_op_) {  // by reference: no copy of the name string / shared_ptr per entry
+    op.second->SaveProfilingData();
+  }
+  MS_LOG(INFO) << "Save extra profiling data end.";
+}
+
 void GPUProfiler::SaveProfileData() {
   if (profile_data_path_.empty()) {
     MS_LOG(WARNING) << "Profile data path is empty, skip save profile data.";
@@ -464,6 +473,7 @@ void GPUProfiler::SaveProfileData() {
     dataSaver.ParseOpInfo(op_info_map_);
     dataSaver.ParseEvent(events_);
     dataSaver.WriteFile(profile_data_path_);
+    SaveExtraProfileData();
   }
 }
 
@@ -639,6 +649,13 @@ void GPUProfiler::HandleActivityRecord(CUpti_Activity *record) {
 
   AddEvent(std::move(profilingData));
 }
+// Registers `node` under its op name. The first op registered under a name is kept; later duplicates are ignored.
+void GPUProfiler::RegisterProfilingOp(std::shared_ptr<ProfilingOp> node) {
+  MS_EXCEPTION_IF_NULL(node);
+  // Init() assigns the op name (see GetNextProfiling::Init), so it must run before Name() is used as the map key.
+  node->Init();
+  profiling_op_.emplace(node->Name(), node);
+}
 
 void CUPTIAPI GPUProfiler::AllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords) {
   int stat = posix_memalign(reinterpret_cast<void **>(buffer), ALIGN_SIZE, BUF_SIZE);
diff --git a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h
index f83841dc26d..d3510d9a27f 100644
--- a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h
+++ b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h
@@ -18,14 +18,15 @@
 #define MINDSPORE_GPU_PROFILING_H
 #include <cuda.h>
 #include <cupti.h>
-#include <cstdio>
-#include <unordered_map>
-#include <string>
-#include <vector>
-#include <mutex>
-#include <memory>
 #include <algorithm>
+#include <cstdio>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
 #include <utility>
+#include <vector>
 
 namespace mindspore {
 namespace profiler {
@@ -109,6 +110,18 @@ struct BaseTime {
 
 const float kTimeUnit = 1000;
 
+// Interface for extra profiling ops that GPUProfiler registers by name and later asks to save their own data.
+class ProfilingOp {
+ public:
+  virtual ~ProfilingOp() = default;
+  virtual void SaveProfilingData() = 0;  // invoked from GPUProfiler::SaveExtraProfileData()
+  virtual void Init() = 0;               // invoked from GPUProfiler::RegisterProfilingOp()
+  std::string Name() const { return op_name_; }
+
+ protected:
+  std::string op_name_;  // empty until an implementation assigns it (GetNextProfiling does so in Init())
+};
+
 class GPUProfiler {
  public:
   static std::shared_ptr<GPUProfiler> GetInstance();
@@ -130,6 +143,8 @@ class GPUProfiler {
   void OpDataProducerBegin(const std::string op_name, void *stream);
   void OpDataProducerEnd();
   void ProcessEvents();
+  void RegisterProfilingOp(std::shared_ptr<ProfilingOp> node);
+  std::string ProfileDataPath() const { return profile_data_path_; }
 
  private:
   GPUProfiler() = default;
@@ -153,6 +168,7 @@ class GPUProfiler {
   std::string op_name_;
   void *stream_;
   void SaveProfileData();
+  void SaveExtraProfileData();
   std::mutex event_mutex_;
 
   std::vector<CUpti_ActivityKind> activities_enable_;
@@ -172,6 +188,7 @@ class GPUProfiler {
   uint64_t op_host_time_stop_;
   uint64_t op_cupti_time_start_;
   std::string profile_data_path_;
+  std::map<std::string, std::shared_ptr<ProfilingOp>> profiling_op_;
 };
 }  // namespace gpu
 }  // namespace profiler