From 9770be4bab9ba2eea0dd0e8e9bb1e84da0813e81 Mon Sep 17 00:00:00 2001 From: yanghaitao Date: Mon, 13 Jul 2020 14:12:44 +0800 Subject: [PATCH] gpu profiling --- .../gpu/data/dataset_iterator_kernel.cc | 35 +++++++++- .../gpu/data/dataset_iterator_kernel.h | 4 ++ .../gpu/data/dataset_profiling.cc | 70 +++++++++++++++++++ .../gpu/data/dataset_profiling.h | 50 +++++++++++++ .../engine/datasetops/device_queue_op.cc | 68 +++++++++++++++--- .../engine/datasetops/device_queue_op.h | 3 +- .../profiler/device/gpu/gpu_profiling.cc | 25 +++++-- .../ccsrc/profiler/device/gpu/gpu_profiling.h | 29 ++++++-- 8 files changed, 259 insertions(+), 25 deletions(-) create mode 100644 mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_profiling.cc create mode 100644 mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_profiling.h diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.cc index c8de6b349e4..562f0b21019 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.cc @@ -13,21 +13,24 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - #include "backend/kernel_compiler/gpu/data/dataset_iterator_kernel.h" + #include +#include #include #include +#include "backend/kernel_compiler/gpu/data/dataset_utils.h" +#include "profiler/device/gpu/gpu_profiling.h" #include "runtime/device/gpu/gpu_buffer_mgr.h" #include "runtime/device/gpu/gpu_common.h" -#include "backend/kernel_compiler/gpu/data/dataset_utils.h" namespace mindspore { namespace kernel { using mindspore::device::GpuBufferMgr; using mindspore::device::HandleMgr; -DatasetIteratorKernel::DatasetIteratorKernel() : handle_(HandleMgr::INVALID_HANDLE), total_bytes_(0) {} +DatasetIteratorKernel::DatasetIteratorKernel() + : handle_(HandleMgr::INVALID_HANDLE), total_bytes_(0), profiling_enable_(false), profiling_op_(nullptr) {} DatasetIteratorKernel::~DatasetIteratorKernel() { GpuBufferMgr::GetInstance().Close(handle_); } @@ -60,6 +63,14 @@ bool DatasetIteratorKernel::Init(const CNodePtr &kernel_node) { MS_LOG(EXCEPTION) << "Gpu Queue(" << queue_name_ << ") Open Failed"; } + auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance(); + MS_EXCEPTION_IF_NULL(profiler_inst); + profiling_enable_ = profiler_inst->GetEnableFlag(); + if (profiling_enable_) { + std::string path = profiler_inst->ProfileDataPath(); + profiling_op_ = std::make_shared(path); + profiler_inst->RegisterProfilingOp(profiling_op_); + } return true; } @@ -69,11 +80,21 @@ bool DatasetIteratorKernel::Launch(const std::vector &, const std::v const std::vector &outputs, void *stream) { void *addr = nullptr; size_t len = 0; + uint64_t start_time_stamp = 0; + uint32_t queue_size = 0; int repeat = 0; while (true) { + if (profiling_enable_) { + start_time_stamp = profiling_op_->GetTimeStamp(); + queue_size = GpuBufferMgr::GetInstance().Size(handle_); + } auto ret = GpuBufferMgr::GetInstance().Front(handle_, &addr, &len); if (ret == device::SUCCESS) { + if (profiling_enable_) { + uint64_t end_time_stamp = profiling_op_->GetTimeStamp(); + profiling_op_->RecordData(queue_size, 
start_time_stamp, end_time_stamp); + } break; } @@ -84,10 +105,18 @@ bool DatasetIteratorKernel::Launch(const std::vector &, const std::v continue; } else { MS_LOG(ERROR) << "Get data timeout"; + if (profiling_enable_) { + uint64_t end_time_stamp = profiling_op_->GetTimeStamp(); + profiling_op_->RecordData(queue_size, start_time_stamp, end_time_stamp); + } return false; } } + if (profiling_enable_) { + uint64_t end_time_stamp = profiling_op_->GetTimeStamp(); + profiling_op_->RecordData(queue_size, start_time_stamp, end_time_stamp); + } MS_LOG(ERROR) << "Get data failed, errcode " << ret; return false; } diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.h index b20df721a62..2aa62880f7a 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.h @@ -17,8 +17,10 @@ #ifndef MINDSPORE_GET_NEXT_KERNEL_H #define MINDSPORE_GET_NEXT_KERNEL_H +#include #include #include +#include "backend/kernel_compiler/gpu/data/dataset_profiling.h" #include "backend/kernel_compiler/gpu/gpu_kernel.h" #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h" @@ -44,6 +46,8 @@ class DatasetIteratorKernel : public GpuKernel { std::string queue_name_; unsigned int handle_; size_t total_bytes_; + bool profiling_enable_; + std::shared_ptr profiling_op_; std::vector input_size_list_; std::vector output_size_list_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_profiling.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_profiling.cc new file mode 100644 index 00000000000..f16146685be --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_profiling.cc @@ -0,0 +1,70 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file 
except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "backend/kernel_compiler/gpu/data/dataset_profiling.h" + +#include +#include +#include +#include +#include "utils/log_adapter.h" +#include "utils/ms_utils.h" +#include "utils/utils.h" + +namespace mindspore { +namespace kernel { + +GetNextProfiling::GetNextProfiling(const std::string &path) : profiling_path_(path) {} + +void GetNextProfiling::GetDeviceId() { + // If DEVICE_ID is not set, the default value is 0 + device_id_ = common::GetEnv("DEVICE_ID"); + if (device_id_.empty()) { + device_id_ = "0"; + } +} + +void GetNextProfiling::Init() { + GetDeviceId(); + file_name_ = profiling_path_ + "/minddata_getnext_profiling_" + device_id_ + ".txt"; + op_name_ = kGetNextOpName; +} + +void GetNextProfiling::SaveProfilingData() { + std::ofstream handle(file_name_, std::ios::trunc); + if (!handle.is_open()) { + MS_LOG(ERROR) << "Open get-next profiling file failed."; + return; + } + for (uint32_t index = 0; index < queue_size_.size(); index++) { + handle << Name() << " " << time_stamp_[index].first << " " << time_stamp_[index].second << " " << queue_size_[index] + << std::endl; + } + handle.close(); +} + +void GetNextProfiling::RecordData(uint32_t queue_size, uint64_t start_time_stamp, uint64_t end_time_stamp) { + queue_size_.emplace_back(queue_size); + std::pair time_stamp(start_time_stamp, end_time_stamp); + time_stamp_.emplace_back(time_stamp); +} + +uint64_t GetNextProfiling::GetTimeStamp() const { + auto cur_sys_clock = std::chrono::system_clock::now(); + uint64_t time_stamp = 
std::chrono::duration_cast(cur_sys_clock.time_since_epoch()).count(); + return time_stamp; +} +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_profiling.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_profiling.h new file mode 100644 index 00000000000..35ecbccd266 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_profiling.h @@ -0,0 +1,50 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_DATASET_DATASET_PROFILING_H_ +#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_DATASET_DATASET_PROFILING_H_ + +#include +#include +#include +#include +#include "profiler/device/gpu/gpu_profiling.h" + +using mindspore::profiler::gpu::ProfilingOp; + +namespace mindspore { +namespace kernel { +class GetNextProfiling : public ProfilingOp { + public: + explicit GetNextProfiling(const std::string &path); + ~GetNextProfiling() = default; + void SaveProfilingData(); + void GetDeviceId(); + uint64_t GetTimeStamp() const; + void RecordData(uint32_t queue_size, uint64_t start_time_stamp, uint64_t end_time_stamp); + void Init(); + + private: + std::string profiling_path_; + std::string file_name_; + std::vector queue_size_; + std::vector> time_stamp_; // First value of std::pair is the start time stamp, + // Second value of std::pair is the stop time stamp + std::string device_id_; +}; +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_DATASET_DATASET_PROFILING_H_ diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc index 38efaaf5d99..69ed55309cf 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc @@ -14,18 +14,19 @@ * limitations under the License. 
*/ +#include "minddata/dataset/engine/datasetops/device_queue_op.h" + #include #include #include #include "minddata/dataset/core/config_manager.h" #include "minddata/dataset/core/global_context.h" -#include "minddata/dataset/engine/datasetops/device_queue_op.h" #include "minddata/dataset/engine/data_buffer.h" #include "minddata/dataset/engine/dataset_iterator.h" -#include "minddata/dataset/engine/opt/pass.h" -#include "minddata/dataset/engine/perf/profiling.h" -#include "minddata/dataset/engine/perf/device_queue_tracing.h" #include "minddata/dataset/engine/datasetops/epoch_ctrl_op.h" +#include "minddata/dataset/engine/opt/pass.h" +#include "minddata/dataset/engine/perf/device_queue_tracing.h" +#include "minddata/dataset/engine/perf/profiling.h" #include "minddata/dataset/util/status.h" #include "minddata/dataset/util/task_manager.h" @@ -197,6 +198,19 @@ Status DeviceQueueOp::SendDataToGPU() { bool is_open = false; uint32_t handle = INVALID_HANDLE; auto release_function = std::bind(&DeviceQueueOp::ReleaseData, this, std::placeholders::_1); + double batch_start_time, end_time; + int32_t batch_cost, push_cost; + int32_t connector_size = 0; + int32_t connector_capacity; + std::shared_ptr profiling_node; + bool isProfilingEnable = tree_->GetProfilingManager()->IsProfilingEnable(); + if (isProfilingEnable) { + std::shared_ptr node; + RETURN_IF_NOT_OK(tree_->GetProfilingManager()->GetTracingNode(kDeviceQueueTracingName, &node)); + profiling_node = std::dynamic_pointer_cast(node); + batch_start_time = ProfilingTime::GetCurMilliSecond(); + connector_capacity = ChildOpConnectorCapacity(); + } std::unique_ptr current_buffer; RETURN_IF_NOT_OK(GetNextInput(¤t_buffer)); @@ -220,20 +234,44 @@ Status DeviceQueueOp::SendDataToGPU() { } is_open = true; } - RETURN_IF_NOT_OK(RetryPushGPUData(data_size, curr_row, handle)); + RETURN_IF_NOT_OK(RetryPushGPUData(data_size, curr_row, handle, isProfilingEnable, &push_cost)); total_batch++; + if (isProfilingEnable) { + end_time = 
ProfilingTime::GetCurMilliSecond(); + // record push data time + profiling_node->Record(TIME, TDT_PUSH_TIME, total_batch, push_cost); + batch_cost = (int32_t)(end_time - batch_start_time); + // record batch time + profiling_node->Record(TIME, BATCH_TIME, total_batch, batch_cost); + // record pipeline time + profiling_node->Record(TIME, PIPELINE_TIME, total_batch, batch_cost - push_cost); + batch_start_time = end_time; + // record connector depth + profiling_node->Record(CONNECTOR_DEPTH, connector_capacity, total_batch, connector_size); + } } - if (!TaskManager::FindMe()->Interrupted() && !GpuBufferMgr::GetInstance().IsClosed()) + if (!TaskManager::FindMe()->Interrupted() && !GpuBufferMgr::GetInstance().IsClosed()) { + if (isProfilingEnable) { + connector_size = ChildOpConnectorSize(); + connector_capacity = ChildOpConnectorCapacity(); + } RETURN_IF_NOT_OK(GetNextInput(¤t_buffer)); - else + } else { is_break_loop = true; + } } - if (!TaskManager::FindMe()->Interrupted() && !GpuBufferMgr::GetInstance().IsClosed()) + if (!TaskManager::FindMe()->Interrupted() && !GpuBufferMgr::GetInstance().IsClosed()) { + if (isProfilingEnable) { + connector_size = ChildOpConnectorSize(); + connector_capacity = ChildOpConnectorCapacity(); + } RETURN_IF_NOT_OK(GetNextInput(¤t_buffer)); - else + } else { is_break_loop = true; + } } + tree_->SetFinished(); MS_LOG(INFO) << "Device queue total batch is " << total_batch << "."; GpuBufferMgr::GetInstance().Close(handle); @@ -241,9 +279,10 @@ Status DeviceQueueOp::SendDataToGPU() { return Status::OK(); } -Status DeviceQueueOp::RetryPushGPUData(const std::vector &data_size, const TensorRow &curr_row, - uint32_t handle) { +Status DeviceQueueOp::RetryPushGPUData(const std::vector &data_size, const TensorRow &curr_row, uint32_t handle, + bool profiling, int32_t *push_time) { std::vector items; + double start_time; for (int i = 0; i < data_size.size(); i++) { device::DataItemGpu data_item; data_item.data_len_ = data_size[i]; @@ -253,7 +292,14 @@ 
Status DeviceQueueOp::RetryPushGPUData(const std::vector &data_size, con while (!GpuBufferMgr::GetInstance().IsClosed() && !TaskManager::FindMe()->Interrupted()) { RETURN_IF_NOT_OK(MallocForGPUData(&items, curr_row)); + if (profiling) { + start_time = ProfilingTime::GetCurMilliSecond(); + } BlockQueueStatus_T ret = GpuBufferMgr::GetInstance().Push(handle, items, WAIT_TIME); + if (profiling) { + double end_time = ProfilingTime::GetCurMilliSecond(); + *push_time = (int32_t)(end_time - start_time); + } if (ret) { for (int i = 0; i < items.size(); i++) { ReleaseData(items[i].data_ptr_); diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.h b/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.h index dc24380f0db..7dc999dfa5b 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.h +++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.h @@ -168,7 +168,8 @@ class DeviceQueueOp : public PipelineOp { #ifdef ENABLE_GPUQUE Status SendDataToGPU(); - Status RetryPushGPUData(const std::vector &data_size, const TensorRow &curr_row, uint32_t handle); + Status RetryPushGPUData(const std::vector &data_size, const TensorRow &curr_row, uint32_t handle, + bool profiling, int32_t *push_time); Status MallocForGPUData(std::vector *items, const TensorRow &curr_row); void ReleaseData(void *addr); diff --git a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc index fbdb7459f3b..85b39369167 100644 --- a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc +++ b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc @@ -14,14 +14,16 @@ * limitations under the License. 
*/ -#include -#include -#include #include "profiler/device/gpu/gpu_profiling.h" + +#include +#include +#include #include "profiler/device/gpu/cupti_interface.h" #include "profiler/device/gpu/data_saver.h" -#include "utils/log_adapter.h" #include "pybind_api/api_register.h" +#include "utils/log_adapter.h" +#include "utils/utils.h" namespace mindspore { namespace profiler { @@ -456,6 +458,13 @@ void GPUProfiler::Stop() { ClearInst(); } +void GPUProfiler::SaveExtraProfileData() { + for (auto op : profiling_op_) { + op.second->SaveProfilingData(); + } + MS_LOG(INFO) << "Save extra profiling data end."; +} + void GPUProfiler::SaveProfileData() { if (profile_data_path_.empty()) { MS_LOG(WARNING) << "Profile data path is empty, skip save profile data."; @@ -464,6 +473,7 @@ void GPUProfiler::SaveProfileData() { dataSaver.ParseOpInfo(op_info_map_); dataSaver.ParseEvent(events_); dataSaver.WriteFile(profile_data_path_); + SaveExtraProfileData(); } } @@ -639,6 +649,13 @@ void GPUProfiler::HandleActivityRecord(CUpti_Activity *record) { AddEvent(std::move(profilingData)); } +void GPUProfiler::RegisterProfilingOp(std::shared_ptr node) { + if (profiling_op_.find(node->Name()) != profiling_op_.end()) { + return; + } + node->Init(); + profiling_op_[node->Name()] = node; +} void CUPTIAPI GPUProfiler::AllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords) { int stat = posix_memalign(reinterpret_cast(buffer), ALIGN_SIZE, BUF_SIZE); diff --git a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h index f83841dc26d..d3510d9a27f 100644 --- a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h +++ b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h @@ -18,14 +18,15 @@ #define MINDSPORE_GPU_PROFILING_H #include #include -#include -#include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include +#include #include +#include namespace mindspore { namespace profiler { @@ -109,6 
+110,18 @@ struct BaseTime { const float kTimeUnit = 1000; +class ProfilingOp { + public: + ProfilingOp() = default; + virtual ~ProfilingOp() = default; + virtual void SaveProfilingData() = 0; + virtual void Init() = 0; + std::string Name() const { return op_name_; } + + protected: + std::string op_name_; +}; + class GPUProfiler { public: static std::shared_ptr GetInstance(); @@ -130,6 +143,8 @@ class GPUProfiler { void OpDataProducerBegin(const std::string op_name, void *stream); void OpDataProducerEnd(); void ProcessEvents(); + void RegisterProfilingOp(std::shared_ptr node); + std::string ProfileDataPath() const { return profile_data_path_; } private: GPUProfiler() = default; @@ -153,6 +168,7 @@ class GPUProfiler { std::string op_name_; void *stream_; void SaveProfileData(); + void SaveExtraProfileData(); std::mutex event_mutex_; std::vector activities_enable_; @@ -172,6 +188,7 @@ class GPUProfiler { uint64_t op_host_time_stop_; uint64_t op_cupti_time_start_; std::string profile_data_path_; + std::map> profiling_op_; }; } // namespace gpu } // namespace profiler