forked from mindspore-Ecosystem/mindspore
!7430 MindData profiling for GPU
Merge pull request !7430 from yanghaitao/yht_gpu_profiling
This commit is contained in:
commit
b79b613acc
|
@ -13,21 +13,24 @@
|
||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "backend/kernel_compiler/gpu/data/dataset_iterator_kernel.h"
|
#include "backend/kernel_compiler/gpu/data/dataset_iterator_kernel.h"
|
||||||
|
|
||||||
#include <cuda_runtime_api.h>
|
#include <cuda_runtime_api.h>
|
||||||
|
#include <memory>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include "backend/kernel_compiler/gpu/data/dataset_utils.h"
|
||||||
|
#include "profiler/device/gpu/gpu_profiling.h"
|
||||||
#include "runtime/device/gpu/gpu_buffer_mgr.h"
|
#include "runtime/device/gpu/gpu_buffer_mgr.h"
|
||||||
#include "runtime/device/gpu/gpu_common.h"
|
#include "runtime/device/gpu/gpu_common.h"
|
||||||
#include "backend/kernel_compiler/gpu/data/dataset_utils.h"
|
|
||||||
|
|
||||||
namespace mindspore {
|
namespace mindspore {
|
||||||
namespace kernel {
|
namespace kernel {
|
||||||
using mindspore::device::GpuBufferMgr;
|
using mindspore::device::GpuBufferMgr;
|
||||||
using mindspore::device::HandleMgr;
|
using mindspore::device::HandleMgr;
|
||||||
|
|
||||||
DatasetIteratorKernel::DatasetIteratorKernel() : handle_(HandleMgr::INVALID_HANDLE), total_bytes_(0) {}
|
DatasetIteratorKernel::DatasetIteratorKernel()
|
||||||
|
: handle_(HandleMgr::INVALID_HANDLE), total_bytes_(0), profiling_enable_(false), profiling_op_(nullptr) {}
|
||||||
|
|
||||||
DatasetIteratorKernel::~DatasetIteratorKernel() { GpuBufferMgr::GetInstance().Close(handle_); }
|
DatasetIteratorKernel::~DatasetIteratorKernel() { GpuBufferMgr::GetInstance().Close(handle_); }
|
||||||
|
|
||||||
|
@ -60,6 +63,14 @@ bool DatasetIteratorKernel::Init(const CNodePtr &kernel_node) {
|
||||||
MS_LOG(EXCEPTION) << "Gpu Queue(" << queue_name_ << ") Open Failed";
|
MS_LOG(EXCEPTION) << "Gpu Queue(" << queue_name_ << ") Open Failed";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance();
|
||||||
|
MS_EXCEPTION_IF_NULL(profiler_inst);
|
||||||
|
profiling_enable_ = profiler_inst->GetEnableFlag();
|
||||||
|
if (profiling_enable_) {
|
||||||
|
std::string path = profiler_inst->ProfileDataPath();
|
||||||
|
profiling_op_ = std::make_shared<GetNextProfiling>(path);
|
||||||
|
profiler_inst->RegisterProfilingOp(profiling_op_);
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -69,11 +80,21 @@ bool DatasetIteratorKernel::Launch(const std::vector<AddressPtr> &, const std::v
|
||||||
const std::vector<AddressPtr> &outputs, void *stream) {
|
const std::vector<AddressPtr> &outputs, void *stream) {
|
||||||
void *addr = nullptr;
|
void *addr = nullptr;
|
||||||
size_t len = 0;
|
size_t len = 0;
|
||||||
|
uint64_t start_time_stamp = 0;
|
||||||
|
uint32_t queue_size = 0;
|
||||||
|
|
||||||
int repeat = 0;
|
int repeat = 0;
|
||||||
while (true) {
|
while (true) {
|
||||||
|
if (profiling_enable_) {
|
||||||
|
start_time_stamp = profiling_op_->GetTimeStamp();
|
||||||
|
queue_size = GpuBufferMgr::GetInstance().Size(handle_);
|
||||||
|
}
|
||||||
auto ret = GpuBufferMgr::GetInstance().Front(handle_, &addr, &len);
|
auto ret = GpuBufferMgr::GetInstance().Front(handle_, &addr, &len);
|
||||||
if (ret == device::SUCCESS) {
|
if (ret == device::SUCCESS) {
|
||||||
|
if (profiling_enable_) {
|
||||||
|
uint64_t end_time_stamp = profiling_op_->GetTimeStamp();
|
||||||
|
profiling_op_->RecordData(queue_size, start_time_stamp, end_time_stamp);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -84,10 +105,18 @@ bool DatasetIteratorKernel::Launch(const std::vector<AddressPtr> &, const std::v
|
||||||
continue;
|
continue;
|
||||||
} else {
|
} else {
|
||||||
MS_LOG(ERROR) << "Get data timeout";
|
MS_LOG(ERROR) << "Get data timeout";
|
||||||
|
if (profiling_enable_) {
|
||||||
|
uint64_t end_time_stamp = profiling_op_->GetTimeStamp();
|
||||||
|
profiling_op_->RecordData(queue_size, start_time_stamp, end_time_stamp);
|
||||||
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (profiling_enable_) {
|
||||||
|
uint64_t end_time_stamp = profiling_op_->GetTimeStamp();
|
||||||
|
profiling_op_->RecordData(queue_size, start_time_stamp, end_time_stamp);
|
||||||
|
}
|
||||||
MS_LOG(ERROR) << "Get data failed, errcode " << ret;
|
MS_LOG(ERROR) << "Get data failed, errcode " << ret;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,8 +17,10 @@
|
||||||
#ifndef MINDSPORE_GET_NEXT_KERNEL_H
|
#ifndef MINDSPORE_GET_NEXT_KERNEL_H
|
||||||
#define MINDSPORE_GET_NEXT_KERNEL_H
|
#define MINDSPORE_GET_NEXT_KERNEL_H
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include "backend/kernel_compiler/gpu/data/dataset_profiling.h"
|
||||||
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
|
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
|
||||||
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
|
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
|
||||||
|
|
||||||
|
@ -44,6 +46,8 @@ class DatasetIteratorKernel : public GpuKernel {
|
||||||
std::string queue_name_;
|
std::string queue_name_;
|
||||||
unsigned int handle_;
|
unsigned int handle_;
|
||||||
size_t total_bytes_;
|
size_t total_bytes_;
|
||||||
|
bool profiling_enable_;
|
||||||
|
std::shared_ptr<GetNextProfiling> profiling_op_;
|
||||||
|
|
||||||
std::vector<size_t> input_size_list_;
|
std::vector<size_t> input_size_list_;
|
||||||
std::vector<size_t> output_size_list_;
|
std::vector<size_t> output_size_list_;
|
||||||
|
|
|
@ -0,0 +1,70 @@
|
||||||
|
/**
|
||||||
|
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
#include "backend/kernel_compiler/gpu/data/dataset_profiling.h"
|
||||||
|
|
||||||
|
#include <fstream>
|
||||||
|
#include <memory>
|
||||||
|
#include <string>
|
||||||
|
#include <utility>
|
||||||
|
#include "utils/log_adapter.h"
|
||||||
|
#include "utils/ms_utils.h"
|
||||||
|
#include "utils/utils.h"
|
||||||
|
|
||||||
|
namespace mindspore {
|
||||||
|
namespace kernel {
|
||||||
|
|
||||||
|
GetNextProfiling::GetNextProfiling(const std::string &path) : profiling_path_(path) {}
|
||||||
|
|
||||||
|
// Caches the device id read from the DEVICE_ID environment variable in
// device_id_. When the variable yields an empty value, "0" is used instead.
void GetNextProfiling::GetDeviceId() {
  device_id_ = common::GetEnv("DEVICE_ID");
  if (!device_id_.empty()) {
    return;
  }
  // DEVICE_ID is not set: fall back to the default device.
  device_id_ = "0";
}
|
||||||
|
|
||||||
|
// Prepares the op for recording: assigns its name, resolves the device id and
// derives the output file path
// "<profiling_path_>/minddata_getnext_profiling_<device_id_>.txt".
void GetNextProfiling::Init() {
  op_name_ = kGetNextOpName;

  // The device id is part of the file name, so it has to be resolved first.
  GetDeviceId();

  file_name_ = profiling_path_;
  file_name_ += "/minddata_getnext_profiling_";
  file_name_ += device_id_;
  file_name_ += ".txt";
}
|
||||||
|
|
||||||
|
void GetNextProfiling::SaveProfilingData() {
|
||||||
|
std::ofstream handle(file_name_, std::ios::trunc);
|
||||||
|
if (!handle.is_open()) {
|
||||||
|
MS_LOG(ERROR) << "Open get-next profiling file failed.";
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
for (uint32_t index = 0; index < queue_size_.size(); index++) {
|
||||||
|
handle << Name() << " " << time_stamp_[index].first << " " << time_stamp_[index].second << " " << queue_size_[index]
|
||||||
|
<< std::endl;
|
||||||
|
}
|
||||||
|
handle.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Appends one sample: the queue size observed for the fetch, and the
// (start, end) time stamp pair of that fetch. The two vectors are always
// extended together, so entry i of each describes the same sample.
void GetNextProfiling::RecordData(uint32_t queue_size, uint64_t start_time_stamp, uint64_t end_time_stamp) {
  queue_size_.emplace_back(queue_size);
  // Build the pair directly inside the vector.
  time_stamp_.emplace_back(start_time_stamp, end_time_stamp);
}
|
||||||
|
|
||||||
|
// Returns the current std::chrono::system_clock time, expressed as the number
// of nanoseconds since that clock's epoch.
uint64_t GetNextProfiling::GetTimeStamp() const {
  const auto since_epoch = std::chrono::system_clock::now().time_since_epoch();
  const auto nanos = std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch);
  return nanos.count();
}
|
||||||
|
} // namespace kernel
|
||||||
|
} // namespace mindspore
|
|
@ -0,0 +1,50 @@
|
||||||
|
/**
|
||||||
|
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_DATASET_DATASET_PROFILING_H_
|
||||||
|
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_DATASET_DATASET_PROFILING_H_
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
#include <string>
|
||||||
|
#include <utility>
|
||||||
|
#include <vector>
|
||||||
|
#include "profiler/device/gpu/gpu_profiling.h"
|
||||||
|
|
||||||
|
using mindspore::profiler::gpu::ProfilingOp;
|
||||||
|
|
||||||
|
namespace mindspore {
|
||||||
|
namespace kernel {
|
||||||
|
class GetNextProfiling : public ProfilingOp {
|
||||||
|
public:
|
||||||
|
explicit GetNextProfiling(const std::string &path);
|
||||||
|
~GetNextProfiling() = default;
|
||||||
|
void SaveProfilingData();
|
||||||
|
void GetDeviceId();
|
||||||
|
uint64_t GetTimeStamp() const;
|
||||||
|
void RecordData(uint32_t queue_size, uint64_t start_time_stamp, uint64_t end_time_stamp);
|
||||||
|
void Init();
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::string profiling_path_;
|
||||||
|
std::string file_name_;
|
||||||
|
std::vector<uint32_t> queue_size_;
|
||||||
|
std::vector<std::pair<uint64_t, uint64_t>> time_stamp_; // First value of std::pair is the start time stamp,
|
||||||
|
// Second value of std::pair is the stop time stamp
|
||||||
|
std::string device_id_;
|
||||||
|
};
|
||||||
|
} // namespace kernel
|
||||||
|
} // namespace mindspore
|
||||||
|
|
||||||
|
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_DATASET_DATASET_PROFILING_H_
|
|
@ -14,18 +14,19 @@
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include "minddata/dataset/engine/datasetops/device_queue_op.h"
|
||||||
|
|
||||||
#include <iomanip>
|
#include <iomanip>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include "minddata/dataset/core/config_manager.h"
|
#include "minddata/dataset/core/config_manager.h"
|
||||||
#include "minddata/dataset/core/global_context.h"
|
#include "minddata/dataset/core/global_context.h"
|
||||||
#include "minddata/dataset/engine/datasetops/device_queue_op.h"
|
|
||||||
#include "minddata/dataset/engine/data_buffer.h"
|
#include "minddata/dataset/engine/data_buffer.h"
|
||||||
#include "minddata/dataset/engine/dataset_iterator.h"
|
#include "minddata/dataset/engine/dataset_iterator.h"
|
||||||
#include "minddata/dataset/engine/opt/pass.h"
|
|
||||||
#include "minddata/dataset/engine/perf/profiling.h"
|
|
||||||
#include "minddata/dataset/engine/perf/device_queue_tracing.h"
|
|
||||||
#include "minddata/dataset/engine/datasetops/epoch_ctrl_op.h"
|
#include "minddata/dataset/engine/datasetops/epoch_ctrl_op.h"
|
||||||
|
#include "minddata/dataset/engine/opt/pass.h"
|
||||||
|
#include "minddata/dataset/engine/perf/device_queue_tracing.h"
|
||||||
|
#include "minddata/dataset/engine/perf/profiling.h"
|
||||||
#include "minddata/dataset/util/status.h"
|
#include "minddata/dataset/util/status.h"
|
||||||
#include "minddata/dataset/util/task_manager.h"
|
#include "minddata/dataset/util/task_manager.h"
|
||||||
|
|
||||||
|
@ -197,6 +198,19 @@ Status DeviceQueueOp::SendDataToGPU() {
|
||||||
bool is_open = false;
|
bool is_open = false;
|
||||||
uint32_t handle = INVALID_HANDLE;
|
uint32_t handle = INVALID_HANDLE;
|
||||||
auto release_function = std::bind(&DeviceQueueOp::ReleaseData, this, std::placeholders::_1);
|
auto release_function = std::bind(&DeviceQueueOp::ReleaseData, this, std::placeholders::_1);
|
||||||
|
double batch_start_time, end_time;
|
||||||
|
int32_t batch_cost, push_cost;
|
||||||
|
int32_t connector_size = 0;
|
||||||
|
int32_t connector_capacity;
|
||||||
|
std::shared_ptr<DeviceQueueTracing> profiling_node;
|
||||||
|
bool isProfilingEnable = tree_->GetProfilingManager()->IsProfilingEnable();
|
||||||
|
if (isProfilingEnable) {
|
||||||
|
std::shared_ptr<Tracing> node;
|
||||||
|
RETURN_IF_NOT_OK(tree_->GetProfilingManager()->GetTracingNode(kDeviceQueueTracingName, &node));
|
||||||
|
profiling_node = std::dynamic_pointer_cast<DeviceQueueTracing>(node);
|
||||||
|
batch_start_time = ProfilingTime::GetCurMilliSecond();
|
||||||
|
connector_capacity = ChildOpConnectorCapacity();
|
||||||
|
}
|
||||||
|
|
||||||
std::unique_ptr<DataBuffer> current_buffer;
|
std::unique_ptr<DataBuffer> current_buffer;
|
||||||
RETURN_IF_NOT_OK(GetNextInput(¤t_buffer));
|
RETURN_IF_NOT_OK(GetNextInput(¤t_buffer));
|
||||||
|
@ -220,20 +234,44 @@ Status DeviceQueueOp::SendDataToGPU() {
|
||||||
}
|
}
|
||||||
is_open = true;
|
is_open = true;
|
||||||
}
|
}
|
||||||
RETURN_IF_NOT_OK(RetryPushGPUData(data_size, curr_row, handle));
|
RETURN_IF_NOT_OK(RetryPushGPUData(data_size, curr_row, handle, isProfilingEnable, &push_cost));
|
||||||
total_batch++;
|
total_batch++;
|
||||||
|
if (isProfilingEnable) {
|
||||||
|
end_time = ProfilingTime::GetCurMilliSecond();
|
||||||
|
// record push data time
|
||||||
|
profiling_node->Record(TIME, TDT_PUSH_TIME, total_batch, push_cost);
|
||||||
|
batch_cost = (int32_t)(end_time - batch_start_time);
|
||||||
|
// record batch time
|
||||||
|
profiling_node->Record(TIME, BATCH_TIME, total_batch, batch_cost);
|
||||||
|
// record pipeline time
|
||||||
|
profiling_node->Record(TIME, PIPELINE_TIME, total_batch, batch_cost - push_cost);
|
||||||
|
batch_start_time = end_time;
|
||||||
|
// record connector depth
|
||||||
|
profiling_node->Record(CONNECTOR_DEPTH, connector_capacity, total_batch, connector_size);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (!TaskManager::FindMe()->Interrupted() && !GpuBufferMgr::GetInstance().IsClosed())
|
if (!TaskManager::FindMe()->Interrupted() && !GpuBufferMgr::GetInstance().IsClosed()) {
|
||||||
|
if (isProfilingEnable) {
|
||||||
|
connector_size = ChildOpConnectorSize();
|
||||||
|
connector_capacity = ChildOpConnectorCapacity();
|
||||||
|
}
|
||||||
RETURN_IF_NOT_OK(GetNextInput(¤t_buffer));
|
RETURN_IF_NOT_OK(GetNextInput(¤t_buffer));
|
||||||
else
|
} else {
|
||||||
is_break_loop = true;
|
is_break_loop = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (!TaskManager::FindMe()->Interrupted() && !GpuBufferMgr::GetInstance().IsClosed())
|
if (!TaskManager::FindMe()->Interrupted() && !GpuBufferMgr::GetInstance().IsClosed()) {
|
||||||
|
if (isProfilingEnable) {
|
||||||
|
connector_size = ChildOpConnectorSize();
|
||||||
|
connector_capacity = ChildOpConnectorCapacity();
|
||||||
|
}
|
||||||
RETURN_IF_NOT_OK(GetNextInput(¤t_buffer));
|
RETURN_IF_NOT_OK(GetNextInput(¤t_buffer));
|
||||||
else
|
} else {
|
||||||
is_break_loop = true;
|
is_break_loop = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
tree_->SetFinished();
|
||||||
MS_LOG(INFO) << "Device queue total batch is " << total_batch << ".";
|
MS_LOG(INFO) << "Device queue total batch is " << total_batch << ".";
|
||||||
|
|
||||||
GpuBufferMgr::GetInstance().Close(handle);
|
GpuBufferMgr::GetInstance().Close(handle);
|
||||||
|
@ -241,9 +279,10 @@ Status DeviceQueueOp::SendDataToGPU() {
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
}
|
}
|
||||||
|
|
||||||
Status DeviceQueueOp::RetryPushGPUData(const std::vector<size_t> &data_size, const TensorRow &curr_row,
|
Status DeviceQueueOp::RetryPushGPUData(const std::vector<size_t> &data_size, const TensorRow &curr_row, uint32_t handle,
|
||||||
uint32_t handle) {
|
bool profiling, int32_t *push_time) {
|
||||||
std::vector<device::DataItemGpu> items;
|
std::vector<device::DataItemGpu> items;
|
||||||
|
double start_time;
|
||||||
for (int i = 0; i < data_size.size(); i++) {
|
for (int i = 0; i < data_size.size(); i++) {
|
||||||
device::DataItemGpu data_item;
|
device::DataItemGpu data_item;
|
||||||
data_item.data_len_ = data_size[i];
|
data_item.data_len_ = data_size[i];
|
||||||
|
@ -253,7 +292,14 @@ Status DeviceQueueOp::RetryPushGPUData(const std::vector<size_t> &data_size, con
|
||||||
|
|
||||||
while (!GpuBufferMgr::GetInstance().IsClosed() && !TaskManager::FindMe()->Interrupted()) {
|
while (!GpuBufferMgr::GetInstance().IsClosed() && !TaskManager::FindMe()->Interrupted()) {
|
||||||
RETURN_IF_NOT_OK(MallocForGPUData(&items, curr_row));
|
RETURN_IF_NOT_OK(MallocForGPUData(&items, curr_row));
|
||||||
|
if (profiling) {
|
||||||
|
start_time = ProfilingTime::GetCurMilliSecond();
|
||||||
|
}
|
||||||
BlockQueueStatus_T ret = GpuBufferMgr::GetInstance().Push(handle, items, WAIT_TIME);
|
BlockQueueStatus_T ret = GpuBufferMgr::GetInstance().Push(handle, items, WAIT_TIME);
|
||||||
|
if (profiling) {
|
||||||
|
double end_time = ProfilingTime::GetCurMilliSecond();
|
||||||
|
*push_time = (int32_t)(end_time - start_time);
|
||||||
|
}
|
||||||
if (ret) {
|
if (ret) {
|
||||||
for (int i = 0; i < items.size(); i++) {
|
for (int i = 0; i < items.size(); i++) {
|
||||||
ReleaseData(items[i].data_ptr_);
|
ReleaseData(items[i].data_ptr_);
|
||||||
|
|
|
@ -168,7 +168,8 @@ class DeviceQueueOp : public PipelineOp {
|
||||||
|
|
||||||
#ifdef ENABLE_GPUQUE
|
#ifdef ENABLE_GPUQUE
|
||||||
Status SendDataToGPU();
|
Status SendDataToGPU();
|
||||||
Status RetryPushGPUData(const std::vector<size_t> &data_size, const TensorRow &curr_row, uint32_t handle);
|
Status RetryPushGPUData(const std::vector<size_t> &data_size, const TensorRow &curr_row, uint32_t handle,
|
||||||
|
bool profiling, int32_t *push_time);
|
||||||
Status MallocForGPUData(std::vector<device::DataItemGpu> *items, const TensorRow &curr_row);
|
Status MallocForGPUData(std::vector<device::DataItemGpu> *items, const TensorRow &curr_row);
|
||||||
void ReleaseData(void *addr);
|
void ReleaseData(void *addr);
|
||||||
|
|
||||||
|
|
|
@ -14,14 +14,16 @@
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <cxxabi.h>
|
|
||||||
#include <cmath>
|
|
||||||
#include <chrono>
|
|
||||||
#include "profiler/device/gpu/gpu_profiling.h"
|
#include "profiler/device/gpu/gpu_profiling.h"
|
||||||
|
|
||||||
|
#include <cxxabi.h>
|
||||||
|
#include <chrono>
|
||||||
|
#include <cmath>
|
||||||
#include "profiler/device/gpu/cupti_interface.h"
|
#include "profiler/device/gpu/cupti_interface.h"
|
||||||
#include "profiler/device/gpu/data_saver.h"
|
#include "profiler/device/gpu/data_saver.h"
|
||||||
#include "utils/log_adapter.h"
|
|
||||||
#include "pybind_api/api_register.h"
|
#include "pybind_api/api_register.h"
|
||||||
|
#include "utils/log_adapter.h"
|
||||||
|
#include "utils/utils.h"
|
||||||
|
|
||||||
namespace mindspore {
|
namespace mindspore {
|
||||||
namespace profiler {
|
namespace profiler {
|
||||||
|
@ -456,6 +458,13 @@ void GPUProfiler::Stop() {
|
||||||
ClearInst();
|
ClearInst();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void GPUProfiler::SaveExtraProfileData() {
|
||||||
|
for (auto op : profiling_op_) {
|
||||||
|
op.second->SaveProfilingData();
|
||||||
|
}
|
||||||
|
MS_LOG(INFO) << "Save extra profiling data end.";
|
||||||
|
}
|
||||||
|
|
||||||
void GPUProfiler::SaveProfileData() {
|
void GPUProfiler::SaveProfileData() {
|
||||||
if (profile_data_path_.empty()) {
|
if (profile_data_path_.empty()) {
|
||||||
MS_LOG(WARNING) << "Profile data path is empty, skip save profile data.";
|
MS_LOG(WARNING) << "Profile data path is empty, skip save profile data.";
|
||||||
|
@ -464,6 +473,7 @@ void GPUProfiler::SaveProfileData() {
|
||||||
dataSaver.ParseOpInfo(op_info_map_);
|
dataSaver.ParseOpInfo(op_info_map_);
|
||||||
dataSaver.ParseEvent(events_);
|
dataSaver.ParseEvent(events_);
|
||||||
dataSaver.WriteFile(profile_data_path_);
|
dataSaver.WriteFile(profile_data_path_);
|
||||||
|
SaveExtraProfileData();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -639,6 +649,13 @@ void GPUProfiler::HandleActivityRecord(CUpti_Activity *record) {
|
||||||
|
|
||||||
AddEvent(std::move(profilingData));
|
AddEvent(std::move(profilingData));
|
||||||
}
|
}
|
||||||
|
void GPUProfiler::RegisterProfilingOp(std::shared_ptr<ProfilingOp> node) {
|
||||||
|
if (profiling_op_.find(node->Name()) != profiling_op_.end()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
node->Init();
|
||||||
|
profiling_op_[node->Name()] = node;
|
||||||
|
}
|
||||||
|
|
||||||
void CUPTIAPI GPUProfiler::AllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords) {
|
void CUPTIAPI GPUProfiler::AllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords) {
|
||||||
int stat = posix_memalign(reinterpret_cast<void **>(buffer), ALIGN_SIZE, BUF_SIZE);
|
int stat = posix_memalign(reinterpret_cast<void **>(buffer), ALIGN_SIZE, BUF_SIZE);
|
||||||
|
|
|
@ -18,14 +18,15 @@
|
||||||
#define MINDSPORE_GPU_PROFILING_H
|
#define MINDSPORE_GPU_PROFILING_H
|
||||||
#include <cuda.h>
|
#include <cuda.h>
|
||||||
#include <cupti.h>
|
#include <cupti.h>
|
||||||
#include <cstdio>
|
|
||||||
#include <unordered_map>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include <mutex>
|
|
||||||
#include <memory>
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <map>
|
||||||
|
#include <memory>
|
||||||
|
#include <mutex>
|
||||||
|
#include <string>
|
||||||
|
#include <unordered_map>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
namespace mindspore {
|
namespace mindspore {
|
||||||
namespace profiler {
|
namespace profiler {
|
||||||
|
@ -109,6 +110,18 @@ struct BaseTime {
|
||||||
|
|
||||||
const float kTimeUnit = 1000;
|
const float kTimeUnit = 1000;
|
||||||
|
|
||||||
|
// Interface for an auxiliary profiling data source that can be registered with
// GPUProfiler. Implementations provide initialization and persistence; the
// name stored in op_name_ identifies the op.
class ProfilingOp {
 public:
  ProfilingOp() = default;
  virtual ~ProfilingOp() = default;

  // Prepares the op before it is used.
  virtual void Init() = 0;

  // Persists whatever data the op has collected.
  virtual void SaveProfilingData() = 0;

  // Returns a copy of the op's name.
  std::string Name() const { return op_name_; }

 protected:
  // Name reported by Name(); set by the implementing class.
  std::string op_name_;
};
|
||||||
|
|
||||||
class GPUProfiler {
|
class GPUProfiler {
|
||||||
public:
|
public:
|
||||||
static std::shared_ptr<GPUProfiler> GetInstance();
|
static std::shared_ptr<GPUProfiler> GetInstance();
|
||||||
|
@ -130,6 +143,8 @@ class GPUProfiler {
|
||||||
void OpDataProducerBegin(const std::string op_name, void *stream);
|
void OpDataProducerBegin(const std::string op_name, void *stream);
|
||||||
void OpDataProducerEnd();
|
void OpDataProducerEnd();
|
||||||
void ProcessEvents();
|
void ProcessEvents();
|
||||||
|
void RegisterProfilingOp(std::shared_ptr<ProfilingOp> node);
|
||||||
|
std::string ProfileDataPath() const { return profile_data_path_; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
GPUProfiler() = default;
|
GPUProfiler() = default;
|
||||||
|
@ -153,6 +168,7 @@ class GPUProfiler {
|
||||||
std::string op_name_;
|
std::string op_name_;
|
||||||
void *stream_;
|
void *stream_;
|
||||||
void SaveProfileData();
|
void SaveProfileData();
|
||||||
|
void SaveExtraProfileData();
|
||||||
std::mutex event_mutex_;
|
std::mutex event_mutex_;
|
||||||
|
|
||||||
std::vector<CUpti_ActivityKind> activities_enable_;
|
std::vector<CUpti_ActivityKind> activities_enable_;
|
||||||
|
@ -172,6 +188,7 @@ class GPUProfiler {
|
||||||
uint64_t op_host_time_stop_;
|
uint64_t op_host_time_stop_;
|
||||||
uint64_t op_cupti_time_start_;
|
uint64_t op_cupti_time_start_;
|
||||||
std::string profile_data_path_;
|
std::string profile_data_path_;
|
||||||
|
std::map<std::string, std::shared_ptr<ProfilingOp>> profiling_op_;
|
||||||
};
|
};
|
||||||
} // namespace gpu
|
} // namespace gpu
|
||||||
} // namespace profiler
|
} // namespace profiler
|
||||||
|
|
Loading…
Reference in New Issue