forked from mindspore-Ecosystem/mindspore
!21362 add pynative profiling codes based on ascend and gpu
Merge pull request !21362 from lvchangquan/profiling_refactor
This commit is contained in:
commit
6afcd815d2
|
@ -32,6 +32,7 @@
|
||||||
#include "ir/tensor.h"
|
#include "ir/tensor.h"
|
||||||
#include "utils/any.h"
|
#include "utils/any.h"
|
||||||
#include "utils/utils.h"
|
#include "utils/utils.h"
|
||||||
|
#include "utils/profile.h"
|
||||||
#include "utils/ms_context.h"
|
#include "utils/ms_context.h"
|
||||||
#include "utils/check_convert_utils.h"
|
#include "utils/check_convert_utils.h"
|
||||||
#include "utils/context/context_extends.h"
|
#include "utils/context/context_extends.h"
|
||||||
|
@ -67,6 +68,7 @@
|
||||||
|
|
||||||
#include "debug/anf_ir_dump.h"
|
#include "debug/anf_ir_dump.h"
|
||||||
#include "runtime/hardware/device_context_manager.h"
|
#include "runtime/hardware/device_context_manager.h"
|
||||||
|
#include "runtime/device/pynative_profiling.h"
|
||||||
|
|
||||||
using mindspore::tensor::TensorPy;
|
using mindspore::tensor::TensorPy;
|
||||||
|
|
||||||
|
@ -702,6 +704,9 @@ py::object GetDstType(const TypeId &type_id) {
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
py::object RealRunOp(const py::args &args) {
|
py::object RealRunOp(const py::args &args) {
|
||||||
|
auto real_run_op_start = GetTime();
|
||||||
|
auto &profiler_inst = device::PynativeProfiler::GetInstance();
|
||||||
|
profiler_inst.AddRealRunOpIndex();
|
||||||
CheckPyNativeContext();
|
CheckPyNativeContext();
|
||||||
auto executor = PynativeExecutor::GetInstance();
|
auto executor = PynativeExecutor::GetInstance();
|
||||||
MS_EXCEPTION_IF_NULL(executor);
|
MS_EXCEPTION_IF_NULL(executor);
|
||||||
|
@ -709,6 +714,10 @@ py::object RealRunOp(const py::args &args) {
|
||||||
MS_EXCEPTION_IF_NULL(op_exec_info);
|
MS_EXCEPTION_IF_NULL(op_exec_info);
|
||||||
py::object ret = py::none();
|
py::object ret = py::none();
|
||||||
PynativeExecutorTry(executor->forward_executor()->RunOpS, &ret, op_exec_info);
|
PynativeExecutorTry(executor->forward_executor()->RunOpS, &ret, op_exec_info);
|
||||||
|
auto real_run_op_end = GetTime();
|
||||||
|
profiler_inst.SetRealRunOpName(op_exec_info->op_name);
|
||||||
|
profiler_inst.SetRealRunOpTime(std::make_pair(real_run_op_start, real_run_op_end));
|
||||||
|
profiler_inst.SingleOpProfilingData();
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -25,6 +25,7 @@
|
||||||
#include "pybind_api/api_register.h"
|
#include "pybind_api/api_register.h"
|
||||||
#include "utils/log_adapter.h"
|
#include "utils/log_adapter.h"
|
||||||
#include "utils/utils.h"
|
#include "utils/utils.h"
|
||||||
|
#include "utils/profile.h"
|
||||||
#include "utils/ms_context.h"
|
#include "utils/ms_context.h"
|
||||||
|
|
||||||
namespace mindspore {
|
namespace mindspore {
|
||||||
|
@ -446,6 +447,12 @@ void GPUProfiler::OpDataProducerBegin(const std::string op_name, void *stream) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void GPUProfiler::SingleOpLaunchTimeProcess(float op_time_elapsed) {
|
||||||
|
auto launch_end_time = GetTime();
|
||||||
|
double launch_start_time = launch_end_time - op_time_elapsed / kTimeUnit / kTimeUnit;
|
||||||
|
SetSingleOpLaunchTime(std::make_pair(launch_start_time, launch_end_time));
|
||||||
|
}
|
||||||
|
|
||||||
void GPUProfiler::OpDataProducerEnd() {
|
void GPUProfiler::OpDataProducerEnd() {
|
||||||
float op_time_elapsed = 0;
|
float op_time_elapsed = 0;
|
||||||
if (sync_enable_flag_) {
|
if (sync_enable_flag_) {
|
||||||
|
@ -459,9 +466,11 @@ void GPUProfiler::OpDataProducerEnd() {
|
||||||
CHECK_CUDA_RET_WITH_ERROR(cudaEventDestroy(op_event_stop_), "cudaEventDestroy op event stop failed");
|
CHECK_CUDA_RET_WITH_ERROR(cudaEventDestroy(op_event_stop_), "cudaEventDestroy op event stop failed");
|
||||||
op_time_elapsed = op_time_elapsed * kTimeUnit;
|
op_time_elapsed = op_time_elapsed * kTimeUnit;
|
||||||
op_host_time_stop_ = GetHostTimeStamp();
|
op_host_time_stop_ = GetHostTimeStamp();
|
||||||
|
SingleOpLaunchTimeProcess(op_time_elapsed);
|
||||||
} else {
|
} else {
|
||||||
op_host_time_stop_ = GetHostTimeStamp();
|
op_host_time_stop_ = GetHostTimeStamp();
|
||||||
op_time_elapsed = (op_host_time_stop_ - op_host_time_start_) / kTimeUnit;
|
op_time_elapsed = (op_host_time_stop_ - op_host_time_start_) / kTimeUnit;
|
||||||
|
SingleOpLaunchTimeProcess(op_time_elapsed);
|
||||||
}
|
}
|
||||||
MS_LOG(DEBUG) << "Host Time Elapsed(us)," << op_name_ << "," << op_time_elapsed;
|
MS_LOG(DEBUG) << "Host Time Elapsed(us)," << op_name_ << "," << op_time_elapsed;
|
||||||
Profiler::SetRunTimeData(op_name_, op_time_elapsed);
|
Profiler::SetRunTimeData(op_name_, op_time_elapsed);
|
||||||
|
|
|
@ -135,6 +135,7 @@ class GPUProfiler : public Profiler {
|
||||||
std::string ProfileDataPath() const { return profile_data_path_; }
|
std::string ProfileDataPath() const { return profile_data_path_; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
void SingleOpLaunchTimeProcess(float op_time_elapsed);
|
||||||
void OpsParser();
|
void OpsParser();
|
||||||
void EventLog(const Event &event);
|
void EventLog(const Event &event);
|
||||||
void ClearInst() override;
|
void ClearInst() override;
|
||||||
|
|
|
@ -79,6 +79,10 @@ class Profiler {
|
||||||
bool GetEnableFlag() const { return enable_flag_; }
|
bool GetEnableFlag() const { return enable_flag_; }
|
||||||
std::string ProfileDataPath() const { return profile_data_path_; }
|
std::string ProfileDataPath() const { return profile_data_path_; }
|
||||||
void RecordOneStepStartEndInfo(std::string op_name);
|
void RecordOneStepStartEndInfo(std::string op_name);
|
||||||
|
std::pair<double, double> GetSingleOpLaunchTime() { return single_op_launch_start_time_end_time_; }
|
||||||
|
void SetSingleOpLaunchTime(const std::pair<double, double> &launch_start_end) {
|
||||||
|
single_op_launch_start_time_end_time_ = launch_start_end;
|
||||||
|
}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
void SetRunTimeData(const std::string &op_name, const float time_elapsed);
|
void SetRunTimeData(const std::string &op_name, const float time_elapsed);
|
||||||
|
@ -86,6 +90,7 @@ class Profiler {
|
||||||
uint64_t GetHostMonoTimeStamp() const;
|
uint64_t GetHostMonoTimeStamp() const;
|
||||||
virtual void SaveProfileData() = 0;
|
virtual void SaveProfileData() = 0;
|
||||||
virtual void ClearInst() = 0;
|
virtual void ClearInst() = 0;
|
||||||
|
std::pair<double, double> single_op_launch_start_time_end_time_;
|
||||||
bool enable_flag_ = false;
|
bool enable_flag_ = false;
|
||||||
std::string profile_data_path_;
|
std::string profile_data_path_;
|
||||||
std::unordered_map<std::string, OpInfo> op_info_map_;
|
std::unordered_map<std::string, OpInfo> op_info_map_;
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
file(GLOB_RECURSE DEVICE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "common/*.cc"
|
file(GLOB_RECURSE DEVICE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "common/*.cc"
|
||||||
"kernel_info.cc" "executor/dynamic_kernel.cc" "executor/executor_callback.cc" "kernel_runtime.cc"
|
"kernel_info.cc" "executor/dynamic_kernel.cc" "executor/executor_callback.cc" "kernel_runtime.cc"
|
||||||
"memory_manager.cc" "kernel_runtime_manager.cc" "convert_tensor_utils.cc"
|
"memory_manager.cc" "kernel_runtime_manager.cc" "convert_tensor_utils.cc"
|
||||||
"bucket.cc" "launch_kernel.cc" "launch_mul.cc"
|
"bucket.cc" "launch_kernel.cc" "launch_mul.cc" "pynative_profiling.cc"
|
||||||
)
|
)
|
||||||
|
|
||||||
if("${ENABLE_HIDDEN}" STREQUAL "OFF")
|
if("${ENABLE_HIDDEN}" STREQUAL "OFF")
|
||||||
|
|
|
@ -29,6 +29,14 @@ AscendEvent::AscendEvent() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
AscendTimeEvent::AscendTimeEvent() {
|
||||||
|
auto ret = rtEventCreateWithFlag(&event_, RT_EVENT_TIME_LINE);
|
||||||
|
if (ret != RT_ERROR_NONE) {
|
||||||
|
MS_LOG(ERROR) << "rtEventCreate failed, ret:" << ret;
|
||||||
|
event_ = nullptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
AscendEvent::~AscendEvent() {
|
AscendEvent::~AscendEvent() {
|
||||||
auto ret = rtEventDestroy(event_);
|
auto ret = rtEventDestroy(event_);
|
||||||
if (ret != RT_ERROR_NONE) {
|
if (ret != RT_ERROR_NONE) {
|
||||||
|
@ -60,5 +68,24 @@ void AscendEvent::WaitEvent() {
|
||||||
need_wait_ = false;
|
need_wait_ = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void AscendEvent::SyncEvent() {
|
||||||
|
MS_EXCEPTION_IF_NULL(event_);
|
||||||
|
auto ret = rtEventSynchronize(event_);
|
||||||
|
if (ret != RT_ERROR_NONE) {
|
||||||
|
MS_LOG(EXCEPTION) << "rtEventSynchronize failed, ret:" << ret;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void AscendEvent::ElapsedTime(float *cost_time, DeviceEvent *other) {
|
||||||
|
MS_EXCEPTION_IF_NULL(event_);
|
||||||
|
auto ascend_other = static_cast<AscendEvent *>(other);
|
||||||
|
MS_EXCEPTION_IF_NULL(ascend_other);
|
||||||
|
MS_EXCEPTION_IF_NULL(ascend_other->event_);
|
||||||
|
auto ret = rtEventElapsedTime(cost_time, event_, ascend_other->event_);
|
||||||
|
if (ret != RT_ERROR_NONE) {
|
||||||
|
MS_LOG(EXCEPTION) << "rtEventElapsedTime failed, ret:" << ret;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
bool AscendEvent::NeedWait() { return need_wait_; }
|
bool AscendEvent::NeedWait() { return need_wait_; }
|
||||||
} // namespace mindspore::device::ascend
|
} // namespace mindspore::device::ascend
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
|
|
||||||
#include "runtime/base.h"
|
#include "runtime/base.h"
|
||||||
#include "ir/device_event.h"
|
#include "ir/device_event.h"
|
||||||
|
|
||||||
namespace mindspore::device::ascend {
|
namespace mindspore::device::ascend {
|
||||||
class AscendEvent : public DeviceEvent {
|
class AscendEvent : public DeviceEvent {
|
||||||
public:
|
public:
|
||||||
|
@ -28,14 +29,22 @@ class AscendEvent : public DeviceEvent {
|
||||||
void WaitEvent() override;
|
void WaitEvent() override;
|
||||||
void RecordEvent() override;
|
void RecordEvent() override;
|
||||||
bool NeedWait() override;
|
bool NeedWait() override;
|
||||||
|
void SyncEvent() override;
|
||||||
|
void ElapsedTime(float *cost_time, DeviceEvent *other) override;
|
||||||
void set_wait_stream(rtStream_t wait_stream) override { wait_stream_ = wait_stream; }
|
void set_wait_stream(rtStream_t wait_stream) override { wait_stream_ = wait_stream; }
|
||||||
void set_record_stream(rtStream_t record_stream) override { record_stream_ = record_stream; }
|
void set_record_stream(rtStream_t record_stream) override { record_stream_ = record_stream; }
|
||||||
|
|
||||||
private:
|
protected:
|
||||||
rtEvent_t event_{nullptr};
|
rtEvent_t event_{nullptr};
|
||||||
rtStream_t wait_stream_{nullptr};
|
rtStream_t wait_stream_{nullptr};
|
||||||
rtStream_t record_stream_{nullptr};
|
rtStream_t record_stream_{nullptr};
|
||||||
bool need_wait_{false};
|
bool need_wait_{false};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
class AscendTimeEvent : public AscendEvent {
|
||||||
|
public:
|
||||||
|
AscendTimeEvent();
|
||||||
|
~AscendTimeEvent() override = default;
|
||||||
|
};
|
||||||
} // namespace mindspore::device::ascend
|
} // namespace mindspore::device::ascend
|
||||||
#endif // MINDSPORE_ASCEND_EVENT_H
|
#endif // MINDSPORE_ASCEND_EVENT_H
|
||||||
|
|
|
@ -282,6 +282,13 @@ void AscendKernelRuntime::PreInit() {
|
||||||
}
|
}
|
||||||
|
|
||||||
bool AscendKernelRuntime::Init() {
|
bool AscendKernelRuntime::Init() {
|
||||||
|
auto ms_context = MsContext::GetInstance();
|
||||||
|
MS_EXCEPTION_IF_NULL(ms_context);
|
||||||
|
auto execution_mode = ms_context->get_param<int>(MS_CTX_EXECUTION_MODE);
|
||||||
|
auto profiling_flag = ms_context->get_param<bool>(MS_CTX_ENABLE_PROFILING);
|
||||||
|
if (execution_mode == kPynativeMode && profiling_flag) {
|
||||||
|
pynative_mode_profiling_flag_ = true;
|
||||||
|
}
|
||||||
if (initialized_) {
|
if (initialized_) {
|
||||||
SetCurrentContext();
|
SetCurrentContext();
|
||||||
return true;
|
return true;
|
||||||
|
@ -946,6 +953,12 @@ std::shared_ptr<DeviceEvent> AscendKernelRuntime::CreateDeviceEvent() {
|
||||||
return ascend_event;
|
return ascend_event;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::shared_ptr<DeviceEvent> AscendKernelRuntime::CreateDeviceTimeEvent() {
|
||||||
|
auto ascend_time_event = std::make_shared<AscendTimeEvent>();
|
||||||
|
MS_EXCEPTION_IF_NULL(ascend_time_event);
|
||||||
|
return ascend_time_event;
|
||||||
|
}
|
||||||
|
|
||||||
uint64_t AscendKernelRuntime::GetAvailableMemMaxSize() const {
|
uint64_t AscendKernelRuntime::GetAvailableMemMaxSize() const {
|
||||||
auto ascend_mem_manager = std::dynamic_pointer_cast<AscendMemoryManager>(mem_manager_);
|
auto ascend_mem_manager = std::dynamic_pointer_cast<AscendMemoryManager>(mem_manager_);
|
||||||
MS_EXCEPTION_IF_NULL(ascend_mem_manager);
|
MS_EXCEPTION_IF_NULL(ascend_mem_manager);
|
||||||
|
|
|
@ -58,6 +58,7 @@ class AscendKernelRuntime : public KernelRuntime {
|
||||||
uint64_t GetAvailableMemMaxSize() const override;
|
uint64_t GetAvailableMemMaxSize() const override;
|
||||||
DeviceAddressType GetTargetDeviceAddressType() const override { return DeviceAddressType::kAscend; };
|
DeviceAddressType GetTargetDeviceAddressType() const override { return DeviceAddressType::kAscend; };
|
||||||
std::shared_ptr<DeviceEvent> CreateDeviceEvent() override;
|
std::shared_ptr<DeviceEvent> CreateDeviceEvent() override;
|
||||||
|
std::shared_ptr<DeviceEvent> CreateDeviceTimeEvent() override;
|
||||||
void *compute_stream() const override { return stream_; }
|
void *compute_stream() const override { return stream_; }
|
||||||
void *communication_stream() const override { return communication_stream_; }
|
void *communication_stream() const override { return communication_stream_; }
|
||||||
|
|
||||||
|
|
|
@ -42,5 +42,19 @@ void GpuEvent::RecordEvent() {
|
||||||
need_wait_ = true;
|
need_wait_ = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void GpuEvent::SyncEvent() {
|
||||||
|
MS_EXCEPTION_IF_NULL(event_);
|
||||||
|
CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(cudaEventSynchronize(event_), "cudaEventSynchronize failed");
|
||||||
|
}
|
||||||
|
|
||||||
|
void GpuEvent::ElapsedTime(float *cost_time, DeviceEvent *other) {
|
||||||
|
MS_EXCEPTION_IF_NULL(event_);
|
||||||
|
auto gpu_event = static_cast<GpuEvent *>(other);
|
||||||
|
MS_EXCEPTION_IF_NULL(gpu_event);
|
||||||
|
MS_EXCEPTION_IF_NULL(gpu_event->event_);
|
||||||
|
CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(cudaEventElapsedTime(cost_time, event_, gpu_event->event_),
|
||||||
|
"cudaEventElapsedTime failed");
|
||||||
|
}
|
||||||
|
|
||||||
bool GpuEvent::NeedWait() { return need_wait_; }
|
bool GpuEvent::NeedWait() { return need_wait_; }
|
||||||
} // namespace mindspore::device::gpu
|
} // namespace mindspore::device::gpu
|
||||||
|
|
|
@ -29,6 +29,8 @@ class GpuEvent : public DeviceEvent {
|
||||||
void WaitEvent() override;
|
void WaitEvent() override;
|
||||||
void RecordEvent() override;
|
void RecordEvent() override;
|
||||||
bool NeedWait() override;
|
bool NeedWait() override;
|
||||||
|
void SyncEvent() override;
|
||||||
|
void ElapsedTime(float *cost_time, DeviceEvent *other) override;
|
||||||
void set_wait_stream(void *wait_stream) override { wait_stream_ = static_cast<cudaStream_t>(wait_stream); }
|
void set_wait_stream(void *wait_stream) override { wait_stream_ = static_cast<cudaStream_t>(wait_stream); }
|
||||||
void set_record_stream(void *record_stream) override { record_stream_ = static_cast<cudaStream_t>(record_stream); }
|
void set_record_stream(void *record_stream) override { record_stream_ = static_cast<cudaStream_t>(record_stream); }
|
||||||
|
|
||||||
|
|
|
@ -32,6 +32,7 @@
|
||||||
#include "utils/utils.h"
|
#include "utils/utils.h"
|
||||||
#include "frontend/parallel/context.h"
|
#include "frontend/parallel/context.h"
|
||||||
#include "debug/env_config_parser.h"
|
#include "debug/env_config_parser.h"
|
||||||
|
#include "runtime/device/pynative_profiling.h"
|
||||||
#if ((defined ENABLE_CPU) && (!defined _WIN32))
|
#if ((defined ENABLE_CPU) && (!defined _WIN32))
|
||||||
#include "ps/ps_cache/ps_cache_manager.h"
|
#include "ps/ps_cache/ps_cache_manager.h"
|
||||||
#endif
|
#endif
|
||||||
|
@ -966,6 +967,36 @@ void KernelRuntime::LaunchKernelEvent(const std::vector<std::vector<std::functio
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool KernelRuntime::LaunchKernelWithPynativeProfiling(kernel::KernelMod *kernel_mod, const std::string &op_name,
|
||||||
|
const std::vector<AddressPtr> &inputs,
|
||||||
|
const std::vector<AddressPtr> &workspace,
|
||||||
|
const std::vector<AddressPtr> &outputs, void *stream) {
|
||||||
|
MS_EXCEPTION_IF_NULL(kernel_mod);
|
||||||
|
MS_EXCEPTION_IF_NULL(stream);
|
||||||
|
float cost_time = 0;
|
||||||
|
auto start = CreateDeviceTimeEvent();
|
||||||
|
auto end = CreateDeviceTimeEvent();
|
||||||
|
MS_EXCEPTION_IF_NULL(start);
|
||||||
|
MS_EXCEPTION_IF_NULL(end);
|
||||||
|
start->set_record_stream(stream);
|
||||||
|
end->set_record_stream(stream);
|
||||||
|
start->RecordEvent();
|
||||||
|
bool ret = kernel_mod->Launch(inputs, workspace, outputs, stream);
|
||||||
|
end->RecordEvent();
|
||||||
|
start->SyncEvent();
|
||||||
|
end->SyncEvent();
|
||||||
|
start->ElapsedTime(&cost_time, end.get());
|
||||||
|
auto launch_end_time = GetTime();
|
||||||
|
auto &profiler_inst = PynativeProfiler::GetInstance();
|
||||||
|
double launch_start_time = launch_end_time - cost_time / kBasicTimeTransferUnit;
|
||||||
|
auto op_launch_start_time_end_time = std::make_pair(launch_start_time, launch_end_time);
|
||||||
|
profiler_inst.SetOpNameAndLaunchTime(std::make_pair(op_name, op_launch_start_time_end_time));
|
||||||
|
if (!ret) {
|
||||||
|
MS_LOG(EXCEPTION) << "Launch kernel failed, kernel name is : " << op_name;
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
bool KernelRuntime::LaunchKernelMod(const session::KernelGraph &graph) {
|
bool KernelRuntime::LaunchKernelMod(const session::KernelGraph &graph) {
|
||||||
const auto &kernels = graph.execution_order();
|
const auto &kernels = graph.execution_order();
|
||||||
std::vector<DynamicKernelPtr> dynamic_kernel_list;
|
std::vector<DynamicKernelPtr> dynamic_kernel_list;
|
||||||
|
@ -1020,9 +1051,19 @@ bool KernelRuntime::LaunchKernelMod(const session::KernelGraph &graph) {
|
||||||
GenLaunchArgs(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs);
|
GenLaunchArgs(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs);
|
||||||
bool ret;
|
bool ret;
|
||||||
if (AnfAlgo::IsCommunicationOp(kernel)) {
|
if (AnfAlgo::IsCommunicationOp(kernel)) {
|
||||||
ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, communication_stream_);
|
if (pynative_mode_profiling_flag_) {
|
||||||
|
ret = LaunchKernelWithPynativeProfiling(kernel_mod, kernel->fullname_with_scope(), kernel_inputs,
|
||||||
|
kernel_workspaces, kernel_outputs, communication_stream_);
|
||||||
|
} else {
|
||||||
|
ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, communication_stream_);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_);
|
if (pynative_mode_profiling_flag_) {
|
||||||
|
ret = LaunchKernelWithPynativeProfiling(kernel_mod, kernel->fullname_with_scope(), kernel_inputs,
|
||||||
|
kernel_workspaces, kernel_outputs, stream_);
|
||||||
|
} else {
|
||||||
|
ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (!ret) {
|
if (!ret) {
|
||||||
MS_LOG(ERROR) << "Launch kernel failed.";
|
MS_LOG(ERROR) << "Launch kernel failed.";
|
||||||
|
|
|
@ -98,6 +98,7 @@ class KernelRuntime {
|
||||||
virtual uint64_t GetAvailableMemMaxSize() const { return 0; }
|
virtual uint64_t GetAvailableMemMaxSize() const { return 0; }
|
||||||
void GenKernelEvents(const session::KernelGraph *graph);
|
void GenKernelEvents(const session::KernelGraph *graph);
|
||||||
virtual std::shared_ptr<DeviceEvent> CreateDeviceEvent() { return nullptr; }
|
virtual std::shared_ptr<DeviceEvent> CreateDeviceEvent() { return nullptr; }
|
||||||
|
virtual std::shared_ptr<DeviceEvent> CreateDeviceTimeEvent() { return nullptr; }
|
||||||
virtual DeviceAddressType GetTargetDeviceAddressType() const = 0;
|
virtual DeviceAddressType GetTargetDeviceAddressType() const = 0;
|
||||||
virtual void *compute_stream() const { return nullptr; }
|
virtual void *compute_stream() const { return nullptr; }
|
||||||
virtual void *communication_stream() const { return nullptr; }
|
virtual void *communication_stream() const { return nullptr; }
|
||||||
|
@ -134,6 +135,10 @@ class KernelRuntime {
|
||||||
void RunOpAssignOutputNodeMemory(const ValuePtr &pre_output_value, session::KernelGraph *graph);
|
void RunOpAssignOutputNodeMemory(const ValuePtr &pre_output_value, session::KernelGraph *graph);
|
||||||
void AssignValueNodeTensor(const ValueNodePtr &value_node, const ValuePtr &node_value, size_t output_idx);
|
void AssignValueNodeTensor(const ValueNodePtr &value_node, const ValuePtr &node_value, size_t output_idx);
|
||||||
DeviceAddressPtr PreAssignCNodeMemory(const AnfNodePtr &anf_node, size_t index);
|
DeviceAddressPtr PreAssignCNodeMemory(const AnfNodePtr &anf_node, size_t index);
|
||||||
|
bool LaunchKernelWithPynativeProfiling(kernel::KernelMod *kernel_mod, const std::string &op_name,
|
||||||
|
const std::vector<AddressPtr> &inputs,
|
||||||
|
const std::vector<AddressPtr> &workspace,
|
||||||
|
const std::vector<AddressPtr> &outputs, void *stream);
|
||||||
#if (ENABLE_CPU && !_WIN32)
|
#if (ENABLE_CPU && !_WIN32)
|
||||||
void GetFirstPSEmbeddingCache(const session::KernelGraph *graph, AnfNodePtr *const first_cache_input_index,
|
void GetFirstPSEmbeddingCache(const session::KernelGraph *graph, AnfNodePtr *const first_cache_input_index,
|
||||||
size_t *const first_cache_size);
|
size_t *const first_cache_size);
|
||||||
|
@ -143,6 +148,7 @@ class KernelRuntime {
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
uint32_t device_id_{0};
|
uint32_t device_id_{0};
|
||||||
|
bool pynative_mode_profiling_flag_{false};
|
||||||
#if !defined(_WIN32) && !defined(_WIN64)
|
#if !defined(_WIN32) && !defined(_WIN64)
|
||||||
std::shared_ptr<Debugger> debugger_;
|
std::shared_ptr<Debugger> debugger_;
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -0,0 +1,49 @@
|
||||||
|
/**
|
||||||
|
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "runtime/device/pynative_profiling.h"
|
||||||
|
#include <iostream>
|
||||||
|
#include <fstream>
|
||||||
|
#include <utility>
|
||||||
|
#include <memory>
|
||||||
|
#include <string>
|
||||||
|
#include "utils/profile.h"
|
||||||
|
#include "utils/utils.h"
|
||||||
|
|
||||||
|
namespace mindspore {
|
||||||
|
namespace device {
|
||||||
|
void PynativeProfiler::SingleOpProfilingData() {
|
||||||
|
static std::ofstream of("pynative_forward_profiling_data.csv");
|
||||||
|
of.setf(std::ios::fixed, std::ios::floatfield);
|
||||||
|
if (real_run_op_index_ == 1) {
|
||||||
|
of << "RealRunOpIndex" << ',' << "RealRunOpName" << ',' << "OpName" << ',' << "RealRunOpStartTime(s)" << ','
|
||||||
|
<< "OpDeviceStartTime(s)" << ',' << "OpDeviceEndTime(s)" << ',' << "RealRunOpEndTime(s)" << std::endl;
|
||||||
|
}
|
||||||
|
if (op_name_launch_start_time_end_time_vec_.empty()) {
|
||||||
|
of << real_run_op_index_ << ',' << real_run_op_name_ << ',' << "nopnode" << ','
|
||||||
|
<< real_run_op_start_time_end_time_.first << ',' << "nopnode" << ',' << "nopnode" << ','
|
||||||
|
<< real_run_op_start_time_end_time_.second << std::endl;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
for (const auto &i : op_name_launch_start_time_end_time_vec_) {
|
||||||
|
of << real_run_op_index_ << ',' << real_run_op_name_ << ',' << i.first << ','
|
||||||
|
<< real_run_op_start_time_end_time_.first << ',' << i.second.first << ',' << i.second.second << ','
|
||||||
|
<< real_run_op_start_time_end_time_.second << std::endl;
|
||||||
|
}
|
||||||
|
op_name_launch_start_time_end_time_vec_.clear();
|
||||||
|
}
|
||||||
|
} // namespace device
|
||||||
|
} // namespace mindspore
|
|
@ -0,0 +1,56 @@
|
||||||
|
/**
|
||||||
|
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_PYNATIVE_PROFILING_H_
|
||||||
|
#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_PYNATIVE_PROFILING_H_
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <utility>
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
|
namespace mindspore {
|
||||||
|
namespace device {
|
||||||
|
class PynativeProfiler {
|
||||||
|
public:
|
||||||
|
static PynativeProfiler &GetInstance() {
|
||||||
|
static PynativeProfiler instance;
|
||||||
|
return instance;
|
||||||
|
}
|
||||||
|
|
||||||
|
void AddRealRunOpIndex() { ++real_run_op_index_; }
|
||||||
|
void SetRealRunOpName(const std::string &name) { real_run_op_name_ = name; }
|
||||||
|
void SetRealRunOpTime(const std::pair<double, double> &start_end) { real_run_op_start_time_end_time_ = start_end; }
|
||||||
|
void SetOpNameAndLaunchTime(const std::pair<std::string, std::pair<double, double>> &name_start_end) {
|
||||||
|
op_name_launch_start_time_end_time_vec_.push_back(name_start_end);
|
||||||
|
}
|
||||||
|
void SingleOpProfilingData();
|
||||||
|
|
||||||
|
private:
|
||||||
|
PynativeProfiler() = default;
|
||||||
|
~PynativeProfiler() = default;
|
||||||
|
PynativeProfiler(const PynativeProfiler &) = delete;
|
||||||
|
PynativeProfiler &operator=(const PynativeProfiler &) = delete;
|
||||||
|
int real_run_op_index_ = 0;
|
||||||
|
std::string real_run_op_name_;
|
||||||
|
std::pair<double, double> real_run_op_start_time_end_time_;
|
||||||
|
std::vector<std::pair<std::string, std::pair<double, double>>> op_name_launch_start_time_end_time_vec_;
|
||||||
|
};
|
||||||
|
} // namespace device
|
||||||
|
} // namespace mindspore
|
||||||
|
|
||||||
|
#endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_PYNATIVE_PROFILING_H_
|
|
@ -16,6 +16,8 @@
|
||||||
|
|
||||||
#include "runtime/hardware/gpu/gpu_device_context.h"
|
#include "runtime/hardware/gpu/gpu_device_context.h"
|
||||||
#include <dlfcn.h>
|
#include <dlfcn.h>
|
||||||
|
#include <utility>
|
||||||
|
#include "runtime/device/pynative_profiling.h"
|
||||||
#include "runtime/device/gpu/kernel_info_setter.h"
|
#include "runtime/device/gpu/kernel_info_setter.h"
|
||||||
#include "runtime/device/gpu/gpu_kernel_build.h"
|
#include "runtime/device/gpu/gpu_kernel_build.h"
|
||||||
#include "runtime/device/gpu/gpu_device_address.h"
|
#include "runtime/device/gpu/gpu_device_address.h"
|
||||||
|
@ -432,6 +434,11 @@ bool GPUDeviceContext::LaunchKernelWithProfiling(const CNodePtr &kernel, const s
|
||||||
bool ret = DoLaunchKernel(kernel_mod, inputs, workspace, outputs);
|
bool ret = DoLaunchKernel(kernel_mod, inputs, workspace, outputs);
|
||||||
profiler_inst->OpDataProducerEnd();
|
profiler_inst->OpDataProducerEnd();
|
||||||
|
|
||||||
|
auto op_launch_start_end_time = profiler_inst->GetSingleOpLaunchTime();
|
||||||
|
auto &pynative_profiler = PynativeProfiler::GetInstance();
|
||||||
|
std::string op_name = kernel->fullname_with_scope();
|
||||||
|
pynative_profiler.SetOpNameAndLaunchTime(std::make_pair(op_name, op_launch_start_end_time));
|
||||||
|
|
||||||
if (profiler_inst->GetSyncEnableFlag()) {
|
if (profiler_inst->GetSyncEnableFlag()) {
|
||||||
CHECK_RET_WITH_RETURN_ERROR(SyncStream(), "Profiler SyncStream failed.");
|
CHECK_RET_WITH_RETURN_ERROR(SyncStream(), "Profiler SyncStream failed.");
|
||||||
}
|
}
|
||||||
|
|
|
@ -507,6 +507,8 @@ constexpr auto kUpdateStateRealInput = 2;
|
||||||
// index define of Load
|
// index define of Load
|
||||||
constexpr auto kLoadRealInput = 1;
|
constexpr auto kLoadRealInput = 1;
|
||||||
constexpr auto kLoadStateInput = 2;
|
constexpr auto kLoadStateInput = 2;
|
||||||
|
// time transfer unit
|
||||||
|
constexpr int kBasicTimeTransferUnit = 1000;
|
||||||
// index of input or output
|
// index of input or output
|
||||||
enum Index : size_t {
|
enum Index : size_t {
|
||||||
kIndex0 = 0,
|
kIndex0 = 0,
|
||||||
|
|
|
@ -24,6 +24,8 @@ class DeviceEvent {
|
||||||
virtual void WaitEvent() = 0;
|
virtual void WaitEvent() = 0;
|
||||||
virtual void RecordEvent() = 0;
|
virtual void RecordEvent() = 0;
|
||||||
virtual bool NeedWait() = 0;
|
virtual bool NeedWait() = 0;
|
||||||
|
virtual void SyncEvent() = 0;
|
||||||
|
virtual void ElapsedTime(float *cost_time, DeviceEvent *other) = 0;
|
||||||
virtual void set_wait_stream(void *stream) = 0;
|
virtual void set_wait_stream(void *stream) = 0;
|
||||||
virtual void set_record_stream(void *stream) = 0;
|
virtual void set_record_stream(void *stream) = 0;
|
||||||
};
|
};
|
||||||
|
|
|
@ -25,6 +25,10 @@
|
||||||
|
|
||||||
rtError_t rtEventSynchronize(rtEvent_t event) { return RT_ERROR_NONE; }
|
rtError_t rtEventSynchronize(rtEvent_t event) { return RT_ERROR_NONE; }
|
||||||
|
|
||||||
|
rtError_t rtEventCreateWithFlag(rtEvent_t *event, uint32_t flag) { return RT_ERROR_NONE; }
|
||||||
|
|
||||||
|
rtError_t rtEventElapsedTime(float *time, rtEvent_t start, rtEvent_t end) { return RT_ERROR_NONE; }
|
||||||
|
|
||||||
rtError_t rtMalloc(void **devPtr, uint64_t size, rtMemType_t type) { return RT_ERROR_NONE; }
|
rtError_t rtMalloc(void **devPtr, uint64_t size, rtMemType_t type) { return RT_ERROR_NONE; }
|
||||||
|
|
||||||
rtError_t rtMemcpy(void *dst, uint64_t destMax, const void *src, uint64_t count, rtMemcpyKind_t kind) {
|
rtError_t rtMemcpy(void *dst, uint64_t destMax, const void *src, uint64_t count, rtMemcpyKind_t kind) {
|
||||||
|
|
Loading…
Reference in New Issue