!21362 Add PyNative profiling code for Ascend and GPU back ends

Merge pull request !21362 from lvchangquan/profiling_refactor
This commit is contained in:
i-robot 2021-08-13 06:18:31 +00:00 committed by Gitee
commit 6afcd815d2
19 changed files with 261 additions and 4 deletions

View File

@ -32,6 +32,7 @@
#include "ir/tensor.h"
#include "utils/any.h"
#include "utils/utils.h"
#include "utils/profile.h"
#include "utils/ms_context.h"
#include "utils/check_convert_utils.h"
#include "utils/context/context_extends.h"
@ -67,6 +68,7 @@
#include "debug/anf_ir_dump.h"
#include "runtime/hardware/device_context_manager.h"
#include "runtime/device/pynative_profiling.h"
using mindspore::tensor::TensorPy;
@ -702,6 +704,9 @@ py::object GetDstType(const TypeId &type_id) {
} // namespace
// Entry point for executing a single op in PyNative mode. Wraps the real
// execution with host-side timing and reports the data to PynativeProfiler.
py::object RealRunOp(const py::args &args) {
  // Timestamp taken before any dispatch work so the profiled window covers
  // the whole RealRunOp call.
  auto real_run_op_start = GetTime();
  auto &profiler_inst = device::PynativeProfiler::GetInstance();
  profiler_inst.AddRealRunOpIndex();
  CheckPyNativeContext();
  auto executor = PynativeExecutor::GetInstance();
  MS_EXCEPTION_IF_NULL(executor);
@ -709,6 +714,10 @@ py::object RealRunOp(const py::args &args) {
  MS_EXCEPTION_IF_NULL(op_exec_info);
  py::object ret = py::none();
  PynativeExecutorTry(executor->forward_executor()->RunOpS, &ret, op_exec_info);
  // Record the host-side window and op name, then flush one profiling record
  // for this op (writes a CSV row; see PynativeProfiler::SingleOpProfilingData).
  auto real_run_op_end = GetTime();
  profiler_inst.SetRealRunOpName(op_exec_info->op_name);
  profiler_inst.SetRealRunOpTime(std::make_pair(real_run_op_start, real_run_op_end));
  profiler_inst.SingleOpProfilingData();
  return ret;
}

View File

@ -25,6 +25,7 @@
#include "pybind_api/api_register.h"
#include "utils/log_adapter.h"
#include "utils/utils.h"
#include "utils/profile.h"
#include "utils/ms_context.h"
namespace mindspore {
@ -446,6 +447,12 @@ void GPUProfiler::OpDataProducerBegin(const std::string op_name, void *stream) {
}
}
// Record the (start, end) host-clock window of the last kernel launch.
// op_time_elapsed is the measured device duration; dividing by kTimeUnit twice
// converts it to seconds (presumably us -> s given kTimeUnit == 1000 — confirm),
// and the start is back-computed from the end-of-launch timestamp.
void GPUProfiler::SingleOpLaunchTimeProcess(float op_time_elapsed) {
  const auto end_ts = GetTime();
  const double start_ts = end_ts - op_time_elapsed / kTimeUnit / kTimeUnit;
  SetSingleOpLaunchTime(std::make_pair(start_ts, end_ts));
}
void GPUProfiler::OpDataProducerEnd() {
float op_time_elapsed = 0;
if (sync_enable_flag_) {
@ -459,9 +466,11 @@ void GPUProfiler::OpDataProducerEnd() {
CHECK_CUDA_RET_WITH_ERROR(cudaEventDestroy(op_event_stop_), "cudaEventDestroy op event stop failed");
op_time_elapsed = op_time_elapsed * kTimeUnit;
op_host_time_stop_ = GetHostTimeStamp();
SingleOpLaunchTimeProcess(op_time_elapsed);
} else {
op_host_time_stop_ = GetHostTimeStamp();
op_time_elapsed = (op_host_time_stop_ - op_host_time_start_) / kTimeUnit;
SingleOpLaunchTimeProcess(op_time_elapsed);
}
MS_LOG(DEBUG) << "Host Time Elapsed(us)," << op_name_ << "," << op_time_elapsed;
Profiler::SetRunTimeData(op_name_, op_time_elapsed);

View File

@ -135,6 +135,7 @@ class GPUProfiler : public Profiler {
std::string ProfileDataPath() const { return profile_data_path_; }
private:
void SingleOpLaunchTimeProcess(float op_time_elapsed);
void OpsParser();
void EventLog(const Event &event);
void ClearInst() override;

View File

@ -79,6 +79,10 @@ class Profiler {
bool GetEnableFlag() const { return enable_flag_; }
std::string ProfileDataPath() const { return profile_data_path_; }
void RecordOneStepStartEndInfo(std::string op_name);
std::pair<double, double> GetSingleOpLaunchTime() { return single_op_launch_start_time_end_time_; }
void SetSingleOpLaunchTime(const std::pair<double, double> &launch_start_end) {
single_op_launch_start_time_end_time_ = launch_start_end;
}
protected:
void SetRunTimeData(const std::string &op_name, const float time_elapsed);
@ -86,6 +90,7 @@ class Profiler {
uint64_t GetHostMonoTimeStamp() const;
virtual void SaveProfileData() = 0;
virtual void ClearInst() = 0;
std::pair<double, double> single_op_launch_start_time_end_time_;
bool enable_flag_ = false;
std::string profile_data_path_;
std::unordered_map<std::string, OpInfo> op_info_map_;

View File

@ -1,7 +1,7 @@
# Source files compiled into the common device runtime library;
# pynative_profiling.cc is included for PyNative single-op profiling.
file(GLOB_RECURSE DEVICE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "common/*.cc"
    "kernel_info.cc" "executor/dynamic_kernel.cc" "executor/executor_callback.cc" "kernel_runtime.cc"
    "memory_manager.cc" "kernel_runtime_manager.cc" "convert_tensor_utils.cc"
    "bucket.cc" "launch_kernel.cc" "launch_mul.cc" "pynative_profiling.cc"
)
if("${ENABLE_HIDDEN}" STREQUAL "OFF")

View File

@ -29,6 +29,14 @@ AscendEvent::AscendEvent() {
}
}
// Create a timeline-capable runtime event (RT_EVENT_TIME_LINE) so that
// rtEventElapsedTime can later measure the interval between two events.
AscendTimeEvent::AscendTimeEvent() {
  auto ret = rtEventCreateWithFlag(&event_, RT_EVENT_TIME_LINE);
  if (ret != RT_ERROR_NONE) {
    // Name the actual failing API in the log (was "rtEventCreate", which is a
    // different call) and leave event_ null so users can detect the failure.
    MS_LOG(ERROR) << "rtEventCreateWithFlag failed, ret:" << ret;
    event_ = nullptr;
  }
}
AscendEvent::~AscendEvent() {
auto ret = rtEventDestroy(event_);
if (ret != RT_ERROR_NONE) {
@ -60,5 +68,24 @@ void AscendEvent::WaitEvent() {
need_wait_ = false;
}
// Block the host thread until the recorded event completes on the device.
void AscendEvent::SyncEvent() {
  MS_EXCEPTION_IF_NULL(event_);
  const auto status = rtEventSynchronize(event_);
  if (status == RT_ERROR_NONE) {
    return;
  }
  MS_LOG(EXCEPTION) << "rtEventSynchronize failed, ret:" << status;
}
// Write to *cost_time the device-side time elapsed between this event and
// `other` (both must already have been recorded on a stream).
void AscendEvent::ElapsedTime(float *cost_time, DeviceEvent *other) {
  MS_EXCEPTION_IF_NULL(event_);
  auto *end_event = static_cast<AscendEvent *>(other);
  MS_EXCEPTION_IF_NULL(end_event);
  MS_EXCEPTION_IF_NULL(end_event->event_);
  const auto status = rtEventElapsedTime(cost_time, event_, end_event->event_);
  if (status != RT_ERROR_NONE) {
    MS_LOG(EXCEPTION) << "rtEventElapsedTime failed, ret:" << status;
  }
}
// True while a recorded event has not yet been waited on.
bool AscendEvent::NeedWait() {
  return need_wait_;
}
} // namespace mindspore::device::ascend

View File

@ -19,6 +19,7 @@
#include "runtime/base.h"
#include "ir/device_event.h"
namespace mindspore::device::ascend {
class AscendEvent : public DeviceEvent {
public:
@ -28,14 +29,22 @@ class AscendEvent : public DeviceEvent {
void WaitEvent() override;
void RecordEvent() override;
bool NeedWait() override;
void SyncEvent() override;
void ElapsedTime(float *cost_time, DeviceEvent *other) override;
void set_wait_stream(rtStream_t wait_stream) override { wait_stream_ = wait_stream; }
void set_record_stream(rtStream_t record_stream) override { record_stream_ = record_stream; }
private:
protected:
rtEvent_t event_{nullptr};
rtStream_t wait_stream_{nullptr};
rtStream_t record_stream_{nullptr};
bool need_wait_{false};
};
// AscendEvent variant whose constructor creates the underlying runtime event
// with RT_EVENT_TIME_LINE, enabling elapsed-time queries between two events
// (used by PyNative kernel-launch profiling).
class AscendTimeEvent : public AscendEvent {
 public:
  AscendTimeEvent();
  ~AscendTimeEvent() override = default;
};
} // namespace mindspore::device::ascend
#endif // MINDSPORE_ASCEND_EVENT_H

View File

@ -282,6 +282,13 @@ void AscendKernelRuntime::PreInit() {
}
bool AscendKernelRuntime::Init() {
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto execution_mode = ms_context->get_param<int>(MS_CTX_EXECUTION_MODE);
auto profiling_flag = ms_context->get_param<bool>(MS_CTX_ENABLE_PROFILING);
if (execution_mode == kPynativeMode && profiling_flag) {
pynative_mode_profiling_flag_ = true;
}
if (initialized_) {
SetCurrentContext();
return true;
@ -946,6 +953,12 @@ std::shared_ptr<DeviceEvent> AscendKernelRuntime::CreateDeviceEvent() {
return ascend_event;
}
std::shared_ptr<DeviceEvent> AscendKernelRuntime::CreateDeviceTimeEvent() {
auto ascend_time_event = std::make_shared<AscendTimeEvent>();
MS_EXCEPTION_IF_NULL(ascend_time_event);
return ascend_time_event;
}
uint64_t AscendKernelRuntime::GetAvailableMemMaxSize() const {
auto ascend_mem_manager = std::dynamic_pointer_cast<AscendMemoryManager>(mem_manager_);
MS_EXCEPTION_IF_NULL(ascend_mem_manager);

View File

@ -58,6 +58,7 @@ class AscendKernelRuntime : public KernelRuntime {
uint64_t GetAvailableMemMaxSize() const override;
DeviceAddressType GetTargetDeviceAddressType() const override { return DeviceAddressType::kAscend; };
std::shared_ptr<DeviceEvent> CreateDeviceEvent() override;
std::shared_ptr<DeviceEvent> CreateDeviceTimeEvent() override;
void *compute_stream() const override { return stream_; }
void *communication_stream() const override { return communication_stream_; }

View File

@ -42,5 +42,19 @@ void GpuEvent::RecordEvent() {
need_wait_ = true;
}
// Block the host thread until all work captured by event_ has completed.
void GpuEvent::SyncEvent() {
  MS_EXCEPTION_IF_NULL(event_);
  CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(cudaEventSynchronize(event_), "cudaEventSynchronize failed");
}
// Write to *cost_time the elapsed time (milliseconds, per cudaEventElapsedTime)
// between this event and `other`; both events must have been recorded.
void GpuEvent::ElapsedTime(float *cost_time, DeviceEvent *other) {
  MS_EXCEPTION_IF_NULL(event_);
  auto *end_event = static_cast<GpuEvent *>(other);
  MS_EXCEPTION_IF_NULL(end_event);
  MS_EXCEPTION_IF_NULL(end_event->event_);
  CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(cudaEventElapsedTime(cost_time, event_, end_event->event_),
                                     "cudaEventElapsedTime failed");
}
// True while a recorded event has not yet been waited on.
bool GpuEvent::NeedWait() {
  return need_wait_;
}
} // namespace mindspore::device::gpu

View File

@ -29,6 +29,8 @@ class GpuEvent : public DeviceEvent {
void WaitEvent() override;
void RecordEvent() override;
bool NeedWait() override;
void SyncEvent() override;
void ElapsedTime(float *cost_time, DeviceEvent *other) override;
void set_wait_stream(void *wait_stream) override { wait_stream_ = static_cast<cudaStream_t>(wait_stream); }
void set_record_stream(void *record_stream) override { record_stream_ = static_cast<cudaStream_t>(record_stream); }

View File

@ -32,6 +32,7 @@
#include "utils/utils.h"
#include "frontend/parallel/context.h"
#include "debug/env_config_parser.h"
#include "runtime/device/pynative_profiling.h"
#if ((defined ENABLE_CPU) && (!defined _WIN32))
#include "ps/ps_cache/ps_cache_manager.h"
#endif
@ -966,6 +967,36 @@ void KernelRuntime::LaunchKernelEvent(const std::vector<std::vector<std::functio
}
}
// Launch a kernel bracketed by two device time events so the actual device-side
// execution window can be reported to the pynative profiler.
// kernel_mod/stream must be non-null; throws if the launch fails.
bool KernelRuntime::LaunchKernelWithPynativeProfiling(kernel::KernelMod *kernel_mod, const std::string &op_name,
                                                      const std::vector<AddressPtr> &inputs,
                                                      const std::vector<AddressPtr> &workspace,
                                                      const std::vector<AddressPtr> &outputs, void *stream) {
  MS_EXCEPTION_IF_NULL(kernel_mod);
  MS_EXCEPTION_IF_NULL(stream);
  auto start = CreateDeviceTimeEvent();
  auto end = CreateDeviceTimeEvent();
  MS_EXCEPTION_IF_NULL(start);
  MS_EXCEPTION_IF_NULL(end);
  start->set_record_stream(stream);
  end->set_record_stream(stream);
  start->RecordEvent();
  bool ret = kernel_mod->Launch(inputs, workspace, outputs, stream);
  end->RecordEvent();
  if (!ret) {
    // Fail fast: don't block on event synchronization for a launch that has
    // already reported failure (syncing a bad launch may hang or mask the
    // original error).
    MS_LOG(EXCEPTION) << "Launch kernel failed, kernel name is : " << op_name;
  }
  start->SyncEvent();
  end->SyncEvent();
  float cost_time = 0;
  start->ElapsedTime(&cost_time, end.get());
  // cost_time is the device interval; convert with kBasicTimeTransferUnit and
  // anchor the window to the host timestamp taken right after the sync.
  auto launch_end_time = GetTime();
  double launch_start_time = launch_end_time - cost_time / kBasicTimeTransferUnit;
  auto op_launch_start_time_end_time = std::make_pair(launch_start_time, launch_end_time);
  auto &profiler_inst = PynativeProfiler::GetInstance();
  profiler_inst.SetOpNameAndLaunchTime(std::make_pair(op_name, op_launch_start_time_end_time));
  return ret;
}
bool KernelRuntime::LaunchKernelMod(const session::KernelGraph &graph) {
const auto &kernels = graph.execution_order();
std::vector<DynamicKernelPtr> dynamic_kernel_list;
@ -1020,9 +1051,19 @@ bool KernelRuntime::LaunchKernelMod(const session::KernelGraph &graph) {
GenLaunchArgs(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs);
bool ret;
if (AnfAlgo::IsCommunicationOp(kernel)) {
ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, communication_stream_);
if (pynative_mode_profiling_flag_) {
ret = LaunchKernelWithPynativeProfiling(kernel_mod, kernel->fullname_with_scope(), kernel_inputs,
kernel_workspaces, kernel_outputs, communication_stream_);
} else {
ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, communication_stream_);
}
} else {
ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_);
if (pynative_mode_profiling_flag_) {
ret = LaunchKernelWithPynativeProfiling(kernel_mod, kernel->fullname_with_scope(), kernel_inputs,
kernel_workspaces, kernel_outputs, stream_);
} else {
ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_);
}
}
if (!ret) {
MS_LOG(ERROR) << "Launch kernel failed.";

View File

@ -98,6 +98,7 @@ class KernelRuntime {
virtual uint64_t GetAvailableMemMaxSize() const { return 0; }
void GenKernelEvents(const session::KernelGraph *graph);
virtual std::shared_ptr<DeviceEvent> CreateDeviceEvent() { return nullptr; }
virtual std::shared_ptr<DeviceEvent> CreateDeviceTimeEvent() { return nullptr; }
virtual DeviceAddressType GetTargetDeviceAddressType() const = 0;
virtual void *compute_stream() const { return nullptr; }
virtual void *communication_stream() const { return nullptr; }
@ -134,6 +135,10 @@ class KernelRuntime {
void RunOpAssignOutputNodeMemory(const ValuePtr &pre_output_value, session::KernelGraph *graph);
void AssignValueNodeTensor(const ValueNodePtr &value_node, const ValuePtr &node_value, size_t output_idx);
DeviceAddressPtr PreAssignCNodeMemory(const AnfNodePtr &anf_node, size_t index);
bool LaunchKernelWithPynativeProfiling(kernel::KernelMod *kernel_mod, const std::string &op_name,
const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream);
#if (ENABLE_CPU && !_WIN32)
void GetFirstPSEmbeddingCache(const session::KernelGraph *graph, AnfNodePtr *const first_cache_input_index,
size_t *const first_cache_size);
@ -143,6 +148,7 @@ class KernelRuntime {
protected:
uint32_t device_id_{0};
bool pynative_mode_profiling_flag_{false};
#if !defined(_WIN32) && !defined(_WIN64)
std::shared_ptr<Debugger> debugger_;
#endif

View File

@ -0,0 +1,49 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "runtime/device/pynative_profiling.h"
#include <iostream>
#include <fstream>
#include <utility>
#include <memory>
#include <string>
#include "utils/profile.h"
#include "utils/utils.h"
namespace mindspore {
namespace device {
// Append the profiling data of the current RealRunOp call to a CSV file:
// one row per launched kernel, or a single "nopnode" row when the op did not
// launch any kernel. The file is opened once per process (static ofstream).
void PynativeProfiler::SingleOpProfilingData() {
  static std::ofstream of("pynative_forward_profiling_data.csv");
  if (!of.is_open()) {
    // Opening can fail (e.g. read-only working directory); skip profiling
    // output rather than writing to a bad stream on the op execution path.
    return;
  }
  of.setf(std::ios::fixed, std::ios::floatfield);
  if (real_run_op_index_ == 1) {
    // First op of the process: emit the CSV header once.
    of << "RealRunOpIndex" << ',' << "RealRunOpName" << ',' << "OpName" << ',' << "RealRunOpStartTime(s)" << ','
       << "OpDeviceStartTime(s)" << ',' << "OpDeviceEndTime(s)" << ',' << "RealRunOpEndTime(s)" << std::endl;
  }
  if (op_name_launch_start_time_end_time_vec_.empty()) {
    // No kernel launch was recorded for this op; mark device columns "nopnode".
    of << real_run_op_index_ << ',' << real_run_op_name_ << ',' << "nopnode" << ','
       << real_run_op_start_time_end_time_.first << ',' << "nopnode" << ',' << "nopnode" << ','
       << real_run_op_start_time_end_time_.second << std::endl;
    return;
  }
  for (const auto &name_and_window : op_name_launch_start_time_end_time_vec_) {
    of << real_run_op_index_ << ',' << real_run_op_name_ << ',' << name_and_window.first << ','
       << real_run_op_start_time_end_time_.first << ',' << name_and_window.second.first << ','
       << name_and_window.second.second << ',' << real_run_op_start_time_end_time_.second << std::endl;
  }
  // Launch records are per-op; clear them for the next RealRunOp call.
  op_name_launch_start_time_end_time_vec_.clear();
}
} // namespace device
} // namespace mindspore

View File

@ -0,0 +1,56 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_PYNATIVE_PROFILING_H_
#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_PYNATIVE_PROFILING_H_
#include <memory>
#include <string>
#include <vector>
#include <utility>
#include <unordered_map>
namespace mindspore {
namespace device {
// Process-wide collector for PyNative single-op profiling data.
// RealRunOp records the per-op host-side window and op name; kernel launch
// paths append per-kernel device windows; SingleOpProfilingData flushes one
// CSV row per launched kernel (see pynative_profiling.cc).
// NOTE(review): not synchronized — appears to assume single-threaded pynative
// dispatch; confirm before calling from multiple threads.
class PynativeProfiler {
 public:
  // Meyers-singleton accessor.
  static PynativeProfiler &GetInstance() {
    static PynativeProfiler instance;
    return instance;
  }
  // Advance the 1-based index of the current RealRunOp call (the CSV header
  // is written when this index is 1).
  void AddRealRunOpIndex() { ++real_run_op_index_; }
  void SetRealRunOpName(const std::string &name) { real_run_op_name_ = name; }
  // Host-side (start, end) timestamps of the whole RealRunOp call.
  void SetRealRunOpTime(const std::pair<double, double> &start_end) { real_run_op_start_time_end_time_ = start_end; }
  // Append one launched kernel: (op name, (launch start, launch end)).
  void SetOpNameAndLaunchTime(const std::pair<std::string, std::pair<double, double>> &name_start_end) {
    op_name_launch_start_time_end_time_vec_.push_back(name_start_end);
  }
  // Write the accumulated data of the current op to the CSV file and clear
  // the per-op kernel list.
  void SingleOpProfilingData();

 private:
  PynativeProfiler() = default;
  ~PynativeProfiler() = default;
  PynativeProfiler(const PynativeProfiler &) = delete;
  PynativeProfiler &operator=(const PynativeProfiler &) = delete;
  int real_run_op_index_ = 0;
  std::string real_run_op_name_;
  std::pair<double, double> real_run_op_start_time_end_time_;
  std::vector<std::pair<std::string, std::pair<double, double>>> op_name_launch_start_time_end_time_vec_;
};
} // namespace device
} // namespace mindspore
#endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_PYNATIVE_PROFILING_H_

View File

@ -16,6 +16,8 @@
#include "runtime/hardware/gpu/gpu_device_context.h"
#include <dlfcn.h>
#include <utility>
#include "runtime/device/pynative_profiling.h"
#include "runtime/device/gpu/kernel_info_setter.h"
#include "runtime/device/gpu/gpu_kernel_build.h"
#include "runtime/device/gpu/gpu_device_address.h"
@ -432,6 +434,11 @@ bool GPUDeviceContext::LaunchKernelWithProfiling(const CNodePtr &kernel, const s
bool ret = DoLaunchKernel(kernel_mod, inputs, workspace, outputs);
profiler_inst->OpDataProducerEnd();
auto op_launch_start_end_time = profiler_inst->GetSingleOpLaunchTime();
auto &pynative_profiler = PynativeProfiler::GetInstance();
std::string op_name = kernel->fullname_with_scope();
pynative_profiler.SetOpNameAndLaunchTime(std::make_pair(op_name, op_launch_start_end_time));
if (profiler_inst->GetSyncEnableFlag()) {
CHECK_RET_WITH_RETURN_ERROR(SyncStream(), "Profiler SyncStream failed.");
}

View File

@ -507,6 +507,8 @@ constexpr auto kUpdateStateRealInput = 2;
// index define of Load
constexpr auto kLoadRealInput = 1;
constexpr auto kLoadStateInput = 2;
// time transfer unit
constexpr int kBasicTimeTransferUnit = 1000;
// index of input or output
enum Index : size_t {
kIndex0 = 0,

View File

@ -24,6 +24,8 @@ class DeviceEvent {
virtual void WaitEvent() = 0;
virtual void RecordEvent() = 0;
virtual bool NeedWait() = 0;
virtual void SyncEvent() = 0;
virtual void ElapsedTime(float *cost_time, DeviceEvent *other) = 0;
virtual void set_wait_stream(void *stream) = 0;
virtual void set_record_stream(void *stream) = 0;
};

View File

@ -25,6 +25,10 @@
// Stub implementations of Ascend runtime APIs for builds/tests without the
// real runtime: every call is a no-op that reports success.
rtError_t rtEventSynchronize(rtEvent_t event) { return RT_ERROR_NONE; }
rtError_t rtEventCreateWithFlag(rtEvent_t *event, uint32_t flag) { return RT_ERROR_NONE; }
rtError_t rtEventElapsedTime(float *time, rtEvent_t start, rtEvent_t end) { return RT_ERROR_NONE; }
rtError_t rtMalloc(void **devPtr, uint64_t size, rtMemType_t type) { return RT_ERROR_NONE; }
rtError_t rtMemcpy(void *dst, uint64_t destMax, const void *src, uint64_t count, rtMemcpyKind_t kind) {