diff --git a/mindspore/ccsrc/profiler/device/ascend/ascend_profiling.cc b/mindspore/ccsrc/profiler/device/ascend/ascend_profiling.cc index 913c7022588..62740c2e93b 100644 --- a/mindspore/ccsrc/profiler/device/ascend/ascend_profiling.cc +++ b/mindspore/ccsrc/profiler/device/ascend/ascend_profiling.cc @@ -29,26 +29,6 @@ const int kIndent = 8; AscendProfiler::AscendProfiler() : counter_(0) { Reset(); } -void AscendProfiler::RecordEvent(EventType event_type, const char *fmt, ...) { - va_list args; - va_start(args, fmt); - - char buf[kEventDescMax]; - if (vsnprintf_s(buf, kEventDescMax, kEventDescMax - 1, fmt, args) == -1) { - MS_LOG(ERROR) << "format failed:" << fmt; - va_end(args); - return; - } - - va_end(args); - std::string event = buf; - auto index = counter_++; - auto &evt = events_[index]; - evt.timestamp = std::chrono::system_clock::now(); - evt.desc = std::move(event); - evt.event_type = event_type; -} - void AscendProfiler::Dump(std::ostream &output_stream) { MS_LOG(INFO) << "start dump async profiling info"; if (events_.empty()) { @@ -60,7 +40,7 @@ void AscendProfiler::Dump(std::ostream &output_stream) { std::vector prev_timestamps; prev_timestamps.resize(kMaxEventTypes, start); - for (int i = 0; i < counter_; ++i) { + for (uint32_t i = 0; i < counter_; ++i) { auto &evt = events_[i]; auto elapsed = std::chrono::duration_cast(evt.timestamp - start).count(); auto &prev_ts = prev_timestamps[evt.event_type]; diff --git a/mindspore/ccsrc/profiler/device/ascend/ascend_profiling.h b/mindspore/ccsrc/profiler/device/ascend/ascend_profiling.h index 6489e73d9cd..30e3f2beebe 100644 --- a/mindspore/ccsrc/profiler/device/ascend/ascend_profiling.h +++ b/mindspore/ccsrc/profiler/device/ascend/ascend_profiling.h @@ -43,15 +43,13 @@ class AscendProfiler { return instance; } - void RecordEvent(EventType event_type, const char *fmt, ...); - void Reset(); void Dump(std::ostream &os); private: std::vector events_; - std::atomic_int counter_; + std::atomic_uint32_t counter_; }; } // namespace ascend } // namespace profiler diff --git a/mindspore/ccsrc/profiler/device/ascend/rt_callback_manager.cc b/mindspore/ccsrc/profiler/device/ascend/rt_callback_manager.cc index 9fdbeaf9344..09d0e743027 100644 --- a/mindspore/ccsrc/profiler/device/ascend/rt_callback_manager.cc +++ b/mindspore/ccsrc/profiler/device/ascend/rt_callback_manager.cc @@ -23,17 +23,6 @@ namespace profiler { namespace ascend { CallbackManager::CallbackManager(rtStream_t stream) : stream_(stream) {} -Status CallbackManager::Init() { - MS_LOG(INFO) << "CallbackManager init, Start to async process event"; - ret_future_ = std::async([&] { return CallbackProcess(); }); - if (!ret_future_.valid()) { - MS_LOG(ERROR) << "Failed to init callback manager."; - return kFail; - } - - return kSuccess; -} - Status CallbackManager::CallbackProcess() { std::pair> entry; while (true) { @@ -50,15 +39,15 @@ Status CallbackManager::CallbackProcess() { auto rt_err = rtEventSynchronize(event); if (rt_err != RT_ERROR_NONE) { MS_LOG(ERROR) << "rtEventSynchronize failed. ret:" << rt_err; - auto ret = rtEventDestroy(event); - if (ret != RT_ERROR_NONE) { + rt_err = rtEventDestroy(event); + if (rt_err != RT_ERROR_NONE) { MS_LOG(ERROR) << "rtEventDestroy failed"; } return kFail; } - auto ret = rtEventDestroy(event); - if (ret != RT_ERROR_NONE) { + rt_err = rtEventDestroy(event); + if (rt_err != RT_ERROR_NONE) { MS_LOG(ERROR) << "rtEventDestroy failed"; } @@ -120,7 +109,7 @@ void CallbackManager::RtCallbackFunc(const void *data) { } Status CallbackManager::RegisterCallback(const std::function &callback) { - auto func = std::unique_ptr>(new (std::nothrow) std::function(callback)); + auto func = std::make_unique>(callback); if (func == nullptr) { MS_LOG(ERROR) << "callback is nullptr"; return kInvalidParam; diff --git a/mindspore/ccsrc/profiler/device/ascend/rt_callback_manager.h b/mindspore/ccsrc/profiler/device/ascend/rt_callback_manager.h index 9d87f261602..4ea8247faa2 100644 --- a/mindspore/ccsrc/profiler/device/ascend/rt_callback_manager.h +++ b/mindspore/ccsrc/profiler/device/ascend/rt_callback_manager.h @@ -40,8 +40,6 @@ class CallbackManager { ~CallbackManager() = default; - Status Init(); - Status Destroy(); Status RegisterCallback(rtCallback_t callback, const void *user_data); diff --git a/mindspore/ccsrc/profiler/device/common/memory_profiling.cc b/mindspore/ccsrc/profiler/device/common/memory_profiling.cc index a538083633d..b1404988ef3 100644 --- a/mindspore/ccsrc/profiler/device/common/memory_profiling.cc +++ b/mindspore/ccsrc/profiler/device/common/memory_profiling.cc @@ -24,7 +24,6 @@ namespace mindspore { namespace profiler { - constexpr char kOutputPath[] = "output"; std::shared_ptr MemoryProfiling::AddGraphMemoryNode(uint32_t graph_id) { @@ -33,7 +32,7 @@ std::shared_ptr MemoryProfiling::AddGraphMemoryNode(uint32_t graph_ return node; } -std::shared_ptr MemoryProfiling::GetGraphMemoryNode(uint32_t graph_id) { +std::shared_ptr MemoryProfiling::GetGraphMemoryNode(uint32_t graph_id) const { auto node = graph_memory_.find(graph_id); if (node != graph_memory_.end()) { return node->second; diff --git a/mindspore/ccsrc/profiler/device/common/memory_profiling.h b/mindspore/ccsrc/profiler/device/common/memory_profiling.h index aa3642ce4ba..18924b005e4 100644 --- a/mindspore/ccsrc/profiler/device/common/memory_profiling.h +++ b/mindspore/ccsrc/profiler/device/common/memory_profiling.h @@ -26,7 +26,6 @@ namespace mindspore { namespace profiler { - class NodeMemory { public: NodeMemory() : node_name_(""), node_id_(0) {} @@ -107,9 +106,8 @@ class MemoryProfiling { return instance; } - MemoryProto &GetMemProto() { return memory_proto_; } std::shared_ptr AddGraphMemoryNode(uint32_t graph_id); - std::shared_ptr GetGraphMemoryNode(uint32_t graph_id); + std::shared_ptr GetGraphMemoryNode(uint32_t graph_id) const; void SetDeviceMemSize(uint64_t size) { device_mem_size_ = size; } void MemoryToPB(); void SaveMemoryProfiling(); diff --git a/mindspore/ccsrc/profiler/device/common/memory_profiling.proto b/mindspore/ccsrc/profiler/device/common/memory_profiling.proto index eb596e62c85..e382f9d368a 100644 --- a/mindspore/ccsrc/profiler/device/common/memory_profiling.proto +++ b/mindspore/ccsrc/profiler/device/common/memory_profiling.proto @@ -20,7 +20,7 @@ package mindspore.profiler; message MemoryProto { repeated GraphMemProto graph_mem = 1; // memory usage of multiple graphs - int64 total_mem = 2; // total allocated device memory + uint64 total_mem = 2; // total allocated device memory } message GraphMemProto { @@ -34,17 +34,17 @@ message GraphMemProto { message NodeMemProto { string node_name = 1; // node name - int64 node_id = 2; // node id with respect to the execution order - repeated int64 input_tensor_id = 3; // input tensor id - repeated int64 output_tensor_id = 4; // output tensor id - repeated int64 workspace_tensor_id = 5; // workspace tensor id + uint64 node_id = 2; // node id with respect to the execution order + repeated uint64 input_tensor_id = 3; // input tensor id + repeated uint64 output_tensor_id = 4; // output tensor id + repeated uint64 workspace_tensor_id = 5; // workspace tensor id } message TensorMemProto { - int64 tensor_id = 1; // tensor id - int64 size = 2; // aligned tensor size + uint64 tensor_id = 1; // tensor id + uint64 size = 2; // aligned tensor size string type = 3; // tensor type, e.g. Common, OutputOnly - int64 life_start = 4; // the exe node id at which tensor memory allocated - int64 life_end = 5; // the exe node id at which tensor memory deallocated + uint64 life_start = 4; // the exe node id at which tensor memory allocated + uint64 life_end = 5; // the exe node id at which tensor memory deallocated string life_long = 6; // see LifeLongType enum } diff --git a/mindspore/ccsrc/profiler/device/cpu/cpu_data_saver.cc b/mindspore/ccsrc/profiler/device/cpu/cpu_data_saver.cc index d703095f760..3a9392918c4 100644 --- a/mindspore/ccsrc/profiler/device/cpu/cpu_data_saver.cc +++ b/mindspore/ccsrc/profiler/device/cpu/cpu_data_saver.cc @@ -24,7 +24,7 @@ namespace mindspore { namespace profiler { namespace cpu { -void CpuDataSaver::WriteFile(std::string out_path_dir) { +void CpuDataSaver::WriteFile(const std::string out_path_dir) { if (op_detail_infos_.empty() || op_type_infos_.empty()) { MS_LOG(INFO) << "No cpu operation detail infos to write."; return; diff --git a/mindspore/ccsrc/profiler/device/cpu/cpu_data_saver.h b/mindspore/ccsrc/profiler/device/cpu/cpu_data_saver.h index e549a762134..d9442f91589 100644 --- a/mindspore/ccsrc/profiler/device/cpu/cpu_data_saver.h +++ b/mindspore/ccsrc/profiler/device/cpu/cpu_data_saver.h @@ -37,7 +37,7 @@ class CpuDataSaver : public DataSaver { CpuDataSaver &operator=(const CpuDataSaver &) = delete; - void WriteFile(std::string out_path); + void WriteFile(const std::string out_path); }; } // namespace cpu } // namespace profiler diff --git a/mindspore/ccsrc/profiler/device/cpu/cpu_profiling.cc b/mindspore/ccsrc/profiler/device/cpu/cpu_profiling.cc index 2e4fec2ce76..5b268ea5653 100644 --- a/mindspore/ccsrc/profiler/device/cpu/cpu_profiling.cc +++ b/mindspore/ccsrc/profiler/device/cpu/cpu_profiling.cc @@ -16,9 +16,9 @@ #include "profiler/device/cpu/cpu_profiling.h" -#include #include #include +#include #include "profiler/device/cpu/cpu_data_saver.h" #include "pybind_api/api_register.h" #include "utils/log_adapter.h" @@ -27,8 +27,7 @@ namespace mindspore { namespace profiler { namespace cpu { -std::shared_ptr CPUProfiler::profiler_inst_ = - std::shared_ptr(new (std::nothrow) CPUProfiler()); +std::shared_ptr CPUProfiler::profiler_inst_ = std::make_shared(); std::shared_ptr &CPUProfiler::GetInstance() { return profiler_inst_; } diff --git a/mindspore/ccsrc/profiler/device/cpu/cpu_profiling.h b/mindspore/ccsrc/profiler/device/cpu/cpu_profiling.h index f1a12ce7a7b..d647150eace 100644 --- a/mindspore/ccsrc/profiler/device/cpu/cpu_profiling.h +++ b/mindspore/ccsrc/profiler/device/cpu/cpu_profiling.h @@ -33,6 +33,7 @@ const float kNanosecondToMillisecond = 1000000; class CPUProfiler : public Profiler { public: static std::shared_ptr &GetInstance(); + CPUProfiler() = default; ~CPUProfiler() = default; CPUProfiler(const CPUProfiler &) = delete; CPUProfiler &operator=(const CPUProfiler &) = delete; @@ -44,7 +45,6 @@ class CPUProfiler : public Profiler { void OpDataProducerEnd() override; private: - CPUProfiler() = default; void SetRunTimeData(const std::string &op_name, const uint32_t pid); void SaveProfileData() override; void ClearInst() override; diff --git a/mindspore/ccsrc/profiler/device/data_saver.cc b/mindspore/ccsrc/profiler/device/data_saver.cc index 8987038d655..846c0a438f7 100644 --- a/mindspore/ccsrc/profiler/device/data_saver.cc +++ b/mindspore/ccsrc/profiler/device/data_saver.cc @@ -23,7 +23,7 @@ namespace mindspore { namespace profiler { -OpDetailInfo::OpDetailInfo(std::shared_ptr op_info, float proportion) +OpDetailInfo::OpDetailInfo(const std::shared_ptr op_info, float proportion) : op_info_(op_info), proportion_(proportion) { // op_full_name is like 'xxx/xxx/{op_type}-op{node_id}' op_full_name_ = op_info->op_name; @@ -72,7 +72,7 @@ void DataSaver::AddOpDetailInfoForType(const OpDetailInfo &op_detail_info) { } } -float DataSaver::GetTotalOpTime(const OpInfoMap &op_info_maps) { +float DataSaver::GetTotalOpTime(const OpInfoMap &op_info_maps) const { float sum = 0; sum = std::accumulate(op_info_maps.begin(), op_info_maps.end(), sum, [](float i, auto iter) { return i + iter.second.op_host_cost_time; }); @@ -80,7 +80,7 @@ float DataSaver::GetTotalOpTime(const OpInfoMap &op_info_maps) { return sum; } -void DataSaver::WriteOpType(const std::string &saver_base_dir) { +void DataSaver::WriteOpType(const std::string &saver_base_dir) const { std::string file_path = saver_base_dir + "/" + op_side_ + "_op_type_info_" + device_id_ + ".csv"; std::ofstream ofs(file_path); // check if the file is writable @@ -110,7 +110,7 @@ void DataSaver::WriteOpType(const std::string &saver_base_dir) { MS_LOG(INFO) << "Write " << op_type_infos_.size() << " op type infos into file: " << file_path; } -void DataSaver::WriteOpDetail(const std::string &saver_base_dir) { +void DataSaver::WriteOpDetail(const std::string &saver_base_dir) const { std::string file_path = saver_base_dir + "/" + op_side_ + "_op_detail_info_" + device_id_ + ".csv"; std::ofstream ofs(file_path); if (!ofs.is_open()) { @@ -139,7 +139,7 @@ void DataSaver::WriteOpDetail(const std::string &saver_base_dir) { MS_LOG(INFO) << "Write " << op_detail_infos_.size() << " op detail infos into file: " << file_path; } -void DataSaver::WriteOpTimestamp(const std::string &saver_base_dir) { +void DataSaver::WriteOpTimestamp(const std::string &saver_base_dir) const { std::string file_path = saver_base_dir + "/" + op_side_ + "_op_execute_timestamp_" + device_id_ + ".txt"; std::ofstream ofs(file_path); // check if the file is writable @@ -167,7 +167,7 @@ void DataSaver::WriteOpTimestamp(const std::string &saver_base_dir) { ChangeFileMode(file_path); } -void DataSaver::ChangeFileMode(const std::string &file_path) { +void DataSaver::ChangeFileMode(const std::string &file_path) const { if (chmod(common::SafeCStr(file_path), S_IRUSR | S_IWUSR) == -1) { MS_LOG(WARNING) << "Modify file: " << file_path << " to rw fail."; return; diff --git a/mindspore/ccsrc/profiler/device/data_saver.h b/mindspore/ccsrc/profiler/device/data_saver.h index 97fe0e878a6..13c3ab80227 100644 --- a/mindspore/ccsrc/profiler/device/data_saver.h +++ b/mindspore/ccsrc/profiler/device/data_saver.h @@ -34,7 +34,7 @@ struct OpDetailInfo { float proportion_{0}; OpDetailInfo() = default; - OpDetailInfo(std::shared_ptr op_info, float proportion); + OpDetailInfo(const std::shared_ptr op_info, float proportion); std::string GetCpuHeader() const { return "op_side,op_type,op_name,full_op_name,op_occurrences,op_total_time(ms)," @@ -45,13 +45,13 @@ struct OpDetailInfo { "cuda_activity_cost_time(us),cuda_activity_call_count"; } - void OutputCpuOpDetailInfo(std::ostream &os) { + void OutputCpuOpDetailInfo(std::ostream &os) const { os << "Host," << op_type_ << ',' << op_name_ << ',' << op_full_name_ << ',' << op_info_->op_count << ',' << op_info_->op_host_cost_time << ',' << op_avg_time_ << ',' << proportion_ << ",Default," << op_info_->pid << std::endl; } - void OutputGpuOpDetailInfo(std::ostream &os) { + void OutputGpuOpDetailInfo(std::ostream &os) const { os << "Device," << op_type_ << ',' << op_name_ << ',' << op_full_name_ << ',' << op_info_->op_count << ',' << op_info_->op_host_cost_time << ',' << op_avg_time_ << ',' << proportion_ << ',' << op_info_->cupti_activity_time << ',' << op_info_->op_kernel_count << std::endl; @@ -72,12 +72,12 @@ struct OpType { } std::string GetGpuHeader() const { return "op_type,type_occurrences,total_time(us),total_proportion,avg_time(us)"; } - void OutputCpuOpTypeInfo(std::ostream &os) { + void OutputCpuOpTypeInfo(std::ostream &os) const { os << op_type_ << ',' << count_ << ',' << count_ / step_ << ',' << total_time_ << ',' << total_time_ / count_ << ',' << proportion_ << std::endl; } - void OutputGpuOpTypeInfo(std::ostream &os) { + void OutputGpuOpTypeInfo(std::ostream &os) const { os << op_type_ << ',' << count_ << ',' << total_time_ << ',' << proportion_ << ',' << avg_time_ << std::endl; } @@ -105,15 +105,15 @@ class DataSaver { protected: void AddOpDetailInfoForType(const OpDetailInfo &op_detail_info); - float GetTotalOpTime(const OpInfoMap &op_info_maps); + float GetTotalOpTime(const OpInfoMap &op_info_maps) const; - void WriteOpType(const std::string &saver_base_dir); + void WriteOpType(const std::string &saver_base_dir) const; - void WriteOpDetail(const std::string &saver_base_dir); + void WriteOpDetail(const std::string &saver_base_dir) const; - void WriteOpTimestamp(const std::string &saver_base_dir); + void WriteOpTimestamp(const std::string &saver_base_dir) const; - void ChangeFileMode(const std::string &file_path); + void ChangeFileMode(const std::string &file_path) const; OpTypeInfos op_type_infos_; OpDetailInfos op_detail_infos_; diff --git a/mindspore/ccsrc/profiler/device/gpu/cupti_interface.cc b/mindspore/ccsrc/profiler/device/gpu/cupti_interface.cc index 0eb55a6db22..9107093a24b 100644 --- a/mindspore/ccsrc/profiler/device/gpu/cupti_interface.cc +++ b/mindspore/ccsrc/profiler/device/gpu/cupti_interface.cc @@ -43,23 +43,23 @@ inline void *GetCUPTIFunc(const char *name) { return func; } -typedef CUptiResult (*CuptiSubscribeFunc)(CUpti_SubscriberHandle *subscriber, CUpti_CallbackFunc callback, - void *userdata); -typedef CUptiResult (*CuptiEnableDomainFunc)(uint32_t enable, CUpti_SubscriberHandle subscriber, - CUpti_CallbackDomain domain); -typedef CUptiResult (*CuptiActivityEnableFunc)(CUpti_ActivityKind kind); -typedef CUptiResult (*CuptiActivityRegisterCallbacksFunc)(CUpti_BuffersCallbackRequestFunc funcBufferRequested, - CUpti_BuffersCallbackCompleteFunc funcBufferCompleted); -typedef CUptiResult (*CuptiUnsubscribeFunc)(CUpti_SubscriberHandle subscriber); -typedef CUptiResult (*CuptiActivityFlushAllFunc)(uint32_t flag); -typedef CUptiResult (*CuptiActivityDisableFunc)(CUpti_ActivityKind kind); -typedef CUptiResult (*CuptiActivityGetNextRecordFunc)(uint8_t *buffer, size_t validBufferSizeBytes, - CUpti_Activity **record); -typedef CUptiResult (*CuptiActivityGetNumDroppedRecordsFunc)(CUcontext context, uint32_t streamId, size_t *dropped); -typedef CUptiResult (*CuptiGetTimestampFunc)(uint64_t *timestamp); -typedef CUptiResult (*CuptiGetResultStringFunc)(CUptiResult result, const char **str); -typedef CUptiResult (*CuptiGetStreamIdFunc)(CUcontext context, CUstream stream, uint32_t *streamId); -typedef CUptiResult (*CuptiGetDeviceIdFunc)(CUcontext context, uint32_t *deviceId); +using CuptiSubscribeFunc = CUptiResult (*)(CUpti_SubscriberHandle *subscriber, CUpti_CallbackFunc callback, + void *userdata); +using CuptiEnableDomainFunc = CUptiResult (*)(uint32_t enable, CUpti_SubscriberHandle subscriber, + CUpti_CallbackDomain domain); +using CuptiActivityEnableFunc = CUptiResult (*)(CUpti_ActivityKind kind); +using CuptiActivityRegisterCallbacksFunc = CUptiResult (*)(CUpti_BuffersCallbackRequestFunc funcBufferRequested, + CUpti_BuffersCallbackCompleteFunc funcBufferCompleted); +using CuptiUnsubscribeFunc = CUptiResult (*)(CUpti_SubscriberHandle subscriber); +using CuptiActivityFlushAllFunc = CUptiResult (*)(uint32_t flag); +using CuptiActivityDisableFunc = CUptiResult (*)(CUpti_ActivityKind kind); +using CuptiActivityGetNextRecordFunc = CUptiResult (*)(uint8_t *buffer, size_t validBufferSizeBytes, + CUpti_Activity **record); +using CuptiActivityGetNumDroppedRecordsFunc = CUptiResult (*)(CUcontext context, uint32_t streamId, size_t *dropped); +using CuptiGetTimestampFunc = CUptiResult (*)(uint64_t *timestamp); +using CuptiGetResultStringFunc = CUptiResult (*)(CUptiResult result, const char **str); +using CuptiGetStreamIdFunc = CUptiResult (*)(CUcontext context, CUstream stream, uint32_t *streamId); +using CuptiGetDeviceIdFunc = CUptiResult (*)(CUcontext context, uint32_t *deviceId); CUptiResult CuptiSubscribe(CUpti_SubscriberHandle *subscriber, CUpti_CallbackFunc callback, void *userdata) { static auto func_ptr = reinterpret_cast(GetCUPTIFunc("cuptiSubscribe")); diff --git a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc index 7517664fbff..e1f5f992086 100644 --- a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc +++ b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc @@ -16,10 +16,10 @@ #include "profiler/device/gpu/gpu_profiling.h" -#include #include #include #include +#include #include "profiler/device/gpu/cupti_interface.h" #include "profiler/device/gpu/gpu_data_saver.h" #include "pybind_api/api_register.h" @@ -29,29 +29,29 @@ namespace mindspore { namespace profiler { namespace gpu { -#define BUF_SIZE (32 * 1024) -#define ALIGN_SIZE (8) -#define CHECK_CUPTI_RET_WITH_ERROR(expression, message) \ - if (expression != CUPTI_SUCCESS) { \ - const char *errstr; \ - CuptiGetResultString(expression, &errstr); \ - MS_LOG(ERROR) << "CUPTI Error:" << errstr << " function:" << message; \ +const size_t BUF_SIZE = 32 * 1024; +const size_t ALIGN_SIZE = 8; +#define CHECK_CUPTI_RET_WITH_ERROR(expression, message) \ + if ((expression) != CUPTI_SUCCESS) { \ + const char *errstr; \ + CuptiGetResultString(expression, &errstr); \ + MS_LOG(ERROR) << "CUPTI Error:" << errstr << " function:" << (message); \ } -#define CHECK_CUPTI_RET_WITH_EXCEPT(expression, message) \ - if (expression != CUPTI_SUCCESS) { \ - const char *errstr; \ - CuptiGetResultString(expression, &errstr); \ - MS_LOG(EXCEPTION) << "CUPTI Error:" << errstr << " function:" << message; \ - } -#define CHECK_CUDA_RET_WITH_ERROR(expression, message) \ - { \ - cudaError_t status = (expression); \ - if (status != cudaSuccess) { \ - MS_LOG(ERROR) << "CUDA Error: " << message << " | Error Number: " << status << " " \ - << cudaGetErrorString(status); \ - } \ +#define CHECK_CUPTI_RET_WITH_EXCEPT(expression, message) \ + if ((expression) != CUPTI_SUCCESS) { \ + const char *errstr; \ + CuptiGetResultString(expression, &errstr); \ + MS_LOG(EXCEPTION) << "CUPTI Error:" << errstr << " function:" << (message); \ } +#define CHECK_CUDA_RET_WITH_ERROR(expression, message) \ + do { \ + cudaError_t status = (expression); \ + if (status != cudaSuccess) { \ + MS_LOG(ERROR) << "CUDA Error: " << (message) << " | Error Number: " << status << " " \ + << cudaGetErrorString(status); \ + } \ + } while (0) #define PROFILER_ERROR_IF_NULLPTR(ptr) \ do { \ if ((ptr) == nullptr) { \ @@ -60,8 +60,7 @@ namespace gpu { } \ } while (0) -std::shared_ptr GPUProfiler::profiler_inst_ = - std::shared_ptr(new (std::nothrow) GPUProfiler()); +std::shared_ptr GPUProfiler::profiler_inst_ = std::make_shared(); int32_t GetThreadID() { uint32_t thread_id = static_cast(pthread_self()); @@ -114,6 +113,8 @@ bool IsMemcpyAsyncEvent(CUpti_CallbackId cb_id) { case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2: case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync: return true; + default: + return false; } return false; } @@ -134,6 +135,8 @@ bool IsMemcpySyncEvent(CUpti_CallbackId cb_id) { case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2: case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer: return true; + default: + return false; } return false; } diff --git a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h index 9df0ceafeac..ed817ea2f10 100644 --- a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h +++ b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h @@ -111,6 +111,7 @@ class ProfilingOp { class GPUProfiler : public Profiler { public: static std::shared_ptr &GetInstance(); + GPUProfiler() = default; ~GPUProfiler() { StopCUPTI(); } GPUProfiler(const GPUProfiler &) = delete; GPUProfiler &operator=(const GPUProfiler &) = delete; @@ -134,7 +135,6 @@ class GPUProfiler : public Profiler { std::string ProfileDataPath() const { return profile_data_path_; } private: - GPUProfiler() = default; void OpsParser(); void EventLog(const Event &event); void ClearInst() override; diff --git a/mindspore/ccsrc/profiler/device/profiling.cc b/mindspore/ccsrc/profiler/device/profiling.cc index f03c9547185..d921888aaed 100644 --- a/mindspore/ccsrc/profiler/device/profiling.cc +++ b/mindspore/ccsrc/profiler/device/profiling.cc @@ -16,9 +16,9 @@ #include "profiler/device/profiling.h" -#include #include #include +#include #include "profiler/device/cpu/cpu_data_saver.h" #include "pybind_api/api_register.h" #include "utils/log_adapter.h" @@ -26,7 +26,7 @@ namespace mindspore { namespace profiler { -uint64_t Profiler::GetHostMonoTimeStamp() { +uint64_t Profiler::GetHostMonoTimeStamp() const { struct timespec ts; #if defined(_WIN32) || defined(_WIN64) clock_gettime(CLOCK_MONOTONIC, &ts); diff --git a/mindspore/ccsrc/profiler/device/profiling.h b/mindspore/ccsrc/profiler/device/profiling.h index 7b46edf0836..0e5a48e79a8 100644 --- a/mindspore/ccsrc/profiler/device/profiling.h +++ b/mindspore/ccsrc/profiler/device/profiling.h @@ -61,7 +61,7 @@ class Profiler { protected: void SetRunTimeData(const std::string &op_name, const float time_elapsed); void SetRunTimeData(const std::string &op_name, const uint64_t start, const float duration); - uint64_t GetHostMonoTimeStamp(); + uint64_t GetHostMonoTimeStamp() const; virtual void SaveProfileData() = 0; virtual void ClearInst() = 0; bool enable_flag_ = false; diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc index 6e167879fad..743fadfc770 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc @@ -654,14 +654,6 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo mindspore::RDR::RecordGPUMemAddressInfo(SubModuleId::SM_KERNEL, name, kernels.size()); size_t id = 0; #endif - auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance(); - MS_EXCEPTION_IF_NULL(profiler_inst); - - if (profiler_inst->GetEnableFlag() && profiler::gpu::ProfilingUtils::IsFirstStep(graph->graph_id())) { - profiler::gpu::ProfilingTraceInfo profiling_trace = - profiler::gpu::ProfilingUtils::GetProfilingTraceFromEnv(NOT_NULL(graph)); - profiler_inst->SetStepTraceOpName(profiling_trace); - } CNodePtr last_kernel = GetLastKernel(graph); for (const auto &kernel : kernels) { auto kernel_mod = AnfAlgo::GetKernelMod(kernel); @@ -700,22 +692,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo mindspore::RDR::UpdateGPUMemAddressInfo(SubModuleId::SM_KERNEL, name, op_name, mem_info, id++); #endif if (!mock) { - if (!profiling) { - if (profiler_inst->GetEnableFlag()) { - profiler_inst->OpDataProducerBegin(kernel->fullname_with_scope(), stream_); - } - if (!kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_)) { - MS_LOG(EXCEPTION) << "Launch kernel failed: " << kernel->fullname_with_scope(); - } - if (profiler_inst->GetEnableFlag()) { - profiler_inst->OpDataProducerEnd(); - if (profiler_inst->GetSyncEnableFlag()) { - CHECK_OP_RET_WITH_ERROR(SyncStream(), "Profiler SyncStream failed."); - } - } - } else { - LaunchKernelWithTimeProfiling(kernel, kernel_inputs, kernel_workspaces, kernel_outputs); - } + LaunchKernelWithoutMock(graph, kernel, kernel_inputs, kernel_workspaces, kernel_outputs, profiling); if (gpu_kernel && dynamic_kernel && dynamic_kernel->is_dynamic_shape()) { gpu_kernel->PostExecute(); @@ -748,6 +725,37 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo return true; } +void GPUKernelRuntime::LaunchKernelWithoutMock(const session::KernelGraph *graph, const AnfNodePtr &kernel, + const AddressPtrList &inputs, const AddressPtrList &workspaces, + const AddressPtrList &outputs, bool profiling) { + auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance(); + MS_EXCEPTION_IF_NULL(profiler_inst); + + if (profiler_inst->GetEnableFlag() && profiler::gpu::ProfilingUtils::IsFirstStep(graph->graph_id())) { + profiler::gpu::ProfilingTraceInfo profiling_trace = + profiler::gpu::ProfilingUtils::GetProfilingTraceFromEnv(NOT_NULL(graph)); + profiler_inst->SetStepTraceOpName(profiling_trace); + } + + if (!profiling) { + if (profiler_inst->GetEnableFlag()) { + profiler_inst->OpDataProducerBegin(kernel->fullname_with_scope(), stream_); + } + auto kernel_mod = AnfAlgo::GetKernelMod(kernel); + if (!kernel_mod->Launch(inputs, workspaces, outputs, stream_)) { + MS_LOG(EXCEPTION) << "Launch kernel failed: " << kernel->fullname_with_scope(); + } + if (profiler_inst->GetEnableFlag()) { + profiler_inst->OpDataProducerEnd(); + if (profiler_inst->GetSyncEnableFlag()) { + CHECK_OP_RET_WITH_ERROR(SyncStream(), "Profiler SyncStream failed."); + } + } + } else { + LaunchKernelWithTimeProfiling(kernel, inputs, workspaces, outputs); + } +} + bool GPUKernelRuntime::RunOpLaunchKernelDynamic(const session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(graph); const auto &kernels = graph->execution_order(); diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h index 5853394147c..3d1a6a19a58 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h @@ -111,6 +111,10 @@ class GPUKernelRuntime : public KernelRuntime { DeviceAddressPtr GetMutableOutputAddr(const AnfNodePtr &node, size_t i, bool visit_nop_node); session::KernelWithIndex GetPrevNodeOutput(const AnfNodePtr &node, size_t i); + void LaunchKernelWithoutMock(const session::KernelGraph *graph, const AnfNodePtr &kernel, + const AddressPtrList &inputs, const AddressPtrList &workspaces, + const AddressPtrList &outputs, bool profiling); + std::unordered_map mem_reuse_util_map_; std::unordered_map mem_swap_map_; std::unordered_map is_first_step_map_; diff --git a/tests/ut/cpp/stub/dynamic_shape/dynamic_shape_stub.cc b/tests/ut/cpp/stub/dynamic_shape/dynamic_shape_stub.cc index c10726d8928..dbac6c75276 100644 --- a/tests/ut/cpp/stub/dynamic_shape/dynamic_shape_stub.cc +++ b/tests/ut/cpp/stub/dynamic_shape/dynamic_shape_stub.cc @@ -53,15 +53,12 @@ namespace mindspore { namespace profiler { namespace ascend { CallbackManager::CallbackManager(rtStream_t stream) : stream_(stream) {} -Status CallbackManager::Init() { return kSuccess; } Status CallbackManager::Destroy() { return kSuccess; } Status CallbackManager::RegisterCallback(rtCallback_t callback, const void *user_data) { return kSuccess; } Status CallbackManager::RegisterCallback(const std::function &callback) { return kSuccess; } AscendProfiler::AscendProfiler() : counter_(0) { Reset(); } -void AscendProfiler::RecordEvent(EventType event_type, const char *fmt, ...) {} - void AscendProfiler::Dump(std::ostream &output_stream) {} void AscendProfiler::Reset() {}