forked from mindspore-Ecosystem/mindspore
!25367 use acl api to control profiling
Merge pull request !25367 from yanghaitao/yht_condation_start_profiler
This commit is contained in:
commit
ce00ee1ad1
|
@ -1373,13 +1373,6 @@ void InitHccl() {
|
|||
(void)context::OpenTsd(ms_context);
|
||||
}
|
||||
#endif
|
||||
#if (defined ENABLE_D)
|
||||
#ifndef ENABLE_SECURITY
|
||||
if (!ProfilingManager::GetInstance().IsProfiling()) {
|
||||
ProfilingManager::GetInstance().SetHcclEnabledBefProfilingEnabled();
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
void FinalizeHccl() {
|
||||
|
@ -1440,38 +1433,10 @@ void ReleaseGeTsd() {
|
|||
}
|
||||
}
|
||||
|
||||
#ifndef ENABLE_SECURITY
|
||||
// Pre-initialises the Ascend kernel runtime when profiling is enabled.
// The body is compiled only when ENABLE_D is defined; otherwise the function is empty.
// Nothing happens unless ProfilingManager reports that profiling is on, the backend
// policy is "ms" and the configured device target is Ascend.
void StartUpProfiling() {
#ifdef ENABLE_D
  if (ProfilingManager::GetInstance().IsProfiling()) {
    auto context = MsContext::GetInstance();
    MS_EXCEPTION_IF_NULL(context);

    MS_LOG(INFO) << "Startup profiling";
    // Start up profiling before OpenTsd
    const uint32_t target_device_id = context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
    const std::string target_device_name = context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
    const bool uses_ms_backend = context->backend_policy() == "ms";
    if (uses_ms_backend && target_device_name == kAscendDevice) {
      auto kernel_runtime =
        device::KernelRuntimeManager::Instance().GetKernelRuntime(target_device_name, target_device_id);
      MS_EXCEPTION_IF_NULL(kernel_runtime);
      kernel_runtime->PreInit();
    }
  }
#endif
}
|
||||
#endif
|
||||
|
||||
void InitPipeline() {
|
||||
// set python env flag
|
||||
RecordInitStatus();
|
||||
mindspore::parse::python_adapter::set_python_env_flag(true);
|
||||
#ifndef ENABLE_SECURITY
|
||||
// Startup profiling before open tsd
|
||||
StartUpProfiling();
|
||||
#endif
|
||||
// open tsd before ge initialize
|
||||
auto ms_context = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(ms_context);
|
||||
|
|
|
@ -14,14 +14,27 @@
|
|||
*/
|
||||
|
||||
#include "profiler/device/ascend/ascend_profiling.h"
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include "pybind_api/api_register.h"
|
||||
#include "utils/log_adapter.h"
|
||||
#include "utils/utils.h"
|
||||
#include "runtime/device/ascend/profiling/profiling_manager.h"
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
using mindspore::device::ascend::ProfilingManager;
|
||||
|
||||
namespace mindspore {
|
||||
namespace profiler {
|
||||
namespace ascend {
|
||||
std::map<std::string, aclprofAicoreMetrics> kAicMetrics{
|
||||
{"ArithmeticUtilization", ACL_AICORE_ARITHMETIC_UTILIZATION},
|
||||
{"PipeUtilization", ACL_AICORE_PIPE_UTILIZATION},
|
||||
{"Memory", ACL_AICORE_MEMORY_BANDWIDTH},
|
||||
{"MemoryLO", ACL_AICORE_L0B_AND_WIDTH},
|
||||
{"ResourceConflictRatio", ACL_AICORE_RESOURCE_CONFLICT_RATIO},
|
||||
};
|
||||
|
||||
std::shared_ptr<AscendProfiler> AscendProfiler::ascend_profiler_ = std::make_shared<AscendProfiler>();
|
||||
|
||||
std::shared_ptr<AscendProfiler> &AscendProfiler::GetInstance() { return ascend_profiler_; }
|
||||
|
@ -31,21 +44,113 @@ void AscendProfiler::StepProfilingEnable(const bool enable_flag) {
|
|||
enable_flag_ = enable_flag;
|
||||
}
|
||||
|
||||
void AscendProfiler::Start(const std::string &profiling_options) {
|
||||
void AscendProfiler::InitProfiling(const std::string &profiling_path, uint32_t device_id,
|
||||
const std::string &profiling_options) {
|
||||
MS_LOG(INFO) << "Begin to init profiling and call aclprofInit function.";
|
||||
profiling_options_ = profiling_options;
|
||||
profile_data_path_ = profiling_path;
|
||||
device_id_ = device_id;
|
||||
(void)ProfilingManager::GetInstance().InitProfiling(profiling_path, device_id);
|
||||
|
||||
aclError aclRet = aclprofInit(profile_data_path_.c_str(), profile_data_path_.length());
|
||||
if (aclRet != ACL_SUCCESS) {
|
||||
MS_LOG(EXCEPTION) << "Failed to call aclprofInit function.";
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t AscendProfiler::GetOptionsMask() const {
|
||||
uint64_t mask = ACL_PROF_ACL_API | ACL_PROF_AICORE_METRICS;
|
||||
|
||||
nlohmann::json options_json;
|
||||
try {
|
||||
options_json = nlohmann::json::parse(profiling_options_);
|
||||
} catch (const std::exception &err) {
|
||||
MS_LOG(ERROR) << "Failed to parse profiling options.";
|
||||
return ACL_AICORE_NONE;
|
||||
}
|
||||
|
||||
if (options_json["task_trace"] == "on") {
|
||||
mask |= ACL_PROF_TASK_TIME;
|
||||
}
|
||||
|
||||
if (options_json["aicpu"] == "on") {
|
||||
mask |= ACL_PROF_AICPU;
|
||||
}
|
||||
|
||||
return mask;
|
||||
}
|
||||
|
||||
// Translates the "aic_metrics" entry of the JSON text in profiling_options_
// into the matching aclprofAicoreMetrics value from kAicMetrics.
// Returns ACL_AICORE_NONE when the options cannot be parsed, when the entry is
// absent or not a string, or when the name is not listed in kAicMetrics.
aclprofAicoreMetrics AscendProfiler::GetAicMetrics() const {
  nlohmann::json options_json;
  try {
    options_json = nlohmann::json::parse(profiling_options_);
  } catch (const std::exception &err) {
    MS_LOG(ERROR) << "Failed to parse profiling options.";
    return ACL_AICORE_NONE;
  }

  // find() yields end() for a missing key and for non-object documents, so no
  // exception can escape from malformed-but-parseable options.
  const auto option = options_json.find("aic_metrics");
  if (option == options_json.end() || !option->is_string()) {
    return ACL_AICORE_NONE;
  }

  // Keyed lookup in the map instead of a linear scan over its entries.
  const auto metric = kAicMetrics.find(option->get<std::string>());
  if (metric == kAicMetrics.end()) {
    return ACL_AICORE_NONE;
  }
  return metric->second;
}
|
||||
|
||||
void AscendProfiler::Start() {
|
||||
uint32_t device_list[1] = {device_id_};
|
||||
uint32_t device_num = 1;
|
||||
uint64_t mask = GetOptionsMask();
|
||||
aclprofAicoreMetrics aic_metrics = GetAicMetrics();
|
||||
acl_config_ = aclprofCreateConfig(device_list, device_num, aic_metrics, nullptr, GetOptionsMask());
|
||||
if (acl_config_ == nullptr) {
|
||||
MS_LOG(EXCEPTION) << "Failed to call aclprofCreateConfig function.";
|
||||
}
|
||||
aclError aclRet = aclprofStart(acl_config_);
|
||||
if (aclRet != ACL_SUCCESS) {
|
||||
MS_LOG(EXCEPTION) << "Failed to call aclprofStart function.";
|
||||
}
|
||||
MS_LOG(INFO) << "Start profiling, options mask is " << mask << " aic_metrics is " << aic_metrics;
|
||||
|
||||
StepProfilingEnable(true);
|
||||
}
|
||||
|
||||
void AscendProfiler::Stop() {
|
||||
MS_LOG(INFO) << "Stop profiling";
|
||||
MS_LOG(INFO) << "Begin to stop profiling.";
|
||||
if (acl_config_ == nullptr) {
|
||||
MS_LOG(EXCEPTION)
|
||||
<< "Failed to stop profiling because of null acl config.Please make sure call Profiler.Start function "
|
||||
"before call Profiler.Stop function.";
|
||||
}
|
||||
|
||||
aclError aclRet = aclprofStop(acl_config_);
|
||||
if (aclRet != ACL_SUCCESS) {
|
||||
MS_LOG(EXCEPTION) << "Failed to call aclprofStop function.";
|
||||
}
|
||||
aclRet = aclprofDestroyConfig(acl_config_);
|
||||
if (aclRet != ACL_SUCCESS) {
|
||||
MS_LOG(EXCEPTION) << "Failed to call aclprofDestroyConfig function.";
|
||||
}
|
||||
|
||||
StepProfilingEnable(false);
|
||||
}
|
||||
|
||||
void AscendProfiler::Finalize() const {
|
||||
MS_LOG(INFO) << "Begin to finalize profiling";
|
||||
aclError aclRet = aclprofFinalize();
|
||||
if (aclRet != ACL_SUCCESS) {
|
||||
MS_LOG(EXCEPTION) << "Failed to call aclprofDestroyConfig function.";
|
||||
}
|
||||
}
|
||||
|
||||
// Python binding for AscendProfiler. Each name is bound exactly once:
//   get_instance - static accessor for the shared profiler instance.
//   init         - InitProfiling(profiling_path, device_id, profiling_options).
//   start / stop - argument-free Start() and Stop().
//   finalize     - Finalize().
// The chain is a single expression; a ';' in the middle would cut it short.
REGISTER_PYBIND_DEFINE(AscendProfiler_, ([](const py::module *m) {
                         (void)py::class_<AscendProfiler, std::shared_ptr<AscendProfiler>>(*m, "AscendProfiler")
                           .def_static("get_instance", &AscendProfiler::GetInstance, "AscendProfiler get_instance.")
                           .def("init", &AscendProfiler::InitProfiling, py::arg("profiling_path"), py::arg("device_id"),
                                py::arg("profiling_options"), "init")
                           .def("start", &AscendProfiler::Start, "start")
                           .def("stop", &AscendProfiler::Stop, "stop")
                           .def("finalize", &AscendProfiler::Finalize, "finalize");
                       }));
|
||||
} // namespace ascend
|
||||
} // namespace profiler
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#include <string>
|
||||
#include <memory>
|
||||
#include "profiler/device/profiling.h"
|
||||
#include "acl/acl_prof.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace profiler {
|
||||
|
@ -30,18 +31,24 @@ class AscendProfiler : public Profiler {
|
|||
AscendProfiler(const AscendProfiler &) = delete;
|
||||
AscendProfiler &operator=(const AscendProfiler &) = delete;
|
||||
void Init(const std::string &profileDataPath) { return; }
|
||||
void InitProfiling(const std::string &profiling_path, uint32_t device_id, const std::string &profiling_options);
|
||||
void Stop();
|
||||
void StepProfilingEnable(const bool enable_flag) override;
|
||||
void OpDataProducerEnd() { return; }
|
||||
void Start(const std::string &profiling_options);
|
||||
void Start();
|
||||
bool GetProfilingEnableFlag() const { return enable_flag_; }
|
||||
std::string GetProfilingOptions() const { return profiling_options_; }
|
||||
void SaveProfileData() { return; }
|
||||
void ClearInst() { return; }
|
||||
uint64_t GetOptionsMask() const;
|
||||
aclprofAicoreMetrics GetAicMetrics() const;
|
||||
void Finalize() const;
|
||||
|
||||
private:
|
||||
static std::shared_ptr<AscendProfiler> ascend_profiler_;
|
||||
std::string profiling_options_;
|
||||
uint32_t device_id_;
|
||||
aclprofConfig *acl_config_;
|
||||
};
|
||||
} // namespace ascend
|
||||
} // namespace profiler
|
||||
|
|
|
@ -244,7 +244,7 @@ void AsyncDataDumpUninit() {
|
|||
void AscendKernelRuntime::ReportProfilingData() {
|
||||
auto context = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(context);
|
||||
if (ProfilingManager::GetInstance().IsProfiling() &&
|
||||
if (ProfilingManager::GetInstance().IsProfilingStart() &&
|
||||
context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) {
|
||||
// Save Profiling Framework data
|
||||
OpNameTaskStreamReporter reporter(device_id_, "nonsink", stream_id_task_id_op_name_map_);
|
||||
|
@ -295,9 +295,6 @@ void AscendKernelRuntime::ReleaseDeviceRes() {
|
|||
}
|
||||
|
||||
(void)ResetDevice(device_id);
|
||||
#ifndef ENABLE_SECURITY
|
||||
(void)ProfilingManager::GetInstance().StopProfiling();
|
||||
#endif
|
||||
current_graph_ = nullptr;
|
||||
if (context_ptr->get_param<int>(MS_CTX_EXECUTION_MODE) == kGraphMode &&
|
||||
!context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK)) {
|
||||
|
@ -313,14 +310,6 @@ void AscendKernelRuntime::PreInit() {
|
|||
if (error_manager_ret != 0) {
|
||||
MS_LOG(WARNING) << "Init ErrorManager failed.";
|
||||
}
|
||||
auto ret = ProfilingManager::GetInstance().StartupProfiling(device_id_);
|
||||
if (!ret) {
|
||||
const string &error_message = ErrorManager::GetInstance().GetErrorMessage();
|
||||
if (!error_message.empty() && error_message.find(kUnknowErrorString) == string::npos) {
|
||||
MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message;
|
||||
}
|
||||
MS_EXCEPTION(DeviceProcessError) << "StartupProfiling failed.";
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -567,10 +556,18 @@ bool AscendKernelRuntime::LoadTask(const session::KernelGraph &graph) {
|
|||
}
|
||||
|
||||
#ifndef ENABLE_SECURITY
|
||||
if (ProfilingManager::GetInstance().IsProfiling()) {
|
||||
if (ProfilingManager::GetInstance().IsProfilingInitialized()) {
|
||||
auto task_ids = ModelRunner::Instance().GetTaskIdList(model_iter->first);
|
||||
auto stream_ids = ModelRunner::Instance().GetStreamIdList(model_iter->first);
|
||||
ProfilingUtils::ReportProfilingData(task_ids, stream_ids, graph);
|
||||
// Report the data directly if profiling has already started
|
||||
if (ProfilingUtils::ValidComputeGraph(graph)) {
|
||||
if (ProfilingManager::GetInstance().IsProfilingStart()) {
|
||||
ProfilingUtils::ReportProfilingData(task_ids, stream_ids, graph.graph_id());
|
||||
} else {
|
||||
// Cache the data and report it once profiling starts
|
||||
ProfilingUtils::SetReportProfilingData(task_ids, stream_ids, graph.graph_id());
|
||||
}
|
||||
}
|
||||
}
|
||||
LaunchDataDump(graph.graph_id());
|
||||
#endif
|
||||
|
|
|
@ -25,6 +25,9 @@
|
|||
#include "utils/convert_utils.h"
|
||||
#include "runtime/base.h"
|
||||
#include <nlohmann/json.hpp>
|
||||
#include "runtime/device/ascend/profiling/profiling_utils.h"
|
||||
|
||||
using mindspore::device::ascend::ProfilingUtils;
|
||||
|
||||
namespace {
|
||||
constexpr Status PROF_SUCCESS = 0;
|
||||
|
@ -39,7 +42,8 @@ ProfilingManager &ProfilingManager::GetInstance() {
|
|||
return inst;
|
||||
}
|
||||
|
||||
ProfilingManager::ProfilingManager() : device_id_(0), prof_cb_({0}), hccl_enabled_bef_profiling_enabled_(false) {}
|
||||
ProfilingManager::ProfilingManager()
|
||||
: device_id_(0), prof_cb_({0}), cur_state_(kProfilingInvalid), profiling_path_("") {}
|
||||
|
||||
uint64_t ProfilingManager::GetJobId() const { return 0; }
|
||||
|
||||
|
@ -110,57 +114,15 @@ Status ProfilingManager::GetProfConf(const NotNull<MsprofGeOptions *> prof) {
|
|||
return PROF_SUCCESS;
|
||||
}
|
||||
|
||||
bool ProfilingManager::StartupProfiling(uint32_t device_id) {
|
||||
auto is_profiling = IsProfiling();
|
||||
if (!is_profiling) {
|
||||
int32_t cb_ret = MsprofInit(0XFF, nullptr, 0);
|
||||
if (cb_ret != UintToInt(PROF_SUCCESS)) {
|
||||
MS_LOG(ERROR) << "Call msprofCtrlCallback failed, ret: " << cb_ret;
|
||||
return false;
|
||||
}
|
||||
MS_LOG(INFO) << "No need profiling. please export PROFILING_MODE and in train mode.";
|
||||
return true;
|
||||
}
|
||||
|
||||
if (hccl_enabled_bef_profiling_enabled_) {
|
||||
MS_LOG(ERROR)
|
||||
<< "Please check the Profiler object initialized before mindspore.context.set_auto_parallel_context() "
|
||||
"and mindspore.communication.management.init(). Profiler should be initialized before these code.";
|
||||
return false;
|
||||
}
|
||||
|
||||
bool ProfilingManager::InitProfiling(const std::string &profiling_path, uint32_t device_id) {
|
||||
profiling_path_ = profiling_path;
|
||||
device_id_ = device_id;
|
||||
|
||||
struct MsprofGeOptions prof_conf = {0};
|
||||
if (GetProfConf(NOT_NULL(&prof_conf)) != PROF_SUCCESS) {
|
||||
MS_LOG(ERROR) << "Get prof conf failed.";
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!ProfStartUp(NOT_NULL(&prof_conf))) {
|
||||
MS_LOG(ERROR) << "ProfMgrStartUp failed.";
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ProfilingManager::ProfStartUp(const NotNull<MsprofGeOptions *> prof_conf) const {
|
||||
MS_LOG(INFO) << "Prof start up. ";
|
||||
|
||||
bool ret = ProfRegisterCtrlCallback();
|
||||
if (ret == false) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
// call profiling start up api
|
||||
int32_t cb_ret = MsprofInit(static_cast<uint32_t>(MsprofCtrlCallbackType::MSPROF_CTRL_INIT_GE_OPTIONS),
|
||||
static_cast<void *>(prof_conf.get()), sizeof(MsprofGeOptions));
|
||||
if (cb_ret != UintToInt(PROF_SUCCESS)) {
|
||||
MS_LOG(ERROR) << "Call msprofCtrlCallback failed, ret: " << cb_ret;
|
||||
return false;
|
||||
}
|
||||
|
||||
MS_LOG(INFO) << "Start up profiling success.";
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -188,25 +150,6 @@ rtError_t CtrlCallbackHandle(uint32_t rt_type, void *data, uint32_t /* len */) {
|
|||
return RT_ERROR_NONE;
|
||||
}
|
||||
|
||||
bool ProfilingManager::StopProfiling() const {
|
||||
MS_LOG(INFO) << "StopProfiling";
|
||||
if (!IsProfiling()) {
|
||||
MS_LOG(INFO) << "No need profiling. please export PROFILING_MODE and in train mode.";
|
||||
return true;
|
||||
}
|
||||
|
||||
// plugin unregister
|
||||
PluginUnInit();
|
||||
|
||||
// stop profiling
|
||||
int32_t cb_ret = MsprofFinalize();
|
||||
if (cb_ret != 0) {
|
||||
MS_LOG(WARNING) << "Call MsprofFinalize failed, ret: " << cb_ret;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
Status ProfilingManager::CallMsprofReport(const NotNull<ReporterData *> reporter_data) const {
|
||||
if (prof_cb_.msprofReporterCallback == nullptr) {
|
||||
MS_LOG(ERROR) << "MsprofReporterCallback callback is nullptr.";
|
||||
|
@ -224,6 +167,58 @@ Status ProfilingManager::CallMsprofReport(const NotNull<ReporterData *> reporter
|
|||
return PROF_SUCCESS;
|
||||
}
|
||||
|
||||
// Handles the "init" profiling command: moves the state to kProfilingInit and
// initialises the profiling plugin.
// Returns PROF_SUCCESS when PluginInit succeeds, PROF_FAILED otherwise. The
// state is already kProfilingInit in both cases.
Status ProfilingManager::ProfHandleInit() {
  MS_LOG(INFO) << "Begin to init profiling. Current profiling state is " << cur_state_;
  cur_state_ = kProfilingInit;
  const Status plugin_status = ProfilingManager::GetInstance().PluginInit();
  if (plugin_status == PROF_SUCCESS) {
    return PROF_SUCCESS;
  }
  MS_LOG(ERROR) << "Failed to init profiling.";
  return PROF_FAILED;
}
|
||||
|
||||
// Handles the "start" profiling command: moves the state to kProfilingStart
// and then reports any graph data that was cached before profiling started.
// Always returns PROF_SUCCESS.
Status ProfilingManager::ProfHandleStart() {
  const ProfilingState previous_state = cur_state_;
  MS_LOG(INFO) << "Begin to start profiling. Current profiling state is " << previous_state;
  cur_state_ = kProfilingStart;

  // Report graph data if there is any cache data.
  ProfilingUtils::ReportAllGraphProfilingData();

  return PROF_SUCCESS;
}
|
||||
|
||||
// Handles the "stop" profiling command: logs the state held on entry and
// moves the state to kProfilingStop. Always returns PROF_SUCCESS.
Status ProfilingManager::ProfHandleStop() {
  const ProfilingState previous_state = cur_state_;
  MS_LOG(INFO) << "Begin to stop profiling. Current profiling state is " << previous_state;
  cur_state_ = kProfilingStop;
  return PROF_SUCCESS;
}
|
||||
|
||||
// Handles the "finalize" profiling command: logs the state held on entry,
// moves the state to kProfilingFinalize and unregisters the profiling plugin.
// Always returns PROF_SUCCESS.
Status ProfilingManager::ProfHandleFinalize() {
  const ProfilingState previous_state = cur_state_;
  MS_LOG(INFO) << "Begin to finalize profiling. Current profiling state is " << previous_state;
  cur_state_ = kProfilingFinalize;
  ProfilingManager::GetInstance().PluginUnInit();

  return PROF_SUCCESS;
}
|
||||
|
||||
// Dispatches a profiling control command to its handler.
//   type - command received from the profiling control callback.
// Returns the handler's status for Init/Start/Stop/Finalize; any other command
// is logged as an error and PROF_FAILED is returned.
Status ProfilingManager::ProfCommandHandle(ProfCommandHandleType type) {
  // Only "Init", "Start", "Stop" and "Finalize" need to be processed.
  switch (type) {
    case kProfCommandhandleInit:
      return ProfHandleInit();
    case kProfCommandhandleStart:
      return ProfHandleStart();
    case kProfCommandhandleStop:
      return ProfHandleStop();
    case kProfCommandhandleFinalize:
      return ProfHandleFinalize();
    default:
      break;
  }

  // The literal used to contain a stray "<< " that was printed verbatim in the log.
  MS_LOG(ERROR) << "Receive invalid profiling type " << type << ". Current profiling state is " << cur_state_;
  return PROF_FAILED;
}
|
||||
|
||||
Status ProfCtrlSwitchHandle(void *data) {
|
||||
if (data == nullptr) {
|
||||
MS_LOG(ERROR) << "Ctrl switch handl data is nullptr.";
|
||||
|
@ -235,18 +230,7 @@ Status ProfCtrlSwitchHandle(void *data) {
|
|||
return ProfCommandHandle(type);
|
||||
}
|
||||
|
||||
Status ProfCommandHandle(ProfCommandHandleType type) {
|
||||
MS_LOG(INFO) << "ProfCommandHandle start, type:" << type;
|
||||
if (type == kProfCommandhandleInit) {
|
||||
auto cb_ret = ProfilingManager::GetInstance().PluginInit();
|
||||
if (cb_ret != PROF_SUCCESS) {
|
||||
MS_LOG(ERROR) << "Profiling plugin int failed.";
|
||||
return PROF_FAILED;
|
||||
}
|
||||
}
|
||||
|
||||
return PROF_SUCCESS;
|
||||
}
|
||||
Status ProfCommandHandle(ProfCommandHandleType type) { return ProfilingManager::GetInstance().ProfCommandHandle(type); }
|
||||
} // namespace ascend
|
||||
} // namespace device
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -28,6 +28,7 @@
|
|||
#include "toolchain/slog.h"
|
||||
#include "runtime/base.h"
|
||||
#include "profiler/device/profiling.h"
|
||||
#include "acl/acl_prof.h"
|
||||
|
||||
using std::map;
|
||||
using std::string;
|
||||
|
@ -50,19 +51,16 @@ enum ProfCommandHandleType {
|
|||
kProfCommandhandleModelUnsubscribe
|
||||
};
|
||||
|
||||
enum ProfilingState { kProfilingInvalid, kProfilingInit, kProfilingStart, kProfilingStop, kProfilingFinalize };
|
||||
|
||||
class ProfilingManager {
|
||||
public:
|
||||
static ProfilingManager &GetInstance();
|
||||
uint64_t GetJobId() const;
|
||||
bool ProfRegisterCtrlCallback() const;
|
||||
bool StartupProfiling(uint32_t device_id);
|
||||
bool StopProfiling() const;
|
||||
|
||||
inline bool IsProfiling() const {
|
||||
auto profiler_manager = profiler::ProfilerManager::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(profiler_manager);
|
||||
return profiler_manager->GetProfilingEnableFlag();
|
||||
}
|
||||
bool InitProfiling(const std::string &profiling_path, uint32_t device_id);
|
||||
bool IsProfilingInitialized() const { return cur_state_ >= kProfilingInit; }
|
||||
inline bool IsProfilingStart() const { return cur_state_ >= kProfilingStart; }
|
||||
Status PluginInit() const;
|
||||
void PluginUnInit() const;
|
||||
Status CallMsprofReport(NotNull<ReporterData *> reporter_data) const;
|
||||
|
@ -71,17 +69,22 @@ class ProfilingManager {
|
|||
void SetMsprofReporterCallback(MsprofReporterCallback func) { prof_cb_.msprofReporterCallback = func; }
|
||||
void SetMsprofSetDeviceCallback(MsprofSetDeviceCallback func) { prof_cb_.msprofSetDeviceCallback = func; }
|
||||
Status GetProfConf(NotNull<MsprofGeOptions *> prof);
|
||||
void SetHcclEnabledBefProfilingEnabled() { hccl_enabled_bef_profiling_enabled_ = true; }
|
||||
Status ProfCommandHandle(ProfCommandHandleType type);
|
||||
Status ProfHandleInit();
|
||||
Status ProfHandleStart();
|
||||
Status ProfHandleStop();
|
||||
Status ProfHandleFinalize();
|
||||
|
||||
protected:
|
||||
ProfilingManager();
|
||||
~ProfilingManager() {}
|
||||
|
||||
private:
|
||||
bool ProfStartUp(NotNull<MsprofGeOptions *> prof_conf) const;
|
||||
uint32_t device_id_;
|
||||
MsprofCallback prof_cb_;
|
||||
bool hccl_enabled_bef_profiling_enabled_;
|
||||
aclprofConfig *acl_config_;
|
||||
ProfilingState cur_state_;
|
||||
std::string profiling_path_;
|
||||
};
|
||||
|
||||
Status ProfCommandHandle(ProfCommandHandleType type);
|
||||
|
|
|
@ -390,14 +390,15 @@ bool ProfilingUtils::ValidComputeGraph(const session::KernelGraph &kernel_graph)
|
|||
return false;
|
||||
}
|
||||
|
||||
void ProfilingUtils::ReportProfilingData(const std::vector<uint32_t> &task_ids, const std::vector<uint32_t> &stream_ids,
|
||||
const session::KernelGraph &kernel_graph) {
|
||||
if (!ValidComputeGraph(kernel_graph)) {
|
||||
MS_LOG(INFO) << "Not a valid compute graph:" << kernel_graph.graph_id();
|
||||
return;
|
||||
void ProfilingUtils::ReportAllGraphProfilingData() {
|
||||
for (auto data : report_data_) {
|
||||
ReportProfilingData(data.task_ids_, data.stream_ids_, data.graph_id_);
|
||||
}
|
||||
}
|
||||
|
||||
auto ret = graph_profiling_cnode_.find(kernel_graph.graph_id());
|
||||
void ProfilingUtils::ReportProfilingData(const std::vector<uint32_t> &task_ids, const std::vector<uint32_t> &stream_ids,
|
||||
uint32_t graph_id) {
|
||||
auto ret = graph_profiling_cnode_.find(graph_id);
|
||||
if (ret == graph_profiling_cnode_.end()) {
|
||||
MS_LOG(ERROR) << "Graph id not found";
|
||||
return;
|
||||
|
@ -415,7 +416,7 @@ void ProfilingUtils::ReportProfilingData(const std::vector<uint32_t> &task_ids,
|
|||
graph_reporter.ReportData();
|
||||
|
||||
// Report profiling point
|
||||
auto point_iter = graph_point_.find(kernel_graph.graph_id());
|
||||
auto point_iter = graph_point_.find(graph_id);
|
||||
if (point_iter == graph_point_.end()) {
|
||||
MS_LOG(ERROR) << "Graph id not found in graph_point";
|
||||
return;
|
||||
|
@ -426,6 +427,12 @@ void ProfilingUtils::ReportProfilingData(const std::vector<uint32_t> &task_ids,
|
|||
}
|
||||
point_reporter.ReportData();
|
||||
}
|
||||
|
||||
// Caches the task ids and stream ids of a graph in report_data_ so that
// ReportAllGraphProfilingData can report them later.
//   task_ids   - task ids to remember for the graph.
//   stream_ids - stream ids to remember for the graph.
//   graph_id   - id of the graph the ids belong to.
void ProfilingUtils::SetReportProfilingData(const std::vector<uint32_t> &task_ids,
                                            const std::vector<uint32_t> &stream_ids, uint32_t graph_id) {
  // Build the entry as a temporary so it is moved into the cache; a named
  // local would copy both vectors a second time.
  report_data_.emplace_back(GraphProfilingData{task_ids, stream_ids, graph_id});
}
|
||||
} // namespace ascend
|
||||
} // namespace device
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -48,6 +48,12 @@ struct ProfilingContent {
|
|||
uint32_t flags;
|
||||
};
|
||||
|
||||
// Task and stream ids of one graph, cached until they can be reported.
// The type stays an aggregate, so `{task_ids, stream_ids, graph_id}` still works.
struct GraphProfilingData {
  std::vector<uint32_t> task_ids_;    // task ids recorded for the graph
  std::vector<uint32_t> stream_ids_;  // stream ids recorded for the graph
  uint32_t graph_id_{0};              // graph the ids belong to; zero unless set explicitly
};
|
||||
|
||||
class ProfilingUtils {
|
||||
public:
|
||||
ProfilingUtils() = default;
|
||||
|
@ -69,7 +75,7 @@ class ProfilingUtils {
|
|||
static void SetGraphKernelName(uint32_t graph_id, const std::vector<std::string> &kernel_names);
|
||||
// Save graph information to Framework file
|
||||
static void ReportProfilingData(const std::vector<uint32_t> &task_ids, const std::vector<uint32_t> &stream_ids,
|
||||
const session::KernelGraph &graph);
|
||||
uint32_t graph_id);
|
||||
// Generate profiling trace
|
||||
static ProfilingTraceInfo GenerateProfilingTrace(const session::KernelGraph &kernel_graph);
|
||||
|
||||
|
@ -81,6 +87,11 @@ class ProfilingUtils {
|
|||
|
||||
static std::map<uint32_t, std::vector<std::string>> graph_kernel_name() { return graph_kernel_name_; }
|
||||
|
||||
static void SetReportProfilingData(const std::vector<uint32_t> &task_ids, const std::vector<uint32_t> &stream_ids,
|
||||
uint32_t graph_id);
|
||||
static void ReportAllGraphProfilingData();
|
||||
static bool ValidComputeGraph(const session::KernelGraph &kernel_graph);
|
||||
|
||||
inline static constexpr char kProfiling[] = "Profiling";
|
||||
inline static constexpr char kNotify[] = "notify";
|
||||
inline static constexpr char kProfilerTraceId[] = "profiler_trace_id";
|
||||
|
@ -101,7 +112,6 @@ class ProfilingUtils {
|
|||
static void GetCNodeOutputRealNode(const std::string &node_name, const session::KernelGraph &kernel_graph,
|
||||
NotNull<std::set<std::string> *> getnext_outputs);
|
||||
|
||||
static bool ValidComputeGraph(const session::KernelGraph &kernel_graph);
|
||||
static void SaveProfilingPoint(uint32_t graph_id, const std::string &node_name, uint32_t point_id);
|
||||
|
||||
// graph id --> (kernel name list)
|
||||
|
@ -109,8 +119,9 @@ class ProfilingUtils {
|
|||
inline static std::map<uint32_t, std::vector<std::string>> graph_kernel_name_;
|
||||
inline static std::map<uint32_t, std::vector<std::shared_ptr<ProfDesc>>> graph_point_;
|
||||
inline static uint32_t custom_node_index_;
|
||||
inline static std::vector<GraphProfilingData> report_data_;
|
||||
};
|
||||
} // namespace ascend
|
||||
} // namespace device
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_PROFILING_PROFILING_UTILS_H_
|
||||
#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEN_D_PROFILING_PROFILING_UTILS_H_
|
||||
|
|
|
@ -278,7 +278,7 @@ bool TaskGenerator::LaunchAllKernel(const std::vector<CNodePtr> &anf_node_list,
|
|||
|
||||
#ifndef ENABLE_SECURITY
|
||||
ProfilingUtils::SetGraphKernelName(graph_id, kernel_name_list);
|
||||
if (ProfilingManager::GetInstance().IsProfiling()) {
|
||||
if (ProfilingManager::GetInstance().IsProfilingInitialized()) {
|
||||
ProfilingUtils::SetGraphProfilingCNode(graph_id, profiling_cnode_list);
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -642,7 +642,7 @@ CNodePtr KernelAdjust::CreateStreamAssignAddnOP(const std::shared_ptr<session::K
|
|||
|
||||
#ifndef ENABLE_SECURITY
|
||||
void KernelAdjust::Profiling(NotNull<session::KernelGraph *> kernel_graph_ptr) {
|
||||
if (!ascend::ProfilingManager::GetInstance().IsProfiling()) {
|
||||
if (!ascend::ProfilingManager::GetInstance().IsProfilingInitialized()) {
|
||||
MS_LOG(INFO) << "No need to profiling";
|
||||
return;
|
||||
}
|
||||
|
|
|
@ -140,6 +140,8 @@ class Profiler:
|
|||
self._get_devid_rankid_and_devtarget()
|
||||
self._get_output_path(kwargs)
|
||||
self._profile_communication = False
|
||||
self._has_started = False
|
||||
self.start_profile = True
|
||||
|
||||
# Setup and start MindData Profiling
|
||||
self._md_profiler = cde.GlobalContext.profiling_manager()
|
||||
|
@ -174,7 +176,7 @@ class Profiler:
|
|||
raise ValueError(msg)
|
||||
# use context interface to open profiling, for the new mindspore version(after 2020.5.21)
|
||||
self._ascend_profiler = c_expression.AscendProfiler.get_instance()
|
||||
self._ascend_profiler.start(profiling_options)
|
||||
self._ascend_profiler.init(self._output_path, int(self._dev_id), profiling_options)
|
||||
base_profiling_container_path = os.path.join(self._output_path, "container")
|
||||
container_path = os.path.join(base_profiling_container_path, self._dev_id)
|
||||
data_path = os.path.join(container_path, "data")
|
||||
|
@ -184,8 +186,10 @@ class Profiler:
|
|||
|
||||
# add job id env through user input later
|
||||
self._job_id_env = 0
|
||||
self._start_time = int(time.time() * 10000000)
|
||||
logger.info("Profiling: profiling start time: %d", self._start_time)
|
||||
self._init_time = int(time.time() * 10000000)
|
||||
logger.info("Profiling: profiling init time: %d", self._init_time)
|
||||
if self.start_profile:
|
||||
self.start()
|
||||
|
||||
def _construct_profiling_options(self):
|
||||
"""
|
||||
|
@ -225,7 +229,9 @@ class Profiler:
|
|||
logger.critical(msg)
|
||||
raise ValueError(msg)
|
||||
self._output_path, _ = os.path.split(job_dir)
|
||||
|
||||
self.start_profile = kwargs.pop("start_profile", True)
|
||||
if not isinstance(self.start_profile, bool):
|
||||
raise TypeError("The parameter start_profile must be bool.")
|
||||
self._profile_communication = kwargs.pop("profile_communication", False)
|
||||
if not isinstance(self._profile_communication, bool):
|
||||
raise TypeError("The parameter profile_communication must be bool.")
|
||||
|
@ -270,6 +276,12 @@ class Profiler:
|
|||
self._rank_size = get_group_size()
|
||||
|
||||
release()
|
||||
if (not self.start_profile) or self._has_started:
|
||||
self._ascend_profiler.stop()
|
||||
else:
|
||||
msg = "The profiler has not start, so can not stop."
|
||||
logger.info(msg)
|
||||
self._ascend_profiler.finalize()
|
||||
|
||||
job_id = self._get_profiling_job_id()
|
||||
logger.info("Profiling: job id is %s ", job_id)
|
||||
|
@ -377,7 +389,30 @@ class Profiler:
|
|||
self._dev_id, self._rank_id, is_training_mode_flag)
|
||||
logger.info("Profiling: analyzing the operation FLOPs.")
|
||||
flops_parser.execute()
|
||||
|
||||
def start(self):
    """
    Used for Ascend, start profiling.

    Raises:
        RuntimeError: If the profiler has already been started.
    """
    if self._has_started:
        msg = "The profiler has already started."
        logger.error(msg)
        raise RuntimeError(msg)
    self._ascend_profiler.start()
    # Mark the profiler as started only after the native call returned, so a
    # failed start does not leave the flag claiming that profiling is running.
    self._has_started = True
    self._start_time = int(time.time() * 10000000)
    logger.info("Profiling: start time: %d", self._start_time)
|
||||
|
||||
def stop(self):
    """
    Used for Ascend, stop profiling.

    Raises:
        RuntimeError: If the profiler has not been started.
    """
    if not self._has_started:
        msg = "The profiler has not start, so can not stop."
        logger.error(msg)
        raise RuntimeError(msg)
    self._ascend_profiler.stop()
    # Clear the flag only after the native call returned, so a failed stop
    # does not hide the fact that profiling is still running.
    self._has_started = False
    self._stop_time = int(time.time() * 10000000)
    logger.info("Profiling: stop time: %d", self._stop_time)
|
||||
|
||||
def _gpu_analyse(self):
|
||||
"""Collect and analyse gpu performance data"""
|
||||
|
@ -573,8 +608,7 @@ class Profiler:
|
|||
if int(job_start_time) < self._start_time:
|
||||
logger.warning("Find profiling job path %s, but start_time(%d) is earlier than this training "
|
||||
"start_time(%d), profiler will ignore this job dir.",
|
||||
job_dir, job_start_time, self._start_time)
|
||||
continue
|
||||
job_dir, int(job_start_time), self._start_time)
|
||||
|
||||
job_id = dir_name
|
||||
break
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
#include <string>
|
||||
#include "prof_mgr_core.h"
|
||||
#include "prof_callback.h"
|
||||
#include "acl/acl_prof.h"
|
||||
|
||||
namespace Msprof {
|
||||
namespace Engine {
|
||||
|
@ -73,3 +74,19 @@ int32_t MsprofInit(uint32_t dataType, void *data, uint32_t dataLen) { return 0;
|
|||
* @return 0:SUCCESS, >0:FAILED
|
||||
*/
|
||||
int32_t MsprofFinalize() { return 0; }
|
||||
|
||||
ACL_FUNC_VISIBILITY aclError aclprofInit(const char *profilerResultPath, size_t length) { return ACL_SUCCESS; }
|
||||
|
||||
ACL_FUNC_VISIBILITY aclError aclprofStart(const aclprofConfig *profilerConfig) { return ACL_SUCCESS; }
|
||||
|
||||
ACL_FUNC_VISIBILITY aclError aclprofStop(const aclprofConfig *profilerConfig) { return ACL_SUCCESS; }
|
||||
|
||||
ACL_FUNC_VISIBILITY aclError aclprofFinalize() { return ACL_SUCCESS; }
|
||||
|
||||
ACL_FUNC_VISIBILITY aclprofConfig *aclprofCreateConfig(uint32_t *deviceIdList, uint32_t deviceNums,
|
||||
aclprofAicoreMetrics aicoreMetrics,
|
||||
aclprofAicoreEvents *aicoreEvents, uint64_t dataTypeConfig) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
ACL_FUNC_VISIBILITY aclError aclprofDestroyConfig(const aclprofConfig *profilerConfig) { return ACL_SUCCESS; }
|
||||
|
|
Loading…
Reference in New Issue