!25367 use acl api to control profiling

Merge pull request !25367 from yanghaitao/yht_condation_start_profiler
This commit is contained in:
i-robot 2021-11-18 12:36:14 +00:00 committed by Gitee
commit ce00ee1ad1
12 changed files with 290 additions and 160 deletions

View File

@ -1373,13 +1373,6 @@ void InitHccl() {
(void)context::OpenTsd(ms_context);
}
#endif
#if (defined ENABLE_D)
#ifndef ENABLE_SECURITY
if (!ProfilingManager::GetInstance().IsProfiling()) {
ProfilingManager::GetInstance().SetHcclEnabledBefProfilingEnabled();
}
#endif
#endif
}
void FinalizeHccl() {
@ -1440,38 +1433,10 @@ void ReleaseGeTsd() {
}
}
#ifndef ENABLE_SECURITY
// Pre-initializes the Ascend kernel runtime when profiling is enabled.
// The whole body is compiled only when ENABLE_D is defined; otherwise this is a no-op.
void StartUpProfiling() {
#ifdef ENABLE_D
  const bool profiling_enabled = ProfilingManager::GetInstance().IsProfiling();
  if (!profiling_enabled) {
    return;
  }

  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  MS_LOG(INFO) << "Startup profiling";

  // Start up profiling before OpenTsd
  const uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
  const std::string device_name = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);

  // Only the "ms" backend running on an Ascend device needs the runtime pre-initialized.
  const bool is_ms_on_ascend = ms_context->backend_policy() == "ms" &&
                               ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice;
  if (!is_ms_on_ascend) {
    return;
  }

  auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(device_name, device_id);
  MS_EXCEPTION_IF_NULL(runtime_instance);
  runtime_instance->PreInit();
#endif
}
#endif
void InitPipeline() {
// set python env flag
RecordInitStatus();
mindspore::parse::python_adapter::set_python_env_flag(true);
#ifndef ENABLE_SECURITY
// Startup profiling before open tsd
StartUpProfiling();
#endif
// open tsd before ge initialize
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);

View File

@ -14,14 +14,27 @@
*/
#include "profiler/device/ascend/ascend_profiling.h"
#include <map>
#include <string>
#include "pybind_api/api_register.h"
#include "utils/log_adapter.h"
#include "utils/utils.h"
#include "runtime/device/ascend/profiling/profiling_manager.h"
#include <nlohmann/json.hpp>
using mindspore::device::ascend::ProfilingManager;
namespace mindspore {
namespace profiler {
namespace ascend {
// Maps the value of the "aic_metrics" profiling option to the matching ACL AI Core metrics enum value.
// The table is read-only lookup data, so it is declared const.
// NOTE(review): the key "MemoryLO" ends in the letter 'O' while the enum name uses "L0" (digit zero);
// confirm the spelling matches what callers actually pass.
const std::map<std::string, aclprofAicoreMetrics> kAicMetrics{
  {"ArithmeticUtilization", ACL_AICORE_ARITHMETIC_UTILIZATION},
  {"PipeUtilization", ACL_AICORE_PIPE_UTILIZATION},
  {"Memory", ACL_AICORE_MEMORY_BANDWIDTH},
  {"MemoryLO", ACL_AICORE_L0B_AND_WIDTH},
  {"ResourceConflictRatio", ACL_AICORE_RESOURCE_CONFLICT_RATIO},
};
std::shared_ptr<AscendProfiler> AscendProfiler::ascend_profiler_ = std::make_shared<AscendProfiler>();
// Returns a reference to the static shared pointer that holds the AscendProfiler instance.
std::shared_ptr<AscendProfiler> &AscendProfiler::GetInstance() {
  return ascend_profiler_;
}
@ -31,21 +44,113 @@ void AscendProfiler::StepProfilingEnable(const bool enable_flag) {
enable_flag_ = enable_flag;
}
void AscendProfiler::Start(const std::string &profiling_options) {
void AscendProfiler::InitProfiling(const std::string &profiling_path, uint32_t device_id,
const std::string &profiling_options) {
MS_LOG(INFO) << "Begin to init profiling and call aclprofInit function.";
profiling_options_ = profiling_options;
profile_data_path_ = profiling_path;
device_id_ = device_id;
(void)ProfilingManager::GetInstance().InitProfiling(profiling_path, device_id);
aclError aclRet = aclprofInit(profile_data_path_.c_str(), profile_data_path_.length());
if (aclRet != ACL_SUCCESS) {
MS_LOG(EXCEPTION) << "Failed to call aclprofInit function.";
}
}
uint64_t AscendProfiler::GetOptionsMask() const {
uint64_t mask = ACL_PROF_ACL_API | ACL_PROF_AICORE_METRICS;
nlohmann::json options_json;
try {
options_json = nlohmann::json::parse(profiling_options_);
} catch (const std::exception &err) {
MS_LOG(ERROR) << "Failed to parse profiling options.";
return ACL_AICORE_NONE;
}
if (options_json["task_trace"] == "on") {
mask |= ACL_PROF_TASK_TIME;
}
if (options_json["aicpu"] == "on") {
mask |= ACL_PROF_AICPU;
}
return mask;
}
// Resolves the "aic_metrics" entry of the stored JSON options string to an ACL AI Core metrics value.
//
// Returns ACL_AICORE_NONE when the options cannot be parsed (an error is logged), when "aic_metrics" is
// absent or not a string, or when its value is not a key of kAicMetrics.
aclprofAicoreMetrics AscendProfiler::GetAicMetrics() const {
  nlohmann::json options_json;
  try {
    options_json = nlohmann::json::parse(profiling_options_);
  } catch (const std::exception &) {
    MS_LOG(ERROR) << "Failed to parse profiling options.";
    return ACL_AICORE_NONE;
  }

  // Read the option once; find() does not insert and does not throw on non-object documents.
  const auto option = options_json.find("aic_metrics");
  if (option == options_json.end() || !option->is_string()) {
    return ACL_AICORE_NONE;
  }

  // Keyed lookup instead of a linear scan over the map.
  const auto metric = kAicMetrics.find(option->get<std::string>());
  if (metric == kAicMetrics.end()) {
    return ACL_AICORE_NONE;
  }
  return metric->second;
}
void AscendProfiler::Start() {
uint32_t device_list[1] = {device_id_};
uint32_t device_num = 1;
uint64_t mask = GetOptionsMask();
aclprofAicoreMetrics aic_metrics = GetAicMetrics();
acl_config_ = aclprofCreateConfig(device_list, device_num, aic_metrics, nullptr, GetOptionsMask());
if (acl_config_ == nullptr) {
MS_LOG(EXCEPTION) << "Failed to call aclprofCreateConfig function.";
}
aclError aclRet = aclprofStart(acl_config_);
if (aclRet != ACL_SUCCESS) {
MS_LOG(EXCEPTION) << "Failed to call aclprofStart function.";
}
MS_LOG(INFO) << "Start profiling, options mask is " << mask << " aic_metrics is " << aic_metrics;
StepProfilingEnable(true);
}
// Stops the running ACL profiling session and destroys its config.
//
// Raises through MS_LOG(EXCEPTION) when no config exists (Start was not called), or when aclprofStop or
// aclprofDestroyConfig fails. On success the step-profiling flag is disabled.
void AscendProfiler::Stop() {
  MS_LOG(INFO) << "Stop profiling";
  MS_LOG(INFO) << "Begin to stop profiling.";
  if (acl_config_ == nullptr) {
    MS_LOG(EXCEPTION)
      << "Failed to stop profiling because of null acl config.Please make sure call Profiler.Start function "
         "before call Profiler.Stop function.";
  }

  aclError aclRet = aclprofStop(acl_config_);
  if (aclRet != ACL_SUCCESS) {
    MS_LOG(EXCEPTION) << "Failed to call aclprofStop function.";
  }

  aclRet = aclprofDestroyConfig(acl_config_);
  // Drop the handle right after the destroy call so a later Stop() hits the nullptr check above
  // instead of passing an already destroyed config to ACL.
  acl_config_ = nullptr;
  if (aclRet != ACL_SUCCESS) {
    MS_LOG(EXCEPTION) << "Failed to call aclprofDestroyConfig function.";
  }

  StepProfilingEnable(false);
}
// Finalizes the ACL profiler by calling aclprofFinalize.
// Raises through MS_LOG(EXCEPTION) when the call does not return ACL_SUCCESS.
void AscendProfiler::Finalize() const {
  MS_LOG(INFO) << "Begin to finalize profiling";
  aclError aclRet = aclprofFinalize();
  if (aclRet != ACL_SUCCESS) {
    // The message names the function that actually failed (it previously said aclprofDestroyConfig).
    MS_LOG(EXCEPTION) << "Failed to call aclprofFinalize function.";
  }
}
// Registers the Python binding class "AscendProfiler" (held by std::shared_ptr) with a static
// "get_instance" accessor and the methods listed in the .def() chain below.
// NOTE(review): this chain lists "start"/"stop" twice and has a ';' in the middle of the chain, which
// looks like old and new binding lines shown side by side — confirm which set is current.
REGISTER_PYBIND_DEFINE(AscendProfiler_, ([](const py::module *m) {
(void)py::class_<AscendProfiler, std::shared_ptr<AscendProfiler>>(*m, "AscendProfiler")
.def_static("get_instance", &AscendProfiler::GetInstance, "AscendProfiler get_instance.")
.def("start", &AscendProfiler::Start, py::arg("profiling_options"), "start")
.def("stop", &AscendProfiler::Stop, "stop");
.def("init", &AscendProfiler::InitProfiling, py::arg("profiling_path"), py::arg("device_id"),
py::arg("profiling_options"), "init")
.def("start", &AscendProfiler::Start, "start")
.def("stop", &AscendProfiler::Stop, "stop")
.def("finalize", &AscendProfiler::Finalize, "finalize");
}));
} // namespace ascend
} // namespace profiler

View File

@ -18,6 +18,7 @@
#include <string>
#include <memory>
#include "profiler/device/profiling.h"
#include "acl/acl_prof.h"
namespace mindspore {
namespace profiler {
@ -30,18 +31,24 @@ class AscendProfiler : public Profiler {
AscendProfiler(const AscendProfiler &) = delete;
AscendProfiler &operator=(const AscendProfiler &) = delete;
void Init(const std::string &profileDataPath) { return; }
void InitProfiling(const std::string &profiling_path, uint32_t device_id, const std::string &profiling_options);
void Stop();
void StepProfilingEnable(const bool enable_flag) override;
void OpDataProducerEnd() { return; }
void Start(const std::string &profiling_options);
void Start();
bool GetProfilingEnableFlag() const { return enable_flag_; }
std::string GetProfilingOptions() const { return profiling_options_; }
void SaveProfileData() { return; }
void ClearInst() { return; }
uint64_t GetOptionsMask() const;
aclprofAicoreMetrics GetAicMetrics() const;
void Finalize() const;
private:
static std::shared_ptr<AscendProfiler> ascend_profiler_;
std::string profiling_options_;
// Device id passed to aclprofCreateConfig in Start(); assigned by InitProfiling().
uint32_t device_id_{0};
// ACL profiling config created in Start(). Stop() compares it against nullptr to detect a missing
// Start(), so it must begin as nullptr rather than an indeterminate value.
aclprofConfig *acl_config_{nullptr};
};
} // namespace ascend
} // namespace profiler

View File

@ -244,7 +244,7 @@ void AsyncDataDumpUninit() {
void AscendKernelRuntime::ReportProfilingData() {
auto context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context);
if (ProfilingManager::GetInstance().IsProfiling() &&
if (ProfilingManager::GetInstance().IsProfilingStart() &&
context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) {
// Save Profiling Framework data
OpNameTaskStreamReporter reporter(device_id_, "nonsink", stream_id_task_id_op_name_map_);
@ -295,9 +295,6 @@ void AscendKernelRuntime::ReleaseDeviceRes() {
}
(void)ResetDevice(device_id);
#ifndef ENABLE_SECURITY
(void)ProfilingManager::GetInstance().StopProfiling();
#endif
current_graph_ = nullptr;
if (context_ptr->get_param<int>(MS_CTX_EXECUTION_MODE) == kGraphMode &&
!context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK)) {
@ -313,14 +310,6 @@ void AscendKernelRuntime::PreInit() {
if (error_manager_ret != 0) {
MS_LOG(WARNING) << "Init ErrorManager failed.";
}
auto ret = ProfilingManager::GetInstance().StartupProfiling(device_id_);
if (!ret) {
const string &error_message = ErrorManager::GetInstance().GetErrorMessage();
if (!error_message.empty() && error_message.find(kUnknowErrorString) == string::npos) {
MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message;
}
MS_EXCEPTION(DeviceProcessError) << "StartupProfiling failed.";
}
}
#endif
@ -567,10 +556,18 @@ bool AscendKernelRuntime::LoadTask(const session::KernelGraph &graph) {
}
#ifndef ENABLE_SECURITY
if (ProfilingManager::GetInstance().IsProfiling()) {
if (ProfilingManager::GetInstance().IsProfilingInitialized()) {
auto task_ids = ModelRunner::Instance().GetTaskIdList(model_iter->first);
auto stream_ids = ModelRunner::Instance().GetStreamIdList(model_iter->first);
ProfilingUtils::ReportProfilingData(task_ids, stream_ids, graph);
// Report data directly if profiling is start
if (ProfilingUtils::ValidComputeGraph(graph)) {
if (ProfilingManager::GetInstance().IsProfilingStart()) {
ProfilingUtils::ReportProfilingData(task_ids, stream_ids, graph.graph_id());
} else {
// Cache data and save when profiling is start
ProfilingUtils::SetReportProfilingData(task_ids, stream_ids, graph.graph_id());
}
}
}
LaunchDataDump(graph.graph_id());
#endif

View File

@ -25,6 +25,9 @@
#include "utils/convert_utils.h"
#include "runtime/base.h"
#include <nlohmann/json.hpp>
#include "runtime/device/ascend/profiling/profiling_utils.h"
using mindspore::device::ascend::ProfilingUtils;
namespace {
constexpr Status PROF_SUCCESS = 0;
@ -39,7 +42,8 @@ ProfilingManager &ProfilingManager::GetInstance() {
return inst;
}
ProfilingManager::ProfilingManager() : device_id_(0), prof_cb_({0}), hccl_enabled_bef_profiling_enabled_(false) {}
// Starts in the kProfilingInvalid state with an empty profiling path and no ACL config.
// acl_config_ is initialized explicitly so the pointer member never holds an indeterminate value.
ProfilingManager::ProfilingManager()
    : device_id_(0), prof_cb_({0}), acl_config_(nullptr), cur_state_(kProfilingInvalid), profiling_path_("") {}
// Returns the profiling job id; this implementation always yields 0.
uint64_t ProfilingManager::GetJobId() const {
  return 0;
}
@ -110,57 +114,15 @@ Status ProfilingManager::GetProfConf(const NotNull<MsprofGeOptions *> prof) {
return PROF_SUCCESS;
}
bool ProfilingManager::StartupProfiling(uint32_t device_id) {
auto is_profiling = IsProfiling();
if (!is_profiling) {
int32_t cb_ret = MsprofInit(0XFF, nullptr, 0);
if (cb_ret != UintToInt(PROF_SUCCESS)) {
MS_LOG(ERROR) << "Call msprofCtrlCallback failed, ret: " << cb_ret;
return false;
}
MS_LOG(INFO) << "No need profiling. please export PROFILING_MODE and in train mode.";
return true;
}
if (hccl_enabled_bef_profiling_enabled_) {
MS_LOG(ERROR)
<< "Please check the Profiler object initialized before mindspore.context.set_auto_parallel_context() "
"and mindspore.communication.management.init(). Profiler should be initialized before these code.";
return false;
}
// Stores the profiling output path and device id on the manager, then fills a MsprofGeOptions structure
// through GetProfConf and passes it to ProfStartUp.
// Returns false (after logging an error) when either of those two steps fails, true otherwise.
bool ProfilingManager::InitProfiling(const std::string &profiling_path, uint32_t device_id) {
profiling_path_ = profiling_path;
device_id_ = device_id;
// Zero-initialize the options structure before GetProfConf fills it in.
struct MsprofGeOptions prof_conf = {0};
if (GetProfConf(NOT_NULL(&prof_conf)) != PROF_SUCCESS) {
MS_LOG(ERROR) << "Get prof conf failed.";
return false;
}
if (!ProfStartUp(NOT_NULL(&prof_conf))) {
MS_LOG(ERROR) << "ProfMgrStartUp failed.";
return false;
}
return true;
}
bool ProfilingManager::ProfStartUp(const NotNull<MsprofGeOptions *> prof_conf) const {
MS_LOG(INFO) << "Prof start up. ";
bool ret = ProfRegisterCtrlCallback();
if (ret == false) {
return ret;
}
// call profiling start up api
int32_t cb_ret = MsprofInit(static_cast<uint32_t>(MsprofCtrlCallbackType::MSPROF_CTRL_INIT_GE_OPTIONS),
static_cast<void *>(prof_conf.get()), sizeof(MsprofGeOptions));
if (cb_ret != UintToInt(PROF_SUCCESS)) {
MS_LOG(ERROR) << "Call msprofCtrlCallback failed, ret: " << cb_ret;
return false;
}
MS_LOG(INFO) << "Start up profiling success.";
return true;
}
@ -188,25 +150,6 @@ rtError_t CtrlCallbackHandle(uint32_t rt_type, void *data, uint32_t /* len */) {
return RT_ERROR_NONE;
}
// Unregisters the profiling plugin and calls MsprofFinalize when profiling is enabled.
// Returns true when profiling is not enabled or MsprofFinalize returns 0, false otherwise.
bool ProfilingManager::StopProfiling() const {
  MS_LOG(INFO) << "StopProfiling";

  const bool profiling_enabled = IsProfiling();
  if (!profiling_enabled) {
    MS_LOG(INFO) << "No need profiling. please export PROFILING_MODE and in train mode.";
    return true;
  }

  // plugin unregister
  PluginUnInit();

  // stop profiling
  const int32_t finalize_ret = MsprofFinalize();
  const bool finalized = (finalize_ret == 0);
  if (!finalized) {
    MS_LOG(WARNING) << "Call MsprofFinalize failed, ret: " << finalize_ret;
  }
  return finalized;
}
Status ProfilingManager::CallMsprofReport(const NotNull<ReporterData *> reporter_data) const {
if (prof_cb_.msprofReporterCallback == nullptr) {
MS_LOG(ERROR) << "MsprofReporterCallback callback is nullptr.";
@ -224,6 +167,58 @@ Status ProfilingManager::CallMsprofReport(const NotNull<ReporterData *> reporter
return PROF_SUCCESS;
}
// Handles the "init" profiling command: moves the state to kProfilingInit and initializes the plugin.
// Returns PROF_FAILED when PluginInit does not report PROF_SUCCESS.
Status ProfilingManager::ProfHandleInit() {
  const ProfilingState previous_state = cur_state_;
  MS_LOG(INFO) << "Begin to init profiling. Current profiling state is " << previous_state;
  cur_state_ = kProfilingInit;

  const auto plugin_status = ProfilingManager::GetInstance().PluginInit();
  if (plugin_status == PROF_SUCCESS) {
    return PROF_SUCCESS;
  }
  MS_LOG(ERROR) << "Failed to init profiling.";
  return PROF_FAILED;
}
// Handles the "start" profiling command: moves the state to kProfilingStart and reports any graph
// profiling data that was cached beforehand. Always returns PROF_SUCCESS.
Status ProfilingManager::ProfHandleStart() {
  const ProfilingState previous_state = cur_state_;
  MS_LOG(INFO) << "Begin to start profiling. Current profiling state is " << previous_state;
  cur_state_ = kProfilingStart;

  // Report graph data if there is any cache data.
  ProfilingUtils::ReportAllGraphProfilingData();

  return PROF_SUCCESS;
}
// Handles the "stop" profiling command: logs the current state and moves it to kProfilingStop.
// Always returns PROF_SUCCESS.
Status ProfilingManager::ProfHandleStop() {
  const ProfilingState previous_state = cur_state_;
  MS_LOG(INFO) << "Begin to stop profiling. Current profiling state is " << previous_state;

  cur_state_ = kProfilingStop;
  return PROF_SUCCESS;
}
// Handles the "finalize" profiling command: moves the state to kProfilingFinalize and un-initializes
// the plugin. Always returns PROF_SUCCESS.
Status ProfilingManager::ProfHandleFinalize() {
  const ProfilingState previous_state = cur_state_;
  MS_LOG(INFO) << "Begin to finalize profiling. Current profiling state is " << previous_state;

  cur_state_ = kProfilingFinalize;
  ProfilingManager::GetInstance().PluginUnInit();
  return PROF_SUCCESS;
}
// Dispatches a profiling command to the matching ProfHandle* member.
//
// type: command received from the profiling control callback.
// Returns the handler's status for Init/Start/Stop/Finalize; any other command is logged as an error
// and yields PROF_FAILED.
Status ProfilingManager::ProfCommandHandle(ProfCommandHandleType type) {
  // Only "Init", "Start", "Stop" and "Finalize" need to be processed.
  switch (type) {
    case kProfCommandhandleInit:
      return ProfHandleInit();
    case kProfCommandhandleStart:
      return ProfHandleStart();
    case kProfCommandhandleStop:
      return ProfHandleStop();
    case kProfCommandhandleFinalize:
      return ProfHandleFinalize();
    default:
      break;
  }
  // The message no longer contains a literal "<< " that had slipped inside the string.
  MS_LOG(ERROR) << "Receive invalid profiling type " << type << ". Current profiling state is " << cur_state_;
  return PROF_FAILED;
}
Status ProfCtrlSwitchHandle(void *data) {
if (data == nullptr) {
MS_LOG(ERROR) << "Ctrl switch handl data is nullptr.";
@ -235,18 +230,7 @@ Status ProfCtrlSwitchHandle(void *data) {
return ProfCommandHandle(type);
}
Status ProfCommandHandle(ProfCommandHandleType type) {
MS_LOG(INFO) << "ProfCommandHandle start, type:" << type;
if (type == kProfCommandhandleInit) {
auto cb_ret = ProfilingManager::GetInstance().PluginInit();
if (cb_ret != PROF_SUCCESS) {
MS_LOG(ERROR) << "Profiling plugin int failed.";
return PROF_FAILED;
}
}
return PROF_SUCCESS;
}
// Free-function entry point that forwards the command to the ProfilingManager singleton.
Status ProfCommandHandle(ProfCommandHandleType type) {
  ProfilingManager &manager = ProfilingManager::GetInstance();
  return manager.ProfCommandHandle(type);
}
} // namespace ascend
} // namespace device
} // namespace mindspore

View File

@ -28,6 +28,7 @@
#include "toolchain/slog.h"
#include "runtime/base.h"
#include "profiler/device/profiling.h"
#include "acl/acl_prof.h"
using std::map;
using std::string;
@ -50,19 +51,16 @@ enum ProfCommandHandleType {
kProfCommandhandleModelUnsubscribe
};
// Lifecycle states of the profiler, in increasing order. Code elsewhere compares these values with
// >=, so the numeric order is significant.
enum ProfilingState {
  kProfilingInvalid = 0,
  kProfilingInit = 1,
  kProfilingStart = 2,
  kProfilingStop = 3,
  kProfilingFinalize = 4
};
class ProfilingManager {
public:
static ProfilingManager &GetInstance();
uint64_t GetJobId() const;
bool ProfRegisterCtrlCallback() const;
bool StartupProfiling(uint32_t device_id);
bool StopProfiling() const;
inline bool IsProfiling() const {
auto profiler_manager = profiler::ProfilerManager::GetInstance();
MS_EXCEPTION_IF_NULL(profiler_manager);
return profiler_manager->GetProfilingEnableFlag();
}
bool InitProfiling(const std::string &profiling_path, uint32_t device_id);
// True once the state is kProfilingInit or any later enumerator (Start, Stop, Finalize).
bool IsProfilingInitialized() const { return cur_state_ >= kProfilingInit; }
// True once the state is kProfilingStart or any later enumerator.
// NOTE(review): because of the >= comparison this also returns true in the kProfilingStop and
// kProfilingFinalize states — confirm that is intended.
inline bool IsProfilingStart() const { return cur_state_ >= kProfilingStart; }
Status PluginInit() const;
void PluginUnInit() const;
Status CallMsprofReport(NotNull<ReporterData *> reporter_data) const;
@ -71,17 +69,22 @@ class ProfilingManager {
void SetMsprofReporterCallback(MsprofReporterCallback func) { prof_cb_.msprofReporterCallback = func; }
void SetMsprofSetDeviceCallback(MsprofSetDeviceCallback func) { prof_cb_.msprofSetDeviceCallback = func; }
Status GetProfConf(NotNull<MsprofGeOptions *> prof);
void SetHcclEnabledBefProfilingEnabled() { hccl_enabled_bef_profiling_enabled_ = true; }
Status ProfCommandHandle(ProfCommandHandleType type);
Status ProfHandleInit();
Status ProfHandleStart();
Status ProfHandleStop();
Status ProfHandleFinalize();
protected:
ProfilingManager();
~ProfilingManager() {}
private:
bool ProfStartUp(NotNull<MsprofGeOptions *> prof_conf) const;
uint32_t device_id_;
MsprofCallback prof_cb_;
bool hccl_enabled_bef_profiling_enabled_;
aclprofConfig *acl_config_;
ProfilingState cur_state_;
std::string profiling_path_;
};
Status ProfCommandHandle(ProfCommandHandleType type);

View File

@ -390,14 +390,15 @@ bool ProfilingUtils::ValidComputeGraph(const session::KernelGraph &kernel_graph)
return false;
}
void ProfilingUtils::ReportProfilingData(const std::vector<uint32_t> &task_ids, const std::vector<uint32_t> &stream_ids,
const session::KernelGraph &kernel_graph) {
if (!ValidComputeGraph(kernel_graph)) {
MS_LOG(INFO) << "Not a valid compute graph:" << kernel_graph.graph_id();
return;
// Reports every cached GraphProfilingData entry through ReportProfilingData.
// The cache itself is left unchanged.
void ProfilingUtils::ReportAllGraphProfilingData() {
  // Iterate by const reference: each entry owns two vectors, so copying per iteration is wasteful.
  for (const auto &data : report_data_) {
    ReportProfilingData(data.task_ids_, data.stream_ids_, data.graph_id_);
  }
}
auto ret = graph_profiling_cnode_.find(kernel_graph.graph_id());
void ProfilingUtils::ReportProfilingData(const std::vector<uint32_t> &task_ids, const std::vector<uint32_t> &stream_ids,
uint32_t graph_id) {
auto ret = graph_profiling_cnode_.find(graph_id);
if (ret == graph_profiling_cnode_.end()) {
MS_LOG(ERROR) << "Graph id not found";
return;
@ -415,7 +416,7 @@ void ProfilingUtils::ReportProfilingData(const std::vector<uint32_t> &task_ids,
graph_reporter.ReportData();
// Report profiling point
auto point_iter = graph_point_.find(kernel_graph.graph_id());
auto point_iter = graph_point_.find(graph_id);
if (point_iter == graph_point_.end()) {
MS_LOG(ERROR) << "Graph id not found in graph_point";
return;
@ -426,6 +427,12 @@ void ProfilingUtils::ReportProfilingData(const std::vector<uint32_t> &task_ids,
}
point_reporter.ReportData();
}
// Caches the task ids, stream ids and graph id of one graph so they can be reported later by
// ReportAllGraphProfilingData.
void ProfilingUtils::SetReportProfilingData(const std::vector<uint32_t> &task_ids,
                                            const std::vector<uint32_t> &stream_ids, uint32_t graph_id) {
  // Pass a temporary so the entry is moved into the cache instead of being copied a second time.
  report_data_.emplace_back(GraphProfilingData{task_ids, stream_ids, graph_id});
}
} // namespace ascend
} // namespace device
} // namespace mindspore

View File

@ -48,6 +48,12 @@ struct ProfilingContent {
uint32_t flags;
};
// Task ids, stream ids and graph id of one graph, bundled so they can be cached and reported later.
struct GraphProfilingData {
  std::vector<uint32_t> task_ids_;
  std::vector<uint32_t> stream_ids_;
  // Default member initializer keeps a default-constructed instance from holding an indeterminate id;
  // the struct remains an aggregate, so brace initialization with three values still works.
  uint32_t graph_id_{0};
};
class ProfilingUtils {
public:
ProfilingUtils() = default;
@ -69,7 +75,7 @@ class ProfilingUtils {
static void SetGraphKernelName(uint32_t graph_id, const std::vector<std::string> &kernel_names);
// Save graph information to Framework file
static void ReportProfilingData(const std::vector<uint32_t> &task_ids, const std::vector<uint32_t> &stream_ids,
const session::KernelGraph &graph);
uint32_t graph_id);
// Generate profiling trace
static ProfilingTraceInfo GenerateProfilingTrace(const session::KernelGraph &kernel_graph);
@ -81,6 +87,11 @@ class ProfilingUtils {
static std::map<uint32_t, std::vector<std::string>> graph_kernel_name() { return graph_kernel_name_; }
static void SetReportProfilingData(const std::vector<uint32_t> &task_ids, const std::vector<uint32_t> &stream_ids,
uint32_t graph_id);
static void ReportAllGraphProfilingData();
static bool ValidComputeGraph(const session::KernelGraph &kernel_graph);
inline static constexpr char kProfiling[] = "Profiling";
inline static constexpr char kNotify[] = "notify";
inline static constexpr char kProfilerTraceId[] = "profiler_trace_id";
@ -101,7 +112,6 @@ class ProfilingUtils {
static void GetCNodeOutputRealNode(const std::string &node_name, const session::KernelGraph &kernel_graph,
NotNull<std::set<std::string> *> getnext_outputs);
static bool ValidComputeGraph(const session::KernelGraph &kernel_graph);
static void SaveProfilingPoint(uint32_t graph_id, const std::string &node_name, uint32_t point_id);
// graph id --> (kernel name list)
@ -109,8 +119,9 @@ class ProfilingUtils {
inline static std::map<uint32_t, std::vector<std::string>> graph_kernel_name_;
inline static std::map<uint32_t, std::vector<std::shared_ptr<ProfDesc>>> graph_point_;
inline static uint32_t custom_node_index_;
inline static std::vector<GraphProfilingData> report_data_;
};
} // namespace ascend
} // namespace device
} // namespace mindspore
#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_PROFILING_PROFILING_UTILS_H_
#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_PROFILING_PROFILING_UTILS_H_

View File

@ -278,7 +278,7 @@ bool TaskGenerator::LaunchAllKernel(const std::vector<CNodePtr> &anf_node_list,
#ifndef ENABLE_SECURITY
ProfilingUtils::SetGraphKernelName(graph_id, kernel_name_list);
if (ProfilingManager::GetInstance().IsProfiling()) {
if (ProfilingManager::GetInstance().IsProfilingInitialized()) {
ProfilingUtils::SetGraphProfilingCNode(graph_id, profiling_cnode_list);
}
#endif

View File

@ -642,7 +642,7 @@ CNodePtr KernelAdjust::CreateStreamAssignAddnOP(const std::shared_ptr<session::K
#ifndef ENABLE_SECURITY
void KernelAdjust::Profiling(NotNull<session::KernelGraph *> kernel_graph_ptr) {
if (!ascend::ProfilingManager::GetInstance().IsProfiling()) {
if (!ascend::ProfilingManager::GetInstance().IsProfilingInitialized()) {
MS_LOG(INFO) << "No need to profiling";
return;
}

View File

@ -140,6 +140,8 @@ class Profiler:
self._get_devid_rankid_and_devtarget()
self._get_output_path(kwargs)
self._profile_communication = False
self._has_started = False
self.start_profile = True
# Setup and start MindData Profiling
self._md_profiler = cde.GlobalContext.profiling_manager()
@ -174,7 +176,7 @@ class Profiler:
raise ValueError(msg)
# use context interface to open profiling, for the new mindspore version(after 2020.5.21)
self._ascend_profiler = c_expression.AscendProfiler.get_instance()
self._ascend_profiler.start(profiling_options)
self._ascend_profiler.init(self._output_path, int(self._dev_id), profiling_options)
base_profiling_container_path = os.path.join(self._output_path, "container")
container_path = os.path.join(base_profiling_container_path, self._dev_id)
data_path = os.path.join(container_path, "data")
@ -184,8 +186,10 @@ class Profiler:
# add job id env through user input later
self._job_id_env = 0
self._start_time = int(time.time() * 10000000)
logger.info("Profiling: profiling start time: %d", self._start_time)
self._init_time = int(time.time() * 10000000)
logger.info("Profiling: profiling init time: %d", self._init_time)
if self.start_profile:
self.start()
def _construct_profiling_options(self):
"""
@ -225,7 +229,9 @@ class Profiler:
logger.critical(msg)
raise ValueError(msg)
self._output_path, _ = os.path.split(job_dir)
self.start_profile = kwargs.pop("start_profile", True)
if not isinstance(self.start_profile, bool):
raise TypeError("The parameter start_profile must be bool.")
self._profile_communication = kwargs.pop("profile_communication", False)
if not isinstance(self._profile_communication, bool):
raise TypeError("The parameter profile_communication must be bool.")
@ -270,6 +276,12 @@ class Profiler:
self._rank_size = get_group_size()
release()
# Stop the backend profiler only while it is actually running. The backend stop call raises when
# no profiling session exists, so it must not be issued when the profiler was created with
# start_profile=False and never started, or when it has already been stopped.
if self._has_started:
    self._ascend_profiler.stop()
    self._has_started = False
else:
    msg = "The profiler has not start, so can not stop."
    logger.info(msg)
self._ascend_profiler.finalize()
job_id = self._get_profiling_job_id()
logger.info("Profiling: job id is %s ", job_id)
@ -377,7 +389,30 @@ class Profiler:
self._dev_id, self._rank_id, is_training_mode_flag)
logger.info("Profiling: analyzing the operation FLOPs.")
flops_parser.execute()
def start(self):
    """
    Used for Ascend, start profiling.

    Raises:
        RuntimeError: If the profiler has already been started.
    """
    if self._has_started:
        msg = "The profiler has already started."
        logger.error(msg)
        raise RuntimeError(msg)

    self._ascend_profiler.start()
    # Mark the profiler as started only after the backend call returned, so a failed start
    # does not leave the flag claiming that a session is running.
    self._has_started = True
    self._start_time = int(time.time() * 10000000)
    logger.info("Profiling: start time: %d", self._start_time)
def stop(self):
    """
    Used for Ascend, stop profiling.

    Raises:
        RuntimeError: If the profiler has not been started.
    """
    if not self._has_started:
        msg = "The profiler has not start, so can not stop."
        logger.error(msg)
        raise RuntimeError(msg)

    self._has_started = False
    self._ascend_profiler.stop()
    self._stop_time = int(time.time() * 10000000)
    logger.info("Profiling: stop time: %d", self._stop_time)
def _gpu_analyse(self):
"""Collect and analyse gpu performance data"""
@ -573,8 +608,7 @@ class Profiler:
if int(job_start_time) < self._start_time:
logger.warning("Find profiling job path %s, but start_time(%d) is earlier than this training "
"start_time(%d), profiler will ignore this job dir.",
job_dir, job_start_time, self._start_time)
continue
job_dir, int(job_start_time), self._start_time)
job_id = dir_name
break

View File

@ -16,6 +16,7 @@
#include <string>
#include "prof_mgr_core.h"
#include "prof_callback.h"
#include "acl/acl_prof.h"
namespace Msprof {
namespace Engine {
@ -73,3 +74,19 @@ int32_t MsprofInit(uint32_t dataType, void *data, uint32_t dataLen) { return 0;
* @return 0:SUCCESS, >0:FAILED
*/
int32_t MsprofFinalize() { return 0; }
// Stub implementations of the ACL profiling functions. Each one ignores its arguments and returns a
// fixed value.
ACL_FUNC_VISIBILITY aclError aclprofInit(const char *profilerResultPath, size_t length) { return ACL_SUCCESS; }
ACL_FUNC_VISIBILITY aclError aclprofStart(const aclprofConfig *profilerConfig) { return ACL_SUCCESS; }
ACL_FUNC_VISIBILITY aclError aclprofStop(const aclprofConfig *profilerConfig) { return ACL_SUCCESS; }
ACL_FUNC_VISIBILITY aclError aclprofFinalize() { return ACL_SUCCESS; }
// This stub always returns nullptr.
// NOTE(review): AscendProfiler::Start treats a nullptr config as a failure, so code linked against this
// stub cannot complete Start() — confirm that is acceptable for the tests that use it.
ACL_FUNC_VISIBILITY aclprofConfig *aclprofCreateConfig(uint32_t *deviceIdList, uint32_t deviceNums,
aclprofAicoreMetrics aicoreMetrics,
aclprofAicoreEvents *aicoreEvents, uint64_t dataTypeConfig) {
return nullptr;
}
ACL_FUNC_VISIBILITY aclError aclprofDestroyConfig(const aclprofConfig *profilerConfig) { return ACL_SUCCESS; }