profiler support to collect parallel strategy info

If the SetNodeOutputType function were forcibly split into multiple functions, readability would decrease, so it is excluded from lizard scans instead.
This commit is contained in:
ougongchang 2021-10-25 20:01:54 +08:00
parent 4419883fa4
commit 9229f1c1ff
16 changed files with 406 additions and 39 deletions

View File

@ -92,4 +92,7 @@ mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/avx/PostF
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/avx/WinogradPostFuncBiasReluC8.c:WinogradPostFuncBiasReluC8
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/deconv_winograd_fp32.c:PackDeConvWgDataFp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/deconv_winograd_fp32.c:DeConvWgMerge
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/avx/TiledC8MatMulFp32.c:TiledC8MatmulFp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/avx/TiledC8MatMulFp32.c:TiledC8MatmulFp32
mindspore/mindspore/ccsrc/debug/dump_proto.cc:mindspore::ProtoExporter::SetNodeOutputType
mindspore/mindspore/ccsrc/debug/dump_proto.cc:mindspore::ProtoExporter::SetValueToProto
mindspore/mindspore/ccsrc/debug/dump_proto.cc:mindspore::ProtoExporter::SetScalarToProto

View File

@ -177,6 +177,7 @@ if(ENABLE_DUMP_PROTO)
"utils/checkpoint.proto"
"utils/print.proto"
"utils/node_strategy.proto"
"utils/profiling_parallel.proto"
)
ms_protobuf_generate_py(PY_SRCS PY_HDRS PY_PYS ${PROTO_PY})

View File

@ -37,6 +37,7 @@ class ProtoExporter {
~ProtoExporter() {}
std::string GetFuncGraphProtoString(const FuncGraphPtr &func_graph);
void ExportFuncGraph(const FuncGraphPtr &func_graph, irpb::GraphProto *graph_proto);
private:
void InitModelInfo();
@ -51,7 +52,6 @@ class ProtoExporter {
void SetNodeOutputType(const AnfNodePtr &node, irpb::TypeProto *type_proto);
void SetNodeOutputType(const TypePtr &node, const BaseShapePtr &shape, irpb::TypeProto *type_proto);
void ExportFuncGraph(const FuncGraphPtr &func_graph, irpb::GraphProto *graph_proto);
void ExportParameters(const FuncGraphPtr &func_graph, irpb::GraphProto *graph_proto);
void ExportCNodes(const FuncGraphPtr &func_graph, irpb::GraphProto *graph_proto,
std::map<AnfNodePtr, size_t> *const_map_ptr);
@ -340,6 +340,20 @@ void ProtoExporter::GetOpNodeTypeAndAttrs(const FuncGraphPtr &, const AnfNodePtr
}
const PrimitivePtr &prim = GetValueNode<PrimitivePtr>(node);
// set node parallel info
auto operator_info = node->user_data<parallel::OperatorInfo>();
if (operator_info != nullptr) {
auto strategy = operator_info->strategy();
if (strategy != nullptr) {
ValuePtr strategy_value = MakeValue(strategy->GetInputDim());
// display the strategy generated by batch parallel
auto attrs = prim->attrs();
attrs[mindspore::parallel::IN_STRATEGY] = strategy_value;
(void)prim->SetAttrs(attrs);
}
}
node_proto->set_op_type(prim->name());
for (const auto &attr : prim->attrs()) {
irpb::AttributeProto *attr_proto = node_proto->add_attribute();
@ -488,6 +502,13 @@ void ProtoExporter::ExportCNode(const FuncGraphPtr &func_graph, const CNodePtr &
// set node output type
SetNodeOutputType(node, node_proto->mutable_output_type());
if (IsValueNode<Primitive>(op)) {
PrimitivePtr primitive = GetValueNode<PrimitivePtr>(op);
if (!primitive->instance_name().empty()) {
node_proto->set_instance_name(primitive->instance_name());
}
}
}
}
@ -544,6 +565,11 @@ std::string GetFuncGraphProtoString(const FuncGraphPtr &func_graph) {
return exporter.GetFuncGraphProtoString(func_graph);
}
// Exports func_graph into the caller-provided GraphProto (used by
// parallel-strategy profiling to embed the graph in its dump).
void GetFuncGraphProto(const FuncGraphPtr &func_graph, irpb::GraphProto *graph_proto) {
  ProtoExporter exporter;
  exporter.ExportFuncGraph(func_graph, graph_proto);
}
#ifdef ENABLE_DUMP_IR
void DumpIRProto(const FuncGraphPtr &func_graph, const std::string &suffix) {
if (func_graph == nullptr) {

View File

@ -22,6 +22,7 @@
#include "ir/func_graph.h"
#include "proto/mind_ir.pb.h"
#include "debug/common.h"
#include "proto/anf_ir.pb.h"
namespace mindspore {
using ModelProtoPtr = std::shared_ptr<mind_ir::ModelProto>;
@ -34,6 +35,8 @@ std::string GetBinaryProtoString(const FuncGraphPtr &func_graph);
ModelProtoPtr GetBinaryProto(const FuncGraphPtr &func_graph, bool save_tensor_data = false);
void DumpIRProto(const FuncGraphPtr &func_graph, const std::string &suffix);
void GetFuncGraphProto(const FuncGraphPtr &func_graph, irpb::GraphProto *graph_proto);
} // namespace mindspore
#endif // MINDSPORE_CCSRC_DEBUG_DUMP_PROTO_H_

View File

@ -83,6 +83,8 @@ class DeviceManager {
GroupManager group_manager() const { return gm_; }
void set_group_manager(const GroupManager &gm) { gm_ = gm; }
std::vector<std::vector<int64_t>> stage_devices() const { return stage_devices_; }
void Clear();
std::string world_group() const { return gm_.world_group(); }
std::vector<std::pair<std::string, std::vector<uint32_t>>> group_info() const { return gm_.group_info(); }

View File

@ -58,6 +58,12 @@
#include "runtime/hardware/device_context_manager.h"
#include "runtime/device/kernel_runtime_manager.h"
#ifndef ENABLE_SECURITY
#ifdef ENABLE_D
#include "mindspore/ccsrc/profiler/device/ascend/parallel_strategy_profiling.h"
#endif
#endif
#if ((defined ENABLE_CPU) && (!defined _WIN32))
#include "ps/constants.h"
#include "ps/util.h"
@ -628,6 +634,13 @@ void GraphExecutorPy::SaveCompiledGraph(const std::string &phase) {
MS_LOG(INFO) << "Save compiled func graph(" << func_graph->ToString() << ") phase(" << phase << ")!";
info_[phase]->func_graph = func_graph;
#ifndef ENABLE_SECURITY
#ifdef ENABLE_D
profiler::ascend::DumpProfileParallelStrategy(func_graph);
#endif
#endif
if ((func_graph != nullptr) && func_graph->has_flag(parallel::AUTO_PARALLEL) &&
((parallel_mode == parallel::AUTO_PARALLEL) || (parallel_mode == parallel::SEMI_AUTO_PARALLEL))) {
MS_LOG(DEBUG) << "Save model parallel parameter layout graph!";

View File

@ -22,6 +22,7 @@
#include "utils/ms_utils.h"
#include "nlohmann/json.hpp"
#include "profiler/device/ascend/ascend_profiling.h"
#include "profiler/device/ascend/options.h"
namespace mindspore {
namespace profiler {
@ -117,41 +118,6 @@ bool MemoryProfiling::MemoryToPB() {
return true;
}
// Resolves the memory-profiling output directory from the profiler options JSON.
// Returns the canonical absolute path, or an empty string (after logging an
// error) when the "output" option is missing, too long, or not a real path.
std::string MemoryProfiling::GetOutputPath() const {
  auto ascend_profiler = AscendProfiler::GetInstance();
  MS_EXCEPTION_IF_NULL(ascend_profiler);
  const std::string options_str = ascend_profiler->GetProfilingOptions();
  nlohmann::json options_json;
  try {
    options_json = nlohmann::json::parse(options_str);
  } catch (nlohmann::json::parse_error &e) {
    MS_LOG(EXCEPTION) << "Parse profiling option json failed, error:" << e.what();
  }
  auto iter = options_json.find(kOutputPath);
  if (iter != options_json.end() && iter->is_string()) {
    char real_path[PATH_MAX] = {0};
    // Reject paths that cannot fit in the canonicalization buffer.
    if ((*iter).size() >= PATH_MAX) {
      MS_LOG(ERROR) << "Path is invalid for memory profiling.";
      return "";
    }
#if defined(_WIN32) || defined(_WIN64)
    // _fullpath is the Windows counterpart of POSIX realpath().
    if (_fullpath(real_path, common::SafeCStr(*iter), PATH_MAX) == nullptr) {
      MS_LOG(ERROR) << "Path is invalid for memory profiling.";
      return "";
    }
#else
    // realpath also verifies that the directory actually exists.
    if (realpath(common::SafeCStr(*iter), real_path) == nullptr) {
      MS_LOG(ERROR) << "Path is invalid for memory profiling.";
      return "";
    }
#endif
    return real_path;
  }

  MS_LOG(ERROR) << "Output path is not found when save memory profiling data";
  return "";
}
void MemoryProfiling::SaveMemoryProfiling() {
auto context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context);

View File

@ -115,8 +115,6 @@ class MemoryProfiling {
void SaveMemoryProfiling();
private:
std::string GetOutputPath() const;
MemoryProto memory_proto_;
std::map<uint32_t, std::shared_ptr<GraphMemory>> graph_memory_;
uint64_t device_mem_size_;

View File

@ -0,0 +1,64 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <string>
#include <climits>
#include "profiler/device/ascend/options.h"
#include "utils/ms_context.h"
#include "debug/common.h"
#include "profiler/device/ascend/ascend_profiling.h"
constexpr char kOutputPath[] = "output";
namespace mindspore {
namespace profiler {
namespace ascend {
// Resolves the profiling output directory from the profiler options JSON.
// Returns the canonical absolute path, or an empty string (after logging an
// error) when the "output" option is missing, too long, or not a real path.
std::string GetOutputPath() {
  auto ascend_profiler = AscendProfiler::GetInstance();
  MS_EXCEPTION_IF_NULL(ascend_profiler);
  const std::string options_str = ascend_profiler->GetProfilingOptions();
  nlohmann::json options_json;
  try {
    options_json = nlohmann::json::parse(options_str);
  } catch (nlohmann::json::parse_error &e) {
    MS_LOG(EXCEPTION) << "Parse profiling option json failed, error:" << e.what();
  }
  auto iter = options_json.find(kOutputPath);
  if (iter != options_json.end() && iter->is_string()) {
    char real_path[PATH_MAX] = {0};
    // Reject paths that cannot fit in the canonicalization buffer.
    if ((*iter).size() >= PATH_MAX) {
      MS_LOG(ERROR) << "Path is invalid for profiling.";
      return "";
    }
#if defined(_WIN32) || defined(_WIN64)
    // Fixed copy-paste from memory_profiling.cc: message said "memory profiling".
    if (_fullpath(real_path, common::SafeCStr(*iter), PATH_MAX) == nullptr) {
      MS_LOG(ERROR) << "Path is invalid for profiling.";
      return "";
    }
#else
    // realpath also verifies that the directory actually exists.
    if (realpath(common::SafeCStr(*iter), real_path) == nullptr) {
      MS_LOG(ERROR) << "Path is invalid for profiling.";
      return "";
    }
#endif
    return real_path;
  }

  MS_LOG(ERROR) << "Output path is not found when save profiling data";
  return "";
}
} // namespace ascend
} // namespace profiler
} // namespace mindspore

View File

@ -0,0 +1,31 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_OPTIONS_H
#define MINDSPORE_OPTIONS_H
#include <string>
#include "nlohmann/json.hpp"
namespace mindspore {
namespace profiler {
namespace ascend {
std::string GetOutputPath();
nlohmann::json GetContextProfilingOption();
} // namespace ascend
} // namespace profiler
} // namespace mindspore
#endif // MINDSPORE_OPTIONS_H

View File

@ -0,0 +1,151 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "profiler/device/ascend/parallel_strategy_profiling.h"
#include <cerrno>
#include <cstring>
#include <fstream>
#include <stdexcept>
#include <string>
#include <vector>
#include "sys/stat.h"
#include "debug/dump_proto.h"
#include "frontend/parallel/context.h"
#include "frontend/parallel/device_manager.h"
#include "profiler/device/ascend/ascend_profiling.h"
#include "profiler/device/ascend/options.h"
#include "proto/profiling_parallel.pb.h"
#include "utils/ms_context.h"
#include "utils/utils.h"
#if ((defined ENABLE_CPU) && (!defined _WIN32))
#include "ps/ps_context.h"
#include "ps/util.h"
#endif
namespace mindspore {
namespace profiler {
namespace ascend {
// Whether parallel-strategy profiling should run: profiling must be switched
// on, the process must not be a PS server/scheduler, and the parallel mode
// must be one of data/semi-auto/auto parallel.
bool IsProfilingParallelStrategyEnabled() {
  auto profiler_instance = AscendProfiler::GetInstance();
  MS_EXCEPTION_IF_NULL(profiler_instance);
  if (!profiler_instance->GetProfilingEnableFlag()) {
    return false;
  }

#if ((defined ENABLE_CPU) && (!defined _WIN32))
  // Parameter-server roles never own a training graph worth dumping.
  if (ps::PSContext::instance()->is_server() || ps::PSContext::instance()->is_scheduler()) {
    MS_LOG(INFO) << "Current is ps server or ps scheduler, profiling parallel "
                    "strategy is disabled.";
    return false;
  }
#endif

  const std::string mode = parallel::ParallelContext::GetInstance()->parallel_mode();
  const bool mode_supported =
    (mode == parallel::AUTO_PARALLEL) || (mode == parallel::SEMI_AUTO_PARALLEL) || (mode == parallel::DATA_PARALLEL);
  if (!mode_supported) {
    MS_LOG(INFO) << "Profiling parallel strategy is disabled, current parallel mode is " << mode;
  }
  return mode_supported;
}
bool StringToInt(std::string *str, int32_t *value) {
try {
*value = stoi(*str);
} catch (std::invalid_argument &) {
MS_LOG(ERROR) << "Catch invalid_argument, invalid of digit string: " << *str;
return false;
}
return true;
}
// Serializes func_graph plus the parallel configuration (parallel mode,
// rank id, stage id, stage-device mapping) into a binary
// irpb::ProfilingParallel string.
std::string GetProfilingParallelString(const FuncGraphPtr &func_graph) {
  irpb::ProfilingParallel profiling_parallel;
  irpb::GraphProto *graph_proto = profiling_parallel.mutable_graph();
  MS_EXCEPTION_IF_NULL(graph_proto);
  GetFuncGraphProto(func_graph, graph_proto);

  // set parallel mode
  std::string parallel_mode = parallel::ParallelContext::GetInstance()->parallel_mode();
  irpb::Config *config = profiling_parallel.mutable_config();
  MS_EXCEPTION_IF_NULL(config);
  config->set_parallel_type(parallel_mode);

  // Note: Only parallel mode is AUTO_PARALLEL or SEMI_AUTO_PARALLEL, the
  // g_device_manager is not nullptr;
  if (parallel::g_device_manager != nullptr) {
    auto rank_id = parallel::g_device_manager->global_rank();
    auto stage_id = parallel::g_device_manager->stage_id();
    config->set_rank_id(rank_id);
    config->set_stage_id(stage_id);

    // set stage_devices: one TensorShapeProto per stage, each dim is a device id
    for (std::vector<int64_t> devices : parallel::g_device_manager->stage_devices()) {
      irpb::TensorShapeProto *stage_devices = config->add_stage_devices();
      MS_EXCEPTION_IF_NULL(stage_devices);
      for (int64_t device : devices) {
        stage_devices->add_dim()->set_size(device);
      }
    }
  } else {
    // No device manager: fall back to the RANK_ID environment variable.
    auto rank_id = common::GetEnv("RANK_ID");
    // If RANK_ID is not set, default value is 0
    if (rank_id.empty()) {
      rank_id = "0";
      // Fixed: the original used one raw string literal R"(...)" spanning three
      // lines, so stray quotes and line breaks leaked into the logged message.
      MS_LOG(WARNING) << "Can not find RANK_ID in environment. This affects profiling to "
                         "collect rank ID data and parallel strategy data. Please execute "
                         "'export RANK_ID=RANK_ID' in environment.";
    }
    int32_t rank_id_int = 0;
    bool ret = StringToInt(&rank_id, &rank_id_int);
    if (!ret) {
      MS_LOG(EXCEPTION) << "The given RANK_ID is an invalid digit string.";
    }
    config->set_rank_id(rank_id_int);
  }

  return profiling_parallel.SerializeAsString();
}
// Dumps the parallel strategy of func_graph to
// <output_path>/parallel_strategy_<rank_id>.pb when parallel-strategy
// profiling is enabled; the file is made owner-read-only afterwards.
void DumpProfileParallelStrategy(const FuncGraphPtr &func_graph) {
  if (!IsProfilingParallelStrategyEnabled()) {
    return;
  }

  MS_LOG(INFO) << "Start to DumpProfileParallelStrategy.";

  // Note: removed a dead MsContext::GetInstance() fetch — the context was
  // null-checked but never used.
  std::string dir_path = GetOutputPath();
  auto rank_id = common::GetEnv("RANK_ID");
  // If RANK_ID is not set, default value is 0
  if (rank_id.empty()) {
    rank_id = "0";
  }
  std::string file_path = dir_path + std::string("/parallel_strategy_") + std::string(rank_id) + std::string(".pb");

  MS_LOG(INFO) << "Start to write parallel strategy string, file path is " << file_path;
  std::ofstream ofs(file_path);
  if (!ofs.is_open()) {
    MS_LOG(ERROR) << "Open file '" << file_path << "' failed!"
                  << " Errno:" << errno << " ErrInfo:" << strerror(errno);
    return;
  }

  ofs << GetProfilingParallelString(func_graph);
  ofs.close();
  // Owner read-only: the dumped strategy must not be modified afterwards.
  ChangeFileMode(file_path, S_IRUSR);

  MS_LOG(INFO) << "Save profile parallel strategy success.";
}
} // namespace ascend
} // namespace profiler
} // namespace mindspore

View File

@ -0,0 +1,32 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_PARALLEL_STRATEGY_PROFILING_H
#define MINDSPORE_PARALLEL_STRATEGY_PROFILING_H
#include <string>
#include "ir/func_graph.h"
#include "base/base.h"
namespace mindspore {
namespace profiler {
namespace ascend {
void DumpProfileParallelStrategy(const FuncGraphPtr &func_graph);
} // namespace ascend
} // namespace profiler
} // namespace mindspore
#endif // MINDSPORE_PARALLEL_STRATEGY_PROFILING_H

View File

@ -233,6 +233,11 @@ message NodeProto {
// The full_name_with_scope of CNode
optional string full_name = 8;
// Note: Id 9 is reserved for the source_address field of the debugger, please see debug_graph.proto
// The same as the instance name field in the IR file.
optional string instance_name = 10;
}
// Models

View File

@ -0,0 +1,45 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// The anf_ir.proto is `proto2` syntax, so this file use `proto2`, else will occur a compile error.
syntax = "proto2";
package mindspore.irpb;
option cc_enable_arenas = true;
import "anf_ir.proto";
// Parallel strategy information collected for one device (rank).
message ProfilingParallel {
// Format version of this message.
optional string version = 1;
// Parallel training configuration of the current device.
optional Config config = 2;
// The computation graph, with per-node parallel strategy attributes.
optional GraphProto graph = 3;
}
// Parallel training configuration of the current device.
message Config {
// Global rank id of the current device.
optional uint32 rank_id = 1;
// The stage of current device
optional uint32 stage_id = 2;
// One of stand_alone/data_parallel/semi_auto_parallel/auto_parallel/hybrid_parallel
optional string parallel_type = 3;
// The global stages and devices info, ex. 2 stages: [[0,1,2,3], [4,5,6,7]]
// If the training mode is not pipeline parallel training, a single stage: [[0,1,2,3,4,5,6,7]]
repeated TensorShapeProto stage_devices = 4;
}

View File

@ -206,8 +206,10 @@ if(ENABLE_SECURITY)
list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/profiler/device/profiling.cc")
list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/profiler/device/ascend/memory_profiling.cc")
list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/profiler/device/ascend/ascend_profiling.cc")
list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/profiler/device/ascend/options.cc")
list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/debug/data_dump/dump_json_parser.cc")
endif()
list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/profiler/device/ascend/parallel_strategy_profiling.cc")
add_library(_ut_mindspore_obj OBJECT ${MINDSPORE_SRC_LIST})
add_library(_ut_ut_obj OBJECT ${UT_SRCS})

View File

@ -0,0 +1,25 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "profiler/device/ascend/parallel_strategy_profiling.h"
namespace mindspore {
namespace profiler {
namespace ascend {
// Unit-test stub: dumping the parallel strategy is a no-op here.
// Fixed: removed the stray ';' after the body (an empty declaration) and the
// unused parameter name, which triggered compiler warnings.
void DumpProfileParallelStrategy(const FuncGraphPtr &) {}
} // namespace ascend
} // namespace profiler
} // namespace mindspore