diff --git a/mindspore/ccsrc/frontend/parallel/graph_util/graph_splitter.cc b/mindspore/ccsrc/frontend/parallel/graph_util/graph_splitter.cc
index 46f9dd82bad..bf124f143f9 100644
--- a/mindspore/ccsrc/frontend/parallel/graph_util/graph_splitter.cc
+++ b/mindspore/ccsrc/frontend/parallel/graph_util/graph_splitter.cc
@@ -341,6 +341,7 @@ distributed::DistExecutionMode GenerateStrategy() {
   enable_embedding_cache = ps::PSContext::instance()->cache_enable();
 #endif
   std::string parallel_mode = parallel::ParallelContext::GetInstance()->parallel_mode();
+  MS_LOG(INFO) << "Current parallel mode is " << parallel_mode;
   bool using_parallel = (parallel_mode != parallel::kStandalone) ? true : false;
   // The conditions' priority is: EmbeddingCache > Parameter Server > General.
   if (enable_embedding_cache) {
@@ -352,6 +353,7 @@ distributed::DistExecutionMode GenerateStrategy() {
   } else {
     strategy = distributed::DistExecutionMode::kGeneralMode;
   }
+  MS_LOG(INFO) << "Generated distributed strategy is " << strategy;
   return strategy;
 }
 
@@ -1074,18 +1076,24 @@ void GraphSplitter::Run() {
   DyeGraph();
   // If all nodes are all on this process, no need to split the graph. So return.
   if (!NeedSplitGraph()) {
-    MS_LOG(INFO) << "No need to build and split distributed graph.";
+    MS_LOG(INFO) << "All nodes are on this process, so there is no need to build and split the distributed graph.";
     return;
   }
 
   // Step 2: Create exec_mode_ according to the current execution mode.
   CreateExecutionMode();
 
+  // In general mode, if the graph has no label set, do not split it to avoid nodes being unexpectedly optimized out.
+  if (mode_ == distributed::DistExecutionMode::kGeneralMode && !GraphHasLabel(func_graph_)) {
+    MS_LOG(INFO) << "This graph has no label in general mode, so there is no need to split it.";
+    return;
+  }
+
   // Step 3: Prebuild the distributed graph before it gets split.
   exec_mode_->PreBuildDistributedGraph();
 
   if (!NeedSplitGraph()) {
-    MS_LOG(INFO) << "No need to build and split distributed graph.";
+    MS_LOG(INFO) << "All nodes are on this process, so there is no need to build and split the distributed graph.";
     return;
   }
 
@@ -1150,6 +1158,8 @@ void GraphSplitter::CreateExecutionMode() {
     exec_mode_ = std::make_unique<ParameterServerMode>(func_graph_, &node_labels_, rank_id_, role_);
   } else if (mode_ == distributed::DistExecutionMode::kEmbeddingCacheMode) {
     exec_mode_ = std::make_unique<EmbeddingCacheMode>(func_graph_, &node_labels_, rank_id_, role_);
+  } else if (mode_ == distributed::DistExecutionMode::kParallelMode) {
+    exec_mode_ = std::make_unique<ParallelMode>(func_graph_, &node_labels_, rank_id_, role_);
   } else if (mode_ == distributed::DistExecutionMode::kGeneralMode) {
     exec_mode_ = std::make_unique<GeneralMode>(func_graph_, &node_labels_, rank_id_, role_);
   }
diff --git a/mindspore/ccsrc/frontend/parallel/graph_util/graph_splitter.h b/mindspore/ccsrc/frontend/parallel/graph_util/graph_splitter.h
index aca803830c5..6fdfb51a922 100644
--- a/mindspore/ccsrc/frontend/parallel/graph_util/graph_splitter.h
+++ b/mindspore/ccsrc/frontend/parallel/graph_util/graph_splitter.h
@@ -380,6 +380,15 @@ class GeneralMode : public DistributedExecutionMode {
   ~GeneralMode() = default;
 };
 
+// The mode applied when the AutoParallel feature is enabled.
+class ParallelMode : public DistributedExecutionMode {
+ public:
+  explicit ParallelMode(const FuncGraphPtr &func_graph, NodeLabels *node_labels, uint32_t rank_id,
+                        const std::string &role)
+      : DistributedExecutionMode(func_graph, node_labels, rank_id, role) {}
+  ~ParallelMode() = default;
+};
+
 // The class is used as an action in pipeline. It will process the graph and split the nodes to each process in the
 // cluster.
 class GraphSplitter {
diff --git a/mindspore/ccsrc/plugin/device/gpu/hal/device/distribution/collective_init.cc b/mindspore/ccsrc/plugin/device/gpu/hal/device/distribution/collective_init.cc
index 6a22ebbdd4d..0e3caf7f725 100644
--- a/mindspore/ccsrc/plugin/device/gpu/hal/device/distribution/collective_init.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/hal/device/distribution/collective_init.cc
@@ -58,7 +58,8 @@ void CollectiveInitializer::InitCollective() {
 #endif
   } else {
     if (!distributed::Initialize()) {
-      MS_LOG(EXCEPTION) << "Failed to initialize distributed execution for NCCL.";
+      MS_LOG(EXCEPTION) << "Failed to initialize distributed execution for NCCL. Maybe the MindSpore cluster is not "
+                           "successfully built. Please check the scheduler's and other nodes' logs.";
     }
   }
   CollectiveInitializer::instance().collective_inited_ = true;
diff --git a/tests/st/ps/part_ps/test_ps_embedding_heterogeneous_conv2d_adam.py b/tests/st/ps/part_ps/test_ps_embedding_heterogeneous_conv2d_adam.py
index 9fb5814745d..b045c63b970 100644
--- a/tests/st/ps/part_ps/test_ps_embedding_heterogeneous_conv2d_adam.py
+++ b/tests/st/ps/part_ps/test_ps_embedding_heterogeneous_conv2d_adam.py
@@ -37,6 +37,7 @@ from mindspore.common.parameter import Parameter
 from mindspore.train import Model
 from mindspore.common import set_seed
 from mindspore.communication.management import init
+from mindspore.parallel._ps_context import _is_role_worker
 
 parser = argparse.ArgumentParser(description='test_ps_lenet')
 parser.add_argument("--device_target", type=str, default="GPU")
@@ -177,7 +178,8 @@ class NetFactory:
         no_ps = self.no_ps_impl(ds2)
         print(part_ps)
         print(no_ps)
-        assert np.allclose(no_ps, part_ps, rtol=1.0e-4, atol=1.0e-4)
+        if _is_role_worker():
+            assert np.allclose(no_ps, part_ps, rtol=1.0e-4, atol=1.0e-4)
 
 
 if __name__ == "__main__":
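
Note on the test change: the accuracy check in test_ps_embedding_heterogeneous_conv2d_adam.py is now guarded by _is_role_worker(), presumably because in parameter-server mode the scheduler and server processes run the same script but do not produce the training outputs being compared, so the assertion is only meaningful on worker processes. A minimal, self-contained sketch of that guard pattern follows; the helper name compare_outputs is illustrative and not part of the patch:

import numpy as np
from mindspore.parallel._ps_context import _is_role_worker


def compare_outputs(no_ps, part_ps, rtol=1.0e-4, atol=1.0e-4):
    # Assumption: only worker processes are expected to produce comparable
    # results in PS mode; scheduler/server roles simply skip the check.
    if _is_role_worker():
        assert np.allclose(no_ps, part_ps, rtol=rtol, atol=atol)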