!49579 fix cache group info bug

Merge pull request !49579 from xiaoyao/cache_opt
This commit is contained in:
i-robot 2023-03-07 01:45:12 +00:00 committed by Gitee
commit e1b83c5d3d
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
9 changed files with 130 additions and 60 deletions

View File

@ -0,0 +1,43 @@
/**
* Copyright 2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "frontend/parallel/pass/handle_group_info.h"
#include "frontend/parallel/device_manager.h"
#include "include/common/utils/parallel_context.h"
#include "frontend/parallel/step_parallel_utils.h"
namespace mindspore {
namespace parallel {
void HandleGroupInfo(const FuncGraphPtr &root) {
if (g_device_manager == nullptr) {
return;
}
auto group_info = g_device_manager->group_info();
auto group_info_save_path = common::GetEnv("GROUP_INFO_FILE");
if (!group_info_save_path.empty()) {
ParallelContext::GetInstance()->set_group_ckpt_save_file(group_info_save_path);
}
if (StrategyCheckpoint::GetInstance().group_info_save_on()) {
auto &strategy_ckt = StrategyCheckpoint::GetInstance();
RankList comm_group = strategy_ckt.common_mirror_group();
if (strategy_ckt.SaveGroupInfo(group_info, comm_group) != SUCCESS) {
MS_LOG(EXCEPTION) << "Save group info failed";
}
}
}
} // namespace parallel
} // namespace mindspore

View File

@ -0,0 +1,28 @@
/**
* Copyright 2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_FRONTEND_PARALLEL_PASS_HANDLE_GROUP_INFO_H_
#define MINDSPORE_CCSRC_FRONTEND_PARALLEL_PASS_HANDLE_GROUP_INFO_H_
#include "ir/anf.h"
namespace mindspore {
namespace parallel {
// Handle hccl group info.
void HandleGroupInfo(const FuncGraphPtr &graph);
} // namespace parallel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_FRONTEND_PARALLEL_PASS_HANDLE_GROUP_INFO_H_

View File

@ -14,20 +14,21 @@
* limitations under the License.
*/
#include "frontend/optimizer/micro_interleaved_order_control.h"
#include "frontend/parallel/pass/micro_interleaved_order_control.h"
#include <memory>
#include <list>
#include <vector>
#include <string>
#include <algorithm>
#include <queue>
#include <unordered_map>
#include <utility>
#include "mindspore/core/ops/core_ops.h"
#include "include/common/utils/utils.h"
#include "frontend/parallel/step_parallel.h"
namespace mindspore {
namespace opt {
namespace parallel {
namespace {
constexpr auto kGradientsFlag = "Gradients";
const size_t interleaved_size = 2;
@ -302,5 +303,5 @@ void MicroInterleavedOrderControl(const FuncGraphPtr &graph) {
}
MicroInterleavedOrderControlPipeline(manager, origin_nodes_topological);
}
} // namespace opt
} // namespace parallel
} // namespace mindspore

View File

@ -14,15 +14,15 @@
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_FRONTEND_OPTIMIZER_MICRO_INTERLEAVED_ORDER_CONTROL_H_
#define MINDSPORE_CCSRC_FRONTEND_OPTIMIZER_MICRO_INTERLEAVED_ORDER_CONTROL_H_
#ifndef MINDSPORE_CCSRC_FRONTEND_PARALLEL_PASS_MICRO_INTERLEAVED_ORDER_CONTROL_H_
#define MINDSPORE_CCSRC_FRONTEND_PARALLEL_PASS_MICRO_INTERLEAVED_ORDER_CONTROL_H_
#include "ir/anf.h"
namespace mindspore {
namespace opt {
namespace parallel {
// Micro interleaved nodes order control.
void MicroInterleavedOrderControl(const FuncGraphPtr &graph);
} // namespace opt
} // namespace parallel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_FRONTEND_OPTIMIZER_MICRO_INTERLEAVED_ORDER_CONTROL_H_
#endif // MINDSPORE_CCSRC_FRONTEND_PARALLEL_PASS_MICRO_INTERLEAVED_ORDER_CONTROL_H_

View File

@ -14,7 +14,7 @@
* limitations under the License.
*/
#include "frontend/optimizer/overlap_opt_shard_in_pipeline.h"
#include "frontend/parallel/pass/overlap_opt_shard_in_pipeline.h"
#include <memory>
#include <vector>
#include <list>
@ -29,7 +29,7 @@
#include "include/common/utils/comm_manager.h"
namespace mindspore {
namespace opt {
namespace parallel {
namespace {
inline bool is_allgather_comm_ops(const AnfNodePtr &node) {
static const std::vector<PrimitivePtr> kAllGatherOpsPrim = {prim::kPrimMicroStepAllGather,
@ -131,5 +131,5 @@ void OverlapOptShardInPipeline(const FuncGraphPtr &graph) {
manager->SetEdge(recv_user, recv_user_index, depend_node);
}
}
} // namespace opt
} // namespace parallel
} // namespace mindspore

View File

@ -14,15 +14,15 @@
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_FRONTEND_OPTIMIZER_OVERLAP_OPT_SHARD_IN_PIPELINE_H_
#define MINDSPORE_CCSRC_FRONTEND_OPTIMIZER_OVERLAP_OPT_SHARD_IN_PIPELINE_H_
#ifndef MINDSPORE_CCSRC_FRONTEND_PARALLEL_PASS_OVERLAP_OPT_SHARD_IN_PIPELINE_H_
#define MINDSPORE_CCSRC_FRONTEND_PARALLEL_PASS_OVERLAP_OPT_SHARD_IN_PIPELINE_H_
#include "ir/anf.h"
namespace mindspore {
namespace opt {
namespace parallel {
// Overlap opt-shard (optimizer parallel) allgather communication with computation in pipeline-parallel graphs.
void OverlapOptShardInPipeline(const FuncGraphPtr &graph);
} // namespace opt
} // namespace parallel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_FRONTEND_OPTIMIZER_OVERLAP_OPT_SHARD_IN_PIPELINE_H_
#endif // MINDSPORE_CCSRC_FRONTEND_PARALLEL_PASS_OVERLAP_OPT_SHARD_IN_PIPELINE_H_

View File

@ -2468,21 +2468,6 @@ static void ReorderForPipelineSplit(const FuncGraphPtr &root, const FuncGraphMan
}
}
// Legacy handler (the removed side of this diff): saves hccl group info from
// inside StepParallel, recomputing the common mirror group from the graph.
// NOTE(review): unlike the replacement pass, this version does not null-check
// g_device_manager before dereferencing it.
static void HandleGroupInfo(const FuncGraphPtr &root) {
auto group_info = g_device_manager->group_info();
// GROUP_INFO_FILE env var, when set, overrides the configured group-checkpoint path.
auto group_info_save_path = common::GetEnv("GROUP_INFO_FILE");
if (!group_info_save_path.empty()) {
ParallelContext::GetInstance()->set_group_ckpt_save_file(group_info_save_path);
}
if (StrategyCheckpoint::GetInstance().group_info_save_on()) {
// Derives the mirror group from the graph on every call (no caching).
RankList comm_group = FindCommonMirrorGroup(root);
if (StrategyCheckpoint::GetInstance().SaveGroupInfo(group_info, comm_group) != SUCCESS) {
MS_LOG(EXCEPTION) << "Save group info failed";
}
}
}
static void HandleDataParallel() {
std::string parallel_mode = ParallelContext::GetInstance()->parallel_mode();
if (parallel_mode == kDataParallel) {
@ -2792,8 +2777,8 @@ bool StepParallel(const FuncGraphPtr &root, const opt::OptimizerPtr &optimizer)
PipelinePostProcess(root, all_nodes);
HandleGroupInfo(root);
auto comm_group = FindCommonMirrorGroup(root);
StrategyCheckpoint::GetInstance().set_common_mirror_group(comm_group);
// handle full split parameters in grad accumulation, do not contain optimizer-sharding's parameter
HandleFullySplitParameters(root);

View File

@ -50,6 +50,9 @@ class StrategyCheckpoint {
bool LoadCheckPointOn() const { return load_checkpoint_on_; }
bool SaveCheckPointOn() const { return save_checkpoint_on_; }
// Cache the common mirror communication group so later passes can read it
// without re-deriving it from the graph.
void set_common_mirror_group(const RankList &comm_group) { common_mirror_group_ = comm_group; }
// Return the cached common mirror group (by value).
RankList common_mirror_group() const { return common_mirror_group_; }
private:
std::string load_file_;
std::string save_file_;
@ -63,6 +66,7 @@ class StrategyCheckpoint {
bool load_format_json_ = true;
bool save_format_json_ = true;
StrategyCheckpointInfo strategy_checkpoint_info_;
RankList common_mirror_group_;
};
} // namespace parallel
} // namespace mindspore

View File

@ -43,14 +43,15 @@
#include "frontend/parallel/pynative_shard/pynative_shard.h"
#include "frontend/parallel/pass/label_micro_interleaved_index.h"
#include "frontend/parallel/pass/reorder_send_recv_between_fp_bp.h"
#include "frontend/parallel/pass/micro_interleaved_order_control.h"
#include "frontend/parallel/pass/overlap_opt_shard_in_pipeline.h"
#include "frontend/parallel/pass/handle_group_info.h"
#include "frontend/optimizer/recompute.h"
#include "frontend/optimizer/slice_activation_in_recompute.h"
#include "frontend/optimizer/micro_interleaved_order_control.h"
#include "frontend/optimizer/comm_op_attrs.h"
#include "frontend/optimizer/environ_conversion.h"
#include "frontend/optimizer/comm_op_reuse_tag.h"
#include "frontend/optimizer/py_interpret_to_execute.h"
#include "frontend/optimizer/overlap_opt_shard_in_pipeline.h"
#include "utils/log_adapter.h"
#include "pipeline/jit/pipeline_split.h"
#include "pipeline/pynative/pynative_execute.h"
@ -639,7 +640,7 @@ bool ReorderSendRecvBetweenFpBpPass(const ResourcePtr &resource) {
// Pipeline pass wrapper: applies micro-interleaved order control to the
// resource's func graph. Always reports success.
bool MicroInterLeavedOrderControlPass(const ResourcePtr &resource) {
MS_EXCEPTION_IF_NULL(resource);
// NOTE(review): this diff rendering shows both the removed (opt::) and the
// added (parallel::) call; only the parallel:: call exists after the merge.
opt::MicroInterleavedOrderControl(resource->func_graph());
parallel::MicroInterleavedOrderControl(resource->func_graph());
return true;
}
@ -657,7 +658,13 @@ bool AddCommOpReusePass(const ResourcePtr &resource) {
// Pipeline pass wrapper: overlaps opt-shard communication in pipeline
// parallel graphs. Always reports success.
bool OverlapOptShardInPipelinePass(const ResourcePtr &resource) {
MS_EXCEPTION_IF_NULL(resource);
// NOTE(review): this diff rendering shows both the removed (opt::) and the
// added (parallel::) call; only the parallel:: call exists after the merge.
opt::OverlapOptShardInPipeline(resource->func_graph());
parallel::OverlapOptShardInPipeline(resource->func_graph());
return true;
}
// Pipeline pass entry: persists hccl group info after the parallel passes run.
bool HandleGroupInfoPass(const ResourcePtr &resource) {
MS_EXCEPTION_IF_NULL(resource);
// Delegates to parallel::HandleGroupInfo, which is a no-op when the device
// manager has not been initialised.
parallel::HandleGroupInfo(resource->func_graph());
return true;
}
@ -864,8 +871,7 @@ bool AddEmbeddingCachePass(const ResourcePtr &resource) {
return true;
}
std::vector<PassItem> kVmPasses = {
{"py_interpret_to_execute", PyInterpretToExecutePass},
std::vector<PassItem> kVmPasses = {{"py_interpret_to_execute", PyInterpretToExecutePass},
{"simplify_data_structures", SimplifyDataStructuresPass},
{"opt_a", OptPassAGroup},
{"clean_after_opta", CleanAfterOptAPass},
@ -885,7 +891,8 @@ std::vector<PassItem> kVmPasses = {
{"comm_op_add_attrs", CommOpAddAttrs},
{"add_comm_op_reuse_tag", AddCommOpReusePass},
{"overlap_opt_shard_in_pipeline", OverlapOptShardInPipelinePass},
};
// This pass caches the hccl group info, so the hccl groups must be created before this pass runs.
{"handle_group_info", HandleGroupInfoPass}};
std::vector<PassItem> kGePasses = {{"py_interpret_to_execute", PyInterpretToExecutePass},
{"simplify_data_structures", SimplifyDataStructuresPass},
@ -894,7 +901,9 @@ std::vector<PassItem> kGePasses = {{"py_interpret_to_execute", PyInterpretToExec
{"opt_b", OptPassBGroup},
{"opt_control", ControlGroup},
{"opt_prepare", PrepareGroup},
{"cconv", CconvPass}};
{"cconv", CconvPass},
// This pass caches the hccl group info, so the hccl groups must be created before this pass runs.
{"handle_group_info", HandleGroupInfoPass}};
std::vector<PassItem> kPynativePasses = {{"opt_a", OptPassAGroup},
{"opt_b", OptPassBGroup},