!35225 fix_independent_stream_error_r1.6

Merge pull request !35225 from yao_yf/fix_independent_stream_error_r1.6
2022-06-01 01:19:25 +00:00 · 2022-06-01 01:19:25 +00:00 · 34761f434f
parent 4dc930bbd2 f9c54b5a81
commit 34761f434f
5 changed files with 163 additions and 14 deletions
--- a/mindspore/ccsrc/backend/optimizer/ascend/ascend_backend_optimization.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/ascend_backend_optimization.cc
@ -159,6 +159,7 @@
 #include "backend/optimizer/ascend/dynamic_shape/convert_inherited_dynamic_op.h"
 #include "backend/optimizer/ascend/dynamic_shape/link_custom_op.h"
 #include "backend/optimizer/pass/adjust_depend_for_parallel_optimizer_recompute_all_gather.h"
 #include "backend/optimizer/pass/gradients_allreduce_depend_last_send.h"
 #include "backend/kernel_compiler/tbe/tbe_kernel_compile.h"
 #include "utils/ms_context.h"
 #include "utils/config_manager.h"
@ -461,6 +462,7 @@ void AscendBackendOptimization(const std::shared_ptr<session::KernelGraph> &kern
  other_pm->AddPass(std::make_shared<SplitInputsForReduceScatter>());
  other_pm->AddPass(std::make_shared<BroadcastFusion>());
  other_pm->AddPass(std::make_shared<InsertTensorMoveForCascade>());
  other_pm->AddPass(std::make_shared<GradientsAllReduceDependLastSend>());
  other_pm->AddPass(std::make_shared<ParameterTransOpFusion>());
  other_pm->AddPass(std::make_shared<RefreshParameterFormat>());
  other_pm->AddPass(std::make_shared<SplitOpOptimizer>());
--- a/mindspore/ccsrc/backend/optimizer/pass/gradients_allreduce_depend_last_send.cc
+++ b/mindspore/ccsrc/backend/optimizer/pass/gradients_allreduce_depend_last_send.cc
@ -0,0 +1,80 @@
 /**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "backend/optimizer/pass/gradients_allreduce_depend_last_send.h"
 #include <algorithm>
 #include "frontend/parallel/context.h"
 #include "backend/session/anf_runtime_algorithm.h"
 namespace mindspore {
 namespace opt {
 bool GradientsAllReduceDependLastSend::Run(const FuncGraphPtr &graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto parallel_mode = parallel::ParallelContext::GetInstance()->parallel_mode();
  if (parallel_mode != parallel::SEMI_AUTO_PARALLEL && parallel_mode != parallel::AUTO_PARALLEL) {
    return false;
  }
  int32_t split_stage_num = parallel::ParallelContext::GetInstance()->pipeline_stage_split_num();
  if (split_stage_num <= 1) {
    return false;
  }
  std::vector<AnfNodePtr> node_list = TopoSort(graph->get_return());
  std::vector<CNodePtr> addn_list;
  CNodePtr last_send;
  for (auto &node : node_list) {
    MS_EXCEPTION_IF_NULL(node);
    if (!node->cast<CNodePtr>() || !AnfUtils::IsRealKernel(node)) {
      continue;
    }
    auto cnode = node->cast<CNodePtr>();
    if (IsPrimitiveCNode(cnode, prim::kPrimAllReduce) && AnfAlgo::IsFusion(cnode)) {
      auto last_input = cnode->inputs().back();
      if (IsPrimitiveCNode(last_input, prim::kPrimTensorMove)) {
        auto last_input_cnode = last_input->cast<CNodePtr>();
        auto real_input_node = last_input_cnode->input(1);
        if (IsPrimitiveCNode(real_input_node, prim::kPrimDepend)) {
          auto addn_node = real_input_node->cast<CNodePtr>()->input(2);
          if (IsPrimitiveCNode(addn_node, prim::kPrimAddN)) {
            MS_LOG(INFO) << "Find the pipeline addn " << addn_node->fullname_with_scope();
            addn_list.push_back(addn_node->cast<CNodePtr>());
          }
        }
      }
    }
    if (IsPrimitiveCNode(cnode, prim::kPrimSend)) {
      last_send = cnode;
    }
  }
  return InsertDependBetweenAllReduceAndSend(graph, addn_list, last_send);
 }
 bool GradientsAllReduceDependLastSend::InsertDependBetweenAllReduceAndSend(const FuncGraphPtr &graph,
                                                                           const std::vector<CNodePtr> &addn_list,
                                                                           const CNodePtr &last_send) {
  bool changed = false;
  FuncGraphManagerPtr manager = graph->manager();
  for (auto &addn : addn_list) {
    std::vector<AnfNodePtr> inputs = {NewValueNode(std::make_shared<Primitive>(prim::kPrimDepend->name())), addn,
                                      last_send};
    auto new_depend = graph->NewCNode(inputs);
    new_depend->set_abstract(addn->abstract());
    (void)manager->Replace(addn, new_depend);
    changed = true;
  }
  return changed;
 }
 }  // namespace opt
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/optimizer/pass/gradients_allreduce_depend_last_send.h
+++ b/mindspore/ccsrc/backend/optimizer/pass/gradients_allreduce_depend_last_send.h
@ -0,0 +1,44 @@
 /**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_PASS_GRADIENTS_ALLREDUCE_DEPEBD_LAST_SEND_H_
 #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_PASS_GRADIENTS_ALLREDUCE_DEPEBD_LAST_SEND_H_
 #include <vector>
 #include <string>
 #include <utility>
 #include <memory>
 #include "backend/optimizer/common/pass.h"
 #include "ir/func_graph.h"
 #include "ir/anf.h"
 #include "backend/optimizer/common/helper.h"
 #include "backend/optimizer/common/optimizer.h"
 namespace mindspore {
 namespace opt {
 class GradientsAllReduceDependLastSend : public Pass {
 public:
  GradientsAllReduceDependLastSend() : Pass("adjust_depend_for_parallel_optimizer_recompute_all_gather") {}
  ~GradientsAllReduceDependLastSend() override = default;
  bool Run(const FuncGraphPtr &graph) override;
 private:
  bool InsertDependBetweenAllReduceAndSend(const FuncGraphPtr &graph, const std::vector<CNodePtr> &addn_list,
                                           const CNodePtr &last_send);
 };
 }  // namespace opt
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_PASS_GRADIENTS_ALLREDUCE_DEPEBD_LAST_SEND_H_
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_stream_assign.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_stream_assign.cc
@ -264,8 +264,10 @@ void AscendStreamAssign::AssignStream(const NotNull<KernelGraphPtr> &graph_ptr)
    Reset();
    SetLoopSink();
    ReorderIndependentOrders(graph_ptr);
-    TrailingTimeOptimizationByReorder(graph_ptr);
+    auto parallel_mode = parallel::ParallelContext::GetInstance()->parallel_mode();
-
+    if (parallel_mode != parallel::SEMI_AUTO_PARALLEL && parallel_mode != parallel::AUTO_PARALLEL) {
      TrailingTimeOptimizationByReorder(graph_ptr);
    }
    AssignAllNodesStream(graph_ptr);
    UpdateAtomicAddrCleanStreamId(graph_ptr);
    InsertStreamActive(graph_ptr);
@ -1121,6 +1123,33 @@ void AscendStreamAssign::InsertStreamActiveForCommon(const NotNull<KernelGraphPt
  graph_ptr->set_execution_order(update_cnode_list);
 }
 std::vector<CNodePtr> AscendStreamAssign::GetIndependentNodesNeedsInsertActive(const std::vector<CNodePtr> exe_orders,
                                                                               const uint32_t graph_id) {
  uint32_t pre_stream_id = 0;
  std::vector<CNodePtr> independent_nodes_need_insert_active;
  CNodePtr pre_cnode;
  for (size_t i = 0; i < exe_orders.size(); i++) {
    auto cur_cnode_ptr = exe_orders[i];
    if (!AnfAlgo::IsIndependentNode(cur_cnode_ptr)) {
      continue;
    }
    if (AnfAlgo::GetGraphId(cur_cnode_ptr.get()) != graph_id) {
      continue;
    }
    auto cur_stream_id = AnfAlgo::GetStreamId(cur_cnode_ptr);
    if (pre_stream_id == 0) {
      pre_stream_id = cur_stream_id;
      pre_cnode = cur_cnode_ptr;
    }
    if (cur_stream_id != pre_stream_id) {
      independent_nodes_need_insert_active.push_back(pre_cnode);
    }
    pre_stream_id = cur_stream_id;
    pre_cnode = cur_cnode_ptr;
  }
  return independent_nodes_need_insert_active;
 }
 void AscendStreamAssign::InsertStreamActiveForIndependent(const NotNull<KernelGraphPtr> &graph_ptr) {
  auto root_graph_id = graph_ptr->graph_id();
  if (root_graph_id == kInvalidGraphId) {
@ -1132,31 +1161,23 @@ void AscendStreamAssign::InsertStreamActiveForIndependent(const NotNull<KernelGr
      independent_streams = item.second;
    }
  }
  // Root graph independent stream size is not more than one, no need insert active
  if (independent_streams.size() <= 1) {
    return;
  }
  std::vector<CNodePtr> update_cnode_list;
  auto exe_orders = graph_ptr->execution_order();
-
+  auto independent_nodes_need_insert_active = GetIndependentNodesNeedsInsertActive(exe_orders, root_graph_id);
  // first independent is been activated, active other independent stream
  std::vector<uint32_t> streams;
  std::copy(independent_streams.begin(), independent_streams.end(), std::back_inserter(streams));
  std::sort(streams.begin(), streams.end());
  uint32_t node_num = 0;
  for (size_t i = 0; i < exe_orders.size(); i++) {
    auto cur_cnode_ptr = exe_orders[i];
    update_cnode_list.emplace_back(cur_cnode_ptr);
-    if (!AnfAlgo::IsIndependentNode(cur_cnode_ptr)) {
+    if (!AnfAlgo::IsIndependentNode(cur_cnode_ptr) || AnfAlgo::GetGraphId(cur_cnode_ptr.get()) != root_graph_id) {
      continue;
    }
    if (AnfAlgo::GetGraphId(cur_cnode_ptr.get()) != root_graph_id) {
      continue;
    }
    node_num++;
    auto cur_stream_id = AnfAlgo::GetStreamId(cur_cnode_ptr);
    auto it = std::find(streams.begin(), streams.end(), cur_stream_id);
    if (it == streams.end()) {
@ -1165,7 +1186,8 @@ void AscendStreamAssign::InsertStreamActiveForIndependent(const NotNull<KernelGr
      std::copy(exe_orders.begin() + i + 1, exe_orders.end(), std::back_inserter(update_cnode_list));
      break;
    } else {
-      if (node_num == kMaxCommonNodeNumPerStream) {
+      if (std::find(independent_nodes_need_insert_active.begin(), independent_nodes_need_insert_active.end(),
                    cur_cnode_ptr) != independent_nodes_need_insert_active.end()) {
        CNodePtr active_ptr = KernelAdjust::GetInstance().CreateStreamActiveOp(graph_ptr);
        // 1.set stream id
        AnfAlgo::SetStreamId(cur_stream_id, active_ptr.get());
@ -1173,7 +1195,6 @@ void AscendStreamAssign::InsertStreamActiveForIndependent(const NotNull<KernelGr
        std::vector<uint32_t> active_index_list{*(it + 1)};
        AnfAlgo::SetNodeAttr(kAttrActiveStreamList, MakeValue<std::vector<uint32_t>>(active_index_list), active_ptr);
        update_cnode_list.emplace_back(active_ptr);
        node_num = 0;
      }
    }
  }
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_stream_assign.h
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_stream_assign.h
@ -104,6 +104,8 @@ class AscendStreamAssign {
                                            std::pair<std::string, std::map<uint32_t, std::set<uint32_t>>> group_item);
  std::vector<std::pair<uint32_t, vector<size_t>>> GetStreamIDHcomMap(std::vector<CNodePtr> cnode_ptr_list,
                                                                      std::string group, size_t graph_id);
  std::vector<CNodePtr> GetIndependentNodesNeedsInsertActive(const std::vector<CNodePtr> exe_orders,
                                                             const uint32_t graph_id);
  void AdjustAtomicAddrCleanOrder(const NotNull<KernelGraphPtr> &graph_ptr);
  vector<CNodePtr> GetLastInputCnode(const NotNull<KernelGraphPtr> &graph_ptr, const CNodePtr &cur_cnode_ptr);