fix fault recovery in optimizer shard

yao_yf 2021-11-17 10:28:04 +08:00
parent e6ee9b2f19
commit 01dc4bbdf9
8 changed files with 240 additions and 46 deletions

View File

@@ -259,6 +259,7 @@ RankList DeviceManager::FindRankListByHashName(const std::string &hash_name) {
   }
   RankList rank_list;
   std::string rank_str = "";
+  rank_list_name = rank_list_name + "-";
   for (size_t i = 0; i < rank_list_name.size(); i++) {
     if (rank_list_name[i] == '-') {
       int64_t rank_id = std::stoi(rank_str);
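The single added line appends a trailing '-' to the dash-separated rank list name, presumably so the parsing loop also flushes the final rank id instead of dropping everything after the last dash. A minimal Python sketch of that parsing idea (not the C++ implementation, whose full loop body is not shown here):

# Sketch: accumulate digits into rank_str and flush on every '-'.
def parse_rank_list(rank_list_name):
    rank_list_name = rank_list_name + "-"  # the fix: guarantees the last id is flushed too
    rank_list, rank_str = [], ""
    for ch in rank_list_name:
        if ch == "-":
            rank_list.append(int(rank_str))
            rank_str = ""
        else:
            rank_str += ch
    return rank_list

# parse_rank_list("0-4-8-12") -> [0, 4, 8, 12]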

View File

@@ -374,7 +374,7 @@ void HandleNoUsedParameter(const FuncGraphPtr &root) {
   }
 }
-bool IsFullySplitParameter(const ParameterPtr &param_ptr) {
+bool IsFullySplitParameter(const ParameterPtr &param_ptr, size_t allow_repeat_num) {
   auto tensor_layout = param_ptr->user_data<parallel::TensorLayout>();
   if (tensor_layout == nullptr) {
     return false;
@@ -391,7 +391,7 @@ bool IsFullySplitParameter(const ParameterPtr &param_ptr) {
     return false;
   }
-  if (group_devices.size() == 1) {
+  if (group_devices.size() <= allow_repeat_num) {
     MS_LOG(INFO) << "The parameter: " << param_ptr->name() << " is fully split";
     return true;
   }

View File

@@ -45,7 +45,7 @@ void HandleFullySplitParameters(const FuncGraphPtr &root);
 void SetClonedTensorShapeForOptimizer(const FuncGraphPtr &root);
 void HandleAdaFactorOpt(const FuncGraphPtr &root);
 bool ParameterIsCloned(const AnfNodePtr &parameter_node);
-bool IsFullySplitParameter(const ParameterPtr &param_ptr);
+bool IsFullySplitParameter(const ParameterPtr &param_ptr, size_t allow_repeat_num = 1);
 }  // namespace parallel
 }  // namespace mindspore

View File

@@ -3123,48 +3123,13 @@ bool IsInsertVirtualOutput(const FuncGraphPtr &root) {
           current_stage == split_stage_num - 1);
 }
-RankList FindCommonMirrorGroup(const FuncGraphPtr &root) {
-  auto parameters = root->parameters();
-  for (auto &parameter : parameters) {
-    auto param_ptr = parameter->cast<ParameterPtr>();
-    MS_EXCEPTION_IF_NULL(param_ptr);
-    if (IsFullySplitParameter(param_ptr)) {
-      MS_LOG(WARNING) << "The parameter :" << param_ptr->fullname_with_scope()
-                      << " is fully shard, thus cannot find common data parallel group for this rank";
-      return {};
-    }
-  }
-  AnfNodePtr ret = root->get_return();
-  MS_EXCEPTION_IF_NULL(ret);
-  std::vector<int64_t> common_group_list;
-  std::vector<AnfNodePtr> all_nodes = DeepScopedGraphSearch(ret);
-  bool is_first_group = true;
-  for (auto &node : all_nodes) {
-    if (!IsPrimitiveCNode(node, prim::kPrimMirror)) {
-      continue;
-    }
-    auto prim = GetCNodePrimitive(node);
-    if (!prim->HasAttr(GROUP)) {
-      MS_LOG(EXCEPTION) << "The mirror operator dose not have group attr : " << node->DebugString();
-    }
-    std::string group_name = GetValue<std::string>(prim->GetAttr(GROUP));
-    std::vector<int64_t> group_list = g_device_manager->FindRankListByHashName(group_name);
-    if (is_first_group) {
-      common_group_list = group_list;
-      is_first_group = false;
-    } else {
-      std::vector<int64_t> new_comm_group_list;
-      std::set_intersection(common_group_list.begin(), common_group_list.end(), group_list.begin(), group_list.end(),
-                            std::back_inserter(new_comm_group_list));
-      common_group_list = new_comm_group_list;
-    }
-  }
-  MS_LOG(INFO) << "The common mirror group is:" << common_group_list;
-  return common_group_list;
-}
 static void HandleGroupInfo(const FuncGraphPtr &root) {
   auto group_info = g_device_manager->group_info();
+  auto group_info_save_path = common::GetEnv("GROUP_INFO_FILE");
+  if (!group_info_save_path.empty()) {
+    ParallelContext::GetInstance()->set_group_ckpt_save_file(group_info_save_path);
+  }
   if (StrategyCheckpoint::GetInstance().group_info_save_on()) {
     RankList comm_group = FindCommonMirrorGroup(root);
     if (StrategyCheckpoint::GetInstance().SaveGroupInfo(group_info, comm_group) != SUCCESS) {
@@ -3173,6 +3138,25 @@ static void HandleGroupInfo(const FuncGraphPtr &root) {
   }
 }
+static void HandleDataParallel() {
+  std::string parallel_mode = ParallelContext::GetInstance()->parallel_mode();
+  if (parallel_mode == DATA_PARALLEL) {
+    auto group_info_save_path = common::GetEnv("GROUP_INFO_FILE");
+    if (!group_info_save_path.empty()) {
+      std::vector<std::pair<std::string, std::vector<uint32_t>>> group_info;
+      int64_t device_num = GetCommInfo().device_num;
+      RankList comm_group;
+      for (size_t i = 0; i < size_t(device_num); ++i) {
+        comm_group.push_back(i);
+      }
+      ParallelContext::GetInstance()->set_group_ckpt_save_file(group_info_save_path);
+      if (StrategyCheckpoint::GetInstance().SaveGroupInfo(group_info, comm_group) != SUCCESS) {
+        MS_LOG(EXCEPTION) << "Save group info failed";
+      }
+    }
+  }
+}
 static void PipelinePostProcess(const FuncGraphPtr &root, const std::vector<AnfNodePtr> &all_nodes) {
   auto pipeline_stages = ParallelContext::GetInstance()->pipeline_stage_split_num();
   if (pipeline_stages > 1) {
@@ -3191,6 +3175,7 @@ bool StepParallel(const FuncGraphPtr &root, const opt::OptimizerPtr &optimizer)
   MS_EXCEPTION_IF_NULL(optimizer);
   MS_EXCEPTION_IF_NULL(ParallelContext::GetInstance());
   std::string parallel_mode = ParallelContext::GetInstance()->parallel_mode();
+  HandleDataParallel();
   pipeline::ResourceBasePtr res = optimizer->resource();
   MS_EXCEPTION_IF_NULL(res);
   FuncGraphManagerPtr manager = res->manager();
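The new HandleDataParallel path covers the pure data-parallel mode: when GROUP_INFO_FILE is exported, it saves an empty group_info together with a communication group containing every rank, since each rank holds a full model replica. A rough Python sketch of the group it produces (not the C++ implementation):

import os

def data_parallel_comm_group(device_num):
    # under pure data parallelism every rank keeps a full replica, so the common
    # group written to the group info file is simply all ranks
    if not os.getenv("GROUP_INFO_FILE"):
        return None  # nothing is saved when the env variable is not exported
    return list(range(device_num))

# data_parallel_comm_group(32) -> [0, 1, ..., 31], matching test_data_parallel_group below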

View File

@@ -35,6 +35,7 @@
 #include "frontend/parallel/graph_util/graph_info.h"
 #include "frontend/parallel/graph_util/node_info.h"
 #include "frontend/parallel/node_check.h"
+#include "frontend/parallel/parameter_manager.h"
 #include "ir/param_info.h"
 #include "ir/tensor.h"
 #include "utils/trace_base.h"
@@ -149,6 +150,61 @@ Shapes GetNodeShape(const AnfNodePtr &node) {
   return shapes;
 }
+RankList FindCommonMirrorGroup(const FuncGraphPtr &root) {
+  auto parameters = root->parameters();
+  for (auto &parameter : parameters) {
+    auto param_ptr = parameter->cast<ParameterPtr>();
+    MS_EXCEPTION_IF_NULL(param_ptr);
+    if (!(param_ptr->has_default() && ParameterRequireGrad(param_ptr))) {
+      continue;
+    }
+    size_t allow_repeat_num = 1;
+    if (ParallelContext::GetInstance()->enable_parallel_optimizer() &&
+        (!param_ptr->param_info() || !param_ptr->param_info()->parallel_optimizer())) {
+      if (ParallelContext::GetInstance()->optimizer_weight_shard_size() == -1) {
+        MS_LOG(WARNING) << "The parameter :" << param_ptr->fullname_with_scope()
+                        << " is fully shard by optimizer parallel,"
+                           " thus cannot find common data parallel group for this rank";
+        return {g_device_manager->global_rank()};
+      }
+      allow_repeat_num = size_t(ParallelContext::GetInstance()->optimizer_weight_shard_size());
+    }
+    if (IsFullySplitParameter(param_ptr, allow_repeat_num)) {
+      MS_LOG(WARNING) << "The parameter :" << param_ptr->fullname_with_scope()
+                      << " is fully shard, thus cannot find common data parallel group for this rank";
+      return {g_device_manager->global_rank()};
+    }
+  }
+  AnfNodePtr ret = root->get_return();
+  MS_EXCEPTION_IF_NULL(ret);
+  std::vector<int64_t> common_group_list;
+  std::vector<AnfNodePtr> all_nodes = DeepScopedGraphSearch(ret);
+  bool is_first_group = true;
+  for (auto &node : all_nodes) {
+    if (!IsPrimitiveCNode(node, prim::kPrimMirror) && !IsPrimitiveCNode(node, prim::kPrimMirrorMicroStep) &&
+        !IsPrimitiveCNode(node, prim::kPrimMirrorMiniStep)) {
+      continue;
+    }
+    auto prim = GetCNodePrimitive(node);
+    if (!prim->HasAttr(GROUP)) {
+      MS_LOG(EXCEPTION) << "The mirror operator dose not have group attr : " << node->DebugString();
+    }
+    std::string group_name = GetValue<std::string>(prim->GetAttr(GROUP));
+    std::vector<int64_t> group_list = g_device_manager->FindRankListByHashName(group_name);
+    if (is_first_group) {
+      common_group_list = group_list;
+      is_first_group = false;
+    } else {
+      std::vector<int64_t> new_comm_group_list;
+      std::set_intersection(common_group_list.begin(), common_group_list.end(), group_list.begin(), group_list.end(),
+                            std::back_inserter(new_comm_group_list));
+      common_group_list = new_comm_group_list;
+    }
+  }
+  MS_LOG(INFO) << "The common mirror group is:" << common_group_list;
+  return common_group_list;
+}
 std::string CreateInstanceName(const CNodePtr &node, size_t index) {
   MS_EXCEPTION_IF_NULL(node);
   if (!IsValueNode<Primitive>(node->input(0))) {
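The relocated FindCommonMirrorGroup now skips parameters that have no default value or do not require gradients, returns only the local rank when a parameter is fully sharded (including full optimizer sharding, i.e. optimizer_weight_shard_size == -1), and intersects the rank lists of all mirror operators, including the mini-step and micro-step variants. A rough Python sketch of the intersection step, with hypothetical names:

def find_common_mirror_group(mirror_group_lists, fully_sharded=False, global_rank=0):
    # a fully sharded parameter means only the local rank's checkpoint is usable for recovery
    if fully_sharded:
        return [global_rank]
    common = None
    for group in mirror_group_lists:
        # mimics std::set_intersection over sorted rank lists
        common = sorted(group) if common is None else sorted(set(common) & set(group))
    return common or []

# find_common_mirror_group([[0, 4, 8, 12, 16, 20, 24, 28], [0, 8, 16, 24]])
# -> [0, 8, 16, 24]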

View File

@@ -28,6 +28,7 @@ namespace parallel {
 bool IsSomePrimitive(const CNodePtr &cnode, const std::string &name);
 bool IsParallelCareNode(const CNodePtr &cnode);
 Shapes GetNodeShape(const AnfNodePtr &node);
+RankList FindCommonMirrorGroup(const FuncGraphPtr &root);
 std::string CreateInstanceName(const CNodePtr &node, size_t index);
 void SetCommunicationOpGroupLabel(std::vector<AnfNodePtr> new_node_input);
 std::vector<AnfNodePtr> ReplaceOpInput(const Operator &replace_op, const std::string &instance_name,

View File

@@ -1160,10 +1160,12 @@ def _merge_param_with_strategy(sliced_data, parameter_name, strategy, is_even):
     return merged_tensor
-def ckpt_restore_group_info(group_info_file_name):
+def restore_group_info_list(group_info_file_name):
     """
     Build rank list, the checkpoint of ranks in the rank list has the same contents with the local rank
-    that saves the group_info_file_name
+    that saves the group_info_file_name.
+    To save the group info file, please export GROUP_INFO_FILE environment variables like
+    "export GROUP_INFO_FILE=/data/group_info.pb".
     Args:
         group_info_file_name (str): Name of group information file.
@@ -1175,7 +1177,7 @@ def ckpt_restore_group_info(group_info_file_name):
         TypeError: group_info_file_name is not str.
     Examples:
-        >>> restore_list = ckpt_restore_group_info("./group_info.ckpt")
+        >>> restore_list = restore_group_info_list("./group_info.pb")
     """
     if not isinstance(group_info_file_name, str):
        raise TypeError(f"The group_info_file_name should be str, but got {type(group_info_file_name)}.")

View File

@@ -0,0 +1,149 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
""" test group info """
import os
import numpy as np

import mindspore.nn as nn
from mindspore import Tensor, Parameter
from mindspore.common.api import _cell_graph_executor
from mindspore.nn import TrainOneStepCell
from mindspore.nn.wrap.cell_wrapper import _VirtualDatasetCell
from mindspore.nn.optim import Momentum
from mindspore.ops import operations as P
from mindspore import context
from mindspore.train.serialization import restore_group_info_list


class Net3(nn.Cell):
    """Net definition"""
    def __init__(self, strategy1, strategy2, strategy3):
        super(Net3, self).__init__()
        self.fc1 = P.MatMul().shard(strategy1)
        self.fc2 = P.MatMul().shard(strategy2)
        self.fc3 = P.MatMul().shard(strategy3)
        self.p1 = Parameter(Tensor(np.ones([48, 64]).astype(np.float32)), name="weight1")
        self.p2 = Parameter(Tensor(np.ones([64, 16]).astype(np.float32)), name="weight2", parallel_optimizer=False)
        self.p3 = Parameter(Tensor(np.ones([16, 16]).astype(np.float32)), name="weight3")

    def construct(self, x, y):
        x = self.fc1(x, self.p1)
        x = self.fc2(x, self.p2)
        z = x - y
        z = self.fc3(z, self.p3)
        return z


def auto_parallel_compile_net(strategy1=None, strategy2=None, strategy3=None):
    context.set_context(mode=context.GRAPH_MODE)
    inputs = Tensor(np.ones([32, 48]).astype(np.float32))
    label = Tensor(np.zeros([32, 16]).astype(np.float32))
    net = Net3(strategy1, strategy2, strategy3)
    auto_parallel = context.get_auto_parallel_context("parallel_mode") in ["semi_auto_parallel", "auto_parallel"]
    if auto_parallel:
        net = _VirtualDatasetCell(net)
    optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
    train_network = TrainOneStepCell(net, optimizer).set_comm_fusion(4)
    train_network.set_auto_parallel()
    train_network.set_train()
    _cell_graph_executor.compile(train_network, inputs, label, phase="train", auto_parallel_mode=auto_parallel)


def test_mirror_group():
    """
    Feature: save and load mirror group
    Description: semi-auto, disable parallel optimizer.
    Expectation: group info list match expectation value.
    """
    os.environ['GROUP_INFO_FILE'] = "./test_mirror_group.pb"
    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel",
                                      device_num=32, enable_parallel_optimizer=False)
    auto_parallel_compile_net(((8, 1), (1, 4)), ((32, 1), (1, 1)), ((8, 4), (4, 1)))
    group_info_list = restore_group_info_list("./test_mirror_group.pb")
    assert group_info_list == [0, 4, 8, 12, 16, 20, 24, 28]
    context.reset_auto_parallel_context()
    del os.environ['GROUP_INFO_FILE']


def test_data_parallel_group():
    """
    Feature: save and load mirror group
    Description: data-parallel, disable parallel optimizer.
    Expectation: group info list match expectation value.
    """
    os.environ['GROUP_INFO_FILE'] = "./test_data_parallel_group.pb"
    context.set_auto_parallel_context(parallel_mode="data_parallel",
                                      device_num=32, enable_parallel_optimizer=False)
    auto_parallel_compile_net(((8, 1), (1, 4)), ((32, 1), (1, 1)), ((8, 4), (4, 1)))
    group_info_list = restore_group_info_list("./test_data_parallel_group.pb")
    assert group_info_list == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                               24, 25, 26, 27, 28, 29, 30, 31]
    context.reset_auto_parallel_context()
    del os.environ['GROUP_INFO_FILE']


def test_mirror_group_parallel_optimizer():
    """
    Feature: save and load mirror group
    Description: semi-auto, enable parallel optimizer.
    Expectation: group info list match expectation value.
    """
    os.environ['GROUP_INFO_FILE'] = "./test_mirror_group_parallel_optimizer.pb"
    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel",
                                      device_num=32, enable_parallel_optimizer=True)
    auto_parallel_compile_net(((8, 1), (1, 4)), ((32, 1), (1, 1)), ((8, 4), (4, 1)))
    group_info_list = restore_group_info_list("./test_mirror_group_parallel_optimizer.pb")
    assert group_info_list == [0]
    context.reset_auto_parallel_context()
    del os.environ['GROUP_INFO_FILE']


def test_mirror_group_parallel_optimizer_not_full_shard():
    """
    Feature: save and load mirror group
    Description: semi-auto, enable parallel optimizer but not fully shard.
    Expectation: group info list match expectation value.
    """
    os.environ['GROUP_INFO_FILE'] = "./test_mirror_group_parallel_optimizer_not_full_shard.pb"
    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel",
                                      device_num=32, enable_parallel_optimizer=True, optimizer_weight_shard_size=2)
    auto_parallel_compile_net(((8, 1), (1, 4)), ((32, 1), (1, 1)), ((8, 4), (4, 1)))
    group_info_list = restore_group_info_list("./test_mirror_group_parallel_optimizer_not_full_shard.pb")
    assert group_info_list == [0, 8, 16, 24]
    context.reset_auto_parallel_context()
    del os.environ['GROUP_INFO_FILE']


def test_pipeline_split_stage0_mirror_group():
    """
    Feature: save and load mirror group
    Description: semi-auto, pipeline parallel.
    Expectation: group info list match expectation value.
    """
    import mindspore as ms
    from mindspore import Model
    from .test_pipeline_split import PipelineCell, PipelineSplit, DatasetLenet
    os.environ['GROUP_INFO_FILE'] = "./test_pipeline_split_stage0_mirror_group.pb"
    context.set_auto_parallel_context(device_num=64, global_rank=0, pipeline_stages=2)
    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
    data = Tensor(np.ones([32, 64]), dtype=ms.float32)
    label = Tensor(np.ones([64, 64]), dtype=ms.float32)
    strategy1 = ((4, 1), (1, 8))
    strategy2 = ((4, 1), (1, 1))
    net = PipelineCell(PipelineSplit(strategy1, strategy2), 4)
    params = net.network.cell.block[0].trainable_params()
    dataset = DatasetLenet(data, label, 3)
    optimizer = nn.Lamb(params, learning_rate=0.01)
    model = Model(net, optimizer=optimizer)
    model.train(2, dataset, dataset_sink_mode=False)
    group_info_list = restore_group_info_list("./test_pipeline_split_stage0_mirror_group.pb")
    assert group_info_list == [0, 8, 16, 24]