!1938 [AutoParallel] limit GatherV2, BN and Softmax to data parallel

Merge pull request !1938 from Chong/ReID
mindspore-ci-bot 2020-06-11 09:12:10 +08:00, committed by Gitee
commit c1c683eea8
8 changed files with 109 additions and 52 deletions
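This change pins GatherV2, BatchNorm, and the Softmax family to plain data parallelism in the recursive-programming strategy search: each of them now takes the output of MakeDataParallelStrategy instead of entering the recursive partition search. A data-parallel strategy splits only the batch (first) dimension of a tensor across devices. A minimal sketch of the idea, assuming a hypothetical DataParallelStrategy helper and 8 devices (not part of this diff):

#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical sketch: a "strategy" lists the number of slices taken from
// each tensor dimension. Data parallelism cuts only dimension 0 (batch).
std::vector<int32_t> DataParallelStrategy(size_t dims, int32_t device_num) {
  std::vector<int32_t> s(dims, 1);  // leave every axis whole by default
  s[0] = device_num;                // split the batch axis across devices
  return s;
}

int main() {
  auto s = DataParallelStrategy(4, 8);        // e.g. an NCHW activation, 8 devices
  for (int32_t d : s) std::cout << d << ' ';  // prints: 8 1 1 1
  std::cout << '\n';
  return 0;
}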

View File

@@ -135,24 +135,51 @@ std::vector<std::vector<int32_t>> PreparePReLU(const std::shared_ptr<Graph> &gra
return strategies;
}
-std::vector<std::vector<int32_t>> PrepareBiasAdd(std::vector<int32_t> s) {
+std::vector<std::vector<int32_t>> PrepareBatchNorm(const std::shared_ptr<Graph> &graph,
+const std::vector<std::shared_ptr<OperatorInfo>> &ops,
+const size_t iter_graph, const size_t iter_ops) {
+std::vector<std::vector<int32_t>> strategies = MakeDataParallelStrategy(graph, ops, iter_graph, iter_ops);
+for (size_t i = 1; i < strategies.size(); i++) {
+strategies[i][0] = strategies[0][1];
+}
+strategies[1][0] = 1;
+return strategies;
+}
+std::vector<std::vector<int32_t>> PrepareSoftmaxWithLogits(const std::shared_ptr<Graph> &graph,
+const std::vector<std::shared_ptr<OperatorInfo>> &ops,
+const size_t iter_graph, const size_t iter_ops) {
+std::vector<std::vector<int32_t>> strategies = MakeDataParallelStrategy(graph, ops, iter_graph, iter_ops);
+graph->nodes[iter_graph].tensor_parm.tensor_str.str_w = graph->nodes[iter_graph].tensor_parm.tensor_str.str_h;
+graph->nodes[iter_graph].tensor_parm.tensor_str.str_h = graph->nodes[iter_graph].tensor_parm.tensor_str.str_c;
+graph->nodes[iter_graph].tensor_parm.tensor_str.str_c = graph->nodes[iter_graph].tensor_parm.tensor_str.str_n;
+return strategies;
+}
+std::vector<std::vector<int32_t>> PrepareBiasAdd(const std::shared_ptr<std::vector<int32_t>> &s) {
std::vector<std::vector<int32_t>> strategies;
-strategies.push_back(s);
+strategies.push_back(*s);
std::vector<int32_t> s_biasadd;
-s_biasadd.push_back(s[1]);
+s_biasadd.push_back(s->at(1));
strategies.push_back(s_biasadd);
return strategies;
}
-std::vector<std::vector<int32_t>> PrepareOneHot(std::vector<int32_t> s) {
+std::vector<std::vector<int32_t>> PrepareOneHot(const std::shared_ptr<std::vector<int32_t>> &s) {
std::vector<std::vector<int32_t>> strategies;
std::vector<int32_t> s_empty = {};
-strategies.push_back(s);
+strategies.push_back(*s);
strategies.push_back(s_empty);
strategies.push_back(s_empty);
return strategies;
}
+std::vector<std::vector<int32_t>> PrepareGatherV2(const std::shared_ptr<std::vector<int32_t>> &s) {
+std::vector<std::vector<int32_t>> strategies;
+strategies.push_back(*s);
+return strategies;
+}
std::vector<std::vector<int32_t>> MakeRecSearchStrategy(const std::shared_ptr<Graph> &graph,
const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t iter_graph, const size_t iter_ops) {
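For reference, a self-contained sketch of what the three shared_ptr-based helpers above return. The bodies mirror the diff; the driver in main() and the sample strategy {8, 1} are illustrative assumptions, not part of the change:

#include <cstdint>
#include <iostream>
#include <memory>
#include <vector>

using Strategies = std::vector<std::vector<int32_t>>;

// Mirrors PrepareBiasAdd above: the 1D bias reuses the split of the
// input's second dimension.
Strategies PrepareBiasAdd(const std::shared_ptr<std::vector<int32_t>> &s) {
  Strategies strategies;
  strategies.push_back(*s);
  strategies.push_back({s->at(1)});
  return strategies;
}

// Mirrors PrepareOneHot above: the two scalar on/off-value inputs get
// empty strategies.
Strategies PrepareOneHot(const std::shared_ptr<std::vector<int32_t>> &s) {
  return {*s, {}, {}};
}

// Mirrors PrepareGatherV2 above: the propagated strategy is used as-is.
Strategies PrepareGatherV2(const std::shared_ptr<std::vector<int32_t>> &s) {
  return {*s};
}

int main() {
  auto s = std::make_shared<std::vector<int32_t>>(std::vector<int32_t>{8, 1});
  Strategies bias = PrepareBiasAdd(s);  // {{8, 1}, {1}}
  std::cout << bias[1][0] << '\n';      // prints 1: bias follows dim 1's split
  return 0;
}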
@@ -270,6 +297,12 @@ std::vector<std::vector<int32_t>> PrepareStrategy(const std::shared_ptr<Graph> &
return PrepareMatMul(graph, ops, iter_graph, iter_ops);
} else if (type == PRELU) {
return PreparePReLU(graph, ops, iter_graph, iter_ops);
+} else if (type == BATCH_NORM) {
+return PrepareBatchNorm(graph, ops, iter_graph, iter_ops);
+} else if (type == SOFTMAX_CROSS_ENTROPY_WITH_LOGITS) {
+return PrepareSoftmaxWithLogits(graph, ops, iter_graph, iter_ops);
+} else if (type == SOFTMAX || type == LOG_SOFTMAX || type == SPARSE_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS) {
+return MakeDataParallelStrategy(graph, ops, iter_graph, iter_ops);
} else {
return MakeRecSearchStrategy(graph, ops, iter_graph, iter_ops);
}
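With this dispatch in place, BatchNorm and the Softmax-family losses bypass MakeRecSearchStrategy entirely; on 8 devices, for instance, a [batch, class] softmax input would receive a strategy along the lines of {8, 1}, splitting only the batch axis (the exact values come from MakeDataParallelStrategy).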
@@ -336,7 +369,7 @@ std::vector<int32_t> CopyIncomingOperatorOutputStrategy(const std::shared_ptr<Gr
std::vector<int32_t> PrepareIncomingOperatorInputStrategy(const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t incoming_op_index) {
std::vector<int32_t> s;
-if (ops[incoming_op_index]->type() == RESHAPE) {
+if (ops[incoming_op_index]->type() == RESHAPE || ops[incoming_op_index]->type() == GATHERV2) {
return s;
}
auto strategy = ops[incoming_op_index]->selected_strategy();
@@ -456,11 +489,6 @@ std::vector<int32_t> ModifyStrategyIfReduceIncoming(const std::vector<std::share
return s_Reduce;
}
-std::vector<int32_t> ModifyStrategyIfSoftmaxIncoming(std::vector<int32_t> s) {
-s.pop_back();
-return s;
-}
std::vector<int32_t> CopyIncomingOperatorInputStrategy(const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t iter_ops, const size_t incoming_op_index) {
std::vector<int32_t> s;
@@ -474,9 +502,6 @@ std::vector<int32_t> CopyIncomingOperatorInputStrategy(const std::vector<std::sh
ops[incoming_op_index]->type() == REDUCE_MIN || ops[incoming_op_index]->type() == REDUCE_MEAN) {
s = ModifyStrategyIfReduceIncoming(ops, incoming_op_index, s);
}
-if (ops[incoming_op_index]->type() == SOFTMAX_CROSS_ENTROPY_WITH_LOGITS) {
-s = ModifyStrategyIfSoftmaxIncoming(s);
-}
}
return s;
}
@@ -496,11 +521,15 @@ std::vector<std::vector<int32_t>> GenerateStrategiesFromStrategy(const std::vect
return stra;
}
+auto s_ptr = std::make_shared<std::vector<int32_t>>(basic_stra);
if (ops[iter_ops]->type() == BIAS_ADD) {
-return PrepareBiasAdd(basic_stra);
+return PrepareBiasAdd(s_ptr);
}
if (ops[iter_ops]->type() == ONEHOT) {
-return PrepareOneHot(basic_stra);
+return PrepareOneHot(s_ptr);
}
+if (ops[iter_ops]->type() == GATHERV2) {
+return PrepareGatherV2(s_ptr);
+}
for (size_t iter_op_inputs = 0; iter_op_inputs < (size_t)ops[iter_ops]->inputs_tensor_info().size();
@@ -599,7 +628,8 @@ std::vector<int32_t> CopyOutgoingOperatorInputStrategy(const std::vector<std::sh
const size_t iter_ops) {
std::vector<int32_t> s;
if (ops[iter_ops]->type() == REDUCE_MAX || ops[iter_ops]->type() == REDUCE_MIN ||
-ops[iter_ops]->type() == REDUCE_SUM || ops[iter_ops]->type() == REDUCE_MEAN || ops[iter_ops]->type() == RESHAPE) {
+ops[iter_ops]->type() == REDUCE_SUM || ops[iter_ops]->type() == REDUCE_MEAN || ops[iter_ops]->type() == RESHAPE ||
+ops[iter_ops]->type() == GATHERV2) {
return s;
}
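Together with the PrepareGatherV2 branch above, these two early returns make GatherV2 a propagation barrier: its strategy is neither pulled from an incoming operator (PrepareIncomingOperatorInputStrategy) nor reused by an outgoing one (CopyOutgoingOperatorInputStrategy), so only the strategy generated for the operator itself applies.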

View File

@@ -37,8 +37,15 @@ std::vector<std::vector<int32_t>> PrepareMatMul(const std::shared_ptr<Graph> &gr
std::vector<std::vector<int32_t>> PreparePReLU(const std::shared_ptr<Graph> &graph,
const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t iter_graph, const size_t iter_ops);
-std::vector<std::vector<int32_t>> PrepareBiasAdd(std::vector<int32_t> s);
-std::vector<std::vector<int32_t>> PrepareOneHot(std::vector<int32_t> s);
+std::vector<std::vector<int32_t>> PrepareBatchNorm(const std::shared_ptr<Graph> &graph,
+const std::vector<std::shared_ptr<OperatorInfo>> &ops,
+const size_t iter_graph, const size_t iter_ops);
+std::vector<std::vector<int32_t>> PrepareSoftmaxWithLogits(const std::shared_ptr<Graph> &graph,
+const std::vector<std::shared_ptr<OperatorInfo>> &ops,
+const size_t iter_graph, const size_t iter_ops);
+std::vector<std::vector<int32_t>> PrepareBiasAdd(const std::shared_ptr<std::vector<int32_t>> &s);
+std::vector<std::vector<int32_t>> PrepareOneHot(const std::shared_ptr<std::vector<int32_t>> &s);
+std::vector<std::vector<int32_t>> PrepareGatherV2(const std::shared_ptr<std::vector<int32_t>> &s);
std::vector<std::vector<int32_t>> MakeRecSearchStrategy(const std::shared_ptr<Graph> &graph,
const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t iter_graph, const size_t iter_ops);
@@ -64,11 +71,11 @@ std::vector<int32_t> ModifyStrategyIfSqueezeIncoming(const std::vector<std::shar
std::vector<int32_t> GetDimList(const std::vector<std::shared_ptr<OperatorInfo>> &ops, const size_t iter_ops);
std::vector<int32_t> ModifyStrategyIfReduceIncoming(const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t incoming_op_index, std::vector<int32_t> s);
-std::vector<int32_t> ModifyStrategyIfSoftmaxIncoming(std::vector<int32_t> s);
std::vector<int32_t> CopyIncomingOperatorInputStrategy(const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t iter_ops, const size_t incoming_op_index);
std::vector<std::vector<int32_t>> GenerateStrategiesFromStrategy(const std::vector<std::shared_ptr<OperatorInfo>> &ops,
-const size_t iter_ops, std::vector<int32_t> s);
+const size_t iter_ops,
+std::vector<int32_t> basic_stra);
void GenerateEliminatedOperatorStrategyForward(std::shared_ptr<Graph> graph,
const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const std::vector<std::vector<std::string>> &input_tensor_names,

View File

@@ -48,7 +48,8 @@ enum OperatorType {
kRecSqueeze,
kRecCast,
kRecReduce,
-kRecPReLU
+kRecPReLU,
+kRecGatherV2
};
enum InfoType { kApplication, kConstant };

View File

@@ -199,7 +199,7 @@ std::shared_ptr<Graph> EliminateGraph(const std::shared_ptr<Graph> graph,
OperatorType::kRecOneHot, OperatorType::kRecReLU, OperatorType::kRecLog, OperatorType::kRecExp,
OperatorType::kRecAdd, OperatorType::kRecElmWiseOp, OperatorType::kRecBiasAdd, OperatorType::kRecSub,
OperatorType::kRecMul, OperatorType::kRecDiv, OperatorType::kRecSqueeze, OperatorType::kRecReduce,
-OperatorType::kRecCast, OperatorType::kRecReshape};
+OperatorType::kRecCast, OperatorType::kRecReshape, OperatorType::kRecGatherV2};
for (size_t node_index = 0; node_index < (size_t)graph->nodes.size(); node_index++) {
auto type = graph->nodes[node_index].apply.op_type;
if (type_list.find(type) != type_list.end()) {

View File

@@ -46,6 +46,7 @@ const std::map<std::string, OperatorType> DictOpType{
{REDUCE_MAX, OperatorType::kRecReduce},
{REDUCE_MIN, OperatorType::kRecReduce},
{REDUCE_MEAN, OperatorType::kRecReduce},
+{GATHERV2, OperatorType::kRecGatherV2},
{RELU, OperatorType::kRecReLU},
{"ReLU6", OperatorType::kRecReLU},
@@ -63,9 +64,9 @@ const std::map<std::string, OperatorType> DictOpType{
{MUL, OperatorType::kRecElmWiseOp},
{DIV, OperatorType::kRecElmWiseOp},
{REAL_DIV, OperatorType::kRecElmWiseOp},
-{SOFTMAX, OperatorType::kRecElmWiseOp},
-{LOG_SOFTMAX, OperatorType::kRecElmWiseOp},
-{SOFTMAX_CROSS_ENTROPY_WITH_LOGITS, OperatorType::kRecElmWiseOp},
+{SOFTMAX, OperatorType::kRecSoftmax},
+{LOG_SOFTMAX, OperatorType::kRecSoftmax},
+{SOFTMAX_CROSS_ENTROPY_WITH_LOGITS, OperatorType::kRecSoftmax},
{SQRT, OperatorType::kRecElmWiseOp},
{NEG, OperatorType::kRecElmWiseOp},
{POW, OperatorType::kRecElmWiseOp},
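The remapping above is what routes the Softmax family into the new kRecSoftmax bucket: DictOpType translates primitive names into rec-graph op types. A hedged sketch of the lookup pattern, with plain string literals standing in for the OP_NAME constants and the enum cut down to entries shown in this diff:

#include <iostream>
#include <map>
#include <string>

enum OperatorType { kRecSoftmax, kRecElmWiseOp, kRecGatherV2, kRecUnkownType };

const std::map<std::string, OperatorType> DictOpType{
    {"Softmax", kRecSoftmax},    // previously kRecElmWiseOp
    {"LogSoftmax", kRecSoftmax},
    {"GatherV2", kRecGatherV2},  // newly mapped in this change
};

// Unknown primitives fall back to kRecUnkownType (spelling as in the source).
OperatorType LookUpOpType(const std::string &name) {
  auto it = DictOpType.find(name);
  return it == DictOpType.end() ? kRecUnkownType : it->second;
}

int main() {
  std::cout << (LookUpOpType("Softmax") == kRecSoftmax) << '\n';  // prints 1
  return 0;
}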

View File

@@ -53,9 +53,8 @@ double GetWeights(const Graph::NodeType &node) {
auto cost_ptr = std::make_shared<CostTensorAdd>();
return cost_ptr->GetMinCostIn();
-} else if (op.op_type == OperatorType::kRecReLU || op.op_type == OperatorType::kRecSoftmax ||
-op.op_type == OperatorType::kRecSparseSoftmaxCrossEntropyWithLogits) {
-// For Activation and Softmax
+} else if (op.op_type == OperatorType::kRecReLU) {
+// For Activation
auto cost_ptr = std::make_shared<CostCommon>();
return cost_ptr->GetMinCostIn();
@@ -69,11 +68,6 @@ double GetWeights(const Graph::NodeType &node) {
auto cost_ptr = std::make_shared<CostBiasAdd>();
return cost_ptr->GetMinCostIn();
-} else if (op.op_type == OperatorType::kRecBatchNorm) {
-// For BatchNorm
-auto cost_ptr = std::make_shared<CostBatchNorm>();
-return cost_ptr->GetMinCostIn(op);
} else if (op.op_type == OperatorType::kRecOneHot || op.op_type == OperatorType::kRecLog ||
op.op_type == OperatorType::kRecExp || op.op_type == OperatorType::kRecAdd ||
op.op_type == OperatorType::kRecSub || op.op_type == OperatorType::kRecMul ||
@@ -83,8 +77,10 @@ double GetWeights(const Graph::NodeType &node) {
auto cost_ptr = std::make_shared<CostCommon>();
return cost_ptr->GetMinCostIn();
-} else if (op.op_type == OperatorType::kRecUnkownType || op.op_type == OperatorType::kRecPReLU) {
-// For unknown type
+} else if (op.op_type == OperatorType::kRecUnkownType || op.op_type == OperatorType::kRecPReLU ||
+op.op_type == OperatorType::kRecBatchNorm || op.op_type == OperatorType::kRecSoftmax ||
+op.op_type == OperatorType::kRecSparseSoftmaxCrossEntropyWithLogits) {
+// For unprocessed type
return 0.0;
} else {
MS_LOG(EXCEPTION) << "Failure: GetOperatorWeight failed.";
@@ -147,9 +143,8 @@ StrategyRec PartitionNode(const Graph::NodeType &node,
auto cost_ptr = std::make_shared<CostTensorAdd>();
return cost_ptr->GetOptimalStr(node, node_name_to_strategy, *graph);
-} else if (node.apply.op_type == OperatorType::kRecReLU || node.apply.op_type == OperatorType::kRecSoftmax ||
-node.apply.op_type == OperatorType::kRecSparseSoftmaxCrossEntropyWithLogits) {
-// For Softmax & Activation
+} else if (node.apply.op_type == OperatorType::kRecReLU) {
+// For Activation
auto cost_ptr = std::make_shared<CostCommon>();
return cost_ptr->GetOptimalStr(node, node_name_to_strategy, *graph);
@@ -162,11 +157,6 @@ StrategyRec PartitionNode(const Graph::NodeType &node,
// For BiasAdd
auto cost_ptr = std::make_shared<CostBiasAdd>();
return cost_ptr->GetOptimalStr(node, node_name_to_strategy, *graph);
-} else if (node.apply.op_type == OperatorType::kRecBatchNorm) {
-// For BatchNorm
-auto cost_ptr = std::make_shared<CostBatchNorm>();
-return cost_ptr->GetOptimalStr(node, node_name_to_strategy, *graph);
} else if (node.apply.op_type == OperatorType::kRecOneHot || node.apply.op_type == OperatorType::kRecLog ||
node.apply.op_type == OperatorType::kRecExp || node.apply.op_type == OperatorType::kRecAdd ||
@@ -177,8 +167,10 @@ StrategyRec PartitionNode(const Graph::NodeType &node,
auto cost_ptr = std::make_shared<CostCommon>();
return cost_ptr->GetOptimalStr(node, node_name_to_strategy, *graph);
-} else if (node.apply.op_type == OperatorType::kRecUnkownType || node.apply.op_type == OperatorType::kRecPReLU) {
-// For unknown type
+} else if (node.apply.op_type == OperatorType::kRecUnkownType || node.apply.op_type == OperatorType::kRecPReLU ||
+node.apply.op_type == OperatorType::kRecBatchNorm || node.apply.op_type == OperatorType::kRecSoftmax ||
+node.apply.op_type == OperatorType::kRecSparseSoftmaxCrossEntropyWithLogits) {
+// For unprocessed type
StrategyRec default_strategy;
return default_strategy;
} else {
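Returning a zero weight and a default StrategyRec is the mechanism that keeps these operators out of the partition search: a zero weight in effect pushes the node to the end of the weight-sorted partition order, and when it is visited PartitionNode hands back a default StrategyRec, leaving it uncut, so the data-parallel strategy assigned later stands. A minimal sketch of the GetWeights dispatch shape, with types reduced to essentials and a plain exception standing in for MS_LOG(EXCEPTION):

#include <stdexcept>

enum OperatorType { kRecMatMul, kRecBatchNorm, kRecSoftmax, kRecUnkownType };

// Sketch: operators the partitioner may cut report a real (positive) cost;
// "unprocessed" types report zero so the search never favors cutting them.
double GetWeights(OperatorType op_type) {
  if (op_type == kRecMatMul) {
    return 1.0;  // stand-in for cost_ptr->GetMinCostIn()
  } else if (op_type == kRecBatchNorm || op_type == kRecSoftmax ||
             op_type == kRecUnkownType) {
    return 0.0;  // unprocessed types: excluded from partitioning
  }
  throw std::runtime_error("Failure: GetOperatorWeight failed.");
}

int main() {
  return GetWeights(kRecBatchNorm) == 0.0 ? 0 : 1;  // BatchNorm: zero weight
}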

View File

@@ -410,9 +410,11 @@ Status ConstructCostGraphNodesByUniqueId(const std::vector<AnfNodePtr> &all_node
ValueNodePtr prim_anf_node = cnode->input(0)->cast<ValueNodePtr>();
if (!IsAutoParallelCareNode(cnode)) {
// Needed by rec_parser
-PrimitivePtr prim = GetValueNode<PrimitivePtr>(prim_anf_node);
-if (prim->name() == TUPLE_GETITEM) {
-entire_costgraph->add_tuple_getitem(std::make_pair(cnode->UniqueId(), cnode->input(1)->UniqueId()));
+if (ParallelContext::GetInstance()->strategy_search_mode() == RECURSIVE_PROGRAMMING) {
+auto prev_cnode = GetInternalOperatorInfo(cnode, prim_anf_node);
+if (prev_cnode != nullptr) {
+entire_costgraph->add_tuple_getitem(std::make_pair(cnode->UniqueId(), prev_cnode->UniqueId()));
+}
}
continue;
}
@@ -473,9 +475,11 @@ Status ConstructCostGraphNodesByUniqueIdTC(const std::vector<AnfNodePtr> &all_no
ValueNodePtr prim_anf_node = cnode->input(0)->cast<ValueNodePtr>();
if (!IsAutoParallelCareNode(cnode)) {
// Needed by rec_parser
-PrimitivePtr prim = GetValueNode<PrimitivePtr>(prim_anf_node);
-if (prim->name() == TUPLE_GETITEM) {
-entire_costgraph->add_tuple_getitem(std::make_pair(cnode->UniqueId(), cnode->input(1)->UniqueId()));
+if (ParallelContext::GetInstance()->strategy_search_mode() == RECURSIVE_PROGRAMMING) {
+auto prev_cnode = GetInternalOperatorInfo(cnode, prim_anf_node);
+if (prev_cnode != nullptr) {
+entire_costgraph->add_tuple_getitem(std::make_pair(cnode->UniqueId(), prev_cnode->UniqueId()));
+}
}
continue;
}
@@ -1100,6 +1104,26 @@ std::vector<std::vector<std::string>> RecInputTensorNames(const std::map<std::st
return input_tensor_names;
}
+CNodePtr GetInternalOperatorInfo(const CNodePtr &cnode, const ValueNodePtr &prim_anf_node) {
+PrimitivePtr prim = GetValueNode<PrimitivePtr>(prim_anf_node);
+if (prim->name() == TUPLE_GETITEM || prim->name() == DEPEND) {
+auto prev_cnode = cnode->input(1)->cast<CNodePtr>();
+if (prev_cnode == nullptr || !IsValueNode<Primitive>(prev_cnode->input(0))) {
+return nullptr;
+}
+auto prev_prim = prev_cnode->input(0)->cast<ValueNodePtr>()->value()->cast<PrimitivePtr>();
+while (prev_prim->name() == TUPLE_GETITEM || prev_prim->name() == DEPEND) {
+prev_cnode = prev_cnode->input(1)->cast<CNodePtr>();
+if (prev_cnode == nullptr || !IsValueNode<Primitive>(prev_cnode->input(0))) {
+return nullptr;
+}
+prev_prim = prev_cnode->input(0)->cast<ValueNodePtr>()->value()->cast<PrimitivePtr>();
+}
+return prev_cnode;
+}
+return nullptr;
+}
Status ParallelStrategyRecSearch(const std::vector<AnfNodePtr> &all_nodes, const FuncGraphPtr &root) {
if (CostModelContext::GetInstance()->is_multi_subgraphs()) {
if (ConstructCostGraphNodesByUniqueIdTC(all_nodes, root) == SUCCESS) {
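GetInternalOperatorInfo walks backward through chains of TupleGetItem and Depend nodes until it finds the real producing operator (or gives up with nullptr), so the rec_parser can connect cost-graph edges across these pass-through primitives. A toy version of the same loop over an invented node type, for illustration only:

#include <iostream>
#include <memory>
#include <string>

// Invented stand-in for a CNode: a primitive name plus the node feeding
// its first input.
struct ToyNode {
  std::string prim_name;
  std::shared_ptr<ToyNode> input;
};

// Same loop shape as GetInternalOperatorInfo: skip pass-through primitives
// and return the first real producer, or nullptr if the chain dead-ends.
std::shared_ptr<ToyNode> SkipPassThrough(std::shared_ptr<ToyNode> node) {
  while (node != nullptr &&
         (node->prim_name == "TupleGetItem" || node->prim_name == "Depend")) {
    node = node->input;
  }
  return node;
}

int main() {
  auto bn = std::make_shared<ToyNode>(ToyNode{"BatchNorm", nullptr});
  auto tge = std::make_shared<ToyNode>(ToyNode{"TupleGetItem", bn});
  auto dep = std::make_shared<ToyNode>(ToyNode{"Depend", tge});
  std::cout << SkipPassThrough(dep)->prim_name << '\n';  // prints: BatchNorm
  return 0;
}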

View File

@@ -57,6 +57,8 @@ Status ParallelStrategyRecSearch(const std::vector<AnfNodePtr> &all_nodes, const
std::vector<std::vector<std::string>> RecInputTensorNames(const std::map<std::string, std::string>::iterator &it,
std::vector<std::vector<std::string>> input_tensor_names);
+CNodePtr GetInternalOperatorInfo(const CNodePtr &cnode, const ValueNodePtr &prim_anf_node);
} // namespace parallel
} // namespace mindspore
#endif // PARALLEL_STEP_AUTO_PARALLEL_H_