!17819 Add all gather fusion and concat pass for gpu

Merge pull request !17819 from ZPaC/master-add-gpu-all-gather-fusion
i-robot 2021-06-07 14:32:32 +08:00 committed by Gitee
commit 8fe3da0ddc
4 changed files with 202 additions and 4 deletions

mindspore/ccsrc/backend/optimizer/gpu/concat_outputs_for_all_gather.cc (new file)

@@ -0,0 +1,162 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/gpu/concat_outputs_for_all_gather.h"
#include <string>
#include <tuple>
#include <utility>
#include "backend/session/anf_runtime_algorithm.h"
namespace mindspore::opt {
namespace {
using OutputInfo =
  std::tuple<std::vector<TypeId>, std::vector<std::vector<size_t>>, std::vector<std::string>, std::vector<TypeId>>;

// Collects, for every output of `node`, the inferred data type and shape plus the
// device format and device data type chosen by kernel selection.
OutputInfo GetNodeOutputInfo(const AnfNodePtr &node) {
  MS_EXCEPTION_IF_NULL(node);
  std::vector<TypeId> output_infer_dtype;
  std::vector<std::vector<size_t>> output_infer_shape;
  std::vector<std::string> output_format;
  std::vector<TypeId> output_device_dtype;
  auto type_ptr = node->Type();
  auto shape_ptr = node->Shape();
  size_t output_num = AnfAlgo::GetOutputTensorNum(node);
  auto kernel_info = static_cast<device::KernelInfo *>(node->kernel_info());
  MS_EXCEPTION_IF_NULL(kernel_info);
  auto build_info = kernel_info->select_kernel_build_info();
  MS_EXCEPTION_IF_NULL(build_info);
  for (size_t i = 0; i < output_num; i++) {
    output_infer_dtype.emplace_back(AnfAlgo::GetOutputInferDataType(type_ptr, i));
    output_infer_shape.emplace_back(AnfAlgo::GetOutputInferShape(node, shape_ptr, i));
    output_format.emplace_back(build_info->GetOutputFormat(i));
    output_device_dtype.emplace_back(build_info->GetOutputDeviceType(i));
  }
  return {output_infer_dtype, output_infer_shape, output_format, output_device_dtype};
}
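
// Output layout assumed by the code below (derived from the index arithmetic in
// InsertConcatForOutput; the original commit does not state it explicitly): for a
// fused AllGather with n inputs on rank_size ranks, output k holds input (k % n)
// gathered from rank (k / n). E.g. n = 2, rank_size = 4 yields 8 outputs:
//   out0 = in0@rank0, out1 = in1@rank0, out2 = in0@rank1, ..., out7 = in1@rank3.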
// Builds the kernel build info for a Concat node that merges one logical output of
// the fused AllGather: the allgather_input_idx-th slice from every rank's block.
kernel::KernelBuildInfoPtr GenerateKernelBuildInfo(const AnfNodePtr &concat, const OutputInfo &allgather_output_info,
                                                   size_t allgather_input_num, size_t allgather_input_idx) {
  MS_EXCEPTION_IF_NULL(concat);
  std::vector<std::string> inputs_device_format;
  std::vector<std::string> outputs_device_format;
  std::vector<TypeId> inputs_device_type;
  std::vector<TypeId> outputs_device_type;
  kernel::KernelBuildInfo::KernelBuildInfoBuilder builder;
  size_t concat_input_num = AnfAlgo::GetInputTensorNum(concat);
  for (size_t i = 0; i < concat_input_num; ++i) {
    size_t input_index = allgather_input_idx + i * allgather_input_num;
    inputs_device_format.emplace_back(std::get<2>(allgather_output_info)[input_index]);
    inputs_device_type.emplace_back(std::get<3>(allgather_output_info)[input_index]);
  }
  // Currently only the default format & float16 are supported, so all inputs must agree.
  auto cmp_format = inputs_device_format.begin();
  auto format_iter = std::find_if(inputs_device_format.begin(), inputs_device_format.end(),
                                  [&](const auto &format) { return format != (*cmp_format); });
  if (format_iter != inputs_device_format.end()) {
    MS_LOG(EXCEPTION) << "Input formats are not the same, value: " << (*format_iter)
                      << ", expected format: " << (*cmp_format);
  }
  auto cmp_dtype = inputs_device_type.begin();
  auto dtype_iter = std::find_if(inputs_device_type.begin(), inputs_device_type.end(),
                                 [&](const auto &dtype) { return dtype != (*cmp_dtype); });
  if (dtype_iter != inputs_device_type.end()) {
    MS_LOG(EXCEPTION) << "Input dtypes are not the same, value: " << TypeIdLabel(*dtype_iter)
                      << ", expected dtype: " << TypeIdLabel(*cmp_dtype);
  }
  outputs_device_format.emplace_back(*cmp_format);
  outputs_device_type.emplace_back(*cmp_dtype);
  builder.SetFusionType(kernel::FusionType::OPAQUE);
  builder.SetInputsFormat(inputs_device_format);
  builder.SetOutputsFormat(outputs_device_format);
  builder.SetInputsDeviceType(inputs_device_type);
  builder.SetOutputsDeviceType(outputs_device_type);
  return builder.Build();
}
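
// Index walk, illustrated (values chosen for the example, not fixed by the code):
// with allgather_input_num = 2, rank_size = 4 and allgather_input_idx = 1,
// concat_input_num == rank_size == 4, so the loop above picks fused-AllGather
// outputs 1, 3, 5, 7 - i.e. input 1's slice from every rank.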
// For each original AllGather input, builds a Concat (axis 0) over that input's
// per-rank outputs, then bundles all the Concat nodes into a single MakeTuple.
AnfNodePtr InsertConcatForOutput(const FuncGraphPtr &func_graph, const AnfNodePtr &node, const OutputInfo &output_info,
                                 const std::vector<AnfNodePtr> &new_tuple_getitems, int64_t rank_size) {
  MS_EXCEPTION_IF_NULL(func_graph);
  std::vector<AnfNodePtr> make_tuple_inputs{NewValueNode(std::make_shared<Primitive>(prim::kPrimMakeTuple->name()))};
  size_t inputs_size = AnfAlgo::GetInputTensorNum(node);
  for (size_t i = 0; i < inputs_size; ++i) {
    std::vector<AnfNodePtr> concat_inputs{NewValueNode(std::make_shared<Primitive>(prim::kPrimConcat->name()))};
    for (size_t j = 0, idx = i; j < LongToSize(rank_size); ++j, idx += inputs_size) {
      concat_inputs.push_back(new_tuple_getitems[idx]);
    }
    auto concat = func_graph->NewCNode(concat_inputs);
    MS_EXCEPTION_IF_NULL(concat);
    MS_EXCEPTION_IF_NULL(new_tuple_getitems[i]);
    const std::vector<TypeId> &dtypes = {std::get<0>(output_info)[i]};
    const auto &shape = std::get<1>(output_info)[i];
    std::vector<std::vector<size_t>> shapes = {shape};
    // Concatenating along axis 0 multiplies the first dimension by the rank count.
    shapes[0][0] *= LongToSize(rank_size);
    AnfAlgo::SetOutputInferTypeAndShape(dtypes, shapes, concat.get());
    AnfAlgo::SetNodeAttr(kAttrAxis, MakeValue(static_cast<int64_t>(0)), concat);
    AnfAlgo::SetNodeAttr(kAttrInputNums, MakeValue(rank_size), concat);
    std::vector<int64_t> dyn_input_size{rank_size};
    AnfAlgo::SetNodeAttr(kAttrDynInputSizes, MakeValue(dyn_input_size), concat);
    auto kernel_build_info = GenerateKernelBuildInfo(concat, output_info, inputs_size, i);
    AnfAlgo::SetSelectKernelBuildInfo(kernel_build_info, concat.get());
    make_tuple_inputs.push_back(concat);
  }
  auto make_tuple = func_graph->NewCNode(make_tuple_inputs);
  return make_tuple;
}
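
// Net effect of the rewrite, sketched for n = 2 fused inputs and rank_size = 2
// (so the fused AllGather has 4 outputs; getitemK is TupleGetItem(AllGather, K)):
//   MakeTuple(Concat(getitem0, getitem2),   // input 0 gathered from ranks 0, 1
//             Concat(getitem1, getitem3))   // input 1 gathered from ranks 0, 1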
}  // namespace

const BaseRef ConcatOutputsForAllGather::DefinePattern() const {
  VarPtr Xs = std::make_shared<SeqVar>();
  auto prim = std::make_shared<Primitive>(kAllGatherOpName);
  return VectorRef({prim, Xs});
}
const AnfNodePtr ConcatOutputsForAllGather::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
                                                    const EquivPtr &) const {
  MS_EXCEPTION_IF_NULL(node);
  auto cnode = node->cast<CNodePtr>();
  MS_EXCEPTION_IF_NULL(cnode);
  if (!AnfAlgo::HasNodeAttr(kAttrFusion, cnode) || !AnfAlgo::HasNodeAttr(kAttrRankSize, cnode)) {
    return nullptr;
  }
  auto fusion = AnfAlgo::GetNodeAttr<int64_t>(cnode, kAttrFusion);
  if (fusion <= 0) {
    return nullptr;
  }
  // Skip nodes this pass has already rewritten and un-fused single-input AllGathers.
  if (AnfAlgo::HasNodeAttr("fused", cnode) || AnfAlgo::GetInputTensorNum(node) == 1) {
    return nullptr;
  }
  AnfAlgo::SetNodeAttr("fused", MakeValue(true), node);
  auto rank_size = AnfAlgo::GetNodeAttr<int64_t>(node, kAttrRankSize);
  std::vector<AnfNodePtr> new_outputs;
  OutputInfo output_info = GetNodeOutputInfo(node);
  size_t output_num = AnfAlgo::GetOutputTensorNum(node);
  // Materialize a TupleGetItem for every output of the fused AllGather.
  for (size_t i = 0; i < output_num; ++i) {
    int64_t temp = SizeToLong(i);
    auto idx = NewValueNode(temp);
    MS_EXCEPTION_IF_NULL(idx);
    auto imm = std::make_shared<Int64Imm>(temp);
    auto abstract_scalar = std::make_shared<abstract::AbstractScalar>(imm);
    idx->set_abstract(abstract_scalar);
    auto tuple_getitem = func_graph->NewCNode({NewValueNode(prim::kPrimTupleGetItem), node, idx});
    MS_EXCEPTION_IF_NULL(tuple_getitem);
    AnfAlgo::SetOutputInferTypeAndShape({std::get<0>(output_info)[i]}, {std::get<1>(output_info)[i]},
                                        tuple_getitem.get());
    new_outputs.emplace_back(std::move(tuple_getitem));
  }
  return InsertConcatForOutput(func_graph, node, output_info, new_outputs, rank_size);
}
}  // namespace mindspore::opt
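
The pass is driven by the standard backend optimizer machinery. A minimal sketch of running it in isolation follows; it reuses the GraphOptimizer/PassManager API visible in the gpu_session.cc hunk below, assumes GraphOptimizer::Optimize as the driver (not shown in this commit), and `kernel_graph` is a hypothetical, already kernel-selected KernelGraph:

// A minimal sketch, not part of this commit.
auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
pm->AddPass(std::make_shared<opt::ConcatOutputsForAllGather>());
optimizer->AddPassManager(pm);
// Rewrites the outputs of every fused AllGather in the graph.
(void)optimizer->Optimize(kernel_graph);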

mindspore/ccsrc/backend/optimizer/gpu/concat_outputs_for_all_gather.h (new file)

@@ -0,0 +1,35 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_CONCAT_OUTPUTS_FOR_ALLGATHER_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_CONCAT_OUTPUTS_FOR_ALLGATHER_H_
#include <memory>
#include <vector>
#include "backend/optimizer/common/optimizer.h"
namespace mindspore {
namespace opt {
class ConcatOutputsForAllGather : public PatternProcessPass {
 public:
  explicit ConcatOutputsForAllGather(bool multigraph = true)
      : PatternProcessPass("concat_outputs_for_all_gather", multigraph) {}
  ~ConcatOutputsForAllGather() override = default;
  const BaseRef DefinePattern() const override;
  const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
};
}  // namespace opt
}  // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_CONCAT_OUTPUTS_FOR_ALLGATHER_H_

mindspore/ccsrc/backend/session/gpu_session.cc

@@ -51,6 +51,7 @@
 #endif
 #include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
 #include "backend/optimizer/pass/communication_op_fusion.h"
+#include "backend/optimizer/gpu/concat_outputs_for_all_gather.h"
 #include "backend/optimizer/pass/getitem_tuple.h"
 #include "common/trans.h"
 #include "debug/anf_ir_dump.h"
@@ -175,6 +176,8 @@ void GPUSession::HardwareOptimize(const std::shared_ptr<KernelGraph> &kernel_gra
   pm->AddPass(std::make_shared<opt::AddReluV2Fusion>());
   pm->AddPass(std::make_shared<opt::AddReluGradV2Fusion>());
   pm->AddPass(std::make_shared<opt::AllReduceFusion>());
+  pm->AddPass(std::make_shared<opt::AllGatherFusion>());
+  pm->AddPass(std::make_shared<opt::ConcatOutputsForAllGather>());
   pm->AddPass(std::make_shared<opt::GetitemTuple>());
   pm->AddPass(std::make_shared<opt::ReducePrecisionFusion>("reduce_precision"));
   optimizer->AddPassManager(pm);
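
The pass order here matters: AllGatherFusion (reused from the existing communication_op_fusion.h, which this commit does not modify) first merges consecutive AllGather ops into one fused node; ConcatOutputsForAllGather then restores per-parameter outputs by concatenating each input's per-rank slices; and GetitemTuple cleans up the TupleGetItem(MakeTuple(...)) chains the rewrite leaves behind.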

mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc

@@ -1052,7 +1052,8 @@ void GPUKernelRuntime::AllocCommunicationOpDynamicRes(const session::KernelGraph
   auto &kernels = graph->execution_order();
   for (auto &kernel : kernels) {
     MS_EXCEPTION_IF_NULL(kernel);
-    if (AnfAlgo::IsCommunicationOp(kernel)) {
+    if (AnfAlgo::IsCommunicationOp(kernel) && AnfAlgo::GetCNodeName(kernel) != kHcomSendOpName &&
+        AnfAlgo::GetCNodeName(kernel) != kReceiveOpName) {
       AllocCommunicationOpInputDynamicRes(kernel);
       AllocCommunicationOpOutputDynamicRes(kernel);
     }
@@ -1120,9 +1121,6 @@ void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::Anf
 void GPUKernelRuntime::AllocCommunicationOpMemory(bool is_need_alloc_memory, bool, const DeviceAddressPtrList addr_list,
                                                   size_t total_size, std::vector<size_t> size_list) {
   MS_EXCEPTION_IF_NULL(mem_manager_);
-  if (!is_need_alloc_memory) {
-    return;
-  }
   auto ret = mem_manager_->MallocContinuousMemFromMemPool(addr_list, total_size, size_list);
   if (!ret) {
     MS_LOG(EXCEPTION) << "Malloc device memory failed.";
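
Two runtime changes accompany the new pass: point-to-point Send/Receive kernels are now excluded from communication-op continuous-memory allocation (they have no fused inputs or outputs to lay out contiguously), and AllocCommunicationOpMemory drops its early is_need_alloc_memory return, so the contiguous block is always requested from the memory pool, presumably to guarantee that a fused AllGather's inputs and outputs land at contiguous device addresses.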