add op atomic clean to clear input addr in launch allreduce

lvchangquan 2021-03-25 11:19:23 +08:00
parent b9f87ba48a
commit 0a7df321fe
16 changed files with 236 additions and 29 deletions

mindspore/ccsrc/runtime/device/ascend/ascend_bucket.cc

@@ -27,6 +27,7 @@
#include "runtime/device/kernel_runtime_manager.h"
#include "runtime/device/ascend/ascend_event.h"
#include "runtime/device/ascend/ascend_launch_mul.h"
#include "runtime/device/ascend/ascend_launch_atomic_clean.h"
#include "utils/profile.h"
#define CHECK_ASCEND_RT_WITH_EXCEPTION(expression, message) \
@@ -90,16 +91,18 @@ void AscendBucket::FreeAllDeviceMem() {
ar_output_addr_ = nullptr;
}
// clear launch mul device Memory
-if (launch_kernel != nullptr) {
-launch_kernel->FreeLaunchDeviceMem();
+if (launch_mul_ != nullptr) {
+launch_mul_->FreeLaunchDeviceMem();
}
+// clear launch atomic clean device Memory
+if (launch_atomic_clean_ != nullptr) {
+launch_atomic_clean_->FreeLaunchDeviceMem();
+}
}
void AscendBucket::CopyTensorToContiguousMemory() {
-// Clean input addr
-CHECK_ASCEND_RT_WITH_EXCEPTION(rtMemsetAsync(ar_input_addr_, total_size_, 0, total_size_, compute_stream_),
-"Call rtMemsetAsync failed");
+// clear allreduce input addr
+CleanAllReduceInputAddr();
for (size_t i = 0; i < bucket_size_; ++i) {
MS_EXCEPTION_IF_NULL(memcpy_input_addrs_[i]);
MS_EXCEPTION_IF_NULL(memcpy_output_addrs_[i]);
@@ -151,15 +154,36 @@ void AscendBucket::LaunchAllReduce() {
}
}
-std::shared_ptr<LaunchKernel> AscendBucket::CreateLaunchKernel() {
+void AscendBucket::CleanAllReduceInputAddr() {
+if (launch_atomic_clean_ == nullptr) {
+launch_atomic_clean_ = CreateLaunchAtomicClean();
+MS_EXCEPTION_IF_NULL(launch_atomic_clean_);
+}
+// set atomic clean input addr
+launch_atomic_clean_->SetInputAddr(ar_input_addr_);
+// launch atomic clean
+launch_atomic_clean_->LaunchOpKernel();
+}
+std::shared_ptr<LaunchKernel> AscendBucket::CreateLaunchMul() {
if (tensor_type_list_.empty()) {
MS_LOG(ERROR) << "tensor_type_list_ is empty";
}
-auto launch_mul = std::make_shared<AscendLaunchMul>(stream_, tensor_type_list_[0], total_size_, ar_output_addr_);
+auto launch_mul = std::make_shared<AscendLaunchMul>(stream_, tensor_type_list_[0], total_size_);
MS_EXCEPTION_IF_NULL(launch_mul);
return launch_mul;
}
+std::shared_ptr<LaunchKernel> AscendBucket::CreateLaunchAtomicClean() {
+if (tensor_type_list_.empty()) {
+MS_LOG(ERROR) << "tensor_type_list_ is empty";
+}
+auto launch_atomic_clean =
+std::make_shared<AscendLaunchAtomicClean>(compute_stream_, tensor_type_list_[0], total_size_);
+MS_EXCEPTION_IF_NULL(launch_atomic_clean);
+return launch_atomic_clean;
+}
void AscendBucket::Init() {
pre_event_ = std::make_shared<AscendEvent>();
post_event_ = std::make_shared<AscendEvent>();
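
Taken together, the hunks above replace a raw runtime memset of the fused allreduce input buffer with a launched AtomicAddrClean kernel. A condensed before/after sketch (not standalone code; it reuses only the members shown in this diff and elides error checks):

```cpp
// Before: zero the fused input buffer with a runtime call on the compute stream.
CHECK_ASCEND_RT_WITH_EXCEPTION(
    rtMemsetAsync(ar_input_addr_, total_size_, 0, total_size_, compute_stream_),
    "Call rtMemsetAsync failed");

// After: zero it by launching an AtomicAddrClean single-op kernel graph.
// The launcher is created once per bucket and rebound to the current buffer
// address on every step via SetInputAddr().
if (launch_atomic_clean_ == nullptr) {
  launch_atomic_clean_ = CreateLaunchAtomicClean();
}
launch_atomic_clean_->SetInputAddr(ar_input_addr_);
launch_atomic_clean_->LaunchOpKernel();
```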

mindspore/ccsrc/runtime/device/ascend/ascend_bucket.h

@@ -34,7 +34,9 @@ class AscendBucket : public Bucket {
void FreeDeviceMem(void *dev_ptr) override;
void CopyTensorToContiguousMemory() override;
void LaunchAllReduce() override;
-std::shared_ptr<LaunchKernel> CreateLaunchKernel() override;
+std::shared_ptr<LaunchKernel> CreateLaunchMul() override;
+std::shared_ptr<LaunchKernel> CreateLaunchAtomicClean();
+void CleanAllReduceInputAddr();
};
} // namespace mindspore::device::ascend
#endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_BUCKET_H_

mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.cc

@@ -0,0 +1,114 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "runtime/device/ascend/ascend_launch_atomic_clean.h"
#include <memory>
#include <vector>
#include "abstract/utils.h"
#include "backend/session/single_kernel_graph.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "debug/anf_ir_dump.h"
namespace mindspore::device::ascend {
void AscendLaunchAtomicClean::FreeDeviceMem(void *addr) { AscendLaunchKernel::FreeDeviceMem(addr); }
size_t AscendLaunchAtomicClean::AlignSizeForLaunchKernel(size_t size) {
return AscendLaunchKernel::AlignSizeForLaunchKernel(size);
}
uint8_t *AscendLaunchAtomicClean::AllocDeviceMem(size_t size) { return AscendLaunchKernel::AllocDeviceMem(size); }
void AscendLaunchAtomicClean::KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) {
AscendLaunchKernel::KernelSelect(kernel_graph);
}
void AscendLaunchAtomicClean::KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) {
AscendLaunchKernel::KernelBuild(kernel_graph);
}
void AscendLaunchAtomicClean::LaunchOpKernel() {
if (atomic_clean_graph_ == nullptr) {
// construct atomic clean kernel graph and set attr
ConstructKernelGraphAndSetAttr();
// kernel build
KernelBuild(atomic_clean_graph_);
}
// obtain kernel_mod
if (atomic_clean_graph_->execution_order().size() != 1) {
MS_LOG(ERROR) << "the execution order of the atomic clean graph should have only one node";
}
kernel_mod_ = AnfAlgo::GetKernelMod(atomic_clean_graph_->execution_order()[0]);
MS_EXCEPTION_IF_NULL(kernel_mod_);
// obtain kernel inputs
std::vector<kernel::AddressPtr> kernel_inputs;
auto input = std::make_shared<kernel::Address>();
MS_EXCEPTION_IF_NULL(input);
input->addr = input_addr_;
MS_EXCEPTION_IF_NULL(input->addr);
input->size = total_size_;
kernel_inputs.push_back(input);
// obtain kernel outputs
auto kernel_outputs = ObtainKernelOutputs(kernel_mod_->GetOutputSizeList());
// obtain kernel workspace
auto kernel_workspaces = ObtainKernelWorkspaces(kernel_mod_->GetWorkspaceSizeList());
// launch
auto ret_status = kernel_mod_->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_);
if (!ret_status) {
MS_LOG(ERROR) << "Launch single kernel failed.";
}
}
void AscendLaunchAtomicClean::FreeLaunchDeviceMem() {
input_addr_ = nullptr;
FreeOutputAndWorkspaceDeviceMem();
}
std::shared_ptr<session::KernelGraph> AscendLaunchAtomicClean::ObtainAtomicCleanKernelGraph() {
std::vector<TypeId> input_dtypes = {dtype_};
std::vector<TypeId> output_dtypes = {};
// obtain input & output shapes
size_t dtype_size = abstract::TypeIdSize(dtype_);
int64_t shape = total_size_ / dtype_size;
std::vector<std::vector<int64_t>> input_shapes = {{shape}};
std::vector<std::vector<size_t>> output_shapes = {};
auto atomic_clean_graph = session::SingleKernelGraph::ConstructKernelGraphBasedOnSingleOp(
kAtomicAddrCleanOpName, input_dtypes, input_shapes, output_dtypes, output_shapes);
MS_EXCEPTION_IF_NULL(atomic_clean_graph);
return atomic_clean_graph;
}
void AscendLaunchAtomicClean::ConstructKernelGraphAndSetAttr() {
// construct atomic clean kernel graph
atomic_clean_graph_ = ObtainAtomicCleanKernelGraph();
MS_EXCEPTION_IF_NULL(atomic_clean_graph_);
// set atomic clean attr
if (!atomic_clean_graph_->execution_order().empty()) {
auto clean_node = atomic_clean_graph_->execution_order()[0];
// set abstract
AbstractBasePtr abstract = std::make_shared<abstract::AbstractNone>();
MS_EXCEPTION_IF_NULL(abstract);
clean_node->set_abstract(abstract);
// set build info
auto builder = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>();
builder->SetKernelType(KernelType::TBE_KERNEL);
AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), clean_node.get());
// set attr
std::vector<size_t> clean_size = {total_size_};
AnfAlgo::SetNodeAttr(kAttrAtomicAddMemSize, MakeValue(clean_size), clean_node);
}
}
} // namespace mindspore::device::ascend
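
The launcher's lifecycle can be summarised with a small runnable stand-in. This is a toy model, not MindSpore code: ToyAtomicClean only mimics the contract AscendLaunchAtomicClean exposes in this commit (the kernel graph is built lazily on the first LaunchOpKernel, the input pointer must be rebound before each launch, and FreeLaunchDeviceMem drops the borrowed input pointer; the real class additionally frees its output/workspace device memory).

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Toy stand-in for the launcher; the zero-loop stands in for the device kernel.
class ToyAtomicClean {
 public:
  explicit ToyAtomicClean(std::size_t total_size) : total_size_(total_size) {}
  void SetInputAddr(std::uint8_t *addr) { input_addr_ = addr; }
  void LaunchOpKernel() {
    if (!graph_built_) {
      graph_built_ = true;  // mirrors the lazy ConstructKernelGraphAndSetAttr()
      std::puts("building the atomic-clean graph once");
    }
    // The real kernel zeroes total_size_ bytes at input_addr_ on the stream.
    for (std::size_t i = 0; i < total_size_; ++i) input_addr_[i] = 0;
  }
  void FreeLaunchDeviceMem() { input_addr_ = nullptr; }

 private:
  std::size_t total_size_;
  std::uint8_t *input_addr_ = nullptr;
  bool graph_built_ = false;
};

int main() {
  std::uint8_t fused_buffer[16] = {1, 2, 3, 4};
  ToyAtomicClean clean(sizeof(fused_buffer));
  clean.SetInputAddr(fused_buffer);  // rebind before every launch
  clean.LaunchOpKernel();            // fused_buffer is now all zeros
  clean.FreeLaunchDeviceMem();       // forget the borrowed pointer
  std::printf("fused_buffer[0] after clean = %d\n", fused_buffer[0]);
  return 0;
}
```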

mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.h

@@ -0,0 +1,57 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_ATOMIC_CLEAN_H_
#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_ATOMIC_CLEAN_H_
#include <vector>
#include <memory>
#include "runtime/device/ascend/ascend_launch_kernel.h"
namespace mindspore::device::ascend {
class AscendLaunchAtomicClean : public AscendLaunchKernel {
public:
AscendLaunchAtomicClean(void *stream, TypeId dtype, size_t total_size)
: AscendLaunchKernel(stream),
dtype_(dtype),
total_size_(total_size),
atomic_clean_graph_(nullptr),
input_addr_(nullptr) {}
~AscendLaunchAtomicClean() override = default;
void SetInputAddr(uint8_t *input_addr) override { input_addr_ = input_addr; }
void FreeDeviceMem(void *addr) override;
size_t AlignSizeForLaunchKernel(size_t size) override;
uint8_t *AllocDeviceMem(size_t size) override;
void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) override;
void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) override;
void LaunchOpKernel() override;
void FreeLaunchDeviceMem() override;
protected:
TypeId dtype_;
size_t total_size_;
std::shared_ptr<session::KernelGraph> atomic_clean_graph_;
uint8_t *input_addr_;
private:
std::shared_ptr<session::KernelGraph> ObtainAtomicCleanKernelGraph();
void ConstructKernelGraphAndSetAttr();
};
} // namespace mindspore::device::ascend
#endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_ATOMIC_CLEAN_H_
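
To make the size bookkeeping in ObtainAtomicCleanKernelGraph and ConstructKernelGraphAndSetAttr concrete, here is a standalone example with made-up numbers (a 2 MiB float16 buffer): the byte size of the fused buffer shows up twice, once divided by the element size to form the op's 1-D input shape, and once verbatim as the kAttrAtomicAddMemSize attribute value.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical bucket: a 2 MiB fused float16 gradient buffer.
  const std::size_t total_size = 2 * 1024 * 1024;  // bytes, i.e. total_size_
  const std::size_t dtype_size = 2;                // bytes per float16 element
  // 1-D input shape handed to ConstructKernelGraphBasedOnSingleOp.
  const std::int64_t elements = static_cast<std::int64_t>(total_size / dtype_size);
  const std::vector<std::vector<std::int64_t>> input_shapes = {{elements}};
  // Attribute value stored under kAttrAtomicAddMemSize: the raw byte count.
  const std::vector<std::size_t> clean_size = {total_size};
  std::printf("input shape = {%lld}, bytes to clean = %zu\n",
              static_cast<long long>(input_shapes[0][0]), clean_size[0]);
  return 0;
}
```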

mindspore/ccsrc/runtime/device/ascend/ascend_launch_kernel.h

@@ -33,6 +33,7 @@ class AscendLaunchKernel : public LaunchKernel {
void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) override;
void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) override;
+void SetInputAddr(uint8_t *input_addr) override = 0;
void LaunchOpKernel() override = 0;
void FreeLaunchDeviceMem() override = 0;
};

mindspore/ccsrc/runtime/device/ascend/ascend_launch_mul.h

@@ -25,10 +25,11 @@
namespace mindspore::device::ascend {
class AscendLaunchMul : public AscendLaunchKernel, public LaunchMul {
public:
-AscendLaunchMul(void *stream, TypeId dtype, size_t total_size, uint8_t *input1_addr)
-: AscendLaunchKernel(stream), LaunchMul(dtype, total_size, input1_addr) {}
+AscendLaunchMul(void *stream, TypeId dtype, size_t total_size)
+: AscendLaunchKernel(stream), LaunchMul(dtype, total_size) {}
~AscendLaunchMul() override = default;
+void SetInputAddr(uint8_t *input1_addr) override { input1_addr_ = input1_addr; }
void FreeDeviceMem(void *addr) override;
size_t AlignSizeForLaunchKernel(size_t size) override;
uint8_t *AllocDeviceMem(size_t size) override;

mindspore/ccsrc/runtime/device/bucket.cc

@@ -94,12 +94,16 @@ void Bucket::CalculateMean() {
if (!grad_mean) {
return;
}
-launch_kernel = CreateLaunchKernel();
-MS_EXCEPTION_IF_NULL(launch_kernel);
+if (launch_mul_ == nullptr) {
+launch_mul_ = CreateLaunchMul();
+MS_EXCEPTION_IF_NULL(launch_mul_);
+}
+// set mul input1 addr
+launch_mul_->SetInputAddr(ar_output_addr_);
// launch mean
-launch_kernel->LaunchOpKernel();
+launch_mul_->LaunchOpKernel();
// store output tensor addr
-auto launch_output = launch_kernel->GetKernelOutputAddr();
+auto launch_output = launch_mul_->GetKernelOutputAddr();
if (launch_output.size() != 1) {
MS_LOG(ERROR) << "launch mul outputs should have one output";
}
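
For context on why a Mul follows the allreduce when grad_mean is set: allreduce yields an element-wise sum across devices, and the cached Mul kernel then scales that sum by a constant, presumably the reciprocal of the device count held in input2_value_ (its assignment is outside this diff). A toy illustration of the arithmetic, with assumed numbers:

```cpp
#include <cstdio>

int main() {
  // Toy numbers: one gradient element contributed by each of 8 devices.
  const float per_device_grad[8] = {0.1f, 0.3f, 0.2f, 0.4f, 0.1f, 0.3f, 0.2f, 0.4f};
  float allreduce_sum = 0.0f;
  for (float g : per_device_grad) allreduce_sum += g;  // what LaunchAllReduce produces
  const float scale = 1.0f / 8.0f;  // assumed value of input2_value_ (1 / device count)
  const float mean = allreduce_sum * scale;  // what the cached Mul kernel computes
  std::printf("sum = %.2f, mean = %.3f\n", allreduce_sum, mean);
  return 0;
}
```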

mindspore/ccsrc/runtime/device/bucket.h

@@ -38,7 +38,8 @@ class Bucket {
compute_stream_(nullptr),
pre_event_(nullptr),
post_event_(nullptr),
-launch_kernel(nullptr),
+launch_mul_(nullptr),
+launch_atomic_clean_(nullptr),
total_size_(0),
ar_input_addr_(nullptr),
ar_output_addr_(nullptr) {}
@@ -60,7 +61,8 @@ class Bucket {
std::shared_ptr<DeviceEvent> pre_event_;
std::shared_ptr<DeviceEvent> post_event_;
-std::shared_ptr<LaunchKernel> launch_kernel;
+std::shared_ptr<LaunchKernel> launch_mul_;
+std::shared_ptr<LaunchKernel> launch_atomic_clean_;
size_t total_size_;
uint8_t *ar_input_addr_;
@@ -77,7 +79,7 @@ class Bucket {
virtual void AllocateAllReduceAddr() = 0;
void UpdateTensorAddr();
void CalculateMean();
-virtual std::shared_ptr<LaunchKernel> CreateLaunchKernel() = 0;
+virtual std::shared_ptr<LaunchKernel> CreateLaunchMul() = 0;
virtual void LaunchAllReduce() = 0;
virtual void FreeAllDeviceMem() = 0;
virtual void FreeDeviceMem(void *dev_ptr) = 0;

mindspore/ccsrc/runtime/device/gpu/gpu_bucket.cc

@@ -92,8 +92,8 @@ void GPUBucket::FreeAllDeviceMem() {
ar_output_addr_ = nullptr;
}
// clear launch mul device memory
-if (launch_kernel != nullptr) {
-launch_kernel->FreeLaunchDeviceMem();
+if (launch_mul_ != nullptr) {
+launch_mul_->FreeLaunchDeviceMem();
}
MS_LOG(INFO) << "end";
}
@@ -156,11 +156,11 @@ void GPUBucket::LaunchAllReduce() {
MS_LOG(INFO) << "end";
}
-std::shared_ptr<LaunchKernel> GPUBucket::CreateLaunchKernel() {
+std::shared_ptr<LaunchKernel> GPUBucket::CreateLaunchMul() {
if (tensor_type_list_.empty()) {
MS_LOG(ERROR) << "tensor_type_list_ is empty";
}
-auto launch_mul = std::make_shared<GPULaunchMul>(stream_, tensor_type_list_[0], total_size_, ar_output_addr_);
+auto launch_mul = std::make_shared<GPULaunchMul>(stream_, tensor_type_list_[0], total_size_);
MS_EXCEPTION_IF_NULL(launch_mul);
return launch_mul;
}

mindspore/ccsrc/runtime/device/gpu/gpu_bucket.h

@@ -34,7 +34,7 @@ class GPUBucket : public Bucket {
void FreeDeviceMem(void *dev_ptr) override;
void CopyTensorToContiguousMemory() override;
void LaunchAllReduce() override;
-std::shared_ptr<LaunchKernel> CreateLaunchKernel() override;
+std::shared_ptr<LaunchKernel> CreateLaunchMul() override;
const void *collective_handle_;
};
} // namespace mindspore::device::gpu

mindspore/ccsrc/runtime/device/gpu/gpu_launch_kernel.h

@@ -33,6 +33,7 @@ class GPULaunchkernel : public LaunchKernel {
void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) override;
void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) override;
+void SetInputAddr(uint8_t *input_addr) override = 0;
void LaunchOpKernel() override = 0;
void FreeLaunchDeviceMem() override = 0;
};

mindspore/ccsrc/runtime/device/gpu/gpu_launch_mul.h

@@ -25,10 +25,10 @@
namespace mindspore::device::gpu {
class GPULaunchMul : public GPULaunchkernel, public LaunchMul {
public:
-GPULaunchMul(void *stream, TypeId dtype, size_t total_size, uint8_t *input1_addr)
-: GPULaunchkernel(stream), LaunchMul(dtype, total_size, input1_addr) {}
+GPULaunchMul(void *stream, TypeId dtype, size_t total_size) : GPULaunchkernel(stream), LaunchMul(dtype, total_size) {}
~GPULaunchMul() override = default;
+void SetInputAddr(uint8_t *input1_addr) override { input1_addr_ = input1_addr; }
void FreeDeviceMem(void *addr) override;
size_t AlignSizeForLaunchKernel(size_t size) override;
uint8_t *AllocDeviceMem(size_t size) override;

mindspore/ccsrc/runtime/device/launch_kernel.cc

@@ -83,7 +83,7 @@ void LaunchKernel::LaunchSingleKernel(const std::vector<uint8_t *> &inputs_addr) {
// launch
auto ret_status = kernel_mod_->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_);
if (!ret_status) {
MS_LOG(ERROR) << "Launch mul kernel failed.";
MS_LOG(ERROR) << "Launch single kernel failed.";
}
}

mindspore/ccsrc/runtime/device/launch_kernel.h

@@ -37,6 +37,7 @@ class LaunchKernel {
virtual void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) = 0;
virtual void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) = 0;
+virtual void SetInputAddr(uint8_t *input_addr) = 0;
virtual void LaunchOpKernel() = 0;
virtual void FreeLaunchDeviceMem() = 0;
@@ -46,7 +47,6 @@ class LaunchKernel {
std::vector<uint8_t *> outputs_addr_;
std::vector<uint8_t *> workspaces_addr_;
private:
std::vector<kernel::AddressPtr> ObtainKernelAddress(const std::vector<size_t> &list, std::vector<uint8_t *> *addr);
std::vector<kernel::AddressPtr> ObtainKernelInputs(const std::vector<size_t> &inputs_list,
const std::vector<uint8_t *> &inputs_addr);

mindspore/ccsrc/runtime/device/launch_mul.h

@@ -24,10 +24,10 @@
namespace mindspore::device {
class LaunchMul {
public:
-LaunchMul(TypeId dtype, size_t total_size, uint8_t *input1_addr)
+LaunchMul(TypeId dtype, size_t total_size)
: dtype_(dtype),
total_size_(total_size),
-input1_addr_(input1_addr),
+input1_addr_(nullptr),
input2_addr_(nullptr),
input2_value_(0),
mul_graph_(nullptr) {}
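
The constructor change here mirrors AscendLaunchMul and GPULaunchMul above: because buckets now cache one launcher and reuse it, a buffer address baked in at construction would go stale, so input1_addr_ starts as nullptr and must be rebound through SetInputAddr before every launch. A minimal before/after fragment (not standalone; the class and method names are those declared in this diff, the local variables are placeholders):

```cpp
// Before this commit: address fixed at construction, launcher rebuilt per call.
//   auto mul = std::make_shared<AscendLaunchMul>(stream, dtype, total_size, ar_output_addr);
//   mul->LaunchOpKernel();

// After this commit: launcher cached, address rebound before every launch.
auto mul = std::make_shared<AscendLaunchMul>(stream, dtype, total_size);
mul->SetInputAddr(ar_output_addr);  // must precede LaunchOpKernel();
mul->LaunchOpKernel();              // otherwise input1_addr_ is still nullptr
```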

tests/ut/cpp/CMakeLists.txt

@@ -106,6 +106,7 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
"../../../mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.cc"
"../../../mindspore/ccsrc/runtime/device/ascend/ascend_launch_kernel.cc"
"../../../mindspore/ccsrc/runtime/device/ascend/ascend_launch_mul.cc"
"../../../mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.cc"
"../../../mindspore/ccsrc/runtime/device/ascend/kernel_select_graph_kernel.cc"
"../../../mindspore/ccsrc/runtime/device/convert_tensor_utils.cc"
"../../../mindspore/ccsrc/runtime/device/ascend/ascend_bucket.cc"