diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_bucket.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_bucket.cc
index fd56e9b63af..6a818df9dcb 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_bucket.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_bucket.cc
@@ -27,6 +27,7 @@
 #include "runtime/device/kernel_runtime_manager.h"
 #include "runtime/device/ascend/ascend_event.h"
 #include "runtime/device/ascend/ascend_launch_mul.h"
+#include "runtime/device/ascend/ascend_launch_atomic_clean.h"
 #include "utils/profile.h"
 
 #define CHECK_ASCEND_RT_WITH_EXCEPTION(expression, message) \
@@ -90,16 +91,18 @@ void AscendBucket::FreeAllDeviceMem() {
     ar_output_addr_ = nullptr;
   }
   // clear launch mul device Memory
-  if (launch_kernel != nullptr) {
-    launch_kernel->FreeLaunchDeviceMem();
+  if (launch_mul_ != nullptr) {
+    launch_mul_->FreeLaunchDeviceMem();
+  }
+  // clear launch atomic clean device Memory
+  if (launch_atomic_clean_ != nullptr) {
+    launch_atomic_clean_->FreeLaunchDeviceMem();
   }
 }
 
 void AscendBucket::CopyTensorToContiguousMemory() {
-  // Clean input addr
-  CHECK_ASCEND_RT_WITH_EXCEPTION(rtMemsetAsync(ar_input_addr_, total_size_, 0, total_size_, compute_stream_),
-                                 "Call rtMemsetAsync failed");
-
+  // clear allreduce input addr
+  CleanAllReduceInputAddr();
   for (size_t i = 0; i < bucket_size_; ++i) {
     MS_EXCEPTION_IF_NULL(memcpy_input_addrs_[i]);
     MS_EXCEPTION_IF_NULL(memcpy_output_addrs_[i]);
@@ -151,15 +154,36 @@ void AscendBucket::LaunchAllReduce() {
   }
 }
 
-std::shared_ptr<LaunchKernel> AscendBucket::CreateLaunchKernel() {
+void AscendBucket::CleanAllReduceInputAddr() {
+  if (launch_atomic_clean_ == nullptr) {
+    launch_atomic_clean_ = CreateLaunchAtomicClean();
+    MS_EXCEPTION_IF_NULL(launch_atomic_clean_);
+  }
+  // set atomic clean input addr
+  launch_atomic_clean_->SetInputAddr(ar_input_addr_);
+  // launch atomic clean
+  launch_atomic_clean_->LaunchOpKernel();
+}
+
+std::shared_ptr<LaunchKernel> AscendBucket::CreateLaunchMul() {
   if (tensor_type_list_.empty()) {
     MS_LOG(ERROR) << "tensor_type_list_ is empty";
   }
-  auto launch_mul = std::make_shared<AscendLaunchMul>(stream_, tensor_type_list_[0], total_size_, ar_output_addr_);
+  auto launch_mul = std::make_shared<AscendLaunchMul>(stream_, tensor_type_list_[0], total_size_);
   MS_EXCEPTION_IF_NULL(launch_mul);
   return launch_mul;
 }
 
+std::shared_ptr<LaunchKernel> AscendBucket::CreateLaunchAtomicClean() {
+  if (tensor_type_list_.empty()) {
+    MS_LOG(ERROR) << "tensor_type_list_ is empty";
+  }
+  auto launch_atomic_clean =
+    std::make_shared<AscendLaunchAtomicClean>(compute_stream_, tensor_type_list_[0], total_size_);
+  MS_EXCEPTION_IF_NULL(launch_atomic_clean);
+  return launch_atomic_clean;
+}
+
 void AscendBucket::Init() {
   pre_event_ = std::make_shared<AscendEvent>();
   post_event_ = std::make_shared<AscendEvent>();
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_bucket.h b/mindspore/ccsrc/runtime/device/ascend/ascend_bucket.h
index cd3050090bf..af6f770d8d0 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_bucket.h
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_bucket.h
@@ -34,7 +34,9 @@ class AscendBucket : public Bucket {
   void FreeDeviceMem(void *dev_ptr) override;
   void CopyTensorToContiguousMemory() override;
   void LaunchAllReduce() override;
-  std::shared_ptr<LaunchKernel> CreateLaunchKernel() override;
+  std::shared_ptr<LaunchKernel> CreateLaunchMul() override;
+  std::shared_ptr<LaunchKernel> CreateLaunchAtomicClean();
+  void CleanAllReduceInputAddr();
 };
 }  // namespace mindspore::device::ascend
 #endif  // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_BUCKET_H_
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.cc
new file mode 100644
index 00000000000..6abec788283
--- /dev/null
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.cc
@@ -0,0 +1,114 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "runtime/device/ascend/ascend_launch_atomic_clean.h"
+
+#include <memory>
+#include <vector>
+#include "abstract/utils.h"
+#include "backend/session/single_kernel_graph.h"
+#include "backend/session/anf_runtime_algorithm.h"
+#include "debug/anf_ir_dump.h"
+
+namespace mindspore::device::ascend {
+void AscendLaunchAtomicClean::FreeDeviceMem(void *addr) { AscendLaunchKernel::FreeDeviceMem(addr); }
+
+size_t AscendLaunchAtomicClean::AlignSizeForLaunchKernel(size_t size) {
+  return AscendLaunchKernel::AlignSizeForLaunchKernel(size);
+}
+
+uint8_t *AscendLaunchAtomicClean::AllocDeviceMem(size_t size) { return AscendLaunchKernel::AllocDeviceMem(size); }
+
+void AscendLaunchAtomicClean::KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) {
+  AscendLaunchKernel::KernelSelect(kernel_graph);
+}
+
+void AscendLaunchAtomicClean::KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) {
+  AscendLaunchKernel::KernelBuild(kernel_graph);
+}
+
+void AscendLaunchAtomicClean::LaunchOpKernel() {
+  if (atomic_clean_graph_ == nullptr) {
+    // construct atomic clean kernel graph and set attr
+    ConstructKernelGraphAndSetAttr();
+    // kernel build
+    KernelBuild(atomic_clean_graph_);
+  }
+  // obtain kernel_mod
+  if (atomic_clean_graph_->execution_order().size() != 1) {
+    MS_LOG(ERROR) << "the execution order of the atomic clean graph should have only one node";
+  }
+  kernel_mod_ = AnfAlgo::GetKernelMod(atomic_clean_graph_->execution_order()[0]);
+  MS_EXCEPTION_IF_NULL(kernel_mod_);
+  // obtain kernel inputs
+  std::vector<kernel::AddressPtr> kernel_inputs;
+  auto input = std::make_shared<kernel::Address>();
+  MS_EXCEPTION_IF_NULL(input);
+  input->addr = input_addr_;
+  MS_EXCEPTION_IF_NULL(input->addr);
+  input->size = total_size_;
+  kernel_inputs.push_back(input);
+  // obtain kernel outputs
+  auto kernel_outputs = ObtainKernelOutputs(kernel_mod_->GetOutputSizeList());
+  // obtain kernel workspace
+  auto kernel_workspaces = ObtainKernelWorkspaces(kernel_mod_->GetWorkspaceSizeList());
+  // launch
+  auto ret_status = kernel_mod_->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_);
+  if (!ret_status) {
+    MS_LOG(ERROR) << "Launch single kernel failed.";
+  }
+}
+
+void AscendLaunchAtomicClean::FreeLaunchDeviceMem() {
+  input_addr_ = nullptr;
+  FreeOutputAndWorkspaceDeviceMem();
+}
+
+std::shared_ptr<session::KernelGraph> AscendLaunchAtomicClean::ObtainAtomicCleanKernelGraph() {
+  std::vector<TypeId> input_dtypes = {dtype_};
+  std::vector<TypeId> output_dtypes = {};
+  // obtain input & output shapes
+  size_t dtype_size = abstract::TypeIdSize(dtype_);
+  int64_t shape = total_size_ / dtype_size;
+  std::vector<std::vector<int64_t>> input_shapes = {{shape}};
+  std::vector<std::vector<size_t>> output_shapes = {};
+  auto atomic_clean_graph = session::SingleKernelGraph::ConstructKernelGraphBasedOnSingleOp(
+    kAtomicAddrCleanOpName, input_dtypes, input_shapes, output_dtypes, output_shapes);
+  MS_EXCEPTION_IF_NULL(atomic_clean_graph);
+  return atomic_clean_graph;
+}
+
+void AscendLaunchAtomicClean::ConstructKernelGraphAndSetAttr() {
+  // construct atomic clean kernel graph
+  atomic_clean_graph_ = ObtainAtomicCleanKernelGraph();
+  MS_EXCEPTION_IF_NULL(atomic_clean_graph_);
+  // set atomic clean attr
+  if (!atomic_clean_graph_->execution_order().empty()) {
+    auto clean_node = atomic_clean_graph_->execution_order()[0];
+    // set abstract
+    AbstractBasePtr abstract = std::make_shared<abstract::AbstractNone>();
+    MS_EXCEPTION_IF_NULL(abstract);
+    clean_node->set_abstract(abstract);
+    // set build info
+    auto builder = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>();
+    builder->SetKernelType(KernelType::TBE_KERNEL);
+    AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), clean_node.get());
+    // set attr
+    std::vector<size_t> clean_size = {total_size_};
+    AnfAlgo::SetNodeAttr(kAttrAtomicAddMemSize, MakeValue(clean_size), clean_node);
+  }
+}
+}  // namespace mindspore::device::ascend
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.h b/mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.h
new file mode 100644
index 00000000000..eaf3b9dcd48
--- /dev/null
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.h
@@ -0,0 +1,57 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_ATOMIC_CLEAN_H_
+#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_ATOMIC_CLEAN_H_
+
+#include <memory>
+#include <vector>
+#include "runtime/device/ascend/ascend_launch_kernel.h"
+
+namespace mindspore::device::ascend {
+class AscendLaunchAtomicClean : public AscendLaunchKernel {
+ public:
+  AscendLaunchAtomicClean(void *stream, TypeId dtype, size_t total_size)
+      : AscendLaunchKernel(stream),
+        dtype_(dtype),
+        total_size_(total_size),
+        atomic_clean_graph_(nullptr),
+        input_addr_(nullptr) {}
+  ~AscendLaunchAtomicClean() override = default;
+
+  void SetInputAddr(uint8_t *input_addr) override { input_addr_ = input_addr; }
+  void FreeDeviceMem(void *addr) override;
+  size_t AlignSizeForLaunchKernel(size_t size) override;
+  uint8_t *AllocDeviceMem(size_t size) override;
+  void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) override;
+  void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) override;
+
+  void LaunchOpKernel() override;
+  void FreeLaunchDeviceMem() override;
+
+ protected:
+  TypeId dtype_;
+  size_t total_size_;
+  std::shared_ptr<session::KernelGraph> atomic_clean_graph_;
+  uint8_t *input_addr_;
+
+ private:
+  std::shared_ptr<session::KernelGraph> ObtainAtomicCleanKernelGraph();
+  void ConstructKernelGraphAndSetAttr();
+};
+}  // namespace mindspore::device::ascend
+
+#endif  // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_ATOMIC_CLEAN_H_
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_launch_kernel.h b/mindspore/ccsrc/runtime/device/ascend/ascend_launch_kernel.h
index a228162317f..12561e3ffa4 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_launch_kernel.h
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_launch_kernel.h
@@ -33,6 +33,7 @@ class AscendLaunchKernel : public LaunchKernel {
   void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) override;
   void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) override;
 
+  void SetInputAddr(uint8_t *input_addr) override = 0;
   void LaunchOpKernel() override = 0;
   void FreeLaunchDeviceMem() override = 0;
 };
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_launch_mul.h b/mindspore/ccsrc/runtime/device/ascend/ascend_launch_mul.h
index ab090b95c99..0c8a600e0a6 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_launch_mul.h
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_launch_mul.h
@@ -25,10 +25,11 @@
 namespace mindspore::device::ascend {
 class AscendLaunchMul : public AscendLaunchKernel, public LaunchMul {
  public:
-  AscendLaunchMul(void *stream, TypeId dtype, size_t total_size, uint8_t *input1_addr)
-      : AscendLaunchKernel(stream), LaunchMul(dtype, total_size, input1_addr) {}
+  AscendLaunchMul(void *stream, TypeId dtype, size_t total_size)
+      : AscendLaunchKernel(stream), LaunchMul(dtype, total_size) {}
   ~AscendLaunchMul() override = default;
 
+  void SetInputAddr(uint8_t *input1_addr) override { input1_addr_ = input1_addr; }
   void FreeDeviceMem(void *addr) override;
   size_t AlignSizeForLaunchKernel(size_t size) override;
   uint8_t *AllocDeviceMem(size_t size) override;
diff --git a/mindspore/ccsrc/runtime/device/bucket.cc b/mindspore/ccsrc/runtime/device/bucket.cc
index c354e994dd3..dcf54d76e8a 100644
--- a/mindspore/ccsrc/runtime/device/bucket.cc
+++ b/mindspore/ccsrc/runtime/device/bucket.cc
@@ -94,12 +94,16 @@ void Bucket::CalculateMean() {
   if (!grad_mean) {
     return;
   }
-  launch_kernel = CreateLaunchKernel();
-  MS_EXCEPTION_IF_NULL(launch_kernel);
+  if (launch_mul_ == nullptr) {
+    launch_mul_ = CreateLaunchMul();
+    MS_EXCEPTION_IF_NULL(launch_mul_);
+  }
+  // set mul input1 addr
+  launch_mul_->SetInputAddr(ar_output_addr_);
   // launch mean
-  launch_kernel->LaunchOpKernel();
+  launch_mul_->LaunchOpKernel();
   // store output tensor addr
-  auto launch_output = launch_kernel->GetKernelOutputAddr();
+  auto launch_output = launch_mul_->GetKernelOutputAddr();
   if (launch_output.size() != 1) {
     MS_LOG(ERROR) << "launch mul outputs should have one output";
   }
diff --git a/mindspore/ccsrc/runtime/device/bucket.h b/mindspore/ccsrc/runtime/device/bucket.h
index a2cc276cf45..4bbe8f4a8e4 100644
--- a/mindspore/ccsrc/runtime/device/bucket.h
+++ b/mindspore/ccsrc/runtime/device/bucket.h
@@ -38,7 +38,8 @@ class Bucket {
         compute_stream_(nullptr),
         pre_event_(nullptr),
         post_event_(nullptr),
-        launch_kernel(nullptr),
+        launch_mul_(nullptr),
+        launch_atomic_clean_(nullptr),
         total_size_(0),
         ar_input_addr_(nullptr),
         ar_output_addr_(nullptr) {}
@@ -60,7 +61,8 @@
 
   std::shared_ptr<DeviceEvent> pre_event_;
   std::shared_ptr<DeviceEvent> post_event_;
-  std::shared_ptr<LaunchKernel> launch_kernel;
+  std::shared_ptr<LaunchKernel> launch_mul_;
+  std::shared_ptr<LaunchKernel> launch_atomic_clean_;
 
   size_t total_size_;
   uint8_t *ar_input_addr_;
@@ -77,7 +79,7 @@
   virtual void AllocateAllReduceAddr() = 0;
   void UpdateTensorAddr();
   void CalculateMean();
-  virtual std::shared_ptr<LaunchKernel> CreateLaunchKernel() = 0;
+  virtual std::shared_ptr<LaunchKernel> CreateLaunchMul() = 0;
   virtual void LaunchAllReduce() = 0;
   virtual void FreeAllDeviceMem() = 0;
   virtual void FreeDeviceMem(void *dev_ptr) = 0;
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_bucket.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_bucket.cc
index 2fc4ddbe06b..d6e10114fa8 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_bucket.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_bucket.cc
@@ -92,8 +92,8 @@ void GPUBucket::FreeAllDeviceMem() {
     ar_output_addr_ = nullptr;
   }
   // clear launch mul device memory
-  if (launch_kernel != nullptr) {
-    launch_kernel->FreeLaunchDeviceMem();
+  if (launch_mul_ != nullptr) {
+    launch_mul_->FreeLaunchDeviceMem();
   }
   MS_LOG(INFO) << "end";
 }
@@ -156,11 +156,11 @@ void GPUBucket::LaunchAllReduce() {
   MS_LOG(INFO) << "end";
 }
 
-std::shared_ptr<LaunchKernel> GPUBucket::CreateLaunchKernel() {
+std::shared_ptr<LaunchKernel> GPUBucket::CreateLaunchMul() {
   if (tensor_type_list_.empty()) {
     MS_LOG(ERROR) << "tensor_type_list_ is empty";
   }
-  auto launch_mul = std::make_shared<GPULaunchMul>(stream_, tensor_type_list_[0], total_size_, ar_output_addr_);
+  auto launch_mul = std::make_shared<GPULaunchMul>(stream_, tensor_type_list_[0], total_size_);
   MS_EXCEPTION_IF_NULL(launch_mul);
   return launch_mul;
 }
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_bucket.h b/mindspore/ccsrc/runtime/device/gpu/gpu_bucket.h
index 7f5211dfb78..34fecf14354 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_bucket.h
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_bucket.h
@@ -34,7 +34,7 @@ class GPUBucket : public Bucket {
   void FreeDeviceMem(void *dev_ptr) override;
   void CopyTensorToContiguousMemory() override;
   void LaunchAllReduce() override;
-  std::shared_ptr<LaunchKernel> CreateLaunchKernel() override;
+  std::shared_ptr<LaunchKernel> CreateLaunchMul() override;
   const void *collective_handle_;
 };
 }  // namespace mindspore::device::gpu
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_launch_kernel.h b/mindspore/ccsrc/runtime/device/gpu/gpu_launch_kernel.h
index ae3dd575f96..e438ca3f3fb 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_launch_kernel.h
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_launch_kernel.h
@@ -33,6 +33,7 @@ class GPULaunchkernel : public LaunchKernel {
   void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) override;
   void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) override;
 
+  void SetInputAddr(uint8_t *input_addr) override = 0;
   void LaunchOpKernel() override = 0;
   void FreeLaunchDeviceMem() override = 0;
 };
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_launch_mul.h b/mindspore/ccsrc/runtime/device/gpu/gpu_launch_mul.h
index e5798a932f0..ff22d041707 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_launch_mul.h
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_launch_mul.h
@@ -25,10 +25,10 @@
 namespace mindspore::device::gpu {
 class GPULaunchMul : public GPULaunchkernel, public LaunchMul {
  public:
-  GPULaunchMul(void *stream, TypeId dtype, size_t total_size, uint8_t *input1_addr)
-      : GPULaunchkernel(stream), LaunchMul(dtype, total_size, input1_addr) {}
+  GPULaunchMul(void *stream, TypeId dtype, size_t total_size) : GPULaunchkernel(stream), LaunchMul(dtype, total_size) {}
   ~GPULaunchMul() override = default;
 
+  void SetInputAddr(uint8_t *input1_addr) override { input1_addr_ = input1_addr; }
   void FreeDeviceMem(void *addr) override;
   size_t AlignSizeForLaunchKernel(size_t size) override;
   uint8_t *AllocDeviceMem(size_t size) override;
diff --git a/mindspore/ccsrc/runtime/device/launch_kernel.cc b/mindspore/ccsrc/runtime/device/launch_kernel.cc
index b90ba1610a5..cf0101b86cf 100644
--- a/mindspore/ccsrc/runtime/device/launch_kernel.cc
+++ b/mindspore/ccsrc/runtime/device/launch_kernel.cc
@@ -83,7 +83,7 @@ void LaunchKernel::LaunchSingleKernel(const std::vector<uint8_t *> &inputs_addr)
   // launch
   auto ret_status = kernel_mod_->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_);
   if (!ret_status) {
-    MS_LOG(ERROR) << "Launch mul kernel failed.";
+    MS_LOG(ERROR) << "Launch single kernel failed.";
   }
 }
 
diff --git a/mindspore/ccsrc/runtime/device/launch_kernel.h b/mindspore/ccsrc/runtime/device/launch_kernel.h
index 0f75a9572e9..8e532a75560 100644
--- a/mindspore/ccsrc/runtime/device/launch_kernel.h
+++ b/mindspore/ccsrc/runtime/device/launch_kernel.h
@@ -37,6 +37,7 @@
   virtual void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) = 0;
   virtual void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) = 0;
 
+  virtual void SetInputAddr(uint8_t *input_addr) = 0;
   virtual void LaunchOpKernel() = 0;
   virtual void FreeLaunchDeviceMem() = 0;
 
@@ -46,7 +47,6 @@
   std::vector<uint8_t *> outputs_addr_;
   std::vector<uint8_t *> workspaces_addr_;
 
- private:
   std::vector<kernel::AddressPtr> ObtainKernelAddress(const std::vector<size_t> &list, std::vector<uint8_t *> *addr);
   std::vector<kernel::AddressPtr> ObtainKernelInputs(const std::vector<size_t> &inputs_list,
                                                      const std::vector<uint8_t *> &inputs_addr);
diff --git a/mindspore/ccsrc/runtime/device/launch_mul.h b/mindspore/ccsrc/runtime/device/launch_mul.h
index 1b2c4651397..461a91f9735 100644
--- a/mindspore/ccsrc/runtime/device/launch_mul.h
+++ b/mindspore/ccsrc/runtime/device/launch_mul.h
@@ -24,10 +24,10 @@
 namespace mindspore::device {
 class LaunchMul {
  public:
-  LaunchMul(TypeId dtype, size_t total_size, uint8_t *input1_addr)
+  LaunchMul(TypeId dtype, size_t total_size)
     : dtype_(dtype),
       total_size_(total_size),
-      input1_addr_(input1_addr),
+      input1_addr_(nullptr),
       input2_addr_(nullptr),
       input2_value_(0),
       mul_graph_(nullptr) {}
diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt
index 42d3fe57983..27aa68c49f6 100644
--- a/tests/ut/cpp/CMakeLists.txt
+++ b/tests/ut/cpp/CMakeLists.txt
@@ -106,6 +106,7 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
         "../../../mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.cc"
         "../../../mindspore/ccsrc/runtime/device/ascend/ascend_launch_kernel.cc"
         "../../../mindspore/ccsrc/runtime/device/ascend/ascend_launch_mul.cc"
"../../../mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.cc" "../../../mindspore/ccsrc/runtime/device/ascend/kernel_select_graph_kernel.cc" "../../../mindspore/ccsrc/runtime/device/convert_tensor_utils.cc" "../../../mindspore/ccsrc/runtime/device/ascend/ascend_bucket.cc"