diff --git a/akg b/akg
index f8f4e60bf3c..72b359ad457 160000
--- a/akg
+++ b/akg
@@ -1 +1 @@
-Subproject commit f8f4e60bf3c435cec41cbe48fe24901277ef9556
+Subproject commit 72b359ad457ed8f4f254c8a3bd2bde88967202fb
diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_json_generator.cc b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_json_generator.cc
index 2e7c594e09a..b56723277e3 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_json_generator.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_json_generator.cc
@@ -558,7 +558,7 @@ bool AkgKernelJsonGenerator::CollectJson(const AnfNodePtr &anf_node, nlohmann::j
 bool AkgKernelJsonGenerator::CollectFusedJson(const std::vector<AnfNodePtr> &anf_nodes,
                                               const std::vector<AnfNodePtr> &input_list,
                                               const std::vector<AnfNodePtr> &output_list, nlohmann::json *kernel_json) {
-  if (anf_nodes.empty() || input_list.empty()) {
+  if (anf_nodes.empty()) {
     MS_LOG(ERROR) << "Invalid input size, anf_nodes [" << anf_nodes.size() << "], input_list [" << input_list.size()
                   << "].";
     return false;
diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean_gpu.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean_gpu.cc
index f753e8fba7d..2b73ff88404 100644
--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean_gpu.cc
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean_gpu.cc
@@ -374,13 +374,10 @@ CNodePtr AtomicCleanInsertter::CreateAtomicCleanCompositeNode(const KernelGraphP
   // Create composite op's sub-graph.
   auto new_sub_graph = std::make_shared<FuncGraph>();
-  auto parameter = new_sub_graph->add_parameter();
-  parameter->set_abstract(value_node->abstract());
-  parameter->set_kernel_info(value_node->kernel_info_ptr());
-  AnfNodePtr broadcast_input_node = parameter;
+  AnfNodePtr broadcast_input_node = value_node;
   if (dst_type == kNumberTypeFloat16) {
-    AnfNodePtrList cast_inputs = {NewValueNode(prim::kPrimCast), parameter};
+    AnfNodePtrList cast_inputs = {NewValueNode(prim::kPrimCast), value_node};
     auto cast_node_inner =
       CreateCNode(cast_inputs, new_sub_graph, {.format = format, .shape = {1}, .type = TypeIdToType(dst_type)});
     AnfAlgo::SetNodeAttr("dst_type", MakeValue("float32"), cast_node_inner);
@@ -400,12 +397,13 @@ CNodePtr AtomicCleanInsertter::CreateAtomicCleanCompositeNode(const KernelGraphP
   // Makeup sub-graph.
   new_sub_graph->set_output(broadcast_to_node_inner);
-  auto broadcast_to_composite_node = main_graph->NewCNode({NewValueNode(new_sub_graph), value_node});
+  auto broadcast_to_composite_node = main_graph->NewCNode({NewValueNode(new_sub_graph)});
   broadcast_to_composite_node->set_abstract(broadcast_to_node_inner->abstract());
-  SetNewKernelInfo(broadcast_to_composite_node, new_sub_graph, {value_node}, {broadcast_to_node_inner},
-                   kernel::Processor::CUDA);
+  SetNewKernelInfo(broadcast_to_composite_node, new_sub_graph, {}, {broadcast_to_node_inner},
+                   AnfAlgo::GetProcessor(atomic_add_node_));
   auto graph_attr = ExtractGraphKernelName(TopoSort(new_sub_graph->get_return()), "", "atomic_clean");
   new_sub_graph->set_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL, MakeValue(graph_attr));
+  new_sub_graph->set_attr("composite_type", MakeValue("atomic_clean"));
   return broadcast_to_composite_node;
 }
diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/clean_all_in_once.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/clean_all_in_once.cc
new file mode 100644
index 00000000000..4354cddee48
--- /dev/null
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/clean_all_in_once.cc
@@ -0,0 +1,123 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "backend/optimizer/graph_kernel/clean_all_in_once.h"
+#include <algorithm>
+#include <map>
+#include <string>
+#include <tuple>
+#include <vector>
+#include "backend/session/anf_runtime_algorithm.h"
+#include "backend/kernel_compiler/common_utils.h"
+#include "backend/optimizer/graph_kernel/graph_kernel_helper.h"
+
+namespace mindspore {
+namespace opt {
+namespace {
+ShapeVector GetValidShape(const AnfNodePtr &node) {
+  // The returned shape will not contain leading 1s.
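+  // e.g. a buffer of shape (1, 1, 32, 64) yields a valid shape of (32, 64), so clean nodes whose
+  // buffers have the same effective size fall into the same group in CleanAllInOnce::Run below.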
+  auto shape = GetShape(node);
+  ShapeVector valid_shape;
+  bool valid = false;
+  for (auto s : shape) {
+    if (!valid && s == 1) {
+      continue;
+    }
+    valid = true;
+    valid_shape.push_back(s);
+  }
+  return valid_shape;
+}
+
+bool IsAtomicCleanNode(const AnfNodePtr &node) {
+  MS_EXCEPTION_IF_NULL(node);
+  auto cnode = node->cast<CNodePtr>();
+  MS_EXCEPTION_IF_NULL(cnode);
+  auto func_graph = GetValueNode<FuncGraphPtr>(cnode->input(kAnfPrimitiveIndex));
+  MS_EXCEPTION_IF_NULL(func_graph);
+  if (!func_graph->has_attr("composite_type")) {
+    return false;
+  }
+
+  auto ctype_value = func_graph->get_attr("composite_type");
+  if (!ctype_value->isa<StringImm>()) {
+    MS_LOG(EXCEPTION) << "Attribute composite_type should be a string!";
+  }
+  auto ctype = GetValue<std::string>(ctype_value);
+  return ctype == "atomic_clean";
+}
+
+std::vector<AnfNodePtrList> SplitVectorByWidth(const AnfNodePtrList &nodes, int width) {
+  std::vector<AnfNodePtrList> splitted_nodes;
+  if (nodes.empty()) {
+    return splitted_nodes;
+  }
+
+  int num = (nodes.size() - 1) / width + 1;
+  splitted_nodes.resize(num);
+  for (size_t i = 0; i < nodes.size(); ++i) {
+    splitted_nodes[i / width].push_back(nodes[i]);
+  }
+  return splitted_nodes;
+}
+}  // namespace
+
+bool CleanAllInOnce::Run(const FuncGraphPtr &func_graph) {
+  MS_EXCEPTION_IF_NULL(func_graph);
+  auto mng = func_graph->manager();
+  if (mng == nullptr) {
+    mng = Manage(func_graph, true);
+    func_graph->set_manager(mng);
+  }
+  auto todos = TopoSort(func_graph->get_return());
+  std::map<ShapeVector, AnfNodePtrList> clean_map;
+  std::for_each(todos.cbegin(), todos.cend(), [&clean_map](const AnfNodePtr &node) {
+    if (AnfAlgo::IsGraphKernel(node) && IsAtomicCleanNode(node)) {
+      auto valid_shape = GetValidShape(node);
+      auto iter = clean_map.find(valid_shape);
+      if (iter != clean_map.end()) {
+        iter->second.push_back(node);
+      } else {
+        clean_map.insert({valid_shape, {node}});
+      }
+    }
+  });
+
+  bool changed = false;
+  if (!clean_map.empty()) {
+    for (auto iter : clean_map) {
+      // Cleaning all buffers in one node is not good, so clean at most ten in one node.
+      auto splitted_nodes = SplitVectorByWidth(iter.second, 10);
+      for (auto &snodes : splitted_nodes) {
+        if (snodes.size() < 2) {
+          continue;
+        }
+        AnfNodePtr clean_all_node;
+        std::tie(clean_all_node, std::ignore) = FuseNodesToSubGraph(snodes, func_graph, "clean_all");
+        MS_LOG(INFO) << "Add node to clean batch buffers in once(" << clean_all_node->fullname_with_scope()
+                     << ") for atomic add!";
+        changed = true;
+      }
+    }
+  }
+
+  if (changed) {
+    mng->RemoveRoots();
+    mng->KeepRoots({func_graph});
+  }
+  return changed;
+}
+}  // namespace opt
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/clean_all_in_once.h b/mindspore/ccsrc/backend/optimizer/graph_kernel/clean_all_in_once.h
new file mode 100644
index 00000000000..59a8d622f9f
--- /dev/null
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/clean_all_in_once.h
@@ -0,0 +1,34 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_CLEAN_ALL_IN_ONCE_H_
+#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_CLEAN_ALL_IN_ONCE_H_
+#include <memory>
+#include <string>
+#include "backend/optimizer/common/pass.h"
+#include "ir/func_graph.h"
+
+namespace mindspore {
+namespace opt {
+class CleanAllInOnce : public Pass {
+ public:
+  CleanAllInOnce() : Pass("clean_all_in_once") {}
+  ~CleanAllInOnce() override = default;
+  bool Run(const FuncGraphPtr &func_graph) override;
+};
+using CleanAllInOncePtr = std::shared_ptr<CleanAllInOnce>;
+}  // namespace opt
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_CLEAN_ALL_IN_ONCE_H_
diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc
index 994c5c5f6d5..b239ec5f304 100644
--- a/mindspore/ccsrc/backend/session/gpu_session.cc
+++ b/mindspore/ccsrc/backend/session/gpu_session.cc
@@ -38,6 +38,7 @@
 #include "backend/optimizer/graph_kernel/add_atomic_clean_gpu.h"
 #include "backend/optimizer/graph_kernel/arithmetic_simplify.h"
 #include "backend/optimizer/graph_kernel/basic_ops_fusion.h"
+#include "backend/optimizer/graph_kernel/clean_all_in_once.h"
 #include "backend/optimizer/graph_kernel/eliminate_redundant_output.h"
 #include "backend/optimizer/graph_kernel/tensor_promotion.h"
 #include "backend/optimizer/graph_kernel/graph_kernel_splitter.h"
@@ -182,6 +183,7 @@ void GPUSession::GraphKernelOptimize(const std::shared_ptr<KernelGraph> &kernel_
   // will be exposed, use GetitemTuple Pass to delete them.
   pm->AddPass(std::make_shared<opt::GetitemTuple>());
   pm->AddPass(std::make_shared<opt::AtomicCleanInsertter>());
+  pm->AddPass(std::make_shared<opt::CleanAllInOnce>());
   pm->AddPass(std::make_shared());
   optimizer->AddPassManager(pm);
   (void)optimizer->Optimize(kernel_graph);
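
For reference, below is a minimal standalone sketch of the grouping and batching strategy that CleanAllInOnce::Run introduces above: atomic-clean composites are grouped by their buffer shape with leading 1s stripped, and each group is fused in batches of at most ten nodes. It uses only standard C++ containers; the names ValidShape and SplitByWidth and the sample shapes are illustrative stand-ins for the patch's GetValidShape and SplitVectorByWidth, not MindSpore APIs.

    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <vector>

    using Shape = std::vector<int64_t>;

    // Drop leading 1s so that e.g. (1, 1, 32) and (32) describe the same effective buffer size.
    Shape ValidShape(const Shape &shape) {
      size_t i = 0;
      while (i < shape.size() && shape[i] == 1) {
        ++i;
      }
      return Shape(shape.begin() + i, shape.end());
    }

    // Split one group into batches of at most `width` elements (the pass uses width = 10).
    std::vector<std::vector<int>> SplitByWidth(const std::vector<int> &nodes, size_t width) {
      std::vector<std::vector<int>> batches;
      if (nodes.empty()) {
        return batches;
      }
      batches.resize((nodes.size() - 1) / width + 1);
      for (size_t i = 0; i < nodes.size(); ++i) {
        batches[i / width].push_back(nodes[i]);
      }
      return batches;
    }

    int main() {
      // Stand-ins for the atomic-clean composite nodes found in topological order,
      // represented here only by an index and the shape of the buffer they clean.
      std::vector<Shape> buffer_shapes = {{1, 32}, {32}, {1, 1, 32}, {16, 16}, {1, 16, 16}};
      std::map<Shape, std::vector<int>> groups;  // valid shape -> indices of clean nodes
      for (int i = 0; i < static_cast<int>(buffer_shapes.size()); ++i) {
        groups[ValidShape(buffer_shapes[i])].push_back(i);
      }
      for (const auto &kv : groups) {
        for (const auto &batch : SplitByWidth(kv.second, 10)) {
          if (batch.size() < 2) {
            continue;  // a lone clean node is left as-is, just like in the pass
          }
          std::cout << "fuse " << batch.size() << " clean nodes whose valid shape has rank " << kv.first.size() << "\n";
        }
      }
      return 0;
    }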