Synchronize the latest Ascend software suite (18 Jul 2020) and merge branches

yanghaoran 2020-07-18 16:10:19 +08:00
commit 859acc6d2a
414 changed files with 10415 additions and 2035 deletions

2
.gitmodules vendored
View File

@ -15,4 +15,4 @@
url = https://gitee.com/mindspore/akg.git
[submodule "graphengine"]
path = graphengine
url = https://gitee.com/ms-incubator/graphengine.git
url = https://gitee.com/mindspore/graphengine.git

View File

@ -202,10 +202,10 @@ Check out how MindSpore Open Governance [works](https://gitee.com/mindspore/comm
### Communication
- [MindSpore Slack](https://join.slack.com/t/mindspore/shared_invite/enQtOTcwMTIxMDI3NjM0LTNkMWM2MzI5NjIyZWU5ZWQ5M2EwMTQ5MWNiYzMxOGM4OWFhZjI4M2E5OGI2YTg3ODU1ODE2Njg1MThiNWI3YmQ) - Communication platform for developers.
- [MindSpore Slack](https://join.slack.com/t/mindspore/shared_invite/zt-dgk65rli-3ex4xvS4wHX7UDmsQmfu8w) - Communication platform for developers.
- IRC channel at `#mindspore` (only for meeting minutes logging purpose)
- Video Conferencing: https://meet.jit.si
- Mailing-list: https://mailweb.mindspore.cn/postorius/lists
- Video Conferencing: TBD
- Mailing-list: <https://mailweb.mindspore.cn/postorius/lists>
## Contributing

2
akg

@ -1 +1 @@
Subproject commit df57a6cf9450e347d1854687d1fe66a420ee3b35
Subproject commit f60af9df4220bf3db5de2b224418953c0dc1f625

View File

@ -24,7 +24,7 @@ usage()
{
echo "Usage:"
echo "bash build.sh [-d] [-r] [-v] [-c on|off] [-t on|off] [-g on|off] [-h] [-b ge] [-m infer|train] \\"
echo " [-a on|off] [-Q on|off] [-S on|off] [-p on|off] [-i] [-L] [-R] [-D on|off] [-j[n]] [-e gpu|d|cpu] \\"
echo " [-a on|off] [-Q on|off] [-p on|off] [-i] [-L] [-R] [-D on|off] [-j[n]] [-e gpu|d|cpu] \\"
echo " [-P on|off] [-z [on|off]] [-M on|off] [-V 9.2|10.1] [-I] [-K] [-B on|off] [-E] [-l on|off]"
echo ""
echo "Options:"
@ -48,7 +48,6 @@ usage()
echo " -P Enable dump anf graph to file in ProtoBuffer format, default on"
echo " -Q Enable dump memory, default off"
echo " -D Enable dumping of function graph ir, default on"
echo " -S Enable async data dump, default off"
echo " -z Compile dataset & mindrecord, default on"
echo " -M Enable MPI and NCCL for GPU training, gpu default on"
echo " -V Specify the minimum required cuda version, default CUDA 10.1"
@ -89,7 +88,6 @@ checkopts()
ENABLE_TIMELINE="off"
ENABLE_DUMP2PROTO="on"
ENABLE_DUMPE2E="off"
ENABLE_DATA_DUMP="off"
ENABLE_DUMP_IR="on"
COMPILE_MINDDATA="on"
ENABLE_MPI="off"
@ -104,7 +102,7 @@ checkopts()
ENABLE_PYTHON="on"
# Process the options
while getopts 'drvj:c:t:hsb:a:g:p:ie:m:l:I:LRP:Q:S:D:zM:V:K:sB:E' opt
while getopts 'drvj:c:t:hsb:a:g:p:ie:m:l:I:LRP:Q:D:zM:V:K:sB:E' opt
do
OPTARG=$(echo ${OPTARG} | tr '[A-Z]' '[a-z]')
case "${opt}" in
@ -186,6 +184,7 @@ checkopts()
elif [[ "X$OPTARG" == "Xd" || "X$OPTARG" == "Xascend" ]]; then
ENABLE_D="on"
ENABLE_CPU="on"
ENABLE_SERVING="on"
elif [[ "X$OPTARG" == "Xcpu" ]]; then
ENABLE_CPU="on"
else
@ -220,11 +219,6 @@ checkopts()
ENABLE_DUMPE2E="$OPTARG"
echo "enable dump end to end"
;;
S)
check_on_off $OPTARG S
ENABLE_DATA_DUMP="$OPTARG"
echo "enable data dump"
;;
D)
check_on_off $OPTARG D
ENABLE_DUMP_IR="$OPTARG"
@ -328,9 +322,6 @@ build_mindspore()
if [[ "X$ENABLE_DUMPE2E" = "Xon" ]]; then
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DUMP_E2E=ON"
fi
if [[ "X$ENABLE_DATA_DUMP" = "Xon" ]]; then
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DATA_DUMP=ON"
fi
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DUMP_IR=${ENABLE_DUMP_IR}"
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_PYTHON=${ENABLE_PYTHON}"
if [[ "X$ENABLE_MPI" = "Xon" ]]; then

View File

@ -1,4 +1,4 @@
set(glog_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 ${SECURE_CXX_FLAGS}")
set(glog_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 ${SECURE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
set(glog_CFLAGS "-D_FORTIFY_SOURCE=2 -O2")
mindspore_add_pkg(glog
VER 0.4.0

View File

@ -116,10 +116,10 @@ if(ENABLE_DUMP_E2E)
add_compile_definitions(ENABLE_DUMP_E2E)
endif()
if(ENABLE_DATA_DUMP)
add_compile_definitions(ENABLE_DATA_DUMP)
endif()
if(ENABLE_DEBUGGER)
add_compile_definitions(ENABLE_DEBUGGER)
endif()
if(ENABLE_TESTCASES)
add_compile_definitions(ENABLE_TESTCASES)
endif()

View File

@ -1,13 +1,16 @@
# find exec
find_package(Python3 3.7 COMPONENTS Interpreter Development)
if (NOT Python3_FOUND)
message("No python3 found.")
return ()
message(FATAL_ERROR "No python3 found.")
endif ()
set(PYTHON ${Python3_EXECUTABLE})
set(PYTHON_VERSION ${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR})
if (NOT PYTHON_VERSION MATCHES "3.7")
message(FATAL_ERROR "FIND PYTHON VERSION ${PYTHON_VERSION} BUT CAN NOT MATCH PYTHON VERSION 3.7")
endif ()
find_package(Git)
if (NOT GIT_FOUND)
message("No git found.")

@ -1 +1 @@
Subproject commit eee707935c066c16e9b9cd207f8125871b6b97cf
Subproject commit 103f2d1019dc50d781d7a964551d9f1f50b3b009

0
hub/docs/.gitkeep Normal file
View File

0
hub/images/.gitkeep Normal file
View File

0
hub/scripts/.gitkeep Normal file
View File

View File

@ -17,7 +17,7 @@
"""Resources for ast tree parse."""
import ast
import math
from mindspore import IndexedSlices
from mindspore import IndexedSlices, SparseTensor
from mindspore.ops.composite import multitype_ops
from mindspore.ops import functional as F, composite as C
from . import standard_method as M
@ -140,4 +140,5 @@ convert_object_map = {
# user defined
IndexedSlices: F.make_indexed_slices,
SparseTensor: F.make_sparse_tensor,
}

View File

@ -44,7 +44,7 @@ if(ENABLE_GPU)
"backend/kernel_compiler/akg/akg_kernel_attrs_process.cc"
)
list(APPEND CUDA_NVCC_FLAGS -arch=sm_53)
list(APPEND CUDA_NVCC_FLAGS -arch=sm_53 --expt-relaxed-constexpr)
list(REMOVE_ITEM GPU_SRC_LIST "runtime/device/gpu/blocking_queue.cc" "runtime/device/gpu/gpu_buffer_mgr.cc")
list(REMOVE_ITEM GPU_SRC_LIST "runtime/device/gpu/mpi/mpi_initializer.cc"
"runtime/device/gpu/distribution/collective_wrapper.cc"

View File

@ -26,14 +26,6 @@ if (ENABLE_CPU)
"cpu/*.cc"
)
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/push_kernel.cc"
"cpu/ps/pull_kernel.cc"
"cpu/ps/embedding_look_up_ps_kernel.cc"
"cpu/ps/embedding_look_up_proxy_kernel.cc"
"cpu/ps/apply_momentum_ps_kernel.cc"
"cpu/ps/sparse_apply_adam_ps_kernel.cc"
"cpu/ps/sparse_apply_ftrl_ps_kernel.cc")
if (NOT ENABLE_MPI)
list(REMOVE_ITEM CPU_SRC_LIST "cpu/allgather_cpu_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/reduce_scatter_cpu_kernel.cc")
@ -41,6 +33,17 @@ if (ENABLE_CPU)
endif ()
endif ()
if (${CMAKE_SYSTEM_NAME} MATCHES "Windows" OR ENABLE_GE)
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/apply_momentum_ps_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/embedding_look_up_proxy_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/embedding_look_up_ps_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/pserver_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/pull_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/push_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/sparse_apply_adam_ps_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/sparse_apply_ftrl_ps_kernel.cc")
endif()
if (ENABLE_GPU)
file(GLOB_RECURSE CUDA_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
"gpu/*.cu"

View File

@ -18,6 +18,7 @@
#include <algorithm>
#include "backend/session/anf_runtime_algorithm.h"
#include "backend/optimizer/common/helper.h"
#include "backend/kernel_compiler/common_utils.h"
namespace mindspore {
namespace kernel {
@ -75,15 +76,7 @@ void SetAkgAttrsForCast(const AnfNodePtr &anf_node) {
std::string dst_type;
TypeId output_type = AnfAlgo::GetOutputDeviceDataType(anf_node, 0);
if (output_type == kFloat32->type_id()) {
dst_type = "float32";
} else if (output_type == kFloat16->type_id()) {
dst_type = "float16";
} else if (output_type == kInt32->type_id()) {
dst_type = "int32";
} else {
MS_LOG(WARNING) << "Unknown cast_to type: " << TypeIdToType(output_type)->ToString();
}
dst_type = TypeId2String(output_type);
AnfAlgo::SetNodeAttr("dst_type", MakeValue(dst_type), anf_node);
}

View File

@ -21,9 +21,7 @@
#include <memory>
#include "framework/ge_runtime/task_info.h"
#include "backend/kernel_compiler/kernel.h"
#ifdef ENABLE_DATA_DUMP
#include "debug/data_dump_parser.h"
#endif
using TaskInfoPtr = std::shared_ptr<ge::model_runner::TaskInfo>;
namespace mindspore {
@ -34,13 +32,7 @@ class AscendKernelMod : public KernelMod {
const std::vector<AddressPtr> &, uint32_t) = 0;
uint32_t block_dim() { return block_dim_; }
uint32_t stream_id() { return stream_id_; }
virtual bool NeedDump() {
#ifdef ENABLE_DATA_DUMP
return DataDumpParser::GetInstance().NeedDump(kernel_name_);
#else
return false;
#endif
}
virtual bool NeedDump() { return DataDumpParser::GetInstance().NeedDump(kernel_name_); }
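// Note (editorial): with the ENABLE_DATA_DUMP build switch removed in this change (see the
// build.sh and CMake hunks above), dump support is always compiled in, and NeedDump() above is
// decided purely at runtime by DataDumpParser, keyed on kernel_name_.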
protected:
uint32_t block_dim_{1};

View File

@ -20,6 +20,7 @@
#include <iostream>
#include <utility>
#include <fstream>
#include <algorithm>
#include <thread>
#include "nlohmann/json.hpp"
#include "backend/session/anf_runtime_algorithm.h"
@ -499,235 +500,329 @@ int Sign(float x) {
return 0;
}
void DeduplicateIndexedSlices(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
size_t outer_dim) {
MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_);
MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_);
MS_EXCEPTION_IF_NULL(unique_grad);
MS_EXCEPTION_IF_NULL(unique_grad->value_);
MS_EXCEPTION_IF_NULL(unique_grad->indices_);
namespace {
struct BucketSparseGradient {
float *value_;
int *indices_;
int *global_indices_;
size_t indices_size_;
};
struct MultiThreadReduceSparseGradientParam {
SparseGradient *input_grad_{nullptr};
SparseGradient *workspace_grad_{nullptr};
SparseGradient *output_grad_{nullptr};
size_t max_index_{0};
size_t value_stride_{0};
size_t thread_num_{0};
bool use_sort_reduce_{false};
};
void CalculateEachBucketSize(const std::shared_ptr<SparseGradient> &sparse_grad, size_t max_index,
std::vector<size_t> *each_bucket_size) {
MS_LOG(DEBUG) << "Start";
MS_EXCEPTION_IF_NULL(sparse_grad);
MS_EXCEPTION_IF_NULL(sparse_grad->indices_);
MS_EXCEPTION_IF_NULL(each_bucket_size);
size_t bucket_num = each_bucket_size->size();
for (size_t i = 0; i < sparse_grad->indices_size_; ++i) {
int index = sparse_grad->indices_[i];
if (index >= 0 && IntToSize(index) < max_index) {
auto bucket_id = index % bucket_num;
each_bucket_size->at(bucket_id)++;
}
}
MS_LOG(DEBUG) << "End";
}
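// Illustrative example (not part of the original file): with bucket_num == 4, max_index == 10 and
// indices {0, 5, 2, 6, 9, 2, -1, 12}, the valid entries map to bucket ids index % 4 ->
// {0, 1, 2, 2, 1, 2}; -1 and 12 are skipped, so each_bucket_size ends up as {1, 2, 3, 0}.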
void SplitAndCalculateSegmentBucketSize(const MultiThreadReduceSparseGradientParam &param,
std::vector<std::shared_ptr<SparseGradient>> *segments_ptr,
std::vector<std::shared_ptr<std::vector<size_t>>> *segment_bucket_sizes_ptr) {
MS_EXCEPTION_IF_NULL(param.input_grad_);
MS_EXCEPTION_IF_NULL(segment_bucket_sizes_ptr);
MS_EXCEPTION_IF_NULL(segments_ptr);
auto &segments = *segments_ptr;
auto &segment_bucket_sizes = *segment_bucket_sizes_ptr;
auto input_grad = param.input_grad_;
if (param.thread_num_ < 1) {
MS_EXCEPTION(ArgumentError) << "Input param thread num must > 0!";
}
size_t thread_indices_size = input_grad->indices_size_ / param.thread_num_;
size_t left_indices_size = input_grad->indices_size_ % param.thread_num_;
std::vector<std::thread> threads;
threads.reserve(param.thread_num_);
segments.reserve(param.thread_num_);
size_t current_indices_offset = 0;
for (size_t i = 0; i < param.thread_num_; ++i) {
segment_bucket_sizes.emplace_back(std::make_shared<std::vector<size_t>>(param.thread_num_, 0));
size_t indices_size = thread_indices_size;
if (i < left_indices_size) {
indices_size += 1;
}
segments.emplace_back(std::make_shared<SparseGradient>());
segments[i]->value_ = input_grad->value_ + current_indices_offset * param.value_stride_;
segments[i]->indices_ = input_grad->indices_ + current_indices_offset;
segments[i]->indices_size_ = indices_size;
threads.emplace_back(
std::thread(CalculateEachBucketSize, segments[i], param.max_index_, segment_bucket_sizes[i].get()));
current_indices_offset += indices_size;
}
for (size_t i = 0; i < param.thread_num_; ++i) {
threads[i].join();
}
}
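// Illustrative example (not part of the original file): with indices_size_ == 10 and
// thread_num_ == 4, the segments receive 3, 3, 2 and 2 indices respectively (the first
// left_indices_size segments each take one extra index), and each segment's bucket sizes are
// counted concurrently by CalculateEachBucketSize.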
void CopySegmentIndicesToBucket(const MultiThreadReduceSparseGradientParam &param,
const std::shared_ptr<SparseGradient> &segment, size_t bucket_offset,
const std::vector<std::shared_ptr<BucketSparseGradient>> &buckets) {
MS_LOG(DEBUG) << "Start";
MS_EXCEPTION_IF_NULL(segment);
MS_EXCEPTION_IF_NULL(segment->indices_);
std::vector<size_t> bucket_data_num(param.thread_num_, 0);
for (size_t i = 0; i < segment->indices_size_; ++i) {
int index = segment->indices_[i];
if (index >= 0 && IntToSize(index) < param.max_index_) {
auto bucket_id = index % param.thread_num_;
auto bucket_index = bucket_data_num[bucket_id];
buckets[bucket_id]->indices_[bucket_index] = index;
buckets[bucket_id]->global_indices_[bucket_index] = bucket_offset + i;
bucket_data_num[bucket_id]++;
}
}
MS_LOG(DEBUG) << "End";
}
void GatherSegmentIndicesToOutputBucket(const MultiThreadReduceSparseGradientParam &param,
const std::vector<std::shared_ptr<SparseGradient>> &segments,
const std::vector<std::shared_ptr<std::vector<size_t>>> &segment_bucket_sizes,
std::vector<std::shared_ptr<BucketSparseGradient>> *buckets_ptr) {
MS_EXCEPTION_IF_NULL(param.output_grad_);
MS_EXCEPTION_IF_NULL(param.output_grad_->value_);
MS_EXCEPTION_IF_NULL(param.output_grad_->indices_);
MS_EXCEPTION_IF_NULL(buckets_ptr);
auto &buckets = *buckets_ptr;
size_t thread_num = param.thread_num_;
if (thread_num != segment_bucket_sizes.size()) {
MS_EXCEPTION(ArgumentError) << "Input param thread num not equal to segment size!";
}
std::vector<size_t> bucket_data_size(thread_num, 0);
for (size_t i = 0; i < thread_num; ++i) {
for (size_t j = 0; j < thread_num; ++j) {
bucket_data_size[j] += segment_bucket_sizes[i]->at(j);
}
}
size_t current_indices_offset = 0;
for (size_t i = 0; i < thread_num; ++i) {
buckets.emplace_back(std::make_shared<BucketSparseGradient>());
buckets[i]->value_ = param.output_grad_->value_ + current_indices_offset * param.value_stride_;
buckets[i]->indices_ = param.output_grad_->indices_ + current_indices_offset;
buckets[i]->global_indices_ = param.workspace_grad_->indices_ + current_indices_offset;
buckets[i]->indices_size_ = bucket_data_size[i];
current_indices_offset += bucket_data_size[i];
}
std::vector<size_t> tmp_bucket_data_size(thread_num, 0);
std::vector<std::vector<std::shared_ptr<BucketSparseGradient>>> each_thread_buckets;
for (size_t i = 0; i < thread_num; ++i) {
std::vector<std::shared_ptr<BucketSparseGradient>> thread_buckets;
for (size_t j = 0; j < thread_num; ++j) {
thread_buckets.emplace_back(std::make_shared<BucketSparseGradient>());
thread_buckets[j]->indices_ = buckets[j]->indices_ + tmp_bucket_data_size[j];
thread_buckets[j]->global_indices_ = buckets[j]->global_indices_ + tmp_bucket_data_size[j];
thread_buckets[j]->value_ = buckets[j]->value_ + tmp_bucket_data_size[j] * param.value_stride_;
thread_buckets[j]->indices_size_ = segment_bucket_sizes[i]->at(j);
tmp_bucket_data_size[j] += segment_bucket_sizes[i]->at(j);
}
each_thread_buckets.emplace_back(thread_buckets);
}
std::vector<std::thread> threads;
threads.reserve(thread_num);
current_indices_offset = 0;
for (size_t i = 0; i < thread_num; ++i) {
threads.emplace_back(
std::thread(CopySegmentIndicesToBucket, param, segments[i], current_indices_offset, each_thread_buckets[i]));
current_indices_offset += segments[i]->indices_size_;
}
for (size_t i = 0; i < thread_num; ++i) {
threads[i].join();
}
}
void SortAndReduceBucketSparseGradient(const MultiThreadReduceSparseGradientParam &param,
const std::shared_ptr<BucketSparseGradient> &bucket,
const std::shared_ptr<SparseGradient> &reduced_bucket) {
MS_LOG(DEBUG) << "Start";
MS_EXCEPTION_IF_NULL(bucket);
MS_EXCEPTION_IF_NULL(bucket->value_);
MS_EXCEPTION_IF_NULL(bucket->indices_);
MS_EXCEPTION_IF_NULL(reduced_bucket);
MS_EXCEPTION_IF_NULL(reduced_bucket->value_);
MS_EXCEPTION_IF_NULL(reduced_bucket->indices_);
std::vector<std::pair<int, int>> sorted_indices;
sorted_indices.reserve(bucket->indices_size_);
for (size_t i = 0; i < bucket->indices_size_; ++i) {
int index = bucket->indices_[i];
int global_index = bucket->global_indices_[i];
sorted_indices.emplace_back(std::pair<int, int>(index, global_index));
}
std::sort(sorted_indices.begin(), sorted_indices.end());
float *global_value = param.input_grad_->value_;
size_t unique_indices_size = 0;
size_t max_length = reduced_bucket->indices_size_ * param.value_stride_;
int last_index{0};
size_t value_offset{0};
for (size_t i = 0; i < sorted_indices.size(); ++i) {
int index = sorted_indices[i].first;
int global_index = sorted_indices[i].second;
int global_value_offset = global_index * param.value_stride_;
if (i == 0 || index != last_index) {
if (i != 0) {
unique_indices_size++;
}
reduced_bucket->indices_[unique_indices_size] = index;
value_offset = unique_indices_size * param.value_stride_;
auto ret_code = memcpy_s(reduced_bucket->value_ + value_offset, (max_length - value_offset) * sizeof(float),
global_value + global_value_offset, param.value_stride_ * sizeof(float));
if (ret_code != EOK) {
MS_LOG(EXCEPTION) << "Failed to copy data!";
}
} else {
for (size_t j = 0; j < param.value_stride_; ++j) {
reduced_bucket->value_[value_offset + j] += global_value[global_value_offset + j];
}
}
last_index = index;
}
reduced_bucket->indices_size_ = unique_indices_size;
MS_LOG(DEBUG) << "End";
}
void ReduceBucketSparseGradient(const MultiThreadReduceSparseGradientParam &param,
const std::shared_ptr<BucketSparseGradient> &bucket,
const std::shared_ptr<SparseGradient> &reduced_bucket) {
MS_LOG(DEBUG) << "Start";
MS_EXCEPTION_IF_NULL(bucket);
MS_EXCEPTION_IF_NULL(bucket->value_);
MS_EXCEPTION_IF_NULL(bucket->indices_);
MS_EXCEPTION_IF_NULL(reduced_bucket);
MS_EXCEPTION_IF_NULL(reduced_bucket->value_);
MS_EXCEPTION_IF_NULL(reduced_bucket->indices_);
float *global_value = param.input_grad_->value_;
std::unordered_map<int, size_t> index_map;
size_t unique_indices_size = 0;
for (size_t i = 0; i < origin_sparse_grad.indices_size_; ++i) {
int index = origin_sparse_grad.indices_[i];
if (index < 0 || IntToSize(index) >= first_dim) {
continue;
}
size_t max_length = reduced_bucket->indices_size_ * param.value_stride_;
for (size_t i = 0; i < bucket->indices_size_; ++i) {
int index = bucket->indices_[i];
int global_index = bucket->global_indices_[i];
auto iter = index_map.find(index);
if (iter == index_map.end()) {
index_map[index] = unique_indices_size;
unique_grad->indices_[unique_indices_size] = index;
size_t start_index = unique_indices_size * outer_dim;
size_t end_index = start_index + outer_dim;
for (size_t j = start_index, k = i * outer_dim; j < end_index; ++j, ++k) {
unique_grad->value_[j] = origin_sparse_grad.value_[k];
reduced_bucket->indices_[unique_indices_size] = index;
size_t start_index = unique_indices_size * param.value_stride_;
index_map[index] = start_index;
auto ret_code = memcpy_s(reduced_bucket->value_ + start_index, (max_length - start_index) * sizeof(float),
global_value + global_index * param.value_stride_, param.value_stride_ * sizeof(float));
if (ret_code != EOK) {
MS_LOG(EXCEPTION) << "Failed to copy data!";
}
unique_indices_size++;
} else {
size_t first_index = iter->second;
size_t start_index = first_index * outer_dim;
size_t end_index = start_index + outer_dim;
for (size_t j = start_index, k = i * outer_dim; j < end_index; ++j, ++k) {
unique_grad->value_[j] += origin_sparse_grad.value_[k];
size_t start_index = iter->second;
size_t end_index = start_index + param.value_stride_;
for (size_t j = start_index, k = global_index * param.value_stride_; j < end_index; ++j, ++k) {
reduced_bucket->value_[j] += global_value[k];
}
}
}
unique_grad->indices_size_ = unique_indices_size;
}
struct WorkerParamsForReduceSparseGradient {
size_t slice_start_{0};
size_t slice_end_{0};
size_t max_length_{0};
size_t outer_dim_{0};
std::vector<std::pair<int, size_t>> *sorted_indices_{nullptr};
std::vector<size_t> *slice_positions_{nullptr};
float *src_value_{nullptr};
SparseGradient *unique_grad_{nullptr};
};
void WorkerForReduceSparseGradient(WorkerParamsForReduceSparseGradient param) {
MS_EXCEPTION_IF_NULL(param.sorted_indices_);
MS_EXCEPTION_IF_NULL(param.slice_positions_);
MS_EXCEPTION_IF_NULL(param.src_value_);
MS_EXCEPTION_IF_NULL(param.unique_grad_);
auto outer_dim = param.outer_dim_;
auto &sorted_indices = *(param.sorted_indices_);
auto &slice_positions = *(param.slice_positions_);
auto unique_grad = param.unique_grad_;
for (size_t slice_id = param.slice_start_; slice_id < param.slice_end_; ++slice_id) {
size_t cur_pos = slice_positions[slice_id];
int index = sorted_indices[cur_pos].first;
unique_grad->indices_[slice_id] = index;
size_t start_index = slice_id * outer_dim;
auto ret_code = memcpy_s(unique_grad->value_ + start_index, (param.max_length_ - start_index) * sizeof(float),
param.src_value_ + sorted_indices[cur_pos].second, outer_dim * sizeof(float));
if (ret_code != EOK) {
MS_LOG(EXCEPTION) << "Failed to copy data!";
}
cur_pos++;
size_t end_pos;
if (slice_id + 1 < slice_positions.size()) {
end_pos = slice_positions[slice_id + 1];
} else {
end_pos = sorted_indices.size();
}
while (cur_pos < end_pos) {
for (size_t i = 0; i < outer_dim; ++i) {
unique_grad->value_[start_index + i] += param.src_value_[sorted_indices[cur_pos].second + i];
}
cur_pos++;
}
}
}
void RunMultiThreadReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad,
size_t outer_dim, std::vector<std::pair<int, size_t>> *sorted_indices,
std::vector<size_t> *slice_positions) {
MS_LOG(DEBUG) << "Start";
size_t thread_num = 24;
if (slice_positions->size() < thread_num) {
thread_num = slice_positions->size();
}
size_t stride = (slice_positions->size() + thread_num - 1) / thread_num;
thread_num = (slice_positions->size() + stride - 1) / stride;
std::vector<std::thread> threads;
size_t max_length = sorted_indices->size() * outer_dim;
for (size_t i = 0; i < thread_num; ++i) {
size_t slice_start = i * stride;
size_t slice_end = 0;
if (i == thread_num - 1) {
slice_end = slice_positions->size();
} else {
slice_end = slice_start + stride;
}
WorkerParamsForReduceSparseGradient params{
slice_start, slice_end, max_length, outer_dim, sorted_indices, slice_positions, origin_sparse_grad.value_,
unique_grad};
threads.emplace_back(std::thread(WorkerForReduceSparseGradient, params));
}
for (size_t i = 0; i < thread_num; ++i) {
threads[i].join();
}
reduced_bucket->indices_size_ = unique_indices_size;
MS_LOG(DEBUG) << "End";
}
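// Note (editorial): each bucket is reduced either by this hash-map accumulation or, when
// use_sort_reduce_ is set, by SortAndReduceBucketSparseGradient above, which sorts the
// (index, global_index) pairs first and merges runs of equal indices; both paths write the
// deduplicated rows into reduced_bucket.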
void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
size_t outer_dim, bool use_multi_threads) {
MS_LOG(DEBUG) << "Start";
MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_);
MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_);
MS_EXCEPTION_IF_NULL(unique_grad);
MS_EXCEPTION_IF_NULL(unique_grad->value_);
MS_EXCEPTION_IF_NULL(unique_grad->indices_);
std::vector<std::pair<int, size_t>> sorted_indices;
sorted_indices.reserve(origin_sparse_grad.indices_size_);
for (size_t i = 0; i < origin_sparse_grad.indices_size_; ++i) {
int index = origin_sparse_grad.indices_[i];
if (index >= 0 && IntToSize(index) < first_dim) {
sorted_indices.emplace_back(std::pair<int, size_t>(index, i * outer_dim));
}
}
std::sort(
sorted_indices.begin(), sorted_indices.end(),
[](const std::pair<int, size_t> &left, const std::pair<int, size_t> &right) { return left.first < right.first; });
int last_index = 0;
std::vector<size_t> slice_positions;
slice_positions.reserve(sorted_indices.size());
for (size_t i = 0; i < sorted_indices.size(); ++i) {
if (i == 0 || last_index != sorted_indices[i].first) {
slice_positions.emplace_back(i);
}
last_index = sorted_indices[i].first;
}
if (use_multi_threads) {
RunMultiThreadReduceSparseGradient(origin_sparse_grad, unique_grad, outer_dim, &sorted_indices, &slice_positions);
} else {
size_t max_length = sorted_indices.size() * outer_dim;
WorkerParamsForReduceSparseGradient params{0,
slice_positions.size(),
max_length,
outer_dim,
&sorted_indices,
&slice_positions,
origin_sparse_grad.value_,
unique_grad};
WorkerForReduceSparseGradient(params);
}
unique_grad->indices_size_ = slice_positions.size();
MS_LOG(DEBUG) << "End";
}
void ReduceMultiSparseGradient(const std::vector<std::shared_ptr<SparseGradient>> &unique_slice_grads,
SparseGradient *tmp_grad, SparseGradient *unique_grad, size_t first_dim,
size_t outer_dim) {
MS_LOG(DEBUG) << "Start";
if (unique_slice_grads.empty()) {
return;
}
size_t index_data_size = outer_dim * sizeof(float);
size_t unique_indices_size = 0;
for (size_t i = 0; i < unique_slice_grads.size(); ++i) {
auto &slice_grad = unique_slice_grads[i];
auto ret_code = memcpy_s(tmp_grad->value_ + unique_indices_size * outer_dim,
(tmp_grad->indices_size_ - unique_indices_size) * index_data_size, slice_grad->value_,
slice_grad->indices_size_ * index_data_size);
if (ret_code != EOK) {
MS_LOG(EXCEPTION) << "Failed to copy data!";
}
ret_code =
memcpy_s(tmp_grad->indices_ + unique_indices_size, (tmp_grad->indices_size_ - unique_indices_size) * sizeof(int),
slice_grad->indices_, slice_grad->indices_size_ * sizeof(int));
if (ret_code != EOK) {
MS_LOG(EXCEPTION) << "Failed to copy data!";
}
unique_indices_size += slice_grad->indices_size_;
}
tmp_grad->indices_size_ = unique_indices_size;
ReduceSparseGradient(*tmp_grad, unique_grad, first_dim, outer_dim);
MS_LOG(DEBUG) << "End";
}
void TwoLevelReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *tmp_grad,
SparseGradient *unique_grad, size_t first_dim, size_t outer_dim) {
MS_LOG(DEBUG) << "Start";
MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_);
MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_);
MS_EXCEPTION_IF_NULL(unique_grad);
MS_EXCEPTION_IF_NULL(unique_grad->value_);
MS_EXCEPTION_IF_NULL(unique_grad->indices_);
MS_EXCEPTION_IF_NULL(tmp_grad);
MS_EXCEPTION_IF_NULL(tmp_grad->value_);
MS_EXCEPTION_IF_NULL(tmp_grad->indices_);
size_t thread_num = 24;
if (origin_sparse_grad.indices_size_ < thread_num) {
thread_num = origin_sparse_grad.indices_size_;
}
size_t thread_indices_size = origin_sparse_grad.indices_size_ / thread_num;
size_t left_indices_size = origin_sparse_grad.indices_size_ % thread_num;
void ReduceBucketSparseGradientToWorkspace(const MultiThreadReduceSparseGradientParam &param,
const std::vector<std::shared_ptr<BucketSparseGradient>> &buckets,
std::vector<std::shared_ptr<SparseGradient>> *reduced_buckets_ptr) {
MS_EXCEPTION_IF_NULL(param.workspace_grad_);
MS_EXCEPTION_IF_NULL(param.workspace_grad_->value_);
MS_EXCEPTION_IF_NULL(param.workspace_grad_->indices_);
MS_EXCEPTION_IF_NULL(reduced_buckets_ptr);
auto &reduced_buckets = *reduced_buckets_ptr;
size_t thread_num = buckets.size();
std::vector<std::thread> threads;
threads.reserve(thread_num);
std::vector<std::shared_ptr<SparseGradient>> unique_slice_grads;
size_t current_indices_offset = 0;
for (size_t i = 0; i < thread_num; ++i) {
size_t indices_size = thread_indices_size;
if (i == thread_num - 1) {
indices_size = thread_indices_size + left_indices_size;
reduced_buckets.emplace_back(std::make_shared<SparseGradient>());
reduced_buckets[i]->value_ = param.workspace_grad_->value_ + current_indices_offset * param.value_stride_;
reduced_buckets[i]->indices_ = param.workspace_grad_->indices_ + current_indices_offset;
reduced_buckets[i]->indices_size_ = buckets[i]->indices_size_;
if (param.use_sort_reduce_) {
threads.emplace_back(std::thread(SortAndReduceBucketSparseGradient, param, buckets[i], reduced_buckets[i]));
} else {
threads.emplace_back(std::thread(ReduceBucketSparseGradient, param, buckets[i], reduced_buckets[i]));
}
size_t value_offset = i * thread_indices_size * outer_dim;
size_t indices_offset = i * thread_indices_size;
auto slice_grad = SparseGradient(
{origin_sparse_grad.value_ + value_offset, origin_sparse_grad.indices_ + indices_offset, indices_size});
unique_slice_grads.emplace_back(std::make_shared<SparseGradient>());
unique_slice_grads[i]->value_ = unique_grad->value_ + value_offset;
unique_slice_grads[i]->indices_ = unique_grad->indices_ + indices_offset;
unique_slice_grads[i]->indices_size_ = indices_size;
threads.emplace_back(
std::thread(ReduceSparseGradient, slice_grad, unique_slice_grads[i].get(), first_dim, outer_dim, false));
current_indices_offset += buckets[i]->indices_size_;
}
for (size_t i = 0; i < thread_num; ++i) {
threads[i].join();
}
ReduceMultiSparseGradient(unique_slice_grads, tmp_grad, unique_grad, first_dim, outer_dim);
}
void MergeReduceSparseGradient(const MultiThreadReduceSparseGradientParam &param,
const std::vector<std::shared_ptr<SparseGradient>> &reduced_buckets) {
MS_EXCEPTION_IF_NULL(param.output_grad_);
auto output_grad = param.output_grad_;
MS_EXCEPTION_IF_NULL(output_grad->value_);
MS_EXCEPTION_IF_NULL(output_grad->indices_);
size_t stride_data_size = param.value_stride_ * sizeof(float);
size_t unique_indices_size = 0;
for (size_t i = 0; i < reduced_buckets.size(); ++i) {
auto &bucket = reduced_buckets[i];
MS_EXCEPTION_IF_NULL(bucket);
if (bucket->indices_size_ == 0) {
continue;
}
auto ret_code = memcpy_s(output_grad->value_ + unique_indices_size * param.value_stride_,
(output_grad->indices_size_ - unique_indices_size) * stride_data_size, bucket->value_,
bucket->indices_size_ * stride_data_size);
if (ret_code != EOK) {
MS_LOG(EXCEPTION) << "Failed to copy data!";
}
ret_code = memcpy_s(output_grad->indices_ + unique_indices_size,
(output_grad->indices_size_ - unique_indices_size) * sizeof(int), bucket->indices_,
bucket->indices_size_ * sizeof(int));
if (ret_code != EOK) {
MS_LOG(EXCEPTION) << "Failed to copy data!";
}
unique_indices_size += bucket->indices_size_;
}
output_grad->indices_size_ = unique_indices_size;
}
} // namespace
void BucketReduceSparseGradient(const ReduceSparseGradientParam &param) {
MS_LOG(DEBUG) << "Start";
MS_EXCEPTION_IF_NULL(param.input_grad_);
size_t thread_num = 23;
if (param.input_grad_->indices_size_ < thread_num) {
thread_num = param.input_grad_->indices_size_;
}
MultiThreadReduceSparseGradientParam multi_thread_param({param.input_grad_, param.workspace_grad_, param.output_grad_,
param.max_index_, param.value_stride_, thread_num,
param.use_sort_reduce_});
std::vector<std::shared_ptr<SparseGradient>> segments;
std::vector<std::shared_ptr<std::vector<size_t>>> segment_bucket_sizes;
SplitAndCalculateSegmentBucketSize(multi_thread_param, &segments, &segment_bucket_sizes);
std::vector<std::shared_ptr<BucketSparseGradient>> buckets;
GatherSegmentIndicesToOutputBucket(multi_thread_param, segments, segment_bucket_sizes, &buckets);
std::vector<std::shared_ptr<SparseGradient>> reduced_buckets;
ReduceBucketSparseGradientToWorkspace(multi_thread_param, buckets, &reduced_buckets);
MergeReduceSparseGradient(multi_thread_param, reduced_buckets);
MS_LOG(DEBUG) << "End";
}
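For reference, a minimal sketch of how the new entry point is driven. The wrapper function and buffer names below are hypothetical; SparseGradient, ReduceSparseGradientParam and BucketReduceSparseGradient come from this change, and the CPU optimizer kernels further down use the same pattern with their workspace buffers. The sketch assumes it sits inside namespace mindspore::kernel with backend/kernel_compiler/common_utils.h included.

// Hypothetical helper for illustration only.
void ReduceSparseGradientDemo(float *grad_values, int *grad_indices, size_t indices_size,
                              float *ws_values, int *ws_indices,
                              float *out_values, int *out_indices,
                              size_t first_dim_size, size_t outer_dim_size) {
  // Raw sparse gradient, scratch space and output; all three must hold indices_size rows.
  SparseGradient input_grad({grad_values, grad_indices, indices_size});
  SparseGradient workspace_grad({ws_values, ws_indices, indices_size});
  SparseGradient output_grad({out_values, out_indices, indices_size});
  ReduceSparseGradientParam param;
  param.input_grad_ = &input_grad;
  param.workspace_grad_ = &workspace_grad;
  param.output_grad_ = &output_grad;
  param.max_index_ = first_dim_size;     // first dimension (rows) of the dense parameter
  param.value_stride_ = outer_dim_size;  // number of float values per row
  BucketReduceSparseGradient(param);
  // output_grad.indices_size_ now holds the number of unique indices and output_grad.value_
  // holds that many accumulated rows.
}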

View File

@ -73,9 +73,18 @@ class KernelMeta {
};
struct SparseGradient {
float *value_;
int *indices_;
size_t indices_size_;
float *value_{nullptr};
int *indices_{nullptr};
size_t indices_size_{0};
};
struct ReduceSparseGradientParam {
SparseGradient *input_grad_{nullptr};
SparseGradient *workspace_grad_{nullptr};
SparseGradient *output_grad_{nullptr};
size_t max_index_{0};
size_t value_stride_{0};
bool use_sort_reduce_{false};
};
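// As used by the CPU optimizer kernels below: input_grad_ holds the raw sparse gradient,
// workspace_grad_ is scratch space of the same capacity, output_grad_ receives the deduplicated
// result (its indices_size_ is updated to the unique-index count), max_index_ is the first
// dimension of the dense parameter, and value_stride_ is the number of float values per row.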
struct MultiThreadComputeParams {
@ -112,10 +121,6 @@ void SaveJsonInfo(const std::string &json_name, const std::string &info);
std::string GetProcessor(const AnfNodePtr &anf_node);
bool IsSameShape(const std::vector<size_t> &shape_a, const std::vector<size_t> &shape_b);
int Sign(float x);
void DeduplicateIndexedSlices(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
size_t outer_dim);
void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
size_t outer_dim, bool use_multi_threads = true);
std::pair<AnfNodePtr, size_t> GetKernelInput(const AnfNodePtr &anf_node, size_t index);
std::vector<std::pair<AnfNodePtr, std::pair<size_t, size_t>>> GetInputIndex(const std::vector<AnfNodePtr> &node_list,
const std::vector<AnfNodePtr> &input_list);
@ -130,14 +135,7 @@ void GetGraphRealOutput(const FuncGraphPtr &func_graph, std::vector<std::pair<An
bool IsWeightBoundary(const AnfNodePtr &node);
void MultiThreadCompute(const MultiThreadComputeFunc &func, MultiThreadComputeParams *params,
size_t total_compute_size);
void RunMultiThreadReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad,
size_t outer_dim, std::vector<std::pair<int, size_t>> *sorted_indices,
std::vector<size_t> *slice_positions);
void ReduceMultiSparseGradient(const std::vector<std::shared_ptr<SparseGradient>> &unique_slice_grads,
SparseGradient *tmp_grad, SparseGradient *unique_grad, size_t first_dim,
size_t outer_dim);
void TwoLevelReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *tmp_grad,
SparseGradient *unique_grad, size_t first_dim, size_t outer_dim);
void BucketReduceSparseGradient(const ReduceSparseGradientParam &param);
std::vector<int> GetReduceAttrAxis(const CNodePtr &cnode);
} // namespace kernel
} // namespace mindspore

View File

@ -46,7 +46,7 @@ class EmbeddingLookUpCPUKernel : public CPUKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;
private:
protected:
void LookUpTable(const std::vector<kernel::AddressPtr> &inputs, size_t dim0, size_t dim1, size_t dim2,
float **output_addr);
void CheckParam(const CNodePtr &kernel_node);

View File

@ -53,15 +53,15 @@ bool EmbeddingLookUpProxyKernel::Launch(const std::vector<kernel::AddressPtr> &i
size_t output_size = outputs[0]->size;
size_t size = input_size / sizeof(float);
::ps::SArray<float> lookup_ids(size, 0);
::ps::SArray<int> lookup_ids(size, 0);
::ps::SArray<int> lengths{size};
::ps::SArray<float> lookup_result;
::ps::SArray<float> lookup_result(output_size / sizeof(float), 0);
auto ret = memcpy_s(lookup_ids.data(), input_size, indices_addr, input_size);
if (ret != EOK) {
MS_LOG(EXCEPTION) << "Lookup id memcpy failed.";
}
parallel::ps::Worker<float>::GetInstance().DoPSEmbeddingLookup({key_}, lookup_ids, lengths, lookup_result,
parallel::ps::Worker<float>::GetInstance().DoPSEmbeddingLookup({key_}, lookup_ids, lengths, &lookup_result,
parallel::ps::kEmbeddingLookupCmd);
auto ret2 = memcpy_s(output_addr, output_size, lookup_result.data(), output_size);

View File

@ -50,7 +50,7 @@ void EmbeddingLookUpPSKernel::InitKernel(
split_num_ = pserver_num_;
// input shape should be sharded after computing offset_;
Shard(input_shape_, axis_);
Shard(&input_shape_, axis_);
size_t output_size =
std::accumulate(output_shape_.begin(), output_shape_.end(), sizeof(float), std::multiplies<size_t>());

View File

@ -34,5 +34,13 @@ MS_REG_CPU_KERNEL_T(Push,
MS_REG_CPU_KERNEL_T(
Push, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeUInt64),
PushKernel, float);
MS_REG_CPU_KERNEL_T(Push,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeUInt64),
PushKernel, float);
} // namespace kernel
} // namespace mindspore

View File

@ -43,7 +43,7 @@ class PushKernel : public CPUKernel {
sizes.push_back(SizeToInt(input->size) / sizeof(T));
}
parallel::ps::Worker<T>::GetInstance().Push(keys, addrs, sizes);
memcpy(outputs[0]->addr, &key_, sizeof(size_t));
memcpy_s(outputs[0]->addr, sizeof(size_t), &key_, sizeof(size_t));
return true;
}

View File

@ -75,7 +75,7 @@ void SparseApplyAdamPSKernel::ReInit(const std::shared_ptr<std::vector<std::shar
void SparseApplyAdamPSKernel::ReInit(const std::vector<AddressPtr> &inputs) {
const auto &indices_addr = inputs[10];
indices_size_ = indices_addr->size;
indices_size_ = indices_addr->size / sizeof(int);
workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float);
workspace_size_list_[1] = indices_size_ * sizeof(int);
}

View File

@ -64,7 +64,7 @@ void SparseApplyFtrlPSKernel::ReInit(const std::shared_ptr<std::vector<std::shar
void SparseApplyFtrlPSKernel::ReInit(const std::vector<AddressPtr> &inputs) {
const auto &indices_addr = inputs[4];
indices_size_ = indices_addr->size;
indices_size_ = indices_addr->size / sizeof(int);
workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float);
workspace_size_list_[1] = indices_size_ * sizeof(int);
}

View File

@ -81,6 +81,8 @@ void SparseApplyAdamCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node)
MS_EXCEPTION_IF_NULL(kernel_node);
workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
workspace_size_list_.emplace_back(var_first_dim_size_ * var_outer_dim_size_ * sizeof(float));
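// Workspace layout consumed by Launch below: [0]/[1] hold the reduced (deduplicated) gradient
// values and indices, [2]/[3] are the scratch buffers handed to BucketReduceSparseGradient as
// workspace_grad_, and [4] holds m_t.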
}
@ -142,11 +144,21 @@ bool SparseApplyAdamCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inp
auto indices = reinterpret_cast<int *>(inputs[10]->addr);
auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
auto m_t = reinterpret_cast<float *>(workspace[2]->addr);
auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
auto m_t = reinterpret_cast<float *>(workspace[4]->addr);
SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_,
var_outer_dim_size_);
SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
SparseGradient input_sparse_grad({grad, indices, indices_size_});
ReduceSparseGradientParam param;
param.input_grad_ = &input_sparse_grad;
param.workspace_grad_ = &workspace_sparse_grad;
param.output_grad_ = &unique_sparse_grad;
param.max_index_ = var_first_dim_size_;
param.value_stride_ = var_outer_dim_size_;
BucketReduceSparseGradient(param);
size_t total_dim_size = var_first_dim_size_ * var_outer_dim_size_;
lr = lr * std::sqrt(1 - beta2_power) / (1 - beta1_power);

View File

@ -132,12 +132,19 @@ bool SparseApplyFtrlCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inp
auto indices = reinterpret_cast<int *>(inputs[4]->addr);
auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
auto tmp_grad = reinterpret_cast<float *>(workspace[2]->addr);
auto tmp_indices = reinterpret_cast<int *>(workspace[3]->addr);
auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
SparseGradient tmp_sparse_grad({tmp_grad, tmp_indices, indices_size_});
TwoLevelReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &tmp_sparse_grad, &unique_sparse_grad,
var_first_dim_size_, var_outer_dim_size_);
SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
SparseGradient input_sparse_grad({grad, indices, indices_size_});
ReduceSparseGradientParam param;
param.input_grad_ = &input_sparse_grad;
param.workspace_grad_ = &workspace_sparse_grad;
param.output_grad_ = &unique_sparse_grad;
param.max_index_ = var_first_dim_size_;
param.value_stride_ = var_outer_dim_size_;
BucketReduceSparseGradient(param);
MultiThreadComputeParams input_params;
input_params.var_ = var;

View File

@ -123,13 +123,19 @@ bool SparseApplyLazyAdamCPUKernel::Launch(const std::vector<kernel::AddressPtr>
auto indices = reinterpret_cast<int *>(inputs[10]->addr);
auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
auto tmp_grad = reinterpret_cast<float *>(workspace[2]->addr);
auto tmp_indices = reinterpret_cast<int *>(workspace[3]->addr);
auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
SparseGradient tmp_sparse_grad({tmp_grad, tmp_indices, indices_size_});
TwoLevelReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &tmp_sparse_grad, &unique_sparse_grad,
var_first_dim_size_, var_outer_dim_size_);
SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
SparseGradient input_sparse_grad({grad, indices, indices_size_});
ReduceSparseGradientParam param;
param.input_grad_ = &input_sparse_grad;
param.workspace_grad_ = &workspace_sparse_grad;
param.output_grad_ = &unique_sparse_grad;
param.max_index_ = var_first_dim_size_;
param.value_stride_ = var_outer_dim_size_;
BucketReduceSparseGradient(param);
lr = lr * std::sqrt(1 - beta2_power) / (1 - beta1_power);
MultiThreadComputeParams input_params;

View File

@ -61,6 +61,8 @@ void SparseApplyProximalAdagradCPUKernel::InitInputOutputSize(const CNodePtr &ke
MS_EXCEPTION_IF_NULL(kernel_node);
workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
}
void SparseApplyProximalAdagradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
@ -119,9 +121,19 @@ bool SparseApplyProximalAdagradCPUKernel::Launch(const std::vector<kernel::Addre
auto indices = reinterpret_cast<int *>(inputs[6]->addr);
auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_,
var_outer_dim_size_);
SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
SparseGradient input_sparse_grad({grad, indices, indices_size_});
ReduceSparseGradientParam param;
param.input_grad_ = &input_sparse_grad;
param.workspace_grad_ = &workspace_sparse_grad;
param.output_grad_ = &unique_sparse_grad;
param.max_index_ = var_first_dim_size_;
param.value_stride_ = var_outer_dim_size_;
BucketReduceSparseGradient(param);
MultiThreadComputeParams input_params;
input_params.var_ = var;

View File

@ -0,0 +1,26 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/arrays/broadcast_to_gpu_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
BroadcastToGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
BroadcastToGpuKernel, half)
} // namespace kernel
} // namespace mindspore

View File

@ -0,0 +1,83 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_BROADCAST_TO_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_BROADCAST_TO_GPU_KERNEL_H_
#include <vector>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T>
class BroadcastToGpuKernel : public GpuKernel {
public:
BroadcastToGpuKernel() {}
~BroadcastToGpuKernel() = default;
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
T *input_addr = GetDeviceAddress<T>(inputs, 0);
T *output_addr = GetDeviceAddress<T>(outputs, 0);
BroadcastTo(input_shape_[0], input_shape_[1], input_shape_[2], input_shape_[3], output_shape_[0], output_shape_[1],
output_shape_[2], output_shape_[3], input_addr, output_addr,
reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}
bool Init(const CNodePtr &kernel_node) override {
auto input_shapes = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto output_shapes = AnfAlgo::GetOutputInferShape(kernel_node, 0);
if (input_shapes.size() > 4 || output_shapes.size() > 4) {
MS_LOG(EXCEPTION) << "BroadcastTo operation not support dim greater than 4";
}
for (int i = input_shapes.size() - 1; i >= 0; i--) {
input_shape_[i] = input_shapes[i];
}
for (int j = output_shapes.size() - 1; j >= 0; j--) {
output_shape_[j] = output_shapes[j];
}
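// Illustrative example (not from the original file): broadcasting a (3, 1) input to a (3, 5)
// output pads both shapes with trailing 1s up to 4-D, giving input_shape_ = {3, 1, 1, 1} and
// output_shape_ = {3, 5, 1, 1}, which is what the 4-D BroadcastTo launch above consumes.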
InitSizeLists();
return true;
}
protected:
void InitSizeLists() override {
input_size_list_.push_back(input_shape_[0] * input_shape_[1] * input_shape_[2] * input_shape_[3] * sizeof(T));
output_size_list_.push_back(output_shape_[0] * output_shape_[1] * output_shape_[2] * output_shape_[3] * sizeof(T));
}
private:
int input_shape_[4] = {1, 1, 1, 1};
int output_shape_[4] = {1, 1, 1, 1};
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_KERNEL_GPU_BROADCAST_TO_GPU_KERNEL_H_

View File

@ -18,6 +18,7 @@
#define MINDSPORE_CCSRC_KERNEL_GPU_CONCATV2_GPU_KERNEL_H
#include <vector>
#include <memory>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cuh"
@ -27,40 +28,35 @@ namespace kernel {
template <typename T>
class ConcatV2GpuFwdKernel : public GpuKernel {
public:
ConcatV2GpuFwdKernel() : axis_(0), output_size_(0) {}
ConcatV2GpuFwdKernel()
: axis_(0),
input_num_(1),
output_size_(0),
all_size_before_axis_(1),
all_size_axis_(1),
inputs_host_(nullptr),
len_axis_(nullptr) {}
~ConcatV2GpuFwdKernel() override = default;
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
if (inputs.size() == 2) {
T *input_0 = GetDeviceAddress<T>(inputs, 0);
T *input_1 = GetDeviceAddress<T>(inputs, 1);
T *output = GetDeviceAddress<T>(outputs, 0);
ConcatKernel(output_size_ / sizeof(T), w_[0], w_[1], input_0, input_1, output,
reinterpret_cast<cudaStream_t>(stream_ptr));
}
if (inputs.size() == 3) {
T *input_0 = GetDeviceAddress<T>(inputs, 0);
T *input_1 = GetDeviceAddress<T>(inputs, 1);
T *input_2 = GetDeviceAddress<T>(inputs, 2);
T *output = GetDeviceAddress<T>(outputs, 0);
ConcatKernel(output_size_ / sizeof(T), w_[0], w_[1], w_[2], input_0, input_1, input_2, output,
reinterpret_cast<cudaStream_t>(stream_ptr));
}
if (inputs.size() == 4) {
T *input_0 = GetDeviceAddress<T>(inputs, 0);
T *input_1 = GetDeviceAddress<T>(inputs, 1);
T *input_2 = GetDeviceAddress<T>(inputs, 2);
T *input_3 = GetDeviceAddress<T>(inputs, 3);
T *output = GetDeviceAddress<T>(outputs, 0);
ConcatKernel(output_size_ / sizeof(T), w_[0], w_[1], w_[2], w_[3], input_0, input_1, input_2, input_3, output,
reinterpret_cast<cudaStream_t>(stream_ptr));
T *output = GetDeviceAddress<T>(outputs, 0);
T **inputs_device = GetDeviceAddress<T *>(workspace, 0);
int *len_axis_device = GetDeviceAddress<int>(workspace, 1);
for (size_t i = 0; i < inputs.size(); i++) {
inputs_host_[i] = GetDeviceAddress<T>(inputs, i);
}
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(inputs_device, inputs_host_.get(), sizeof(T *) * input_num_,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"ConcatV2 opt cudaMemcpyAsync inputs failed");
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(len_axis_device, len_axis_.get(), sizeof(int) * input_num_,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"ConcatV2 opt cudaMemcpyAsync length on axis failed");
ConcatKernel(output_size_, input_num_, all_size_before_axis_, all_size_axis_, len_axis_device, inputs_device,
output, reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}
bool Init(const CNodePtr &kernel_node) override {
@ -74,25 +70,34 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
axis_ += SizeToInt(input_shape.size());
}
auto input_num = AnfAlgo::GetInputTensorNum(kernel_node);
for (size_t i = 0; i < input_num; i++) {
auto input_size = sizeof(T);
input_num_ = SizeToInt(AnfAlgo::GetInputTensorNum(kernel_node));
inputs_host_ = std::make_unique<T *[]>(input_num_);
len_axis_ = std::make_unique<int[]>(input_num_);
for (int i = 0; i < input_num_; i++) {
int input_size = 1;
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i);
for (size_t j = 0; j < input_shape.size(); j++) {
input_size *= SizeToInt(input_shape[j]);
if (j >= IntToSize(axis_)) {
w_[i] *= SizeToInt(input_shape[j]);
}
input_size_list_.push_back(input_size);
}
input_size_list_.push_back(IntToSize(input_size * sizeof(T)));
len_axis_[i] = SizeToInt(input_shape[axis_]);
}
workspace_size_list_.push_back(sizeof(T *) * input_num_);
workspace_size_list_.push_back(sizeof(int) * input_num_);
auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
output_size_ = sizeof(T);
for (size_t i = 0; i < output_shape.size(); i++) {
output_size_ = 1;
for (int i = 0; i < SizeToInt(output_shape.size()); i++) {
output_size_ *= output_shape[i];
if (i > axis_) {
all_size_before_axis_ *= output_shape[i];
all_size_axis_ *= output_shape[i];
}
if (i == axis_) {
all_size_before_axis_ *= output_shape[i];
}
}
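// Illustrative example (not from the original file): for an output shape of (2, 3, 4, 5) with
// axis_ == 1, this loop yields output_size_ = 120, all_size_before_axis_ = 3 * 4 * 5 = 60
// (dims from the axis onward) and all_size_axis_ = 4 * 5 = 20 (dims after the axis), which
// ConcatKernel uses for its index arithmetic along the concat axis.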
output_size_list_.push_back(output_size_);
output_size_list_.push_back(IntToSize(output_size_ * sizeof(T)));
InitSizeLists();
return true;
@ -103,11 +108,6 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
private:
bool CheckParam(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num < 2 || input_num > 4) {
MS_LOG(ERROR) << "Input number is " << input_num << ", but ConcatV2GpuFwdKernel needs inputs between 2 and 4.";
return false;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(ERROR) << "Output number is " << output_num << ", but ConcatV2GpuFwdKernel needs 1 output.";
@ -115,9 +115,13 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
}
return true;
}
int w_[4] = {1, 1, 1, 1};
int axis_;
size_t output_size_;
int input_num_;
int output_size_;
int all_size_before_axis_;
int all_size_axis_;
std::unique_ptr<T *[]> inputs_host_;
std::unique_ptr<int[]> len_axis_;
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;

View File

@ -0,0 +1,31 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_ONE(
Split, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
SplitGpuFwdKernel, float)
MS_REG_GPU_KERNEL_ONE(Split,
KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
SplitGpuFwdKernel, int)
MS_REG_GPU_KERNEL_ONE(
Split, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
SplitGpuFwdKernel, half)
} // namespace kernel
} // namespace mindspore

View File

@ -0,0 +1,153 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_SPLIT_GPU_KERNEL_H
#define MINDSPORE_CCSRC_KERNEL_GPU_SPLIT_GPU_KERNEL_H
#include <vector>
#include <memory>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/split_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T>
class SplitGpuFwdKernel : public GpuKernel {
public:
SplitGpuFwdKernel()
: axis_(0),
output_num_(1),
input_size_(1),
axis_step_(1),
all_size_before_axis_(1),
all_size_axis_(1),
outputs_host_(nullptr) {}
~SplitGpuFwdKernel() override = default;
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
T *input = GetDeviceAddress<T>(inputs, 0);
T **outputs_device = GetDeviceAddress<T *>(workspace, 0);
for (size_t i = 0; i < outputs.size(); i++) {
outputs_host_[i] = GetDeviceAddress<T>(outputs, i);
}
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(outputs_device, outputs_host_.get(), sizeof(T *) * output_num_,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"Split opt cudaMemcpyAsync outputs failed");
SplitKernel(input_size_, axis_step_, all_size_before_axis_, all_size_axis_, input, outputs_device,
reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}
bool Init(const CNodePtr &kernel_node) override {
axis_ = GetAttr<int>(kernel_node, "axis");
if (axis_ < 0) {
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
axis_ += SizeToInt(input_shape.size());
}
output_num_ = GetAttr<int>(kernel_node, "output_num");
if (!CheckParam(kernel_node)) {
return false;
}
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
input_size_ = 1;
all_size_before_axis_ = 1;
all_size_axis_ = 1;
for (int i = 0; i < SizeToInt(input_shape.size()); i++) {
input_size_ *= input_shape[i];
if (i > axis_) {
all_size_before_axis_ *= input_shape[i];
all_size_axis_ *= input_shape[i];
}
if (i == axis_) {
all_size_before_axis_ *= input_shape[i];
}
}
input_size_list_.push_back(IntToSize(input_size_ * sizeof(T)));
axis_step_ = input_shape[axis_] / output_num_;
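// Illustrative example (not from the original file): splitting a (4, 6, 8) input along
// axis_ == 1 into output_num_ == 3 pieces gives input_size_ = 192,
// all_size_before_axis_ = 6 * 8 = 48, all_size_axis_ = 8 and axis_step_ = 2, i.e. each output
// covers 2 consecutive slices of the split axis and has shape (4, 2, 8).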
for (int i = 0; i < output_num_; i++) {
size_t output_size = 1;
auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, i);
for (size_t j = 0; j < output_shape.size(); j++) {
output_size *= output_shape[j];
}
output_size_list_.push_back(output_size * sizeof(T));
}
workspace_size_list_.push_back(sizeof(T *) * output_num_);
InitSizeLists();
outputs_host_ = std::make_unique<T *[]>(output_num_);
return true;
}
protected:
void InitSizeLists() override {}
private:
bool CheckParam(const CNodePtr &kernel_node) {
auto input_num = AnfAlgo::GetInputTensorNum(kernel_node);
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
int dims = SizeToInt(input_shape.size());
int output_num = SizeToInt(AnfAlgo::GetOutputTensorNum(kernel_node));
if (input_num != 1) {
MS_LOG(ERROR) << "Input number is " << input_num << ", but Split needs 1 input.";
return false;
}
if (dims == 0) {
MS_LOG(ERROR) << "Input dims is " << dims << ", scalar is not supported.";
return false;
}
if (axis_ < -dims || axis_ >= dims) {
MS_LOG(ERROR) << "Attr axis " << axis_ << " must be in " << -dims << "~" << dims;
return false;
}
if (output_num_ > SizeToInt(input_shape[axis_])) {
MS_LOG(ERROR) << "Attr output_num " << output_num_ << "must less than" << input_shape[axis_];
return false;
}
if (input_shape[axis_] % output_num_ != 0) {
MS_LOG(ERROR) << "Attr output_num " << output_num_ << "must be divided by" << input_shape[axis_];
return false;
}
if (output_num_ != output_num) {
MS_LOG(ERROR) << "Output num is " << output_num << ", but need " << output_num_;
return false;
}
return true;
}
int axis_;
int output_num_;
int input_size_;
int axis_step_;
int all_size_before_axis_;
int all_size_axis_;
std::unique_ptr<T *[]> outputs_host_;
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_KERNEL_GPU_SPLIT_GPU_KERNEL_H
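Launch above has to hand the kernel a device-visible array of output pointers: it gathers each output's device address into outputs_host_ and then copies that pointer array into the workspace buffer with cudaMemcpyAsync. Below is a minimal, self-contained CUDA sketch of the same marshaling pattern; the toy_scatter kernel, buffer sizes, and names are illustrative stand-ins, not MindSpore code.

#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

// Stand-in for SplitKernel: element i of the input lands in output buffer i.
__global__ void toy_scatter(const float *input, float **outputs, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    outputs[i][0] = input[i];
  }
}

int main() {
  const int n = 4;
  float h_in[n] = {0.f, 1.f, 2.f, 3.f};
  float *d_in = nullptr;
  cudaMalloc(&d_in, n * sizeof(float));
  cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);

  // Host-side array of device pointers, mirroring outputs_host_ above.
  std::vector<float *> h_outputs(n);
  for (int i = 0; i < n; ++i) {
    cudaMalloc(&h_outputs[i], sizeof(float));
  }

  // Device copy of the pointer array, mirroring the workspace + cudaMemcpyAsync step.
  float **d_outputs = nullptr;
  cudaMalloc(&d_outputs, n * sizeof(float *));
  cudaMemcpy(d_outputs, h_outputs.data(), n * sizeof(float *), cudaMemcpyHostToDevice);

  toy_scatter<<<1, n>>>(d_in, d_outputs, n);
  cudaDeviceSynchronize();

  for (int i = 0; i < n; ++i) {
    float v = 0.f;
    cudaMemcpy(&v, h_outputs[i], sizeof(float), cudaMemcpyDeviceToHost);
    printf("output[%d] = %g\n", i, v);
    cudaFree(h_outputs[i]);
  }
  cudaFree(d_outputs);
  cudaFree(d_in);
  return 0;
}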

View File

@ -0,0 +1,29 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/arrays/topk_gpu_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(TopK,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeInt32),
TopKGpuKernel, float, int)
}
} // namespace mindspore

View File

@ -0,0 +1,110 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_TOPK_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_TOPK_H_
#include <vector>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/topk_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T, typename S>
class TopKGpuKernel : public GpuKernel {
public:
TopKGpuKernel() : sorted_(false), outer_size_(1), inner_size_(1), k_(1), use_share_mem_(true), ceil_power2_(0) {}
~TopKGpuKernel() override = default;
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspaces,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
T *input_addr = GetDeviceAddress<T>(inputs, 0);
S *k = GetDeviceAddress<S>(inputs, 1);
T *output_addr = GetDeviceAddress<T>(outputs, 0);
S *indices = GetDeviceAddress<S>(outputs, 1);
T *data_buff = nullptr;
S *index_buff = nullptr;
if (use_share_mem_ == false) {
data_buff = GetDeviceAddress<T>(workspaces, 0);
index_buff = GetDeviceAddress<S>(workspaces, 1);
}
TopK(outer_size_, inner_size_, input_addr, k, output_addr, indices, data_buff, index_buff,
reinterpret_cast<cudaStream_t>(stream_ptr));
if (sorted_ == false) {
BitonicSortByKey(outer_size_, k_, output_addr, indices, data_buff, index_buff,
reinterpret_cast<cudaStream_t>(stream_ptr));
}
return true;
}
bool Init(const CNodePtr &kernel_node) override {
auto input_shapes = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto output_shapes = AnfAlgo::GetOutputInferShape(kernel_node, 0);
for (size_t i = 0; i < input_shapes.size() - 1; i++) {
outer_size_ *= input_shapes[i];
}
inner_size_ = input_shapes[input_shapes.size() - 1];
k_ = output_shapes[output_shapes.size() - 1];
sorted_ = GetAttr<bool>(kernel_node, "sorted");
ceil_power2_ = RoundUpPower2(inner_size_);
size_t buffer_size = ceil_power2_ * (sizeof(T) + sizeof(S));
if (buffer_size > SHARED_MEM_PER_BLOCK) {
use_share_mem_ = false;
MS_LOG(WARNING) << "CUDA share memory not enough, sort with RAM";
}
InitSizeLists();
return true;
}
protected:
void InitSizeLists() override {
input_size_list_.push_back(outer_size_ * inner_size_ * sizeof(T));
input_size_list_.push_back(sizeof(S));
output_size_list_.push_back(outer_size_ * k_ * sizeof(T));
output_size_list_.push_back(outer_size_ * k_ * sizeof(S));
if (use_share_mem_ == false) {
workspace_size_list_.push_back(outer_size_ * ceil_power2_ * sizeof(T));
workspace_size_list_.push_back(outer_size_ * ceil_power2_ * sizeof(S));
}
}
private:
bool sorted_;
int outer_size_;
int inner_size_;
int k_;
bool use_share_mem_;
int ceil_power2_;
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
};
} // namespace kernel
} // namespace mindspore
#endif  // MINDSPORE_CCSRC_KERNEL_GPU_TOPK_H_
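Init() above sizes its buffers by rounding the innermost dimension up to a power of two and checking whether a key/index pair of that length still fits in one block's shared memory. A host-only sketch of that decision follows; the 48 KB budget is only an illustrative assumption standing in for the SHARED_MEM_PER_BLOCK constant defined elsewhere in the repo.

#include <cstdio>

int RoundUpPower2(int v) {  // same bit trick as topk_impl.cu
  v--;
  v |= v >> 1;
  v |= v >> 2;
  v |= v >> 4;
  v |= v >> 8;
  v |= v >> 16;
  v++;
  return v;
}

int main() {
  const size_t kSharedMemPerBlock = 48 * 1024;  // assumed budget, stand-in for SHARED_MEM_PER_BLOCK
  const int inner_sizes[] = {100, 1000, 10000};
  for (int inner : inner_sizes) {
    int ceil_power2 = RoundUpPower2(inner);
    size_t buffer = static_cast<size_t>(ceil_power2) * (sizeof(float) + sizeof(int));
    bool use_share_mem = buffer <= kSharedMemPerBlock;
    printf("inner=%5d -> ceil_power2=%5d, buffer=%6zu bytes, %s\n", inner, ceil_power2, buffer,
           use_share_mem ? "sort in shared memory" : "fall back to workspace");
  }
  return 0;
}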

View File

@ -116,16 +116,16 @@ __global__ void BroadcastKernel(const int l0, const int l1, const int l2, const
output);
case BROADCAST_TYPE_REALDIV:
return BroadcastOperator<T, S, RealDivFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
output);
output);
case BROADCAST_TYPE_MUL:
return BroadcastOperator<T, S, MulFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
output);
output);
case BROADCAST_TYPE_SUB:
return BroadcastOperator<T, S, SubFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
output);
output);
case BROADCAST_TYPE_ADD:
return BroadcastOperator<T, S, AddFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
output);
output);
}
}
@ -176,6 +176,28 @@ void NoBroadcast(const int &nums, enum BroadcastOpType op, const T *input0, cons
NoBroadcastKernel<<<GET_BLOCKS(nums), GET_THREADS, 0, stream>>>(nums, op, input0, input1, output);
}
template <typename T>
__global__ void BroadcastToKernel(const int i0, const int i1, const int i2, const int i3, const int o0,
const int o1, const int o2, const int o3, const T *input_addr, T *output_addr) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < o0 * o1 * o2 * o3; pos += blockDim.x * gridDim.x) {
int i = pos / (o1 * o2 * o3) % o0;
int j = pos / (o2 * o3) % o1;
int k = pos / o3 % o2;
int l = pos % o3;
int input_idx = Index(i, i0) * i1 * i2 * i3 + Index(j, i1) * i2 * i3 + Index(k, i2) * i3 + Index(l, i3);
output_addr[pos] = input_addr[input_idx];
}
}
template <typename T>
void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1,
const int &o2, const int &o3, const T *input_addr, T *output_addr, cudaStream_t stream) {
int nums = o0 * o1 * o2 * o3;
BroadcastToKernel<<<GET_BLOCKS(nums), GET_THREADS, 0, stream>>>(i0, i1, i2, i3, o0, o1, o2, o3, input_addr,
output_addr);
}
template void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
enum BroadcastOpType op, const float *input0, const float *input1, bool *output,
@ -204,5 +226,11 @@ template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half *
bool *output, cudaStream_t stream);
template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half *input0, const half *input1,
half *output, cudaStream_t stream);
template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1,
int *output, cudaStream_t stream);
template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1, int *output,
cudaStream_t stream);
template void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1,
const int &o2, const int &o3, const float *input_addr, float *output_addr,
cudaStream_t stream);
template void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1,
const int &o2, const int &o3, const half *input_addr, half *output_addr, cudaStream_t stream);
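BroadcastToKernel decomposes each output position into four coordinates and maps them back through the Index helper defined earlier in this file (not shown in the hunk); the usual convention, assumed here, is that Index(i, dim) returns 0 when the corresponding input dimension is 1 and i otherwise. A host-only sketch of that mapping for a (1, 1, 2, 3) input broadcast to (2, 2, 2, 3):

#include <cstdio>

// Assumed behaviour of the device-side Index helper: size-1 dimensions collapse to 0.
inline int Index(int i, int dim) { return dim == 1 ? 0 : i; }

int main() {
  const int i0 = 1, i1 = 1, i2 = 2, i3 = 3;   // input shape
  const int o0 = 2, o1 = 2, o2 = 2, o3 = 3;   // output shape
  float input[i0 * i1 * i2 * i3];
  for (int n = 0; n < i2 * i3; ++n) input[n] = static_cast<float>(n);

  for (int pos = 0; pos < o0 * o1 * o2 * o3; ++pos) {
    int i = pos / (o1 * o2 * o3) % o0;
    int j = pos / (o2 * o3) % o1;
    int k = pos / o3 % o2;
    int l = pos % o3;
    int input_idx = Index(i, i0) * i1 * i2 * i3 + Index(j, i1) * i2 * i3 + Index(k, i2) * i3 + Index(l, i3);
    printf("out[%2d] <- in[%d] = %g\n", pos, input_idx, input[input_idx]);
  }
  return 0;
}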

View File

@ -41,4 +41,8 @@ template <typename T, typename S>
void NoBroadcast(const int &size, enum BroadcastOpType op, const T *input0, const T *input1, S *output,
cudaStream_t stream);
template <typename T>
void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1,
const int &o2, const int &o3, const T *input_addr, T *output_addr, cudaStream_t stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_

View File

@ -19,90 +19,51 @@
#include <cuda_runtime.h>
#include "backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cuh"
template <typename T>
__global__ void Concat(const size_t size, const int w1, const int w2, const T* input_1, const T* input_2, T* output) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
int n = pos / (w1 + w2);
int m = pos % (w1 + w2);
output[pos] = m >= w1 ? input_2[n * w2 + m - w1] : input_1[n * w1 + m];
__global__ void Concat(const int size, const int input_num,
const int all_size_before_axis, const int all_size_axis,
int* len_axis, T** inputs, T* output) {
for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
int num = pos % all_size_before_axis / all_size_axis;
int block = -1;
int axis_inc = 0;
int block_len = 0;
for (int i = 0; i < input_num; i++) {
if (axis_inc <= num) {
block++;
axis_inc += len_axis[i];
} else {
break;
}
}
block_len = len_axis[block];
axis_inc -= len_axis[block];
int block_pos = pos / all_size_before_axis * block_len * all_size_axis +
                (num - axis_inc) * all_size_axis + pos % all_size_axis;
output[pos] = inputs[block][block_pos];
}
return;
}
template <typename T>
__global__ void Concat(const size_t size, const int w1, const int w2, const int w3,
const T* input_1, const T* input_2, const T* input_3, T* output) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
int n = pos / (w1 + w2 + w3);
int m = pos % (w1 + w2 + w3);
output[pos] = m < w1 ? input_1[n * w1 + m] :
m < w1 + w2 ? input_2[n * w2 + m - w1] :
input_3[n * w3 + m - w1 - w2];
}
return;
}
template <typename T>
__global__ void Concat(const size_t size, const int w1, const int w2, const int w3, const int w4,
const T* input_1, const T* input_2, const T* input_3, const T* input_4, T* output) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
int n = pos / (w1 + w2 + w3 + w4);
int m = pos % (w1 + w2 + w3 + w4);
output[pos] = m < w1 ? input_1[n * w1 + m] :
m < w1 + w2 ? input_2[n * w2 + m - w1]:
m < w1 + w2 + w3 ? input_3[n * w3 + m - w1 - w2]:
input_4[n * w4 + m - w1 - w2 - w3];
}
return;
}
template <typename T>
void ConcatKernel(const size_t size, const int w1, const int w2, const T* input_1, const T* input_2, T* output,
cudaStream_t cuda_stream) {
Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, w1, w2, input_1, input_2, output);
return;
}
template <typename T>
void ConcatKernel(const size_t size, const int w1, const int w2, const int w3,
const T* input_1, const T* input_2, const T* input_3, T* output,
void ConcatKernel(const int size, const int input_num,
const int all_size_before_axis, const int all_size_axis,
int* len_axis, T** inputs, T* output,
cudaStream_t cuda_stream) {
Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, w1, w2, w3, input_1, input_2, input_3, output);
Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_num,
all_size_before_axis, all_size_axis,
len_axis, inputs, output);
return;
}
template <typename T>
void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4,
const T* input_1, const T* input_2, const T* input_3, const T* input_4, T* output,
cudaStream_t cuda_stream) {
Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, w1, w2, w3, w4, input_1,
input_2, input_3, input_4, output);
return;
}
template void ConcatKernel(const size_t size, const int w1, const int w2, const float* input_1, const float* input_2,
float* output, cudaStream_t cuda_stream);
template void ConcatKernel(const size_t size, const int w1, const int w2, const int* input_1, const int* input_2,
int* output, cudaStream_t cuda_stream);
template void ConcatKernel(const size_t size, const int w1, const int w2, const half* input_1, const half* input_2,
half* output, cudaStream_t cuda_stream);
template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3,
const float* input_1, const float* input_2, const float* input_3,
float* output, cudaStream_t cuda_stream);
template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3,
const int* input_1, const int* input_2, const int* input_3,
int* output, cudaStream_t cuda_stream);
template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3,
const half* input_1, const half* input_2, const half* input_3,
half* output, cudaStream_t cuda_stream);
template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4,
const float* input_1, const float* input_2, const float* input_3, const float* input_4,
float* output, cudaStream_t cuda_stream);
template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4,
const int* input_1, const int* input_2, const int* input_3, const int* input_4,
int* output, cudaStream_t cuda_stream);
template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4,
const half* input_1, const half* input_2, const half* input_3, const half* input_4,
half* output, cudaStream_t cuda_stream);
template void ConcatKernel(const int size, const int input_num,
const int all_size_before_axis, const int all_size_axis,
int* len_axis, float** inputs, float* output,
cudaStream_t cuda_stream);
template void ConcatKernel(const int size, const int input_num,
const int all_size_before_axis, const int all_size_axis,
int* len_axis, int** inputs, int* output,
cudaStream_t cuda_stream);
template void ConcatKernel(const int size, const int input_num,
const int all_size_before_axis, const int all_size_axis,
int* len_axis, half** inputs, half* output,
cudaStream_t cuda_stream);
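The rewritten Concat kernel no longer special-cases 2, 3, or 4 inputs; it scans the per-input lengths along the concat axis (len_axis) to find which input a given output element comes from. A host-only sketch of that search for two inputs of shape (2, 2) and (2, 3) concatenated along axis 1; all_size_before_axis and all_size_axis are taken from the output shape, as the corresponding kernel's Init (not shown in this hunk) would compute them.

#include <cstdio>
#include <vector>

int main() {
  // A is (2, 2), B is (2, 3); concatenated along axis 1 the output is (2, 5).
  std::vector<float> a = {0, 1, 2, 3};
  std::vector<float> b = {10, 11, 12, 13, 14, 15};
  const float *inputs[] = {a.data(), b.data()};
  const int len_axis[] = {2, 3};        // per-input length along the concat axis
  const int input_num = 2;
  const int all_size_axis = 1;          // product of output dims after the axis
  const int all_size_before_axis = 5;   // product of output dims from the axis onward
  const int size = 2 * 5;               // total output elements

  for (int pos = 0; pos < size; ++pos) {
    int num = pos % all_size_before_axis / all_size_axis;  // output coordinate along the axis
    int block = -1;
    int axis_inc = 0;
    for (int i = 0; i < input_num; ++i) {
      if (axis_inc <= num) {
        block++;
        axis_inc += len_axis[i];
      } else {
        break;
      }
    }
    int block_len = len_axis[block];
    axis_inc -= len_axis[block];
    int block_pos = pos / all_size_before_axis * block_len * all_size_axis +
                    (num - axis_inc) * all_size_axis + pos % all_size_axis;
    printf("%6.1f%s", inputs[block][block_pos], (pos + 1) % 5 == 0 ? "\n" : " ");
  }
  return 0;
}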

View File

@ -19,13 +19,8 @@
#include "runtime/device/gpu/cuda_common.h"
template <typename T>
void ConcatKernel(const size_t size, const int w1, const int w2, const T* input_1, const T* input_2, T* output,
cudaStream_t cuda_stream);
template <typename T>
void ConcatKernel(const size_t size, const int w1, const int w2, const int w3,
const T* input_1, const T* input_2, const T* input_3, T* output, cudaStream_t cuda_stream);
template <typename T>
void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4,
const T* input_1, const T* input_2, const T* input_3, const T* input_4, T* output,
void ConcatKernel(const int size, const int input_num,
const int all_size_before_axis, const int all_size_axis,
int* len_axis, T** inputs, T* output,
cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CONCATV2IMPL_H_

View File

@ -15,9 +15,9 @@
*/
#include "momentum_impl.cuh"
template <typename T, typename S>
template <typename T, typename S, typename G>
__global__ void MomentumUpdateVariableKernel(const size_t size, T *variable, T *accumulation, const S *learning_rate,
const T *gradient, const S *momentum) {
const G *gradient, const S *momentum) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) {
accumulation[i] = momentum[0] * accumulation[i] + gradient[i];
variable[i] -= learning_rate[0] * accumulation[i];
@ -34,19 +34,32 @@ __global__ void MomentumUpdateVariableKernel(const size_t size, half *variable,
}
return;
}
template <typename T, typename S>
void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const T *gradient,
template <>
__global__ void MomentumUpdateVariableKernel(const size_t size, float *variable, float *accumulation,
const float *learning_rate, const half *gradient,
const float *momentum) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) {
accumulation[i] = momentum[0] * accumulation[i] + __half2float(gradient[i]);
variable[i] -= learning_rate[0] * accumulation[i];
}
return;
}
template <typename T, typename S, typename G>
void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const G *gradient,
const S *momentum, cudaStream_t cuda_stream) {
MomentumUpdateVariableKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, variable, accumulation,
learning_rate, gradient, momentum);
return;
}
template void MomentumUpdateVariable<float, float>(const size_t size, float *variable, float *accumulation,
template void MomentumUpdateVariable<float, float, float>(const size_t size, float *variable, float *accumulation,
const float *learning_rate, const float *gradient,
const float *momentum, cudaStream_t cuda_stream);
template void MomentumUpdateVariable<half, half>(const size_t size, half *variable, half *accumulation,
template void MomentumUpdateVariable<half, half, half>(const size_t size, half *variable, half *accumulation,
const half *learning_rate, const half *gradient,
const half *momentum, cudaStream_t cuda_stream);
template void MomentumUpdateVariable<half, float>(const size_t size, half *variable, half *accumulation,
template void MomentumUpdateVariable<half, float, half>(const size_t size, half *variable, half *accumulation,
const float *learning_rate, const half *gradient,
const float *momentum, cudaStream_t cuda_stream);
template void MomentumUpdateVariable<float, float, half>(const size_t size, float *variable, float *accumulation,
const float *learning_rate, const half *gradient,
const float *momentum, cudaStream_t cuda_stream);
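The new G template parameter lets the gradient carry a different precision than the variable; the float/half specialization widens each gradient element with __half2float before the fused update. A host-only sketch of the update rule itself, with plain float standing in for the half-to-float conversion:

#include <cstdio>

int main() {
  // accumulation <- momentum * accumulation + gradient; variable <- variable - lr * accumulation
  float variable[3] = {1.0f, 2.0f, 3.0f};
  float accumulation[3] = {0.0f, 0.0f, 0.0f};
  const float gradient[3] = {0.1f, 0.2f, 0.3f};  // in the specialized kernel this is half, widened per element
  const float learning_rate = 0.5f;
  const float momentum = 0.9f;

  for (int step = 0; step < 2; ++step) {
    for (int i = 0; i < 3; ++i) {
      accumulation[i] = momentum * accumulation[i] + gradient[i];
      variable[i] -= learning_rate * accumulation[i];
    }
  }
  for (int i = 0; i < 3; ++i) {
    printf("variable[%d] = %f\n", i, variable[i]);
  }
  return 0;
}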

View File

@ -18,8 +18,8 @@
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MOMENTUMIMPL_H_
#include "runtime/device/gpu/cuda_common.h"
template <typename T, typename S>
void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const T *gradient,
template <typename T, typename S, typename G>
void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const G *gradient,
const S *momentum, cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MOMENTUMIMPL_H_

View File

@ -0,0 +1,50 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stdio.h>
#include <stdint.h>
#include <cuda_runtime.h>
#include "backend/kernel_compiler/gpu/cuda_impl/split_impl.cuh"
template <typename T>
__global__ void Split(const int size, const int axis_step, const int all_size_before_axis,
const int all_size_axis, const T* input, T** outputs) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
int num = pos % all_size_before_axis / all_size_axis;
int block = num / axis_step;
int block_pos = pos / all_size_before_axis * axis_step * all_size_axis +
num % axis_step * all_size_axis + pos % all_size_axis;
outputs[block][block_pos] = input[pos];
}
return;
}
template <typename T>
void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
const int all_size_axis, const T* input, T** outputs, cudaStream_t cuda_stream) {
Split<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, axis_step, all_size_before_axis,
all_size_axis, input, outputs);
return;
}
template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
const int all_size_axis, const float* input, float** outputs,
cudaStream_t cuda_stream);
template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
const int all_size_axis, const int* input, int** outputs,
cudaStream_t cuda_stream);
template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
const int all_size_axis, const half* input, half** outputs,
cudaStream_t cuda_stream);
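The same index arithmetic can be checked on the host: num is the coordinate along the split axis, block selects the output tensor, and block_pos is the flat position inside it. A sketch for a (2, 6, 4) tensor split into 3 outputs along axis 1 (so axis_step = 2 and each output is (2, 2, 4)):

#include <cstdio>

int main() {
  const int shape[3] = {2, 6, 4};
  const int axis = 1;
  const int output_num = 3;

  const int all_size_axis = shape[2];                     // product of dims after the axis
  const int all_size_before_axis = shape[1] * shape[2];   // product of dims from the axis onward
  const int axis_step = shape[axis] / output_num;
  const int size = shape[0] * shape[1] * shape[2];

  for (int pos = 0; pos < size; ++pos) {
    int num = pos % all_size_before_axis / all_size_axis;  // coordinate along the split axis
    int block = num / axis_step;                           // which output receives this element
    int block_pos = pos / all_size_before_axis * axis_step * all_size_axis +
                    num % axis_step * all_size_axis + pos % all_size_axis;
    if (pos % all_size_axis == 0) {                        // one line per innermost row, for brevity
      printf("input pos %2d -> output %d, position %2d\n", pos, block, block_pos);
    }
  }
  return 0;
}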

View File

@ -0,0 +1,24 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_
#include "runtime/device/gpu/cuda_common.h"
template <typename T>
void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
const int all_size_axis, const T* input, T** outputs, cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_

View File

@ -0,0 +1,162 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/cuda_impl/topk_impl.cuh"
#include <limits>
#include <algorithm>
int RoundUpPower2(int v) {
v--;
v |= v >> 1;
v |= v >> 2;
v |= v >> 4;
v |= v >> 8;
v |= v >> 16;
v++;
return v;
}
template <typename T>
__inline__ __device__ void Swap(T *lhs, T *rhs) {
T tmp = lhs[0];
lhs[0] = rhs[0];
rhs[0] = tmp;
}
template <typename T, typename S>
__global__ void TopkKernel(const int outer, const int inner, const int ceil_power2, const T *input, const S *k,
T *output, S *indices, T *data_buff, S *index_buff) {
// default: sort in shared memory
extern __shared__ T share_mem[];
T *data_arr = share_mem;
S *index_arr = reinterpret_cast<S *>(data_arr + ceil_power2);
// otherwise sort in the global-memory workspace
if (data_buff != nullptr && index_buff != nullptr) {
data_arr = data_buff + blockIdx.x * ceil_power2;
index_arr = index_buff + blockIdx.x * ceil_power2;
}
for (int i = threadIdx.x; i < ceil_power2; i += blockDim.x) {
data_arr[i] = (i < inner) ? input[blockIdx.x * inner + i] : std::numeric_limits<T>::max();
index_arr[i] = i;
}
__syncthreads();
for (size_t i = 2; i <= ceil_power2; i <<= 1) {
for (size_t j = (i >> 1); j > 0; j >>= 1) {
for (size_t tid = threadIdx.x; tid < ceil_power2; tid += blockDim.x) {
size_t tid_comp = tid ^ j;
if (tid_comp > tid) {
if ((tid & i) == 0) {
if (data_arr[tid] > data_arr[tid_comp]) {
Swap(&data_arr[tid], &data_arr[tid_comp]);
Swap(&index_arr[tid], &index_arr[tid_comp]);
}
} else {
if (data_arr[tid] < data_arr[tid_comp]) {
Swap(&data_arr[tid], &data_arr[tid_comp]);
Swap(&index_arr[tid], &index_arr[tid_comp]);
}
}
}
}
__syncthreads();
}
}
for (size_t tid = threadIdx.x; tid < k[0]; tid += blockDim.x) {
output[blockIdx.x * k[0] + tid] = data_arr[inner - tid - 1];
indices[blockIdx.x * k[0] + tid] = index_arr[inner - tid - 1];
}
}
template <typename T, typename S>
void TopK(const int &outer, const int &inner, const T *input, const S *k, T *output, S *indices, T *data_buff,
S *index_buff, cudaStream_t stream) {
int ceil_power2 = RoundUpPower2(inner);
int share_mem = (data_buff == nullptr) ? ceil_power2 * (sizeof(T) + sizeof(S)) : 0;
int thread = std::min(ceil_power2, GET_THREADS);
TopkKernel<<<outer, thread, share_mem, stream>>>(outer, inner, ceil_power2, input, k, output, indices, data_buff,
index_buff);
}
template <typename T, typename S>
__global__ void BitonicSortByKeyKernel(const int outer, const int inner, const int ceil_power2, T *input,
S *indices, T *data_buff, S *index_buff) {
// default: sort in shared memory
extern __shared__ T share_mem[];
T *data_arr = share_mem;
S *index_arr = reinterpret_cast<S *>(data_arr + ceil_power2);
// otherwise sort in the global-memory workspace
if (data_buff != nullptr && index_buff != nullptr) {
data_arr = data_buff + blockIdx.x * ceil_power2;
index_arr = index_buff + blockIdx.x * ceil_power2;
}
for (int i = threadIdx.x; i < ceil_power2; i += blockDim.x) {
data_arr[i] = (i < inner) ? input[blockIdx.x * inner + i] : std::numeric_limits<T>::max();
index_arr[i] = (i < inner) ? indices[blockIdx.x * inner + i] : std::numeric_limits<S>::max();
}
__syncthreads();
for (size_t i = 2; i <= ceil_power2; i <<= 1) {
for (size_t j = (i >> 1); j > 0; j >>= 1) {
for (size_t tid = threadIdx.x; tid < ceil_power2; tid += blockDim.x) {
size_t tid_comp = tid ^ j;
if (tid_comp > tid) {
if ((tid & i) == 0) {
if (index_arr[tid] > index_arr[tid_comp]) {
Swap(&data_arr[tid], &data_arr[tid_comp]);
Swap(&index_arr[tid], &index_arr[tid_comp]);
}
} else {
if (index_arr[tid] < index_arr[tid_comp]) {
Swap(&data_arr[tid], &data_arr[tid_comp]);
Swap(&index_arr[tid], &index_arr[tid_comp]);
}
}
}
}
__syncthreads();
}
}
for (size_t tid = threadIdx.x; tid < inner; tid += blockDim.x) {
input[blockIdx.x * inner + tid] = data_arr[tid];
indices[blockIdx.x * inner + tid] = index_arr[tid];
}
}
template <typename T, typename S>
void BitonicSortByKey(const int &outer, const int &inner, T *input, S *indices, T *data_buff, S *index_buff,
cudaStream_t stream) {
int ceil_power2 = RoundUpPower2(inner);
size_t share_mem = ceil_power2 * (sizeof(T) + sizeof(S));
if (share_mem > SHARED_MEM_PER_BLOCK) {
share_mem = 0;
} else {
data_buff = nullptr;
index_buff = nullptr;
}
int thread = std::min(ceil_power2, GET_THREADS);
BitonicSortByKeyKernel<<<outer, thread, share_mem, stream>>>(outer, inner, ceil_power2, input, indices, data_buff,
index_buff);
}
template void TopK(const int &outer, const int &inner, const float *input_addr, const int *k, float *output,
int *indices, float *data_buff, int *index_buff, cudaStream_t stream);
template void BitonicSortByKey(const int &outer, const int &inner, float *input, int *indices, float *data_buff,
int *index_buff, cudaStream_t stream);
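The compare-exchange pattern above (tid ^ j picks the partner, tid & i picks the direction) is a textbook bitonic sorting network over a power-of-two array; padding with the type's max value keeps the real elements in the low positions after an ascending sort, which is why TopK reads the largest k values from the tail. A host-only sketch of the same network, serialized:

#include <algorithm>
#include <cstdio>
#include <limits>
#include <vector>

int main() {
  std::vector<float> data = {5.f, 1.f, 4.f, 2.f, 8.f, 7.f};
  const size_t n = 8;  // next power of two, as RoundUpPower2 would return
  data.resize(n, std::numeric_limits<float>::max());  // pad like the kernel does

  for (size_t i = 2; i <= n; i <<= 1) {
    for (size_t j = i >> 1; j > 0; j >>= 1) {
      for (size_t tid = 0; tid < n; ++tid) {
        size_t partner = tid ^ j;
        if (partner > tid) {
          bool ascending = (tid & i) == 0;
          if (ascending ? data[tid] > data[partner] : data[tid] < data[partner]) {
            std::swap(data[tid], data[partner]);
          }
        }
      }
      // __syncthreads() would go here on the device; the serial loop needs nothing.
    }
  }
  for (float v : data) {
    printf("%g ", v);  // 1 2 4 5 7 8 max max
  }
  printf("\n");
  return 0;
}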

View File

@ -0,0 +1,32 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TOPK_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TOPK_H_
#include <cuda_runtime.h>
#include "runtime/device/gpu/cuda_common.h"
template <typename T, typename S>
void TopK(const int &outer, const int &inner, const T *input_addr, const S *k, T *output, S *indices, T *data_buff,
S *index_buff, cudaStream_t stream);
template <typename T, typename S>
void BitonicSortByKey(const int &outer, const int &inner, T *input, S *indices, T *data_buff, S *index_buff,
cudaStream_t stream);
int RoundUpPower2(int v);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TOPK_H_

View File

@ -103,6 +103,35 @@ __global__ void ZeroslikeKernel(T *output, size_t count) {
return;
}
template <typename T>
__global__ void AbsKernel(T *input, T *output, size_t count) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
output[i] = abs(input[i]);
}
return;
}
template <>
__global__ void AbsKernel(half *input, half *output, size_t count) {
half zero = 0.0;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
output[i] = input[i] < zero ? -input[i] : input[i];
}
return;
}
template <typename T>
__global__ void FloorKernel(T *input, T *output, size_t count) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
output[i] = floor(input[i]);
}
return;
}
template <>
__global__ void FloorKernel(half *input, half *output, size_t count) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
output[i] = hfloor(input[i]);
}
return;
}
template <typename T>
void Exponential(T *input, T *output, size_t count, cudaStream_t cuda_stream) {
ExponentialKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
return;
@ -147,6 +176,16 @@ void Zeroslike(T *output, size_t count, cudaStream_t cuda_stream) {
ZeroslikeKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(output, count);
return;
}
template <typename T>
void Abs(T *input, T *output, size_t count, cudaStream_t cuda_stream) {
AbsKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
return;
}
template <typename T>
void Floor(T *input, T *output, size_t count, cudaStream_t cuda_stream) {
FloorKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
return;
}
template void Exponential<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
template void Logarithm<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
@ -156,6 +195,8 @@ template void Square<float>(float *input, float *output, size_t count, cudaStrea
template void Sqrt<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
template void Rsqrt<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
template void Zeroslike<float>(float *output, size_t count, cudaStream_t cuda_stream);
template void Abs<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
template void Floor<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
template void Exponential<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
template void Logarithm<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
template void Negative<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
@ -164,3 +205,5 @@ template void Square<half>(half *input, half *output, size_t count, cudaStream_t
template void Sqrt<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
template void Rsqrt<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
template void Zeroslike<half>(half *output, size_t count, cudaStream_t cuda_stream);
template void Abs<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
template void Floor<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);

View File

@ -34,5 +34,9 @@ template <typename T>
void Rsqrt(T *input, T *output, size_t count, cudaStream_t cuda_stream);
template <typename T>
void Zeroslike(T *output, size_t count, cudaStream_t cuda_stream);
template <typename T>
void Abs(T *input, T *output, size_t count, cudaStream_t cuda_stream);
template <typename T>
void Floor(T *input, T *output, size_t count, cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNARYOPIMPL_H_

View File

@ -88,6 +88,12 @@ class GpuKernelRegister {
static_assert(std::is_base_of<GpuKernel, OPCLASS<T, S>>::value, " must be base of GpuKernel"); \
static const GpuKernelRegister g_##OPNAME##_##T##_##S##_gpu_kernel_reg(#OPNAME, ATTR, \
[]() { return new OPCLASS<T, S>(); });
// register of mixed accuracy kernels which use template and maintain three typename
#define MS_REG_GPU_KERNEL_THREE(OPNAME, ATTR, OPCLASS, T, S, G) \
static_assert(std::is_base_of<GpuKernel, OPCLASS<T, S, G>>::value, " must be base of GpuKernel"); \
static const GpuKernelRegister g_##OPNAME##_##T##_##S##_##G##_gpu_kernel_reg( \
#OPNAME, ATTR, []() { return new OPCLASS<T, S, G>(); });
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_KERNEL_GPU_GPUKERNELFACTORY_H_
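MS_REG_GPU_KERNEL_THREE follows the same idiom as the one- and two-type variants: it instantiates a static registrar whose constructor stores a factory for OPCLASS<T, S, G> under the op name. A much-simplified, self-contained sketch of that registration idiom; the ToyKernel base, Registry, and MomentumLike names are stand-ins, not MindSpore's GpuKernelFactory.

#include <cstdio>
#include <functional>
#include <map>
#include <memory>
#include <string>

struct ToyKernel {  // stand-in for GpuKernel
  virtual ~ToyKernel() = default;
  virtual const char *Describe() const = 0;
};

std::map<std::string, std::function<std::unique_ptr<ToyKernel>()>> &Registry() {
  static std::map<std::string, std::function<std::unique_ptr<ToyKernel>()>> registry;
  return registry;
}

struct Registrar {
  Registrar(const std::string &name, std::function<std::unique_ptr<ToyKernel>()> creator) {
    Registry()[name] = std::move(creator);
  }
};

// Mirrors the shape of MS_REG_GPU_KERNEL_THREE: one static registrar per (op, T, S, G) combination.
#define REG_KERNEL_THREE(OPNAME, OPCLASS, T, S, G)                       \
  static const Registrar g_##OPNAME##_##T##_##S##_##G##_reg(             \
    #OPNAME "_" #T "_" #S "_" #G,                                        \
    []() { return std::make_unique<OPCLASS<T, S, G>>(); });

template <typename T, typename S, typename G>
struct MomentumLike : public ToyKernel {  // stand-in for MomentumGpuKernel<T, S, G>
  const char *Describe() const override { return "variable, learning rate, and gradient precisions are independent"; }
};

REG_KERNEL_THREE(ApplyMomentum, MomentumLike, float, float, float)
REG_KERNEL_THREE(ApplyMomentum, MomentumLike, float, float, int)

int main() {
  for (const auto &entry : Registry()) {
    printf("%s -> %s\n", entry.first.c_str(), entry.second()->Describe());
  }
  return 0;
}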

View File

@ -46,5 +46,13 @@ MS_REG_GPU_KERNEL_ONE(Sqrt, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOut
UnaryOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(Rsqrt, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
UnaryOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(Abs, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
UnaryOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(Abs, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
UnaryOpGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(Floor, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
UnaryOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(Floor, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
UnaryOpGpuKernel, half)
} // namespace kernel
} // namespace mindspore

View File

@ -36,6 +36,8 @@ enum UnaryOptype {
UNARY_OP_SQUARE,
UNARY_OP_SQRT,
UNARY_OP_RSQRT,
UNARY_OP_ABS,
UNARY_OP_FLOOR,
UNARY_OP_INVALID_TYPE = 255
};
static const std::map<std::string, UnaryOptype> kUnaryOpTypeMap = {{"Exp", UNARY_OP_EXP},
@ -45,7 +47,9 @@ static const std::map<std::string, UnaryOptype> kUnaryOpTypeMap = {{"Exp", UNARY
{"ZerosLike", UNARY_OP_ZEROSLIKE},
{"Square", UNARY_OP_SQUARE},
{"Sqrt", UNARY_OP_SQRT},
{"Rsqrt", UNARY_OP_RSQRT}};
{"Rsqrt", UNARY_OP_RSQRT},
{"Abs", UNARY_OP_ABS},
{"Floor", UNARY_OP_FLOOR}};
template <typename T>
class UnaryOpGpuKernel : public GpuKernel {
public:
@ -100,6 +104,14 @@ class UnaryOpGpuKernel : public GpuKernel {
Zeroslike(output_addr, output_size_ / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}
case UNARY_OP_ABS: {
Abs(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
break;
}
case UNARY_OP_FLOOR: {
Floor(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
break;
}
default: {
MS_LOG(EXCEPTION) << "Unary operation " << unary_op_type_ << " is not supported.";
}

View File

@ -34,15 +34,15 @@ MS_REG_GPU_KERNEL_ONE(FusedBatchNorm,
MS_REG_GPU_KERNEL_ONE(FusedBatchNorm,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
FusedBatchNormGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(BatchNorm,
KernelAttr()
@ -60,15 +60,15 @@ MS_REG_GPU_KERNEL_ONE(BatchNorm,
MS_REG_GPU_KERNEL_ONE(BatchNorm,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
FusedBatchNormGpuKernel, half)
} // namespace kernel
} // namespace mindspore
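These registrations keep the activations in float16 but move scale, bias, and the running statistics to float32, which matches how cuDNN derives the batch-norm parameter descriptor for a half-precision input. A small sketch that checks this with cuDNN's descriptor API, assuming cuDNN is installed and linked; the tensor dimensions are arbitrary.

#include <cstdio>
#include <cudnn.h>

int main() {
  cudnnTensorDescriptor_t x_desc, bn_desc;
  cudnnCreateTensorDescriptor(&x_desc);
  cudnnCreateTensorDescriptor(&bn_desc);

  // A float16 NCHW activation tensor, as the kernel would describe it for T = half.
  cudnnSetTensor4dDescriptor(x_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, 32, 64, 14, 14);

  // cuDNN derives the scale/bias/mean/variance descriptor from the activation descriptor.
  cudnnDeriveBNTensorDescriptor(bn_desc, x_desc, CUDNN_BATCHNORM_SPATIAL);

  cudnnDataType_t dtype;
  int n, c, h, w, ns, cs, hs, ws;
  cudnnGetTensor4dDescriptor(bn_desc, &dtype, &n, &c, &h, &w, &ns, &cs, &hs, &ws);
  printf("bn param dtype: %s (shape 1x%dx1x1)\n", dtype == CUDNN_DATA_FLOAT ? "float32" : "other", c);

  cudnnDestroyTensorDescriptor(bn_desc);
  cudnnDestroyTensorDescriptor(x_desc);
  return 0;
}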

View File

@ -56,17 +56,17 @@ class FusedBatchNormGpuKernel : public GpuKernel {
return true;
}
auto x = GetDeviceAddress<T>(inputs, 0);
auto scale = GetDeviceAddress<T>(inputs, 1);
auto bias = GetDeviceAddress<T>(inputs, 2);
auto runing_mean = GetDeviceAddress<T>(inputs, 3);
auto runnig_variance = GetDeviceAddress<T>(inputs, 4);
auto scale = GetDeviceAddress<float>(inputs, 1);
auto bias = GetDeviceAddress<float>(inputs, 2);
auto runing_mean = GetDeviceAddress<float>(inputs, 3);
auto runnig_variance = GetDeviceAddress<float>(inputs, 4);
auto y = GetDeviceAddress<T>(outputs, 0);
const float alpha = 1;
const float beta = 0;
if (is_train_) {
auto save_mean = GetDeviceAddress<T>(outputs, 3);
auto save_variance = GetDeviceAddress<T>(outputs, 4);
auto save_mean = GetDeviceAddress<float>(outputs, 3);
auto save_variance = GetDeviceAddress<float>(outputs, 4);
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnBatchNormalizationForwardTraining(handle_, mode_, &alpha, &beta, x_desc_, x, y_desc_, y,
scale_bias_mean_var_desc_, scale, bias, exp_avg_factor_, runing_mean,

View File

@ -33,12 +33,12 @@ MS_REG_GPU_KERNEL_ONE(FusedBatchNormGrad,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
FusedBatchNormGradGpuKernel, half)
} // namespace kernel
} // namespace mindspore

View File

@ -55,12 +55,12 @@ class FusedBatchNormGradGpuKernel : public GpuKernel {
}
auto dy = GetDeviceAddress<T>(inputs, 0);
auto x = GetDeviceAddress<T>(inputs, 1);
auto scale = GetDeviceAddress<T>(inputs, 2);
auto save_mean = GetDeviceAddress<T>(inputs, 3);
auto save_variance = GetDeviceAddress<T>(inputs, 4);
auto scale = GetDeviceAddress<float>(inputs, 2);
auto save_mean = GetDeviceAddress<float>(inputs, 3);
auto save_variance = GetDeviceAddress<float>(inputs, 4);
auto dx = GetDeviceAddress<T>(outputs, 0);
auto bn_scale = GetDeviceAddress<T>(outputs, 1);
auto bn_bias = GetDeviceAddress<T>(outputs, 2);
auto bn_scale = GetDeviceAddress<float>(outputs, 1);
auto bn_bias = GetDeviceAddress<float>(outputs, 2);
const float alpha_data_diff = 1;
const float beta_data_diff = 0;

View File

@ -18,32 +18,41 @@
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(ApplyMomentum,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
MomentumGpuKernel, float, float)
MS_REG_GPU_KERNEL_TWO(ApplyMomentum,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
MomentumGpuKernel, half, half)
MS_REG_GPU_KERNEL_TWO(ApplyMomentum,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat16),
MomentumGpuKernel, half, float)
MS_REG_GPU_KERNEL_THREE(ApplyMomentum,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
MomentumGpuKernel, float, float, float)
MS_REG_GPU_KERNEL_THREE(ApplyMomentum,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
MomentumGpuKernel, half, half, half)
MS_REG_GPU_KERNEL_THREE(ApplyMomentum,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat16),
MomentumGpuKernel, half, float, half)
MS_REG_GPU_KERNEL_THREE(ApplyMomentum,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
MomentumGpuKernel, float, float, half)
} // namespace kernel
} // namespace mindspore

View File

@ -23,7 +23,7 @@
#include "backend/kernel_compiler/gpu/cuda_impl/momentum_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T, typename S>
template <typename T, typename S, typename G>
class MomentumGpuKernel : public GpuKernel {
public:
MomentumGpuKernel()
@ -38,7 +38,7 @@ class MomentumGpuKernel : public GpuKernel {
T *variable = GetDeviceAddress<T>(inputs, 0);
T *accumulation = GetDeviceAddress<T>(inputs, 1);
S *learning_rate = GetDeviceAddress<S>(inputs, 2);
T *gradient = GetDeviceAddress<T>(inputs, 3);
G *gradient = GetDeviceAddress<G>(inputs, 3);
S *momentum = GetDeviceAddress<S>(inputs, 4);
MomentumUpdateVariable(inputs[0]->size / sizeof(T), variable, accumulation, learning_rate, gradient, momentum,
reinterpret_cast<cudaStream_t>(stream_ptr));
@ -54,7 +54,7 @@ class MomentumGpuKernel : public GpuKernel {
variable_size_ = sizeof(T);
accumulation_size_ = sizeof(T);
learning_rate_size_ = sizeof(S);
gradient_size_ = sizeof(T);
gradient_size_ = sizeof(G);
momentum_size_ = sizeof(S);
auto variable_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);

View File

@ -81,6 +81,7 @@ static std::map<string, string> tbe_func_adapter_map = {
{"sparse_apply_proximal_adagrad", "sparse_apply_proximal_adagrad_d"},
{"apply_add_sign", "apply_add_sign_d"},
{"apply_power_sign", "apply_power_sign_d"},
{"apply_centered_rms_prop", "apply_centered_rms_prop_d"},
{"transpose", "transpose_d"},
{"fill", "fill_d"},
{"unsorted_segment_sum", "unsorted_segment_sum_d"},

View File

@ -43,6 +43,7 @@ constexpr auto kJInputs = "inputs";
constexpr auto kJOutputs = "outputs";
constexpr auto kJAttrs = "attrs";
constexpr auto kJKernelName = "kernel_name";
constexpr auto kJFullName = "full_name";
constexpr auto kJOpInfo = "op_info";
constexpr auto kJDtype = "dtype";
constexpr auto kJtype = "type";
@ -125,6 +126,7 @@ bool TbeKernelJsonCreator::GenTbeSingleKernelJson(const std::shared_ptr<mindspor
op_info_json[kJKernelName] = json_name_;
}
(*kernel_json)[kJOpInfo] = op_info_json;
(*kernel_json)[kJFullName] = anf_node->fullname_with_scope();
if (creater_type_ == SINGLE_BUILD) {
TbeUtils::SaveJsonInfo(json_name_, json_info_);
}

View File

@ -97,6 +97,7 @@
#include "backend/optimizer/ascend/format_type/modify_ops_attrs.h"
#include "backend/optimizer/ascend/format_type/remove_no_use_reshape_op.h"
#include "backend/optimizer/ascend/ir_fusion/add_input_to_output.h"
#include "backend/optimizer/ascend/format_type/remove_internal_output.h"
#include "utils/context/ms_context.h"
#include "utils/config_manager.h"
#include "debug/anf_ir_dump.h"
@ -201,6 +202,7 @@ void AscendDataLayout(const std::shared_ptr<session::KernelGraph> &kernel_graph)
data_layout_pm->AddPass(std::make_shared<OptimizeDependence>());
data_layout_pm->AddPass(std::make_shared<TransDataSplit>());
data_layout_pm->AddPass(std::make_shared<EraseVisitAttr>());
data_layout_pm->AddPass(std::make_shared<RemoveInternalOutputTransOp>());
optimizer->AddPassManager(data_layout_pm);
(void)optimizer->Optimize(kernel_graph);
kernel_graph->SetExecOrderByDefault();
@ -222,6 +224,7 @@ void AscendMixPrecision(const std::shared_ptr<session::KernelGraph> &kernel_grap
mixed_precision_pm->AddPass(std::make_shared<LayerNormBetaGammaBackpropFusion>());
mixed_precision_pm->AddPass(std::make_shared<EraseVisitAttr>());
mixed_precision_pm->AddPass(std::make_shared<ConvertUnSupportNodeToAICPU>());
mixed_precision_pm->AddPass(std::make_shared<RemoveInternalOutputCast>());
optimizer->AddPassManager(mixed_precision_pm);
(void)optimizer->Optimize(kernel_graph);
kernel_graph->SetExecOrderByDefault();

View File

@ -142,6 +142,7 @@ AnfNodePtr InsertTransOpForMultipleOutput(const FuncGraphPtr &func_graph, const
MS_EXCEPTION_IF_NULL(node);
std::vector<AnfNodePtr> make_tuple_inputs;
make_tuple_inputs.push_back(NewValueNode(prim::kPrimMakeTuple));
auto kernel_graph = func_graph->cast<KernelGraphPtr>();
for (size_t output_idx = 0; output_idx < AnfAlgo::GetOutputTensorNum(node); ++output_idx) {
std::string output_format = AnfAlgo::GetOutputFormat(node, output_idx);
if (output_format == kOpFormat_NC1KHKWHWC0) {
@ -151,7 +152,11 @@ AnfNodePtr InsertTransOpForMultipleOutput(const FuncGraphPtr &func_graph, const
auto tuple_getitem = CreatTupleGetItemNode(func_graph, node, output_idx);
std::vector<size_t> origin_shape = AnfAlgo::GetOutputInferShape(node, output_idx);
if (kCommonFormatSet.find(output_format) == kCommonFormatSet.end() && origin_shape.size() > 1) {
make_tuple_inputs.emplace_back(AddTransOpNodeToGraph(func_graph, tuple_getitem, kernel_select, 0, false));
auto trans_op = AddTransOpNodeToGraph(func_graph, tuple_getitem, kernel_select, 0, false);
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) {
kernel_graph->ReplaceInternalOutput(node, trans_op, output_idx, 0);
}
make_tuple_inputs.emplace_back(trans_op);
} else {
// No need insert trans op.
make_tuple_inputs.push_back(tuple_getitem);
@ -249,9 +254,14 @@ AnfNodePtr InsertTransOpForOutput(const FuncGraphPtr &func_graph, const AnfNodeP
if (outputs_num == 0) {
return node;
}
auto kernel_graph = func_graph->cast<KernelGraphPtr>();
// Single output
if (outputs_num == 1 && (!AnfAlgo::IsTupleOutput(node))) {
return InsertTransOpForSingleOutput(func_graph, node, kernel_select);
auto new_node = InsertTransOpForSingleOutput(func_graph, node, kernel_select);
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) {
kernel_graph->ReplaceInternalOutput(node, new_node);
}
return new_node;
}
// Multiple output
return InsertTransOpForMultipleOutput(func_graph, node, kernel_select);

View File

@ -40,6 +40,7 @@ AnfNodePtr InsertCastForMultipleOutput(const FuncGraphPtr &func_graph, const CNo
std::vector<AnfNodePtr> make_tuple_inputs;
AbstractBasePtrList abstract_list;
make_tuple_inputs.push_back(NewValueNode(prim::kPrimMakeTuple));
auto kernel_graph = func_graph->cast<KernelGraphPtr>();
for (size_t output_idx = 0; output_idx < AnfAlgo::GetOutputTensorNum(cnode); ++output_idx) {
AnfNodePtr replace_node = nullptr;
const auto origin_shape = AnfAlgo::GetOutputInferShape(cnode, output_idx);
@ -64,6 +65,9 @@ AnfNodePtr InsertCastForMultipleOutput(const FuncGraphPtr &func_graph, const CNo
MS_EXCEPTION_IF_NULL(replace_node);
replace_node->set_scope(cnode->scope());
AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node);
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(cnode)) {
kernel_graph->ReplaceInternalOutput(cnode, replace_node, output_idx, 0);
}
} else {
replace_node = getitem;
}
@ -87,6 +91,7 @@ AnfNodePtr InsertCastForOutput(const FuncGraphPtr &func_graph, const CNodePtr &c
return cnode;
}
MS_EXCEPTION_IF_NULL(cnode->Type());
auto kernel_graph = func_graph->cast<KernelGraphPtr>();
// Single output
if (!cnode->Type()->isa<Tuple>()) {
if (!need_insert_cast[0]) {
@ -109,6 +114,9 @@ AnfNodePtr InsertCastForOutput(const FuncGraphPtr &func_graph, const CNodePtr &c
MS_EXCEPTION_IF_NULL(replace_node);
replace_node->set_scope(cnode->scope());
AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node);
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(cnode)) {
kernel_graph->ReplaceInternalOutput(cnode, replace_node);
}
}
return replace_node;
}
@ -188,6 +196,10 @@ const AnfNodePtr InsertCast::Process(const FuncGraphPtr &func_graph, const AnfNo
CNodePtr cnode = node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(cnode);
auto new_node = InsertCastForInput(func_graph, cnode);
auto kernel_graph = func_graph->cast<std::shared_ptr<session::KernelGraph>>();
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) {
kernel_graph->ReplaceInternalOutput(node, new_node);
}
// process output
return InsertCastForOutput(func_graph, new_node, std::vector<bool>(AnfAlgo::GetOutputTensorNum(new_node), true));
}

View File

@ -46,14 +46,13 @@ const AnfNodePtr InsertTransOp::Process(const FuncGraphPtr &func_graph, const An
if (node == nullptr || !AnfAlgo::IsRealKernel(node)) {
return nullptr;
}
AnfNodePtr front_node;
AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), node);
MS_LOG(DEBUG) << "process op: " << node->DebugString();
AnfNodePtr new_node = InsertTransOpForInput(func_graph, node, kernel_select_);
auto kernel_graph = func_graph->cast<std::shared_ptr<session::KernelGraph>>();
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) {
front_node = kernel_graph->GetFrontNodeByInternalOutput(node);
kernel_graph->ReplaceInternalOutput(node, new_node);
}
AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), node);
MS_LOG(DEBUG) << "====process op: " << node->DebugString();
AnfNodePtr new_node = InsertTransOpForInput(func_graph, node, kernel_select_);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
if (ms_context->execution_mode() == kPynativeMode && !ms_context->enable_pynative_hook()) {
@ -61,12 +60,7 @@ const AnfNodePtr InsertTransOp::Process(const FuncGraphPtr &func_graph, const An
return new_node;
}
}
auto final_node = InsertTransOpForOutput(func_graph, new_node, kernel_select_);
if (kernel_graph != nullptr && front_node != nullptr) {
auto old_node = kernel_graph->GetInternalOutputByFrontNode(front_node);
kernel_graph->ReplaceInternalOutput(old_node, final_node);
}
return final_node;
return InsertTransOpForOutput(func_graph, new_node, kernel_select_);
}
} // namespace opt
} // namespace mindspore

View File

@ -0,0 +1,83 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/ascend/format_type/remove_internal_output.h"
#include <memory>
#include "backend/session/anf_runtime_algorithm.h"
namespace mindspore {
namespace opt {
namespace {
bool UsedForOutputOnly(const FuncGraphPtr &func_graph, const AnfNodePtr &node) {
MS_EXCEPTION_IF_NULL(func_graph);
auto manager = func_graph->manager();
MS_EXCEPTION_IF_NULL(manager);
auto &node_users = manager->node_users();
auto iter = node_users.find(node);
if (iter == node_users.end()) {
return false;
}
const auto &node_set = iter->second;
for (const auto &node_index : node_set) {
if (!AnfAlgo::CheckPrimitiveType(node_index.first, prim::kPrimMakeTuple)) {
return false;
}
}
return true;
}
} // namespace
const BaseRef RemoveInternalOutputTransOp::DefinePattern() const {
VarPtr X = std::make_shared<Var>();
auto prim = std::make_shared<Primitive>(kTransDataOpName);
return VectorRef({prim, X});
}
const BaseRef RemoveInternalOutputCast::DefinePattern() const {
VarPtr X = std::make_shared<Var>();
return VectorRef({prim::kPrimCast, X});
}
const AnfNodePtr RemoveInternalOutput::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
const EquivPtr &) const {
MS_EXCEPTION_IF_NULL(func_graph);
MS_EXCEPTION_IF_NULL(node);
auto kernel_graph = func_graph->cast<KernelGraphPtr>();
if (kernel_graph == nullptr) {
return nullptr;
}
if (!kernel_graph->IsInternalOutput(node)) {
return nullptr;
}
if (!UsedForOutputOnly(func_graph, node)) {
return nullptr;
}
auto cnode = node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(cnode);
CheckCNodeInputSize(cnode, kTransOpInputNum);
auto input_node = cnode->input(1);
if (!AnfAlgo::CheckPrimitiveType(input_node, prim::kPrimTupleGetItem)) {
kernel_graph->ReplaceInternalOutput(node, input_node);
} else {
auto tuple_getitem = input_node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(tuple_getitem);
int idx = AnfAlgo::GetTupleGetItemOutIndex(tuple_getitem);
AnfNodePtr real_input_node = AnfAlgo::GetTupleGetItemRealInput(tuple_getitem);
kernel_graph->ReplaceInternalOutput(node, real_input_node, 0, idx);
}
return input_node;
}
} // namespace opt
} // namespace mindspore

View File

@ -0,0 +1,51 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_INTERNAL_OUTPUT_H_
#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_INTERNAL_OUTPUT_H_
#include <string>
#include "backend/optimizer/common/optimizer.h"
namespace mindspore {
namespace opt {
class RemoveInternalOutput : public PatternProcessPass {
public:
explicit RemoveInternalOutput(const std::string &name, bool multigraph = true)
: PatternProcessPass(name, multigraph) {}
~RemoveInternalOutput() override = default;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
};
class RemoveInternalOutputTransOp : public RemoveInternalOutput {
public:
explicit RemoveInternalOutputTransOp(bool multigraph = true)
: RemoveInternalOutput("remove_internal_output_trans_op", multigraph) {}
~RemoveInternalOutputTransOp() override = default;
const BaseRef DefinePattern() const override;
};
class RemoveInternalOutputCast : public RemoveInternalOutput {
public:
explicit RemoveInternalOutputCast(bool multigraph = true)
: RemoveInternalOutput("remove_internal_output_cast", multigraph) {}
~RemoveInternalOutputCast() override = default;
const BaseRef DefinePattern() const override;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_INTERNAL_OUTPUT_H_

View File

@ -13,8 +13,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_FUSION_H_
#define MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_FUSION_H_
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_FUSION_H_
#include <memory>
#include "backend/optimizer/common/optimizer.h"
@ -53,4 +53,4 @@ class AdamFusion : public PatternProcessPass {
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_FUSION_H_
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_FUSION_H_

View File

@ -13,8 +13,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_WEIGHT_DECAY_FUSION_H_
#define MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_WEIGHT_DECAY_FUSION_H_
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_WEIGHT_DECAY_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_WEIGHT_DECAY_FUSION_H_
#include <memory>
#include "backend/optimizer/common/optimizer.h"
@ -55,4 +55,4 @@ class AdamWeightDecayFusion : public PatternProcessPass {
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_WEIGHT_DECAY_FUSION_H_
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_WEIGHT_DECAY_FUSION_H_

View File

@ -0,0 +1,65 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/gpu/replace_addn_fusion.h"
#include <memory>
#include <vector>
#include <string>
#include "backend/session/anf_runtime_algorithm.h"
#include "ir/primitive.h"
#include "utils/utils.h"
#include "backend/optimizer/common/helper.h"
namespace mindspore {
namespace opt {
const BaseRef ReplaceAddNFusion::DefinePattern() const {
VectorRef addn = VectorRef({prim::kPrimAddN, A, B});
return addn;
}
const AnfNodePtr ReplaceAddNFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
const EquivPtr &equiv) const {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(node);
MS_EXCEPTION_IF_NULL(equiv);
auto A = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0);
auto B = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 1);
MS_EXCEPTION_IF_NULL(A);
MS_EXCEPTION_IF_NULL(B);
int num_input = AnfAlgo::GetNodeAttr<int>(node, "n");
if (num_input == 2) {
auto prim = std::make_shared<Primitive>(prim::kPrimTensorAdd->name());
MS_EXCEPTION_IF_NULL(prim);
std::vector<AnfNodePtr> inputs = {NewValueNode(prim), A, B};
auto add_new = graph->NewCNode(inputs);
std::vector<TypeId> outputs_type;
std::vector<std::vector<size_t>> outputs_shape;
outputs_type.push_back(AnfAlgo::GetOutputInferDataType(A, 0));
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(A, 0));
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, add_new.get());
auto manager = graph->manager();
MS_EXCEPTION_IF_NULL(manager);
manager->Replace(utils::cast<CNodePtr>(node), utils::cast<CNodePtr>(add_new));
return add_new;
} else {
return nullptr;
}
}
} // namespace opt
} // namespace mindspore

View File

@ -0,0 +1,40 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_ADDN_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_ADDN_FUSION_H_
#include <memory>
#include "backend/optimizer/common/optimizer.h"
namespace mindspore {
namespace opt {
class ReplaceAddNFusion : public PatternProcessPass {
public:
explicit ReplaceAddNFusion(bool multigraph = true) : PatternProcessPass("replace_addn", multigraph) {
A = std::make_shared<Var>();
B = std::make_shared<Var>();
}
~ReplaceAddNFusion() override = default;
const BaseRef DefinePattern() const override;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
private:
VarPtr A;
VarPtr B;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_ADDN_FUSION_H_

View File

@ -0,0 +1,92 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/gpu/replace_bn_cast_fusion.h"
#include <memory>
#include <vector>
#include <string>
#include "backend/session/anf_runtime_algorithm.h"
#include "ir/primitive.h"
#include "utils/utils.h"
#include "backend/optimizer/common/helper.h"
namespace mindspore {
namespace opt {
const BaseRef ReplaceBNCastFusion::DefinePattern() const {
VectorRef in_cast = VectorRef({prim::kPrimCast, x_});
VectorRef fbn2 = VectorRef({prim::kPrimFusedBatchNorm, in_cast, scale_, bias_, mean_, var_});
VectorRef tupleget = VectorRef({prim::kPrimTupleGetItem, fbn2, index_});
VectorRef out_cast = VectorRef({prim::kPrimCast, tupleget});
return out_cast;
}
const AnfNodePtr ReplaceBNCastFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
const EquivPtr &equiv) const {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(node);
MS_EXCEPTION_IF_NULL(equiv);
auto tuple = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0);
auto index_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 1);
MS_EXCEPTION_IF_NULL(index_node);
auto value_node = index_node->cast<ValueNodePtr>();
MS_EXCEPTION_IF_NULL(value_node);
int item_idx = GetValue<int>(value_node->value());
auto fbn2 = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 0);
auto x_after = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 0);
auto x_before = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(x_after), 0);
if (item_idx != 0) {
return nullptr;
}
auto scale = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 1);
auto bias = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 2);
auto mean = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 3);
auto var = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 4);
MS_EXCEPTION_IF_NULL(fbn2);
MS_EXCEPTION_IF_NULL(x_after);
MS_EXCEPTION_IF_NULL(x_before);
MS_EXCEPTION_IF_NULL(scale);
MS_EXCEPTION_IF_NULL(bias);
MS_EXCEPTION_IF_NULL(mean);
MS_EXCEPTION_IF_NULL(var);
auto manager = graph->manager();
MS_EXCEPTION_IF_NULL(manager);
manager->Replace(utils::cast<CNodePtr>(x_after), utils::cast<CNodePtr>(x_before));
manager->Replace(utils::cast<CNodePtr>(node), utils::cast<CNodePtr>(tuple));
std::vector<TypeId> outputs_type;
std::vector<std::vector<size_t>> outputs_shape;
auto output_num = AnfAlgo::GetOutputTensorNum(fbn2);
for (size_t i = 0; i < output_num; i++) {
outputs_type.push_back(AnfAlgo::GetOutputInferDataType(fbn2, i));
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(fbn2, i));
}
outputs_type[0] = kNumberTypeFloat16;
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fbn2.get());
outputs_type.clear();
outputs_shape.clear();
outputs_type.push_back(kNumberTypeFloat16);
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(tuple, 0));
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, tuple.get());
return tuple;
}
} // namespace opt
} // namespace mindspore
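A note on what ReplaceBNCastFusion does: it drops the float16-to-float32 Cast in front of FusedBatchNorm and the float32-to-float16 Cast behind its first output, then retags that output as float16 so the batch norm runs directly on half-precision data. The following standalone sketch replays the same rewrite on a toy node structure; the Node type, dtype tags, and rewrite helper are illustrative assumptions, not MindSpore IR.
#include <iostream>
#include <memory>
#include <string>
#include <vector>
// Toy stand-ins for the real IR; names and fields are assumptions for illustration only.
enum class DType { kFloat16, kFloat32 };
struct Node {
  std::string op;
  DType out_dtype;
  std::vector<std::shared_ptr<Node>> inputs;
};
using NodePtr = std::shared_ptr<Node>;
NodePtr MakeNode(const std::string &op, DType dtype, std::vector<NodePtr> inputs = {}) {
  return std::make_shared<Node>(Node{op, dtype, std::move(inputs)});
}
// Mimics the pass: given Cast(GetItem0(FusedBatchNorm(Cast(x), ...))), bypass both casts
// and retype the batch-norm output to float16.
NodePtr RemoveBatchNormCasts(const NodePtr &out_cast) {
  NodePtr getitem = out_cast->inputs[0];
  NodePtr bn = getitem->inputs[0];
  NodePtr in_cast = bn->inputs[0];
  bn->inputs[0] = in_cast->inputs[0];  // feed the fp16 tensor straight into the batch norm
  bn->out_dtype = DType::kFloat16;     // first output now stays in fp16
  getitem->out_dtype = DType::kFloat16;
  return getitem;                      // users of the outer cast now read the tuple item directly
}
int main() {
  NodePtr x = MakeNode("x", DType::kFloat16);
  NodePtr in_cast = MakeNode("Cast", DType::kFloat32, {x});
  NodePtr bn = MakeNode("FusedBatchNorm", DType::kFloat32, {in_cast});
  NodePtr getitem = MakeNode("TupleGetItem", DType::kFloat32, {bn});
  NodePtr out_cast = MakeNode("Cast", DType::kFloat16, {getitem});
  NodePtr new_out = RemoveBatchNormCasts(out_cast);
  std::cout << "new graph output: " << new_out->op << ", bn input is now: "
            << bn->inputs[0]->op << std::endl;  // prints: TupleGetItem, x
  return 0;
}
The real pass additionally rewires users through the graph manager and recomputes the inferred types and shapes, as the diff above shows.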

View File

@ -0,0 +1,58 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_CAST_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_CAST_FUSION_H_
#include <memory>
#include "backend/optimizer/common/optimizer.h"
namespace mindspore {
namespace opt {
class ReplaceBNCastFusion : public PatternProcessPass {
public:
explicit ReplaceBNCastFusion(bool multigraph = true) : PatternProcessPass("replace_bn_cast", multigraph) {
x_ = std::make_shared<Var>();
scale_ = std::make_shared<Var>();
bias_ = std::make_shared<Var>();
mean_ = std::make_shared<Var>();
var_ = std::make_shared<Var>();
y_ = std::make_shared<Var>();
running_mean_ = std::make_shared<Var>();
running_var_ = std::make_shared<Var>();
save_mean_ = std::make_shared<Var>();
save_var_ = std::make_shared<Var>();
index_ = std::make_shared<Var>();
}
~ReplaceBNCastFusion() override = default;
const BaseRef DefinePattern() const override;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
private:
VarPtr x_;
VarPtr scale_;
VarPtr bias_;
VarPtr mean_;
VarPtr var_;
VarPtr y_;
VarPtr running_mean_;
VarPtr running_var_;
VarPtr save_mean_;
VarPtr save_var_;
VarPtr index_;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_CAST_FUSION_H_

View File

@ -0,0 +1,88 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/gpu/replace_bn_grad_cast2_fusion.h"
#include <memory>
#include <vector>
#include <string>
#include "backend/session/anf_runtime_algorithm.h"
#include "ir/primitive.h"
#include "utils/utils.h"
#include "backend/optimizer/common/helper.h"
namespace mindspore {
namespace opt {
const BaseRef ReplaceBNGradCast2Fusion::DefinePattern() const {
VectorRef fbn2g = VectorRef({prim::kPrimFusedBatchNormGrad, dy_, x_, scale_, mean_, var_});
VectorRef tupleget = VectorRef({prim::kPrimTupleGetItem, fbn2g, index_});
VectorRef out_cast = VectorRef({prim::kPrimCast, tupleget});
return out_cast;
}
const AnfNodePtr ReplaceBNGradCast2Fusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
const EquivPtr &equiv) const {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(node);
MS_EXCEPTION_IF_NULL(equiv);
auto tuple = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0);
auto index_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 1);
MS_EXCEPTION_IF_NULL(index_node);
auto value_node = index_node->cast<ValueNodePtr>();
MS_EXCEPTION_IF_NULL(value_node);
int item_idx = GetValue<int>(value_node->value());
if (item_idx != 0) {
return nullptr;
}
auto fbn2g = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 0);
auto dy_ = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 0);
auto x_ = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 1);
auto scale = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 2);
auto mean = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 3);
auto var = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 4);
MS_EXCEPTION_IF_NULL(fbn2g);
MS_EXCEPTION_IF_NULL(dy_);
MS_EXCEPTION_IF_NULL(scale);
MS_EXCEPTION_IF_NULL(x_);
MS_EXCEPTION_IF_NULL(mean);
MS_EXCEPTION_IF_NULL(var);
auto manager = graph->manager();
MS_EXCEPTION_IF_NULL(manager);
manager->Replace(utils::cast<CNodePtr>(node), utils::cast<CNodePtr>(tuple));
std::vector<TypeId> outputs_type;
std::vector<std::vector<size_t>> outputs_shape;
auto output_num = AnfAlgo::GetOutputTensorNum(fbn2g);
for (size_t i = 0; i < output_num; i++) {
outputs_type.push_back(AnfAlgo::GetOutputInferDataType(fbn2g, i));
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(fbn2g, i));
}
outputs_type[0] = AnfAlgo::GetPrevNodeOutputInferDataType(fbn2g, 0);
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fbn2g.get());
outputs_type.clear();
outputs_shape.clear();
outputs_type.push_back(AnfAlgo::GetPrevNodeOutputInferDataType(fbn2g, 0));
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(tuple, 0));
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, tuple.get());
return tuple;
}
} // namespace opt
} // namespace mindspore

View File

@ -0,0 +1,54 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_
#include <memory>
#include "backend/optimizer/common/optimizer.h"
namespace mindspore {
namespace opt {
class ReplaceBNGradCast2Fusion : public PatternProcessPass {
public:
explicit ReplaceBNGradCast2Fusion(bool multigraph = true) : PatternProcessPass("replace_grad_cast2", multigraph) {
dy_ = std::make_shared<Var>();
x_ = std::make_shared<Var>();
scale_ = std::make_shared<Var>();
mean_ = std::make_shared<Var>();
var_ = std::make_shared<Var>();
dx_ = std::make_shared<Var>();
bn_scale_ = std::make_shared<Var>();
bn_bias_ = std::make_shared<Var>();
index_ = std::make_shared<Var>();
}
~ReplaceBNGradCast2Fusion() override = default;
const BaseRef DefinePattern() const override;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
private:
VarPtr dy_;
VarPtr x_;
VarPtr scale_;
VarPtr mean_;
VarPtr var_;
VarPtr dx_;
VarPtr bn_scale_;
VarPtr bn_bias_;
VarPtr index_;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_

View File

@ -0,0 +1,91 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/gpu/replace_bn_grad_cast_fusion.h"
#include <memory>
#include <vector>
#include <string>
#include "backend/session/anf_runtime_algorithm.h"
#include "ir/primitive.h"
#include "utils/utils.h"
#include "backend/optimizer/common/helper.h"
namespace mindspore {
namespace opt {
const BaseRef ReplaceBNGradCastFusion::DefinePattern() const {
VectorRef dy_cast = VectorRef({prim::kPrimCast, dy_});
VectorRef fbn2g = VectorRef({prim::kPrimFusedBatchNormGrad, dy_cast, x_, scale_, mean_, var_});
VectorRef tupleget = VectorRef({prim::kPrimTupleGetItem, fbn2g, index_});
VectorRef out_cast = VectorRef({prim::kPrimCast, tupleget});
return out_cast;
}
const AnfNodePtr ReplaceBNGradCastFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
const EquivPtr &equiv) const {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(node);
MS_EXCEPTION_IF_NULL(equiv);
auto tuple = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0);
auto index_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 1);
MS_EXCEPTION_IF_NULL(index_node);
auto value_node = index_node->cast<ValueNodePtr>();
MS_EXCEPTION_IF_NULL(value_node);
int item_idx = GetValue<int>(value_node->value());
if (item_idx != 0) {
return nullptr;
}
auto fbn2g = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 0);
auto dy_after = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 0);
auto dy_before = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(dy_after), 0);
auto x_ = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 1);
auto scale = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 2);
auto mean = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 3);
auto var = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 4);
MS_EXCEPTION_IF_NULL(fbn2g);
MS_EXCEPTION_IF_NULL(dy_after);
MS_EXCEPTION_IF_NULL(dy_before);
MS_EXCEPTION_IF_NULL(scale);
MS_EXCEPTION_IF_NULL(x_);
MS_EXCEPTION_IF_NULL(mean);
MS_EXCEPTION_IF_NULL(var);
auto manager = graph->manager();
MS_EXCEPTION_IF_NULL(manager);
manager->Replace(utils::cast<CNodePtr>(dy_after), utils::cast<CNodePtr>(dy_before));
manager->Replace(utils::cast<CNodePtr>(node), utils::cast<CNodePtr>(tuple));
std::vector<TypeId> outputs_type;
std::vector<std::vector<size_t>> outputs_shape;
auto output_num = AnfAlgo::GetOutputTensorNum(fbn2g);
for (size_t i = 0; i < output_num; i++) {
outputs_type.push_back(AnfAlgo::GetOutputInferDataType(fbn2g, i));
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(fbn2g, i));
}
outputs_type[0] = kNumberTypeFloat16;
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fbn2g.get());
outputs_type.clear();
outputs_shape.clear();
outputs_type.push_back(kNumberTypeFloat16);
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(tuple, 0));
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, tuple.get());
return tuple;
}
} // namespace opt
} // namespace mindspore

View File

@ -0,0 +1,54 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST_FUSION_H_
#include <memory>
#include "backend/optimizer/common/optimizer.h"
namespace mindspore {
namespace opt {
class ReplaceBNGradCastFusion : public PatternProcessPass {
public:
explicit ReplaceBNGradCastFusion(bool multigraph = true) : PatternProcessPass("replace_bn_grad_cast", multigraph) {
dy_ = std::make_shared<Var>();
x_ = std::make_shared<Var>();
scale_ = std::make_shared<Var>();
mean_ = std::make_shared<Var>();
var_ = std::make_shared<Var>();
dx_ = std::make_shared<Var>();
bn_scale_ = std::make_shared<Var>();
bn_bias_ = std::make_shared<Var>();
index_ = std::make_shared<Var>();
}
~ReplaceBNGradCastFusion() override = default;
const BaseRef DefinePattern() const override;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
private:
VarPtr dy_;
VarPtr x_;
VarPtr scale_;
VarPtr mean_;
VarPtr var_;
VarPtr dx_;
VarPtr bn_scale_;
VarPtr bn_bias_;
VarPtr index_;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST_FUSION_H_

View File

@ -0,0 +1,63 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/gpu/replace_momentum_cast_fusion.h"
#include <memory>
#include <vector>
#include <string>
#include "backend/session/anf_runtime_algorithm.h"
#include "ir/primitive.h"
#include "utils/utils.h"
#include "backend/optimizer/common/helper.h"
namespace mindspore {
namespace opt {
const BaseRef ReplaceMomentumCastFusion::DefinePattern() const {
VectorRef grad_cast = VectorRef({prim::kPrimCast, grad_});
VectorRef momentum = VectorRef({prim::kPrimApplyMomentum, var_, acc_, lr_, grad_cast, mom_});
return momentum;
}
const AnfNodePtr ReplaceMomentumCastFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
const EquivPtr &equiv) const {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(node);
MS_EXCEPTION_IF_NULL(equiv);
auto grad_cast = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 3);
auto grad = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(grad_cast), 0);
MS_EXCEPTION_IF_NULL(grad_cast);
MS_EXCEPTION_IF_NULL(grad);
auto manager = graph->manager();
MS_EXCEPTION_IF_NULL(manager);
manager->Replace(utils::cast<CNodePtr>(grad_cast), utils::cast<CNodePtr>(grad));
std::vector<TypeId> outputs_type;
std::vector<std::vector<size_t>> outputs_shape;
auto output_num = AnfAlgo::GetOutputTensorNum(node);
for (size_t i = 0; i < output_num; i++) {
outputs_type.push_back(AnfAlgo::GetOutputInferDataType(node, i));
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(node, i));
}
outputs_type[3] = AnfAlgo::GetPrevNodeOutputInferDataType(grad_cast, 0);
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, node.get());
return node;
}
} // namespace opt
} // namespace mindspore

View File

@ -0,0 +1,46 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_MOMENTUM_CAST_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_MOMENTUM_CAST_FUSION_H_
#include <memory>
#include "backend/optimizer/common/optimizer.h"
namespace mindspore {
namespace opt {
class ReplaceMomentumCastFusion : public PatternProcessPass {
public:
explicit ReplaceMomentumCastFusion(bool multigraph = true) : PatternProcessPass("replace_momentum_cast", multigraph) {
var_ = std::make_shared<Var>();
acc_ = std::make_shared<Var>();
lr_ = std::make_shared<Var>();
grad_ = std::make_shared<Var>();
mom_ = std::make_shared<Var>();
}
~ReplaceMomentumCastFusion() override = default;
const BaseRef DefinePattern() const override;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
private:
VarPtr var_;
VarPtr acc_;
VarPtr lr_;
VarPtr grad_;
VarPtr mom_;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_MOMENTUM_CAST_FUSION_H_

View File

@ -25,7 +25,8 @@
namespace mindspore {
namespace memreuse {
enum RefCountType { kDynamicRefCount, kStaticRefCount };
enum NodeType { NORMAL, SPECIAL };
enum NodeType { COMMON_NODE, COMMUNICATION_NODE };
enum KernelRefType { COMMON, REFNODE_OUTPUT, COMM_NOTREUSE, COMM_REUSE, SUMMARY };
static constexpr int kInitIndex = -1;
class KernelRefCount {
public:
@ -36,6 +37,7 @@ class KernelRefCount {
size_t offset_;
size_t size_;
int index_;
KernelRefType type_;
// remember to reset offset
KernelRefCount()
: stream_id_(0),
@ -44,6 +46,7 @@ class KernelRefCount {
offset_(0),
size_(0),
index_(kInitIndex),
type_(COMMON),
reftype_(kStaticRefCount) {}
~KernelRefCount() = default;
void SetKernelRefCountInfo(int index, size_t size, RefCountType reftype);
@ -65,7 +68,7 @@ class KernelDef {
KernelMap inputs_;
KernelMap outputs_;
KernelMap wk_space_;
NodeType dirty = NORMAL;
NodeType type_ = COMMON_NODE;
KernelDef() = default;
~KernelDef() = default;
void set_input_refs(const KernelRefCountPtrList &kernelRefPtrList) { input_refs_ = kernelRefPtrList; }

View File

@ -46,6 +46,8 @@ bool MemReuseUtil::InitDynamicOutputKernelRef() {
if (iter == kernel_output_refs_.end()) {
auto output_sizes = kernel_mod->GetOutputSizeList();
KernelRefCountPtrList kernel_refs;
bool is_comm_op = AnfAlgo::IsCommunicationOp(kernel_cnode);
size_t output_index = 0;
for (auto size : output_sizes) {
total_dy_size_ += size;
// do not MallocDynamicMem just record this
@ -54,9 +56,20 @@ bool MemReuseUtil::InitDynamicOutputKernelRef() {
auto curr_stream_id = AnfAlgo::GetStreamId(kernel_cnode);
kernel_ref->stream_id_ = curr_stream_id;
kernel_ref->SetKernelRefCountInfo(index, size, kDynamicRefCount);
if (is_comm_op) {
kernel_ref->type_ = COMM_REUSE;
} else {
session::AnfWithOutIndex out_pair(kernel_cnode, output_index);
if (graph_->IsInRefOutputMap(out_pair)) {
kernel_ref->type_ = REFNODE_OUTPUT;
} else {
kernel_ref->type_ = COMMON;
}
}
kernel_refs.push_back(kernel_ref);
kernel_out_ref_num++;
total_refs_list_.push_back(kernel_ref);
output_index++;
}
if (!kernel_refs.empty()) {
kernel_output_refs_[key] = kernel_refs;
@ -155,9 +168,19 @@ void MemReuseUtil::SetInputMap(const CNodePtr &kernel, KernelDef *kernel_def_ptr
MS_EXCEPTION_IF_NULL(kernel);
MS_EXCEPTION_IF_NULL(kernel_def_ptr);
auto key = kernel.get();
for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
bool is_comm_op = AnfAlgo::IsCommunicationOp(kernel);
size_t input_tensor_num = AnfAlgo::GetInputTensorNum(kernel);
for (size_t i = 0; i < input_tensor_num; ++i) {
auto ref_ptr = GetKernelInputRef(kernel, i);
if (ref_ptr != nullptr) {
if (is_comm_op) {
if (input_tensor_num == 1) {
ref_ptr->type_ = COMM_REUSE;
} else {
ref_ptr->type_ = COMM_NOTREUSE;
}
}
if (ref_ptr->reftype() == kStaticRefCount) {
continue;
} else if (ref_ptr->reftype() == kDynamicRefCount) {
@ -258,6 +281,11 @@ void MemReuseUtil::SetKernelDefMap() {
auto key = kernel.get();
kernel_def_ptr->set_input_refs(kernel_def_ptr->inputs_[key]);
kernel_def_ptr->set_output_refs(kernel_def_ptr->outputs_[key]);
if (AnfAlgo::IsCommunicationOp(kernel)) {
kernel_def_ptr->type_ = COMMUNICATION_NODE;
} else {
kernel_def_ptr->type_ = COMMON_NODE;
}
kernel_def_ptr_list_.push_back(kernel_def_ptr);
kernel_map_[key] = kernel_def_ptr;
}
@ -337,6 +365,7 @@ void MemReuseUtil::SetSummaryNodesRefCount() {
KernelRefCountPtr kernel_ref = kernel_output_refs_[node.get()][index];
kernel_ref->ref_count_ = kMaxRefCount;
kernel_ref->ref_count_dynamic_use_ = kMaxRefCount;
kernel_ref->type_ = SUMMARY;
total_summary_size += kernel_ref->size_;
MS_LOG(INFO) << "Set summary node's ref count, node: " << node->fullname_with_scope() << " index: " << index;
} else {
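The classification rules spread across the hunks above can be summarized: communication-op outputs are tagged COMM_REUSE, ref-node outputs REFNODE_OUTPUT, summary outputs SUMMARY, and a communication op's inputs are COMM_REUSE only when there is exactly one input (otherwise COMM_NOTREUSE). The snippet below is a condensed, self-contained restatement of that decision table; the KernelInfo struct and its boolean flags are assumptions standing in for the real AnfAlgo and kernel-graph queries.
#include <cstdio>
// Mirrors KernelRefType in the memory-reuse header above.
enum KernelRefType { COMMON, REFNODE_OUTPUT, COMM_NOTREUSE, COMM_REUSE, SUMMARY };
// Simplified view of a kernel; the flags stand in for AnfAlgo::IsCommunicationOp,
// graph->IsInRefOutputMap and the summary-node bookkeeping.
struct KernelInfo {
  bool is_comm_op;
  bool output_is_ref;
  bool output_is_summary;
  int input_num;
};
KernelRefType ClassifyOutput(const KernelInfo &k) {
  if (k.output_is_summary) return SUMMARY;     // summary tensors are pinned, never reused
  if (k.is_comm_op) return COMM_REUSE;         // communication outputs are reused with align borders
  if (k.output_is_ref) return REFNODE_OUTPUT;  // ref-node outputs keep their original memory
  return COMMON;
}
KernelRefType ClassifyInput(const KernelInfo &k, KernelRefType producer_type) {
  if (!k.is_comm_op) return producer_type;     // non-communication consumers keep the producer's tag
  return (k.input_num == 1) ? COMM_REUSE : COMM_NOTREUSE;
}
int main() {
  KernelInfo all_reduce{true, false, false, 4};
  std::printf("AllReduce output type: %d, input type: %d\n",
              ClassifyOutput(all_reduce), ClassifyInput(all_reduce, COMMON));
  return 0;
}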

View File

@ -83,6 +83,7 @@ class MemReuseUtil {
void set_mem_base(uint8_t *mem_base) { mem_base_ = mem_base; }
uint8_t *GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const;
uint8_t *GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const;
bool is_all_nop_node() const { return is_all_nop_node_; }
private:
int util_index_;

View File

@ -33,11 +33,11 @@ void BestFitMemReuse::InitMemReuseInfo(const MemReuseUtil *mem_reuse_util_ptr) {
set_op_ptr_list(mem_reuse_util_ptr->kernel_def_ptr_list());
// check info Correctness
for (auto &tensor : tensor_ptr_list_) {
tensor->size_ = AlignMemorySize(tensor->size_);
tensor->size_ = AlignCommonMemorySize(tensor->size_);
}
// align wk size to 512 && refcount == 1
for (auto &wk : wk_tensor_list_) {
wk->size_ = AlignMemorySize(wk->size_);
wk->size_ = AlignCommonMemorySize(wk->size_);
wk->ref_count_ = 1;
}
#ifdef ENABLE_D
@ -135,11 +135,23 @@ bool BestFitMemReuse::IsUsable(const KernelDefPtr &kernel_curr, const MembufPtr
return false;
}
void BestFitMemReuse::AssignNodeOutputOffset() {
void BestFitMemReuse::AssignCommonNodeOutputOffset() {
MS_EXCEPTION_IF_NULL(current_kernel_);
for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) {
size_t index = GetTensorIndex(tensor_idx);
auto tensor_desc = tensor_ptr_list_[index];
MS_EXCEPTION_IF_NULL(tensor_desc);
if (tensor_desc->type_ == REFNODE_OUTPUT) {
total_refoutput_size += tensor_desc->size_;
continue;
} else if (tensor_desc->type_ == COMM_NOTREUSE) {
total_comm_not_reuse_size += tensor_desc->size_;
} else if (tensor_desc->type_ == COMM_REUSE) {
// get align size for communication op's single input
tensor_desc->size_ = AlignCommunicationMemorySize(tensor_desc->size_);
total_comm_reuse_size += tensor_desc->size_;
}
auto reusable_membuf_map = GetReusableMembufMap(tensor_desc->size_);
if (!reusable_membuf_map.empty()) {
auto membuf_index = reusable_membuf_map.begin()->second;
@ -152,6 +164,93 @@ void BestFitMemReuse::AssignNodeOutputOffset() {
MemReuseChecker::GetInstance().IsAddNewMembuf_ = true;
#endif
}
// skip the left align border so the communication op's single input points at the data region
if (tensor_desc->type_ == COMM_REUSE) {
tensor_desc->offset_ += kDefaultMemAlignSize;
}
}
}
void BestFitMemReuse::AssignCommunicationNodeOutputOffset() {
size_t total_kernel_output_size = 0;
size_t output_num = 0;
// get all output size
MS_EXCEPTION_IF_NULL(current_kernel_);
for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) {
size_t index = GetTensorIndex(tensor_idx);
auto tensor_desc = tensor_ptr_list_[index];
MS_EXCEPTION_IF_NULL(tensor_desc);
if (tensor_desc->type_ == COMM_REUSE) {
total_comm_reuse_size += tensor_desc->size_;
total_comm_output_reuse_size += tensor_desc->size_;
total_kernel_output_size += tensor_desc->size_;
} else {
MS_LOG(ERROR) << "All communication op's outputs should be memory reuse, Kernel:"
<< current_kernel_->scope_full_name();
continue;
}
}
total_kernel_output_size = AlignCommunicationMemorySize(total_kernel_output_size);
// add the left align border to the first output and the right align border to the last output so that the
// border memory is allocated together with the outputs
size_t output_index = 0;
auto output_ref_indexes = current_kernel_->GetOutputRefIndexs();
output_num = output_ref_indexes.size();
for (auto &tensor_idx : output_ref_indexes) {
size_t index = GetTensorIndex(tensor_idx);
auto tensor_desc = tensor_ptr_list_[index];
MS_EXCEPTION_IF_NULL(tensor_desc);
if (output_index == 0 || output_index == output_num - 1) {
tensor_desc->size_ += kDefaultMemAlignSize;
}
if ((output_index == 0) && (output_ref_indexes.size() == 1)) {
// add right align border for single output
tensor_desc->size_ += kDefaultMemAlignSize;
}
output_index++;
}
auto reusable_membuf_map = GetReusableMembufMap(total_kernel_output_size);
if (!reusable_membuf_map.empty()) {
auto membuf_index = reusable_membuf_map.begin()->second;
output_index = 0;
for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) {
size_t index = GetTensorIndex(tensor_idx);
auto tensor_desc = tensor_ptr_list_[index];
MS_EXCEPTION_IF_NULL(tensor_desc);
ReuseExistMembuf(tensor_desc.get(), membuf_index + output_index, kDynamicMem);
// skip the left align border so the communication op's first output points at the data region
if (output_index == 0) {
tensor_desc->offset_ += kDefaultMemAlignSize;
}
output_index++;
}
} else {
// no membuf can be reused, add a new membuf at the end of membuf_ptr_list
output_index = 0;
for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) {
size_t index = GetTensorIndex(tensor_idx);
auto tensor_desc = tensor_ptr_list_[index];
MS_EXCEPTION_IF_NULL(tensor_desc);
AddNewMembufPtr(tensor_desc.get(), kDynamicMem);
// skip the left align border so the first output points at the data region
if (output_index == 0) {
tensor_desc->offset_ += kDefaultMemAlignSize;
}
output_index++;
#ifdef MEM_REUSE_DEBUG
MemReuseChecker::GetInstance().IsAddNewMembuf_ = true;
#endif
}
}
}
void BestFitMemReuse::AssignNodeOutputOffset() {
if (current_kernel_->type_ == COMMUNICATION_NODE) {
AssignCommunicationNodeOutputOffset();
} else {
AssignCommonNodeOutputOffset();
}
}
@ -319,11 +418,17 @@ void BestFitMemReuse::ReleaseMembuf(size_t tensor_index, int flag) {
}
}
size_t BestFitMemReuse::AlignMemorySize(size_t size) const {
size_t BestFitMemReuse::AlignCommonMemorySize(size_t size) const {
// memory size 512 align
return (size + kDefaultMemAlignSize + kAttAlignSize) / kDefaultMemAlignSize * kDefaultMemAlignSize;
}
size_t BestFitMemReuse::AlignCommunicationMemorySize(size_t size) const {
// memory size 512 align and add communication memory: left align border memory - data - right align border memory
return kDefaultMemAlignSize + (size + kDefaultMemAlignSize - 1) / kDefaultMemAlignSize * kDefaultMemAlignSize +
kDefaultMemAlignSize;
}
size_t BestFitMemReuse::GetAllocatedSize() {
size_t AllocatedSize = kTotalSize;
if (membuf_ptr_list_.empty()) {
@ -412,6 +517,9 @@ void BestFitMemReuse::Reuse(const MemReuseUtil *mem_reuse_util_ptr) {
++op_num;
#endif
}
MS_LOG(INFO) << "Special Tensor total size: RefOutput: " << total_refoutput_size
<< " CommReuse: " << total_comm_reuse_size << " CommOutputReuse: " << total_comm_output_reuse_size
<< " CommNotReuse: " << total_comm_not_reuse_size;
#ifdef MEM_REUSE_DEBUG
MemReuseChecker::GetInstance().ExportMembufInfoIR();
MemReuseChecker::GetInstance().ExportAddNewMmebufIR();
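A quick worked example of the two alignment helpers above may help. Assuming kDefaultMemAlignSize is 512 and kAttAlignSize is 31 (treat these concrete values as assumptions here), AlignCommonMemorySize pads a size up past one extra 512-byte block, while AlignCommunicationMemorySize rounds the payload up to 512 bytes and then adds a 512-byte border on each side, which is why the offsets above are shifted by kDefaultMemAlignSize before use. A minimal sketch:
#include <cstddef>
#include <cstdio>
// Assumed constants; in the allocator they come from the memory-reuse headers.
constexpr size_t kDefaultMemAlignSize = 512;
constexpr size_t kAttAlignSize = 31;
size_t AlignCommonMemorySize(size_t size) {
  // same formula as BestFitMemReuse::AlignCommonMemorySize
  return (size + kDefaultMemAlignSize + kAttAlignSize) / kDefaultMemAlignSize * kDefaultMemAlignSize;
}
size_t AlignCommunicationMemorySize(size_t size) {
  // left border + 512-aligned payload + right border
  return kDefaultMemAlignSize +
         (size + kDefaultMemAlignSize - 1) / kDefaultMemAlignSize * kDefaultMemAlignSize +
         kDefaultMemAlignSize;
}
int main() {
  std::printf("common 1000 -> %zu\n", AlignCommonMemorySize(1000));         // 1536
  std::printf("common 1024 -> %zu\n", AlignCommonMemorySize(1024));         // 1536
  std::printf("comm   1000 -> %zu\n", AlignCommunicationMemorySize(1000));  // 512 + 1024 + 512 = 2048
  std::printf("comm   1024 -> %zu\n", AlignCommunicationMemorySize(1024));  // 512 + 1024 + 512 = 2048
  return 0;
}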

View File

@ -74,6 +74,14 @@ class BestFitMemReuse {
* Assign output tensor memory offset of current kernel
*/
void AssignNodeOutputOffset();
/**
* Assign output tensor memory offset of common kernel
*/
void AssignCommonNodeOutputOffset();
/**
* Assign output tensor memory offset of communication kernel
*/
void AssignCommunicationNodeOutputOffset();
/**
* Update input tensor's status of current kernel, and the status of membuf used by current kernel
*/
@ -110,8 +118,10 @@ class BestFitMemReuse {
void AddNewMembufPtr(KernelRefCount *tensor_desc, int flag);
// Merge unused membuf
void ReleaseMembuf(size_t tensor_index, int flag);
// Memory address alignment 512
size_t AlignMemorySize(size_t size) const;
// Memory address alignment for common memory
size_t AlignCommonMemorySize(size_t size) const;
// Memory address alignment for communication used memory
size_t AlignCommunicationMemorySize(size_t size) const;
int GetRealIndex(size_t index, int flag = kDynamicMem) const;
size_t GetTensorIndex(int index) const;
size_t GetWorkspaceIndex(int index) const;
@ -153,6 +163,10 @@ class BestFitMemReuse {
// kernel_front_map_, key: the kernel_def, value: kernels before this kernel_def
std::map<KernelDefPtr, std::set<KernelDefPtr>> kernel_front_map_;
std::vector<std::vector<uint32_t>> stream_groups_;
size_t total_refoutput_size{0};
size_t total_comm_reuse_size{0};
size_t total_comm_output_reuse_size{0};
size_t total_comm_not_reuse_size{0};
};
} // namespace memreuse
} // namespace mindspore

View File

@ -170,12 +170,14 @@ void MemReuseChecker::CheckMemReuseIR(const KernelRefCountPtrList &total_refs_li
ofs << "all_tensor_refs:\n";
ofs << "index:"
<< "\tsize:"
<< "\trefcount:\n";
<< "\trefcount:"
<< "\ttype:\n";
for (auto &ref : total_refs_list) {
ofs << "%" << ref->index_ << "T"
<< "\t"
<< "#" << ref->size_ << "S"
<< "\t" << ref->ref_count_ << "C"
<< "\t" << ref->type_ << "t"
<< "\n";
}
ofs << "kernel_def exc_order:\n";
@ -241,7 +243,7 @@ bool MemReuseChecker::CheckGraphOutputAssigned(const session::KernelGraph *graph
void MemReuseChecker::ExportMemOpIr(const KernelDef *def, std::ofstream &ofs, int def_idx) {
auto scope_name = def->scope_full_name();
std::string split_name = GetSplitName(scope_name);
ofs << "$" << def_idx << "\t" << split_name << "\t";
ofs << "$" << def_idx << "\t" << split_name << "\t" << static_cast<int>(def->type_) << "\t";
ofs << "inputs[";
for (auto &in : def->inputs_) {
for (auto &in_ref : in.second) {

View File

@ -100,7 +100,10 @@ bool CommunicationOpFusion::GetSplitSegments(const CommunicationOpInfo &communic
auto parallel_context = parallel::ParallelContext::GetInstance();
MS_EXCEPTION_IF_NULL(parallel_context);
const auto &split_indices = parallel_context->GetAllReduceFusionSplitIndices(group);
std::vector<uint32_t> split_indices;
if (!parallel_context->enable_parallel_optimizer()) {
split_indices = parallel_context->GetAllReduceFusionSplitIndices(group);
}
size_t segments = 0;
if (split_indices.size() != 0) {

View File

@ -71,7 +71,6 @@ bool ReplaceNodeByProxy::Run(const FuncGraphPtr &func_graph) {
AbstractBasePtrList abstract_list;
AnfAlgo::CopyNodeAttr(kAttrPsKey, cnode, proxy_node);
AnfAlgo::CopyNodeAttr("reduce_scatter_flag", cnode, proxy_node);
AnfAlgo::CopyNodeAttr("offset", cnode, proxy_node);
abstract_list.push_back(cnode->abstract());
auto abstract_tuple = std::make_shared<abstract::AbstractTuple>(abstract_list);

View File

@ -18,9 +18,12 @@
#include <utility>
#include <memory>
#include <algorithm>
#include <string>
#include "backend/session/anf_runtime_algorithm.h"
#include "utils/union_find_set.h"
#include "runtime/device/ascend/ascend_label_assign.h"
#include "utils/context/ms_context.h"
#include "debug/anf_ir_dump.h"
static constexpr size_t kCNodePrim = 0;
static constexpr size_t kCNodeCallArg = 1;
@ -104,7 +107,7 @@ static void ReuseParameter(NotNull<KernelGraphPtr> root_kg,
static CNodePtr GetNextRealKernel(const std::vector<CNodePtr> &list, size_t start) {
for (size_t i = start; i < list.size() - 1; ++i) {
if (!IsPrimitiveCNode(list[i], prim::kPrimPartial) && AnfAlgo::IsRealKernel(list[i])) {
if (AnfAlgo::IsRealKernel(list[i])) {
return list[i];
}
}
@ -168,18 +171,43 @@ static void EraseNodeFromExecOrder(const AnfNodePtr &node, const NotNull<std::ve
exec_order->erase(exec_iter);
}
void AscendControlParser::AttachChildGraphToReturnNode(NotNull<KernelGraphPtr> graph,
const NotNull<std::set<KernelGraphPtr> *> memo) {
if (memo->find(graph) != memo->end()) {
return;
}
memo->insert(graph.get());
const std::vector<std::shared_ptr<KernelGraph>> &child_graph_order = graph->child_graph_order();
if (child_graph_order.empty()) {
return;
}
std::vector<AnfNodePtr> depend_inputs = {NewValueNode(std::make_shared<Primitive>(prim::kPrimPartial->name()))};
for (auto &cg : child_graph_order) {
MS_EXCEPTION_IF_NULL(cg);
auto fg = cg->cast<FuncGraphPtr>();
MS_EXCEPTION_IF_NULL(fg);
depend_inputs.emplace_back(NewValueNode(fg));
AttachChildGraphToReturnNode(NOT_NULL(cg), memo);
}
auto child_graphs = graph->NewCNode(depend_inputs);
InsertDependToGraph(graph, NOT_NULL(child_graphs));
}
void AscendControlParser::LinkGraph(NotNull<KernelGraphPtr> kg) {
std::set<KernelGraphPtr> memo;
std::vector<std::pair<AnfNodePtr, AnfNodePtr>> link_list;
// Insert Assign
ChildGraphDataAssign(kg, NOT_NULL(&link_list), NOT_NULL(&memo));
memo.clear();
// Reuse Parameter
ReuseParameter(kg, link_list);
// replace call by label goto / label switch
memo.clear();
(void)ProcessKernelGraph(kg, nullptr, nullptr, NOT_NULL(&memo));
memo.clear();
// assign label resource
device::ascend::AscendLabelAssign::GetInstance().AssignLabel(kg);
AttachChildGraphToReturnNode(kg, NOT_NULL(&memo));
}
void AscendControlParser::EraseParameter(NotNull<KernelGraphPtr> root_graph,
@ -248,10 +276,14 @@ void AscendControlParser::EraseParameter(NotNull<KernelGraphPtr> root_graph,
}
MS_LOG(INFO) << "Erase " << assign_node->DebugString(5);
EraseNodeFromExecOrder(assign_node, NOT_NULL(&exec_order));
auto source = AnfAlgo::VisitKernelWithReturnType(assign_node->input(kCNodeAssignSource), 0).first;
parameter_count.AddReadCount(source, -1);
auto source = assign_node->input(kCNodeAssignSource);
MS_EXCEPTION_IF_NULL(source);
auto visit_source = AnfAlgo::VisitKernelWithReturnType(source, 0).first;
parameter_count.AddWriteCount(para, -1);
parameter_count.AddReadCount(para, -1);
if (visit_source->isa<Parameter>()) {
parameter_count.AddReadCount(visit_source, read - 1);
}
for (auto &node : all_nodes) {
for (size_t i = 0; i < node->size(); ++i) {
if (node->input(i) == para) {
@ -260,8 +292,6 @@ void AscendControlParser::EraseParameter(NotNull<KernelGraphPtr> root_graph,
}
}
}
parameter_count.AddReadCount(source, 1);
parameter_count.AddReadCount(para, -1);
}
root_graph->set_execution_order(exec_order);
}
@ -318,6 +348,17 @@ void AscendControlParser::ExecutorValidate(NotNull<KernelGraphPtr> root_graph) {
(void)RecurseGraph(root_graph, NOT_NULL(&memo));
EraseParameter(root_graph, memo);
EraseLabel(root_graph);
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
auto save_graphs_path = context_ptr->save_graphs_path();
if (save_graphs_path.empty()) {
save_graphs_path = ".";
}
if (context_ptr->save_graphs_flag()) {
std::string file_path = save_graphs_path + "/after_erase_label_and_parameter.ir";
DumpIR(file_path, root_graph.get());
}
}
std::vector<std::pair<KernelGraphPtr, std::vector<AnfNodePtr>>> AscendControlParser::ParseCallNode(

View File

@ -66,7 +66,8 @@ class AscendControlParser {
static AnfNodePtr InsertAssignToGraph(NotNull<KernelGraphPtr> kg, NotNull<AnfNodePtr> from, NotNull<AnfNodePtr> to);
static std::vector<std::pair<KernelGraphPtr, std::vector<AnfNodePtr>>> ParseCallNode(NotNull<CNodePtr> call_node);
static std::tuple<KernelGraphPtr, std::vector<AnfNodePtr>> ParsePartial(NotNull<AnfNodePtr> node);
static void AttachChildGraphToReturnNode(NotNull<KernelGraphPtr> graph,
const NotNull<std::set<KernelGraphPtr> *> memo);
// root graph order
static bool CheckLabelIndex(uint32_t order_index, uint32_t label_index, const CNodePtr &cnode,
NotNull<KernelGraphPtr> graph);

View File

@ -353,6 +353,10 @@ GraphId AscendSession::CompileGraph(NotNull<FuncGraphPtr> func_graph) {
RootGraphExecutorValidate(NOT_NULL(root_graph));
// adjust kernel
AdjustKernel(root_graph);
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
// Assign parameter keys.
AssignParamKey(root_graph);
#endif
// assign stream
AssignStream(NOT_NULL(root_graph));
// insert profiling point
@ -511,6 +515,12 @@ void AscendSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::
}
// load input data from user input
LoadInputData(kernel_graph, inputs);
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
// Initialize parameter server
if (!ps_init_) {
InitPSParamAndOptim(kernel_graph, inputs);
}
#endif
// convert inputs to model
predictmodel::StepConvertWeight(inputs);
{

View File

@ -16,6 +16,7 @@
#include "backend/session/cpu_session.h"
#include <algorithm>
#include <sstream>
#include "ir/tensor.h"
#include "ir/anf.h"
#include "backend/kernel_compiler/kernel.h"
@ -25,9 +26,15 @@
#include "predict/predict.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
#include "runtime/device/cpu/kernel_select_cpu.h"
#include "backend/optimizer/common/optimizer.h"
#include "backend/optimizer/common/pass_manager.h"
#include "backend/optimizer/pass/replace_node_by_proxy.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
#include "frontend/parallel/ps/util.h"
#endif
namespace mindspore {
namespace session {
@ -49,12 +56,29 @@ ParameterPtr CPUSession::CreateNewParameterFromParameter(const AnfNodePtr &anf,
return new_parameter;
}
void CPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
std::string pass_name = "replace_node_by_proxy";
pass_name.append(std::to_string(graph_sum_));
pm->AddPass(std::make_shared<opt::ReplaceNodeByProxy>(pass_name));
optimizer->AddPassManager(pm);
(void)optimizer->Optimize(kernel_graph);
kernel_graph->SetExecOrderByDefault();
}
GraphId CPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) {
auto graph_id = graph_sum_;
auto graph = ConstructKernelGraph(lst, outputs);
MS_EXCEPTION_IF_NULL(graph);
MS_LOG(INFO) << "Set kernel info";
SetKernelInfo(graph.get());
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
AssignParamKey(graph);
if (parallel::ps::Util::IsRoleOfWorker()) {
Optimize(graph);
}
#endif
predictmodel::StepConvertGraph(graph);
MS_LOG(INFO) << "Build kernel";
BuildKernel(graph.get());
@ -66,6 +90,12 @@ GraphId CPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList
void CPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
auto &kernel_graph = graphs_[graph_id];
MS_EXCEPTION_IF_NULL(kernel_graph);
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
// Initialize parameter server
if (!ps_init_) {
InitPSParamAndOptim(kernel_graph, inputs);
}
#endif
MS_LOG(INFO) << "Bind input output address";
std::vector<tensor::TensorPtr> need_sync_outputs;
runtime_.BindInputOutput(kernel_graph.get(), inputs, outputs, &need_sync_outputs);
@ -119,6 +149,48 @@ void CPUSession::SetKernelInfo(const KernelGraph *kernel_graph) {
}
}
namespace {
void KernelNotSupportException(const AnfNodePtr &kernel_node) {
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
std::stringstream operator_info;
operator_info << "Operator[" << kernel_name << "] ";
auto kernel_info = dynamic_cast<device::KernelInfo *>(kernel_node->kernel_info());
if (kernel_info == nullptr) {
operator_info << "is not support.";
MS_LOG(EXCEPTION) << operator_info.str();
}
auto kernel_build_Info = kernel_info->select_kernel_build_info();
if (kernel_build_Info == nullptr) {
operator_info << "is not support.";
MS_LOG(EXCEPTION) << operator_info.str();
}
size_t input_num = kernel_build_Info->GetInputNum();
if (input_num > 0) {
operator_info << " input(";
for (size_t i = 0; i < input_num; ++i) {
operator_info << TypeIdLabel(kernel_build_Info->GetInputDeviceType(i));
if (i != input_num - 1) {
operator_info << ",";
}
}
operator_info << ") ";
}
size_t output_num = kernel_build_Info->GetOutputNum();
if (output_num > 0) {
operator_info << "output(";
for (size_t i = 0; i < output_num; ++i) {
operator_info << TypeIdLabel(kernel_build_Info->GetOutputDeviceType(i));
if (i != kernel_build_Info->GetOutputNum() - 1) {
operator_info << ",";
}
}
operator_info << ") ";
}
operator_info << "is not support.";
MS_LOG(EXCEPTION) << operator_info.str();
}
} // namespace
void CPUSession::BuildKernel(const KernelGraph *kernel_graph) {
MS_EXCEPTION_IF_NULL(kernel_graph);
auto &kernel_nodes = kernel_graph->execution_order();
@ -129,7 +201,7 @@ void CPUSession::BuildKernel(const KernelGraph *kernel_graph) {
std::shared_ptr<kernel::CPUKernel> cpu_kernel =
kernel::CPUKernelFactory::GetInstance().Create(kernel_name, kernel_node);
if (cpu_kernel == nullptr) {
MS_LOG(EXCEPTION) << "Operator[" << kernel_name << "] is not support.";
KernelNotSupportException(kernel_node);
}
cpu_kernel->Init(kernel_node);
AnfAlgo::SetKernelMod(cpu_kernel, kernel_node.get());

View File

@ -37,6 +37,7 @@ class CPUSession : public SessionBasic {
protected:
ParameterPtr CreateNewParameterFromParameter(const AnfNodePtr &anf, bool valid_input, KernelGraph *graph) override;
void Optimize(const std::shared_ptr<KernelGraph> &kernel_graph);
private:
void SetKernelInfo(const KernelGraph *kernel_graph);

View File

@ -25,6 +25,11 @@
#include "backend/optimizer/pass/getitem_tuple.h"
#include "backend/optimizer/gpu/adam_weight_decay_fusion.h"
#include "backend/optimizer/gpu/adam_fusion.h"
#include "backend/optimizer/gpu/replace_bn_cast_fusion.h"
#include "backend/optimizer/gpu/replace_bn_grad_cast_fusion.h"
#include "backend/optimizer/gpu/replace_bn_grad_cast2_fusion.h"
#include "backend/optimizer/gpu/replace_momentum_cast_fusion.h"
#include "backend/optimizer/gpu/replace_addn_fusion.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "predict/predict.h"
#include "common/utils.h"
@ -59,6 +64,11 @@ void GPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
auto pm = std::make_shared<opt::PassManager>();
pm->AddPass(std::make_shared<opt::AdamWeightDecayFusion>());
pm->AddPass(std::make_shared<opt::AdamFusion>());
pm->AddPass(std::make_shared<opt::ReplaceBNCastFusion>());
pm->AddPass(std::make_shared<opt::ReplaceBNGradCastFusion>());
pm->AddPass(std::make_shared<opt::ReplaceBNGradCast2Fusion>());
pm->AddPass(std::make_shared<opt::ReplaceMomentumCastFusion>());
pm->AddPass(std::make_shared<opt::ReplaceAddNFusion>());
optimizer->AddPassManager(pm);
(void)optimizer->Optimize(kernel_graph);
kernel_graph->SetExecOrderByDefault();
@ -167,6 +177,10 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList
Optimize(graph);
// Select kernel build info
SelectKernel(graph);
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
// Assign parameter keys.
AssignParamKey(graph);
#endif
// Convert kernel Graph to model
predictmodel::StepConvertGraph(graph);
// Start gpu kernel runtime
@ -204,6 +218,10 @@ void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
auto &kernel_graph = graphs_[graph_id];
// Load input data from user input
LoadInputData(kernel_graph, inputs);
// Initialize parameter server
if (!ps_init_) {
InitPSParamAndOptim(kernel_graph, inputs);
}
MS_EXCEPTION_IF_NULL(kernel_graph);
// Convert inputs to model
predictmodel::StepConvertWeight(inputs);

View File

@ -307,7 +307,7 @@ CNodePtr KernelGraph::NewCNode(const std::vector<AnfNodePtr> &inputs) {
if (inputs.size() == 1 || !feature_map_input_indexs.empty()) {
kernel_info->SetFeatureMapFlag(true);
}
if (AnfAlgo::IsRealCNodeKernel(cnode)) {
if (AnfAlgo::IsRealKernel(cnode)) {
AnfAlgo::SetNodeAttr(kIsFeatureMapOutput, MakeValue(kernel_info->is_feature_map()), cnode);
AnfAlgo::SetNodeAttr(kIsFeatureMapInputList, MakeValue(feature_map_input_indexs), cnode);
}
@ -929,10 +929,15 @@ void KernelGraph::AddInternalOutput(const AnfNodePtr &front_node, const AnfNodeP
}
MS_LOG(INFO) << "Add internal node " << node->DebugString() << " with front node " << front_node->DebugString();
front_to_internal_outputs_map_[front_node] = node;
internal_outputs_to_front_map_[node] = front_node;
int output_idx = 0;
if (AnfAlgo::CheckPrimitiveType(front_node, prim::kPrimTupleGetItem)) {
output_idx = AnfAlgo::GetTupleGetItemOutIndex(front_node->cast<CNodePtr>());
}
internal_outputs_to_front_map_[node][output_idx] = front_node;
}
void KernelGraph::ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node) {
void KernelGraph::ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node, int src_output_idx,
int dst_output_idx) {
if (new_node == nullptr || node == nullptr) {
MS_LOG(INFO) << "New node or node is nullptr";
return;
@ -947,9 +952,30 @@ void KernelGraph::ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr
return;
}
MS_LOG(INFO) << "Replace internal node " << node->DebugString() << " To " << new_node->DebugString();
internal_outputs_to_front_map_[new_node] = iter->second;
front_to_internal_outputs_map_[iter->second] = new_node;
internal_outputs_to_front_map_.erase(iter);
auto &front_nodes = iter->second;
// Move all front nodes to new node mapping
if (src_output_idx == -1) {
internal_outputs_to_front_map_[new_node] = front_nodes;
for (const auto &front_node_iter : front_nodes) {
front_to_internal_outputs_map_[front_node_iter.second] = new_node;
}
internal_outputs_to_front_map_.erase(iter);
return;
}
// Move specified front node to new node mapping
int index = SizeToInt(src_output_idx);
auto front_node_iter = front_nodes.find(index);
if (front_node_iter == front_nodes.end()) {
MS_LOG(INFO) << "The output " << src_output_idx << " of node " << node->DebugString() << " is not an internal node";
return;
}
auto front_node = front_node_iter->second;
internal_outputs_to_front_map_[new_node][dst_output_idx] = front_node;
front_to_internal_outputs_map_[front_node] = new_node;
front_nodes.erase(index);
if (front_nodes.empty()) {
internal_outputs_to_front_map_.erase(iter);
}
}
AnfNodePtr KernelGraph::GetInternalOutputByFrontNode(const AnfNodePtr &front_node) const {
@ -967,14 +993,6 @@ bool KernelGraph::IsInternalOutput(const AnfNodePtr &node) const {
return false;
}
AnfNodePtr KernelGraph::GetFrontNodeByInternalOutput(const AnfNodePtr &node) const {
auto iter = internal_outputs_to_front_map_.find(node);
if (iter != internal_outputs_to_front_map_.end()) {
return iter->second;
}
return nullptr;
}
void KernelGraph::AddFinalOutputKernel(const AnfNodePtr &node) {
if (node == nullptr) {
return;

View File

@ -148,10 +148,10 @@ class KernelGraph : public FuncGraph {
const std::map<std::string, std::pair<AnfNodePtr, int>> &summary_nodes() const { return summary_nodes_; }
void set_summary_nodes(const std::map<std::string, std::pair<AnfNodePtr, int>> &nodes) { summary_nodes_ = nodes; }
void AddInternalOutput(const AnfNodePtr &front_node, const AnfNodePtr &node);
void ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node);
void ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node, int src_output_idx = -1,
int dst_output_idx = -1);
AnfNodePtr GetInternalOutputByFrontNode(const AnfNodePtr &front_node) const;
bool IsInternalOutput(const AnfNodePtr &node) const;
AnfNodePtr GetFrontNodeByInternalOutput(const AnfNodePtr &node) const;
void AddFinalOutputKernel(const AnfNodePtr &node);
bool IsFinalOutputKernel(const AnfNodePtr &node) const;
uint32_t current_epoch() const { return current_epoch_; }
@ -223,7 +223,7 @@ class KernelGraph : public FuncGraph {
CNodePtr end_goto_;
bool null_output_;
std::unordered_map<AnfNodePtr, AnfNodePtr> front_to_internal_outputs_map_;
std::unordered_map<AnfNodePtr, AnfNodePtr> internal_outputs_to_front_map_;
std::unordered_map<AnfNodePtr, std::unordered_map<int, AnfNodePtr>> internal_outputs_to_front_map_;
std::set<AnfNodePtr> final_output_kernels_;
uint32_t current_epoch_;
};
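The type change above (a single AnfNodePtr value replaced by a per-output-index map) is what lets ReplaceInternalOutput move one tuple output to a new node while leaving the other outputs mapped to the old one. A minimal, self-contained sketch of that bookkeeping, using plain strings instead of AnfNodePtr, looks like this:
#include <iostream>
#include <string>
#include <unordered_map>
// internal node -> (output index -> front node); strings stand in for AnfNodePtr.
using InternalOutputMap =
    std::unordered_map<std::string, std::unordered_map<int, std::string>>;
// Move the mapping of one output index from old_node to new_node, mirroring
// KernelGraph::ReplaceInternalOutput when src_output_idx/dst_output_idx are given.
void MoveInternalOutput(InternalOutputMap *map, const std::string &old_node,
                        const std::string &new_node, int src_idx, int dst_idx) {
  auto iter = map->find(old_node);
  if (iter == map->end()) return;
  auto &front_nodes = iter->second;
  auto front_iter = front_nodes.find(src_idx);
  if (front_iter == front_nodes.end()) return;
  std::string front_node = front_iter->second;
  front_nodes.erase(front_iter);
  (*map)[new_node][dst_idx] = front_node;         // the new internal node now answers for this output
  if (front_nodes.empty()) map->erase(old_node);  // drop the old entry once every output has moved
}
int main() {
  InternalOutputMap internal_outputs;
  internal_outputs["BatchNorm"][0] = "front_out0";
  internal_outputs["BatchNorm"][1] = "front_out1";
  MoveInternalOutput(&internal_outputs, "BatchNorm", "TransData", 0, 0);
  std::cout << "BatchNorm still maps " << internal_outputs["BatchNorm"].size()
            << " output(s), TransData maps " << internal_outputs["TransData"].size() << std::endl;
  return 0;
}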

View File

@ -35,6 +35,11 @@
#include "ir/dtype.h"
#include "ir/anf.h"
#include "ir/func_graph_cloner.h"
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
#include "frontend/parallel/ps/worker.h"
#include "frontend/parallel/ps/common.h"
#include "frontend/parallel/ps/util.h"
#endif
namespace mindspore {
namespace session {
@ -295,7 +300,11 @@ void SessionBasic::InitInternalOutputParameter(const AnfNodePtr &out_node, const
MS_LOG(INFO) << "No corresponding internal output for output node";
return;
}
auto real_kernel = AnfAlgo::VisitKernel(ref_node, 0);
size_t output_idx = 0;
if (AnfAlgo::CheckPrimitiveType(out_node, prim::kPrimTupleGetItem)) {
output_idx = AnfAlgo::GetTupleGetItemOutIndex(out_node->cast<CNodePtr>());
}
auto real_kernel = AnfAlgo::VisitKernel(ref_node, output_idx);
auto ref_real_node = real_kernel.first;
auto ref_real_node_index = real_kernel.second;
if (ref_real_node->isa<CNode>() && node_graph->IsInternalOutput(ref_real_node) &&
@ -320,6 +329,7 @@ void SessionBasic::InitInternalOutputParameter(const AnfNodePtr &out_node, const
builder.SetOutputsFormat({format});
d_kernel_info->set_select_kernel_build_info(builder.Build());
AnfAlgo::SetOutputAddr(address, 0, parameter.get());
AnfAlgo::SetOutputInferTypeAndShape({type}, {AnfAlgo::GetOutputInferShape(parameter, 0)}, parameter.get());
}
}
@ -973,6 +983,16 @@ CNodePtr SessionBasic::ConstructOutput(const AnfNodePtrList &outputs, const std:
bool internal_output = true;
std::string kernel_target = GetCNodeTarget(front_real_kernel.first);
for (auto user : users) {
auto cnode = user.first->cast<CNodePtr>();
if (cnode == nullptr) {
internal_output = false;
break;
}
auto prim = cnode->input(kAnfPrimitiveIndex);
if (prim == nullptr || !prim->isa<ValueNode>()) {
internal_output = false;
break;
}
if (!AnfAlgo::IsRealKernel(user.first) || kernel_target != GetCNodeTarget(user.first)) {
internal_output = false;
break;
@ -1097,5 +1117,92 @@ KernelGraphPtr SessionBasic::NewKernelGraph() {
graphs_[graph_sum_++] = graph;
return graph;
}
AnfNodePtr SessionBasic::FindPullNode(const AnfNodePtr &push_node, const std::vector<AnfNodePtr> &node_list) {
MS_EXCEPTION_IF_NULL(push_node);
for (auto &node : node_list) {
if (node != nullptr && node->isa<CNode>()) {
for (auto input : node->cast<CNodePtr>()->inputs()) {
if (push_node == AnfAlgo::VisitKernel(input, 0).first) {
if (AnfAlgo::GetCNodeName(node) != kPullOpName) {
MS_LOG(EXCEPTION) << "The edge between Push and Pull node is invalid.";
}
return node;
}
}
}
}
return nullptr;
}
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
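// Parameter-server mode only: walk the kernel graph and assign a unique key to every
// EmbeddingLookup node and every Push/Pull pair so that worker and servers agree on which
// parameter a kernel refers to.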
void SessionBasic::AssignParamKey(const KernelGraphPtr &kernel_graph) {
if (!parallel::ps::Util::IsRoleOfWorker()) {
MS_LOG(INFO) << "Not parameter server mode.";
return;
}
MS_EXCEPTION_IF_NULL(kernel_graph);
std::vector<AnfNodePtr> node_list = TopoSort(kernel_graph->get_return());
for (auto &node : node_list) {
if (node != nullptr && node->isa<CNode>()) {
// Assign key for forward kernel EmbeddingLookup.
// The key will be assigned to the embedding table and the Push kernel as well.
if (AnfAlgo::GetCNodeName(node) == kEmbeddingLookupOpName) {
size_t embedding_table_idx = 0;
auto embedding_table = AnfAlgo::GetInputNode(node->cast<CNodePtr>(), embedding_table_idx);
size_t key = parallel::ps::Worker<float>::GetInstance().SetParamKey(embedding_table->fullname_with_scope());
AnfAlgo::SetNodeAttr(kAttrPsKey, MakeValue(key), node);
} else if (AnfAlgo::GetCNodeName(node) == kPushOpName) {
auto pull_node = FindPullNode(node, node_list);
if (!pull_node) {
MS_LOG(EXCEPTION) << "Assigning parameter key failed: can't find Pull node of the Push node.";
}
// Second input of Pull node is the trainable parameter.
size_t parameter_index = 1;
auto parameter_node = AnfAlgo::GetInputNode(pull_node->cast<CNodePtr>(), parameter_index);
size_t key = parallel::ps::Worker<float>::GetInstance().SetParamKey(parameter_node->fullname_with_scope());
AnfAlgo::SetNodeAttr(kAttrPsKey, MakeValue(key), node);
AnfAlgo::SetNodeAttr(kAttrPsKey, MakeValue(key), pull_node);
std::string optimizer_name = AnfAlgo::GetNodeAttr<std::string>(node, kAttrOptimizerType);
parallel::ps::Worker<float>::GetInstance().SetKeyOptimId(key, optimizer_name);
}
}
}
}
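// Register every graph input parameter that already owns a device address with the PS worker,
// so its initial value and bound optimizer can be initialized on the servers before the first step.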
void SessionBasic::InitPSParamAndOptim(const KernelGraphPtr &kernel_graph,
const std::vector<tensor::TensorPtr> &inputs_const) {
if (!parallel::ps::Util::IsRoleOfWorker()) {
return;
}
std::vector<tensor::TensorPtr> inputs(inputs_const);
size_t input_ctrl_size = 1;
MS_EXCEPTION_IF_NULL(kernel_graph);
if (kernel_graph->input_ctrl_tensors()) {
input_ctrl_size = LoadCtrlInputTensor(kernel_graph, &inputs);
}
auto input_nodes = kernel_graph->inputs();
if ((inputs.size() + input_ctrl_size) - 1 != input_nodes.size()) {
MS_LOG(EXCEPTION) << "Tensor input:" << inputs.size() << " is not equal graph inputs:" << input_nodes.size()
<< ", input_ctrl_size:" << input_ctrl_size;
}
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
for (size_t i = 0; i < inputs.size(); ++i) {
auto tensor = inputs[i];
MS_EXCEPTION_IF_NULL(tensor);
auto input_node = input_nodes[i];
MS_EXCEPTION_IF_NULL(input_node);
if (input_node->isa<Parameter>() && AnfAlgo::OutputAddrExist(input_node, 0)) {
auto pk_node = input_node->cast<ParameterPtr>();
mindspore::parallel::ps::Worker<float>::GetInstance().InitPSParamAndOptim(
pk_node->fullname_with_scope(), tensor->data_c(), LongToSize(tensor->data().nbytes()));
}
}
ps_init_ = true;
}
#endif
} // namespace session
} // namespace mindspore
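A minimal sketch of how a concrete session could wire these helpers in, assuming a hypothetical RunGraphWithPS entry point; only AssignParamKey and InitPSParamAndOptim come from this file, the rest is illustrative:

void SomeSession::RunGraphWithPS(const KernelGraphPtr &kernel_graph,
                                 const std::vector<tensor::TensorPtr> &inputs) {
  // Tag EmbeddingLookup and Push/Pull nodes with parameter-server keys
  // (both calls return early unless this process runs as a PS worker).
  AssignParamKey(kernel_graph);
  // Push initial weights and optimizer bindings to the servers before the first step.
  InitPSParamAndOptim(kernel_graph, inputs);
  // ... launch the graph as usual ...
}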

View File

@ -51,7 +51,7 @@ using OpRunInfoPtr = std::shared_ptr<OpRunInfo>;
class SessionBasic {
public:
SessionBasic() : context_(nullptr), summary_callback_(nullptr), device_id_(0) {
SessionBasic() : context_(nullptr), summary_callback_(nullptr), device_id_(0), ps_init_(false) {
#ifdef ENABLE_DEBUGGER
debugger_ = nullptr;
#endif
@ -104,6 +104,8 @@ class SessionBasic {
virtual GraphId GetFinalRunGraph() const { return kInvalidGraphId; }
virtual void SetActive(GraphId, GraphId) {}
virtual void GetSummaryNodes(KernelGraph *graph);
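// Parameter-server helpers; both return early unless the process runs as a PS worker.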
void AssignParamKey(const KernelGraphPtr &kernel_graph);
void InitPSParamAndOptim(const KernelGraphPtr &kernel_graph, const std::vector<tensor::TensorPtr> &inputs_const);
#ifdef ENABLE_DEBUGGER
// set debugger
@ -140,6 +142,7 @@ class SessionBasic {
AnfNodePtr CreateNewParameterFromCNode(const AnfNodePtr &anf, bool valid_input, KernelGraph *graph);
void AddParameterToGraphInputs(const std::vector<AnfNodePtr> &parameters, KernelGraph *graph);
void InitInternalOutputParameter(const AnfNodePtr &out_node, const AnfNodePtr &parameter);
AnfNodePtr FindPullNode(const AnfNodePtr &push_node, const std::vector<AnfNodePtr> &node_list);
std::unordered_map<GraphId, std::shared_ptr<KernelGraph>> graphs_;
std::unordered_map<GraphInfo, std::shared_ptr<KernelGraph>> run_op_graphs_;
@ -148,6 +151,7 @@ class SessionBasic {
CallBackFunc summary_callback_;
static GraphId graph_sum_;
uint32_t device_id_;
bool ps_init_;
#ifdef ENABLE_DEBUGGER
std::shared_ptr<Debugger> debugger_;
#endif

View File

@ -23,9 +23,7 @@ if (ENABLE_D)
list(APPEND _DEBUG_SRC_LIST
"${CMAKE_CURRENT_SOURCE_DIR}/common.cc"
)
if (ENABLE_DATA_DUMP)
list(APPEND _DEBUG_SRC_LIST "${CMAKE_CURRENT_SOURCE_DIR}/data_dump_parser.cc")
endif(ENABLE_DATA_DUMP)
list(APPEND _DEBUG_SRC_LIST "${CMAKE_CURRENT_SOURCE_DIR}/data_dump_parser.cc")
endif()
if (ENABLE_DUMP_E2E)

View File

@ -120,6 +120,10 @@ std::optional<std::string> Common::GetConfigFile(const std::string &env) {
MS_LOG(ERROR) << dump_config_file << " does not exist.";
return {};
}
auto suffix = dump_config_file.substr(dump_config_file.find_last_of('.') + 1);
if (suffix != "json") {
MS_LOG(EXCEPTION) << "[DataDump] dump config file suffix only support json! But got:." << suffix;
}
return dump_config_file;
}
} // namespace mindspore

View File

@ -29,13 +29,13 @@ void DataDumpParser::ResetParam() {
net_name_.clear();
dump_mode_ = 0;
dump_step_ = 0;
kernel_set_.clear();
kernel_map_.clear();
}
bool DataDumpParser::DumpEnabled() const {
auto enable_dump = std::getenv(kEnableDataDump);
if (!enable_dump) {
MS_LOG(WARNING) << "[DataDump] enable dump is null. Please export ENABLE_DATA_DUMP";
MS_LOG(INFO) << "[DataDump] enable dump is null. Please export ENABLE_DATA_DUMP";
return false;
}
@ -60,9 +60,18 @@ std::optional<std::string> DataDumpParser::GetDumpPath() const {
return {};
}
std::string dump_path_str(dump_path);
if (!std::all_of(dump_path_str.begin(), dump_path_str.end(), ::isalpha)) {
MS_LOG(EXCEPTION) << "[DataDump] dump path only support alphas, but got:" << dump_path_str;
}
return dump_path_str;
}
std::string GetIfstreamString(const std::ifstream &ifstream) {
std::stringstream buffer;
buffer << ifstream.rdbuf();
return buffer.str();
}
void DataDumpParser::ParseDumpConfig() {
std::lock_guard<std::mutex> guard(lock_);
MS_LOG(INFO) << "[DataDump] parse start";
@ -84,7 +93,12 @@ void DataDumpParser::ParseDumpConfig() {
}
nlohmann::json j;
json_file >> j;
try {
json_file >> j;
} catch (nlohmann::json::parse_error &e) {
MS_LOG(ERROR) << "[DataDump] json contents:" << GetIfstreamString(json_file);
MS_LOG(EXCEPTION) << "[DataDump] parse json failed, error:" << e.what();
}
if (j.find("DumpSettings") == j.end()) {
MS_LOG(EXCEPTION) << "[DataDump] DumpSettings is not exist.";
}
@ -111,8 +125,8 @@ bool DataDumpParser::NeedDump(const std::string &op_full_name) const {
if (dump_mode_ == 0) {
return true;
}
auto iter = kernel_set_.find(op_full_name);
return iter != kernel_set_.end();
auto iter = kernel_map_.find(op_full_name);
return iter != kernel_map_.end();
}
bool DataDumpParser::IsConfigExist(const nlohmann::json &dump_settings) const {
@ -145,8 +159,25 @@ bool DataDumpParser::ParseDumpSetting(const nlohmann::json &dump_settings) {
auto kernel_str = kernel.dump();
kernel_str.erase(std::remove(kernel_str.begin(), kernel_str.end(), '\"'), kernel_str.end());
MS_LOG(INFO) << "[DataDump] Need dump kernel:" << kernel_str;
kernel_set_.insert(kernel_str);
kernel_map_.insert({kernel_str, 0});
}
return true;
}
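// Count how many times each configured kernel name is matched at runtime; entries that stay
// at zero are reported by PrintUnusedKernel() so stale names in the dump config are easy to spot.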
void DataDumpParser::MatchKernel(const std::string &kernel_name) {
auto iter = kernel_map_.find(kernel_name);
if (iter == kernel_map_.end()) {
return;
}
iter->second = iter->second + 1;
MS_LOG(INFO) << "Match dump kernel:" << iter->first << " match times:" << iter->second;
}
void DataDumpParser::PrintUnusedKernel() {
for (const auto &iter : kernel_map_) {
if (iter.second == 0) {
MS_LOG(WARNING) << "[DataDump] Unused Kernel in json:" << iter.first;
}
}
}
} // namespace mindspore
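For context, a hedged sketch of a call site; DataDumpParser::GetInstance() and the surrounding launch hook are assumptions, only DumpEnabled, NeedDump, MatchKernel and PrintUnusedKernel are defined here:

void MaybeDumpKernel(const std::string &op_full_name) {
  auto &parser = DataDumpParser::GetInstance();  // assumed singleton accessor
  if (parser.DumpEnabled() && parser.NeedDump(op_full_name)) {
    // Record the hit so PrintUnusedKernel() can later warn about config entries that never matched.
    parser.MatchKernel(op_full_name);
    // ... trigger the asynchronous dump for this kernel ...
  }
}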

View File

@ -18,7 +18,7 @@
#define MINDSPORE_MINDSPORE_CCSRC_DEBUG_ASYNC_DUMP_JSON_PARE_H_
#include <string>
#include <set>
#include <map>
#include <mutex>
#include <optional>
#include "nlohmann/json.hpp"
@ -39,7 +39,8 @@ class DataDumpParser {
const std::string &net_name() const { return net_name_; }
uint32_t dump_mode() const { return dump_mode_; }
uint32_t dump_step() const { return dump_step_; }
const std::set<std::string> &kernel_set() const { return kernel_set_; }
void MatchKernel(const std::string &kernel_name);
void PrintUnusedKernel();
private:
DataDumpParser() = default;
@ -55,7 +56,7 @@ class DataDumpParser {
std::string net_name_;
uint32_t dump_mode_{0};
uint32_t dump_step_{0};
std::set<std::string> kernel_set_;
std::map<std::string, uint32_t> kernel_map_;
};
} // namespace mindspore
#endif // MINDSPORE_MINDSPORE_CCSRC_DEBUG_ASYNC_DUMP_JSON_PARE_H_

View File

@ -124,6 +124,8 @@ void ProtoExporter::SetNodeOutputType(const TypePtr &type, const BaseShapePtr &s
// Do Nothing
} else if (type->isa<UndeterminedType>()) {
// Do Nothing
} else if (type->isa<SparseTensorType>()) {
// Do Nothing
} else if (type->isa<Tuple>()) {
TuplePtr tuple_type = dyn_cast<Tuple>(type);
type_proto->set_data_type(irpb::DT_TUPLE);

Some files were not shown because too many files have changed in this diff.