Merge branch 'master' into code_sync_incubator_f3c32baf_to_master_fcfc75a3_0811

jonyguo 2020-08-11 19:40:04 +08:00
commit 4964f7703a
1278 changed files with 42441 additions and 13079 deletions

.gitmodules

@@ -5,9 +5,6 @@
[submodule "third_party/googletest"]
path = third_party/googletest
url = https://github.com/google/googletest.git
[submodule "third_party/incubator-tvm"]
path = third_party/incubator-tvm
url = https://github.com/apache/incubator-tvm.git
[submodule "third_party/protobuf"]
path = third_party/protobuf
url = https://github.com/protocolbuffers/protobuf.git
@@ -17,7 +14,7 @@
url = https://gitee.com/mindspore/akg.git
[submodule "graphengine"]
path = graphengine
url = https://gitee.com/ms-incubator/graphengine.git
url = https://gitee.com/mindspore/graphengine.git
[submodule "third_party/OpenCL-CLHPP"]
path = third_party/OpenCL-CLHPP
url = https://github.com/KhronosGroup/OpenCL-CLHPP.git


@@ -98,6 +98,7 @@ endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden")
add_subdirectory(mindspore/ccsrc)
add_subdirectory(mindspore/core)
if (ENABLE_TESTCASES)
add_subdirectory(tests)
endif()

akg

@@ -1 +1 @@
Subproject commit 5fe7e5c8377dccfd35c9f661e10ed3dc136208c5
Subproject commit 8f9af74f59837579034610a741f5b8f33db12515


@@ -109,7 +109,7 @@ checkopts()
ENABLE_GPU="off"
# Process the options
while getopts 'drvj:c:t:hsb:a:g:p:ie:m:l:I:LRP:Q:D:zM:V:K:swB:En' opt
while getopts 'drvj:c:t:hsb:a:g:p:ie:m:l:I:LRP:Q:D:zM:V:K:swB:EnT:' opt
do
OPTARG=$(echo ${OPTARG} | tr '[A-Z]' '[a-z]')
case "${opt}" in
@@ -282,6 +282,11 @@ checkopts()
ENABLE_IBVERBS="on"
echo "enable IBVERBS for parameter server"
;;
T)
check_on_off $OPTARG T
SUPPORT_TRAIN=$OPTARG
echo "support train on device "
;;
*)
echo "Unknown option ${opt}!"
usage
@@ -397,7 +402,7 @@ checkndk() {
if [ "${ANDROID_NDK}" ]; then
echo -e "\e[31mANDROID_NDK_PATH=$ANDROID_NDK \e[0m"
else
echo -e "\e[31mplease set ANDROID_NDK_PATH in environment variable for example: export ANDROID_NDK=/root/usr/android-ndk-r20b/ \e[0m"
echo -e "\e[31mplease set ANDROID_NDK in environment variable for example: export ANDROID_NDK=/root/usr/android-ndk-r20b/ \e[0m"
exit 1
fi
}
@@ -569,6 +574,39 @@ build_minddata_lite_deps()
build_jpeg_turbo
}
prepare_md_lite() {
if [ "${COMPILE_MINDDATA_LITE}" == "on" ]; then
echo "packaging minddata"
cp ${BASEPATH}/mindspore/ccsrc/minddata/dataset/include/*h ${OUTPUT_DIR}/include/
cp ${BASEPATH}/mindspore/lite/build/minddata/libminddata-lite.so ${OUTPUT_DIR}/lib/
if [[ "$LITE_PLATFORM" == "x86_64" ]]; then
mkdir -p ${OUTPUT_DIR}/third_party/libjpeg-turbo/lib
cp -r ${BASEPATH}/third_party/libjpeg-turbo/lib/libjpeg.so ${OUTPUT_DIR}/third_party/libjpeg-turbo/lib/
cp -r ${BASEPATH}/third_party/libjpeg-turbo/lib/libturbojpeg.so ${OUTPUT_DIR}/third_party/libjpeg-turbo/lib/
mkdir -p ${OUTPUT_DIR}/third_party/opencv/lib/
cp -r ${BASEPATH}/third_party/opencv/build/lib/libopencv_core.so ${OUTPUT_DIR}/third_party/opencv/lib/
cp -r ${BASEPATH}/third_party/opencv/build/lib/libopencv_imgcodecs.so ${OUTPUT_DIR}/third_party/opencv/lib/
cp -r ${BASEPATH}/third_party/opencv/build/lib/libopencv_imgproc.so ${OUTPUT_DIR}/third_party/opencv/lib/
elif [[ "$LITE_PLATFORM" == "arm64" ]]; then
mkdir -p ${OUTPUT_DIR}/third_party/libjpeg-turbo/lib
cp -r ${BASEPATH}/third_party/libjpeg-turbo/lib/libjpeg.so ${OUTPUT_DIR}/third_party/libjpeg-turbo/lib/
cp -r ${BASEPATH}/third_party/libjpeg-turbo/lib/libturbojpeg.so ${OUTPUT_DIR}/third_party/libjpeg-turbo/lib/
mkdir -p ${OUTPUT_DIR}/third_party/opencv/lib/arm64-v8a/
cp -r ${BASEPATH}/third_party/opencv/build/lib/arm64-v8a/libopencv_core.so ${OUTPUT_DIR}/third_party/opencv/lib/arm64-v8a/
cp -r ${BASEPATH}/third_party/opencv/build/lib/arm64-v8a/libopencv_imgcodecs.so ${OUTPUT_DIR}/third_party/opencv/lib/arm64-v8a/
cp -r ${BASEPATH}/third_party/opencv/build/lib/arm64-v8a/libopencv_imgproc.so ${OUTPUT_DIR}/third_party/opencv/lib/arm64-v8a/
elif [[ "$LITE_PLATFORM" == "arm32" ]]; then
mkdir -p ${OUTPUT_DIR}/third_party/libjpeg-turbo/lib
cp -r ${BASEPATH}/third_party/libjpeg-turbo/lib/libjpeg.so ${OUTPUT_DIR}/third_party/libjpeg-turbo/lib/
cp -r ${BASEPATH}/third_party/libjpeg-turbo/lib/libturbojpeg.so ${OUTPUT_DIR}/third_party/libjpeg-turbo/lib/
mkdir -p ${OUTPUT_DIR}/third_party/opencv/lib/armeabi-v7a/
cp -r ${BASEPATH}/third_party/opencv/build/lib/armeabi-v7a/libopencv_core.so ${OUTPUT_DIR}/third_party/opencv/lib/armeabi-v7a/
cp -r ${BASEPATH}/third_party/opencv/build/lib/armeabi-v7a/libopencv_imgcodecs.so ${OUTPUT_DIR}/third_party/opencv/lib/armeabi-v7a/
cp -r ${BASEPATH}/third_party/opencv/build/lib/armeabi-v7a/libopencv_imgproc.so ${OUTPUT_DIR}/third_party/opencv/lib/armeabi-v7a/
fi
fi
}
build_lite()
{
echo "start build mindspore lite project"
@@ -632,6 +670,7 @@ build_lite()
mkdir -p ${OUTPUT_DIR}/converter && mkdir -p ${OUTPUT_DIR}/time_profile
mkdir -p ${OUTPUT_DIR}/benchmark && mkdir -p ${OUTPUT_DIR}/include && mkdir -p ${OUTPUT_DIR}/lib
mkdir -p ${OUTPUT_DIR}/third_party
prepare_md_lite
cp ${BASEPATH}/mindspore/lite/build/tools/converter/converter_lite ${OUTPUT_DIR}/converter/
cp ${BASEPATH}/mindspore/lite/build/tools/benchmark/benchmark ${OUTPUT_DIR}/benchmark/
cp ${BASEPATH}/mindspore/lite/build/tools/time_profile/timeprofile ${OUTPUT_DIR}/time_profile/
@@ -643,8 +682,7 @@ build_lite()
cp ${BASEPATH}/mindspore/lite/build/src/libmindspore-lite.so ${OUTPUT_DIR}/lib/
mkdir -p ${OUTPUT_DIR}/third_party/protobuf/lib
cp -r ${BASEPATH}/third_party/protobuf/build/include/ ${OUTPUT_DIR}/third_party/protobuf/
cp -r ${BASEPATH}/third_party/protobuf/build/lib/libprotobuf.so.19 ${OUTPUT_DIR}/third_party/protobuf/lib/
cp -r ${BASEPATH}/third_party/protobuf/build/lib/libprotobuf.so.19.0.0 ${OUTPUT_DIR}/third_party/protobuf/lib/
cp -r ${BASEPATH}/third_party/protobuf/build/lib/libprotobuf.so.19.0.0 ${OUTPUT_DIR}/third_party/protobuf/lib/libprotobuf.so.19
mkdir -p ${OUTPUT_DIR}/third_party/flatbuffers
cp -r ${BASEPATH}/third_party/flatbuffers/include/ ${OUTPUT_DIR}/third_party/flatbuffers/
cd ..
@@ -657,6 +695,7 @@ build_lite()
mkdir -p ${OUTPUT_DIR}/time_profile && mkdir -p ${OUTPUT_DIR}/benchmark
mkdir -p ${OUTPUT_DIR}/include && mkdir -p ${OUTPUT_DIR}/lib
mkdir -p ${OUTPUT_DIR}/third_party
prepare_md_lite
cp ${BASEPATH}/mindspore/lite/build/tools/benchmark/benchmark ${OUTPUT_DIR}/benchmark/
cp ${BASEPATH}/mindspore/lite/build/tools/time_profile/timeprofile ${OUTPUT_DIR}/time_profile/
cp ${BASEPATH}/mindspore/lite/include/*.h ${OUTPUT_DIR}/include/
@@ -677,6 +716,7 @@ build_lite()
mkdir -p ${OUTPUT_DIR}/time_profile && mkdir -p ${OUTPUT_DIR}/benchmark
mkdir -p ${OUTPUT_DIR}/include && mkdir -p ${OUTPUT_DIR}/lib
mkdir -p ${OUTPUT_DIR}/third_party
prepare_md_lite
cp ${BASEPATH}/mindspore/lite/build/tools/benchmark/benchmark ${OUTPUT_DIR}/benchmark/
cp ${BASEPATH}/mindspore/lite/build/tools/time_profile/timeprofile ${OUTPUT_DIR}/time_profile/
cp ${BASEPATH}/mindspore/lite/include/*.h ${OUTPUT_DIR}/include/


@@ -8,11 +8,12 @@ endif()
set(jpeg_turbo_LDFLAGS "-Wl,-z,relro,-z,now,-z,noexecstack")
mindspore_add_pkg(jpeg_turbo
VER 2.0.4
LIBS jpeg
LIBS jpeg turbojpeg
URL https://github.com/libjpeg-turbo/libjpeg-turbo/archive/2.0.4.tar.gz
MD5 44c43e4a9fb352f47090804529317c88
CMAKE_OPTION -DCMAKE_BUILD_TYPE=Release -DCMAKE_SKIP_RPATH=TRUE
CMAKE_OPTION -DCMAKE_BUILD_TYPE=Release -DCMAKE_SKIP_RPATH=TRUE -DWITH_SIMD=ON
PATCHES ${CMAKE_SOURCE_DIR}/third_party/patch/jpeg_turbo/jpeg_turbo.patch001
)
include_directories(${jpeg_turbo_INC})
add_library(mindspore::jpeg_turbo ALIAS jpeg_turbo::jpeg)
add_library(mindspore::turbojpeg ALIAS jpeg_turbo::turbojpeg)


@@ -52,12 +52,6 @@ install(
COMPONENT mindspore
)
install(
TARGETS mindspore_gvar
DESTINATION ${INSTALL_LIB_DIR}
COMPONENT mindspore
)
if (USE_GLOG)
file(GLOB_RECURSE GLOG_LIB_LIST ${glog_LIBPATH}/libglog*)
install(
@@ -146,15 +140,6 @@ if (ENABLE_MPI)
COMPONENT mindspore
)
endif ()
file(GLOB_RECURSE MPI_LIB_LIST
${ompi_LIBPATH}/libmpi${CMAKE_SHARED_LIBRARY_SUFFIX}*
${ompi_LIBPATH}/libopen*${CMAKE_SHARED_LIBRARY_SUFFIX}*
)
install(
FILES ${MPI_LIB_LIST}
DESTINATION ${INSTALL_LIB_DIR}
COMPONENT mindspore
)
endif ()
if (ENABLE_GPU)

@@ -1 +1 @@
Subproject commit 377b2165184fbfbb32829266822438e439861f14
Subproject commit 622af6c1c50034bea5a08bd409c5a410782bfe53


@@ -22,7 +22,7 @@ from .parser import (Parser, create_obj_instance, generate_scope,
get_dataclass_attributes, get_dataclass_methods, get_obj_id,
get_module_namespace, get_obj_type, get_object_key,
get_parse_method_of_class, get_scope_name,
is_class_member, parse_cb, resolve_symbol)
is_class_member, parse_cb, resolve_symbol, convert_to_ms_tensor)
from .serialize import *
__all__ = ['parse_cb', 'get_parse_method_of_class', 'get_bprop_method_of_class', 'resolve_symbol',
@@ -30,4 +30,4 @@ __all__ = ['parse_cb', 'get_parse_method_of_class', 'get_bprop_method_of_class',
'get_obj_type', 'get_obj_id', 'create_obj_instance', 'get_module_namespace',
'get_class_member_namespace_symbol', 'get_obj_id', 'Parser', 'get_dataclass_attributes',
'get_dataclass_methods', 'dump_obj', 'load_obj', 'get_dataclass_methods', 'get_scope_name',
'create_slice_obj']
'create_slice_obj', 'convert_to_ms_tensor']


@@ -25,6 +25,7 @@ from dataclasses import is_dataclass
import asttokens
import mindspore.nn as nn
from mindspore import log as logger
from mindspore import Tensor as MsTensor
from mindspore import ops
from mindspore.common.dtype import pytype_to_dtype
from mindspore.common.api import _MindSporeFunction
@@ -316,6 +317,11 @@ def get_dataclass_methods(cls):
return methods
def convert_to_ms_tensor(data):
"""Convert C++ tensor to mindspore tensor."""
return MsTensor(data)
class Parser:
"""
Parser python code to ast tree.


@@ -130,7 +130,7 @@ set(SUB_COMP
frontend/operator
pipeline/jit
pipeline/pynative
common debug gvar pybind_api utils vm
common debug pybind_api utils vm
)
foreach (_comp ${SUB_COMP})
@@ -141,32 +141,21 @@ foreach (_comp ${SUB_COMP})
add_dependencies(_mindspore_${sub}_obj proto_input )
endif ()
endforeach ()
add_subdirectory(${CMAKE_SOURCE_DIR}/mindspore/core/base base)
list(APPEND SUB_OBJECTS_SRC $<TARGET_OBJECTS:_mindspore_base_obj>)
add_subdirectory(${CMAKE_SOURCE_DIR}/mindspore/core/abstract abstract)
list(APPEND SUB_OBJECTS_SRC $<TARGET_OBJECTS:_mindspore_abstract_obj>)
add_subdirectory(${CMAKE_SOURCE_DIR}/mindspore/core/utils util)
list(APPEND SUB_OBJECTS_SRC $<TARGET_OBJECTS:_mindspore_core_utils_obj>)
add_subdirectory(${CMAKE_SOURCE_DIR}/mindspore/core/ir ir)
list(APPEND SUB_OBJECTS_SRC $<TARGET_OBJECTS:_mindspore_ir_obj>)
add_dependencies(_mindspore_core_utils_obj _mindspore_base_obj _mindspore_ir_obj _mindspore_abstract_obj proto_input )
set_property(SOURCE ${SUB_OBJECTS_SRC} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_ME)
add_library(mindspore STATIC ${SUB_OBJECTS_SRC})
target_link_libraries(proto_input mindspore::protobuf)
target_link_libraries(mindspore mindspore_core)
if (ENABLE_DEBUGGER)
# debugger: link grpc
target_link_libraries(proto_input mindspore::grpc++)
endif()
target_link_libraries(mindspore proto_input)
if (ENABLE_MPI AND ENABLE_CPU)
target_link_libraries(mindspore securec mindspore::flatbuffers mpi_adapter)
else ()
target_link_libraries(mindspore securec mindspore::flatbuffers)
endif ()
target_link_libraries(mindspore securec mindspore::flatbuffers)
if (NOT WIN32)
target_link_libraries(mindspore dl)
@@ -242,7 +231,6 @@ set_target_properties(_c_expression PROPERTIES INSTALL_RPATH ${ORIGIN_PATH})
if (CMAKE_SYSTEM_NAME MATCHES "Windows")
target_link_libraries(mindspore mindspore::pybind11_module)
target_link_libraries(mindspore mindspore_gvar)
target_link_libraries(_c_expression PRIVATE -Wl,--whole-archive mindspore -Wl,--no-whole-archive)
else ()
if (ENABLE_CPU AND (ENABLE_D OR ENABLE_GPU))
@@ -253,7 +241,6 @@ else ()
endif()
target_link_libraries(_c_expression PRIVATE -Wl,--whole-archive mindspore -Wl,--no-whole-archive)
target_link_libraries(_c_expression PRIVATE mindspore::pybind11_module)
target_link_libraries(_c_expression PRIVATE mindspore_gvar)
endif ()
if (USE_GLOG)
@@ -297,7 +284,7 @@ add_library(inference SHARED
${LOAD_ONNX_SRC}
)
target_link_libraries(inference PRIVATE ${PYTHON_LIBRARIES} ${SECUREC_LIBRARY}
-Wl,--whole-archive mindspore -Wl,--no-whole-archive mindspore_gvar mindspore::protobuf)
-Wl,--whole-archive mindspore -Wl,--no-whole-archive mindspore::protobuf)
if (ENABLE_CPU)
target_link_libraries(inference PRIVATE mindspore::dnnl mindspore::mkldnn)


@@ -25,6 +25,7 @@
#include "backend/kernel_compiler/kernel.h"
#include "ir/dtype.h"
#include "ir/primitive.h"
#include "pybind11/pybind11.h"
#include <nlohmann/json.hpp>
#include "backend/kernel_compiler/common_utils.h"
#include "backend/kernel_compiler/oplib/oplib.h"


@@ -15,7 +15,7 @@
*/
#include "backend/kernel_compiler/cpu/allgather_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "runtime/device/cpu/mpi/mpi_adapter.h"
#include "runtime/device/cpu/mpi/mpi_interface.h"
#include "utils/log_adapter.h"
namespace mindspore {
@@ -45,9 +45,7 @@ bool AllGatherCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
auto input_addr = reinterpret_cast<float *>(inputs[0]->addr);
auto output_addr = reinterpret_cast<float *>(outputs[0]->addr);
auto input_data_num = inputs[0]->size / sizeof(float);
auto mpi_instance = device::cpu::MPIAdapter::Instance();
MS_EXCEPTION_IF_NULL(mpi_instance);
return mpi_instance->AllGather(input_addr, output_addr, ranks_group_, input_data_num);
return MPIAllGather(input_addr, output_addr, ranks_group_, input_data_num);
}
} // namespace kernel
} // namespace mindspore


@@ -16,7 +16,7 @@
#include <thread>
#include "backend/kernel_compiler/cpu/embedding_look_up_comm_grad_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "runtime/device/cpu/mpi/mpi_adapter.h"
#include "runtime/device/cpu/mpi/mpi_interface.h"
namespace mindspore {
namespace kernel {
@@ -49,11 +49,8 @@ bool EmbeddingLookUpCommGradCPUKernel::Launch(const std::vector<kernel::AddressP
const std::vector<int> &rank_group = {0, 1, 2, 3, 4, 5, 6, 7};
size_t input_split_lens = input_size / split_num_ / sizeof(float_t);
size_t output_split_lens = output_size / split_num_ / sizeof(float_t);
auto mpi_instance = device::cpu::MPIAdapter::Instance();
MS_EXCEPTION_IF_NULL(mpi_instance);
for (int i = 0; i < split_num_; i++) {
mpi_instance->AllGather(input_addr + i * input_split_lens, output_addr + i * output_split_lens, rank_group,
input_split_lens);
MPIAllGather(input_addr + i * input_split_lens, output_addr + i * output_split_lens, rank_group, input_split_lens);
}
#if defined(_WIN32) || defined(_WIN64)
auto end_time = std::chrono::steady_clock::now();


@@ -15,7 +15,7 @@
*/
#include "backend/kernel_compiler/cpu/reduce_scatter_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "runtime/device/cpu/mpi/mpi_adapter.h"
#include "runtime/device/cpu/mpi/mpi_interface.h"
#include "ir/primitive.h"
namespace mindspore {
@@ -24,7 +24,7 @@ namespace {
constexpr auto kRanksGroup = "group";
} // namespace
ReduceScatterCPUKernel::ReduceScatterCPUKernel() : op_type_(device::cpu::kOpTypeSum) {}
ReduceScatterCPUKernel::ReduceScatterCPUKernel() : op_type_(kMPIOpTypeSum) {}
void ReduceScatterCPUKernel::InitKernel(const CNodePtr &kernel_node) {
auto op = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("op");
@@ -46,9 +46,7 @@ bool ReduceScatterCPUKernel::Launch(const std::vector<kernel::AddressPtr> &input
auto input_addr = reinterpret_cast<float *>(inputs[0]->addr);
auto output_addr = reinterpret_cast<float *>(outputs[0]->addr);
auto output_data_num = outputs[0]->size / sizeof(float);
auto mpi_instance = device::cpu::MPIAdapter::Instance();
MS_EXCEPTION_IF_NULL(mpi_instance);
return mpi_instance->ReduceScatter(input_addr, output_addr, ranks_group_, output_data_num, op_type_);
return MPIReduceScatter(input_addr, output_addr, ranks_group_, output_data_num, op_type_);
}
} // namespace kernel
} // namespace mindspore


@@ -13,8 +13,9 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <thread>
#include "backend/kernel_compiler/cpu/sub_cpu_kernel.h"
#include <sys/time.h>
#include <thread>
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {


@@ -182,30 +182,59 @@ class ArrayReduceGpuKernel : public GpuKernel {
void InferInAndOutDesc(const std::vector<size_t> &input_shape, const std::vector<size_t> &output_shape) {
std::vector<int> inputA;
std::vector<size_t> outputC_shape = output_shape;
ShapeNdTo4d(input_shape, &inputA);
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(inputA_descriptor_, CUDNN_TENSOR_NCHW, data_type_, inputA[0],
inputA[1], inputA[2], inputA[3]),
"cudnnSetTensor4dDescriptor failed");
const int split_dim = 4;
if (input_shape.size() <= split_dim) {
ShapeNdTo4d(input_shape, &inputA);
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(inputA_descriptor_, CUDNN_TENSOR_NCHW, data_type_,
inputA[0], inputA[1], inputA[2], inputA[3]),
"cudnnSetTensor4dDescriptor failed");
} else {
CudnnSetTensorNdDescriptor(input_shape, inputA_descriptor_, data_type_);
for (auto dim : input_shape) {
inputA.emplace_back(SizeToInt(dim));
}
}
if (axis_[0] == -1) {
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetTensor4dDescriptor(outputC_descriptor_, CUDNN_TENSOR_NCHW, data_type_, 1, 1, 1, 1),
"cudnnSetTensor4dDescriptor failed");
if (inputA[0] == 1 && inputA[1] == 1 && inputA[2] == 1 && inputA[3] == 1) {
all_match_ = true;
outputC_shape.resize(input_shape.size(), 1);
if (outputC_shape.size() <= split_dim) {
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetTensor4dDescriptor(outputC_descriptor_, CUDNN_TENSOR_NCHW, data_type_, 1, 1, 1, 1),
"cudnnSetTensor4dDescriptor failed");
} else {
CudnnSetTensorNdDescriptor(outputC_shape, outputC_descriptor_, data_type_);
}
for (auto dim : inputA) {
if (dim != 1) {
return;
}
}
all_match_ = true;
return;
}
std::vector<int> outputC;
if (!keep_dims_) {
for (auto i : axis_) {
(void)(outputC_shape.insert(outputC_shape.begin() + i, 1));
}
}
std::vector<int> outputC;
ShapeNdTo4d(outputC_shape, &outputC);
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(outputC_descriptor_, CUDNN_TENSOR_NCHW, data_type_,
outputC[0], outputC[1], outputC[2], outputC[3]),
"cudnnSetTensor4dDescriptor failed");
if (outputC_shape.size() <= split_dim) {
ShapeNdTo4d(outputC_shape, &outputC);
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(outputC_descriptor_, CUDNN_TENSOR_NCHW, data_type_,
outputC[0], outputC[1], outputC[2], outputC[3]),
"cudnnSetTensor4dDescriptor failed");
} else {
CudnnSetTensorNdDescriptor(outputC_shape, outputC_descriptor_, data_type_);
for (auto dim : outputC_shape) {
outputC.emplace_back(SizeToInt(dim));
}
}
if (inputA == outputC) {
all_match_ = true;
}


@@ -69,6 +69,10 @@ class ScatterNdGpuFwdKernel : public GpuKernel {
memcpy_flag_ = true;
}
CHECK_CUDA_RET_WITH_EXCEPT(
cudaMemsetAsync(output, static_cast<T>(0.0), output_size_, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemSet failed in ScatterNdGpuFwdKernel::Launch.");
const size_t input_size = input_size_ / sizeof(T);
const size_t output_size = output_size_ / sizeof(T);
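
The cudaMemsetAsync added here is load-bearing: ScatterNd writes only the output positions named by the indices tensor, so every other element must already be zero before the scatter runs. A minimal host-side C++ sketch of that contract (scatter_nd_1d is a hypothetical helper, not the MindSpore API):

#include <algorithm>
#include <cstdio>
#include <vector>

// Scatter updates[i] into out[indices[i]], zero-filling first -- the same
// reason the kernel now memsets `output` before launching the scatter.
void scatter_nd_1d(const std::vector<size_t> &indices, const std::vector<float> &updates,
                   std::vector<float> *out) {
  std::fill(out->begin(), out->end(), 0.0f);  // analogous to cudaMemsetAsync
  for (size_t i = 0; i < indices.size(); ++i) {
    (*out)[indices[i]] += updates[i];  // duplicate indices accumulate
  }
}

int main() {
  std::vector<float> out(8);
  scatter_nd_1d({1, 3, 3}, {10.f, 20.f, 5.f}, &out);
  for (float v : out) printf("%g ", v);  // 0 10 0 25 0 0 0 0
  return 0;
}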


@@ -14,6 +14,7 @@
* limitations under the License.
*/
#include <vector>
#include "backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh"
#include "runtime/device/gpu/cuda_common.h"
@@ -54,6 +55,11 @@ struct RealDivFunc {
__device__ __forceinline__ S operator()(const T &lhs, const T &rhs) { return (lhs / rhs); }
};
template <typename T, typename S>
struct DivFunc {
__device__ __forceinline__ S operator()(const T &lhs, const T &rhs) { return (lhs / rhs); }
};
template <typename T, typename S>
struct MulFunc {
__device__ __forceinline__ S operator()(const T &lhs, const T &rhs) { return (lhs * rhs); }
@@ -77,7 +83,7 @@ struct FloorDivFunc {
template <>
struct FloorDivFunc<half, half> {
__device__ __forceinline__ half operator()(const half &lhs, const half &rhs) {
return __float2half(floor(__half2float(lhs)/ __half2float(rhs)));
return __float2half(floor(__half2float(lhs) / __half2float(rhs)));
}
};
@@ -95,7 +101,6 @@ struct AbsGradFunc {
}
};
template <>
struct PowerFunc<half, bool> {
// invalid branch
@@ -104,72 +109,100 @@ struct PowerFunc<half, bool> {
__device__ __forceinline__ int Index(const int &index, const int &dim) { return dim == 1 ? 0 : index; }
template <typename T, typename S, typename Func>
__device__ __forceinline__ void BroadcastOperator(const int &l0, const int &l1, const int &l2, const int &l3,
const int &r0, const int &r1, const int &r2, const int &r3,
const int &d0, const int &d1, const int &d2, const int &d3,
const T *input0, const T *input1, S *output) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < d0 * d1 * d2 * d3; pos += blockDim.x * gridDim.x) {
int i = pos / (d1 * d2 * d3) % d0;
int j = pos / (d2 * d3) % d1;
int k = pos / d3 % d2;
int l = pos % d3;
const int &l4, const int &l5, const int &l6, const int &r0,
const int &r1, const int &r2, const int &r3, const int &r4,
const int &r5, const int &r6, const int &d0, const int &d1,
const int &d2, const int &d3, const int &d4, const int &d5,
const int &d6, const T *input0, const T *input1, S *output) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < d0 * d1 * d2 * d3 * d4 * d5 * d6;
pos += blockDim.x * gridDim.x) {
int i = pos / (d1 * d2 * d3 * d4 * d5 * d6) % d0;
int j = pos / (d2 * d3 * d4 * d5 * d6) % d1;
int k = pos / (d3 * d4 * d5 * d6) % d2;
int l = pos / (d4 * d5 * d6) % d3;
int m = pos / (d5 * d6) % d4;
int n = pos / d6 % d5;
int o = pos % d6;
int l_index = Index(i, l0) * l1 * l2 * l3 + Index(j, l1) * l2 * l3 + Index(k, l2) * l3 + Index(l, l3);
int r_index = Index(i, r0) * r1 * r2 * r3 + Index(j, r1) * r2 * r3 + Index(k, r2) * r3 + Index(l, r3);
int l_index = Index(i, l0) * l1 * l2 * l3 * l4 * l5 * l6;
l_index += Index(j, l1) * l2 * l3 * l4 * l5 * l6;
l_index += Index(k, l2) * l3 * l4 * l5 * l6;
l_index += Index(l, l3) * l4 * l5 * l6;
l_index += Index(m, l4) * l5 * l6;
l_index += Index(n, l5) * l6;
l_index += Index(o, l6);
int r_index = Index(i, r0) * r1 * r2 * r3 * r4 * r5 * r6;
r_index += Index(j, r1) * r2 * r3 * r4 * r5 * r6;
r_index += Index(k, r2) * r3 * r4 * r5 * r6;
r_index += Index(l, r3) * r4 * r5 * r6;
r_index += Index(m, r4) * r5 * r6;
r_index += Index(n, r5) * r6;
r_index += Index(o, r6);
output[pos] = Func()(input0[l_index], input1[r_index]);
}
}
template <typename T, typename S>
__global__ void BroadcastKernel(const int l0, const int l1, const int l2, const int l3, const int r0, const int r1,
const int r2, const int r3, const int d0, const int d1, const int d2, const int d3,
enum BroadcastOpType op, const T *input0, const T *input1, S *output) {
__global__ void BroadcastKernel(const int l0, const int l1, const int l2, const int l3, const int l4, const int l5,
const int l6, const int r0, const int r1, const int r2, const int r3, const int r4,
const int r5, const int r6, const int d0, const int d1, const int d2, const int d3,
const int d4, const int d5, const int d6, enum BroadcastOpType op, const T *input0,
const T *input1, S *output) {
switch (op) {
case BROADCAST_TYPE_GREATER:
return BroadcastOperator<T, S, GreaterFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
output);
return BroadcastOperator<T, S, GreaterFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1,
d2, d3, d4, d5, d6, input0, input1, output);
case BROADCAST_TYPE_LESS:
return BroadcastOperator<T, S, LessFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
output);
return BroadcastOperator<T, S, LessFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1, d2,
d3, d4, d5, d6, input0, input1, output);
case BROADCAST_TYPE_MINIMUM:
return BroadcastOperator<T, S, MinimumFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
output);
return BroadcastOperator<T, S, MinimumFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1,
d2, d3, d4, d5, d6, input0, input1, output);
case BROADCAST_TYPE_MAXIMUM:
return BroadcastOperator<T, S, MaximumFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
output);
return BroadcastOperator<T, S, MaximumFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1,
d2, d3, d4, d5, d6, input0, input1, output);
case BROADCAST_TYPE_POWER:
return BroadcastOperator<T, S, PowerFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
output);
return BroadcastOperator<T, S, PowerFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1,
d2, d3, d4, d5, d6, input0, input1, output);
case BROADCAST_TYPE_REALDIV:
return BroadcastOperator<T, S, RealDivFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
output);
return BroadcastOperator<T, S, RealDivFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1,
d2, d3, d4, d5, d6, input0, input1, output);
case BROADCAST_TYPE_MUL:
return BroadcastOperator<T, S, MulFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
output);
return BroadcastOperator<T, S, MulFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1, d2,
d3, d4, d5, d6, input0, input1, output);
case BROADCAST_TYPE_SUB:
return BroadcastOperator<T, S, SubFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
output);
return BroadcastOperator<T, S, SubFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1, d2,
d3, d4, d5, d6, input0, input1, output);
case BROADCAST_TYPE_ADD:
return BroadcastOperator<T, S, AddFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
output);
return BroadcastOperator<T, S, AddFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1, d2,
d3, d4, d5, d6, input0, input1, output);
case BROADCAST_TYPE_FLOORDIV:
return BroadcastOperator<T, S, FloorDivFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
output);
return BroadcastOperator<T, S, FloorDivFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1,
d2, d3, d4, d5, d6, input0, input1, output);
case BROADCAST_TYPE_ABSGRAD:
return BroadcastOperator<T, S, AbsGradFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
output);
return BroadcastOperator<T, S, AbsGradFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1,
d2, d3, d4, d5, d6, input0, input1, output);
case BROADCAST_TYPE_DIV:
return BroadcastOperator<T, S, DivFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1, d2,
d3, d4, d5, d6, input0, input1, output);
}
}
template <typename T, typename S>
void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1, const int &r2,
const int &r3, const int &d0, const int &d1, const int &d2, const int &d3, enum BroadcastOpType op,
const T *input0, const T *input1, S *output, cudaStream_t stream) {
int size = d0 * d1 * d2 * d3;
BroadcastKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, op,
input0, input1, output);
void Broadcast(const std::vector<int> &lhs_shape, const std::vector<int> &rhs_shape,
const std::vector<int> &output_shape, enum BroadcastOpType op, const T *input0, const T *input1,
S *output, cudaStream_t stream) {
int size = 1;
for (auto d : output_shape) {
size *= d;
}
BroadcastKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(
lhs_shape[0], lhs_shape[1], lhs_shape[2], lhs_shape[3], lhs_shape[4], lhs_shape[5], lhs_shape[6], rhs_shape[0],
rhs_shape[1], rhs_shape[2], rhs_shape[3], rhs_shape[4], rhs_shape[5], rhs_shape[6], output_shape[0],
output_shape[1], output_shape[2], output_shape[3], output_shape[4], output_shape[5], output_shape[6], op, input0,
input1, output);
}
template <typename T, typename S, typename Func>
@@ -205,6 +238,8 @@ __global__ void NoBroadcastKernel(const int nums, enum BroadcastOpType op, const
return NoBroadcastOperator<T, S, FloorDivFunc<T, S>>(nums, input0, input1, output);
case BROADCAST_TYPE_ABSGRAD:
return NoBroadcastOperator<T, S, AbsGradFunc<T, S>>(nums, input0, input1, output);
case BROADCAST_TYPE_DIV:
return NoBroadcastOperator<T, S, DivFunc<T, S>>(nums, input0, input1, output);
}
}
@@ -215,8 +250,8 @@ void NoBroadcast(const int &nums, enum BroadcastOpType op, const T *input0, cons
}
template <typename T>
__global__ void BroadcastToKernel(const int i0, const int i1, const int i2, const int i3, const int o0,
const int o1, const int o2, const int o3, const T *input_addr, T *output_addr) {
__global__ void BroadcastToKernel(const int i0, const int i1, const int i2, const int i3, const int o0, const int o1,
const int o2, const int o3, const T *input_addr, T *output_addr) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < o0 * o1 * o2 * o3; pos += blockDim.x * gridDim.x) {
int i = pos / (o1 * o2 * o3) % o0;
int j = pos / (o2 * o3) % o1;
@@ -233,33 +268,27 @@ void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, con
const int &o2, const int &o3, const T *input_addr, T *output_addr, cudaStream_t stream) {
int nums = o0 * o1 * o2 * o3;
BroadcastToKernel<<<GET_BLOCKS(nums), GET_THREADS, 0, stream>>>(i0, i1, i2, i3, o0, o1, o2, o3, input_addr,
output_addr);
output_addr);
}
template void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
enum BroadcastOpType op, const float *input0, const float *input1, bool *output,
cudaStream_t stream);
template void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
enum BroadcastOpType op, const float *input0, const float *input1, float *output,
cudaStream_t stream);
template void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
enum BroadcastOpType op, const half *input0, const half *input1, bool *output,
cudaStream_t stream);
template void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
enum BroadcastOpType op, const half *input0, const half *input1, half *output,
cudaStream_t stream);
template void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
enum BroadcastOpType op, const int *input0, const int *input1, int *output,
cudaStream_t stream);
template void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
enum BroadcastOpType op, const int *input0, const int *input1, bool *output,
cudaStream_t stream);
template void Broadcast(const std::vector<int> &lhs_shape, const std::vector<int> &rhs_shape,
const std::vector<int> &output_shape, enum BroadcastOpType op, const float *input0,
const float *input1, bool *output, cudaStream_t stream);
template void Broadcast(const std::vector<int> &lhs_shape, const std::vector<int> &rhs_shape,
const std::vector<int> &output_shape, enum BroadcastOpType op, const float *input0,
const float *input1, float *output, cudaStream_t stream);
template void Broadcast(const std::vector<int> &lhs_shape, const std::vector<int> &rhs_shape,
const std::vector<int> &output_shape, enum BroadcastOpType op, const half *input0,
const half *input1, bool *output, cudaStream_t stream);
template void Broadcast(const std::vector<int> &lhs_shape, const std::vector<int> &rhs_shape,
const std::vector<int> &output_shape, enum BroadcastOpType op, const half *input0,
const half *input1, half *output, cudaStream_t stream);
template void Broadcast(const std::vector<int> &lhs_shape, const std::vector<int> &rhs_shape,
const std::vector<int> &output_shape, enum BroadcastOpType op, const int *input0,
const int *input1, int *output, cudaStream_t stream);
template void Broadcast(const std::vector<int> &lhs_shape, const std::vector<int> &rhs_shape,
const std::vector<int> &output_shape, enum BroadcastOpType op, const int *input0,
const int *input1, bool *output, cudaStream_t stream);
template void NoBroadcast(const int &nums, enum BroadcastOpType op, const float *input0, const float *input1,
bool *output, cudaStream_t stream);
template void NoBroadcast(const int &nums, enum BroadcastOpType op, const float *input0, const float *input1,
@@ -268,10 +297,10 @@ template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half *
bool *output, cudaStream_t stream);
template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half *input0, const half *input1,
half *output, cudaStream_t stream);
template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1,
int *output, cudaStream_t stream);
template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1,
bool *output, cudaStream_t stream);
template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1, int *output,
cudaStream_t stream);
template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1, bool *output,
cudaStream_t stream);
template void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1,
const int &o2, const int &o3, const float *input_addr, float *output_addr,
cudaStream_t stream);
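
The unrolled d0..d6 arithmetic in BroadcastOperator above maps each flat output position to seven coordinates, then clamps any coordinate whose input dimension is 1 back to index 0. A host-side C++ restatement of that mapping (BroadcastOffset is an illustrative helper, not part of the kernel):

#include <array>
#include <cstdio>

// Same clamp the device code uses: a size-1 dimension always contributes 0.
int Index(int i, int dim) { return dim == 1 ? 0 : i; }

// Translate a flat position in the output tensor into a flat position in a
// (possibly broadcast) input tensor, peeling coordinates off innermost-first.
int BroadcastOffset(const std::array<int, 7> &in, const std::array<int, 7> &out, int pos) {
  int offset = 0, stride = 1;
  for (int d = 6; d >= 0; --d) {
    int coord = pos % out[d];
    pos /= out[d];
    offset += Index(coord, in[d]) * stride;
    stride *= in[d];
  }
  return offset;
}

int main() {
  std::array<int, 7> lhs{1, 1, 1, 1, 1, 3, 1}, out{1, 1, 1, 1, 1, 3, 4};
  for (int pos = 0; pos < 12; ++pos) printf("%d ", BroadcastOffset(lhs, out, pos));
  // 0 0 0 0 1 1 1 1 2 2 2 2 -- each lhs element is reused along the last axis
  return 0;
}

This is also why padding shapes with 1s up to seven dimensions is harmless: size-1 dimensions drop out of the index math entirely.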


@@ -17,6 +17,7 @@
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_
#include <vector>
#include "runtime/device/gpu/cuda_common.h"
enum BroadcastOpType {
@@ -31,13 +32,14 @@ enum BroadcastOpType {
BROADCAST_TYPE_ADD = 8,
BROADCAST_TYPE_FLOORDIV = 9,
BROADCAST_TYPE_ABSGRAD = 10,
BROADCAST_TYPE_DIV = 11,
BROADCAST_TYPE_INVALID = 0xffffffff,
};
template <typename T, typename S>
void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1, const int &r2,
const int &r3, const int &d0, const int &d1, const int &d2, const int &d3, enum BroadcastOpType op,
const T *input0, const T *input1, S *output, cudaStream_t stream);
void Broadcast(const std::vector<int> &lhs_shape, const std::vector<int> &rhs_shape,
const std::vector<int> &output_shape, enum BroadcastOpType op, const T *input0, const T *input1,
S *output, cudaStream_t stream);
template <typename T, typename S>
void NoBroadcast(const int &size, enum BroadcastOpType op, const T *input0, const T *input1, S *output,


@@ -25,10 +25,10 @@ __global__ void CheckValidKernel(const size_t size, const T *box, const T *img_m
const size_t right_y = i * 4 + 3;
S valid_flag = false;
valid_flag |= !(box[left_x] >= 0.f);
valid_flag |= !(box[left_y] >= 0.f);
valid_flag |= !(img_metas[0] * img_metas[2] - 1.f >= box[right_x]);
valid_flag |= !(img_metas[1] * img_metas[2] - 1.f >= box[right_y]);
valid_flag |= !(box[left_x] >= static_cast<T>(0.0));
valid_flag |= !(box[left_y] >= static_cast<T>(0.0));
valid_flag |= !(img_metas[1] * img_metas[2] - static_cast<T>(1.0) >= box[right_x]);
valid_flag |= !(img_metas[0] * img_metas[2] - static_cast<T>(1.0) >= box[right_y]);
valid[i] = !valid_flag;
}
@@ -43,3 +43,5 @@ void CheckValid(const size_t &size, const T *box, const T *img_metas, S *valid,
template void CheckValid(const size_t &size, const float *box, const float *img_metas, bool *valid,
cudaStream_t cuda_stream);
template void CheckValid(const size_t &size, const half *box, const half *img_metas, bool *valid,
cudaStream_t cuda_stream);


@@ -18,12 +18,85 @@
#include "runtime/device/gpu/cuda_common.h"
template <typename T>
__global__ void CumSumKernel(T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride,
__global__ void Copy(T *input, T *output, size_t size) {
size_t step = blockDim.x * gridDim.x;
for (size_t write_index = blockIdx.x * blockDim.x + threadIdx.x; write_index < size; write_index += step) {
input[write_index] = output[write_index];
}
}
template <typename T>
__global__ void LeftMove(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride,
size_t stride2) {
size_t num = dim0 * dim2;
size_t i, k, offset;
size_t step = blockDim.x * gridDim.x;
for (size_t write_index = blockIdx.x * blockDim.x + threadIdx.x; write_index < num; write_index += step) {
i = write_index / dim2 % dim0;
k = write_index % dim2;
offset = i * stride + k;
for (size_t j = 0; j < dim1; ++j) {
size_t read_index = j * stride2 + offset;
if (j == 0) {
output[read_index] = 0;
} else {
size_t read_index2 = (j - 1) * stride2 + offset;
output[read_index] = input[read_index2];
}
}
}
}
template <typename T>
__global__ void RightMove(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride,
size_t stride2) {
size_t num = dim0 * dim2;
size_t i, k, offset;
size_t step = blockDim.x * gridDim.x;
for (size_t write_index = blockIdx.x * blockDim.x + threadIdx.x; write_index < num; write_index += step) {
i = write_index / dim2 % dim0;
k = write_index % dim2;
offset = i * stride + k;
for (int j = dim1 - 1; j >= 0; --j) {
size_t read_index = j * stride2 + offset;
if (j == dim1 - 1) {
output[read_index] = 0;
} else {
size_t read_index2 = (j + 1) * stride2 + offset;
output[read_index] = input[read_index2];
}
}
}
}
template <typename T>
__global__ void CumSumKernelReverse(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride,
size_t stride2) {
size_t num = dim0 * dim2;
size_t i, k, offset;
size_t step = blockDim.x * gridDim.x;
for (size_t write_index = blockIdx.x * blockDim.x + threadIdx.x; write_index < num; write_index += step) {
i = write_index / dim2 % dim0;
k = write_index % dim2;
offset = i * stride + k;
for (int j = dim1 - 1; j >= 0; --j) {
size_t read_index = j * stride2 + offset;
if (j == dim1 - 1) {
output[read_index] = input[read_index];
} else {
size_t read_index2 = (j + 1) * stride2 + offset;
output[read_index] = output[read_index2] + input[read_index];
}
}
}
}
template <typename T>
__global__ void CumSumKernel(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride,
size_t stride2) {
size_t num = dim0 * dim2;
size_t i, k, offset;
for (size_t write_index = blockIdx.x * blockDim.x + threadIdx.x; write_index < num;
write_index += blockDim.x * gridDim.x) {
size_t step = blockDim.x * gridDim.x;
for (size_t write_index = blockIdx.x * blockDim.x + threadIdx.x; write_index < num; write_index += step) {
i = write_index / dim2 % dim0;
k = write_index % dim2;
offset = i * stride + k;
@@ -39,12 +112,32 @@ __global__ void CumSumKernel(T *input, T *output, size_t dim0, size_t dim1, size
}
}
template <typename T>
void CumSum(T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2,
cudaStream_t stream) {
void CumSum(const T *input, T *output, T *workspace, size_t dim0, size_t dim1, size_t dim2, size_t stride,
size_t stride2, bool exclusive_, bool reverse_, cudaStream_t stream) {
int size = dim0 * dim2;
CumSumKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, stride, stride2);
if (exclusive_) {
if (reverse_) {
RightMove<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, stride, stride2);
Copy<<<GET_BLOCKS(size * dim1), GET_THREADS, 0, stream>>>(workspace, output, size * dim1);
CumSumKernelReverse<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(workspace, output, dim0, dim1, dim2, stride,
stride2);
} else {
LeftMove<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, stride, stride2);
Copy<<<GET_BLOCKS(size * dim1), GET_THREADS, 0, stream>>>(workspace, output, size * dim1);
CumSumKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(workspace, output, dim0, dim1, dim2, stride, stride2);
}
} else {
if (reverse_) {
CumSumKernelReverse<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, stride,
stride2);
} else {
CumSumKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, stride, stride2);
}
}
return;
}
template void CumSum<float>(float *input, float *output, size_t dim0, size_t dim1, size_t dim2, size_t stride,
size_t stride2, cudaStream_t stream);
template void CumSum<float>(const float *input, float *output, float *workspace, size_t dim0, size_t dim1, size_t dim2,
size_t stride, size_t stride2, bool exclusive_, bool reverse_, cudaStream_t stream);
template void CumSum<half>(const half *input, half *output, half *workspace, size_t dim0, size_t dim1, size_t dim2,
size_t stride, size_t stride2, bool exclusive_, bool reverse_, cudaStream_t stream);
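
The (exclusive_, reverse_) flags select among four scans; the implementation above realizes the exclusive variants by shifting the input one step with LeftMove/RightMove, copying the shifted data into the workspace, and then running the inclusive scan over that copy. A 1-D C++ model of the expected results (CumSumRef is a hypothetical reference, not the device code):

#include <cstdio>
#include <vector>

// exclusive: the running sum lags the current element by one position.
// reverse: accumulate from the back of the axis instead of the front.
std::vector<int> CumSumRef(const std::vector<int> &in, bool exclusive, bool reverse) {
  const int n = static_cast<int>(in.size());
  std::vector<int> out(n);
  int acc = 0;
  for (int s = 0; s < n; ++s) {
    const int j = reverse ? n - 1 - s : s;  // scan direction
    out[j] = exclusive ? acc : acc + in[j];
    acc += in[j];
  }
  return out;
}

int main() {
  const std::vector<int> x{1, 2, 3, 4};
  for (int v : CumSumRef(x, false, false)) printf("%d ", v);  // 1 3 6 10
  for (int v : CumSumRef(x, true, false)) printf("%d ", v);   // 0 1 3 6
  for (int v : CumSumRef(x, true, true)) printf("%d ", v);    // 9 7 4 0
  return 0;
}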


@@ -17,6 +17,6 @@
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUMSUM_IMPL_CUH_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUMSUM_IMPL_CUH_
template <typename T>
void CumSum(T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2,
cudaStream_t stream);
void CumSum(const T *input, T *output, T *workspace, size_t dim0, size_t dim1, size_t dim2, size_t stride,
size_t stride2, bool exclusive_, bool reverse_, cudaStream_t stream);
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUMSUM_IMPL_CUH_


@@ -16,27 +16,26 @@
#include "backend/kernel_compiler/gpu/cuda_impl/iou_impl.cuh"
template <typename T>
__device__ T CoordinateMax(const T a, const T b) {
__device__ float CoordinateMax(const float a, const float b) {
return (a > b ? a : b);
}
template <typename T>
__device__ T CoordinateMin(const T a, const T b) {
__device__ float CoordinateMin(const float a, const float b) {
return (a < b ? a : b);
}
template <typename T>
__global__ void IOUKernel(const size_t size, const T *box1, const T *box2, T *iou_results, const size_t mode,
const size_t input_len_0) {
T location_coordinate[IOU_LOCATION_NUM][IOU_DIMENSION];
T overlaps_coordinate[IOU_DIMENSION];
const T epsilon = 1e-10;
float location_coordinate[IOU_LOCATION_NUM][IOU_DIMENSION];
float overlaps_coordinate[IOU_DIMENSION];
const float epsilon = 1e-10;
const float offset = 1.0;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) {
for (size_t j = 0; j < IOU_DIMENSION; j++) {
location_coordinate[0][j] = box1[(i % input_len_0) * IOU_DIMENSION + j];
location_coordinate[1][j] = box2[(i / input_len_0) * IOU_DIMENSION + j];
location_coordinate[0][j] = static_cast<float>(box1[(i % input_len_0) * IOU_DIMENSION + j]);
location_coordinate[1][j] = static_cast<float>(box2[(i / input_len_0) * IOU_DIMENSION + j]);
}
overlaps_coordinate[0] = CoordinateMax(location_coordinate[0][0], location_coordinate[1][0]);
@@ -44,18 +43,18 @@ __global__ void IOUKernel(const size_t size, const T *box1, const T *box2, T *io
overlaps_coordinate[2] = CoordinateMin(location_coordinate[0][2], location_coordinate[1][2]);
overlaps_coordinate[3] = CoordinateMin(location_coordinate[0][3], location_coordinate[1][3]);
T overlaps_w = CoordinateMax(0.f, overlaps_coordinate[2] - overlaps_coordinate[0] + 1);
T overlaps_h = CoordinateMax(0.f, overlaps_coordinate[3] - overlaps_coordinate[1] + 1);
T overlaps = overlaps_w * overlaps_h;
float overlaps_w = CoordinateMax(0.0, overlaps_coordinate[2] - overlaps_coordinate[0] + offset);
float overlaps_h = CoordinateMax(0.0, overlaps_coordinate[3] - overlaps_coordinate[1] + offset);
float overlaps = overlaps_w * overlaps_h;
T area1 = (location_coordinate[0][2] - location_coordinate[0][0] + 1) * (location_coordinate[0][3] -
location_coordinate[0][1] + 1);
T area2 = (location_coordinate[1][2] - location_coordinate[1][0] + 1) * (location_coordinate[1][3] -
location_coordinate[1][1] + 1);
float area1 = (location_coordinate[0][2] - location_coordinate[0][0] + offset) * (location_coordinate[0][3] -
location_coordinate[0][1] + offset);
float area2 = (location_coordinate[1][2] - location_coordinate[1][0] + offset) * (location_coordinate[1][3] -
location_coordinate[1][1] + offset);
if (mode == 0) {
iou_results[i] = overlaps / (area1 + area2 - overlaps + epsilon);
iou_results[i] = static_cast<T>(overlaps / (area1 + area2 - overlaps + epsilon));
} else {
iou_results[i] = overlaps / (area2 + epsilon);
iou_results[i] = static_cast<T>(overlaps / (area2 + epsilon));
}
}
@@ -70,3 +69,5 @@ void IOU(const size_t &size, const T *box1, const T *box2, T *iou_results, const
template void IOU(const size_t &size, const float *box1, const float *box2, float *iou_results, const size_t &mode,
const size_t &input_len_0, cudaStream_t cuda_stream);
template void IOU(const size_t &size, const half *box1, const half *box2, half *iou_results, const size_t &mode,
const size_t &input_len_0, cudaStream_t cuda_stream);
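
The rewrite above keeps half inputs but performs every intermediate step in float: in half precision the 1e-10 epsilon underflows to zero and the area products lose significant bits. A plain C++ reference of the box math with the same +1 offset and epsilon (IouRef is illustrative; boxes are {x1, y1, x2, y2}):

#include <cstdio>

float IouRef(const float b1[4], const float b2[4], bool iou_mode) {
  const float eps = 1e-10f, off = 1.0f;  // pixel-inclusive box extents
  const float ix1 = b1[0] > b2[0] ? b1[0] : b2[0];
  const float iy1 = b1[1] > b2[1] ? b1[1] : b2[1];
  const float ix2 = b1[2] < b2[2] ? b1[2] : b2[2];
  const float iy2 = b1[3] < b2[3] ? b1[3] : b2[3];
  const float iw = ix2 - ix1 + off, ih = iy2 - iy1 + off;
  const float inter = (iw > 0 ? iw : 0) * (ih > 0 ? ih : 0);
  const float a1 = (b1[2] - b1[0] + off) * (b1[3] - b1[1] + off);
  const float a2 = (b2[2] - b2[0] + off) * (b2[3] - b2[1] + off);
  return iou_mode ? inter / (a1 + a2 - inter + eps)  // intersection over union
                  : inter / (a2 + eps);              // intersection over foreground
}

int main() {
  const float a[4] = {0, 0, 9, 9}, b[4] = {5, 5, 14, 14};
  printf("%f\n", IouRef(a, b, true));  // 25 / (100 + 100 - 25) ~= 0.142857
  return 0;
}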


@@ -84,6 +84,40 @@ class GpuKernel : public KernelMod {
}
}
// set the tensor descriptor for cudnn/cublas
void CudnnSetTensorNdDescriptor(const std::vector<size_t> &shape, cudnnTensorDescriptor_t descriptor,
cudnnDataType_t data_type) {
if (shape.size() < 3) {
MS_EXCEPTION(ValueError) << "cudnnSetTensorNdDescriptor doesn't support " << shape.size() << "D.";
}
const int nbDims = shape.size();
int *dim = new (std::nothrow) int[nbDims];
if (dim == nullptr) {
MS_LOG(EXCEPTION) << "malloc dim failed.";
}
int *stride = new (std::nothrow) int[nbDims];
if (stride == nullptr) {
MS_LOG(EXCEPTION) << "malloc stride failed.";
}
for (int i = 0; i < nbDims; i++) {
dim[i] = SizeToInt(shape[i]);
stride[i] = 1;
}
for (int i = nbDims - 2; i >= 0; i--) {
stride[i] = stride[i + 1] * SizeToInt(shape[i + 1]);
}
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(descriptor, data_type, nbDims, dim, stride),
"cudnnSetTensorNdDescriptor failed");
delete[] dim;
dim = nullptr;
delete[] stride;
stride = nullptr;
}
// choose the suitable datatype for cudnn/cublas
inline cudnnDataType_t GetCudnnDataType(const std::string &Type) {
auto type = kCudnnDtypeMap.find(Type);
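
CudnnSetTensorNdDescriptor above assumes a densely packed layout, so the stride of dimension i is the product of all dimensions to its right. The same computation in isolation (PackedStrides is a hypothetical helper; a std::vector would also spare the manual new[]/delete[] above):

#include <cstdio>
#include <vector>

// Dense strides: the innermost dim has stride 1, and each outer dim's stride
// is the product of every dimension after it.
std::vector<int> PackedStrides(const std::vector<int> &shape) {
  std::vector<int> stride(shape.size(), 1);
  for (int i = static_cast<int>(shape.size()) - 2; i >= 0; --i) {
    stride[i] = stride[i + 1] * shape[i + 1];
  }
  return stride;
}

int main() {
  for (int s : PackedStrides({2, 3, 4, 5, 6})) printf("%d ", s);
  // 360 120 30 6 1 -- stepping the first index skips 3*4*5*6 = 360 elements
  return 0;
}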


@@ -59,6 +59,9 @@ MS_REG_GPU_KERNEL_TWO(
AbsGrad,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
BroadcastOpGpuKernel, float, float)
MS_REG_GPU_KERNEL_TWO(
Div, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
BroadcastOpGpuKernel, float, float)
// fp16
MS_REG_GPU_KERNEL_TWO(
@@ -101,6 +104,9 @@ MS_REG_GPU_KERNEL_TWO(
AbsGrad,
KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
BroadcastOpGpuKernel, half, half)
MS_REG_GPU_KERNEL_TWO(
Div, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
BroadcastOpGpuKernel, half, half)
// int32
MS_REG_GPU_KERNEL_TWO(
@@ -118,14 +124,14 @@ MS_REG_GPU_KERNEL_TWO(
MS_REG_GPU_KERNEL_TWO(
Mul, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
BroadcastOpGpuKernel, int, int)
MS_REG_GPU_KERNEL_TWO(
RealDiv, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
BroadcastOpGpuKernel, int, int)
MS_REG_GPU_KERNEL_TWO(
FloorDiv, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
BroadcastOpGpuKernel, int, int)
MS_REG_GPU_KERNEL_TWO(
AbsGrad, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
BroadcastOpGpuKernel, int, int)
MS_REG_GPU_KERNEL_TWO(
Div, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
BroadcastOpGpuKernel, int, int)
} // namespace kernel
} // namespace mindspore


@@ -27,6 +27,7 @@
#include "backend/kernel_compiler/gpu/kernel_constants.h"
namespace mindspore {
namespace kernel {
constexpr int MAX_DIMS = 7;
template <typename T, typename S>
class BroadcastOpGpuKernel : public GpuKernel {
public:
@@ -45,9 +46,8 @@ class BroadcastOpGpuKernel : public GpuKernel {
S *output = GetDeviceAddress<S>(outputs, 0);
if (need_broadcast_) {
Broadcast(lhs_shape_[0], lhs_shape_[1], lhs_shape_[2], lhs_shape_[3], rhs_shape_[0], rhs_shape_[1], rhs_shape_[2],
rhs_shape_[3], output_shape_[0], output_shape_[1], output_shape_[2], output_shape_[3], op_type_, lhs,
rhs, output, reinterpret_cast<cudaStream_t>(stream_ptr));
Broadcast(lhs_shape_, rhs_shape_, output_shape_, op_type_, lhs, rhs, output,
reinterpret_cast<cudaStream_t>(stream_ptr));
} else {
NoBroadcast(output_num_, op_type_, lhs, rhs, output, reinterpret_cast<cudaStream_t>(stream_ptr));
}
@@ -60,10 +60,13 @@ class BroadcastOpGpuKernel : public GpuKernel {
auto shape2 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
auto shape3 = AnfAlgo::GetOutputInferShape(kernel_node, 0);
need_broadcast_ = IsBroadcast(shape1, shape2);
if (need_broadcast_ && shape1.size() > 4) {
MS_LOG(EXCEPTION) << "Broadcast operation not support dim greater than 4";
if (need_broadcast_ && shape1.size() > 7) {
MS_LOG(EXCEPTION) << "Broadcast operation not support dim greater than 7";
}
lhs_shape_.resize(MAX_DIMS, 1);
rhs_shape_.resize(MAX_DIMS, 1);
output_shape_.resize(MAX_DIMS, 1);
for (size_t i = 0; i < shape3.size(); i++) {
output_shape_[i] = shape3[i];
output_num_ *= shape3[i];
@@ -99,7 +102,7 @@ class BroadcastOpGpuKernel : public GpuKernel {
{"Greater", BROADCAST_TYPE_GREATER}, {"Less", BROADCAST_TYPE_LESS}, {"Maximum", BROADCAST_TYPE_MAXIMUM},
{"Minimum", BROADCAST_TYPE_MINIMUM}, {"Pow", BROADCAST_TYPE_POWER}, {"RealDiv", BROADCAST_TYPE_REALDIV},
{"Mul", BROADCAST_TYPE_MUL}, {"Sub", BROADCAST_TYPE_SUB}, {"TensorAdd", BROADCAST_TYPE_ADD},
{"FloorDiv", BROADCAST_TYPE_FLOORDIV}, {"AbsGrad", BROADCAST_TYPE_ABSGRAD},
{"FloorDiv", BROADCAST_TYPE_FLOORDIV}, {"AbsGrad", BROADCAST_TYPE_ABSGRAD}, {"Div", BROADCAST_TYPE_DIV},
};
auto iter = kBroadcastTypeMap.find(kernel_name);
@@ -127,9 +130,9 @@ class BroadcastOpGpuKernel : public GpuKernel {
int input1_num_;
int input2_num_;
int output_num_;
int lhs_shape_[4] = {1, 1, 1, 1};
int rhs_shape_[4] = {1, 1, 1, 1};
int output_shape_[4] = {1, 1, 1, 1};
std::vector<int> lhs_shape_;
std::vector<int> rhs_shape_;
std::vector<int> output_shape_;
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
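
Switching the shape members to std::vector pairs with the resize(MAX_DIMS, 1) calls earlier in this class: every shape is padded out to seven entries with 1s so one fixed-arity 7-D kernel serves any rank up to 7. The padding step in isolation (PadShape is illustrative):

#include <cstdio>
#include <vector>

// Pad a shape to max_dims entries; the trailing 1s are broadcast no-ops in
// the kernel's index math, so low-rank tensors reuse the 7-D launch path.
std::vector<int> PadShape(const std::vector<int> &shape, size_t max_dims = 7) {
  std::vector<int> padded(max_dims, 1);
  for (size_t i = 0; i < shape.size() && i < max_dims; ++i) padded[i] = shape[i];
  return padded;
}

int main() {
  for (int d : PadShape({2, 3, 4})) printf("%d ", d);  // 2 3 4 1 1 1 1
  return 0;
}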


@@ -20,5 +20,7 @@ namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_ONE(CumSum, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
CumSumGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(CumSum, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
CumSumGpuKernel, half)
} // namespace kernel
} // namespace mindspore


@@ -27,7 +27,7 @@ namespace kernel {
template <typename T>
class CumSumGpuKernel : public GpuKernel {
public:
CumSumGpuKernel() : axis_(0), input_size_0_(0), stride_(0), stride2_(0) {}
CumSumGpuKernel() : exclusive_(false), reverse_(false), axis_(0), input_size_0_(0), stride_(0), stride2_(0) {}
~CumSumGpuKernel() = default;
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -38,7 +38,8 @@ class CumSumGpuKernel : public GpuKernel {
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
T *input_addr = GetDeviceAddress<T>(inputs, 0);
T *output_addr = GetDeviceAddress<T>(outputs, 0);
CumSum(input_addr, output_addr, dims_[0], dims_[1], dims_[2], stride_, stride2_,
T *ws_addr = GetDeviceAddress<T>(workspace, 0);
CumSum(input_addr, output_addr, ws_addr, dims_[0], dims_[1], dims_[2], stride_, stride2_, exclusive_, reverse_,
reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}
@@ -51,6 +52,8 @@ class CumSumGpuKernel : public GpuKernel {
input_size_0_ = sizeof(T);
shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
axis_ = GetAttr<int>(kernel_node, "axis");
exclusive_ = GetAttr<bool>(kernel_node, "exclusive");
reverse_ = GetAttr<bool>(kernel_node, "reverse");
int input_dim_length = SizeToInt(shape_.size());
if (axis_ >= input_dim_length) {
MS_LOG(EXCEPTION) << "Axis out of bounds.";
@@ -70,6 +73,7 @@ class CumSumGpuKernel : public GpuKernel {
void InitSizeLists() override {
input_size_list_.push_back(input_size_0_);
output_size_list_.push_back(input_size_0_);
workspace_size_list_.push_back(input_size_0_);
}
private:
@@ -87,6 +91,8 @@ class CumSumGpuKernel : public GpuKernel {
stride2_ = dims_[2];
return;
}
bool exclusive_;
bool reverse_;
int axis_;
size_t input_size_0_;
size_t stride_;


@@ -83,12 +83,19 @@ class ActivationGpuFwdKernel : public GpuKernel {
return true;
}
std::vector<int> shape;
ShapeNdTo4d(input_shape, &shape);
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetActivationDescriptor(activation_desc_, mode_, CUDNN_NOT_PROPAGATE_NAN, 0.0),
"cudnnSetActivationDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(data_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_,
shape[0], shape[1], shape[2], shape[3]),
"cudnnSetTensor4dDescriptor failed");
const int split_dim = 4;
if (input_shape.size() <= split_dim) {
ShapeNdTo4d(input_shape, &shape);
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(data_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_,
shape[0], shape[1], shape[2], shape[3]),
"cudnnSetTensor4dDescriptor failed");
} else {
CudnnSetTensorNdDescriptor(input_shape, data_descriptor_, cudnn_data_type_);
}
InitSizeLists();
return true;
}


@@ -90,12 +90,18 @@ class ActivationGradGpuKernel : public GpuKernel {
return true;
}
std::vector<int> shape;
ShapeNdTo4d(input_shape, &shape);
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetActivationDescriptor(activation_desc_, mode_, CUDNN_PROPAGATE_NAN, 0.0),
"SetActivationDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(data_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_,
shape[0], shape[1], shape[2], shape[3]),
"SetTensor4dDescriptor failed");
const int split_dim = 4;
if (input_shape.size() <= split_dim) {
ShapeNdTo4d(input_shape, &shape);
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(data_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_,
shape[0], shape[1], shape[2], shape[3]),
"SetTensor4dDescriptor failed");
} else {
CudnnSetTensorNdDescriptor(input_shape, data_descriptor_, cudnn_data_type_);
}
InitSizeLists();
return true;


@@ -54,12 +54,18 @@ class DropoutGpuFwdKernel : public GpuKernel {
float *mask_f = GetDeviceAddress<float>(workspace, 0);
if (!states_init_) {
curandCreateGenerator(&mask_generator_, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(mask_generator_, time(NULL));
CHECK_CURAND_RET_WITH_EXCEPT(curandCreateGenerator(&mask_generator_, CURAND_RNG_PSEUDO_DEFAULT),
"Failed to create generator");
CHECK_CURAND_RET_WITH_EXCEPT(curandSetPseudoRandomGeneratorSeed(mask_generator_, time(NULL)),
"Failed to SetPseudoRandomGeneratorSeed");
MS_EXCEPTION_IF_NULL(mask_generator_);
states_init_ = true;
}
CHECK_CURAND_RET_WITH_EXCEPT(curandSetStream(mask_generator_, reinterpret_cast<cudaStream_t>(stream_ptr)),
"Failed to set stream for generator");
// curandGen only support float or double for mask.
curandGenerateUniform(mask_generator_, mask_f, num_count_);
CHECK_CURAND_RET_WITH_EXCEPT(curandGenerateUniform(mask_generator_, mask_f, num_count_),
"Failed to generate uniform");
DropoutForward(input, mask, output, mask_f, num_count_, keep_prob_, reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
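
The change above wraps each curand call in CHECK_CURAND_RET_WITH_EXCEPT instead of discarding the returned status; an ignored failure would otherwise surface only later, when the mask buffer is consumed. A stripped-down sketch of that macro shape (stand-in names and status codes, not the MindSpore macro):

#include <cstdio>
#include <cstdlib>

enum Status { kStatusSuccess = 0, kStatusFailure = 1 };

// Evaluate a status-returning call once; abort with context on failure.
#define CHECK_STATUS(expr, msg)                                    \
  do {                                                             \
    if ((expr) != kStatusSuccess) {                                \
      fprintf(stderr, "%s (%s:%d)\n", (msg), __FILE__, __LINE__);  \
      exit(EXIT_FAILURE);                                          \
    }                                                              \
  } while (0)

Status CreateGenerator() { return kStatusSuccess; }  // stand-in for curandCreateGenerator

int main() {
  CHECK_STATUS(CreateGenerator(), "Failed to create generator");
  puts("generator ready");
  return 0;
}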


@@ -22,5 +22,9 @@ MS_REG_GPU_KERNEL_TWO(
CheckValid,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeBool),
CheckValidGpuKernel, float, bool)
MS_REG_GPU_KERNEL_TWO(
CheckValid,
KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeBool),
CheckValidGpuKernel, half, bool)
} // namespace kernel
} // namespace mindspore


@@ -21,5 +21,8 @@ namespace kernel {
MS_REG_GPU_KERNEL_ONE(
IOU, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
IOUGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(
IOU, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
IOUGpuKernel, half)
} // namespace kernel
} // namespace mindspore


@@ -80,6 +80,7 @@
#include "backend/optimizer/ascend/buffer_fusion/conv_single_in_fusion_pass.h"
#include "backend/optimizer/ascend/buffer_fusion/conv_double_in_fusion_pass.h"
#include "backend/optimizer/ascend/buffer_fusion/matmul_eltwise_fusion_pass.h"
#include "backend/optimizer/ascend/buffer_fusion/matmul_confusiontranspose_fusion_pass.h"
#include "backend/optimizer/ascend/buffer_fusion/depthwiseconv_eltwise_fusion_pass.h"
#include "backend/optimizer/ascend/buffer_fusion/bnupdate_eltwise_fusion_pass.h"
#include "backend/optimizer/ascend/buffer_fusion/bnupdate_eltwise_eltwise_fusion_pass.h"
@ -124,6 +125,10 @@ void AddAscendIRFusionRulesPass(PassManager *ir_fusion_pm) {
ir_fusion_pm->AddPass(std::make_shared<LambNextMVRuleCond4>());
ir_fusion_pm->AddPass(std::make_shared<LambNextRightRule>());
ir_fusion_pm->AddPass(std::make_shared<LambUpdateWithLrV2>());
ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneAssignCond1Fusion>());
ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneAssignCond2Fusion>());
ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneAssignCond3Fusion>());
ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneAssignCond4Fusion>());
ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneCond1Fusion>());
ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneCond2Fusion>());
ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneCond3Fusion>());
@ -308,6 +313,7 @@ void RunOpAscendBackendIRFusionOptimization(const std::shared_ptr<session::Kerne
}
auto optimizer = std::make_shared<GraphOptimizer>();
auto ir_fusion_pm = std::make_shared<PassManager>("ir_fusion_pm");
ir_fusion_pm->AddPass(std::make_shared<SplitFission>());
ir_fusion_pm->AddPass(std::make_shared<BnSplit>());
ir_fusion_pm->AddPass(std::make_shared<LayerNormGradSplit>());
ir_fusion_pm->AddPass(std::make_shared<TopKSplit>());


@ -27,15 +27,15 @@
namespace mindspore {
namespace opt {
void BnupdateEltwiseFusionPass::MatchBnupdateRelu(const CNodePtr &cnode, const AnfNodePtr &relu_input,
const session::KernelGraph &kernel_graph,
FusedNodeRecord *candidate_fusion) {
void BnupdateEltwiseFusionPass::MatchBnupdateDoubleOutputEltwise(const CNodePtr &cnode, const AnfNodePtr &eltwise_input,
const session::KernelGraph &kernel_graph,
FusedNodeRecord *candidate_fusion) {
MS_EXCEPTION_IF_NULL(cnode);
MS_EXCEPTION_IF_NULL(candidate_fusion);
auto manager = kernel_graph.manager();
MS_EXCEPTION_IF_NULL(manager);
MS_EXCEPTION_IF_NULL(relu_input);
auto getitem = relu_input->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(eltwise_input);
auto getitem = eltwise_input->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(getitem);
auto bnupdate = getitem->input(1);
MS_EXCEPTION_IF_NULL(bnupdate);
@ -68,10 +68,11 @@ void BnupdateEltwiseFusionPass::MatchSingleFusionPattern(const session::KernelGr
auto cnode = node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(cnode);
if (AnfAlgo::GetKernelType(cnode) == KernelType::TBE_KERNEL &&
AnfAlgo::GetFusionType(cnode) == kernel::FusionType::ELEMWISE) {
AnfAlgo::GetFusionType(cnode) == kernel::FusionType::ELEMWISE &&
AnfAlgo::GetOutputTensorNum(cnode) == ELTWISE_DOUBLE_OUTPUT_SIZE) {
auto eltwise_input = cnode->input(1);
if (eltwise_input->isa<CNode>() && AnfAlgo::CheckPrimitiveType(eltwise_input, prim::kPrimTupleGetItem)) {
MatchBnupdateRelu(cnode, eltwise_input, kernel_graph, candidate_fusion);
MatchBnupdateDoubleOutputEltwise(cnode, eltwise_input, kernel_graph, candidate_fusion);
}
}
}


@ -39,8 +39,8 @@ class BnupdateEltwiseFusionPass : public FusionBasePass {
void MatchSingleFusionPattern(const session::KernelGraph &kernel_graph, FusedNodeRecord *candidate_fusion) override;
private:
void MatchBnupdateRelu(const CNodePtr &cnode, const AnfNodePtr &relu_input, const session::KernelGraph &kernel_graph,
FusedNodeRecord *candidate_fusion);
void MatchBnupdateDoubleOutputEltwise(const CNodePtr &cnode, const AnfNodePtr &eltwise_input,
const session::KernelGraph &kernel_graph, FusedNodeRecord *candidate_fusion);
};
} // namespace opt
} // namespace mindspore


@ -33,6 +33,7 @@ const int8_t MAX_ELTWISE_NUM = 3;
const int8_t MIN_ELTWISE_SIZE = 2;
const int8_t ELTWISE_INPUT_SIZE = 2;
const int8_t ELTWISE_DOUBLE_IN_INPUT_SIZE = 3;
const int8_t ELTWISE_DOUBLE_OUTPUT_SIZE = 2;
const int8_t CONV_DOUBLE_IN_INPUT_SIZE = 3;
const int8_t CONV_QUART_IN_INPUT_SIZE = 5;
const int8_t ELTWISE_USE = 1;


@ -0,0 +1,66 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/ascend/buffer_fusion/matmul_confusiontranspose_fusion_pass.h"
#include <vector>
#include <unordered_set>
#include <memory>
#include <string>
#include "backend/kernel_compiler/kernel_fusion.h"
#include "debug/anf_ir_dump.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "frontend/operator/ops.h"
#include "utils/ms_context.h"
#include "backend/optimizer/common/fusion_id_allocator.h"
namespace mindspore {
namespace opt {
void MatmulConfusionTranposeFusionPass::MatchMatmulConfusionTranpose(const CNodePtr &cnode,
const session::KernelGraph &kernel_graph,
FusedNodeRecord *candidate_fusion) {
MS_EXCEPTION_IF_NULL(cnode);
MS_EXCEPTION_IF_NULL(candidate_fusion);
auto manager = kernel_graph.manager();
MS_EXCEPTION_IF_NULL(manager);
auto matmul = cnode->input(1);
MS_EXCEPTION_IF_NULL(matmul);
if (matmul->isa<CNode>() && AnfAlgo::CheckPrimitiveType(matmul, prim::kPrimMatMul)) {
std::vector<int> output_used_num{SizeToInt(manager->node_users()[matmul].size())};
AnfAlgo::SetNodeAttr(kAttrOutputUsedNum, MakeValue(output_used_num), matmul);
std::unordered_set<AnfNodePtr> record{cnode, matmul};
candidate_fusion->push_back(record);
SetRecordFusionId(record);
}
}
void MatmulConfusionTranposeFusionPass::MatchSingleFusionPattern(const session::KernelGraph &kernel_graph,
FusedNodeRecord *candidate_fusion) {
MS_EXCEPTION_IF_NULL(candidate_fusion);
std::vector<AnfNodePtr> node_list = TopoSort(kernel_graph.get_return());
for (auto &node : node_list) {
if (!AnfAlgo::IsRealCNodeKernel(node) || fusion_id_allocator->HasFusionIdAttr(node) ||
AnfAlgo::CheckPrimitiveType(node, prim::kPrimReturn)) {
continue;
}
auto cnode = node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(cnode);
if (AnfAlgo::GetCNodeName(cnode) == kConfusionTransposeDOpName) {
MatchMatmulConfusionTranpose(cnode, kernel_graph, candidate_fusion);
}
}
}
} // namespace opt
} // namespace mindspore
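
The matching logic in this new pass is a linear scan: walk the kernel graph in topological order and, for each ConfusionTransposeD whose first input is a MatMul, record the pair as one fusion candidate. A simplified model of that loop, with plain structs standing in for AnfNodePtr/CNodePtr (a sketch of the control flow, not the real pass API):

#include <memory>
#include <string>
#include <unordered_set>
#include <vector>

// Simplified node model; the real pass works on AnfNodePtr/CNodePtr.
struct Node {
  std::string op;
  std::vector<std::shared_ptr<Node>> inputs;
};
using NodePtr = std::shared_ptr<Node>;
using FusedRecord = std::vector<std::unordered_set<NodePtr>>;

// For every ConfusionTransposeD fed directly by a MatMul, record
// {transpose, matmul} as one fusion group.
FusedRecord MatchMatmulTranspose(const std::vector<NodePtr> &topo_order) {
  FusedRecord candidates;
  for (const auto &node : topo_order) {
    if (node->op != "ConfusionTransposeD" || node->inputs.empty()) continue;
    const auto &producer = node->inputs[0];
    if (producer && producer->op == "MatMul") {
      candidates.push_back({node, producer});
    }
  }
  return candidates;
}

int main() {
  auto matmul = std::make_shared<Node>(Node{"MatMul", {}});
  auto transpose = std::make_shared<Node>(Node{"ConfusionTransposeD", {matmul}});
  auto groups = MatchMatmulTranspose({matmul, transpose});
  return groups.size() == 1 ? 0 : 1;  // exactly one fused pair found
}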


@ -0,0 +1,48 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_BUFFER_FUSION_PASS_MATMUL_CONFUSIONTRANSPOSE_FUSION_PASS_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_BUFFER_FUSION_PASS_MATMUL_CONFUSIONTRANSPOSE_FUSION_PASS_H_
#include <unordered_set>
#include <vector>
#include "backend/optimizer/ascend/buffer_fusion/fusion_base_pass.h"
#include "ir/anf.h"
#include "backend/optimizer/common/pass.h"
#include "backend/optimizer/common/fusion_id_allocator.h"
#include "runtime/device/kernel_info.h"
#include "backend/kernel_compiler/kernel.h"
#include "backend/session/kernel_graph.h"
namespace mindspore {
namespace opt {
using FusedNodeRecord = std::vector<std::unordered_set<AnfNodePtr>>;
class MatmulConfusionTranposeFusionPass : public FusionBasePass {
public:
explicit MatmulConfusionTranposeFusionPass(FusionIdAllocatorPtr idAllocator)
: FusionBasePass("MatmulConfusionTranposeFusionPass", idAllocator) {}
~MatmulConfusionTranposeFusionPass() override = default;
void MatchSingleFusionPattern(const session::KernelGraph &kernel_graph, FusedNodeRecord *candidate_fusion) override;
private:
void MatchMatmulConfusionTranpose(const CNodePtr &cnode, const session::KernelGraph &kernel_graph,
FusedNodeRecord *candidate_fusion);
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_BUFFER_FUSION_PASS_MATMUL_CONFUSIONTRANSPOSE_FUSION_PASS_H_


@ -172,7 +172,6 @@ AnfNodePtr MergeCastToNextOp(const FuncGraphPtr &graph, const CNodePtr &node, co
<< "ori kernel info" << ori_kernel_info->ToString() << "alternative kernel info"
<< (*alternative_kernel_info)->ToString();
AnfAlgo::SetSelectKernelBuildInfo(*alternative_kernel_info, next_cnode.get());
ChangeNodeInferInfo(next_cnode, node, cast_index);
if (node->inputs().size() < kCastInputNum) {
MS_LOG(EXCEPTION) << "Op[" << node->DebugString() << "] has wrong input num:";
}


@ -15,30 +15,9 @@
*/
#include "backend/optimizer/ascend/ir_fusion/adam_apply_one_fusion.h"
#include "backend/optimizer/common/helper.h"
#include "backend/session/anf_runtime_algorithm.h"
namespace mindspore {
namespace opt {
AnfNodePtr AdamApplyOneFusion::CreateAdamApplyOneNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv) const {
MS_EXCEPTION_IF_NULL(func_graph);
MS_EXCEPTION_IF_NULL(equiv);
auto prim = std::make_shared<Primitive>(kAdamApplyOneOpName);
std::vector<AnfNodePtr> new_node_inputs = {NewValueNode(prim)};
for (const auto &input_var : input_vars_) {
auto input_node = utils::cast<AnfNodePtr>((*equiv)[input_var]);
MS_EXCEPTION_IF_NULL(input_node);
new_node_inputs.push_back(input_node);
}
for (const auto &mul_x_input_var : mul_x_input_vars_) {
auto mul_x_input_node = utils::cast<AnfNodePtr>((*equiv)[mul_x_input_var]);
MS_EXCEPTION_IF_NULL(mul_x_input_node);
new_node_inputs.push_back(mul_x_input_node);
}
auto add2_y_node = utils::cast<AnfNodePtr>((*equiv)[add2_y_]);
MS_EXCEPTION_IF_NULL(add2_y_node);
new_node_inputs.push_back(add2_y_node);
auto new_node = func_graph->NewCNode(new_node_inputs);
return new_node;
}
const BaseRef AdamApplyOneFusion::DefinePattern() const {
const auto prim_sqrt = std::make_shared<Primitive>(kSqrtOpName);
const auto prim_real_div = std::make_shared<Primitive>(kRealDivOpName);
@ -104,16 +83,152 @@ const BaseRef AdamApplyOneCond4Fusion::DefinePattern() const {
return VectorRef({prim::kPrimSub, input_vars_[3], VectorRef({prim::kPrimMul, true_div0, input_vars_[4]})});
}
const BaseRef AdamApplyOneAssignFusion::DefinePattern() const {
const auto prim_sqrt = std::make_shared<Primitive>(kSqrtOpName);
const auto prim_real_div = std::make_shared<Primitive>(kRealDivOpName);
VectorRef mul2 = VectorRef({prim::kPrimMul, mul_x_input_vars_[2], input_vars_[1]});
VectorRef mul3 = VectorRef({prim::kPrimMul, mul_x_input_vars_[3], VectorRef({prim::kPrimSquare, input_vars_[0]})});
VectorRef add1 = VectorRef({add1_var_, mul2, mul3});
VectorRef sqrt0 = VectorRef({prim_sqrt, add1});
VectorRef mul1 = VectorRef({prim::kPrimMul, mul_x_input_vars_[1], input_vars_[0]});
VectorRef mul0 = VectorRef({prim::kPrimMul, mul_x_input_vars_[0], input_vars_[2]});
VectorRef add0 = VectorRef({add0_var_, mul0, mul1});
VectorRef true_div0 = VectorRef({prim_real_div, add0, VectorRef({prim::kPrimTensorAdd, sqrt0, add2_y_})});
VectorRef sub0 = VectorRef({sub0_var_, input_vars_[3], VectorRef({prim::kPrimMul, input_vars_[4], true_div0})});
VectorRef assign0 = VectorRef({prim::kPrimAssign, input_vars_[3], sub0});
VectorRef depend0 = VectorRef({prim::kPrimDepend, sub0, assign0});
VectorRef assign1 = VectorRef({prim::kPrimAssign, input_vars_[2], add0});
VectorRef depend1 = VectorRef({prim::kPrimDepend, depend0, assign1});
VectorRef assign2 = VectorRef({prim::kPrimAssign, input_vars_[1], add1});
return VectorRef({prim::kPrimDepend, depend1, assign2});
}
const BaseRef AdamApplyOneAssignCond1Fusion::DefinePattern() const {
const auto prim_sqrt = std::make_shared<Primitive>(kSqrtOpName);
const auto prim_real_div = std::make_shared<Primitive>(kRealDivOpName);
VectorRef mul2 = VectorRef({prim::kPrimMul, mul_x_input_vars_[2], input_vars_[1]});
VectorRef mul3 = VectorRef({prim::kPrimMul, mul_x_input_vars_[3], VectorRef({prim::kPrimSquare, input_vars_[0]})});
VectorRef add1 = VectorRef({add1_var_, mul2, mul3});
VectorRef sqrt0 = VectorRef({prim_sqrt, add1});
VectorRef mul1 = VectorRef({prim::kPrimMul, mul_x_input_vars_[1], input_vars_[0]});
VectorRef mul0 = VectorRef({prim::kPrimMul, mul_x_input_vars_[0], input_vars_[2]});
VectorRef add0 = VectorRef({add0_var_, mul0, mul1});
VectorRef true_div0 = VectorRef({prim_real_div, add0, VectorRef({prim::kPrimTensorAdd, add2_y_, sqrt0})});
VectorRef sub0 = VectorRef({sub0_var_, input_vars_[3], VectorRef({prim::kPrimMul, input_vars_[4], true_div0})});
VectorRef assign0 = VectorRef({prim::kPrimAssign, input_vars_[3], sub0});
VectorRef depend0 = VectorRef({prim::kPrimDepend, sub0, assign0});
VectorRef assign1 = VectorRef({prim::kPrimAssign, input_vars_[2], add0});
VectorRef depend1 = VectorRef({prim::kPrimDepend, depend0, assign1});
VectorRef assign2 = VectorRef({prim::kPrimAssign, input_vars_[1], add1});
return VectorRef({prim::kPrimDepend, depend1, assign2});
}
const BaseRef AdamApplyOneAssignCond2Fusion::DefinePattern() const {
const auto prim_sqrt = std::make_shared<Primitive>(kSqrtOpName);
const auto prim_real_div = std::make_shared<Primitive>(kRealDivOpName);
VectorRef mul2 = VectorRef({prim::kPrimMul, mul_x_input_vars_[2], input_vars_[1]});
VectorRef mul3 = VectorRef({prim::kPrimMul, VectorRef({prim::kPrimSquare, input_vars_[0]}), mul_x_input_vars_[3]});
VectorRef add1 = VectorRef({add1_var_, mul2, mul3});
VectorRef sqrt0 = VectorRef({prim_sqrt, add1});
VectorRef mul1 = VectorRef({prim::kPrimMul, mul_x_input_vars_[1], input_vars_[0]});
VectorRef mul0 = VectorRef({prim::kPrimMul, mul_x_input_vars_[0], input_vars_[2]});
VectorRef add0 = VectorRef({add0_var_, mul0, mul1});
VectorRef true_div0 = VectorRef({prim_real_div, add0, VectorRef({prim::kPrimTensorAdd, sqrt0, add2_y_})});
VectorRef sub0 = VectorRef({sub0_var_, input_vars_[3], VectorRef({prim::kPrimMul, true_div0, input_vars_[4]})});
VectorRef assign0 = VectorRef({prim::kPrimAssign, input_vars_[3], sub0});
VectorRef depend0 = VectorRef({prim::kPrimDepend, sub0, assign0});
VectorRef assign1 = VectorRef({prim::kPrimAssign, input_vars_[2], add0});
VectorRef depend1 = VectorRef({prim::kPrimDepend, depend0, assign1});
VectorRef assign2 = VectorRef({prim::kPrimAssign, input_vars_[1], add1});
return VectorRef({prim::kPrimDepend, depend1, assign2});
}
const BaseRef AdamApplyOneAssignCond3Fusion::DefinePattern() const {
const auto prim_sqrt = std::make_shared<Primitive>(kSqrtOpName);
const auto prim_real_div = std::make_shared<Primitive>(kRealDivOpName);
VectorRef mul2 = VectorRef({prim::kPrimMul, mul_x_input_vars_[2], input_vars_[1]});
VectorRef mul3 = VectorRef({prim::kPrimMul, mul_x_input_vars_[3], VectorRef({prim::kPrimSquare, input_vars_[0]})});
VectorRef add1 = VectorRef({add1_var_, mul2, mul3});
VectorRef sqrt0 = VectorRef({prim_sqrt, add1});
VectorRef mul1 = VectorRef({prim::kPrimMul, mul_x_input_vars_[1], input_vars_[0]});
VectorRef mul0 = VectorRef({prim::kPrimMul, mul_x_input_vars_[0], input_vars_[2]});
VectorRef add0 = VectorRef({add0_var_, mul0, mul1});
VectorRef true_div0 = VectorRef({prim_real_div, add0, VectorRef({prim::kPrimTensorAdd, sqrt0, add2_y_})});
VectorRef sub0 = VectorRef({sub0_var_, input_vars_[3], VectorRef({prim::kPrimMul, true_div0, input_vars_[4]})});
VectorRef assign0 = VectorRef({prim::kPrimAssign, input_vars_[3], sub0});
VectorRef depend0 = VectorRef({prim::kPrimDepend, sub0, assign0});
VectorRef assign1 = VectorRef({prim::kPrimAssign, input_vars_[2], add0});
VectorRef depend1 = VectorRef({prim::kPrimDepend, depend0, assign1});
VectorRef assign2 = VectorRef({prim::kPrimAssign, input_vars_[1], add1});
return VectorRef({prim::kPrimDepend, depend1, assign2});
}
const BaseRef AdamApplyOneAssignCond4Fusion::DefinePattern() const {
const auto prim_sqrt = std::make_shared<Primitive>(kSqrtOpName);
const auto prim_real_div = std::make_shared<Primitive>(kRealDivOpName);
VectorRef mul2 = VectorRef({prim::kPrimMul, mul_x_input_vars_[2], input_vars_[1]});
VectorRef mul3 = VectorRef({prim::kPrimMul, mul_x_input_vars_[3], VectorRef({prim::kPrimSquare, input_vars_[0]})});
VectorRef add1 = VectorRef({add1_var_, mul2, mul3});
VectorRef sqrt0 = VectorRef({prim_sqrt, add1});
VectorRef mul1 = VectorRef({prim::kPrimMul, mul_x_input_vars_[1], input_vars_[0]});
VectorRef mul0 = VectorRef({prim::kPrimMul, mul_x_input_vars_[0], input_vars_[2]});
VectorRef add0 = VectorRef({add0_var_, mul0, mul1});
VectorRef true_div0 = VectorRef({prim_real_div, add0, VectorRef({prim::kPrimTensorAdd, add2_y_, sqrt0})});
VectorRef sub0 = VectorRef({sub0_var_, input_vars_[3], VectorRef({prim::kPrimMul, true_div0, input_vars_[4]})});
VectorRef assign0 = VectorRef({prim::kPrimAssign, input_vars_[3], sub0});
VectorRef depend0 = VectorRef({prim::kPrimDepend, sub0, assign0});
VectorRef assign1 = VectorRef({prim::kPrimAssign, input_vars_[2], add0});
VectorRef depend1 = VectorRef({prim::kPrimDepend, depend0, assign1});
VectorRef assign2 = VectorRef({prim::kPrimAssign, input_vars_[1], add1});
return VectorRef({prim::kPrimDepend, depend1, assign2});
}
AnfNodePtr AdamApplyOneFusion::CreateAdamApplyOneNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv,
const AnfNodePtr &final_node) const {
MS_EXCEPTION_IF_NULL(func_graph);
MS_EXCEPTION_IF_NULL(equiv);
PrimitivePtr prim = nullptr;
if (AnfAlgo::CheckPrimitiveType(final_node, prim::kPrimDepend)) {
prim = std::make_shared<Primitive>(kAdamApplyOneAssignOpName);
} else {
prim = std::make_shared<Primitive>(kAdamApplyOneOpName);
}
std::vector<AnfNodePtr> new_node_inputs = {NewValueNode(prim)};
for (const auto &input_var : input_vars_) {
auto input_node = utils::cast<AnfNodePtr>((*equiv)[input_var]);
MS_EXCEPTION_IF_NULL(input_node);
new_node_inputs.push_back(input_node);
}
for (const auto &mul_x_input_var : mul_x_input_vars_) {
auto mul_x_input_node = utils::cast<AnfNodePtr>((*equiv)[mul_x_input_var]);
MS_EXCEPTION_IF_NULL(mul_x_input_node);
new_node_inputs.push_back(mul_x_input_node);
}
auto add2_y_node = utils::cast<AnfNodePtr>((*equiv)[add2_y_]);
MS_EXCEPTION_IF_NULL(add2_y_node);
new_node_inputs.push_back(add2_y_node);
auto new_node = func_graph->NewCNode(new_node_inputs);
return new_node;
}
const AnfNodePtr AdamApplyOneFusion::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
const EquivPtr &equiv) const {
MS_EXCEPTION_IF_NULL(func_graph);
MS_EXCEPTION_IF_NULL(node);
if (!CheckSupportDataType(node, kFloatDataTypeSet)) {
auto sub0 = node;
if (AnfAlgo::CheckPrimitiveType(node, prim::kPrimDepend)) {
auto iter_sub0 = (*equiv).find(sub0_var_);
if (iter_sub0 == (*equiv).end()) {
MS_LOG(EXCEPTION) << "The equiv map is expected to contains the sub0 var after matched.";
}
sub0 = utils::cast<AnfNodePtr>(iter_sub0->second);
}
MS_EXCEPTION_IF_NULL(sub0);
if (!CheckSupportDataType(sub0, kFloatDataTypeSet)) {
return nullptr;
}
auto new_node = CreateAdamApplyOneNode(func_graph, equiv);
auto new_node = CreateAdamApplyOneNode(func_graph, equiv, node);
MS_EXCEPTION_IF_NULL(new_node);
new_node->set_scope(node->scope());
new_node->set_scope(sub0->scope());
// Set abstract of new node
AbstractBasePtrList new_node_abstract_list;
auto iter_add0 = (*equiv).find(add0_var_);
@ -130,7 +245,7 @@ const AnfNodePtr AdamApplyOneFusion::Process(const FuncGraphPtr &func_graph, con
MS_EXCEPTION_IF_NULL(add1);
new_node_abstract_list.push_back(add1->abstract());
new_node_abstract_list.push_back(add0->abstract());
new_node_abstract_list.push_back(node->abstract());
new_node_abstract_list.push_back(sub0->abstract());
auto abstract_tuple = std::make_shared<abstract::AbstractTuple>(new_node_abstract_list);
new_node->set_abstract(abstract_tuple);
// Create tuple_getitem node for outputs
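
All of the VectorRef trees above encode one and the same AdamApplyOne dataflow; the Assign variants only add the in-place writes of the three results, expressed through the Assign/Depend chain. A scalar walk-through of that dataflow follows; the mapping of input_vars_, mul_x_input_vars_, and add2_y_ onto Adam quantities is inferred from the pattern shape and is illustrative only:

#include <cmath>
#include <cstdio>

// Scalar sketch of the dataflow the AdamApplyOne patterns encode.
// Assumed mapping: input_vars_ = {grad, v, m, param, lr},
// mul_x_input_vars_ = {beta1, 1-beta1, beta2, 1-beta2}, add2_y_ = eps.
int main() {
  double grad = 0.1, v = 0.5, m = 0.4, param = 1.0, lr = 0.001;
  double beta1 = 0.9, one_minus_beta1 = 0.1, beta2 = 0.999, one_minus_beta2 = 0.001;
  double eps = 1e-8;

  double add1 = beta2 * v + one_minus_beta2 * grad * grad;  // mul2 + mul3
  double add0 = beta1 * m + one_minus_beta1 * grad;         // mul0 + mul1
  double true_div0 = add0 / (std::sqrt(add1) + eps);        // sqrt0, add2
  double sub0 = param - lr * true_div0;                     // updated param

  // The Assign variants additionally write add1 -> v, add0 -> m, and
  // sub0 -> param in place, which is what the Assign/Depend chain expresses.
  std::printf("m=%g v=%g param=%g\n", add0, add1, sub0);
  return 0;
}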


@ -40,6 +40,7 @@ class AdamApplyOneFusion : public PatternProcessPass {
add2_y_ = std::make_shared<Var>();
add0_var_ = std::make_shared<Var>(std::make_shared<Primitive>(prim::kPrimTensorAdd->name()));
add1_var_ = std::make_shared<Var>(std::make_shared<Primitive>(prim::kPrimTensorAdd->name()));
sub0_var_ = std::make_shared<Var>(std::make_shared<Primitive>(prim::kPrimSub->name()));
}
~AdamApplyOneFusion() override = default;
@ -47,12 +48,14 @@ class AdamApplyOneFusion : public PatternProcessPass {
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
protected:
AnfNodePtr CreateAdamApplyOneNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv) const;
AnfNodePtr CreateAdamApplyOneNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv,
const AnfNodePtr &final_node) const;
std::vector<VarPtr> input_vars_;
std::vector<VarPtr> mul_x_input_vars_;
VarPtr add2_y_;
VarPtr add0_var_;
VarPtr add1_var_;
VarPtr sub0_var_;
};
class AdamApplyOneCond1Fusion : public AdamApplyOneFusion {
@ -90,6 +93,51 @@ class AdamApplyOneCond4Fusion : public AdamApplyOneFusion {
~AdamApplyOneCond4Fusion() override = default;
const BaseRef DefinePattern() const override;
};
class AdamApplyOneAssignFusion : public AdamApplyOneFusion {
public:
explicit AdamApplyOneAssignFusion(bool multigraph = true)
: AdamApplyOneFusion("adam_apply_one_assign_fusion", multigraph) {}
~AdamApplyOneAssignFusion() override = default;
const BaseRef DefinePattern() const override;
};
class AdamApplyOneAssignCond1Fusion : public AdamApplyOneFusion {
public:
explicit AdamApplyOneAssignCond1Fusion(bool multigraph = true)
: AdamApplyOneFusion("adam_apply_one_assign_cond1_fusion", multigraph) {}
~AdamApplyOneAssignCond1Fusion() override = default;
const BaseRef DefinePattern() const override;
};
class AdamApplyOneAssignCond2Fusion : public AdamApplyOneFusion {
public:
explicit AdamApplyOneAssignCond2Fusion(bool multigraph = true)
: AdamApplyOneFusion("adam_apply_one_assign_cond2_fusion", multigraph) {}
~AdamApplyOneAssignCond2Fusion() override = default;
const BaseRef DefinePattern() const override;
};
class AdamApplyOneAssignCond3Fusion : public AdamApplyOneFusion {
public:
explicit AdamApplyOneAssignCond3Fusion(bool multigraph = true)
: AdamApplyOneFusion("adam_apply_one_assign_cond3_fusion", multigraph) {}
~AdamApplyOneAssignCond3Fusion() override = default;
const BaseRef DefinePattern() const override;
};
class AdamApplyOneAssignCond4Fusion : public AdamApplyOneFusion {
public:
explicit AdamApplyOneAssignCond4Fusion(bool multigraph = true)
: AdamApplyOneFusion("adam_apply_one_assign_cond4_fusion", multigraph) {}
~AdamApplyOneAssignCond4Fusion() override = default;
const BaseRef DefinePattern() const override;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_IR_FUSION_ADAM_APPLY_ONE_FUSION_H_


@ -62,7 +62,14 @@ const AnfNodePtr ConvertTupleOutputToMaketuple::Process(const FuncGraphPtr &func
auto cnode = node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(cnode);
std::unordered_map<AnfNodePtr, AnfNodePtr> transed_nodes;
if (IsPrimitiveCNode(cnode, prim::kPrimTupleGetItem) || IsPrimitiveCNode(cnode, prim::kPrimControlDepend)) {
if (IsPrimitiveCNode(cnode, prim::kPrimTupleGetItem)) {
auto real_input = AnfAlgo::GetTupleGetItemRealInput(cnode);
MS_EXCEPTION_IF_NULL(real_input);
if (!real_input->isa<Parameter>() && !real_input->isa<ValueNode>()) {
return nullptr;
}
}
if (IsPrimitiveCNode(cnode, prim::kPrimControlDepend)) {
return nullptr;
}
bool cnode_input_changed = false;


@ -863,7 +863,7 @@ bool TensorNeedSync(const AnfNodePtr &parameter, const tensor::TensorPtr &tensor
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto device_address = AnfAlgo::GetMutableOutputAddr(parameter, 0);
if (ms_context->enable_pynative_infer()) {
if (ms_context->execution_mode() == kPynativeMode) {
return tensor->device_address().get() == nullptr || tensor->device_address() != device_address;
}
if (tensor->is_dirty()) {


@ -393,40 +393,5 @@ ValuePtr BoolEq(const ValuePtrList &list) {
MS_LOG(EXCEPTION) << "Unsported Value for BoolEq, x: " << x->ToString() << ".";
}
std::vector<int> BroadcastShape_(std::vector<int> shpx, std::vector<int> shpy) {
int dlen = SizeToInt(shpx.size()) - SizeToInt(shpy.size());
if (dlen < 0) {
for (int i = 0; i < -dlen; ++i) {
(void)shpx.insert(shpx.begin(), 1);
}
} else if (dlen > 0) {
for (int i = 0; i < dlen; i++) {
(void)shpy.insert(shpy.begin(), 1);
}
}
if (shpx.size() != shpy.size()) {
MS_LOG(EXCEPTION) << "Failure: shpx.size() != shpy.size().";
}
std::vector<int> shp;
for (size_t i = 0; i < shpx.size(); i++) {
auto a = shpx[i];
auto b = shpy[i];
if (a == 1) {
shp.push_back(b);
} else if (b == 1) {
shp.push_back(a);
} else if (a == -1) {
shp.push_back(b);
} else if (b == -1) {
shp.push_back(a);
} else if (a == b) {
shp.push_back(a);
} else {
return std::vector<int>();
}
}
return shp;
}
} // namespace prim
} // namespace mindspore
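
The deleted BroadcastShape_ implemented the standard NumPy-style broadcasting rule: right-align the two shapes, pad the shorter with leading 1s, then keep the matching (or non-1) extent per dimension, with -1 treated as an unknown that defers to the other shape. A self-contained restatement of that rule, kept here only as a reference sketch:

#include <cstdio>
#include <vector>

// Returns the broadcast shape of x and y; an empty result means
// the shapes are not broadcastable.
std::vector<int> BroadcastShape(std::vector<int> x, std::vector<int> y) {
  if (x.size() < y.size()) x.insert(x.begin(), y.size() - x.size(), 1);
  if (y.size() < x.size()) y.insert(y.begin(), x.size() - y.size(), 1);
  std::vector<int> out;
  for (size_t i = 0; i < x.size(); ++i) {
    int a = x[i], b = y[i];
    if (a == 1 || a == -1) {
      out.push_back(b);            // a broadcasts (or is unknown): take b
    } else if (b == 1 || b == -1 || a == b) {
      out.push_back(a);            // b broadcasts, is unknown, or matches
    } else {
      return {};                   // incompatible extents
    }
  }
  return out;
}

int main() {
  auto s = BroadcastShape({8, 1, 6}, {7, 1});  // -> {8, 7, 6}
  for (int d : s) std::printf("%d ", d);
  return 0;
}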


@ -52,7 +52,6 @@ ValuePtr BoolNot(const ValuePtrList &list);
ValuePtr BoolAnd(const ValuePtrList &list);
ValuePtr BoolOr(const ValuePtrList &list);
ValuePtr BoolEq(const ValuePtrList &list);
std::vector<int> BroadcastShape_(std::vector<int> s1, std::vector<int> s2);
} // namespace prim
} // namespace mindspore


@ -333,28 +333,28 @@ ArgsPairList HyperMap::Harmonize(const FuncGraphPtr &func_graph, const ArgsPairL
}
FuncGraphPtr HyperMap::GenerateFromTypes(const TypePtrList &args_spec_list) {
FuncGraphPtr ptrGraph = std::make_shared<FuncGraph>();
ptrGraph->set_flag(FUNC_GRAPH_FLAG_CORE, true);
ptrGraph->set_flag(FUNC_GRAPH_FLAG_SPECIALIZE_PARAMETER, true);
ptrGraph->debug_info()->set_name("hyper_map");
FuncGraphPtr ptr_graph = std::make_shared<FuncGraph>();
ptr_graph->set_flag(FUNC_GRAPH_FLAG_CORE, true);
ptr_graph->set_flag(FUNC_GRAPH_FLAG_SPECIALIZE_PARAMETER, true);
ptr_graph->debug_info()->set_name("hyper_map");
AnfNodePtr ptrFnArg = nullptr;
std::size_t i = 0;
ArgsPairList argmap;
ArgsPairList argmap2;
if (fn_leaf_ == nullptr) {
ptrFnArg = ptrGraph->add_parameter();
ptrFnArg = ptr_graph->add_parameter();
i = 1;
}
std::size_t size = args_spec_list.size();
for (; i < size; ++i) {
argmap.push_back(std::make_pair(ptrGraph->add_parameter(), args_spec_list[i]));
argmap.push_back(std::make_pair(ptr_graph->add_parameter(), args_spec_list[i]));
}
argmap2 = Harmonize(ptrGraph, argmap);
ptrGraph->set_output(Make(ptrGraph, ptrFnArg, argmap2));
return ptrGraph;
argmap2 = Harmonize(ptr_graph, argmap);
ptr_graph->set_output(Make(ptr_graph, ptrFnArg, argmap2));
return ptr_graph;
}
abstract::AbstractBasePtrList HyperMap::NormalizeArgs(const AbstractBasePtrList &args_spec_list) const {
@ -582,30 +582,30 @@ FuncGraphPtr GradOperation::GetGrad(AnfNodePtr node, const AnfNodePtr &weights,
inputs.push_back(opsTupleItem);
inputs.push_back(cnode);
inputs.push_back(NewValueNode(1));
AnfNodePtr ptrBprop = ret->NewCNode(inputs);
AnfNodePtr ptr_bprop = ret->NewCNode(inputs);
doGetGrad(ret, out, ptrBprop, weights_node, opsTupleItem);
doGetGrad(ret, out, ptr_bprop, weights_node, opsTupleItem);
return ret;
}
void GradOperation::doGetGrad(const FuncGraphPtr &func_graph, AnfNodePtr out, AnfNodePtr ptrBprop, AnfNodePtr weights,
void GradOperation::doGetGrad(const FuncGraphPtr &func_graph, AnfNodePtr out, AnfNodePtr ptr_bprop, AnfNodePtr weights,
ValueNodePtr opsTupleItem) {
MS_EXCEPTION_IF_NULL(func_graph);
AnfNodePtr ptrBPropArg = nullptr;
AnfNodePtr ptr_bprop_arg = nullptr;
if (sens_param_) {
ptrBPropArg = func_graph->add_parameter();
ptr_bprop_arg = func_graph->add_parameter();
} else {
auto ones_like = prim::GetPythonOps("ones_like");
ptrBPropArg = func_graph->NewCNode({NewValueNode(ones_like), out});
ptr_bprop_arg = func_graph->NewCNode({NewValueNode(ones_like), out});
}
AnfNodePtr ptrBApp = func_graph->NewCNode({ptrBprop, ptrBPropArg});
AnfNodePtr ptr_bapp = func_graph->NewCNode({ptr_bprop, ptr_bprop_arg});
CNodePtr fv_bprop = nullptr;
if (get_by_list_) {
// python code: grads = hyper_map(F.partial(env_get, env), weights)
AnfNodePtr env = func_graph->NewCNode({NewValueNode(prim::kPrimTupleGetItem), ptrBApp, NewValueNode(0)});
AnfNodePtr env = func_graph->NewCNode({NewValueNode(prim::kPrimTupleGetItem), ptr_bapp, NewValueNode(0)});
AnfNodePtr partial_env_get =
func_graph->NewCNode({NewValueNode(prim::kPrimPartial), NewValueNode(prim::GetPythonOps("env_get")), env});
MetaFuncGraphPtr hyper_map = std::make_shared<HyperMap>();
@ -614,7 +614,7 @@ void GradOperation::doGetGrad(const FuncGraphPtr &func_graph, AnfNodePtr out, An
CNodePtr inputs_bprop = nullptr;
if (get_all_) {
inputs_bprop = func_graph->NewCNode({NewValueNode(kTail), ptrBApp});
inputs_bprop = func_graph->NewCNode({NewValueNode(kTail), ptr_bapp});
}
// Gradients wrt inputs and parameters
@ -636,8 +636,8 @@ void GradOperation::doGetGrad(const FuncGraphPtr &func_graph, AnfNodePtr out, An
}
// Gradients wrt first input.
// ptrBApp returns (EnvInstance(grads wrt params), grads wrt input0, grads wrt input1, ...), so 1 is for first input
func_graph->set_output(func_graph->NewCNode({opsTupleItem, ptrBApp, NewValueNode(1)}));
// ptr_bapp returns (EnvInstance(grads wrt params), grads wrt input0, grads wrt input1, ...), so 1 is for first input
func_graph->set_output(func_graph->NewCNode({opsTupleItem, ptr_bapp, NewValueNode(1)}));
}
// Generate the graph.
@ -657,35 +657,35 @@ FuncGraphPtr GradOperation::GenerateFuncGraph(const AbstractBasePtrList &args_sp
auto real_fn = dyn_cast<FuncGraphAbstractClosure>(fn);
MS_EXCEPTION_IF_NULL(real_fn);
FuncGraphPtr ptrGraph = real_fn->func_graph();
MS_EXCEPTION_IF_NULL(ptrGraph);
TraceManager::DebugTrace(std::make_shared<TraceGradOperation>(ptrGraph->debug_info()));
FuncGraphPtr dfBuilder = std::make_shared<FuncGraph>();
FuncGraphPtr ptr_graph = real_fn->func_graph();
MS_EXCEPTION_IF_NULL(ptr_graph);
TraceManager::DebugTrace(std::make_shared<TraceGradOperation>(ptr_graph->debug_info()));
FuncGraphPtr df_builder = std::make_shared<FuncGraph>();
TraceManager::EndTrace();
auto nparam = ptrGraph->parameters().size();
auto nparam = ptr_graph->parameters().size();
std::ostringstream ss;
ss << "grad{" << nparam << "}";
dfBuilder->set_flag(FUNC_GRAPH_FLAG_CORE, true);
dfBuilder->debug_info()->set_name(ss.str());
ParameterPtr param_graph = dfBuilder->add_parameter();
df_builder->set_flag(FUNC_GRAPH_FLAG_CORE, true);
df_builder->debug_info()->set_name(ss.str());
ParameterPtr param_graph = df_builder->add_parameter();
AnfNodePtr weights = nullptr;
if (get_by_list_) {
weights = dfBuilder->add_parameter();
weights = df_builder->add_parameter();
}
std::vector<AnfNodePtr> inputs;
inputs.push_back(NewValueNode(prim::kPrimJ));
inputs.push_back(param_graph);
auto jf = dfBuilder->NewCNode(inputs);
auto jf = df_builder->NewCNode(inputs);
// df is checked in GetGrad
TraceManager::DebugTrace(std::make_shared<TraceGradOperation>(ptrGraph->debug_info()));
auto df = GetGrad(jf, weights, ptrGraph->parameters());
TraceManager::DebugTrace(std::make_shared<TraceGradOperation>(ptr_graph->debug_info()));
auto df = GetGrad(jf, weights, ptr_graph->parameters());
TraceManager::EndTrace();
dfBuilder->set_output(NewValueNode(df));
df_builder->set_output(NewValueNode(df));
return dfBuilder;
return df_builder;
}
REGISTER_PYBIND_DEFINE(GradOperation_, ([](const py::module *m) {
@ -929,7 +929,7 @@ void GenerateTupleSliceParameter(const AbstractTuplePtr &tuple, const AbstractSl
*step_value = CheckSliceMember(slice->step(), step_default, step_name);
if (*step_value == 0) {
MS_LOG(EXCEPTION) << "TupleSlice require the step value could not be 0, but got 0.";
MS_EXCEPTION(ValueError) << "TupleSlice require the step value could not be 0, but got 0.";
}
if (*step_value < 0) {
@ -941,8 +941,8 @@ void GenerateTupleSliceParameter(const AbstractTuplePtr &tuple, const AbstractSl
*stop_index = CheckSliceMember(slice->stop(), stop_default, stop_name);
if (!CheckIndexInRange(*start_index, -tuple_size, tuple_size - 1) ||
!CheckIndexInRange(*stop_index, -tuple_size - 1, tuple_size)) {
MS_LOG(EXCEPTION) << "TupleSlice the start index " << *start_index << " or end end index " << *stop_index
<< " out of range, tuple size " << tuple_size << ".";
MS_EXCEPTION(ValueError) << "TupleSlice the start index " << *start_index << " or end end index " << *stop_index
<< " out of range, tuple size " << tuple_size << ".";
}
*start_index = GetPositiveIndex(*start_index, tuple_size);


@ -72,10 +72,15 @@ void SetMaxType(TypeId *max_type_id, size_t *max_type_number, const TypeId type_
bool GetTensorOrScalarTypeInfo(AbstractBasePtr arg_value, bool is_write, TypeId *arg_type_id,
TypeId *arg_type = nullptr) {
if (arg_value->isa<abstract::AbstractRef>()) {
if (is_write) {
arg_value = arg_value->cast<abstract::AbstractRefPtr>()->ref_origin();
} else {
arg_value = arg_value->cast<abstract::AbstractRefPtr>()->ref();
auto ref = arg_value->cast<abstract::AbstractRefPtr>();
arg_value = ref->ref();
if (!is_write && ref->need_cast()) {
auto tensor_type = ref->target_type();
*arg_type_id = tensor_type->type_id();
if (arg_type != nullptr) {
*arg_type = kObjectTypeTensorType;
}
return true;
}
}
if (arg_value->isa<abstract::AbstractTensor>()) {
@ -248,6 +253,8 @@ void DoAutoCast(const std::string &func_name, const std::vector<Signature> &sign
if (arg_value->isa<abstract::AbstractTensor>() && arg_type_id == it->second) {
continue;
}
MS_LOG(DEBUG) << "do cast for inputs " << i << " " << (*op_inputs)[i + 1]->ToString() << " " << arg_type_id
<< " to " << it->second;
(*op_inputs)[i + 1] = DoCast((*op_inputs)[i + 1], it->second, graph);
}
}
@ -289,16 +296,23 @@ AnfNodePtr BuildNewCNode(const FuncGraphPtr &func_graph, const std::string &func
TypePtr type = args_spec_list[i]->GetTypeTrack();
if (type && type->type_id() == kObjectTypeRef) {
auto ref_abs = args_spec_list[i]->cast<abstract::AbstractRefPtr>();
if (sig == SignatureEnumRW::kRWRead) {
param = func_graph->NewCNode({NewValueNode(prim::kPrimGetRefValue), param});
param = NewCNode({NewValueNode(prim::kPrimGetRefValue), param}, func_graph);
if (ref_abs && ref_abs->need_cast()) {
auto cast = prim::GetPythonOps("cast", "mindspore.ops.functional");
param = NewCNode({NewValueNode(cast), param, NewValueNode(ref_abs->target_type())}, func_graph);
}
} else if (sig == SignatureEnumRW::kRWWrite) {
param = func_graph->NewCNode({NewValueNode(prim::kPrimGetRefOrigin), param});
param = NewCNode({NewValueNode(prim::kPrimGetRefValue), param}, func_graph);
write_indices.insert(i);
}
// If sig is SignatureEnumRW::kRWRef, not do anything.
} else if (sig == SignatureEnumRW::kRWWrite && type->type_id() != kObjectTypeRefKey) {
MS_EXCEPTION(TypeError) << "Function " << func_name << "'s input " << i << " should be a Parameter.";
}
MS_LOG(DEBUG) << "Function " << func_name << "'s input " << i << " " << param->DebugString(2) << " type "
<< args_spec_list[i]->ToString();
op_inputs.push_back(param);
}
// process default
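
The signature change above makes a read of a cast-tagged Ref report the ref's target dtype (with an actual Cast inserted later in BuildNewCNode), while writes keep the parameter's stored dtype so the persisted value is not silently narrowed. A toy model of that read/write decision, with a plain struct standing in for the AbstractRef API:

#include <cstdio>
#include <string>

// Toy stand-in for AbstractRef: the parameter's dtype plus an optional
// "cast on read" target dtype (e.g. a float32 weight read as float16).
struct Ref {
  std::string dtype;
  bool need_cast;
  std::string target_type;
};

// Mirrors the new branch in GetTensorOrScalarTypeInfo: reads honor the
// cast target; writes (and refs without a cast tag) use the stored dtype.
std::string EffectiveDtype(const Ref &ref, bool is_write) {
  if (!is_write && ref.need_cast) return ref.target_type;
  return ref.dtype;
}

int main() {
  Ref weight{"float32", true, "float16"};
  std::printf("read:  %s\n", EffectiveDtype(weight, false).c_str());  // float16
  std::printf("write: %s\n", EffectiveDtype(weight, true).c_str());   // float32
  return 0;
}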


@ -49,13 +49,14 @@ FuncGraphPtr UnpackCall::GenerateFuncGraph(const AbstractBasePtrList &args_spec_
MS_LOG(EXCEPTION) << op_name << " requires at least two args, but got " << arg_length << ".";
}
(void)abstract::CheckArg<AbstractFunction>(op_name, args_spec_list, 0);
// No need to check here; the check is done during inference.
auto ret_graph = std::make_shared<FuncGraph>();
ret_graph->set_flag(FUNC_GRAPH_FLAG_CORE, true);
ret_graph->debug_info()->set_name("UnpackCall");
AnfNodePtr fnNode = ret_graph->add_parameter();
AnfNodePtr fn_node = ret_graph->add_parameter();
std::vector<AnfNodePtr> elems;
elems.push_back(fnNode);
elems.push_back(fn_node);
for (size_t index = 1; index < arg_length; index++) {
MS_EXCEPTION_IF_NULL(args_spec_list[index]);
if (args_spec_list[index]->isa<AbstractTuple>()) {


@ -31,160 +31,43 @@ ValuePtr GetPythonOps(const std::string &op_name,
const std::string &module_name = "mindspore._extends.parse.standard_method",
bool use_signature = false);
// Arithmetic
inline const PrimitivePtr kPrimScalarAdd = std::make_shared<Primitive>("scalar_add");
inline const PrimitivePtr kPrimScalarSub = std::make_shared<Primitive>("scalar_sub");
inline const PrimitivePtr kPrimScalarMul = std::make_shared<Primitive>("scalar_mul");
inline const PrimitivePtr kPrimScalarDiv = std::make_shared<Primitive>("scalar_div");
inline const PrimitivePtr kPrimScalarFloordiv = std::make_shared<Primitive>("scalar_floordiv");
inline const PrimitivePtr kPrimScalarMod = std::make_shared<Primitive>("scalar_mod");
inline const PrimitivePtr kPrimScalarPow = std::make_shared<Primitive>("scalar_pow");
inline const PrimitivePtr kPrimScalarTrunc = std::make_shared<Primitive>("scalar_trunc");
inline const PrimitivePtr kPrimScalarFloor = std::make_shared<Primitive>("scalar_floor");
inline const PrimitivePtr kPrimScalarUadd = std::make_shared<Primitive>("scalar_uadd");
inline const PrimitivePtr kPrimScalarUsub = std::make_shared<Primitive>("scalar_usub");
inline const PrimitivePtr kPrimScalarExp = std::make_shared<Primitive>("scalar_exp");
inline const PrimitivePtr kPrimScalarLog = std::make_shared<Primitive>("scalar_log");
inline const PrimitivePtr kPrimScalarSin = std::make_shared<Primitive>("scalar_sin");
inline const PrimitivePtr kPrimScalarCos = std::make_shared<Primitive>("scalar_cos");
inline const PrimitivePtr kPrimScalarTan = std::make_shared<Primitive>("scalar_tan");
// Comparisons
inline const PrimitivePtr kPrimScalarEq = std::make_shared<Primitive>("scalar_eq");
inline const PrimitivePtr kPrimScalarLt = std::make_shared<Primitive>("scalar_lt");
inline const PrimitivePtr kPrimScalarGt = std::make_shared<Primitive>("scalar_gt");
inline const PrimitivePtr kPrimScalarNe = std::make_shared<Primitive>("scalar_ne");
inline const PrimitivePtr kPrimScalarLe = std::make_shared<Primitive>("scalar_le");
inline const PrimitivePtr kPrimScalarGe = std::make_shared<Primitive>("scalar_ge");
inline const PrimitivePtr kPrimBoolNot = std::make_shared<Primitive>("bool_not");
inline const PrimitivePtr kPrimBoolAnd = std::make_shared<Primitive>("bool_and");
inline const PrimitivePtr kPrimBoolOr = std::make_shared<Primitive>("bool_or");
inline const PrimitivePtr kPrimBoolEq = std::make_shared<Primitive>("bool_eq");
inline const PrimitivePtr kPrimGreater = std::make_shared<Primitive>("Greater");
inline const PrimitivePtr kPrimGreaterEqual = std::make_shared<Primitive>("GreaterEqual");
inline const PrimitivePtr kPrimLess = std::make_shared<Primitive>("Less");
inline const PrimitivePtr kPrimLessEqual = std::make_shared<Primitive>("LessEqual");
inline const PrimitivePtr kPrimEqual = std::make_shared<Primitive>("Equal");
inline const PrimitivePtr kPrimNotEqual = std::make_shared<Primitive>("NotEqual");
// Primitives only used by frontend;
// Type introspection
inline const PrimitivePtr kPrimTypeOf = std::make_shared<Primitive>("typeof");
inline const PrimitivePtr kPrimHasType = std::make_shared<Primitive>("hastype");
inline const PrimitivePtr kPrimDistribute = std::make_shared<Primitive>("distribute");
inline const PrimitivePtr kPrimDot = std::make_shared<Primitive>("dot");
inline const PrimitivePtr kPrimIm2Col = std::make_shared<Primitive>("im2col");
inline const PrimitivePtr kPrimCol2Im = std::make_shared<Primitive>("col2im");
inline const PrimitivePtr kPrimIm2ColV1 = std::make_shared<Primitive>("im2col_v1");
inline const PrimitivePtr kPrimCol2ImV1 = std::make_shared<Primitive>("col2im_v1");
inline const PrimitivePtr kPrimResolve = std::make_shared<Primitive>("resolve");
inline const PrimitivePtr kPrimEmbed = std::make_shared<Primitive>("embed");
inline const PrimitivePtr kPrimRefToEmbed = std::make_shared<Primitive>("RefToEmbed");
inline const PrimitivePtr kPrimCreateInstance = std::make_shared<Primitive>("create_instance");
inline const PrimitivePtr kPrimLabelGoto = std::make_shared<Primitive>("LabelGoto");
inline const PrimitivePtr kPrimLabelSwitch = std::make_shared<Primitive>("LabelSwitch");
inline const PrimitivePtr kPrimLabelSet = std::make_shared<Primitive>("LabelSet");
// Other miscellaneous
inline const PrimitivePtr kPrimGetRefOrigin = std::make_shared<Primitive>("get_ref_origin");
inline const PrimitivePtr kPrimInsertGradientOf = std::make_shared<Primitive>("InsertGradientOf");
inline const PrimitivePtr kPrimCheckBprop = std::make_shared<Primitive>("CheckBprop");
inline const PrimitivePtr kPrimMixedPrecisionCast = std::make_shared<Primitive>("mixed_precision_cast");
inline const PrimitivePtr kPrimMakeRecord = std::make_shared<Primitive>("make_record");
// Arrays
inline const PrimitivePtr kPrimScalarToArray = std::make_shared<Primitive>("scalar_to_array");
inline const PrimitivePtr kPrimArrayToScalar = std::make_shared<Primitive>("array_to_scalar");
inline const PrimitivePtr kPrimBroadcastShape = std::make_shared<Primitive>("broadcast_shape");
inline const PrimitivePtr kPrimArrayMap = std::make_shared<Primitive>("array_map");
inline const PrimitivePtr kPrimArrayReduce = std::make_shared<Primitive>("array_reduce");
inline const PrimitivePtr kPrimCast = std::make_shared<Primitive>("Cast");
inline const PrimitivePtr kPrimConcat = std::make_shared<Primitive>("Concat");
inline const PrimitivePtr kPrimSqueeze = std::make_shared<Primitive>("Squeeze");
inline const PrimitivePtr kPrimTranspose = std::make_shared<Primitive>("Transpose");
inline const PrimitivePtr kPrimGatherV2 = std::make_shared<Primitive>("GatherV2");
inline const PrimitivePtr kPrimEmbeddingLookup = std::make_shared<Primitive>("EmbeddingLookup");
inline const PrimitivePtr kPrimEmbeddingLookupCommGrad = std::make_shared<Primitive>("EmbeddingLookupCommGrad");
inline const PrimitivePtr kPrimSize = std::make_shared<Primitive>("Size");
inline const PrimitivePtr kPrimArgMax = std::make_shared<Primitive>("Argmax");
inline const PrimitivePtr kPrimPack = std::make_shared<Primitive>("Pack");
inline const PrimitivePtr kPrimUnsortedSegmentSum = std::make_shared<Primitive>("UnsortedSegmentSum");
inline const PrimitivePtr kPrimUnsortedSegmentMin = std::make_shared<Primitive>("UnsortedSegmentMin");
inline const PrimitivePtr kPrimConcatOffset = std::make_shared<Primitive>("ConcatOffset");
inline const PrimitivePtr kPrimReshape = std::make_shared<Primitive>("Reshape");
inline const PrimitivePtr kPrimTile = std::make_shared<Primitive>("Tile");
inline const PrimitivePtr kPrimAddN = std::make_shared<Primitive>("AddN");
inline const PrimitivePtr KPrimTransData = std::make_shared<Primitive>("TransData");
inline const PrimitivePtr kPrimNMSWithMask = std::make_shared<Primitive>("NMSWithMask");
inline const PrimitivePtr kPrimPad = std::make_shared<Primitive>("Pad");
inline const PrimitivePtr kPrimArgMaxWithValue = std::make_shared<Primitive>("ArgMaxWithValue");
inline const PrimitivePtr kPrimUnique = std::make_shared<Primitive>("Unique");
inline const PrimitivePtr kPrimUniqueGrad = std::make_shared<Primitive>("UniqueGrad");
// Structures
inline const PrimitivePtr kPrimListMap = std::make_shared<Primitive>("list_map");
inline const PrimitivePtr kPrimListReduce = std::make_shared<Primitive>("list_reduce");
inline const PrimitivePtr kPrimTupleReversed = std::make_shared<Primitive>("tuple_reversed");
inline const PrimitivePtr kPrimReducedShape = std::make_shared<Primitive>("reduced_shape");
inline const PrimitivePtr kPrimTupleDiv = std::make_shared<Primitive>("tuple_div");
inline const PrimitivePtr kPrimTupleToArray = std::make_shared<Primitive>("tuple_to_array");
inline const PrimitivePtr kPrimShapeMul = std::make_shared<Primitive>("shape_mul");
inline const PrimitivePtr kPrimTupleEqual = std::make_shared<Primitive>("tuple_equal");
inline const PrimitivePtr kPrimListEqual = std::make_shared<Primitive>("list_equal");
inline const PrimitivePtr kPrimMakeRange = std::make_shared<Primitive>("make_range");
inline const PrimitivePtr kPrimStopGradient = std::make_shared<Primitive>("stop_gradient");
inline const PrimitivePtr kPrimStringEqual = std::make_shared<Primitive>("string_equal");
inline const PrimitivePtr kPrimStringConcat = std::make_shared<Primitive>("string_concat");
inline const PrimitivePtr kPrimDictLen = std::make_shared<Primitive>("dict_len");
// NN
inline const PrimitivePtr kPrimFlatten = std::make_shared<Primitive>("Flatten");
inline const PrimitivePtr kPrimSoftmax = std::make_shared<Primitive>("Softmax");
inline const PrimitivePtr kPrimLogSoftmax = std::make_shared<Primitive>("LogSoftmax");
inline const PrimitivePtr kPrimLogSoftmaxGrad = std::make_shared<Primitive>("LogSoftmaxGrad");
inline const PrimitivePtr kPrimTanh = std::make_shared<Primitive>("Tanh");
inline const PrimitivePtr kPrimTanhGrad = std::make_shared<Primitive>("TanhGrad");
inline const PrimitivePtr kPrimPooling = std::make_shared<Primitive>("Pooling");
inline const PrimitivePtr kPrimPoolingGrad = std::make_shared<Primitive>("PoolingGrad");
inline const PrimitivePtr kPrimMaxPool = std::make_shared<Primitive>("MaxPool");
inline const PrimitivePtr kPrimMaxPoolGrad = std::make_shared<Primitive>("MaxPoolGrad");
inline const PrimitivePtr kPrimApplyCenteredRMSProp = std::make_shared<Primitive>("ApplyCenteredRMSProp");
inline const PrimitivePtr kPrimAvgPoolGrad = std::make_shared<Primitive>("AvgPoolGrad");
inline const PrimitivePtr kPrimAvgPoolGradVm = std::make_shared<Primitive>("AvgPoolGradVm");
inline const PrimitivePtr kPrimFusedBatchNorm = std::make_shared<Primitive>("FusedBatchNorm");
inline const PrimitivePtr kPrimConv2D = std::make_shared<Primitive>("Conv2D");
inline const PrimitivePtr kPrimFusedBatchNormGrad = std::make_shared<Primitive>("FusedBatchNormGrad");
inline const PrimitivePtr kPrimBatchNorm = std::make_shared<Primitive>("BatchNorm");
inline const PrimitivePtr kPrimBatchNormGrad = std::make_shared<Primitive>("BatchNormGrad");
inline const PrimitivePtr kPrimReluGrad = std::make_shared<Primitive>("ReluGrad");
inline const PrimitivePtr kPrimConv2DBackpropInput = std::make_shared<Primitive>("Conv2DBackpropInput");
inline const PrimitivePtr kPrimConv2DBackpropFilter = std::make_shared<Primitive>("Conv2DBackpropFilter");
inline const PrimitivePtr kPrimDepthwiseConv2dNative = std::make_shared<Primitive>("DepthwiseConv2dNative");
inline const PrimitivePtr kPrimDepthwiseConv2dNativeBackpropFilter =
std::make_shared<Primitive>("DepthwiseConv2dNativeBackpropFilter");
inline const PrimitivePtr kPrimDepthwiseConv2dNativeBackpropInput =
std::make_shared<Primitive>("DepthwiseConv2dNativeBackpropInput");
inline const PrimitivePtr kPrimBiasAddGrad = std::make_shared<Primitive>("BiasAddGrad");
inline const PrimitivePtr kPrimSoftmaxCrossEntropyWithLogits =
std::make_shared<Primitive>("SoftmaxCrossEntropyWithLogits");
inline const PrimitivePtr kPrimSparseSoftmaxCrossEntropyWithLogits =
std::make_shared<Primitive>("SparseSoftmaxCrossEntropyWithLogits");
inline const PrimitivePtr kPrimMomentum = std::make_shared<Primitive>("Momentum");
inline const PrimitivePtr kPrimApplyMomentum = std::make_shared<Primitive>("ApplyMomentum");
inline const PrimitivePtr kPrimLayerNorm = std::make_shared<Primitive>("LayerNorm");
inline const PrimitivePtr kPrimLayerNormGrad = std::make_shared<Primitive>("LayerNormGrad");
inline const PrimitivePtr kPrimLayerNormXBackprop = std::make_shared<Primitive>("LayerNormXBackprop");
inline const PrimitivePtr kPrimLayerNormBetaGammaBackprop = std::make_shared<Primitive>("LayerNormBetaGammaBackprop");
inline const PrimitivePtr kPrimDropoutGenMask = std::make_shared<Primitive>("DropoutGenMask");
inline const PrimitivePtr kPrimDropoutDoMask = std::make_shared<Primitive>("DropoutDoMask");
inline const PrimitivePtr kPrimOneHot = std::make_shared<Primitive>("OneHot");
inline const PrimitivePtr kPrimGelu = std::make_shared<Primitive>("Gelu");
inline const PrimitivePtr kPrimGeluGrad = std::make_shared<Primitive>("GeluGrad");
inline const PrimitivePtr kPrimRelu = std::make_shared<Primitive>("ReLU");
inline const PrimitivePtr kPrimReluV2 = std::make_shared<Primitive>("ReLUV2");
inline const PrimitivePtr kPrimZerosLike = std::make_shared<Primitive>("ZerosLike");
inline const PrimitivePtr kPrimFakeBprop = std::make_shared<Primitive>("fake_bprop");
inline const PrimitivePtr kPrimBpropCut = std::make_shared<Primitive>("bprop_cut");
inline const PrimitivePtr kPrimFakeQuantPerLayer = std::make_shared<Primitive>("FakeQuantPerLayer");
inline const PrimitivePtr kPrimFakeQuantPerChannel = std::make_shared<Primitive>("FakeQuantPerChannel");
inline const PrimitivePtr kPrimApplyRMSProp = std::make_shared<Primitive>("ApplyRMSProp");
// Comm ops
inline const PrimitivePtr kPrimMirror = std::make_shared<Primitive>("_MirrorOperator");
inline const PrimitivePtr kPrimVirtualDiv = std::make_shared<Primitive>("_VirtualDiv");
inline const PrimitivePtr kPrimVirtualDataset = std::make_shared<Primitive>("_VirtualDataset");
inline const PrimitivePtr kPrimAllReduce = std::make_shared<Primitive>("AllReduce");
// RowTensor
inline const PrimitivePtr kPrimMakeRowTensor = std::make_shared<Primitive>("MakeRowTensor");
inline const PrimitivePtr kPrimRowTensorGetValues = std::make_shared<Primitive>("RowTensorGetValues");
inline const PrimitivePtr kPrimRowTensorGetIndices = std::make_shared<Primitive>("RowTensorGetIndices");
inline const PrimitivePtr kPrimRowTensorGetDenseShape = std::make_shared<Primitive>("RowTensorGetDenseShape");
// SparseTensor
inline const PrimitivePtr kPrimMakeSparseTensor = std::make_shared<Primitive>("MakeSparseTensor");
inline const PrimitivePtr kPrimSparseTensorGetValues = std::make_shared<Primitive>("SparseTensorGetValues");
inline const PrimitivePtr kPrimSparseTensorGetIndices = std::make_shared<Primitive>("SparseTensorGetIndices");
inline const PrimitivePtr kPrimSparseTensorGetDenseShape = std::make_shared<Primitive>("SparseTensorGetDenseShape");
inline const PrimitivePtr kPrimBroadcastGradientArgs = std::make_shared<Primitive>("BroadcastGradientArgs");
class UnpackGraphPrimitive : public Primitive {
public:
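
The block above turns the frontend primitive declarations into C++17 inline variables defined directly in the header, so every translation unit that includes it shares one Primitive instance without needing an out-of-line definition. A minimal illustration of the idiom, using a stand-in Primitive type:

#include <cstdio>
#include <memory>
#include <string>

// Stand-in for mindspore::Primitive, just enough to show the idiom.
struct Primitive {
  explicit Primitive(std::string n) : name(std::move(n)) {}
  std::string name;
};
using PrimitivePtr = std::shared_ptr<Primitive>;

// C++17 inline variable: one shared definition across all translation
// units that include this header, no separate .cc definition required.
inline const PrimitivePtr kPrimScalarAdd = std::make_shared<Primitive>("scalar_add");

int main() {
  std::printf("%s\n", kPrimScalarAdd->name.c_str());
  return 0;
}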


@ -1,6 +1,4 @@
/**
* This is the C++ adaptation and derivative work of Myia (https://github.com/mila-iqia/myia/).
*
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
@ -15,360 +13,266 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "frontend/operator/ops_front_infer_function.h"
#include <set>
#include <string>
#include <vector>
#include <memory>
#include <algorithm>
#include "abstract/abstract_value.h"
#include "pipeline/jit/static_analysis/prim.h"
#include "abstract/utils.h"
#include "abstract/param_validator.h"
#include "frontend/operator/ops.h"
#include "utils/convert_utils.h"
#include "utils/tensor_py.h"
using mindspore::tensor::TensorPy;
#include "frontend/operator/ops.h"
#include "abstract/infer_functions.h"
namespace mindspore {
namespace abstract {
enum State {
SAME,
X_ONE,
Y_ONE,
};
AbstractBasePtr InferImplStringEqual(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
// Inputs: two scalars whose value is a string.
const std::string op_name = primitive->name();
struct SlideInfo {
int start;
int step;
int stop;
};
template <typename T>
AbstractBasePtr InferImplTupleOrListEqual(const std::string &op_name, const AbstractBasePtrList &args_spec_list) {
// Inputs: two tuples or two lists.
CheckArgsSize(op_name, args_spec_list, 2);
AbstractScalarPtr scalar_x = CheckArg<AbstractScalar>(op_name, args_spec_list, 0);
AbstractScalarPtr scalar_y = CheckArg<AbstractScalar>(op_name, args_spec_list, 1);
auto input_x = CheckArg<T>(op_name, args_spec_list, 0);
auto input_y = CheckArg<T>(op_name, args_spec_list, 1);
ValuePtr value_x = scalar_x->BuildValue();
ValuePtr value_y = scalar_y->BuildValue();
if (!value_x->isa<StringImm>() || !value_y->isa<StringImm>()) {
MS_LOG(EXCEPTION) << op_name << " requires 2 string parameters, but got param0: " << value_x->ToString()
<< ", param1: " << value_y->ToString();
}
bool ret = (value_x->cast<StringImmPtr>()->value() == value_y->cast<StringImmPtr>()->value());
return std::make_shared<AbstractScalar>(ret);
ValuePtr x_value = input_x->BuildValue();
ValuePtr y_value = input_y->BuildValue();
return std::make_shared<AbstractScalar>(*x_value == *y_value);
}
AbstractBasePtr InferImplStringConcat(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
// Inputs: two scalars whose value is a string.
const std::string op_name = primitive->name();
CheckArgsSize(op_name, args_spec_list, 2);
AbstractScalarPtr scalar_x = CheckArg<AbstractScalar>(op_name, args_spec_list, 0);
AbstractScalarPtr scalar_y = CheckArg<AbstractScalar>(op_name, args_spec_list, 1);
ValuePtr value_x = scalar_x->BuildValue();
ValuePtr value_y = scalar_y->BuildValue();
if (!value_x->isa<StringImm>() || !value_y->isa<StringImm>()) {
MS_LOG(EXCEPTION) << op_name << " requires 2 string parameters, but got param0: " << value_x->ToString()
<< ", param1: " << value_y->ToString();
}
std::string ret = (value_x->cast<StringImmPtr>()->value() + value_y->cast<StringImmPtr>()->value());
return std::make_shared<AbstractScalar>(ret);
}
AbstractBasePtr InferImplMakeTuple(const AnalysisEnginePtr &, const PrimitivePtr &,
const AbstractBasePtrList &args_spec_list) {
return std::make_shared<AbstractTuple>(args_spec_list);
}
AbstractBasePtr InferImplMakeList(const AnalysisEnginePtr &, const PrimitivePtr &,
const AbstractBasePtrList &args_spec_list) {
return std::make_shared<AbstractList>(args_spec_list);
}
AbstractBasePtr InferImplMakeDict(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
// Inputs: two tuples.
const std::string op_name = primitive->name();
CheckArgsSize(op_name, args_spec_list, 2);
AbstractTuplePtr keys = CheckArg<AbstractTuple>(op_name, args_spec_list, 0);
AbstractTuplePtr values = CheckArg<AbstractTuple>(op_name, args_spec_list, 1);
size_t keys_size = keys->size();
if (values->size() != keys_size) {
MS_LOG(EXCEPTION) << op_name << " evaluator keys' size is not equal to values' size";
}
std::vector<AbstractAttribute> key_value;
AbstractScalarPtr key;
AbstractBasePtrList key_list = keys->elements();
AbstractBasePtrList value_list = values->elements();
for (size_t index = 0; index < keys_size; index++) {
key = CheckArg<AbstractScalar>(op_name + "key", key_list, index);
ValuePtr keyPtr = key->BuildValue();
MS_EXCEPTION_IF_NULL(keyPtr);
if (!keyPtr->isa<StringImm>()) {
MS_LOG(EXCEPTION) << op_name << " evaluator keys should be strings, but got " << keyPtr->ToString();
void CalcSlidePara(const AbstractBasePtrList &args_spec_list, SlideInfo *slide) {
int arg1 = 0;
int arg2 = 0;
if (!args_spec_list.empty()) {
MS_EXCEPTION_IF_NULL(args_spec_list[0]);
auto arg_value = args_spec_list[0]->BuildValue();
if (!arg_value->isa<Int32Imm>()) {
MS_LOG(EXCEPTION) << "Only supported input an int32 number.";
}
std::string key_string = GetValue<std::string>(keyPtr);
key_value.emplace_back(key_string, value_list[index]);
arg1 = GetValue<int>(arg_value);
}
return std::make_shared<AbstractDictionary>(key_value);
}
AbstractBasePtr InferImplMakeKwarg(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
// Inputs: a string and an object of a subclass of AbstractBase.
const std::string op_name = primitive->name();
CheckArgsSize(op_name, args_spec_list, 2);
AbstractScalarPtr key = CheckArg<AbstractScalar>(op_name, args_spec_list, 0);
ValuePtr keyPtr = key->BuildValue();
if (!keyPtr->isa<StringImm>()) {
MS_LOG(EXCEPTION) << op_name << " evaluator key should be a string, but got " << keyPtr->ToString();
}
std::string key_string = GetValue<std::string>(keyPtr);
return std::make_shared<AbstractKeywordArg>(key_string, args_spec_list[1]);
}
AbstractBasePtr InferImplExtractKwarg(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
// Inputs: a string and a keyword.
const std::string op_name = primitive->name();
CheckArgsSize(op_name, args_spec_list, 2);
AbstractScalarPtr key = CheckArg<AbstractScalar>(op_name, args_spec_list, 0);
AbstractKeywordArgPtr kwarg = CheckArg<AbstractKeywordArg>(op_name, args_spec_list, 1);
ValuePtr key_value = key->BuildValue();
if (!key_value->isa<StringImm>()) {
MS_LOG(EXCEPTION) << op_name << " evaluator key should be a string, but got " << key_value->ToString();
}
std::string key_input = GetValue<std::string>(key_value);
std::string key_actual = kwarg->get_key();
if (key_actual != key_input) {
MS_LOG(EXCEPTION) << op_name << " evaluator input key should be the same as the AbstractKeywordArg's key, but the input is "
<< key_input << " and the AbstractKeywordArg's key is " << key_actual;
}
return kwarg->get_arg();
}
AbstractBasePtr InferImplMakeSlice(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
// Inputs: three scalars whose value is an int32 number.
CheckArgsSize(primitive->name(), args_spec_list, 3);
size_t args_size = args_spec_list.size();
for (size_t index = 0; index < args_size; index++) {
MS_EXCEPTION_IF_NULL(args_spec_list[index]);
if (!args_spec_list[index]->isa<AbstractScalar>() && !args_spec_list[index]->isa<AbstractNone>()) {
MS_EXCEPTION(TypeError) << "MakeSlice eval " << index << " parameter is neither AbstractScalar nor AbstractNone.";
}
if (args_spec_list[index]->isa<AbstractScalar>() &&
!dyn_cast<AbstractScalar>(args_spec_list[index])->BuildValue()->isa<Int32Imm>()) {
MS_EXCEPTION(TypeError) << "MakeSlice eval " << index
<< " parameter is an AbstractScalar, but is not an int32 number.";
}
}
// Slice: start, end, step
return std::make_shared<AbstractSlice>(args_spec_list[0], args_spec_list[1], args_spec_list[2]);
}
void ComputeReduceIndex(const std::vector<int> &reverse_x, const std::vector<int> &reverse_y,
std::vector<int> *grad_x_reduce_idx, std::vector<int> *grad_y_reduce_idy) {
const size_t n = reverse_x.size();
for (size_t i = 0; i < n; ++i) {
State curr;
const int32_t x_i = reverse_x[i];
const int32_t y_i = reverse_y[i];
const int reduce_idx = SizeToInt(n - 1 - i);
if (x_i == y_i) {
curr = SAME;
} else if (x_i == 1) {
grad_x_reduce_idx->push_back(reduce_idx);
curr = X_ONE;
} else if (y_i == 1) {
grad_y_reduce_idy->push_back(reduce_idx);
curr = Y_ONE;
} else {
MS_LOG(EXCEPTION) << "not compatible shape input for BroadcastGradientArgs";
}
if (curr == SAME && x_i == 1) {
grad_x_reduce_idx->push_back(reduce_idx);
grad_y_reduce_idy->push_back(reduce_idx);
continue;
}
}
std::reverse(grad_x_reduce_idx->begin(), grad_x_reduce_idx->end());
std::reverse(grad_y_reduce_idy->begin(), grad_y_reduce_idy->end());
}
AbstractBasePtr BroadcastGradientArgsDiff(const std::vector<ValuePtr> &x_shape, const std::vector<ValuePtr> &y_shape) {
std::vector<int> reverse_x;
std::vector<int> reverse_y;
(void)std::transform(x_shape.rbegin(), x_shape.rend(), std::back_inserter(reverse_x),
[](const ValuePtr &v) { return v->cast<Int32ImmPtr>()->value(); });
(void)std::transform(y_shape.rbegin(), y_shape.rend(), std::back_inserter(reverse_y),
[](const ValuePtr &v) { return v->cast<Int32ImmPtr>()->value(); });
if (reverse_x.size() > reverse_y.size()) {
reverse_y.resize(reverse_x.size(), 1);
} else {
reverse_x.resize(reverse_y.size(), 1);
}
std::vector<int> grad_x_reduce_idx;
std::vector<int> grad_y_reduce_idy;
ComputeReduceIndex(reverse_x, reverse_y, &grad_x_reduce_idx, &grad_y_reduce_idy);
AbstractBasePtrList abs_list_x;
AbstractBasePtrList abs_list_y;
(void)std::transform(grad_x_reduce_idx.begin(), grad_x_reduce_idx.end(), std::back_inserter(abs_list_x),
[](int v) { return abstract::FromValue(v); });
(void)std::transform(grad_y_reduce_idy.begin(), grad_y_reduce_idy.end(), std::back_inserter(abs_list_y),
[](int v) { return abstract::FromValue(v); });
auto x_reduce_idx = std::make_shared<AbstractTuple>(abs_list_x);
auto y_reduce_idx = std::make_shared<AbstractTuple>(abs_list_y);
AbstractBasePtrList elem_list;
elem_list.push_back(x_reduce_idx);
elem_list.push_back(y_reduce_idx);
return std::make_shared<AbstractTuple>(elem_list);
}
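To make the broadcast-gradient rule above concrete, here is a minimal standalone sketch (an editor's illustration, not part of this commit; ReduceIdx is a hypothetical name) that mirrors the logic of ComputeReduceIndex and BroadcastGradientArgsDiff on plain ints:
#include <algorithm>
#include <cstdio>
#include <stdexcept>
#include <utility>
#include <vector>

// Align shapes from the trailing dimension, then record, for each side, the
// output axes along which its gradient must be summed.
std::pair<std::vector<int>, std::vector<int>> ReduceIdx(const std::vector<int> &x, const std::vector<int> &y) {
  std::vector<int> rx(x.rbegin(), x.rend());
  std::vector<int> ry(y.rbegin(), y.rend());
  const size_t n = std::max(rx.size(), ry.size());
  rx.resize(n, 1);
  ry.resize(n, 1);
  std::vector<int> gx, gy;
  for (size_t i = 0; i < n; ++i) {
    const int idx = static_cast<int>(n - 1 - i);
    if (rx[i] == ry[i]) {
      if (rx[i] == 1) {  // both dims are 1: reduce on both sides (the SAME && x_i == 1 case above)
        gx.push_back(idx);
        gy.push_back(idx);
      }
    } else if (rx[i] == 1) {
      gx.push_back(idx);  // x was broadcast along this axis
    } else if (ry[i] == 1) {
      gy.push_back(idx);  // y was broadcast along this axis
    } else {
      throw std::invalid_argument("not compatible shape input");
    }
  }
  std::reverse(gx.begin(), gx.end());
  std::reverse(gy.begin(), gy.end());
  return {gx, gy};
}

int main() {
  // x: (8, 1, 6) and y: (6) broadcast to (8, 1, 6);
  // dx reduces axis 1, dy reduces axes 0 and 1.
  auto [gx, gy] = ReduceIdx({8, 1, 6}, {6});
  for (int i : gx) std::printf("gx: %d\n", i);  // prints 1
  for (int i : gy) std::printf("gy: %d\n", i);  // prints 0, 1
  return 0;
}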
AbstractBasePtr InferImplTypeof(const AnalysisEnginePtr &, const PrimitivePtr &,
const AbstractBasePtrList &args_spec_list) {
// Inputs: a pointer to an AbstractBase object
if (args_spec_list.size() != 1) {
MS_LOG(EXCEPTION) << "Typeof evaluator requires 1 parameter, while the input size is " << args_spec_list.size()
<< ".";
}
AbstractBasePtr abs_base = args_spec_list[0];
MS_EXCEPTION_IF_NULL(abs_base);
TypePtr type = abs_base->BuildType();
return std::make_shared<AbstractType>(type);
}
AbstractBasePtr InferImplHasType(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
// Inputs: a pointer to an AbstractBase object and a pointer to a Type
const std::string op_name = primitive->name();
CheckArgsSize(op_name, args_spec_list, 2);
AbstractTypePtr abs_type = CheckArg<AbstractType>(op_name, args_spec_list, 1);
auto mode_v = abs_type->GetValueTrack();
MS_EXCEPTION_IF_NULL(mode_v);
if (!mode_v->isa<Type>()) {
MS_LOG(EXCEPTION) << "Get the type from AbstractType value failed.";
}
TypePtr mode_t = mode_v->cast<TypePtr>();
MS_EXCEPTION_IF_NULL(args_spec_list[0]);
bool v = IsSubtype(args_spec_list[0], mode_t);
return std::make_shared<AbstractScalar>(std::make_shared<BoolImm>(v), kBool);
}
template <typename T>
AbstractBasePtr InferTupleOrListGetItem(const std::string &op_name, const AbstractBasePtrList &args_spec_list) {
// Inputs: a tuple or list and a scalar whose value is an int32 number.
CheckArgsSize(op_name, args_spec_list, 2);
auto queue = CheckArg<T>(op_name, args_spec_list, 0);
AbstractScalarPtr index = CheckArg<AbstractScalar>(op_name, args_spec_list, 1);
ValuePtr index_value = index->BuildValue();
if (!index_value->isa<Int32Imm>()) {
// when index_value is an AnyValue and args_spec_list[0] is a scalar, try to return the type of the first element
// and continue
if (dyn_cast<AbstractScalar>(queue->elements()[0]) != nullptr) {
return std::make_shared<AbstractScalar>(queue->elements()[0]->BuildType());
}
MS_EXCEPTION(IndexError) << op_name << " evaluator index should be an int32 number, but got "
<< index_value->ToString();
}
int idx_v = GetValue<int>(index_value);
std::size_t nelems = queue->elements().size();
if (idx_v >= SizeToInt(nelems) || idx_v < -SizeToInt(nelems)) {
MS_EXCEPTION(IndexError) << op_name << " evaluator index should be in range[-" << SizeToInt(nelems) << ", "
<< SizeToInt(nelems) << "), but got " << idx_v << ".";
}
std::size_t uidx_v = 0;
if (idx_v >= 0) {
uidx_v = IntToSize(idx_v);
} else {
uidx_v = IntToSize(idx_v + SizeToInt(nelems));
}
return queue->elements()[uidx_v];
}
bool CompareShape(const std::vector<ValuePtr> &x_shape, const std::vector<ValuePtr> &y_shape) {
if (x_shape.size() != y_shape.size()) {
return false;
}
for (size_t i = 0; i < x_shape.size(); ++i) {
if (GetValue<int>(x_shape[i]) != GetValue<int>(y_shape[i])) {
return false;
}
}
return true;
}
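The wrap-around in InferTupleOrListGetItem follows Python indexing; a minimal sketch of just that check (editor's illustration with a made-up helper name):
#include <cstddef>
#include <stdexcept>

// Valid indices lie in [-n, n); a negative index i addresses element i + n.
std::size_t NormalizeIndex(int idx, std::size_t nelems) {
  const int n = static_cast<int>(nelems);
  if (idx >= n || idx < -n) {
    throw std::out_of_range("index should be in range [-n, n)");
  }
  return static_cast<std::size_t>(idx >= 0 ? idx : idx + n);
}
// NormalizeIndex(-1, 4) == 3; NormalizeIndex(2, 4) == 2.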
template <typename T>
AbstractBasePtr InferTupleOrListSetItem(const std::string &op_name, const AbstractBasePtrList &args_spec_list) {
// Inputs: a tuple or list, a scalar whose value is an int32 number and an object of a subclass of AbstractBase.
CheckArgsSize(op_name, args_spec_list, 3);
auto queue = CheckArg<T>(op_name, args_spec_list, 0);
AbstractScalarPtr index = CheckArg<AbstractScalar>(op_name, args_spec_list, 1);
ValuePtr index_value = index->BuildValue();
if (!index_value->isa<Int32Imm>()) {
MS_EXCEPTION(IndexError) << op_name << " evaluator index should be an int32 number, but got "
<< index_value->ToString();
}
int idx_v = GetValue<int>(index_value);
if (idx_v < 0) {
MS_EXCEPTION(IndexError) << "The index of " << typeid(T).name() << " should be positive number, but got " << idx_v
<< ".";
}
size_t uidx_v = IntToSize(idx_v);
AbstractBasePtrList elements = queue->elements();
std::size_t nelems = elements.size();
if (uidx_v >= nelems) {
MS_EXCEPTION(IndexError) << op_name << " evaluator the index: " << uidx_v << " to set out of range: " << nelems - 1
<< ".";
}
elements[uidx_v] = args_spec_list[2];
return std::make_shared<T>(elements);
}
AbstractBasePtr InferImplTupleGetItem(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
return InferTupleOrListGetItem<AbstractTuple>(primitive->name(), args_spec_list);
}
AbstractBasePtr InferImplListGetItem(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
return InferTupleOrListGetItem<AbstractList>(primitive->name(), args_spec_list);
}
AbstractBasePtr InferImplTupleSetItem(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
return InferTupleOrListSetItem<AbstractTuple>(primitive->name(), args_spec_list);
}
AbstractBasePtr InferImplListSetItem(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
return InferTupleOrListSetItem<AbstractList>(primitive->name(), args_spec_list);
}
AbstractBasePtr InferImplDictGetItem(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
// Inputs: a dict and a scalar whose value is a string.
const std::string op_name = primitive->name();
CheckArgsSize(op_name, args_spec_list, 2);
AbstractDictionaryPtr dict = CheckArg<AbstractDictionary>(op_name, args_spec_list, 0);
AbstractScalarPtr key = CheckArg<AbstractScalar>(op_name, args_spec_list, 1);
ValuePtr key_value = key->BuildValue();
if (!key_value->isa<StringImm>()) {
MS_LOG(EXCEPTION) << op_name << " evaluator key should be string, but got " << key_value->ToString();
}
auto key_str = GetValue<std::string>(key_value);
std::vector<AbstractAttribute> dict_elems = dict->elements();
auto it = std::find_if(dict_elems.begin(), dict_elems.end(),
[key_str](const AbstractAttribute &item) { return item.first == key_str; });
if (it == dict_elems.end()) {
MS_LOG(EXCEPTION) << "The key " << key_str << " does not exist in the dict:" << args_spec_list[0]->ToString();
}
return it->second;
}
AbstractBasePtr InferImplBroadcastGradientArgs(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
// This primitive computes the axes along which each input's gradient must be reduced.
// Inputs: x's shape and y's shape; both inputs should be tuples.
// Output: a tuple of x's and y's reduce indices; each reduce index is itself a tuple.
const std::string op_name = primitive->name();
CheckArgsSize(op_name, args_spec_list, 2);
auto arg_x = CheckArg<AbstractTuple>(op_name, args_spec_list, 0);
auto arg_y = CheckArg<AbstractTuple>(op_name, args_spec_list, 1);
ValueTuplePtr arg_x_value = arg_x->BuildValue()->cast<ValueTuplePtr>();
MS_EXCEPTION_IF_NULL(arg_x_value);
ValueTuplePtr arg_y_value = arg_y->BuildValue()->cast<ValueTuplePtr>();
MS_EXCEPTION_IF_NULL(arg_y_value);
const std::vector<ValuePtr> x_shape = arg_x_value->value();
const std::vector<ValuePtr> y_shape = arg_y_value->value();
bool is_same_shape = CompareShape(x_shape, y_shape);
// If the shapes are identical, no reduction is needed; return a pair of empty tuples.
if (is_same_shape) {
AbstractBasePtrList empty_list;
auto x_reduce_idx = std::make_shared<AbstractTuple>(empty_list);
auto y_reduce_idx = std::make_shared<AbstractTuple>(empty_list);
AbstractBasePtrList elem_list;
elem_list.push_back(x_reduce_idx);
elem_list.push_back(y_reduce_idx);
return std::make_shared<AbstractTuple>(elem_list);
}
return BroadcastGradientArgsDiff(x_shape, y_shape);
}
AbstractBasePtr InferImplDictSetItem(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
// Inputs: a dict and a scalar whose value is a string and an object of a subclass of AbstractBase.
const std::string op_name = primitive->name();
CheckArgsSize(op_name, args_spec_list, 3);
AbstractDictionaryPtr dict = CheckArg<AbstractDictionary>(op_name, args_spec_list, 0);
AbstractScalarPtr key = CheckArg<AbstractScalar>(op_name, args_spec_list, 1);
ValuePtr key_value = key->BuildValue();
if (!key_value->isa<StringImm>()) {
MS_LOG(EXCEPTION) << op_name << " evaluator key should be string, but got " << key_value->ToString();
}
std::string key_str = GetValue<std::string>(key_value);
std::vector<AbstractAttribute> dict_elems = dict->elements();
auto it = std::find_if(dict_elems.begin(), dict_elems.end(),
[key_str](AbstractAttribute &item) { return item.first == key_str; });
MS_EXCEPTION_IF_NULL(args_spec_list[2]);
auto new_ele = std::make_pair(key_str, args_spec_list[2]);
if (it != dict_elems.end()) {
int index = it - dict_elems.begin();
dict_elems[IntToSize(index)] = new_ele;
} else {
dict_elems.push_back(new_ele);
}
return std::make_shared<AbstractDictionary>(dict_elems);
}
AbstractBasePtr InferImplListAppend(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
// Inputs: a list and an object of a subclass of AbstractBase.
const std::string op_name = primitive->name();
CheckArgsSize(op_name, args_spec_list, 2);
AbstractListPtr list = CheckArg<AbstractList>(op_name, args_spec_list, 0);
(void)AbstractJoin(list->elements());
return list;
}
template <typename T>
AbstractBasePtr InferTupleOrListOrDictLen(const std::string &op_name, const AbstractBasePtrList &args_spec_list) {
// Inputs: a tuple or list or dict.
CheckArgsSize(op_name, args_spec_list, 1);
auto arg = CheckArg<T>(op_name, args_spec_list, 0);
return std::make_shared<AbstractScalar>(SizeToInt(arg->size()));
}
AbstractBasePtr InferImplTupleLen(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
return InferTupleOrListOrDictLen<AbstractTuple>(primitive->name(), args_spec_list);
}
AbstractBasePtr InferImplListLen(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
return InferTupleOrListOrDictLen<AbstractList>(primitive->name(), args_spec_list);
}
AbstractBasePtr InferImplDictLen(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
return InferTupleOrListOrDictLen<AbstractDictionary>(primitive->name(), args_spec_list);
}
AbstractBasePtr InferImplArrayLen(const AnalysisEnginePtr &, const PrimitivePtr &,
const AbstractBasePtrList &args_spec_list) {
return std::make_shared<AbstractScalar>(kAnyValue, kInt32);
}
AbstractBasePtr InferImplListMap(const AnalysisEnginePtr &engine, const PrimitivePtr &primitive,
@ -430,41 +334,6 @@ AbstractBasePtr InferImplTupleReversed(const AnalysisEnginePtr &, const Primitiv
return std::make_shared<AbstractTuple>(elem_list);
}
AbstractBasePtr DoInferReduceShape(const AbstractTuplePtr &x_shape, const ValuePtr &x_shp_value,
const ValueTuplePtr &axis_value_ptr, const PrimitivePtr &primitive) {
size_t x_rank = x_shape->size();
std::set<int> axis_set;
auto axis_data = axis_value_ptr->value();
if (axis_data.empty()) {
int size = 1;
AbstractBasePtrList values(x_rank, std::make_shared<AbstractScalar>(size));
return std::make_shared<AbstractTuple>(values);
}
for (auto &elem : axis_data) {
int e_value = CheckAxis(primitive->name(), elem, -SizeToInt(x_rank), SizeToInt(x_rank) - 1);
(void)axis_set.insert(e_value);
}
auto x_shp_data = x_shp_value->cast<ValueTuplePtr>()->value();
if (x_shp_data.size() < x_rank) {
MS_LOG(EXCEPTION) << "x_shape_data.size() " << x_shp_data.size() << " less than x_shape.size() " << x_rank;
}
AbstractBasePtrList values;
for (size_t i = 0; i < x_rank; i++) {
if (axis_set.count(SizeToInt(i)) || axis_set.count(SizeToInt(i) - SizeToInt(x_rank))) {
auto axis_v = MakeValue(1);
values.push_back(std::make_shared<AbstractScalar>(axis_v, axis_v->type()));
} else {
int dim_value = x_shp_data[i]->cast<Int32ImmPtr>()->value();
auto dim = MakeValue(dim_value);
values.push_back(std::make_shared<AbstractScalar>(dim, dim->type()));
}
}
return std::make_shared<AbstractTuple>(values);
}
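A plain-int sketch of the shape rule DoInferReduceShape implements (editor's illustration; ReducedShape is a made-up helper): reduced axes collapse to 1, an axis may be given negatively, and an empty axis tuple reduces every dimension.
#include <set>
#include <vector>

std::vector<int> ReducedShape(const std::vector<int> &shape, const std::set<int> &axes) {
  const int rank = static_cast<int>(shape.size());
  std::vector<int> out;
  for (int i = 0; i < rank; ++i) {
    // An axis can be named either as i or as i - rank (negative form).
    const bool reduced = axes.empty() || axes.count(i) > 0 || axes.count(i - rank) > 0;
    out.push_back(reduced ? 1 : shape[i]);
  }
  return out;
}
// ReducedShape({2, 3, 4}, {1})  == {2, 1, 4}
// ReducedShape({2, 3, 4}, {-1}) == {2, 3, 1}
// ReducedShape({2, 3, 4}, {})   == {1, 1, 1}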
AbstractBasePtr InferImplReduceShape(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
// Inputs: x_shape, axis
@ -563,7 +432,7 @@ AbstractBasePtr InferImplTuple2Array(const AnalysisEnginePtr &, const PrimitiveP
py::tuple data_tuple = ValuePtrToPyData(input->BuildValue());
py::array data = py::array(data_tuple);
auto tensor = TensorPy::MakeTensor(data);
auto tensor = tensor::TensorPy::MakeTensor(data);
auto ret = tensor->ToAbstract();
ret->set_value(tensor);
MS_LOG(DEBUG) << "Tuple2arry result AbstractTensor: " << ret->ToString();
@ -596,76 +465,6 @@ AbstractBasePtr InferImplShapeMul(const AnalysisEnginePtr &, const PrimitivePtr
return std::make_shared<AbstractScalar>(result_v, result_v->type());
}
template <typename T>
AbstractBasePtr InferImplTupleOrListEqual(const std::string &op_name, const AbstractBasePtrList &args_spec_list) {
// Inputs: two tuples or two lists.
CheckArgsSize(op_name, args_spec_list, 2);
auto input_x = CheckArg<T>(op_name, args_spec_list, 0);
auto input_y = CheckArg<T>(op_name, args_spec_list, 1);
ValuePtr x_value = input_x->BuildValue();
ValuePtr y_value = input_y->BuildValue();
return std::make_shared<AbstractScalar>(*x_value == *y_value);
}
AbstractBasePtr InferImplTupleEqual(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
return InferImplTupleOrListEqual<AbstractTuple>(primitive->name(), args_spec_list);
}
AbstractBasePtr InferImplListEqual(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
return InferImplTupleOrListEqual<AbstractList>(primitive->name(), args_spec_list);
}
struct SlideInfo {
int start;
int step;
int stop;
};
void CalcSlidePara(const AbstractBasePtrList &args_spec_list, SlideInfo *slide) {
int arg1 = 0;
int arg2 = 0;
if (!args_spec_list.empty()) {
MS_EXCEPTION_IF_NULL(args_spec_list[0]);
auto arg_value = args_spec_list[0]->BuildValue();
if (!arg_value->isa<Int32Imm>()) {
MS_LOG(EXCEPTION) << "Only supported input an int32 number.";
}
arg1 = GetValue<int>(arg_value);
}
if (args_spec_list.size() >= 2) {
MS_EXCEPTION_IF_NULL(args_spec_list[1]);
auto arg_value = args_spec_list[1]->BuildValue();
if (!arg_value->isa<Int32Imm>()) {
MS_LOG(EXCEPTION) << "Only supported input an int32 number.";
}
arg2 = GetValue<int>(arg_value);
}
if (args_spec_list.size() == 3) {
MS_EXCEPTION_IF_NULL(args_spec_list[2]);
auto arg_value = args_spec_list[2]->BuildValue();
if (!arg_value->isa<Int32Imm>()) {
MS_LOG(EXCEPTION) << "Only supported input an int32 number.";
}
slide->step = GetValue<int>(arg_value);
slide->start = arg1;
slide->stop = arg2;
}
if (args_spec_list.size() == 2) {
slide->start = arg1;
slide->stop = arg2;
}
if (args_spec_list.size() == 1) {
slide->stop = arg1;
}
}
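CalcSlidePara mirrors Python's range() argument convention. A plain-int sketch of the same mapping (editor's illustration; the defaulted start and step values are an assumption about how callers initialize SlideInfo):
#include <vector>

struct Slide {
  int start = 0;
  int step = 1;
  int stop = 0;
};

Slide CalcSlide(const std::vector<int> &args) {
  Slide s;
  if (args.size() == 1) {         // range(stop)
    s.stop = args[0];
  } else if (args.size() == 2) {  // range(start, stop)
    s.start = args[0];
    s.stop = args[1];
  } else if (args.size() == 3) {  // range(start, stop, step)
    s.start = args[0];
    s.stop = args[1];
    s.step = args[2];
  }
  return s;
}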
AbstractBasePtr InferImplMakeRange(const AnalysisEnginePtr &, const PrimitivePtr &,
const AbstractBasePtrList &args_spec_list) {
if (args_spec_list.empty()) {
@ -709,5 +508,145 @@ AbstractBasePtr InferImplStopGradient(const AnalysisEnginePtr &, const Primitive
CheckArgsSize(primitive->name(), args_spec_list, 1);
return args_spec_list[0]->Clone();
}
AbstractBasePtr InferImplTupleEqual(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
return InferImplTupleOrListEqual<AbstractTuple>(primitive->name(), args_spec_list);
}
AbstractBasePtr InferImplListEqual(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
return InferImplTupleOrListEqual<AbstractList>(primitive->name(), args_spec_list);
}
AbstractBasePtr InferImplStringEqual(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
// Inputs: two scalars whose value is a string.
const std::string op_name = primitive->name();
CheckArgsSize(op_name, args_spec_list, 2);
AbstractScalarPtr scalar_x = CheckArg<AbstractScalar>(op_name, args_spec_list, 0);
AbstractScalarPtr scalar_y = CheckArg<AbstractScalar>(op_name, args_spec_list, 1);
ValuePtr value_x = scalar_x->BuildValue();
ValuePtr value_y = scalar_y->BuildValue();
if (!value_x->isa<StringImm>() || !value_y->isa<StringImm>()) {
MS_LOG(EXCEPTION) << op_name << " requires 2 parameters are string, but got param0: " << value_x->ToString()
<< ", param1: " << value_y->ToString();
}
bool ret = (value_x->cast<StringImmPtr>()->value() == value_y->cast<StringImmPtr>()->value());
return std::make_shared<AbstractScalar>(ret);
}
AbstractBasePtr InferImplStringConcat(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
// Inputs: two scalars whose value is a string.
const std::string op_name = primitive->name();
CheckArgsSize(op_name, args_spec_list, 2);
AbstractScalarPtr scalar_x = CheckArg<AbstractScalar>(op_name, args_spec_list, 0);
AbstractScalarPtr scalar_y = CheckArg<AbstractScalar>(op_name, args_spec_list, 1);
ValuePtr value_x = scalar_x->BuildValue();
ValuePtr value_y = scalar_y->BuildValue();
if (!value_x->isa<StringImm>() || !value_y->isa<StringImm>()) {
MS_LOG(EXCEPTION) << op_name << " requires 2 parameters are string, but got param0: " << value_x->ToString()
<< ", param1: " << value_y->ToString();
}
std::string ret = (value_x->cast<StringImmPtr>()->value() + value_y->cast<StringImmPtr>()->value());
return std::make_shared<AbstractScalar>(ret);
}
AbstractBasePtr InferImplDictLen(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
return InferTupleOrListOrDictLen<AbstractDictionary>(primitive->name(), args_spec_list);
}
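// J is the autodiff transform marker: for a function input it wraps every function
// atom in a JTransformedAbstractClosure; for any other abstract value it returns an
// AbstractJTagged wrapper.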
AbstractBasePtr InferImplJ(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
// args: An object of AbstractFunction.
CheckArgsSize(primitive->name(), args_spec_list, 1);
MS_LOG(DEBUG) << "evaluate J: " << args_spec_list[0]->ToString();
AbstractFunctionPtr x = dyn_cast<AbstractFunction>(args_spec_list[0]);
if (x == nullptr) {
return std::make_shared<AbstractJTagged>(args_spec_list[0]);
}
AbstractFuncAtomPtrList jv;
auto build_jv = [&jv](const AbstractFuncAtomPtr &func) {
auto j_closure = std::make_shared<JTransformedAbstractClosure>(func);
jv.push_back(j_closure);
};
x->Visit(build_jv);
return AbstractFunction::MakeAbstractFunction(jv);
}
AbstractBasePtr InferImplFakeBprop(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
// Inputs: a tensor.
CheckArgsSize(primitive->name(), args_spec_list, 1);
return args_spec_list[0]->Broaden();
}
// Eval the return type of make_record
AbstractBasePtr InferImplMakeRecord(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
// Inputs: at least two objects of a subclass of AbstractBase.
if (args_spec_list.size() < 2) {
MS_LOG(EXCEPTION) << "MakeRecord evaluator requires more than 1 parameter, while the input size is "
<< args_spec_list.size() << ".";
}
// args_spec_list[0] may be an AbstractScalarPtr or an AbstractTypePtr
MS_EXCEPTION_IF_NULL(args_spec_list[0]);
TypePtr type = args_spec_list[0]->GetTypeTrack();
MS_EXCEPTION_IF_NULL(type);
if (type->type_id() != kMetaTypeTypeType) {
MS_LOG(EXCEPTION) << "Can not make type(" << type->ToString() << ")not TypeType";
}
ValuePtr value_track = args_spec_list[0]->GetValueTrack();
MS_EXCEPTION_IF_NULL(value_track);
TypePtr type_ptr = value_track->cast<TypePtr>();
if (type_ptr == nullptr) {
MS_LOG(EXCEPTION) << "Value type error, not Me type:" << value_track->ToString();
}
auto cls = dyn_cast<Class>(type_ptr);
MS_EXCEPTION_IF_NULL(cls);
ClassAttrVector attributes = cls->GetAttributes();
CheckArgsSize(primitive->name(), args_spec_list, attributes.size() + 1);
std::vector<AbstractAttribute> abs_attributes;
for (size_t i = 0; i < attributes.size(); i++) {
AbstractAttribute elem(attributes[i].first, args_spec_list[i + 1]);
abs_attributes.push_back(elem);
}
return std::make_shared<AbstractClass>(cls->tag(), abs_attributes, cls->methods());
}
REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(TypeOf, prim::kPrimTypeOf, InferImplTypeof);
REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(HasType, prim::kPrimHasType, InferImplHasType);
REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(MakeRecord, prim::kPrimMakeRecord, InferImplMakeRecord);
REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(ListMap, prim::kPrimListMap, InferImplListMap);
REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(ListReduce, prim::kPrimListReduce, InferImplListReduce);
REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(TupleReversed, prim::kPrimTupleReversed, InferImplTupleReversed);
REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(ReducedShape, prim::kPrimReducedShape, InferImplReduceShape);
REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(TupleDiv, prim::kPrimTupleDiv, InferImplTupleDiv);
REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(TupleToArray, prim::kPrimTupleToArray, InferImplTuple2Array);
REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(ShapeMul, prim::kPrimShapeMul, InferImplShapeMul);
REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(TupleEqual, prim::kPrimTupleEqual, InferImplTupleEqual);
REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(ListEqual, prim::kPrimListEqual, InferImplListEqual);
REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(MakeRange, prim::kPrimMakeRange, InferImplMakeRange);
REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(StopGradient, prim::kPrimStopGradient, InferImplStopGradient);
REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(StringEqual, prim::kPrimStringEqual, InferImplStringEqual);
REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(StringConcat, prim::kPrimStringConcat, InferImplStringConcat);
REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(DictLen, prim::kPrimDictLen, InferImplDictLen);
REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(FakeBprop, prim::kPrimFakeBprop, InferImplFakeBprop);
REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(J, prim::kPrimJ, InferImplJ);
REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(BroadcastGradientArgs, prim::kPrimBroadcastGradientArgs,
InferImplBroadcastGradientArgs);
} // namespace abstract
} // namespace mindspore

View File

@ -0,0 +1,77 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_FRONTEND_OPERATE_OPS_FRONT_INFER_FUNCTION_H_
#define MINDSPORE_CCSRC_FRONTEND_OPERATE_OPS_FRONT_INFER_FUNCTION_H_
#include "abstract/abstract_value.h"
#include "abstract/primitive_infer_map.h"
namespace mindspore {
namespace abstract {
AbstractBasePtr InferImplTypeof(const AnalysisEnginePtr &, const PrimitivePtr &,
const AbstractBasePtrList &args_spec_list);
AbstractBasePtr InferImplHasType(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list);
AbstractBasePtr InferImplBroadcastGradientArgs(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list);
AbstractBasePtr InferImplListMap(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list);
AbstractBasePtr InferImplListReduce(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list);
AbstractBasePtr InferImplTupleReversed(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list);
AbstractBasePtr InferImplReduceShape(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list);
AbstractBasePtr InferImplTupleDiv(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list);
AbstractBasePtr InferImplTuple2Array(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list);
AbstractBasePtr InferImplShapeMul(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list);
AbstractBasePtr InferImplTupleEqual(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list);
AbstractBasePtr InferImplListEqual(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list);
AbstractBasePtr InferImplMakeRange(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list);
AbstractBasePtr InferImplStopGradient(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list);
AbstractBasePtr InferImplStringEqual(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list);
AbstractBasePtr InferImplStringConcat(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list);
AbstractBasePtr InferImplDictLen(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list);
AbstractBasePtr InferImplJ(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list);
AbstractBasePtr InferImplFakeBprop(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list);
AbstractBasePtr InferImplMakeRecord(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list);
class RegisterFrontendPrimitiveEvalHelper {
public:
RegisterFrontendPrimitiveEvalHelper(const PrimitivePtr &primitive, const StandardPrimitiveEvalImpl &impl) {
const StandardPrimitiveImplReg impl_reg{impl, false};
RegisterStandardPrimitiveImpl(primitive, impl_reg);
}
~RegisterFrontendPrimitiveEvalHelper() = default;
};
#define REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(name, primitive, impl) \
static auto helper_##name = RegisterFrontendPrimitiveEvalHelper(primitive, impl)
} // namespace abstract
} // namespace mindspore
#endif // MINDSPORE_CCSRC_FRONTEND_OPERATE_OPS_FRONT_INFER_FUNCTION_H_
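The helper class ties registration to static initialization: each REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL use constructs a file-scope object whose constructor registers the infer function before main() runs. A hedged usage sketch (kPrimFoo and InferImplFoo are illustrative names, not real primitives):
AbstractBasePtr InferImplFoo(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
                             const AbstractBasePtrList &args_spec_list) {
  CheckArgsSize(primitive->name(), args_spec_list, 1);
  return args_spec_list[0]->Clone();  // identity-style inference: the output mirrors the input
}
// Expands to: static auto helper_Foo = RegisterFrontendPrimitiveEvalHelper(prim::kPrimFoo, InferImplFoo);
REGISTER_FRONTENT_PRIMITIVE_EVAL_IMPL(Foo, prim::kPrimFoo, InferImplFoo);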

View File

@ -95,10 +95,10 @@ OptimizeIRPassLib::OptimizeIRPassLib() {
// Ref eliminate
make_ref_eliminate_ =
MakeSubstitution(std::make_shared<MakeRefEliminater>(), "make_ref_eliminate", prim::kPrimMakeRef);
get_ref_param_eliminate_ = MakeSubstitution(std::make_shared<GetRefParamEliminater>(), "get_ref_param_eliminate",
{prim::kPrimGetRefValue, prim::kPrimGetRefOrigin});
get_ref_param_eliminate_ =
MakeSubstitution(std::make_shared<GetRefParamEliminater>(), "get_ref_param_eliminate", {prim::kPrimGetRefValue});
get_make_ref_eliminate_ = MakeSubstitution(std::make_shared<GetMakeRefEliminater>(), "get_make_ref_eliminate",
{prim::kPrimGetRefKey, prim::kPrimGetRefValue, prim::kPrimGetRefOrigin});
{prim::kPrimGetRefKey, prim::kPrimGetRefValue});
replace_refkey_by_param_ = MakeSubstitution(std::make_shared<ReplaceRefkeyByParam>(), "replace_refkey_by_param",
IsValueNode<RefKey>, opt::FORCE_RENORM);

View File

@ -20,9 +20,6 @@ namespace mindspore {
namespace opt {
namespace irpass {
AnfNodePtr ArithmeticSimplify::operator()(const OptimizerPtr &, const AnfNodePtr &node) {
if (MsContext::GetInstance()->execution_mode() == kPynativeMode) {
return nullptr;
}
PatternNode x, y, z, xs;
PConstant one_(node, false, 1);
PConstant one_scalar_(node, false, 1, true);
@ -32,16 +29,21 @@ AnfNodePtr ArithmeticSimplify::operator()(const OptimizerPtr &, const AnfNodePtr
PConstant const_2(node);
PConstant any_const(node);
MATCH_REPLACE(node, x + zero_, x); // Add by zero
MATCH_REPLACE(node, x + zero_scalar_, x); // Add by zero
MATCH_REPLACE(node, PBinOperation(prim::kPrimScalarAdd, x, zero_scalar_, true), x); // Scalar Add by zero
MATCH_REPLACE_IF(node, x * one_, any_const.WithValueOf(x), !one_.CheckFunc(IsParam, node)); // Multiply by one
MATCH_REPLACE(node, PBinOperation(prim::kPrimScalarMul, x, one_scalar_, true), x); // Scalar Mul by one
if (MsContext::GetInstance()->execution_mode() != kPynativeMode) {
MATCH_REPLACE(node, x + zero_, x); // Add by zero
MATCH_REPLACE(node, x + zero_scalar_, x); // Add by zero
MATCH_REPLACE(node, PBinOperation(prim::kPrimScalarAdd, x, zero_scalar_, true), x); // Scalar Add by zero
MATCH_REPLACE_IF(node, x * one_, any_const.WithValueOf(x), !one_.CheckFunc(IsParam, node)); // Multiply by one
MATCH_REPLACE(node, PBinOperation(prim::kPrimScalarMul, x, one_scalar_, true), x); // Scalar Mul by one
// Scalar Mul by zero
MATCH_REPLACE(node, PBinOperation(prim::kPrimScalarMul, x, zero_scalar_, true), zero_scalar_.NewValue());
// Scalar Mul by zero
MATCH_REPLACE(node, PBinOperation(prim::kPrimScalarMul, x, zero_scalar_, true), zero_scalar_.NewValue());
}
// Prim Eliminate (identity)
MATCH_REPLACE(node, PPrimitive(prim::kPrimIdentity, x), x);
if (MsContext::GetInstance()->execution_mode() == kPynativeMode) {
return nullptr;
}
// ConstantDuplicateMul
auto const_dup_lambda = [&node, &x, &const_, &const_2]() -> AnfNodePtr {
@ -95,37 +97,37 @@ AnfNodePtr ArithmeticSimplify2::operator()(const OptimizerPtr &, const AnfNodePt
// {prim::kPrimAddN, {prim::kPrimMakeTuple, {prim::kPrimMul, {prim::kPrimAllReduce, X}, Y}, Z}} ->
// {prim::kPrimMul, {prim::kPrimAllReduce, {prim::kPrimAddN,{prim::kPrimMakeTuple, Z, X}}}, Y}
AnfNodePtr AdjustAllReduceMulAdd::operator()(const OptimizerPtr &, const AnfNodePtr &node) {
Reset();
// {prim::kPrimAddN, Zs}
if (!IsPrimitiveCNode(node, prim::kPrimAddN)) {
return nullptr;
}
auto addn = node->cast<CNodePtr>();
if (addn->size() != 2) {
return nullptr;
}
AnfVisitor::Match(prim::kPrimMakeTuple, {IsNode, IsNode})(addn->input(1));
if (x_ == nullptr || y_ == nullptr || z_ == nullptr || all_reduce_fg_ == nullptr) {
return nullptr;
}
auto addn_maketuple = addn->input(1);
PatternNode x, y, z;
auto all_reduce_pat = PPrimitive(prim::kPrimAllReduce, x);
auto mul_pat = PBinOperation(prim::kPrimMul, all_reduce_pat, y, true);
auto admktup_pat = PBinOperation(prim::kPrimMakeTuple, mul_pat, z, true);
auto addn_pat = PPrimitive(prim::kPrimAddN, admktup_pat);
auto adjust_lambda = [&node, &x, &y, &z, &addn_pat, &all_reduce_pat, &admktup_pat, &mul_pat, this]() -> AnfNodePtr {
auto fg = all_reduce_pat.GetFuncGraph();
auto z_ = z.GetNode(node);
// If addn inputs cross the graph, make the inputs same as allreduce node.
if (z_->isa<CNode>() && fg != z_->func_graph()) {
auto cnode_z = z_->cast<CNodePtr>();
z_ = NewCNode(cnode_z->inputs(), fg);
}
auto fg = all_reduce_fg_;
// addn inputs cross the graph, make the inputs same as allreduce node.
if (z_->isa<CNode>() && fg != z_->func_graph()) {
auto cnode_z = z_->cast<CNodePtr>();
z_ = NewCNode(cnode_z->inputs(), fg);
}
auto addn_cnode = addn_pat.GetOriginalNode()->cast<CNodePtr>();
auto addn_op_node = addn_cnode->input(0);
auto make_tuple_op_node = addn_cnode->input(1)->cast<CNodePtr>()->input(0);
auto all_reduce_prim = all_reduce_pat.GetOriginalNode()->cast<CNodePtr>()->input(0);
mul_cnode_ = mul_pat.GetOriginalNode();
auto mul_prim = mul_cnode_->cast<CNodePtr>()->input(0);
auto addn_maketuple = admktup_pat.GetOriginalNode();
auto addn_op_node = addn->input(0);
auto make_tuple_op_node = addn->input(1)->cast<CNodePtr>()->input(0);
AnfNodePtr tuple = NewCNode({make_tuple_op_node, z_, x_}, fg);
AnfNodePtr add = NewCNode({addn_op_node, tuple}, fg);
AnfNodePtr all_reduce = NewCNode({all_reduce_, add}, fg);
AnfNodePtr mul = NewCNode({mul_, all_reduce, y_}, fg);
ProcessDependEdge(fg, addn_maketuple, all_reduce);
return mul;
AnfNodePtr tuple = NewCNode({make_tuple_op_node, z_, x.GetNode(node)}, fg);
AnfNodePtr add = NewCNode({addn_op_node, tuple}, fg);
AnfNodePtr all_reduce = NewCNode({all_reduce_prim, add}, fg);
AnfNodePtr mul = NewCNode({mul_prim, all_reduce, y.GetNode(node)}, fg);
ProcessDependEdge(fg, addn_maketuple, all_reduce);
return mul;
};
MATCH_REPLACE_LAMBDA(node, addn_pat, adjust_lambda);
return nullptr;
}
void AdjustAllReduceMulAdd::ProcessDependEdge(const FuncGraphPtr &fg, const AnfNodePtr &addn_maketuple,
@ -146,48 +148,6 @@ void AdjustAllReduceMulAdd::ProcessDependEdge(const FuncGraphPtr &fg, const AnfN
}
}
void AdjustAllReduceMulAdd::Visit(const AnfNodePtr &node) {
if (level_ == 0) {
level_ = 1;
is_reduce_match_ = false;
// {prim::kPrimMul, {prim::kPrimAllReduce, X}, Y}
AnfVisitor::Match(prim::kPrimMul)(node);
level_ = 0;
if (is_reduce_match_) {
mul_ = node->cast<CNodePtr>()->input(0);
mul_cnode_ = node->cast<CNodePtr>();
y_ = tmp_;
} else {
z_ = node;
}
}
if (level_ == 1) {
// {prim::kPrimAllReduce, X}
if (IsPrimitiveCNode(node, prim::kPrimAllReduce)) {
auto cnode = node->cast<CNodePtr>();
if (cnode->size() > 1) {
all_reduce_ = cnode->input(0);
x_ = cnode->input(1);
is_reduce_match_ = true;
all_reduce_fg_ = cnode->func_graph();
}
} else {
tmp_ = node;
}
}
}
void AdjustAllReduceMulAdd::Reset() {
level_ = 0;
is_reduce_match_ = false;
x_ = nullptr;
y_ = nullptr;
z_ = nullptr;
tmp_ = nullptr;
all_reduce_fg_ = nullptr;
}
} // namespace irpass
} // namespace opt
} // namespace mindspore

View File

@ -38,20 +38,14 @@ namespace irpass {
// {prim::kPrimAddN, {prim::kPrimMakeTuple, {prim::kPrimMul, {prim::kPrimAllReduce, X}, Y}, Z}} ->
// {prim::kPrimMul, {prim::kPrimAllReduce, {prim::kPrimAddN,{prim::kPrimMakeTuple, Z, X}}}, Y}
class AdjustAllReduceMulAdd : public AnfVisitor {
class AdjustAllReduceMulAdd : public OptimizerCaller {
public:
AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override;
void ProcessDependEdge(const FuncGraphPtr &fg, const AnfNodePtr &addn_maketuple, const AnfNodePtr &new_node);
void Visit(const AnfNodePtr &node) override;
void Reset();
private:
int level_{0};
bool is_reduce_match_{false};
AnfNodePtr x_{nullptr}, y_{nullptr}, z_{nullptr}, tmp_{nullptr};
AnfNodePtr all_reduce_{nullptr}, mul_{nullptr}, mul_cnode_{nullptr};
FuncGraphPtr all_reduce_fg_{nullptr};
AnfNodePtr mul_cnode_{nullptr};
};
class ArithmeticSimplify : public OptimizerCaller {

View File

@ -37,27 +37,23 @@ class MakeRefEliminater : public OptimizerCaller {
};
// {prim::kPrimGetRefValue, Parameter} -> Parameter
// {prim::kPrimGetRefOrigin, Parameter} -> Parameter
class GetRefParamEliminater : public OptimizerCaller {
public:
AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override {
PatternNode<AnfNodePtr> x;
MATCH_REPLACE(node, PPrimitive(prim::kPrimGetRefValue, x), x);
MATCH_REPLACE(node, PPrimitive(prim::kPrimGetRefOrigin, x), x);
return nullptr;
}
};
// {prim::kPrimGetRefKey, {prim::kPrimMakeRef, X, Y, Z}} -> X
// {prim::kPrimGetRefValue, {prim::kPrimMakeRef, X, Y, Z}} -> Y
// {prim::kPrimGetRefOrigin, {prim::kPrimMakeRef, X, Y, Z}} -> Z
class GetMakeRefEliminater : public OptimizerCaller {
public:
AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override {
PatternNode<AnfNodePtr> x, y, z;
MATCH_REPLACE(node, PPrimitive(prim::kPrimGetRefKey, PPrimitive(prim::kPrimMakeRef, x, y, z)), x);
MATCH_REPLACE(node, PPrimitive(prim::kPrimGetRefValue, PPrimitive(prim::kPrimMakeRef, x, y, z)), y);
MATCH_REPLACE(node, PPrimitive(prim::kPrimGetRefOrigin, PPrimitive(prim::kPrimMakeRef, x, y, z)), z);
return nullptr;
}

View File

@ -197,6 +197,9 @@ class CostGraph {
inputs_tensor_name_list_.push_back(inputs_tensor_name);
}
const std::vector<std::vector<std::string>> get_inputs_tensor_name_list() const { return inputs_tensor_name_list_; }
void set_inputs_tensor_name_list(const std::vector<std::vector<std::string>> &inputs_tensor_name_list) {
inputs_tensor_name_list_ = inputs_tensor_name_list;
}
void add_tuple_getitem(const std::pair<std::string, std::string> &tuple_getitem) {
auto ret = tuple_getitem_list_.insert(tuple_getitem);
if (ret.second == false) {

View File

@ -199,6 +199,8 @@ class SoftmaxCost : public OperatorCost {
using SoftmaxCostPtr = std::shared_ptr<SoftmaxCost>;
using TileCost = SoftmaxCost;
using TileCostPtr = std::shared_ptr<TileCost>;
using ConcatCost = TileCost;
using ConcatCostPtr = std::shared_ptr<ConcatCost>;
class TmpIdentityCost : public OperatorCost {
public:

View File

@ -136,6 +136,7 @@ REGISTER(EmbeddingLookupInfo);
REGISTER(TileInfo);
REGISTER(StridedSliceInfo);
REGISTER(DropoutInfo);
REGISTER(ConcatInfo);
} // namespace parallel
} // namespace mindspore

View File

@ -24,7 +24,6 @@
namespace mindspore {
namespace parallel {
const std::set<std::string> BLACK_LIST = {TUPLE_GETITEM,
MAKE_TUPLE,
J,
LIST_GETITEM,
ARRAY_GETITEM,

View File

@ -0,0 +1,268 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "frontend/parallel/ops_info/concat_info.h"
#include <algorithm>
#include <memory>
#include <utility>
#include <vector>
#include "frontend/parallel/device_matrix.h"
#include "frontend/parallel/strategy.h"
#include "frontend/parallel/tensor_layout/tensor_redistribution.h"
#include "pipeline/jit/resource.h"
namespace mindspore {
namespace parallel {
Status ConcatInfo::GetAttrs() {
int axis = 0;
auto axis_iter = attrs_.find(AXIS);
if (axis_iter != attrs_.end()) {
MS_EXCEPTION_IF_NULL(axis_iter->second);
if (axis_iter->second->isa<Int32Imm>()) {
axis = axis_iter->second->cast<Int32ImmPtr>()->value();
} else {
MS_LOG(ERROR) << name_ << ": The value of axis is not int";
return FAILED;
}
} else {
MS_LOG(ERROR) << name_ << ": Can not find the axis attr";
return FAILED;
}
if (inputs_shape_.empty()) {
MS_LOG(ERROR) << name_ << ": The inputs shape is empty";
return FAILED;
}
int dim = SizeToInt(inputs_shape_[0].size());
if (axis < 0) {
axis = axis + dim;
}
axis_ = SizeToInt(axis);
return SUCCESS;
}
Status ConcatInfo::CheckStrategy(const StrategyPtr &strategy) {
MS_EXCEPTION_IF_NULL(strategy);
if (CheckStrategyValue(strategy, inputs_shape_, is_auto_parallel_) != SUCCESS) {
MS_LOG(ERROR) << name_ << ": Invalid strategy";
return FAILED;
}
std::vector<Dimensions> stra = strategy->GetInputDim();
if (stra.empty()) {
MS_LOG(ERROR) << name_ << ": The strategy is empty";
return FAILED;
}
if (stra.size() != inputs_shape_.size()) {
MS_LOG(ERROR) << name_ << ": The size of strategy must be equal to the size of inputs shape";
return FAILED;
}
for (size_t i = 0; i < stra.size(); ++i) {
auto strategy_ele = stra[i];
auto input_shape_ele = inputs_shape_[i];
if (strategy_ele.size() != input_shape_ele.size()) {
MS_LOG(ERROR) << name_ << ": The size of strategy element must be equal to the size of input shape";
return FAILED;
}
if (axis_ >= strategy_ele.size()) {
MS_LOG(ERROR) << name_ << ": The axis is out of range, the axis is " << axis_;
return FAILED;
}
if (strategy_ele[axis_] != 1) {
MS_LOG(ERROR) << name_ << ": The axis can not be split";
return FAILED;
}
for (size_t j = 0; j < strategy_ele.size(); ++j) {
if (strategy_ele[j] != stra[0][j]) {
MS_LOG(ERROR) << name_ << ": The strategy of each input tensor must be equal";
return FAILED;
}
}
}
return SUCCESS;
}
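To make the checks above concrete: all inputs must share one identical strategy, and the concat axis must not be split. A standalone sketch of just this validation (editor's illustration with simplified types; a strategy here is one vector of per-dimension split counts):
#include <cstddef>
#include <vector>

bool CheckConcatStrategy(const std::vector<std::vector<int>> &stra, std::size_t axis) {
  if (stra.empty()) return false;
  for (const auto &s : stra) {
    if (axis >= s.size() || s[axis] != 1) return false;  // the concat axis must stay whole
    if (s != stra[0]) return false;                      // all inputs share the same strategy
  }
  return true;
}
// For two (8, 16) inputs concatenated on axis 1:
//   CheckConcatStrategy({{4, 1}, {4, 1}}, 1) -> true  (rows split across 4 devices)
//   CheckConcatStrategy({{2, 2}, {2, 2}}, 1) -> false (axis 1 is split)
//   CheckConcatStrategy({{4, 1}, {2, 1}}, 1) -> false (strategies differ)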
Status ConcatInfo::InferDevMatrixShape() {
MS_EXCEPTION_IF_NULL(strategy_);
std::vector<Dimensions> stra = strategy_->GetInputDim();
if (stra.empty()) {
MS_LOG(ERROR) << name_ << "The strategy is empty";
return FAILED;
}
dev_matrix_shape_ = stra[0];
return SUCCESS;
}
Status ConcatInfo::InferTensorMap() {
TensorMap tensor_map;
if (inputs_shape_.empty()) {
MS_LOG(ERROR) << name_ << "The inputs shape is empty";
return FAILED;
}
// Cannot use dev_matrix_shape_ to replace inputs_shape_[0], because the shape may not be fully split across all devices.
int32_t size = SizeToInt(inputs_shape_[0].size());
for (int i = 0; i < size; ++i) {
tensor_map.push_back(size - i - 1);
}
for (size_t i = 0; i < inputs_shape_.size(); ++i) {
inputs_tensor_map_.push_back(tensor_map);
}
outputs_tensor_map_.push_back(tensor_map);
return SUCCESS;
}
Status ConcatInfo::InferMirrorOps() {
mirror_ops_.clear();
if (inputs_tensor_map_.empty()) {
MS_LOG(ERROR) << name_ << ": The inputs tensor map is empty";
return FAILED;
}
Shape input_tensor_map = inputs_tensor_map_[0];
std::vector<Group> group;
if (CreateGroupByTensorMap(input_tensor_map, &group) != SUCCESS) {
MS_LOG(ERROR) << name_ << ": Create group for input failed.";
return FAILED;
}
if (group.empty()) {
MS_LOG(INFO) << name_ << ": The mirror group is empty.";
return SUCCESS;
}
OperatorVector input_op;
input_op = CreateMirrorOps(group[0].name(), group[0].GetDevNum());
for (size_t i = 0; i < inputs_shape_.size(); ++i) {
mirror_ops_.push_back(input_op);
}
return SUCCESS;
}
Status ConcatInfo::InferTensorInfo() {
if (inputs_shape_.empty() || outputs_shape_.empty() || inputs_tensor_map_.empty() || outputs_tensor_map_.empty()) {
MS_LOG(ERROR) << name_ << ": Invalid args";
return FAILED;
}
TensorLayout input_layout, output_layout;
for (size_t i = 0; i < inputs_shape_.size(); ++i) {
// infer tensor layout
if (input_layout.InitFromVector(dev_matrix_shape_, inputs_tensor_map_[i], inputs_shape_[i]) != SUCCESS) {
MS_LOG(ERROR) << name_ << ": Infer input tensor layout failed.";
return FAILED;
}
TensorInfo input_tensor_info(input_layout);
inputs_tensor_info_.push_back(input_tensor_info);
}
if (output_layout.InitFromVector(dev_matrix_shape_, outputs_tensor_map_[0], outputs_shape_[0]) != SUCCESS) {
MS_LOG(ERROR) << name_ << ": Infer output tensor layout failed.";
return FAILED;
}
TensorInfo output_tensor_info(output_layout);
outputs_tensor_info_.push_back(output_tensor_info);
return SUCCESS;
}
void ConcatInfo::ReComputeBatchSplitFlagList() {
for (size_t i = 0; i < inputs_shape_.size(); i++) {
split_flag_list_[i] = true;
}
}
Status ConcatInfo::SetCostUnderStrategy(const StrategyPtr &strategy) {
if (SetCostUnderStrategyBase(strategy) != SUCCESS) {
MS_LOG(ERROR) << name_ << ": Set cost under strategy failed.";
return FAILED;
}
return SUCCESS;
}
Status ConcatInfo::GenerateStrategies(int32_t stage_id) {
if (InferAttrs() != SUCCESS) {
MS_LOG(ERROR) << name_ << ": Infer attrs failed";
return FAILED;
}
if (inputs_shape_.empty()) {
MS_LOG(ERROR) << name_ << ": The inputs shape is empty";
return FAILED;
}
Shape input_split;
for (size_t i = 0; i < inputs_shape_[0].size(); ++i) {
if (i == axis_) {
input_split.push_back(0);
} else {
input_split.push_back(1);
}
}
Shapes splittable_inputs;
for (size_t i = 0; i < inputs_shape_.size(); ++i) {
splittable_inputs.push_back(input_split);
}
std::vector<StrategyPtr> sp_vector;
is_auto_parallel_ = true;
if (GenerateStrategiesWithBroadcast(stage_id, inputs_shape_, splittable_inputs, &sp_vector) != SUCCESS) {
return FAILED;
}
size_t success = 0;
for (auto &sp : sp_vector) {
PrintStrategy(sp);
if (SetCostUnderStrategy(sp) == SUCCESS) {
success++;
MS_LOG(INFO) << name_ << ": Successfully generated " << success << " strategy.";
PrintStrategy(sp);
}
}
return SUCCESS;
}
Status ConcatInfo::Init(const StrategyPtr &strategy) {
if (InitWithAutoRepeatCalc(strategy) != SUCCESS) {
MS_LOG(ERROR) << name_ << ": Init failed.";
return FAILED;
}
MS_LOG(INFO) << name_ << ": Init success.";
return SUCCESS;
}
Status ConcatInfo::InitForCostModel(const StrategyPtr &strategy) {
if (InitForCostModelWithAutoRepeatCalc(strategy) != SUCCESS) {
MS_LOG(ERROR) << name_ << ": Init for cost model failed.";
return FAILED;
}
MS_LOG(INFO) << name_ << ": Init for cost model success.";
return SUCCESS;
}
} // namespace parallel
} // namespace mindspore

View File

@ -0,0 +1,62 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_CONCAT_INFO_H_
#define MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_CONCAT_INFO_H_
#include <string>
#include <memory>
#include <unordered_map>
#include <vector>
#include "ir/value.h"
#include "frontend/parallel/auto_parallel/operator_costmodel.h"
#include "frontend/parallel/ops_info/operator_info.h"
#include "frontend/parallel/strategy.h"
namespace mindspore {
namespace parallel {
class ConcatInfo : public OperatorInfo {
public:
ConcatInfo(const std::string &operator_name, const Shapes &inputs_shape, const Shapes &outputs_shape,
const PrimitiveAttrs &attrs)
: OperatorInfo(operator_name, inputs_shape, outputs_shape, attrs, std::make_shared<ConcatCost>(false)) {}
~ConcatInfo() override = default;
Status Init(const StrategyPtr &strategy) override;
Status InitForCostModel(const StrategyPtr &strategy) override;
Status GenerateStrategies(int32_t) override;
Status SetCostUnderStrategy(const StrategyPtr &) override;
void ReComputeBatchSplitFlagList() override;
protected:
Status GetAttrs() override;
Status CheckStrategy(const StrategyPtr &strategy) override;
Status InferMirrorOps() override;
Status InferForwardCommunication() override { return SUCCESS; }
Status InferTensorInfo() override;
Status InferDevMatrixShape() override;
Status InferTensorMap() override;
private:
size_t axis_ = 0;
};
using ConcatInfoPtr = std::shared_ptr<ConcatInfo>;
} // namespace parallel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_CONCAT_INFO_H_

View File

@ -39,5 +39,6 @@
#include "frontend/parallel/ops_info/gather_v2_p_info.h"
#include "frontend/parallel/ops_info/tile_info.h"
#include "frontend/parallel/ops_info/strided_slice_info.h"
#include "frontend/parallel/ops_info/concat_info.h"
#endif // MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_HEAD_FILES_H_

View File

@ -56,9 +56,11 @@ constexpr char kMomentum[] = "momentum";
constexpr char kApplyMomentum[] = "ApplyMomentum";
constexpr char kSparseAdam[] = "Adam";
constexpr char kSparseLazyAdam[] = "LazyAdam";
constexpr char kSparseFtrl[] = "Ftrl";
constexpr char kApplyMomentumOp[] = "Momentum";
constexpr char kSparseAdamOp[] = "Adam";
constexpr char kSparseLazyAdamOp[] = "LazyAdam";
constexpr char kSparseFtrlOp[] = "FTRL";
constexpr int kInitWeightsCmd = 10;

View File

@ -126,6 +126,15 @@ MomentumOptimInfo::MomentumOptimInfo(const AddressPtr &weight, const AddressPtr
inputs_.push_back(momentum);
}
void MomentumOptimInfo::Update(const Values &values, const Lengths &lens) {
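// The learning rate arrives as the first float in the update payload; copy it into
// the kernel's third input (per the constructor, inputs_ appears to hold weight,
// accumulation, learning rate, gradient and momentum in that order).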
size_t lr_offset = 0;
float *lr = values.data() + lr_offset;
auto ret = memcpy_s(inputs_[2]->addr, sizeof(float), lr, sizeof(float));
if (ret != 0) {
MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret << ")";
}
}
const AddressPtr &MomentumOptimInfo::gradient() { return inputs_[3]; }
const AddressPtr &MomentumOptimInfo::indices() { return inputs_[3]; }

View File

@ -82,6 +82,7 @@ class MomentumOptimInfo : public DenseOptimInfo {
const AddressPtr &gradient, const AddressPtr &momentum);
~MomentumOptimInfo() override = default;
void Update(const Values &values, const Lengths &lens) override;
const AddressPtr &gradient();
const AddressPtr &indices();
size_t grad_index() override;

View File

@ -42,6 +42,7 @@
#include "backend/kernel_compiler/kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
#include "backend/kernel_compiler/cpu/ps/pserver_kernel.h"
#include "backend/kernel_compiler/cpu/ps/sparse_apply_adam_ps_kernel.h"
#include "backend/kernel_compiler/cpu/ps/sparse_apply_lazy_adam_ps_kernel.h"
#include "backend/kernel_compiler/cpu/ps/sparse_apply_ftrl_ps_kernel.h"
#include "backend/kernel_compiler/cpu/ps/apply_momentum_ps_kernel.h"
@ -374,6 +375,11 @@ void ParameterServer<T>::InitOptimInputsShape(const Keys &keys, const Values &va
const CNodePtr cnode = GetCNode(optim_op_name);
MS_EXCEPTION_IF_NULL(cnode);
if (optim_name == kSparseAdam) {
std::shared_ptr<PServerKernel> optimizer =
std::make_shared<kernel::ps::SparseApplyAdamPSKernel>(rank_id_, pserver_num_);
optimizer->InitKernel(cnode, optim_inputs_shape_[key]);
optimizers_[key] = optimizer;
} else if (optim_name == kSparseLazyAdam) {
std::shared_ptr<PServerKernel> optimizer =
std::make_shared<kernel::ps::SparseApplyLazyAdamPSKernel>(rank_id_, pserver_num_);
optimizer->InitKernel(cnode, optim_inputs_shape_[key]);

View File

@ -25,19 +25,22 @@ namespace ps {
std::unordered_map<std::string, int> Util::optimizer_to_ids{
{kApplyMomentum, 0},
{kSparseAdam, 1},
{kSparseFtrl, 2},
{kSparseLazyAdam, 2},
{kSparseFtrl, 3},
};
std::unordered_map<int, std::string> Util::id_to_optimizers{
{0, kApplyMomentum},
{1, kSparseAdam},
{2, kSparseFtrl},
{2, kSparseLazyAdam},
{3, kSparseFtrl},
};
std::unordered_map<int, std::string> Util::id_to_optimizer_nodes{
{0, kApplyMomentumOp},
{1, kSparseAdamOp},
{2, kSparseFtrlOp},
{2, kSparseLazyAdamOp},
{3, kSparseFtrlOp},
};
bool Util::IsParamServerMode() { return IsRoleOfWorker() || IsRoleOfPServer() || IsRoleOfScheduler(); }

View File

@ -118,6 +118,9 @@ bool StepAutoParallel(const FuncGraphPtr &root, const opt::OptimizerPtr &) {
std::vector<bool> ExtractInputParameterByNode(const CNodePtr &node) {
std::vector<bool> is_parameter;
std::vector<AnfNodePtr> node_inputs{node->inputs()};
if ((node_inputs.size() == 2) && AnfNodeIsPrimitive(node_inputs[1], MAKE_TUPLE)) {
node_inputs = node_inputs[1]->cast<CNodePtr>()->inputs();
}
for (size_t i = 1; i < node_inputs.size(); ++i) {
auto input = node_inputs[i];
@ -192,6 +195,10 @@ std::vector<size_t> ExtractInputTypeLengthByNode(const CNodePtr &node) {
std::vector<size_t> inputs_type_len;
std::vector<AnfNodePtr> node_inputs{node->inputs()};
if ((node_inputs.size() == 2) && AnfNodeIsPrimitive(node_inputs[1], MAKE_TUPLE)) {
node_inputs = node_inputs[1]->cast<CNodePtr>()->inputs();
}
// extract input element length
for (auto &input : node_inputs) {
if (IsValueNode<RefKey>(input)) {
@ -255,7 +262,7 @@ bool IsSplittableOperator(const std::string &op_name) {
FLOORDIV, L2_NORMALIZE, TENSOR_ADD, MAXPOOL, MAXPOOLV2, VIRTUAL_DATA_SET, RELU, ONEHOT, DROPOUT_DO_MASK,
REDUCE_MAX, REDUCE_MIN, ARGMAXWITHVALUE, ARGMINWITHVALUE, REDUCE_SUM, CONV2D, FUSE_BATCH_NORM, POOLING,
MAX_POOL_WITH_ARGMAX, SIMPLE_MEAN, FLATTEN, BATCH_NORM, LAYER_NORM, BIAS_ADD, ASSIGN_SUB, COS, ACOS, EXP,
LOG, REDUCE_MEAN, REAL_DIV, SIGMOID, POW, MAXIMUM, MINIMUM, EQUAL, NOT_EQUAL, LOGICALNOT, GATHERV2, SQRT,
LOG, REDUCE_MEAN, REAL_DIV, SIGMOID, POW, MAXIMUM, MINIMUM, EQUAL, NOT_EQUAL, LOGICALNOT, GATHERV2, SQRT, CONCAT,
STRIDEDSLICE, GET_NEXT, CAST, NEG, SQUARE, BATCH_MATMUL, EXPAND_DIMS, SQUEEZE, SPARSE_GATHERV2, TILE, DROPOUT,
SOFTMAX_CROSS_ENTROPY_WITH_LOGITS, SIGMOID_CROSS_ENTROPY_WITH_LOGITS, SPARSE_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS};
// clang-format on
@ -275,7 +282,7 @@ bool IsAutoParallelCareNode(const CNodePtr &cnode) {
return false;
}
bool bool_result = IsParallelCareNode(cnode) && !IsSplittableOperator(prim->name());
if (bool_result) {
if (bool_result && (prim->name() != MAKE_TUPLE)) {
MS_LOG(EXCEPTION) << "Should implementing OperatorInfo for: " << prim->name();
} else if (prim->name() == CAST) {
if (cnode->fullname_with_scope().find(OPTIMIZER_SUB_STRING) != std::string::npos) {
@ -520,6 +527,10 @@ Status ConstructCostGraphNodesByUniqueIdTC(const std::vector<AnfNodePtr> &all_no
MS_LOG(EXCEPTION) << "The OperatorInfo: " << current_op_ptr->name()
<< " does not match the Prim: " << prim->name();
}
// Needed by rec_parser
ModifyInputsTensorNameListIfOperatorInfoCreated(current_op_ptr->name(), cnode->UniqueId());
cnode->set_user_data<OperatorInfo>(current_op_ptr);
MS_LOG(INFO) << "The CNode with UniqueId: " << cnode->UniqueId()
<< " and UniqueIdThroughCopy: " << cnode->UniqueIdThroughCopy()
@ -1117,6 +1128,27 @@ CNodePtr GetInternalOperatorInfo(const CNodePtr &cnode, const ValueNodePtr &prim
return nullptr;
}
void ModifyInputsTensorNameListIfOperatorInfoCreated(const std::string &name, const std::string &uniqueid) {
size_t iter_ops = 0;
for (auto op : entire_costgraph->GetOperators()) {
if (op->name() == name) {
break;
}
iter_ops = iter_ops + 1;
}
std::vector<std::vector<std::string>> input_tensor_names = entire_costgraph->get_inputs_tensor_name_list();
for (size_t i = 0; i < input_tensor_names.size(); i++) {
for (size_t j = 0; j < input_tensor_names[i].size(); j++) {
if (input_tensor_names[i][j] == uniqueid) {
input_tensor_names[i][j] = input_tensor_names[iter_ops][0];
}
}
}
entire_costgraph->set_inputs_tensor_name_list(input_tensor_names);
}
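To make ModifyInputsTensorNameListIfOperatorInfoCreated concrete: it finds the operator whose name matches, then rewrites every occurrence of the CNode's UniqueId in the tensor-name lists to that operator's first tensor name. A self-contained sketch of the substitution with made-up names (not the real cost graph):
#include <iostream>
#include <string>
#include <vector>
int main() {
  // Suppose operator 0 ("MatMul-op0") was just created for UniqueId "42".
  std::vector<std::vector<std::string>> names = {{"MatMul-op0", "x", "w"}, {"ReLU-op1", "42"}};
  const std::string uniqueid = "42";
  const std::string replacement = names[0][0];  // first tensor name of the created operator
  for (auto &row : names) {
    for (auto &cell : row) {
      if (cell == uniqueid) cell = replacement;  // "42" -> "MatMul-op0"
    }
  }
  std::cout << names[1][1] << std::endl;  // prints "MatMul-op0"
  return 0;
}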
Status ParallelStrategyRecSearch(const std::vector<AnfNodePtr> &all_nodes, const FuncGraphPtr &root) {
if (CostModelContext::GetInstance()->is_multi_subgraphs()) {
if (ConstructCostGraphNodesByUniqueIdTC(all_nodes, root) == SUCCESS) {

View File

@ -59,6 +59,8 @@ std::vector<std::vector<std::string>> RecInputTensorNames(const std::map<std::st
std::vector<std::vector<std::string>> input_tensor_names);
CNodePtr GetInternalOperatorInfo(const CNodePtr &cnode, const ValueNodePtr &prim_anf_node);
void ModifyInputsTensorNameListIfOperatorInfoCreated(const std::string &name, const std::string &uniqueid);
} // namespace parallel
} // namespace mindspore
#endif // PARALLEL_STEP_AUTO_PARALLEL_H_

View File

@ -267,6 +267,33 @@ TensorLayout GetTensorInLayout(const CNodePtr &middle_node, const PrimitivePtr &
return tensorinfo_in.tensor_layout();
}
bool AnfNodeIsPrimitive(const AnfNodePtr &anf_node, const std::string &prim_name) {
MS_EXCEPTION_IF_NULL(anf_node);
auto cnode = anf_node->cast<CNodePtr>();
if ((cnode == nullptr) || !IsValueNode<Primitive>(cnode->input(0))) {
return false;
}
auto value_node = cnode->input(0)->cast<ValueNodePtr>();
auto prim = GetValueNode<PrimitivePtr>(value_node);
MS_EXCEPTION_IF_NULL(prim);
if (prim->name() == prim_name) {
return true;
}
return false;
}
std::string GetPrimName(const CNodePtr &node) {
MS_EXCEPTION_IF_NULL(node);
if (!IsValueNode<Primitive>(node->input(0))) {
MS_LOG(EXCEPTION) << "The node is not a primitive";
}
auto value_node = node->input(0)->cast<ValueNodePtr>();
auto prim = GetValueNode<PrimitivePtr>(value_node);
MS_EXCEPTION_IF_NULL(prim);
return prim->name();
}
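A hypothetical call site showing how the two helpers above are meant to be combined; the names mirror the surrounding file, nothing here is part of the patch:
// Hypothetical guard before touching a CNode.
void SkipMakeTuple(const AnfNodePtr &node) {
  if (AnfNodeIsPrimitive(node, MAKE_TUPLE)) {
    return;  // make_tuple carries no OperatorInfo of its own
  }
  auto cnode = node->cast<CNodePtr>();
  if (cnode != nullptr) {
    MS_LOG(INFO) << "Handling primitive: " << GetPrimName(cnode);
  }
}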
OperatorInfoPtr GetDistributeOperator(const CNodePtr &node) {
MS_EXCEPTION_IF_NULL(node);
if (!IsParallelCareNode(node)) {
@ -274,7 +301,7 @@ OperatorInfoPtr GetDistributeOperator(const CNodePtr &node) {
}
OperatorInfoPtr distribute_operator = node->user_data<OperatorInfo>();
if (distribute_operator == nullptr) {
MS_LOG(EXCEPTION) << "GetDistributeOperator:distribute_operator is nullptr";
MS_LOG(EXCEPTION) << "Distribute operator is nullptr, the prim is " << GetPrimName(node);
}
return distribute_operator;
}
@ -423,6 +450,11 @@ void StepRedistribution(const CNodePtr &node, const OperatorInfoPtr &distribute_
MS_EXCEPTION_IF_NULL(manager);
AnfNodeIndexSet node_set = manager->node_users()[node];
CNodePtr insert_node_new;
if (AnfNodeIsPrimitive(node, MAKE_TUPLE)) {
MS_LOG(INFO) << "No need to insert redistribution op betweend make_tuple node and the next node";
return;
}
if (IsValueNode<Primitive>(node->input(0))) {
auto current_value = node->input(0)->cast<ValueNodePtr>();
MS_EXCEPTION_IF_NULL(current_value);
@ -875,9 +907,15 @@ void InsertMirrorOps(const MirrorOps &mirror_ops, const CNodePtr &node) {
MS_EXCEPTION_IF_NULL(func_graph);
FuncGraphManagerPtr manager = func_graph->manager();
MS_EXCEPTION_IF_NULL(manager);
if ((node->inputs().size() == 2) && AnfNodeIsPrimitive(node->input(1), MAKE_TUPLE)) {
MS_LOG(INFO) << "The mirror for " << GetPrimName(node) << " has handle by make_tuple node";
return;
}
if (mirror_ops.size() != node_size - 1) {
MS_LOG(EXCEPTION) << "Failure:Mirrorops's size is wrong! mirror_ops size is " << mirror_ops.size()
<< ", node_size is " << node_size;
MS_LOG(EXCEPTION) << "Mirrorops's size is wrong! mirror_ops size is " << mirror_ops.size() << ", node_size is "
<< node_size - 1;
}
for (size_t index = 1; index < node_size; ++index) {
OperatorVector backward_op = mirror_ops[index - 1];
@ -993,7 +1031,7 @@ OperatorInfoPtr OperatorInstance(const PrimitivePtr &prim, const PrimitiveAttrs
const std::vector<Shapes> &shape_list) {
MS_EXCEPTION_IF_NULL(prim);
OperatorInfoPtr operator_ = OperatorInstanceByName(prim->name(), attrs, shape_list);
if (operator_ == nullptr) {
if ((operator_ == nullptr) && (prim->name() != MAKE_TUPLE)) {
MS_LOG(INFO) << "Creat " << prim->name() << " failed, use batch parallel";
operator_ = OperatorInstanceByName(BATCH_PARALLEL, attrs, shape_list);
MS_EXCEPTION_IF_NULL(operator_);
@ -1177,7 +1215,12 @@ std::vector<Shapes> ExtractShape(const CNodePtr &node) {
continue;
}
if (input_shapes.size() != 1) {
MS_LOG(EXCEPTION) << "ExtractShape:Get input shape failed";
if (inputs_size == 2) { // like concat
shape_inputs = input_shapes;
break;
} else {
MS_LOG(EXCEPTION) << "ExtractShape: Get input shape failed";
}
}
shape_inputs.push_back(input_shapes[0]);
}
@ -1269,8 +1312,8 @@ void SetParallelShape(const AnfNodePtr &parameter, const std::pair<AnfNodePtr, i
}
TensorInfo tensorinfo_in = distribute_operator->inputs_tensor_info()[IntToSize(res.second - 1)];
Shape slice_shape = tensorinfo_in.slice_shape();
MS_LOG(DEBUG) << "SetParallelShape slice_shape " << parameter->ToString() << " shape "
<< MakeValue(slice_shape)->ToString();
MS_LOG(INFO) << "SetParallelShape slice_shape " << parameter->ToString() << " shape "
<< MakeValue(slice_shape)->ToString() << ", op name is " << distribute_operator->name();
std::shared_ptr<abstract::BaseShape> parallel_shape = std::make_shared<abstract::Shape>(slice_shape);
MS_EXCEPTION_IF_NULL(parallel_shape);
// Don't modify it in-place as the pointer of this AbstractValue may used as cache key in StaticAnalysis.
@ -1450,6 +1493,9 @@ void ExtractInformation(const std::vector<AnfNodePtr> &all_nodes) {
SetVirtualDatasetStrategy(cnode);
ValueNodePtr prim_anf_node = cnode->input(0)->cast<ValueNodePtr>();
PrimitivePtr prim = GetValueNode<PrimitivePtr>(prim_anf_node);
if (prim->name() == MAKE_TUPLE) {
continue;
}
auto attrs = prim->attrs();
MS_LOG(INFO) << "extract information: node: " << node->ToString() << " prim " << prim->name();
if (IsParallelCareNode(cnode)) {
@ -2045,13 +2091,13 @@ void ParallelCommunication(const FuncGraphPtr &root, const std::vector<AnfNodePt
MS_EXCEPTION_IF_NULL(node);
if (node->isa<CNode>()) {
auto cnode = node->cast<CNodePtr>();
if (!IsValueNode<Primitive>(cnode->input(0))) {
// the make_tuple is a parallel care node, but it may not have operator info
if (!IsParallelCareNode(cnode) || !cnode->has_user_data<OperatorInfo>()) {
continue;
}
OperatorInfoPtr distribute_operator = GetDistributeOperator(cnode);
if (distribute_operator == nullptr) {
continue;
}
MS_EXCEPTION_IF_NULL(distribute_operator);
// insert forward ops
InsertForwardOps(distribute_operator, cnode);
@ -2074,13 +2120,12 @@ void ParallelCommunication(const FuncGraphPtr &root, const std::vector<AnfNodePt
MS_EXCEPTION_IF_NULL(node);
if (node->isa<CNode>()) {
auto cnode = node->cast<CNodePtr>();
if (!IsValueNode<Primitive>(cnode->input(0))) {
if (!IsParallelCareNode(cnode) || !cnode->has_user_data<OperatorInfo>()) {
continue;
}
OperatorInfoPtr distribute_operator = GetDistributeOperator(cnode);
if (distribute_operator == nullptr) {
continue;
}
MS_EXCEPTION_IF_NULL(distribute_operator);
// StepReplace
StepReplace(distribute_operator, cnode);
}
@ -2330,6 +2375,44 @@ Status ParallelInit() {
return SUCCESS;
}
void HandleForwardMakeTuple(const std::vector<AnfNodePtr> &all_nodes) {
for (auto &node : all_nodes) {
if (!AnfNodeIsPrimitive(node, MAKE_TUPLE)) {
continue;
}
auto cnode = node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(cnode);
if (!cnode->in_forward_flag()) {
continue;
}
FuncGraphManagerPtr manager = cnode->func_graph()->manager();
MS_EXCEPTION_IF_NULL(manager);
auto make_tuple_user = manager->node_users()[cnode];
if (make_tuple_user.size() != 1) {
MS_LOG(EXCEPTION) << "Now the make_tuple's user must be 1, but got " << make_tuple_user.size();
}
CNodePtr make_tuple_next_cnode = make_tuple_user.pop().first->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(make_tuple_next_cnode);
std::string make_tuple_user_prim_name = GetPrimName(make_tuple_next_cnode);
if (!IsParallelCareNode(make_tuple_next_cnode)) {
MS_LOG(INFO) << "The make_tuple's user is " << make_tuple_user_prim_name << ", no need to set operator info";
continue;
}
if (make_tuple_next_cnode->inputs().size() != 2) {
MS_LOG(EXCEPTION) << "Now the make_tuple's user only support 1 input, but got "
<< make_tuple_next_cnode->inputs().size() - 1;
}
MS_LOG(INFO) << "Set the make_tuple's operator info, and the op name is " << make_tuple_user_prim_name;
OperatorInfoPtr op_info = GetDistributeOperator(make_tuple_next_cnode);
MS_EXCEPTION_IF_NULL(op_info);
cnode->set_user_data<OperatorInfo>(op_info);
}
}
bool StepParallel(const FuncGraphPtr &root, const opt::OptimizerPtr &optimizer) {
MS_EXCEPTION_IF_NULL(root);
MS_EXCEPTION_IF_NULL(optimizer);
@ -2383,6 +2466,9 @@ bool StepParallel(const FuncGraphPtr &root, const opt::OptimizerPtr &optimizer)
ExtractInformation(all_nodes);
ReshapeInit(all_nodes);
}
HandleForwardMakeTuple(all_nodes);
// save strategy as checkpoint for multi-train
if (StrategyCheckpoint::GetInstance().SaveCheckPointOn()) {
CheckpointStrategy(root);

View File

@ -149,6 +149,8 @@ Status ParallelInit();
std::vector<std::string> ExtractInputsTensorName(const CNodePtr &node);
std::set<FuncGraphPtr> ForwardGraph(const FuncGraphPtr &root);
bool AnfNodeIsPrimitive(const AnfNodePtr &anf_node, const std::string &prim_name);
} // namespace parallel
} // namespace mindspore

View File

@ -1,6 +0,0 @@
file(GLOB_RECURSE MS_GVAR_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cc)
set_property(SOURCE ${MS_GVAR_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_COMMON)
add_library(mindspore_gvar SHARED ${MS_GVAR_SRC_LIST})
if (APPLE)
set_target_properties(mindspore_gvar PROPERTIES MACOSX_RPATH ON)
endif ()

View File

@ -62,12 +62,15 @@ add_subdirectory(text)
add_dependencies(utils core)
add_dependencies(kernels-image core)
add_dependencies(kernels-data core)
add_dependencies(kernels-soft-dvpp-image core soft-dvpp-utils)
add_dependencies(kernels core)
add_dependencies(engine-datasetops-source core)
add_dependencies(engine-datasetops-source-sampler core)
add_dependencies(engine-datasetops core)
add_dependencies(engine-datasetops-mapop core)
add_dependencies(engine-opt core)
add_dependencies(engine-cache-client core)
add_dependencies(engine-cache-server core)
add_dependencies(engine-perf core)
add_dependencies(engine-gnn core)
add_dependencies(engine core)
@ -88,6 +91,8 @@ set(submodules
$<TARGET_OBJECTS:kernels-image>
$<TARGET_OBJECTS:kernels-data>
$<TARGET_OBJECTS:cpp-API>
$<TARGET_OBJECTS:kernels-soft-dvpp-image>
$<TARGET_OBJECTS:soft-dvpp-utils>
$<TARGET_OBJECTS:engine-datasetops-source>
$<TARGET_OBJECTS:engine-datasetops-source-sampler>
$<TARGET_OBJECTS:engine-datasetops-mapop>
@ -126,7 +131,7 @@ endif()
######################################################################
################# Link with external libraries ########################
target_link_libraries(_c_dataengine PRIVATE mindspore mindspore_gvar)
target_link_libraries(_c_dataengine PRIVATE mindspore)
if (${CMAKE_SYSTEM_NAME} MATCHES "Windows")
if (ENABLE_PYTHON)
target_link_libraries(_c_dataengine PRIVATE mindspore::pybind11_module ${PYTHON_LIBRARIES} mindspore::protobuf ${SECUREC_LIBRARY})
@ -141,7 +146,7 @@ else()
target_link_libraries(_c_dataengine PRIVATE -ldl mindspore::protobuf ${SECUREC_LIBRARY})
endif()
endif()
target_link_libraries(_c_dataengine PUBLIC mindspore::jpeg_turbo mindspore::opencv_core mindspore::opencv_imgcodecs
target_link_libraries(_c_dataengine PUBLIC mindspore::jpeg_turbo mindspore::turbojpeg mindspore::opencv_core mindspore::opencv_imgcodecs
mindspore::opencv_imgproc mindspore::tinyxml2 mindspore::sentencepiece mindspore::sentencepiece_train ${ICU_LIB})
if (ENABLE_GPUQUE)
target_link_libraries(_c_dataengine PRIVATE gpu_queue

View File

@ -61,11 +61,19 @@ namespace api {
} while (false)
// Function to create the iterator, which will build and launch the execution tree.
std::shared_ptr<Iterator> Dataset::CreateIterator() {
std::shared_ptr<Iterator> Dataset::CreateIterator(std::vector<std::string> columns) {
std::shared_ptr<Iterator> iter;
try {
auto ds = shared_from_this();
// The specified columns will be selected from the dataset and passed down the pipeline
// in the order specified; other columns will be discarded.
if (!columns.empty()) {
ds = ds->Project(columns);
}
iter = std::make_shared<Iterator>();
Status rc = iter->BuildAndLaunchTree(shared_from_this());
Status rc = iter->BuildAndLaunchTree(ds);
if (rc.IsError()) {
MS_LOG(ERROR) << "CreateIterator failed." << rc;
return nullptr;
@ -629,13 +637,13 @@ bool VOCDataset::ValidateParams() {
}
Path imagesets_file = dir / "ImageSets" / "Segmentation" / mode_ + ".txt";
if (!imagesets_file.Exists()) {
MS_LOG(ERROR) << "[Segmentation] imagesets_file is invalid or not exist";
MS_LOG(ERROR) << "Invalid mode: " << mode_ << ", file \"" << imagesets_file << "\" is not exists!";
return false;
}
} else if (task_ == "Detection") {
Path imagesets_file = dir / "ImageSets" / "Main" / mode_ + ".txt";
if (!imagesets_file.Exists()) {
MS_LOG(ERROR) << "[Detection] imagesets_file is invalid or not exist.";
MS_LOG(ERROR) << "Invalid mode: " << mode_ << ", file \"" << imagesets_file << "\" is not exists!";
return false;
}
} else {
@ -655,18 +663,33 @@ std::vector<std::shared_ptr<DatasetOp>> VOCDataset::Build() {
sampler_ = CreateDefaultSampler();
}
std::shared_ptr<VOCOp::Builder> builder = std::make_shared<VOCOp::Builder>();
(void)builder->SetDir(dataset_dir_);
(void)builder->SetTask(task_);
(void)builder->SetMode(mode_);
(void)builder->SetNumWorkers(num_workers_);
(void)builder->SetSampler(std::move(sampler_->Build()));
(void)builder->SetDecode(decode_);
(void)builder->SetClassIndex(class_index_);
auto schema = std::make_unique<DataSchema>();
VOCOp::TaskType task_type_;
std::shared_ptr<VOCOp> op;
RETURN_EMPTY_IF_ERROR(builder->Build(&op));
node_ops.push_back(op);
if (task_ == "Segmentation") {
task_type_ = VOCOp::TaskType::Segmentation;
RETURN_EMPTY_IF_ERROR(schema->AddColumn(
ColDescriptor(std::string(kColumnImage), DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1)));
RETURN_EMPTY_IF_ERROR(schema->AddColumn(
ColDescriptor(std::string(kColumnTarget), DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1)));
} else if (task_ == "Detection") {
task_type_ = VOCOp::TaskType::Detection;
RETURN_EMPTY_IF_ERROR(schema->AddColumn(
ColDescriptor(std::string(kColumnImage), DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1)));
RETURN_EMPTY_IF_ERROR(schema->AddColumn(
ColDescriptor(std::string(kColumnBbox), DataType(DataType::DE_FLOAT32), TensorImpl::kFlexible, 1)));
RETURN_EMPTY_IF_ERROR(schema->AddColumn(
ColDescriptor(std::string(kColumnLabel), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1)));
RETURN_EMPTY_IF_ERROR(schema->AddColumn(
ColDescriptor(std::string(kColumnDifficult), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1)));
RETURN_EMPTY_IF_ERROR(schema->AddColumn(
ColDescriptor(std::string(kColumnTruncate), DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 1)));
}
std::shared_ptr<VOCOp> voc_op;
voc_op = std::make_shared<VOCOp>(task_type_, mode_, dataset_dir_, class_index_, num_workers_, rows_per_buffer_,
connector_que_size_, decode_, std::move(schema), std::move(sampler_->Build()));
node_ops.push_back(voc_op);
return node_ops;
}

View File

@ -30,6 +30,19 @@ void Iterator::GetNextRow(TensorMap *row) {
}
}
// Get the next row from the data pipeline.
void Iterator::GetNextRow(TensorVec *row) {
TensorRow tensor_row;
Status rc = iterator_->FetchNextTensorRow(&tensor_row);
if (rc.IsError()) {
MS_LOG(ERROR) << "GetNextRow: Failed to get next row.";
row->clear();
}
// Return the row as a plain vector
row->clear();
std::copy(tensor_row.begin(), tensor_row.end(), std::back_inserter(*row));
}
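A hypothetical consumption loop for the vector overload; since columns arrive by position, it pairs naturally with CreateIterator({"image", "label"}):
// Hypothetical driver, not part of the patch.
void DrainPipeline(std::shared_ptr<Iterator> iter) {
  TensorVec row;
  iter->GetNextRow(&row);
  while (!row.empty()) {   // an empty row signals end of data (or an error)
    auto image = row[0];   // column 0: "image"
    auto label = row[1];   // column 1: "label"
    iter->GetNextRow(&row);
  }
  iter->Stop();
}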
// Shut down the data pipeline.
void Iterator::Stop() {
// Releasing the iterator_ unique_ptr. This should trigger the destructor of iterator_.
@ -61,13 +74,20 @@ Status Iterator::BuildAndLaunchTree(std::shared_ptr<Dataset> ds) {
RETURN_STATUS_UNEXPECTED("Node operation returned nothing");
}
auto root_op = root_ops.front();
RETURN_UNEXPECTED_IF_NULL(root_op);
RETURN_IF_NOT_OK(tree_->AssociateNode(root_op));
q.push(std::make_pair(ds, root_op));
// Iterate through all the DatasetOps returned by Dataset's Build(), associate them
// with the execution tree and add the child and parent relationship between the nodes
// Note that some Dataset objects might return more than one DatasetOps
// e.g. MapDataset will return [ProjectOp, MapOp] if project_columns is set for MapDataset
std::shared_ptr<DatasetOp> prev_op = nullptr;
for (auto op : root_ops) {
RETURN_IF_NOT_OK(tree_->AssociateNode(op));
if (prev_op != nullptr) {
RETURN_IF_NOT_OK(prev_op->AddChild(op));
}
prev_op = op;
}
// Add the last DatasetOp to the queue for the BFS below.
q.push(std::make_pair(ds, root_ops.back()));
// Traverse down to the children and convert them to the corresponding DatasetOps (i.e. execution tree nodes)
while (!q.empty()) {
@ -94,7 +114,7 @@ Status Iterator::BuildAndLaunchTree(std::shared_ptr<Dataset> ds) {
q.push(std::make_pair(child, child_ops.back()));
}
}
RETURN_IF_NOT_OK(tree_->AssignRoot(root_op));
RETURN_IF_NOT_OK(tree_->AssignRoot(root_ops.front()));
}
// Launch the execution tree.

View File

@ -28,8 +28,10 @@
#include "minddata/dataset/kernels/image/hwc_to_chw_op.h"
#include "minddata/dataset/kernels/image/image_utils.h"
#include "minddata/dataset/kernels/image/invert_op.h"
#include "minddata/dataset/kernels/image/mixup_batch_op.h"
#include "minddata/dataset/kernels/image/normalize_op.h"
#include "minddata/dataset/kernels/image/pad_op.h"
#include "minddata/dataset/kernels/image/random_affine_op.h"
#include "minddata/dataset/kernels/image/random_color_adjust_op.h"
#include "minddata/dataset/kernels/image/random_crop_and_resize_op.h"
#include "minddata/dataset/kernels/image/random_crop_and_resize_with_bbox_op.h"
@ -48,6 +50,8 @@
#include "minddata/dataset/kernels/image/resize_bilinear_op.h"
#include "minddata/dataset/kernels/image/resize_op.h"
#include "minddata/dataset/kernels/image/resize_with_bbox_op.h"
#include "minddata/dataset/kernels/image/soft_dvpp/soft_dvpp_decode_random_crop_resize_jpeg_op.h"
#include "minddata/dataset/kernels/image/soft_dvpp/soft_dvpp_decode_resize_jpeg_op.h"
#include "minddata/dataset/kernels/image/uniform_aug_op.h"
namespace mindspore {
@ -92,6 +96,12 @@ PYBIND_REGISTER(CenterCropOp, 1, ([](const py::module *m) {
.def(py::init<int32_t, int32_t>(), py::arg("height"), py::arg("width") = CenterCropOp::kDefWidth);
}));
PYBIND_REGISTER(MixUpBatchOp, 1, ([](const py::module *m) {
(void)py::class_<MixUpBatchOp, TensorOp, std::shared_ptr<MixUpBatchOp>>(
*m, "MixUpBatchOp", "Tensor operation to mixup a batch of images")
.def(py::init<float>(), py::arg("alpha"));
}));
PYBIND_REGISTER(ResizeOp, 1, ([](const py::module *m) {
(void)py::class_<ResizeOp, TensorOp, std::shared_ptr<ResizeOp>>(
*m, "ResizeOp", "Tensor operation to resize an image. Takes height, width and mode")
@ -108,6 +118,19 @@ PYBIND_REGISTER(ResizeWithBBoxOp, 1, ([](const py::module *m) {
py::arg("interpolation") = ResizeWithBBoxOp::kDefInterpolation);
}));
PYBIND_REGISTER(RandomAffineOp, 1, ([](const py::module *m) {
(void)py::class_<RandomAffineOp, TensorOp, std::shared_ptr<RandomAffineOp>>(
*m, "RandomAffineOp", "Tensor operation to apply random affine transformations on an image.")
.def(py::init<std::vector<float_t>, std::vector<float_t>, std::vector<float_t>,
std::vector<float_t>, InterpolationMode, std::vector<uint8_t>>(),
py::arg("degrees") = RandomAffineOp::kDegreesRange,
py::arg("translate_range") = RandomAffineOp::kTranslationPercentages,
py::arg("scale_range") = RandomAffineOp::kScaleRange,
py::arg("shear_ranges") = RandomAffineOp::kShearRanges,
py::arg("interpolation") = RandomAffineOp::kDefInterpolation,
py::arg("fill_value") = RandomAffineOp::kFillValue);
}));
PYBIND_REGISTER(
RandomResizeWithBBoxOp, 1, ([](const py::module *m) {
(void)py::class_<RandomResizeWithBBoxOp, TensorOp, std::shared_ptr<RandomResizeWithBBoxOp>>(
@ -341,6 +364,24 @@ PYBIND_REGISTER(RandomSelectSubpolicyOp, 1, ([](const py::module *m) {
return std::make_shared<RandomSelectSubpolicyOp>(cpp_policy);
}));
}));
PYBIND_REGISTER(SoftDvppDecodeResizeJpegOp, 1, ([](const py::module *m) {
(void)py::class_<SoftDvppDecodeResizeJpegOp, TensorOp, std::shared_ptr<SoftDvppDecodeResizeJpegOp>>(
*m, "SoftDvppDecodeResizeJpegOp", "TensorOp to use soft dvpp decode and resize jpeg image.")
.def(py::init<int32_t, int32_t>(), py::arg("targetHeight"), py::arg("targetWidth"));
}));
PYBIND_REGISTER(
SoftDvppDecodeRandomCropResizeJpegOp, 1, ([](const py::module *m) {
(void)
py::class_<SoftDvppDecodeRandomCropResizeJpegOp, TensorOp, std::shared_ptr<SoftDvppDecodeRandomCropResizeJpegOp>>(
*m, "SoftDvppDecodeRandomCropResizeJpegOp",
"TensorOp to use soft dvpp decode, random crop and resize jepg image.")
.def(py::init<int32_t, int32_t, float, float, float, float, int32_t>(), py::arg("targetHeight"),
py::arg("targetWidth"), py::arg("scaleLb") = RandomCropDecodeResizeOp::kDefScaleLb,
py::arg("scaleUb") = RandomCropDecodeResizeOp::kDefScaleUb,
py::arg("aspectLb") = RandomCropDecodeResizeOp::kDefAspectLb,
py::arg("aspectUb") = RandomCropDecodeResizeOp::kDefAspectUb,
py::arg("maxIter") = RandomCropDecodeResizeOp::kDefMaxIter);
}));
} // namespace dataset
} // namespace mindspore

View File

@ -48,12 +48,12 @@ PYBIND_REGISTER(
ShardPkSample, 1, ([](const py::module *m) {
(void)py::class_<mindrecord::ShardPkSample, mindrecord::ShardOperator, std::shared_ptr<mindrecord::ShardPkSample>>(
*m, "MindrecordPkSampler")
.def(py::init([](int64_t kVal, std::string kColumn, bool shuffle) {
.def(py::init([](int64_t kVal, std::string kColumn, bool shuffle, int64_t num_samples) {
if (shuffle == true) {
return std::make_shared<mindrecord::ShardPkSample>(kColumn, kVal, std::numeric_limits<int64_t>::max(),
GetSeed());
GetSeed(), num_samples);
} else {
return std::make_shared<mindrecord::ShardPkSample>(kColumn, kVal);
return std::make_shared<mindrecord::ShardPkSample>(kColumn, kVal, num_samples);
}
}));
}));

View File

@ -21,8 +21,12 @@
#include "minddata/dataset/kernels/image/crop_op.h"
#include "minddata/dataset/kernels/image/cut_out_op.h"
#include "minddata/dataset/kernels/image/decode_op.h"
#include "minddata/dataset/kernels/image/hwc_to_chw_op.h"
#include "minddata/dataset/kernels/image/mixup_batch_op.h"
#include "minddata/dataset/kernels/image/normalize_op.h"
#include "minddata/dataset/kernels/data/one_hot_op.h"
#include "minddata/dataset/kernels/image/pad_op.h"
#include "minddata/dataset/kernels/image/random_affine_op.h"
#include "minddata/dataset/kernels/image/random_color_adjust_op.h"
#include "minddata/dataset/kernels/image/random_crop_op.h"
#include "minddata/dataset/kernels/image/random_horizontal_flip_op.h"
@ -81,6 +85,26 @@ std::shared_ptr<DecodeOperation> Decode(bool rgb) {
return op;
}
// Function to create HwcToChwOperation.
std::shared_ptr<HwcToChwOperation> HWC2CHW() {
auto op = std::make_shared<HwcToChwOperation>();
// Input validation
if (!op->ValidateParams()) {
return nullptr;
}
return op;
}
// Function to create MixUpBatchOperation.
std::shared_ptr<MixUpBatchOperation> MixUpBatch(float alpha) {
auto op = std::make_shared<MixUpBatchOperation>(alpha);
// Input validation
if (!op->ValidateParams()) {
return nullptr;
}
return op;
}
// Function to create NormalizeOperation.
std::shared_ptr<NormalizeOperation> Normalize(std::vector<float> mean, std::vector<float> std) {
auto op = std::make_shared<NormalizeOperation>(mean, std);
@ -91,6 +115,16 @@ std::shared_ptr<NormalizeOperation> Normalize(std::vector<float> mean, std::vect
return op;
}
// Function to create OneHotOperation.
std::shared_ptr<OneHotOperation> OneHot(int32_t num_classes) {
auto op = std::make_shared<OneHotOperation>(num_classes);
// Input validation
if (!op->ValidateParams()) {
return nullptr;
}
return op;
}
// Function to create PadOperation.
std::shared_ptr<PadOperation> Pad(std::vector<int32_t> padding, std::vector<uint8_t> fill_value,
BorderType padding_mode) {
@ -114,10 +148,27 @@ std::shared_ptr<RandomColorAdjustOperation> RandomColorAdjust(std::vector<float>
return op;
}
// Function to create RandomAffineOperation.
std::shared_ptr<RandomAffineOperation> RandomAffine(const std::vector<float_t> &degrees,
const std::vector<float_t> &translate_range,
const std::vector<float_t> &scale_range,
const std::vector<float_t> &shear_ranges,
InterpolationMode interpolation,
const std::vector<uint8_t> &fill_value) {
auto op = std::make_shared<RandomAffineOperation>(degrees, translate_range, scale_range, shear_ranges, interpolation,
fill_value);
// Input validation
if (!op->ValidateParams()) {
return nullptr;
}
return op;
}
// Function to create RandomCropOperation.
std::shared_ptr<RandomCropOperation> RandomCrop(std::vector<int32_t> size, std::vector<int32_t> padding,
bool pad_if_needed, std::vector<uint8_t> fill_value) {
auto op = std::make_shared<RandomCropOperation>(size, padding, pad_if_needed, fill_value);
bool pad_if_needed, std::vector<uint8_t> fill_value,
BorderType padding_mode) {
auto op = std::make_shared<RandomCropOperation>(size, padding, pad_if_needed, fill_value, padding_mode);
// Input validation
if (!op->ValidateParams()) {
return nullptr;
@ -271,6 +322,25 @@ bool DecodeOperation::ValidateParams() { return true; }
std::shared_ptr<TensorOp> DecodeOperation::Build() { return std::make_shared<DecodeOp>(rgb_); }
// HwcToChwOperation
bool HwcToChwOperation::ValidateParams() { return true; }
std::shared_ptr<TensorOp> HwcToChwOperation::Build() { return std::make_shared<HwcToChwOp>(); }
// MixUpBatchOperation
MixUpBatchOperation::MixUpBatchOperation(float alpha) : alpha_(alpha) {}
bool MixUpBatchOperation::ValidateParams() {
if (alpha_ < 0) {
MS_LOG(ERROR) << "MixUpBatch: alpha must be a positive floating value however it is: " << alpha_;
return false;
}
return true;
}
std::shared_ptr<TensorOp> MixUpBatchOperation::Build() { return std::make_shared<MixUpBatchOp>(alpha_); }
// NormalizeOperation
NormalizeOperation::NormalizeOperation(std::vector<float> mean, std::vector<float> std) : mean_(mean), std_(std) {}
@ -292,6 +362,20 @@ std::shared_ptr<TensorOp> NormalizeOperation::Build() {
return std::make_shared<NormalizeOp>(mean_[0], mean_[1], mean_[2], std_[0], std_[1], std_[2]);
}
// OneHotOperation
OneHotOperation::OneHotOperation(int32_t num_classes) : num_classes_(num_classes) {}
bool OneHotOperation::ValidateParams() {
if (num_classes_ < 0) {
MS_LOG(ERROR) << "OneHot: Number of classes cannot be negative. Number of classes: " << num_classes_;
return false;
}
return true;
}
std::shared_ptr<TensorOp> OneHotOperation::Build() { return std::make_shared<OneHotOp>(num_classes_); }
// PadOperation
PadOperation::PadOperation(std::vector<int32_t> padding, std::vector<uint8_t> fill_value, BorderType padding_mode)
: padding_(padding), fill_value_(fill_value), padding_mode_(padding_mode) {}
@ -401,10 +485,90 @@ std::shared_ptr<TensorOp> RandomColorAdjustOperation::Build() {
return tensor_op;
}
// RandomAffineOperation
RandomAffineOperation::RandomAffineOperation(const std::vector<float_t> &degrees,
const std::vector<float_t> &translate_range,
const std::vector<float_t> &scale_range,
const std::vector<float_t> &shear_ranges, InterpolationMode interpolation,
const std::vector<uint8_t> &fill_value)
: degrees_(degrees),
translate_range_(translate_range),
scale_range_(scale_range),
shear_ranges_(shear_ranges),
interpolation_(interpolation),
fill_value_(fill_value) {}
bool RandomAffineOperation::ValidateParams() {
// Degrees
if (degrees_.size() != 2) {
MS_LOG(ERROR) << "RandomAffine: degrees vector has incorrect size: degrees.size() = " << degrees_.size();
return false;
}
if (degrees_[0] > degrees_[1]) {
MS_LOG(ERROR) << "RandomAffine: minimum of degrees range is greater than maximum: min = " << degrees_[0]
<< ", max = " << degrees_[1];
return false;
}
// Translate
if (translate_range_.size() != 2) {
MS_LOG(ERROR) << "RandomAffine: translate_range vector has incorrect size: translate_range.size() = "
<< translate_range_.size();
return false;
}
if (translate_range_[0] > translate_range_[1]) {
MS_LOG(ERROR) << "RandomAffine: minimum of translate range is greater than maximum: min = " << translate_range_[0]
<< ", max = " << translate_range_[1];
return false;
}
// Scale
if (scale_range_.size() != 2) {
MS_LOG(ERROR) << "RandomAffine: scale_range vector has incorrect size: scale_range.size() = "
<< scale_range_.size();
return false;
}
if (scale_range_[0] > scale_range_[1]) {
MS_LOG(ERROR) << "RandomAffine: minimum of scale range is greater than maximum: min = " << scale_range_[0]
<< ", max = " << scale_range_[1];
return false;
}
// Shear
if (shear_ranges_.size() != 4) {
MS_LOG(ERROR) << "RandomAffine: shear_ranges vector has incorrect size: shear_ranges.size() = "
<< shear_ranges_.size();
return false;
}
if (shear_ranges_[0] > shear_ranges_[1]) {
MS_LOG(ERROR) << "RandomAffine: minimum of horizontal shear range is greater than maximum: min = "
<< shear_ranges_[0] << ", max = " << shear_ranges_[1];
return false;
}
if (shear_ranges_[2] > shear_ranges_[3]) {
MS_LOG(ERROR) << "RandomAffine: minimum of vertical shear range is greater than maximum: min = " << shear_ranges_[2]
<< ", max = " << scale_range_[3];
return false;
}
// Fill Value
if (fill_value_.size() != 3) {
MS_LOG(ERROR) << "RandomAffine: fill_value vector has incorrect size: fill_value.size() = " << fill_value_.size();
return false;
}
return true;
}
std::shared_ptr<TensorOp> RandomAffineOperation::Build() {
auto tensor_op = std::make_shared<RandomAffineOp>(degrees_, translate_range_, scale_range_, shear_ranges_,
interpolation_, fill_value_);
return tensor_op;
}
// RandomCropOperation
RandomCropOperation::RandomCropOperation(std::vector<int32_t> size, std::vector<int32_t> padding, bool pad_if_needed,
std::vector<uint8_t> fill_value)
: size_(size), padding_(padding), pad_if_needed_(pad_if_needed), fill_value_(fill_value) {}
std::vector<uint8_t> fill_value, BorderType padding_mode)
: size_(size),
padding_(padding),
pad_if_needed_(pad_if_needed),
fill_value_(fill_value),
padding_mode_(padding_mode) {}
bool RandomCropOperation::ValidateParams() {
if (size_.empty() || size_.size() > 2) {
@ -443,7 +607,7 @@ std::shared_ptr<TensorOp> RandomCropOperation::Build() {
}
auto tensor_op = std::make_shared<RandomCropOp>(crop_height, crop_width, pad_top, pad_bottom, pad_left, pad_right,
BorderType::kConstant, pad_if_needed_, fill_r, fill_g, fill_b);
padding_mode_, pad_if_needed_, fill_r, fill_g, fill_b);
return tensor_op;
}

View File

@ -20,6 +20,7 @@
#include "minddata/dataset/engine/datasetops/cache_op.h"
#include "minddata/dataset/engine/datasetops/cache_lookup_op.h"
#include "minddata/dataset/engine/datasetops/cache_merge_op.h"
#include "minddata/dataset/engine/datasetops/device_queue_op.h"
#include "minddata/dataset/engine/datasetops/epoch_ctrl_op.h"
namespace mindspore {
@ -258,6 +259,13 @@ Status RepeatPass::RunOnNode(std::shared_ptr<CacheLookupOp> node, bool *modified
return Status::OK();
}
Status RepeatPass::RunOnNode(std::shared_ptr<DeviceQueueOp> node, bool *modified) {
// Set total repeats and total epochs for the DeviceQueueOp
node->set_total_repeats(num_epochs_);
node->set_num_repeats_per_epoch(1);
return Status::OK();
}
// Adds an operator to the eoe operator stack save area
void RepeatPass::AddToEOEOpStack(std::shared_ptr<DatasetOp> dataset_op) {
op_stack *current_stack = eoe_op_stacks_.top().get();

View File

@ -92,6 +92,12 @@ class RepeatPass : public NodePass {
/// \return Status The error code return
Status RunOnNode(std::shared_ptr<CacheLookupOp> node, bool *modified) override;
/// \brief Set the epoch count for DeviceQueue
/// \param[in] node The node being visited
/// \param[inout] modified Indicator if the node was changed at all
/// \return Status The error code return
Status RunOnNode(std::shared_ptr<DeviceQueueOp> node, bool *modified) override;
/// \brief All operators have a flag that might be set related to the repeat and any leaf nodes need to be set up
/// for use with a controlling repeat above it.
/// \param[in] node The node being visited

View File

@ -196,8 +196,9 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
}
/// \brief Function to create an Iterator over the Dataset pipeline
/// \param[in] columns List of columns to be used to specify the order of columns
/// \return Shared pointer to the Iterator
std::shared_ptr<Iterator> CreateIterator();
std::shared_ptr<Iterator> CreateIterator(std::vector<std::string> columns = {});
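A hedged usage sketch for the new overload (dataset construction and column names are illustrative):
// Hypothetical helper: only "image" and "label" flow down the pipeline,
// in that order; all other columns are projected away before the tree is built.
std::shared_ptr<Iterator> MakeImageLabelIterator(std::shared_ptr<Dataset> ds) {
  return ds->CreateIterator({"image", "label"});
}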
/// \brief Function to create a BatchDataset
/// \notes Combines batch_size number of consecutive rows into batches
@ -452,6 +453,12 @@ class VOCDataset : public Dataset {
bool ValidateParams() override;
private:
const std::string kColumnImage = "image";
const std::string kColumnTarget = "target";
const std::string kColumnBbox = "bbox";
const std::string kColumnLabel = "label";
const std::string kColumnDifficult = "difficult";
const std::string kColumnTruncate = "truncate";
std::string dataset_dir_;
std::string task_;
std::string mode_;

View File

@ -37,6 +37,7 @@ namespace api {
class Dataset;
using TensorMap = std::unordered_map<std::string, std::shared_ptr<Tensor>>;
using TensorVec = std::vector<std::shared_ptr<Tensor>>;
// Abstract class for iterating over the dataset.
class Iterator {
@ -53,9 +54,15 @@ class Iterator {
Status BuildAndLaunchTree(std::shared_ptr<Dataset> ds);
/// \brief Function to get the next row from the data pipeline.
/// \note Type of return data is a map(with column name).
/// \param[out] row - the output tensor row.
void GetNextRow(TensorMap *row);
/// \brief Function to get the next row from the data pipeline.
/// \note Type of return data is a vector(without column name).
/// \param[out] row - the output tensor row.
void GetNextRow(TensorVec *row);
/// \brief Function to shut down the data pipeline.
void Stop();

View File

@ -51,8 +51,12 @@ class CenterCropOperation;
class CropOperation;
class CutOutOperation;
class DecodeOperation;
class HwcToChwOperation;
class MixUpBatchOperation;
class NormalizeOperation;
class OneHotOperation;
class PadOperation;
class RandomAffineOperation;
class RandomColorAdjustOperation;
class RandomCropOperation;
class RandomHorizontalFlipOperation;
@ -90,6 +94,18 @@ std::shared_ptr<CutOutOperation> CutOut(int32_t length, int32_t num_patches = 1)
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<DecodeOperation> Decode(bool rgb = true);
/// \brief Function to create a HwcToChw TensorOperation.
/// \notes Transpose the input image; shape (H, W, C) to shape (C, H, W).
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<HwcToChwOperation> HWC2CHW();
/// \brief Function to create a MixUpBatch TensorOperation.
/// \notes Apply MixUp transformation on an input batch of images and labels. The labels must be in one-hot format and
/// Batch must be called before calling this function.
/// \param[in] alpha hyperparameter of beta distribution (default = 1.0)
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<MixUpBatchOperation> MixUpBatch(float alpha = 1);
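Given the Batch-before-MixUpBatch constraint above, a plausible pipeline ordering looks like this (Dataset::Map and Dataset::Batch signatures are assumed, not shown in this diff; batch size and column names are illustrative):
std::shared_ptr<Dataset> ApplyMixUpBatch(std::shared_ptr<Dataset> ds, int32_t num_classes) {
  ds = ds->Map({OneHot(num_classes)}, {"label"});       // labels -> one-hot first
  ds = ds->Batch(32);                                   // MixUpBatch operates on a batch
  ds = ds->Map({MixUpBatch(1.0)}, {"image", "label"});  // then mix images and labels
  return ds;
}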
/// \brief Function to create a Normalize TensorOperation.
/// \notes Normalize the input image with respect to mean and standard deviation.
/// \param[in] mean - a vector of mean values for each channel, w.r.t channel order.
@ -97,6 +113,12 @@ std::shared_ptr<DecodeOperation> Decode(bool rgb = true);
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<NormalizeOperation> Normalize(std::vector<float> mean, std::vector<float> std);
/// \brief Function to create a OneHot TensorOperation.
/// \notes Convert the labels into OneHot format.
/// \param[in] num_classes number of classes.
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<OneHotOperation> OneHot(int32_t num_classes);
/// \brief Function to create a Pad TensorOp
/// \notes Pads the image according to padding parameters
/// \param[in] padding A vector representing the number of pixels to pad the image
@ -119,6 +141,23 @@ std::shared_ptr<NormalizeOperation> Normalize(std::vector<float> mean, std::vect
std::shared_ptr<PadOperation> Pad(std::vector<int32_t> padding, std::vector<uint8_t> fill_value = {0},
BorderType padding_mode = BorderType::kConstant);
/// \brief Function to create a RandomAffine TensorOperation.
/// \notes Applies a Random Affine transformation on input image in RGB or Greyscale mode.
/// \param[in] degrees A float vector size 2, representing the starting and ending degree
/// \param[in] translate_range A float vector size 2, representing percentages of translation on x and y axes.
/// \param[in] scale_range A float vector size 2, representing the starting and ending scales in the range.
/// \param[in] shear_ranges A float vector size 4, representing the starting and ending shear degrees horizontally and
/// vertically.
/// \param[in] interpolation An enum for the mode of interpolation
/// \param[in] fill_value A uint8_t vector size 3, representing the pixel intensity of the borders; it is used to
/// fill R, G, B channels respectively.
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<RandomAffineOperation> RandomAffine(
const std::vector<float_t> &degrees, const std::vector<float_t> &translate_range = {0.0, 0.0},
const std::vector<float_t> &scale_range = {1.0, 1.0}, const std::vector<float_t> &shear_ranges = {0.0, 0.0, 0.0, 0.0},
InterpolationMode interpolation = InterpolationMode::kNearestNeighbour,
const std::vector<uint8_t> &fill_value = {0, 0, 0});
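A hedged example of calling the factory; the concrete ranges are illustrative:
// Rotation in [-25, 25] degrees, up to 10% translation per axis,
// scale in [0.8, 1.2], horizontal shear in [-9, 9] degrees, no vertical shear.
std::shared_ptr<TensorOperation> affine = RandomAffine(
    {-25.0, 25.0},           // degrees {min, max}
    {0.1, 0.1},              // translate_range {x, y}, fractions of the image size
    {0.8, 1.2},              // scale_range {min, max}
    {-9.0, 9.0, 0.0, 0.0});  // shear_ranges {min_x, max_x, min_y, max_y}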
/// \brief Randomly adjust the brightness, contrast, saturation, and hue of the input image
/// \param[in] brightness Brightness adjustment factor. Must be a vector of one or two values
/// if it's a vector of two values it needs to be in the form of [min, max]. Default value is {1, 1}
@ -148,8 +187,8 @@ std::shared_ptr<RandomColorAdjustOperation> RandomColorAdjust(std::vector<float>
/// fill R, G, B channels respectively.
/// \return Shared pointer to the current TensorOperation.
std::shared_ptr<RandomCropOperation> RandomCrop(std::vector<int32_t> size, std::vector<int32_t> padding = {0, 0, 0, 0},
bool pad_if_needed = false,
std::vector<uint8_t> fill_value = {0, 0, 0});
bool pad_if_needed = false, std::vector<uint8_t> fill_value = {0, 0, 0},
BorderType padding_mode = BorderType::kConstant);
/// \brief Function to create a RandomHorizontalFlip TensorOperation.
/// \notes Tensor operation to perform random horizontal flip.
@ -258,6 +297,29 @@ class DecodeOperation : public TensorOperation {
bool rgb_;
};
class HwcToChwOperation : public TensorOperation {
public:
~HwcToChwOperation() = default;
std::shared_ptr<TensorOp> Build() override;
bool ValidateParams() override;
};
class MixUpBatchOperation : public TensorOperation {
public:
explicit MixUpBatchOperation(float alpha = 1);
~MixUpBatchOperation() = default;
std::shared_ptr<TensorOp> Build() override;
bool ValidateParams() override;
private:
float alpha_;
};
class NormalizeOperation : public TensorOperation {
public:
NormalizeOperation(std::vector<float> mean, std::vector<float> std);
@ -273,6 +335,20 @@ class NormalizeOperation : public TensorOperation {
std::vector<float> std_;
};
class OneHotOperation : public TensorOperation {
public:
explicit OneHotOperation(int32_t num_classes);
~OneHotOperation() = default;
std::shared_ptr<TensorOp> Build() override;
bool ValidateParams() override;
private:
int32_t num_classes_;
};
class PadOperation : public TensorOperation {
public:
PadOperation(std::vector<int32_t> padding, std::vector<uint8_t> fill_value = {0},
@ -290,6 +366,29 @@ class PadOperation : public TensorOperation {
BorderType padding_mode_;
};
class RandomAffineOperation : public TensorOperation {
public:
RandomAffineOperation(const std::vector<float_t> &degrees, const std::vector<float_t> &translate_range = {0.0, 0.0},
const std::vector<float_t> &scale_range = {1.0, 1.0},
const std::vector<float_t> &shear_ranges = {0.0, 0.0, 0.0, 0.0},
InterpolationMode interpolation = InterpolationMode::kNearestNeighbour,
const std::vector<uint8_t> &fill_value = {0, 0, 0});
~RandomAffineOperation() = default;
std::shared_ptr<TensorOp> Build() override;
bool ValidateParams() override;
private:
std::vector<float_t> degrees_; // min_degree, max_degree
std::vector<float_t> translate_range_; // maximum x translation percentage, maximum y translation percentage
std::vector<float_t> scale_range_; // min_scale, max_scale
std::vector<float_t> shear_ranges_; // min_x_shear, max_x_shear, min_y_shear, max_y_shear
InterpolationMode interpolation_;
std::vector<uint8_t> fill_value_;
};
class RandomColorAdjustOperation : public TensorOperation {
public:
RandomColorAdjustOperation(std::vector<float> brightness = {1.0, 1.0}, std::vector<float> contrast = {1.0, 1.0},
@ -311,7 +410,8 @@ class RandomColorAdjustOperation : public TensorOperation {
class RandomCropOperation : public TensorOperation {
public:
RandomCropOperation(std::vector<int32_t> size, std::vector<int32_t> padding = {0, 0, 0, 0},
bool pad_if_needed = false, std::vector<uint8_t> fill_value = {0, 0, 0});
bool pad_if_needed = false, std::vector<uint8_t> fill_value = {0, 0, 0},
BorderType padding_mode = BorderType::kConstant);
~RandomCropOperation() = default;
@ -324,6 +424,7 @@ class RandomCropOperation : public TensorOperation {
std::vector<int32_t> padding_;
bool pad_if_needed_;
std::vector<uint8_t> fill_value_;
BorderType padding_mode_;
};
class RandomHorizontalFlipOperation : public TensorOperation {

View File

@ -20,6 +20,7 @@
#include <limits>
#include <string>
#include <vector>
#include <utility>
#include "minddata/dataset/core/constants.h"
#include "minddata/dataset/core/data_type.h"
@ -648,5 +649,30 @@ Status Concatenate(const TensorRow &input, TensorRow *output, int8_t axis, std::
return Status::OK();
}
Status BatchTensorToCVTensorVector(const std::shared_ptr<Tensor> &input,
std::vector<std::shared_ptr<CVTensor>> *output) {
std::vector<int64_t> tensor_shape = input->shape().AsVector();
TensorShape remaining({-1});
std::vector<int64_t> index(tensor_shape.size(), 0);
if (tensor_shape.size() <= 1) {
RETURN_STATUS_UNEXPECTED("Tensor must be at least 2-D in order to unpack");
}
TensorShape element_shape(std::vector<int64_t>(tensor_shape.begin() + 1, tensor_shape.end()));
for (; index[0] < tensor_shape[0]; index[0]++) {
uchar *start_addr_of_index = nullptr;
std::shared_ptr<Tensor> out;
RETURN_IF_NOT_OK(input->StartAddrOfIndex(index, &start_addr_of_index, &remaining));
RETURN_IF_NOT_OK(input->CreateFromMemory(element_shape, input->type(), start_addr_of_index, &out));
std::shared_ptr<CVTensor> cv_out = CVTensor::AsCVTensor(std::move(out));
if (!cv_out->mat().data) {
RETURN_STATUS_UNEXPECTED("Could not convert to CV Tensor");
}
output->push_back(cv_out);
}
return Status::OK();
}
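Concretely, the unpack walks the first axis: a batch of shape (N, H, W, C) yields N CVTensors of shape (H, W, C). A hypothetical caller:
// Hypothetical: split a batch of 32 HWC images into per-image CVTensors.
Status SplitBatch(const std::shared_ptr<Tensor> &batch /* shape (32, 224, 224, 3) */) {
  std::vector<std::shared_ptr<CVTensor>> images;
  RETURN_IF_NOT_OK(BatchTensorToCVTensorVector(batch, &images));
  // images.size() == 32, each of shape (224, 224, 3) with the batch's dtype
  return Status::OK();
}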
} // namespace dataset
} // namespace mindspore

View File

@ -152,6 +152,17 @@ Status Mask(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *outpu
Status Concatenate(const TensorRow &input, TensorRow *output, int8_t axis, std::shared_ptr<Tensor> prepend,
std::shared_ptr<Tensor> append);
// helper for concat: always appends to the input and passes that to the output
Status ConcatenateHelper(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int8_t axis,
std::shared_ptr<Tensor> append);
/// Convert an n-dimensional Tensor to a vector of (n-1)-dimensional CVTensors
/// @param input[in] input tensor
/// @param output[out] output tensor
/// @return Status ok/error
Status BatchTensorToCVTensorVector(const std::shared_ptr<Tensor> &input,
std::vector<std::shared_ptr<CVTensor>> *output);
} // namespace dataset
} // namespace mindspore

View File

@ -1,6 +1,8 @@
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
add_subdirectory(soft_dvpp)
add_library(kernels-image OBJECT
affine_op.cc
auto_contrast_op.cc
center_crop_op.cc
crop_op.cc
@ -10,8 +12,11 @@ add_library(kernels-image OBJECT
hwc_to_chw_op.cc
image_utils.cc
invert_op.cc
math_utils.cc
mixup_batch_op.cc
normalize_op.cc
pad_op.cc
random_affine_op.cc
random_color_adjust_op.cc
random_crop_decode_resize_op.cc
random_crop_and_resize_with_bbox_op.cc
@ -34,3 +39,4 @@ add_library(kernels-image OBJECT
resize_with_bbox_op.cc
random_resize_with_bbox_op.cc
)
add_dependencies(kernels-image kernels-soft-dvpp-image)

View File

@ -0,0 +1,99 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <algorithm>
#include <random>
#include <utility>
#include <vector>
#include "minddata/dataset/kernels/image/affine_op.h"
#include "minddata/dataset/kernels/image/image_utils.h"
#include "minddata/dataset/kernels/image/math_utils.h"
#include "minddata/dataset/util/random.h"
namespace mindspore {
namespace dataset {
const InterpolationMode AffineOp::kDefInterpolation = InterpolationMode::kNearestNeighbour;
const float_t AffineOp::kDegrees = 0.0;
const std::vector<float_t> AffineOp::kTranslation = {0.0, 0.0};
const float_t AffineOp::kScale = 1.0;
const std::vector<float_t> AffineOp::kShear = {0.0, 0.0};
const std::vector<uint8_t> AffineOp::kFillValue = {0, 0, 0};
AffineOp::AffineOp(float_t degrees, const std::vector<float_t> &translation, float_t scale,
const std::vector<float_t> &shear, InterpolationMode interpolation,
const std::vector<uint8_t> &fill_value)
: degrees_(degrees),
translation_(translation),
scale_(scale),
shear_(shear),
interpolation_(interpolation),
fill_value_(fill_value) {}
Status AffineOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
float_t translation_x = translation_[0];
float_t translation_y = translation_[1];
float_t degrees = 0.0;
DegreesToRadians(degrees_, &degrees);
float_t shear_x = shear_[0];
float_t shear_y = shear_[1];
DegreesToRadians(shear_x, &shear_x);
DegreesToRadians(-1 * shear_y, &shear_y);
std::shared_ptr<CVTensor> input_cv = CVTensor::AsCVTensor(input);
// Apply Affine Transformation
// T is translation matrix: [1, 0, tx | 0, 1, ty | 0, 0, 1]
// C is translation matrix to keep center: [1, 0, cx | 0, 1, cy | 0, 0, 1]
// RSS is rotation with scale and shear matrix
// RSS(a, s, (sx, sy)) =
// = R(a) * S(s) * SHy(sy) * SHx(sx)
// = [ s*cos(a - sy)/cos(sy), s*(-cos(a - sy)*tan(sx)/cos(sy) - sin(a)), 0 ]
// [ s*sin(a - sy)/cos(sy), s*(-sin(a - sy)*tan(sx)/cos(sy) + cos(a)), 0 ]
// [ 0 , 0 , 1 ]
//
// where R is a rotation matrix, S is a scaling matrix, and SHx and SHy are the shears:
// SHx(s) = [1, -tan(s)] and SHy(s) = [1 , 0]
// [0, 1 ] [-tan(s), 1]
//
// Thus, the affine matrix is M = T * C * RSS * C^-1
float_t cx = ((input_cv->mat().cols - 1) / 2.0);
float_t cy = ((input_cv->mat().rows - 1) / 2.0);
// Calculate RSS
std::vector<float_t> matrix{scale_ * cos(degrees + shear_y) / cos(shear_y),
scale_ * (-1 * cos(degrees + shear_y) * tan(shear_x) / cos(shear_y) - sin(degrees)),
0,
scale_ * sin(degrees + shear_y) / cos(shear_y),
scale_ * (-1 * sin(degrees + shear_y) * tan(shear_x) / cos(shear_y) + cos(degrees)),
0};
// Compute T * C * RSS * C^-1
matrix[2] = (1 - matrix[0]) * cx - matrix[1] * cy + translation_x;
matrix[5] = (1 - matrix[4]) * cy - matrix[3] * cx + translation_y;
cv::Mat affine_mat(matrix);
affine_mat = affine_mat.reshape(1, {2, 3});
std::shared_ptr<CVTensor> output_cv;
RETURN_IF_NOT_OK(CVTensor::CreateEmpty(input_cv->shape(), input_cv->type(), &output_cv));
RETURN_UNEXPECTED_IF_NULL(output_cv);
cv::warpAffine(input_cv->mat(), output_cv->mat(), affine_mat, input_cv->mat().size(),
GetCVInterpolationMode(interpolation_), cv::BORDER_CONSTANT,
cv::Scalar(fill_value_[0], fill_value_[1], fill_value_[2]));
(*output) = std::static_pointer_cast<Tensor>(output_cv);
return Status::OK();
}
} // namespace dataset
} // namespace mindspore
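A quick identity sanity check, assuming the defaults above: with 0 degrees, no translation, unit scale and no shear, the RSS block is [1 0; 0 1] and both translation terms vanish, so the matrix reduces to [1 0 0; 0 1 0] and Compute should copy the image unchanged (hypothetical helper, not part of the patch):
Status AffineIdentity(const std::shared_ptr<Tensor> &in, std::shared_ptr<Tensor> *out) {
  // Defaults: 0 degrees, translation {0, 0}, scale 1, shear {0, 0}.
  AffineOp noop(AffineOp::kDegrees);
  return noop.Compute(in, out);  // *out should equal in, pixel for pixel
}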

View File

@ -0,0 +1,68 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_AFFINE_OP_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_AFFINE_OP_H_
#include <memory>
#include <string>
#include <vector>
#include "minddata/dataset/core/cv_tensor.h"
#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/kernels/tensor_op.h"
#include "minddata/dataset/util/status.h"
namespace mindspore {
namespace dataset {
class AffineOp : public TensorOp {
public:
/// Default values
static const float_t kDegrees;
static const std::vector<float_t> kTranslation;
static const float_t kScale;
static const std::vector<float_t> kShear;
static const InterpolationMode kDefInterpolation;
static const std::vector<uint8_t> kFillValue;
/// Constructor
public:
explicit AffineOp(float_t degrees, const std::vector<float_t> &translation = kTranslation, float_t scale = kScale,
const std::vector<float_t> &shear = kShear, InterpolationMode interpolation = kDefInterpolation,
const std::vector<uint8_t> &fill_value = kFillValue);
~AffineOp() override = default;
std::string Name() const override { return kAffineOp; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
/// Member variables
private:
std::string kAffineOp = "AffineOp";
protected:
float_t degrees_;
std::vector<float_t> translation_; // translation_x and translation_y
float_t scale_;
std::vector<float_t> shear_; // shear_x and shear_y
InterpolationMode interpolation_;
std::vector<uint8_t> fill_value_;
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_AFFINE_OP_H_

View File

@ -21,6 +21,7 @@
#include <utility>
#include <opencv2/imgcodecs.hpp>
#include "utils/ms_utils.h"
#include "minddata/dataset/kernels/image/math_utils.h"
#include "minddata/dataset/core/constants.h"
#include "minddata/dataset/core/cv_tensor.h"
#include "minddata/dataset/core/tensor.h"
@ -631,36 +632,9 @@ Status AutoContrast(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor
hist.col(0).copyTo(hist_vec);
// Ignore values in ignore
for (const auto &item : ignore) hist_vec[item] = 0;
int32_t n = std::accumulate(hist_vec.begin(), hist_vec.end(), 0);
// Find pixel values that are in the low cutoff and high cutoff.
int32_t cut = static_cast<int32_t>((cutoff / 100.0) * n);
if (cut != 0) {
for (int32_t lo = 0; lo < 256 && cut > 0; lo++) {
if (cut > hist_vec[lo]) {
cut -= hist_vec[lo];
hist_vec[lo] = 0;
} else {
hist_vec[lo] -= cut;
cut = 0;
}
}
cut = static_cast<int32_t>((cutoff / 100.0) * n);
for (int32_t hi = 255; hi >= 0 && cut > 0; hi--) {
if (cut > hist_vec[hi]) {
cut -= hist_vec[hi];
hist_vec[hi] = 0;
} else {
hist_vec[hi] -= cut;
cut = 0;
}
}
}
int32_t lo = 0;
int32_t hi = 255;
for (; lo < 256 && !hist_vec[lo]; lo++) {
}
for (; hi >= 0 && !hist_vec[hi]; hi--) {
}
int32_t lo = 0;
RETURN_IF_NOT_OK(ComputeUpperAndLowerPercentiles(&hist_vec, cutoff, cutoff, &hi, &lo));
if (hi <= lo) {
for (int32_t i = 0; i < 256; i++) {
table.push_back(i);
@ -685,7 +659,6 @@ Status AutoContrast(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor
std::shared_ptr<CVTensor> output_cv;
RETURN_IF_NOT_OK(CVTensor::CreateFromMat(result, &output_cv));
(*output) = std::static_pointer_cast<Tensor>(output_cv);
(*output)->Reshape(input->shape());
} catch (const cv::Exception &e) {
RETURN_STATUS_UNEXPECTED("Error in auto contrast");
@ -983,5 +956,24 @@ Status UpdateBBoxesForResize(const std::shared_ptr<Tensor> &bboxList, const size
return Status::OK();
}
Status GetJpegImageInfo(const std::shared_ptr<Tensor> &input, int *img_width, int *img_height) {
struct jpeg_decompress_struct cinfo {};
struct JpegErrorManagerCustom jerr {};
cinfo.err = jpeg_std_error(&jerr.pub);
jerr.pub.error_exit = JpegErrorExitCustom;
try {
jpeg_create_decompress(&cinfo);
JpegSetSource(&cinfo, input->GetBuffer(), input->SizeInBytes());
(void)jpeg_read_header(&cinfo, TRUE);
jpeg_calc_output_dimensions(&cinfo);
} catch (std::runtime_error &e) {
jpeg_destroy_decompress(&cinfo);
RETURN_STATUS_UNEXPECTED(e.what());
}
*img_height = cinfo.output_height;
*img_width = cinfo.output_width;
jpeg_destroy_decompress(&cinfo);
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

View File

@ -268,6 +268,12 @@ Status PadBBoxes(const std::shared_ptr<Tensor> *bboxList, const size_t &bboxCoun
Status UpdateBBoxesForResize(const std::shared_ptr<Tensor> &bboxList, const size_t &bboxCount, int32_t target_width_,
int32_t target_height_, int orig_width, int orig_height);
// Get jpeg image width and height
// @param input: Tensor containing the undecoded image as 1-D bytes
// @param img_width: the jpeg image width
// @param img_height: the jpeg image height
Status GetJpegImageInfo(const std::shared_ptr<Tensor> &input, int *img_width, int *img_height);
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_IMAGE_UTILS_H_

View File

@ -0,0 +1,84 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minddata/dataset/kernels/image/math_utils.h"
#include <opencv2/imgproc/types_c.h>
#include <algorithm>
#include <numeric>
#include <string>
namespace mindspore {
namespace dataset {
Status ComputeUpperAndLowerPercentiles(std::vector<int32_t> *hist, int32_t hi_p, int32_t low_p, int32_t *hi,
int32_t *lo) {
try {
int32_t n = std::accumulate(hist->begin(), hist->end(), 0);
int32_t cut = static_cast<int32_t>((low_p / 100.0) * n);
for (int32_t lb = 0; lb < static_cast<int32_t>(hist->size()) && cut > 0; lb++) {
if (cut > (*hist)[lb]) {
cut -= (*hist)[lb];
(*hist)[lb] = 0;
} else {
(*hist)[lb] -= cut;
cut = 0;
}
}
cut = static_cast<int32_t>((hi_p / 100.0) * n);
for (int32_t ub = hist->size() - 1; ub >= 0 && cut > 0; ub--) {
if (cut > (*hist)[ub]) {
cut -= (*hist)[ub];
(*hist)[ub] = 0;
} else {
(*hist)[ub] -= cut;
cut = 0;
}
}
*lo = 0;
*hi = hist->size() - 1;
for (; (*lo) < (*hi) && !(*hist)[*lo]; (*lo)++) {
}
for (; (*hi) >= 0 && !(*hist)[*hi]; (*hi)--) {
}
} catch (const std::exception &e) {
const char *err_msg = e.what();
std::string err_message = "Error in ComputeUpperAndLowerPercentiles: ";
err_message += err_msg;
RETURN_STATUS_UNEXPECTED(err_message);
}
return Status::OK();
}
Status DegreesToRadians(float_t degrees, float_t *radians_target) {
*radians_target = CV_PI * degrees / 180.0;
return Status::OK();
}
Status GenerateRealNumber(float_t a, float_t b, std::mt19937 *rnd, float_t *result) {
try {
std::uniform_real_distribution<float_t> distribution{a, b};
*result = distribution(*rnd);
} catch (const std::exception &e) {
const char *err_msg = e.what();
std::string err_message = "Error in GenerateRealNumber: ";
err_message += err_msg;
RETURN_STATUS_UNEXPECTED(err_message);
}
return Status::OK();
}
} // namespace dataset
} // namespace mindspore
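To make the two-pass cutoff above concrete, here is a toy walk-through of ComputeUpperAndLowerPercentiles; the histogram values are invented for the example.
// Toy 4-bin histogram {0, 10, 80, 10}, n = 100, hi_p = low_p = 10.
// The low pass trims 10 counts from the left (emptying bin 1); the high pass
// trims 10 counts from the right (emptying bin 3); only bin 2 keeps mass.
std::vector<int32_t> hist = {0, 10, 80, 10};
int32_t hi = 0;
int32_t lo = 0;
Status rc = ComputeUpperAndLowerPercentiles(&hist, 10, 10, &hi, &lo);
// Expected with these toy numbers: lo == 2 and hi == 2.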

View File

@ -0,0 +1,50 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_MATH_UTILS_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_MATH_UTILS_H_
#include <memory>
#include <random>
#include <vector>
#include "minddata/dataset/util/status.h"
namespace mindspore {
namespace dataset {
/// \brief Returns lower and upper pth percentiles of the input histogram.
/// \param[in,out] hist: Input histogram (modified in place during the computation)
/// \param[in] hi_p: Right side percentile
/// \param[in] low_p: Left side percentile
/// \param[out] hi: Value at high end percentile
/// \param[out] lo: Value at low end percentile
Status ComputeUpperAndLowerPercentiles(std::vector<int32_t> *hist, int32_t hi_p, int32_t low_p, int32_t *hi,
int32_t *lo);
/// \brief Converts degrees input to radians.
/// \param[in] degrees: Input degrees
/// \param[out] radians_target: Radians output
Status DegreesToRadians(float_t degrees, float_t *radians_target);
/// \brief Generates a random real number in [a,b).
/// \param[in] a: Start of range
/// \param[in] b: End of range
/// \param[in] rnd: Random number generator
/// \param[out] result: Random number in range [a,b)
Status GenerateRealNumber(float_t a, float_t b, std::mt19937 *rnd, float_t *result);
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_MATH_UTILS_H_
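And a hedged sketch of the two smaller helpers declared above; the seed and ranges are invented for the example.
std::mt19937 rnd(42);                               // example seed, not from the source
float_t radians = 0.0;
(void)DegreesToRadians(90.0, &radians);             // radians ~= 1.5708 (CV_PI / 2)
float_t sample = 0.0;
(void)GenerateRealNumber(0.0, 1.0, &rnd, &sample);  // uniform draw in [0, 1)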

View File

@ -0,0 +1,108 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <string>
#include <utility>
#include "minddata/dataset/core/cv_tensor.h"
#include "minddata/dataset/kernels/image/mixup_batch_op.h"
#include "minddata/dataset/kernels/data/data_utils.h"
#include "minddata/dataset/util/random.h"
#include "minddata/dataset/util/status.h"
namespace mindspore {
namespace dataset {
MixUpBatchOp::MixUpBatchOp(float alpha) : alpha_(alpha) { rnd_.seed(GetSeed()); }
Status MixUpBatchOp::Compute(const TensorRow &input, TensorRow *output) {
if (input.size() < 2) {
RETURN_STATUS_UNEXPECTED("Both images and labels columns are required for this operation");
}
std::vector<std::shared_ptr<CVTensor>> images;
std::vector<int64_t> image_shape = input.at(0)->shape().AsVector();
std::vector<int64_t> label_shape = input.at(1)->shape().AsVector();
// Check inputs
if (label_shape.size() != 2 || image_shape.size() != 4 || image_shape[0] != label_shape[0]) {
RETURN_STATUS_UNEXPECTED("You must batch before calling MixUpBatch");
}
if ((image_shape[1] != 1 && image_shape[1] != 3) && (image_shape[3] != 1 && image_shape[3] != 3)) {
RETURN_STATUS_UNEXPECTED("MixUpBatch: Images must be in the shape of HWC or CHW");
}
// Move images into a vector of CVTensors
RETURN_IF_NOT_OK(BatchTensorToCVTensorVector(input.at(0), &images));
// Calculating lambda
// If x1 is a random variable from Gamma(a1, 1) and x2 is a random variable from Gamma(a2, 1)
// then x = x1 / (x1+x2) is a random variable from Beta(a1, a2)
std::gamma_distribution<float> distribution(alpha_, 1);
float x1 = distribution(rnd_);
float x2 = distribution(rnd_);
float lam = x1 / (x1 + x2);
// Calculate random labels
std::vector<int64_t> rand_indx;
for (int64_t i = 0; i < images.size(); i++) rand_indx.push_back(i);
std::shuffle(rand_indx.begin(), rand_indx.end(), rnd_);
// Compute labels
std::shared_ptr<Tensor> out_labels;
RETURN_IF_NOT_OK(TypeCast(std::move(input.at(1)), &out_labels, DataType("float32")));
for (int64_t i = 0; i < label_shape[0]; i++) {
for (int64_t j = 0; j < label_shape[1]; j++) {
uint64_t first_value, second_value;
RETURN_IF_NOT_OK(input.at(1)->GetItemAt(&first_value, {i, j}));
RETURN_IF_NOT_OK(input.at(1)->GetItemAt(&second_value, {rand_indx[i], j}));
RETURN_IF_NOT_OK(out_labels->SetItemAt({i, j}, lam * first_value + (1 - lam) * second_value));
}
}
// Compute images
for (int64_t i = 0; i < images.size(); i++) {
TensorShape remaining({-1});
uchar *start_addr_of_index = nullptr;
std::shared_ptr<Tensor> out;
RETURN_IF_NOT_OK(input.at(0)->StartAddrOfIndex({rand_indx[i], 0, 0, 0}, &start_addr_of_index, &remaining));
RETURN_IF_NOT_OK(input.at(0)->CreateFromMemory(TensorShape({image_shape[1], image_shape[2], image_shape[3]}),
input.at(0)->type(), start_addr_of_index, &out));
std::shared_ptr<CVTensor> rand_image = CVTensor::AsCVTensor(std::move(out));
if (!rand_image->mat().data) {
RETURN_STATUS_UNEXPECTED("Could not convert to CV Tensor");
}
images[i]->mat() = lam * images[i]->mat() + (1 - lam) * rand_image->mat();
}
// Move the output into a TensorRow
std::shared_ptr<Tensor> output_image;
RETURN_IF_NOT_OK(Tensor::CreateEmpty(input.at(0)->shape(), input.at(0)->type(), &output_image));
for (int64_t i = 0; i < images.size(); i++) {
RETURN_IF_NOT_OK(output_image->InsertTensor({i}, images[i]));
}
output->push_back(output_image);
output->push_back(out_labels);
return Status::OK();
}
void MixUpBatchOp::Print(std::ostream &out) const {
out << "MixUpBatchOp: "
<< "alpha: " << alpha_ << "\n";
}
} // namespace dataset
} // namespace mindspore
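The lambda computation in Compute relies on the Gamma-to-Beta identity stated in the comment above; restated as a standalone, hedged sketch (the function name is an invented illustration):
// Sample lam ~ Beta(alpha, alpha) from two Gamma(alpha, 1) draws, the same
// trick used by MixUpBatchOp::Compute: x1 / (x1 + x2) is Beta distributed.
float SampleSymmetricBeta(float alpha, std::mt19937 *rnd) {
  std::gamma_distribution<float> gamma(alpha, 1.0f);
  float x1 = gamma(*rnd);
  float x2 = gamma(*rnd);
  return x1 / (x1 + x2);  // lies in (0, 1)
}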

View File

@ -0,0 +1,51 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_MIXUPBATCH_OP_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_MIXUPBATCH_OP_H_
#include <memory>
#include <vector>
#include <random>
#include <string>
#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/kernels/tensor_op.h"
#include "minddata/dataset/util/status.h"
namespace mindspore {
namespace dataset {
class MixUpBatchOp : public TensorOp {
public:
// Default values, also used by python_bindings.cc
explicit MixUpBatchOp(float alpha);
~MixUpBatchOp() override = default;
void Print(std::ostream &out) const override;
Status Compute(const TensorRow &input, TensorRow *output) override;
std::string Name() const override { return kMixUpBatchOp; }
private:
float alpha_;
std::mt19937 rnd_;
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_MIXUPBATCH_OP_H_

View File

@ -0,0 +1,77 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <algorithm>
#include <random>
#include <utility>
#include <vector>
#include "minddata/dataset/kernels/image/random_affine_op.h"
#include "minddata/dataset/kernels/image/image_utils.h"
#include "minddata/dataset/kernels/image/math_utils.h"
#include "minddata/dataset/util/random.h"
namespace mindspore {
namespace dataset {
const std::vector<float_t> RandomAffineOp::kDegreesRange = {0.0, 0.0};
const std::vector<float_t> RandomAffineOp::kTranslationPercentages = {0.0, 0.0};
const std::vector<float_t> RandomAffineOp::kScaleRange = {1.0, 1.0};
const std::vector<float_t> RandomAffineOp::kShearRanges = {0.0, 0.0, 0.0, 0.0};
const InterpolationMode RandomAffineOp::kDefInterpolation = InterpolationMode::kNearestNeighbour;
const std::vector<uint8_t> RandomAffineOp::kFillValue = {0, 0, 0};
RandomAffineOp::RandomAffineOp(std::vector<float_t> degrees, std::vector<float_t> translate_range,
std::vector<float_t> scale_range, std::vector<float_t> shear_ranges,
InterpolationMode interpolation, std::vector<uint8_t> fill_value)
: AffineOp(0.0),
degrees_range_(degrees),
translate_range_(translate_range),
scale_range_(scale_range),
shear_ranges_(shear_ranges) {
interpolation_ = interpolation;
fill_value_ = fill_value;
rnd_.seed(GetSeed());
}
Status RandomAffineOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
dsize_t height = input->shape()[0];
dsize_t width = input->shape()[1];
float_t max_dx = translate_range_[0] * width;
float_t max_dy = translate_range_[1] * height;
float_t degrees = 0.0;
RETURN_IF_NOT_OK(GenerateRealNumber(degrees_range_[0], degrees_range_[1], &rnd_, &degrees));
float_t translation_x = 0.0;
RETURN_IF_NOT_OK(GenerateRealNumber(-1 * max_dx, max_dx, &rnd_, &translation_x));
float_t translation_y = 0.0;
RETURN_IF_NOT_OK(GenerateRealNumber(-1 * max_dy, max_dy, &rnd_, &translation_y));
float_t scale = 1.0;
RETURN_IF_NOT_OK(GenerateRealNumber(scale_range_[0], scale_range_[1], &rnd_, &scale));
float_t shear_x = 0.0;
RETURN_IF_NOT_OK(GenerateRealNumber(shear_ranges_[0], shear_ranges_[1], &rnd_, &shear_x));
float_t shear_y = 0.0;
RETURN_IF_NOT_OK(GenerateRealNumber(shear_ranges_[2], shear_ranges_[3], &rnd_, &shear_y));
// assign to base class variables
degrees_ = degrees;
scale_ = scale;
translation_[0] = translation_x;
translation_[1] = translation_y;
shear_[0] = shear_x;
shear_[1] = shear_y;
return AffineOp::Compute(input, output);
}
} // namespace dataset
} // namespace mindspore

View File

@ -0,0 +1,64 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_RANDOM_AFFINE_OP_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_RANDOM_AFFINE_OP_H_
#include <memory>
#include <string>
#include <vector>
#include "minddata/dataset/core/cv_tensor.h"
#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/kernels/image/affine_op.h"
#include "minddata/dataset/util/status.h"
namespace mindspore {
namespace dataset {
class RandomAffineOp : public AffineOp {
public:
/// Default values, also used by python_bindings.cc
static const std::vector<float_t> kDegreesRange;
static const std::vector<float_t> kTranslationPercentages;
static const std::vector<float_t> kScaleRange;
static const std::vector<float_t> kShearRanges;
static const InterpolationMode kDefInterpolation;
static const std::vector<uint8_t> kFillValue;
explicit RandomAffineOp(std::vector<float_t> degrees, std::vector<float_t> translate_range = kTranslationPercentages,
std::vector<float_t> scale_range = kScaleRange,
std::vector<float_t> shear_ranges = kShearRanges,
InterpolationMode interpolation = kDefInterpolation,
std::vector<uint8_t> fill_value = kFillValue);
~RandomAffineOp() override = default;
std::string Name() const override { return kRandomAffineOp; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
private:
std::string kRandomAffineOp = "RandomAffineOp";
std::vector<float_t> degrees_range_; // min_degree, max_degree
std::vector<float_t> translate_range_; // maximum x translation percentage, maximum y translation percentage
std::vector<float_t> scale_range_; // min_scale, max_scale
std::vector<float_t> shear_ranges_; // min_x_shear, max_x_shear, min_y_shear, max_y_shear
std::mt19937 rnd_;                     // random number generator
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_IMAGE_RANDOM_AFFINE_OP_H_
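A hedged construction sketch for the class above; the parameter values and the input_image tensor are assumptions for illustration.
// Rotate within [-10, 10] degrees and translate up to 10% on each axis;
// scale, shear, interpolation and fill value keep their declared defaults.
RandomAffineOp op({-10.0, 10.0}, {0.1, 0.1});
std::shared_ptr<Tensor> out;
Status rc = op.Compute(input_image, &out);  // input_image: decoded image tensor (assumed)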

View File

@ -37,22 +37,9 @@ Status RandomCropDecodeResizeOp::Compute(const std::shared_ptr<Tensor> &input, s
RETURN_IF_NOT_OK(op.Compute(input, &decoded));
return RandomCropAndResizeOp::Compute(decoded, output);
} else {
struct jpeg_decompress_struct cinfo {};
struct JpegErrorManagerCustom jerr {};
cinfo.err = jpeg_std_error(&jerr.pub);
jerr.pub.error_exit = JpegErrorExitCustom;
try {
jpeg_create_decompress(&cinfo);
JpegSetSource(&cinfo, input->GetBuffer(), input->SizeInBytes());
(void)jpeg_read_header(&cinfo, TRUE);
jpeg_calc_output_dimensions(&cinfo);
} catch (std::runtime_error &e) {
jpeg_destroy_decompress(&cinfo);
RETURN_STATUS_UNEXPECTED(e.what());
}
int h_in = cinfo.output_height;
int w_in = cinfo.output_width;
jpeg_destroy_decompress(&cinfo);
int h_in = 0;
int w_in = 0;
RETURN_IF_NOT_OK(GetJpegImageInfo(input, &w_in, &h_in));
int x = 0;
int y = 0;

View File

@ -0,0 +1,6 @@
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
add_subdirectory(utils)
add_library(kernels-soft-dvpp-image OBJECT
soft_dvpp_decode_resize_jpeg_op.cc
soft_dvpp_decode_random_crop_resize_jpeg_op.cc)

View File

@ -0,0 +1,84 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minddata/dataset/kernels/image/soft_dvpp/soft_dvpp_decode_random_crop_resize_jpeg_op.h"
#include <string>
#include "opencv2/opencv.hpp"
#include "minddata/dataset/core/cv_tensor.h"
#include "minddata/dataset/kernels/image/image_utils.h"
#include "minddata/dataset/util/random.h"
namespace mindspore {
namespace dataset {
SoftDvppDecodeRandomCropResizeJpegOp::SoftDvppDecodeRandomCropResizeJpegOp(int32_t target_height, int32_t target_width,
float scale_lb, float scale_ub,
float aspect_lb, float aspect_ub,
int32_t max_iter)
: RandomCropAndResizeOp(target_height, target_width, scale_lb, scale_ub, aspect_lb, aspect_ub,
InterpolationMode::kLinear, max_iter) {}
Status SoftDvppDecodeRandomCropResizeJpegOp::GetCropInfo(const std::shared_ptr<Tensor> &input,
SoftDpCropInfo *crop_info) {
int img_width = 0;
int img_height = 0;
RETURN_IF_NOT_OK(GetJpegImageInfo(input, &img_width, &img_height));
int x = 0;
int y = 0;
int crop_height = 0;
int crop_width = 0;
RETURN_IF_NOT_OK(GetCropBox(img_height, img_width, &x, &y, &crop_height, &crop_width));
crop_info->left = x;
crop_info->up = y;
crop_info->right = crop_info->left + crop_width;
crop_info->down = crop_info->up + crop_height;
return Status::OK();
}
Status SoftDvppDecodeRandomCropResizeJpegOp::Compute(const std::shared_ptr<Tensor> &input,
std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
if (!IsNonEmptyJPEG(input)) {
RETURN_STATUS_UNEXPECTED("SoftDvppDecodeRandomCropResizeJpeg only support process jpeg image.");
}
SoftDpCropInfo crop_info;
RETURN_IF_NOT_OK(GetCropInfo(input, &crop_info));
try {
unsigned char *buffer = const_cast<unsigned char *>(input->GetBuffer());
CHECK_FAIL_RETURN_UNEXPECTED(buffer != nullptr, "The input image buffer is empty.");
SoftDpProcsessInfo info;
info.input_buffer = static_cast<uint8_t *>(buffer);
info.input_buffer_size = input->SizeInBytes();
info.output_width = target_width_;
info.output_height = target_height_;
cv::Mat out_rgb_img(target_height_, target_width_, CV_8UC3);
info.output_buffer = out_rgb_img.data;
info.output_buffer_size = target_width_ * target_height_ * 3;
info.is_v_before_u = true;
int ret = DecodeAndCropAndResizeJpeg(&info, crop_info);
std::string error_info("Soft dvpp DecodeAndCropAndResizeJpeg failed with return code: ");
error_info += std::to_string(ret);
CHECK_FAIL_RETURN_UNEXPECTED(ret == 0, error_info);
std::shared_ptr<CVTensor> cv_tensor = nullptr;
RETURN_IF_NOT_OK(CVTensor::CreateFromMat(out_rgb_img, &cv_tensor));
*output = std::static_pointer_cast<Tensor>(cv_tensor);
} catch (const cv::Exception &e) {
RETURN_STATUS_UNEXPECTED("Error in soft dvpp image decode and resize.");
}
return Status::OK();
}
} // namespace dataset
} // namespace mindspore
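For clarity, the crop-box-to-edges mapping performed by GetCropInfo above, restated as a hypothetical standalone helper (name and example values invented):
// (x, y, width, height) -> SoftDpCropInfo edges, mirroring GetCropInfo.
SoftDpCropInfo ToCropInfo(int x, int y, int crop_width, int crop_height) {
  SoftDpCropInfo info;
  info.left = x;                  // e.g. x = 4            -> left  = 4
  info.up = y;                    // e.g. y = 2            -> up    = 2
  info.right = x + crop_width;    // e.g. crop_width = 10  -> right = 14
  info.down = y + crop_height;    // e.g. crop_height = 8  -> down  = 10
  return info;
}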

Some files were not shown because too many files have changed in this diff