diff --git a/1.txt b/1.txt
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/build.bat b/build.bat
index 03b43ed7830..a271f281004 100644
--- a/build.bat
+++ b/build.bat
@@ -18,7 +18,7 @@
 SET BASE_PATH=%CD%
 SET BUILD_PATH=%BASE_PATH%/build
 
-SET threads=6
+SET threads=8
 SET ENABLE_GITEE=OFF
 
 set VERSION_MAJOR=''
diff --git a/build.sh b/build.sh
index 83ea081fca1..4427be77a4f 100755
--- a/build.sh
+++ b/build.sh
@@ -61,7 +61,7 @@ usage()
   echo "    -l Compile with python dependency, default on"
   echo "    -S Enable enable download cmake compile dependency from gitee , default off"
   echo "    -k Enable make clean, clean up compilation generated cache "
-  echo "    -W Enable x86_64 SSE or AVX instruction set, use [sse|neon|avx|avx512|off], default off for lite and avx for CPU"
+  echo "    -W Enable SIMD instruction set, use [sse|neon|avx|avx512|off], default avx for cloud CPU backend"
   echo "    -H Enable hidden"
   echo "    -L Link and specify Tensor-RT library path, default disable Tensor-RT lib linking"
   echo "    -y Compile the symbol table switch and save the symbol table to the directory output"
diff --git a/cmake/external_libs/ffmpeg.cmake b/cmake/external_libs/ffmpeg.cmake
deleted file mode 100644
index 898eab83078..00000000000
--- a/cmake/external_libs/ffmpeg.cmake
+++ /dev/null
@@ -1,44 +0,0 @@
-set(FFMPEG_FLAGS
-        --disable-programs
-        --disable-doc
-        --disable-debug
-        --disable-avdevice
-        --disable-postproc
-        --disable-avfilter
-        --disable-network
-        --disable-encoders
-        --disable-hwaccels
-        --disable-muxers
-        --disable-bsfs
-        --disable-protocols
-        --enable-protocol=file
-        --enable-protocol=pipe
-        --disable-indevs
-        --disable-outdevs
-        --disable-devices
-        --disable-filters
-        --disable-bzlib
-        --disable-iconv
-        --disable-libxcb
-        --disable-lzma
-        --disable-sdl2
-        --disable-xlib
-        --disable-zlib)
-
-set(REQ_URL "https://github.com/FFmpeg/FFmpeg/archive/n4.3.1.tar.gz")
-set(MD5 "426ca412ca61634a248c787e29507206")
-
-mindspore_add_pkg(ffmpeg
-        VER 4.3.1
-        LIBS avcodec avformat avutil swresample swscale
-        URL ${REQ_URL}
-        MD5 ${MD5}
-        CONFIGURE_COMMAND ./configure --disable-static --enable-shared --disable-x86asm ${FFMPEG_FLAGS}
-        )
-
-include_directories(${ffmpeg_INC})
-add_library(mindspore::avcodec ALIAS ffmpeg::avcodec)
-add_library(mindspore::avformat ALIAS ffmpeg::avformat)
-add_library(mindspore::avutil ALIAS ffmpeg::avutil)
-add_library(mindspore::swresample ALIAS ffmpeg::swresample)
-add_library(mindspore::swscale ALIAS ffmpeg::swscale)
diff --git a/cmake/external_libs/glog.cmake b/cmake/external_libs/glog.cmake
index f7ab7f9871e..66f1c508218 100644
--- a/cmake/external_libs/glog.cmake
+++ b/cmake/external_libs/glog.cmake
@@ -1,13 +1,15 @@
-set(glog_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 ${SECURE_CXX_FLAGS} -Dgoogle=mindspore_private")
-set(glog_CFLAGS "-D_FORTIFY_SOURCE=2 -O2")
-if(NOT ENABLE_GLIBCXX)
-    set(glog_CXXFLAGS "${glog_CXXFLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
-endif()
-
 if(BUILD_LITE)
+    set(glog_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 ${SECURE_CXX_FLAGS} -Dgoogle=mindspore_private")
+    set(glog_CFLAGS "-D_FORTIFY_SOURCE=2 -O2 ${SECURE_C_FLAGS}")
+    set(glog_LDFLAGS "${SECURE_SHARED_LINKER_FLAGS}")
     set(glog_patch "")
     set(glog_lib glog)
 else()
+    set(glog_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 ${SECURE_CXX_FLAGS} -Dgoogle=mindspore_private")
+    set(glog_CFLAGS "-D_FORTIFY_SOURCE=2 -O2")
+    if(NOT ENABLE_GLIBCXX)
+        set(glog_CXXFLAGS "${glog_CXXFLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
+    endif()
     set(glog_patch ${CMAKE_SOURCE_DIR}/third_party/patch/glog/glog.patch001)
     set(glog_lib mindspore_glog)
 endif()
diff --git a/cmake/external_libs/json.cmake b/cmake/external_libs/json.cmake
index ef9196d19fc..91c1f73b458 100644
--- a/cmake/external_libs/json.cmake
+++ b/cmake/external_libs/json.cmake
@@ -9,7 +9,7 @@ endif()
 
 if(ENABLE_GITEE)
     set(REQ_URL "https://gitee.com/mirrors/JSON-for-Modern-CPP/repository/archive/v3.6.1.zip")
-    set(MD5 "5bda78ce308e6cfcf614dcf1d5ff27a7")
+    set(MD5 "36ea0d9a709c6667b2798a62f6b197ae")
     set(INCLUDE "./include")
 else()
     set(REQ_URL "https://github.com/nlohmann/json/releases/download/v3.6.1/include.zip")
@@ -23,4 +23,4 @@ mindspore_add_pkg(nlohmann_json
         URL ${REQ_URL}
         MD5 ${MD5})
 include_directories(${nlohmann_json_INC})
-add_library(mindspore::json ALIAS nlohmann_json)
\ No newline at end of file
+add_library(mindspore::json ALIAS nlohmann_json)
diff --git a/cmake/package.cmake b/cmake/package.cmake
index 506f5ee86dc..2e4dd74e6ca 100644
--- a/cmake/package.cmake
+++ b/cmake/package.cmake
@@ -198,12 +198,6 @@ if(NOT ENABLE_GE)
         set(ASCEND_DRIVER_PATH ${ASCEND_PATH}/driver/lib64/common)
 
         if(ENABLE_D)
-            install(
-                TARGETS ms_profile
-                DESTINATION ${INSTALL_LIB_DIR}
-                COMPONENT mindspore
-            )
-
             install(
               TARGETS hccl_plugin
               DESTINATION ${INSTALL_LIB_DIR}
diff --git a/cmake/package_lite.cmake b/cmake/package_lite.cmake
index 4b6d97cafd4..fff35b85b26 100644
--- a/cmake/package_lite.cmake
+++ b/cmake/package_lite.cmake
@@ -330,8 +330,6 @@ elseif(WIN32)
                 DESTINATION ${CONVERTER_ROOT_DIR}/include COMPONENT ${RUNTIME_COMPONENT_NAME})
         install(FILES ${TOP_DIR}/mindspore/lite/tools/converter/model_parser.h
                 DESTINATION ${CONVERTER_ROOT_DIR}/include COMPONENT ${RUNTIME_COMPONENT_NAME})
-        install(FILES ${TOP_DIR}/mindspore/lite/tools/converter/dump_graph.h
-                DESTINATION ${CONVERTER_ROOT_DIR}/include COMPONENT ${RUNTIME_COMPONENT_NAME})
         install(FILES ${TOP_DIR}/mindspore/lite/tools/converter/ops/ops_def.h
                 DESTINATION ${CONVERTER_ROOT_DIR}/include COMPONENT ${RUNTIME_COMPONENT_NAME})
         install(DIRECTORY ${TOP_DIR}/build/mindspore/schema/ DESTINATION ${CONVERTER_ROOT_DIR}/include/schema
@@ -462,8 +460,6 @@ else()
                 DESTINATION ${CONVERTER_ROOT_DIR}/include COMPONENT ${RUNTIME_COMPONENT_NAME})
         install(FILES ${TOP_DIR}/mindspore/lite/tools/converter/model_parser.h
                 DESTINATION ${CONVERTER_ROOT_DIR}/include COMPONENT ${RUNTIME_COMPONENT_NAME})
-        install(FILES ${TOP_DIR}/mindspore/lite/tools/converter/dump_graph.h
-                DESTINATION ${CONVERTER_ROOT_DIR}/include COMPONENT ${RUNTIME_COMPONENT_NAME})
         install(FILES ${TOP_DIR}/mindspore/lite/tools/converter/ops/ops_def.h
                 DESTINATION ${CONVERTER_ROOT_DIR}/include COMPONENT ${RUNTIME_COMPONENT_NAME})
         install(DIRECTORY ${TOP_DIR}/mindspore/lite/build/schema/ DESTINATION ${CONVERTER_ROOT_DIR}/include/schema
diff --git a/include/api/callback/callback.h b/include/api/callback/callback.h
index 8c1878c1126..d10cffeb7c4 100644
--- a/include/api/callback/callback.h
+++ b/include/api/callback/callback.h
@@ -23,12 +23,6 @@
 #include "include/api/data_type.h"
 #include "include/api/dual_abi_helper.h"
 
-#ifdef _WIN32
-#define MS_API __declspec(dllexport)
-#else
-#define MS_API __attribute__((visibility("default")))
-#endif
-
 namespace mindspore {
 class Model;
 class ModelImpl;
diff --git a/include/api/callback/ckpt_saver.h b/include/api/callback/ckpt_saver.h
index 27f47035dc1..2c67d3a44e6 100644
--- a/include/api/callback/ckpt_saver.h
+++ b/include/api/callback/ckpt_saver.h
@@ -22,12 +22,6 @@
 #include <memory>
 #include "include/api/callback/callback.h"
 
-#ifdef _WIN32
-#define MS_API __declspec(dllexport)
-#else
-#define MS_API __attribute__((visibility("default")))
-#endif
-
 namespace mindspore {
 
 class CkptSaver: public TrainCallBack {
diff --git a/include/api/callback/loss_monitor.h b/include/api/callback/loss_monitor.h
index 012609f183d..48684f3f1d4 100644
--- a/include/api/callback/loss_monitor.h
+++ b/include/api/callback/loss_monitor.h
@@ -21,12 +21,6 @@
 #include <utility>
 #include "include/api/callback/callback.h"
 
-#ifdef _WIN32
-#define MS_API __declspec(dllexport)
-#else
-#define MS_API __attribute__((visibility("default")))
-#endif
-
 using GraphPoint = std::pair<int, float>;
 
 namespace mindspore {
diff --git a/include/api/callback/lr_scheduler.h b/include/api/callback/lr_scheduler.h
index afe9b43d1ed..2eddc66b44a 100644
--- a/include/api/callback/lr_scheduler.h
+++ b/include/api/callback/lr_scheduler.h
@@ -22,12 +22,6 @@
 #include <memory>
 #include "include/api/callback/callback.h"
 
-#ifdef _WIN32
-#define MS_API __declspec(dllexport)
-#else
-#define MS_API __attribute__((visibility("default")))
-#endif
-
 namespace mindspore {
 
 constexpr int DONT_UPDATE_LR = 0;
diff --git a/include/api/callback/time_monitor.h b/include/api/callback/time_monitor.h
index e38b26a0ceb..7e857849f8a 100644
--- a/include/api/callback/time_monitor.h
+++ b/include/api/callback/time_monitor.h
@@ -22,12 +22,6 @@
 #include <memory>
 #include "include/api/callback/callback.h"
 
-#ifdef _WIN32
-#define MS_API __declspec(dllexport)
-#else
-#define MS_API __attribute__((visibility("default")))
-#endif
-
 namespace mindspore {
 
 class TimeMonitor: public TrainCallBack {
diff --git a/include/api/callback/train_accuracy.h b/include/api/callback/train_accuracy.h
index d20c42ac89d..0b31cfbc617 100644
--- a/include/api/callback/train_accuracy.h
+++ b/include/api/callback/train_accuracy.h
@@ -24,12 +24,6 @@
 #include "include/api/callback/callback.h"
 #include "include/api/metrics/accuracy.h"
 
-#ifdef _WIN32
-#define MS_API __declspec(dllexport)
-#else
-#define MS_API __attribute__((visibility("default")))
-#endif
-
 using GraphPoint = std::pair<int, float>;
 
 namespace mindspore {
diff --git a/include/api/cfg.h b/include/api/cfg.h
index 87c144f397e..a012438ee77 100644
--- a/include/api/cfg.h
+++ b/include/api/cfg.h
@@ -23,12 +23,6 @@
 #include "include/api/data_type.h"
 #include "include/api/dual_abi_helper.h"
 
-#ifdef _WIN32
-#define MS_API __declspec(dllexport)
-#else
-#define MS_API __attribute__((visibility("default")))
-#endif
-
 namespace mindspore {
 
 class MixPrecisionCfg {
diff --git a/include/api/context.h b/include/api/context.h
index d1b525ef713..1184584633f 100644
--- a/include/api/context.h
+++ b/include/api/context.h
@@ -105,14 +105,29 @@ class MS_API DeviceInfoContext : public std::enable_shared_from_this<DeviceInfoC
 
     return std::static_pointer_cast<T>(shared_from_this());
   }
-
+  /// \brief obtain provider's name
+  ///
+  /// \return provider's name.
   std::string GetProvider() const;
+  /// \brief set provider's name.
+  ///
+  /// \param[in] provider define the provider's name.
   void SetProvider(const std::string &provider);
-
+  /// \brief obtain provider's device type.
+  ///
+  /// \return provider's device type.
   std::string GetProviderDevice() const;
+  /// \brief set provider's device type.
+  ///
+  /// \param[in] device define the provider's device type.EG: CPU.
   void SetProviderDevice(const std::string &device);
-
+  /// \brief set memory allocator.
+  ///
+  /// \param[in] allocator define the memory allocator which can be defined by user.
   void SetAllocator(const std::shared_ptr<Allocator> &allocator);
+  /// \brief obtain memory allocator.
+  ///
+  /// \return memory allocator.
   std::shared_ptr<Allocator> GetAllocator() const;
 
  protected:
diff --git a/include/api/kernel.h b/include/api/kernel.h
index 1e1a6dfb040..6ec62dec020 100644
--- a/include/api/kernel.h
+++ b/include/api/kernel.h
@@ -24,9 +24,16 @@
 #include "include/api/context.h"
 
 namespace mindspore::kernel {
+/// \brief The Kernel class is used to define a MindSpore Kernel.
 class Kernel {
  public:
   Kernel() = default;
+  /// \brief Constructor.
+  ///
+  /// \param[in] inputs define the input tensors for kernel.
+  /// \param[in] outputs define the output tensors for kernel.
+  /// \param[in] primitive define the primitive of kernel generated by flatbuffers.
+  /// \param[in] ctx define the context for kernel.
   Kernel(const std::vector<mindspore::MSTensor> &inputs, const std::vector<mindspore::MSTensor> &outputs,
          const schema::Primitive *primitive, const mindspore::Context *ctx)
       : context_(ctx), inputs_(std::move(inputs)), outputs_(std::move(outputs)), primitive_(primitive) {
@@ -34,32 +41,65 @@ class Kernel {
       type_ = primitive->value_type();
     }
   }
+  /// \brief Destructor.
   virtual ~Kernel() = default;
-
+  /// \brief prepare for executing kernel.
+  ///
+  /// \return result code.
   virtual int Prepare() = 0;
-
+  /// \brief execute the kernel.
+  ///
+  /// \return result code.
   virtual int Execute() = 0;
-
+  /// \brief resize the kernel input shape, memory need to refresh.
+  ///
+  /// \return result code.
   virtual int ReSize() = 0;
-
+  /// \brief set kernel's input tensors.
+  ///
+  /// \param[in] in_tensors define the input tensors.
   virtual void set_inputs(const std::vector<mindspore::MSTensor> &in_tensors) { this->inputs_ = in_tensors; }
-
+  /// \brief set kernel's input tensor.
+  ///
+  /// \param[in] in_tensor define the input tensor.
+  /// \param[in] index define the index of the input tensor.
   virtual void set_input(mindspore::MSTensor in_tensor, int index) { this->inputs_[index] = in_tensor; }
-
+  /// \brief set kernel's output tensors.
+  ///
+  /// \param[in] out_tensors define the output tensors.
   virtual void set_outputs(const std::vector<mindspore::MSTensor> &out_tensors) { this->outputs_ = out_tensors; }
-
+  /// \brief set kernel's output tensor.
+  ///
+  /// \param[in] out_tensor define the output tensor.
+  /// \param[in] index define the index of the output tensor.
   virtual void set_output(mindspore::MSTensor out_tensor, int index) { this->outputs_[index] = out_tensor; }
-
+  /// \brief obtain kernel's input tensors.
+  ///
+  /// \return input tensors.
   virtual const std::vector<mindspore::MSTensor> &inputs() { return this->inputs_; }
-
+  /// \brief obtain kernel's output tensors.
+  ///
+  /// \return output tensors.
   virtual const std::vector<mindspore::MSTensor> &outputs() { return this->outputs_; }
-
+  /// \brief obtain kernel's name.
+  ///
+  /// \return kernel's name.
   std::string name() const { return this->name_; }
-
+  /// \brief set kernel's name.
+  ///
+  /// \param[in] name define the kernel's name.
   void set_name(const std::string &name) { this->name_ = name; }
-
+  /// \brief obtain kernel's context.
+  ///
+  /// \return kernel's context.
   const mindspore::Context *context() const { return this->context_; }
+  /// \brief obtain kernel's type.
+  ///
+  /// \return kernel's type.
   virtual schema::PrimitiveType type() const { return type_; }
+  /// \brief obtain the primitive of kernel generated by flatbuffers.
+  ///
+  /// \return the primitive of kernel generated by flatbuffers.
   const schema::Primitive *primitive() const { return this->primitive_; }
 
  protected:
diff --git a/include/api/types.h b/include/api/types.h
index 77f200bda5c..815b39f94c7 100644
--- a/include/api/types.h
+++ b/include/api/types.h
@@ -27,12 +27,16 @@
 
 #ifndef MS_API
 #ifdef _WIN32
+#ifdef _MSC_VER
 #ifdef BUILDING_DLL
 #define MS_API __declspec(dllexport)
 #else
 #define MS_API __declspec(dllimport)
 #endif
 #else
+#define MS_API __declspec(dllexport)
+#endif
+#else
 #define MS_API __attribute__((visibility("default")))
 #endif
 #endif
diff --git a/mindspore/_checkparam.py b/mindspore/_checkparam.py
index 58cec1666a4..25547ace35e 100644
--- a/mindspore/_checkparam.py
+++ b/mindspore/_checkparam.py
@@ -148,7 +148,7 @@ def check_number(arg_value, value, rel, arg_type=int, arg_name=None, prim_name=N
     Check argument integer.
 
     Example:
-    - number = check_int(number, 0, Rel.GE, "number", None) # number >= 0
+    - number = check_number(number, 0, Rel.GE, "number", None) # number >= 0
     """
     rel_fn = Rel.get_fns(rel)
     prim_name = f'in `{prim_name}`' if prim_name else ''
diff --git a/mindspore/_extends/graph_kernel/expanders/__init__.py b/mindspore/_extends/graph_kernel/expanders/__init__.py
index f412f80e78c..11fcd76080a 100644
--- a/mindspore/_extends/graph_kernel/expanders/__init__.py
+++ b/mindspore/_extends/graph_kernel/expanders/__init__.py
@@ -18,7 +18,6 @@ from .addn import AddN
 from .assign_add import AssignAdd
 from .batchnorm import BatchNorm
 from .batchnorm_grad import BatchNormGrad
-from .bias_add import BiasAdd
 from .bias_add_grad import BiasAddGrad
 from .clip_by_norm_no_div_sum import ClipByNormNoDivSum
 from .conv2d import Conv2D
@@ -26,7 +25,6 @@ from .complex import CAbs, CAdd, CDiv, CMul, CSub
 from .dropout_grad import DropoutGrad
 from .equal_count import EqualCount
 from .erfc import Erfc
-from .expand_dims import ExpandDims
 from .fused_adam import FusedAdam
 from .fused_adam_weight_decay import FusedAdamWeightDecay
 from .fused_mul_add import FusedMulAdd
@@ -51,6 +49,7 @@ from .sigmoid import Sigmoid
 from .sigmoid_cross_entropy_with_logits import SigmoidCrossEntropyWithLogits
 from .sigmoid_cross_entropy_with_logits_grad import SigmoidCrossEntropyWithLogitsGrad
 from .sigmoid_grad import SigmoidGrad
+from .slice import Slice
 from .softmax import Softmax
 from .softmax_cross_entropy_with_logits import SoftmaxCrossEntropyWithLogits
 from .softmax_grad_ext import SoftmaxGradExt
diff --git a/mindspore/_extends/graph_kernel/expanders/_utils.py b/mindspore/_extends/graph_kernel/expanders/_utils.py
index aa95793572f..6c573c6c89a 100644
--- a/mindspore/_extends/graph_kernel/expanders/_utils.py
+++ b/mindspore/_extends/graph_kernel/expanders/_utils.py
@@ -80,6 +80,9 @@ class Expander:
 
 class ExpanderInfoValidator:
     """ExpanderInfoValidator is the utility class which defines the validator decorator for expanders"""
+
+    def __init__(self):
+        """Init"""
     @staticmethod
     def _add_check_function(kls, func):
         """
@@ -198,8 +201,8 @@ def to_frac_z_axis(ori_shape, ori_axis):
     return frac_z_axis
 
 
-def infer_shape_from_fractalNz(fractal):
-    "get original shape from fractalNz shape"
+def infer_shape_from_fractalnz(fractal):
+    "get original shape from fractalnz shape"
     shape = []
     dims = len(fractal)
     batch = dims - 4
diff --git a/mindspore/_extends/graph_kernel/expanders/batchnorm.py b/mindspore/_extends/graph_kernel/expanders/batchnorm.py
index 7f6b74c3aef..69f2dfff0f2 100644
--- a/mindspore/_extends/graph_kernel/expanders/batchnorm.py
+++ b/mindspore/_extends/graph_kernel/expanders/batchnorm.py
@@ -24,6 +24,7 @@ from .expand_dims import ExpandDims
 @VLD.check_attrs('is_training', 'momentum', 'epsilon')
 class BatchNorm(Expander):
     """BatchNorm expander"""
+
     def _expand(self, graph_builder):
         # get op info
         input_x = self.inputs[0]
@@ -42,81 +43,8 @@ class BatchNorm(Expander):
             input_x = graph_builder.emit('Cast', [input_x], attrs={'dst_type': input_x_new_type})
 
         if self.attrs['is_training']:
-            reduce_axis = ()
-            shape_x = input_x.shape
-            if input_x.data_format == DF.NHWC:
-                reduce_axis = (0, 1, 2)
-                num = shape_x[0] * shape_x[1] * shape_x[2]
-            else:
-                reduce_axis = (0, 2, 3)
-                num = shape_x[0] * shape_x[2] * shape_x[3]
-            num_rec = 1.0 / num
-            num_rec_v = graph_builder.value(input_scale.dtype, num_rec)
-
-            # compute mean value of input_x
-            mean_sum = graph_builder.emit(
-                'ReduceSum', [input_x], attrs={'reduce_axis': reduce_axis, 'keep_dims': False})
-            mean_muls = graph_builder.emit('Mul', [mean_sum, num_rec_v])
-
-            # compute variance of input_x
-            if input_x.data_format in (DF.DEFAULT, DF.NCHW):
-                mean_muls_expand = graph_builder.emit(
-                    'Reshape', [mean_muls], attrs={'shape': ExpandDims.infer_shape(mean_muls.shape, [-1, -1])})
-            else:
-                mean_muls_expand = mean_muls
-            var_sub = graph_builder.emit('Sub', [input_x, mean_muls_expand])
-            var_mul = graph_builder.emit('Mul', [var_sub, var_sub])
-            var_sum = graph_builder.emit('ReduceSum', [var_mul], attrs={'reduce_axis': reduce_axis, 'keep_dims': False})
-            var_mul = graph_builder.emit('Mul', [var_sum, num_rec_v])
-
-            # y_sqrt_rec means 1 / sqrt(variance + epsilon), which is calculated in backward pass
-            scalar_one = 1.0
-            scalar_one_v = graph_builder.value(input_scale.dtype, scalar_one)
-            y_add = graph_builder.emit('Add', [var_mul, epsilon_v])
-            y_sqrt = graph_builder.emit('Sqrt', [y_add])
-            y_sqrt_rec = graph_builder.emit('RealDiv', [scalar_one_v, y_sqrt])
-
-            # compute res_y
-            tmp_sub = graph_builder.emit('Sub', [input_x, mean_muls_expand])
-            if input_x.data_format in (DF.DEFAULT, DF.NCHW):
-                y_sqrt_rec_expand = graph_builder.emit(
-                    'Reshape', [y_sqrt_rec], attrs={'shape': ExpandDims.infer_shape(y_sqrt_rec.shape, [-1, -1])})
-            else:
-                y_sqrt_rec_expand = y_sqrt_rec
-            y_norm = graph_builder.emit('Mul', [tmp_sub, y_sqrt_rec_expand])
-            if input_x.data_format in (DF.DEFAULT, DF.NCHW):
-                input_scale_expand = graph_builder.emit(
-                    'Reshape', [input_scale], attrs={'shape': ExpandDims.infer_shape(input_scale.shape, [-1, -1])})
-            else:
-                input_scale_expand = input_scale
-            res_y_mul = graph_builder.emit('Mul', [input_scale_expand, y_norm])
-            if input_x.data_format in (DF.DEFAULT, DF.NCHW):
-                input_offset_expand = graph_builder.emit(
-                    'Reshape', [input_offset], attrs={'shape': ExpandDims.infer_shape(input_offset.shape, [-1, -1])})
-            else:
-                input_offset_expand = input_offset
-            res_y = graph_builder.emit('Add', [res_y_mul, input_offset_expand])
-
-            # compute mean_res
-            momentum_sub = scalar_one - self.attrs['momentum']
-            momentum_v_sub = graph_builder.value(input_scale.dtype, momentum_sub)
-            new_running_mean_tmp = graph_builder.emit('Mul', [momentum_v_sub, input_mean])
-            momentum_v = graph_builder.value(input_scale.dtype, self.attrs['momentum'])
-            current_mean_tmp = graph_builder.emit('Mul', [momentum_v, mean_muls])
-            updated_moving_mean = graph_builder.emit('Add', [new_running_mean_tmp, current_mean_tmp])
-            mean_res = graph_builder.emit(
-                'InplaceAssign', [input_mean, updated_moving_mean, updated_moving_mean], attrs={'fake_output': True})
-
-            # variance_res is calculated by sample variance, and need to multiply by num / (num - 1)
-            var_num = float(num) / (num - 1)
-            var_num_v = graph_builder.value(input_scale.dtype, var_num)
-            var_mul_update = graph_builder.emit('Mul', [var_num_v, var_mul])
-            new_running_var_tmp = graph_builder.emit('Mul', [momentum_v_sub, input_variance])
-            current_var_tmp = graph_builder.emit('Mul', [momentum_v, var_mul_update])
-            updated_moving_variance = graph_builder.emit('Add', [new_running_var_tmp, current_var_tmp])
-            variance_res = graph_builder.emit(
-                'InplaceAssign', [input_variance, updated_moving_variance, updated_moving_variance],
-                attrs={'fake_output': True})
+            self.inputs[0] = input_x
+            res_y, mean_res, variance_res, mean_muls, y_sqrt_rec = self._bn_train(graph_builder)
             if input_x_new_type != input_x_ori_type:
                 res_y = graph_builder.emit('Cast', [res_y], attrs={'dst_type': input_x_ori_type})
             return res_y, mean_res, variance_res, mean_muls, y_sqrt_rec
@@ -140,3 +68,88 @@ class BatchNorm(Expander):
         if input_x_new_type != input_x_ori_type:
             res_y = graph_builder.emit('Cast', [res_y], attrs={'dst_type': input_x_ori_type})
         return res_y, var_add, var_add, var_add, var_add
+
+    def _bn_train(self, graph_builder):
+        """expand BatchNorm for training mode"""
+        input_x = self.inputs[0]
+        input_scale = self.inputs[1]
+        input_offset = self.inputs[2]
+        input_mean = self.inputs[3]
+        input_variance = self.inputs[4]
+        epsilon_v = graph_builder.value(input_scale.dtype, self.attrs['epsilon'])
+        reduce_axis = ()
+        shape_x = input_x.shape
+        if input_x.data_format == DF.NHWC:
+            reduce_axis = (0, 1, 2)
+            num = shape_x[0] * shape_x[1] * shape_x[2]
+        else:
+            reduce_axis = (0, 2, 3)
+            num = shape_x[0] * shape_x[2] * shape_x[3]
+        num_rec = 1.0 / num
+        num_rec_v = graph_builder.value(input_scale.dtype, num_rec)
+
+        # compute mean value of input_x
+        mean_sum = graph_builder.emit(
+            'ReduceSum', [input_x], attrs={'reduce_axis': reduce_axis, 'keep_dims': False})
+        mean_muls = graph_builder.emit('Mul', [mean_sum, num_rec_v])
+
+        # compute variance of input_x
+        if input_x.data_format in (DF.DEFAULT, DF.NCHW):
+            mean_muls_expand = graph_builder.emit(
+                'Reshape', [mean_muls], attrs={'shape': ExpandDims.infer_shape(mean_muls.shape, [-1, -1])})
+        else:
+            mean_muls_expand = mean_muls
+        var_sub = graph_builder.emit('Sub', [input_x, mean_muls_expand])
+        var_mul = graph_builder.emit('Mul', [var_sub, var_sub])
+        var_sum = graph_builder.emit('ReduceSum', [var_mul], attrs={'reduce_axis': reduce_axis, 'keep_dims': False})
+        var_mul = graph_builder.emit('Mul', [var_sum, num_rec_v])
+
+        # y_sqrt_rec means 1 / sqrt(variance + epsilon), which is calculated in backward pass
+        scalar_one = 1.0
+        scalar_one_v = graph_builder.value(input_scale.dtype, scalar_one)
+        y_add = graph_builder.emit('Add', [var_mul, epsilon_v])
+        y_sqrt = graph_builder.emit('Sqrt', [y_add])
+        y_sqrt_rec = graph_builder.emit('RealDiv', [scalar_one_v, y_sqrt])
+
+        # compute res_y
+        tmp_sub = graph_builder.emit('Sub', [input_x, mean_muls_expand])
+        if input_x.data_format in (DF.DEFAULT, DF.NCHW):
+            y_sqrt_rec_expand = graph_builder.emit(
+                'Reshape', [y_sqrt_rec], attrs={'shape': ExpandDims.infer_shape(y_sqrt_rec.shape, [-1, -1])})
+        else:
+            y_sqrt_rec_expand = y_sqrt_rec
+        y_norm = graph_builder.emit('Mul', [tmp_sub, y_sqrt_rec_expand])
+        if input_x.data_format in (DF.DEFAULT, DF.NCHW):
+            input_scale_expand = graph_builder.emit(
+                'Reshape', [input_scale], attrs={'shape': ExpandDims.infer_shape(input_scale.shape, [-1, -1])})
+        else:
+            input_scale_expand = input_scale
+        res_y_mul = graph_builder.emit('Mul', [input_scale_expand, y_norm])
+        if input_x.data_format in (DF.DEFAULT, DF.NCHW):
+            input_offset_expand = graph_builder.emit(
+                'Reshape', [input_offset], attrs={'shape': ExpandDims.infer_shape(input_offset.shape, [-1, -1])})
+        else:
+            input_offset_expand = input_offset
+        res_y = graph_builder.emit('Add', [res_y_mul, input_offset_expand])
+
+        # compute mean_res
+        momentum_sub = scalar_one - self.attrs['momentum']
+        momentum_v_sub = graph_builder.value(input_scale.dtype, momentum_sub)
+        new_running_mean_tmp = graph_builder.emit('Mul', [momentum_v_sub, input_mean])
+        momentum_v = graph_builder.value(input_scale.dtype, self.attrs['momentum'])
+        current_mean_tmp = graph_builder.emit('Mul', [momentum_v, mean_muls])
+        updated_moving_mean = graph_builder.emit('Add', [new_running_mean_tmp, current_mean_tmp])
+        mean_res = graph_builder.emit(
+            'InplaceAssign', [input_mean, updated_moving_mean, updated_moving_mean], attrs={'fake_output': True})
+
+        # variance_res is calculated by sample variance, and need to multiply by num / (num - 1)
+        var_num = float(num) / (num - 1)
+        var_num_v = graph_builder.value(input_scale.dtype, var_num)
+        var_mul_update = graph_builder.emit('Mul', [var_num_v, var_mul])
+        new_running_var_tmp = graph_builder.emit('Mul', [momentum_v_sub, input_variance])
+        current_var_tmp = graph_builder.emit('Mul', [momentum_v, var_mul_update])
+        updated_moving_variance = graph_builder.emit('Add', [new_running_var_tmp, current_var_tmp])
+        variance_res = graph_builder.emit(
+            'InplaceAssign', [input_variance, updated_moving_variance, updated_moving_variance],
+            attrs={'fake_output': True})
+        return res_y, mean_res, variance_res, mean_muls, y_sqrt_rec
diff --git a/mindspore/_extends/graph_kernel/expanders/batchnorm_grad.py b/mindspore/_extends/graph_kernel/expanders/batchnorm_grad.py
index edcf3744c78..eeb94ca1df0 100644
--- a/mindspore/_extends/graph_kernel/expanders/batchnorm_grad.py
+++ b/mindspore/_extends/graph_kernel/expanders/batchnorm_grad.py
@@ -17,12 +17,14 @@ from mindspore._extends.graph_kernel.model.model import DataFormat as DF
 from ._utils import Expander, ExpanderInfoValidator as VLD
 from .expand_dims import ExpandDims
 
+
 @VLD.add_format(DF.NHWC, DF.NHWC, DF.DEFAULT, DF.DEFAULT, DF.DEFAULT, DF.DEFAULT)
 @VLD.add_format(DF.NCHW, DF.NCHW, DF.DEFAULT, DF.DEFAULT, DF.DEFAULT, DF.DEFAULT)
 @VLD.add_format(DF.DEFAULT, DF.DEFAULT, DF.DEFAULT, DF.DEFAULT, DF.DEFAULT, DF.DEFAULT)
 @VLD.check_attrs('is_training', 'epsilon')
 class BatchNormGrad(Expander):
     """BatchNormGrad expander"""
+
     def _expand(self, graph_builder):
         # get op info
         input_dy = self.inputs[0]
diff --git a/mindspore/_extends/graph_kernel/expanders/bias_add.py b/mindspore/_extends/graph_kernel/expanders/bias_add.py
deleted file mode 100644
index da1ed5da412..00000000000
--- a/mindspore/_extends/graph_kernel/expanders/bias_add.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Copyright 2020-2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ===========================================================================
-"""generate json desc for bias_add"""
-from mindspore._extends.graph_kernel.model.model import DataFormat as DF
-from ._utils import Expander, ExpanderInfoValidator as VLD
-from .expand_dims import ExpandDims
-
-
-@VLD.add_format(DF.DEFAULT, DF.DEFAULT)
-@VLD.add_format(DF.NCHW, DF.DEFAULT)
-@VLD.add_format(DF.NHWC, DF.DEFAULT)
-class BiasAdd(Expander):
-    """BiasAdd expander"""
-
-    def _expand(self, graph_builder):
-        input_x, input_y = self.inputs
-
-        if input_x.data_format == DF.NCHW:
-            input_y_expand = graph_builder.emit(
-                'Reshape', [input_y], attrs={'shape': ExpandDims.infer_shape(input_y.shape, [1, 2])})
-            result = graph_builder.emit('Add', [input_x, input_y_expand])
-        elif input_x.data_format == DF.DEFAULT:
-            if len(input_x.shape) == 2:
-                result = graph_builder.emit('Add', [input_x, input_y])
-            elif len(input_x.shape) == 3:
-                input_y_expand = graph_builder.emit(
-                    'Reshape', [input_y], attrs={'shape': ExpandDims.infer_shape(input_y.shape, 1)})
-                result = graph_builder.emit('Add', [input_x, input_y_expand])
-            else:  # len == 4
-                input_y_expand = graph_builder.emit(
-                    'Reshape', [input_y], attrs={'shape': ExpandDims.infer_shape(input_y.shape, [1, 2])})
-                result = graph_builder.emit('Add', [input_x, input_y_expand])
-        else:  # NHWC
-            result = graph_builder.emit('Add', [input_x, input_y])
-
-        return result
diff --git a/mindspore/_extends/graph_kernel/expanders/fused_mul_add.py b/mindspore/_extends/graph_kernel/expanders/fused_mul_add.py
index 02a396d0574..86f3a4d1b06 100644
--- a/mindspore/_extends/graph_kernel/expanders/fused_mul_add.py
+++ b/mindspore/_extends/graph_kernel/expanders/fused_mul_add.py
@@ -15,6 +15,7 @@
 """generate json desc for FusedMulAdd"""
 from ._utils import Expander
 
+
 class FusedMulAdd(Expander):
     """FusedMulAdd expander"""
 
diff --git a/mindspore/_extends/graph_kernel/expanders/layernorm.py b/mindspore/_extends/graph_kernel/expanders/layernorm.py
index 28a99c20764..c3433afd3bf 100644
--- a/mindspore/_extends/graph_kernel/expanders/layernorm.py
+++ b/mindspore/_extends/graph_kernel/expanders/layernorm.py
@@ -15,13 +15,15 @@
 """generate json desc for LayerNorm"""
 from mindspore._extends.graph_kernel.model.model import DataFormat as DF
 from ._utils import Expander, ExpanderInfoValidator as VLD
-from ._utils import infer_shape_from_fractalNz, get_reduced_ori_shape, to_frac_z_axis
+from ._utils import infer_shape_from_fractalnz, get_reduced_ori_shape, to_frac_z_axis
+
 
 @VLD.add_format(DF.FRAC_NZ, DF.DEFAULT, DF.DEFAULT)
 @VLD.add_format(DF.DEFAULT, DF.DEFAULT, DF.DEFAULT)
 @VLD.check_attrs('begin_norm_axis', 'begin_params_axis', 'epsilon')
 class LayerNorm(Expander):
     """LayerNorm expander"""
+
     def _expand(self, graph_builder):
         input_x, input_gamma, input_beta = self.inputs
         processor = self.processor
@@ -36,7 +38,7 @@ class LayerNorm(Expander):
 
         ori_shape_x = input_x.shape
         if input_x.data_format == DF.FRAC_NZ:
-            ori_shape_x = infer_shape_from_fractalNz(ori_shape_x)
+            ori_shape_x = infer_shape_from_fractalnz(ori_shape_x)
 
         # Calculate the scaling ratio of the average
         if begin_norm_axis < 0:
diff --git a/mindspore/_extends/graph_kernel/expanders/matmul.py b/mindspore/_extends/graph_kernel/expanders/matmul.py
index 69588b0eded..efd60a6914d 100644
--- a/mindspore/_extends/graph_kernel/expanders/matmul.py
+++ b/mindspore/_extends/graph_kernel/expanders/matmul.py
@@ -17,6 +17,7 @@ from mindspore._extends.graph_kernel.model.model import DataFormat as DF
 from mindspore._extends.graph_kernel.model.model import GraphKernelUnsupportedException as GKException
 from ._utils import Expander, ExpanderInfoValidator as VLD
 
+
 @VLD.check_attrs('transpose_a', 'transpose_b', 'left_format', 'right_format')
 class MatMul(Expander):
     """
@@ -24,7 +25,7 @@ class MatMul(Expander):
     """
 
     def __init__(self, expand_info):
-        super().__init__(expand_info)
+        super(MatMul, self).__init__(expand_info)
         self.transpose_a = self.attrs['transpose_a']
         self.transpose_b = self.attrs['transpose_b']
         self.left_format = self.attrs['left_format']
@@ -47,28 +48,28 @@ class MatMul(Expander):
         if input_num < 2:
             raise GKException("matul inputs number should bigger than 1, but got {}.".format(input_num))
 
-    def _trans_shape(self, shape):
-        trans_shape = list(shape)
-        trans_shape[-2] = shape[-1]
-        trans_shape[-1] = shape[-2]
-        return trans_shape
-
     def _expand(self, graph_builder):
+        def transpose(shape):
+            trans_shape = list(shape)
+            trans_shape[-2] = shape[-1]
+            trans_shape[-1] = shape[-2]
+            return trans_shape
         if not self._optimize_to_mul():
             raise GKException("MatMul/BatchMatMul do not need to be replaced by Mul")
-        #Matmul is replaced by Mul([b m k], [b k n]) when k==1
+        # Matmul is replaced by Mul([b m k], [b k n]) when k==1
         input_a = self.inputs[0]
         input_b = self.inputs[1]
         if self.transpose_a:
-            shape_a_trans = self._trans_shape(self.shape_a)
+            shape_a_trans = transpose(self.shape_a)
             input_a = graph_builder.emit('Reshape', [input_a], attrs={'shape': shape_a_trans})
         if self.transpose_b:
-            shape_b_trans = self._trans_shape(self.shape_b)
+            shape_b_trans = transpose(self.shape_b)
             input_b = graph_builder.emit('Reshape', [input_b], attrs={'shape': shape_b_trans})
         result = graph_builder.emit('Mul', [input_a, input_b])
         if 'dst_type' in self.attrs and self.inputs[0].dtype != self.attrs['dst_type']:
             result = graph_builder.emit('Cast', [result], attrs={'dst_type': self.attrs['dst_type']})
         return result
 
+
 class BatchMatMul(MatMul):
     """BatchMatMul expander"""
diff --git a/mindspore/_extends/graph_kernel/expanders/minimum_grad.py b/mindspore/_extends/graph_kernel/expanders/minimum_grad.py
index 227a0219831..61c4428367d 100644
--- a/mindspore/_extends/graph_kernel/expanders/minimum_grad.py
+++ b/mindspore/_extends/graph_kernel/expanders/minimum_grad.py
@@ -24,7 +24,7 @@ class MinimumGrad(Expander):
     def _check(self):
         if not self.attrs.get('grad_x', True) and not self.attrs.get('grad_y', True):
             raise GKException("both grad_x and grad_y are False.")
-        return super()._check()
+        return super(MinimumGrad, self)._check()
 
     def _expand(self, graph_builder):
         input_x, input_y, input_dout = self.inputs
@@ -34,7 +34,8 @@ class MinimumGrad(Expander):
         dx = graph_builder.emit('Mul', [le_result, input_dout])
         dy = graph_builder.emit('Sub', [input_dout, dx])
 
-        # for minimumgrad op,  output_shape should be equal to input_shape, but some elementwise operating may broadcast input_shape
+        # for minimumgrad op,  output_shape should be equal to input_shape,
+        # but some elementwise operating may broadcast input_shape
         # then output_shape not equal to original input_shape, so need to reduce output to let them equal
         reduce_axis_x = self.get_reduce_axis(input_x.shape, dx.shape)
         reduce_axis_y = self.get_reduce_axis(input_y.shape, dy.shape)
diff --git a/mindspore/_extends/graph_kernel/expanders/softmax.py b/mindspore/_extends/graph_kernel/expanders/softmax.py
index e9f423ef014..335146fe68f 100644
--- a/mindspore/_extends/graph_kernel/expanders/softmax.py
+++ b/mindspore/_extends/graph_kernel/expanders/softmax.py
@@ -15,7 +15,8 @@
 """generate json desc for softmax"""
 from mindspore._extends.graph_kernel.model.model import DataFormat as DF
 from ._utils import Expander, ExpanderInfoValidator as VLD
-from ._utils import infer_shape_from_fractalNz, get_reduced_ori_shape, to_frac_z_axis
+from ._utils import infer_shape_from_fractalnz, get_reduced_ori_shape, to_frac_z_axis
+
 
 @VLD.add_format(DF.FRAC_NZ)
 @VLD.add_format(DF.DEFAULT)
@@ -30,7 +31,7 @@ class Softmax(Expander):
 
         ori_shape = input_x.shape
         if input_x.data_format == DF.FRAC_NZ:
-            ori_shape = infer_shape_from_fractalNz(input_x.shape)
+            ori_shape = infer_shape_from_fractalnz(input_x.shape)
 
         for i, _ in enumerate(list(axis)):
             if axis[i] < 0:
diff --git a/mindspore/_extends/graph_kernel/expanders/softmax_grad_ext.py b/mindspore/_extends/graph_kernel/expanders/softmax_grad_ext.py
index fdc86324bfc..641ea16b6da 100644
--- a/mindspore/_extends/graph_kernel/expanders/softmax_grad_ext.py
+++ b/mindspore/_extends/graph_kernel/expanders/softmax_grad_ext.py
@@ -15,7 +15,8 @@
 """generate json desc for SoftmaxGradExt"""
 from mindspore._extends.graph_kernel.model.model import DataFormat as DF
 from ._utils import Expander, ExpanderInfoValidator as VLD
-from ._utils import infer_shape_from_fractalNz, get_reduced_ori_shape, to_frac_z_axis
+from ._utils import infer_shape_from_fractalnz, get_reduced_ori_shape, to_frac_z_axis
+
 
 @VLD.add_format(DF.FRAC_NZ, DF.FRAC_NZ, DF.DEFAULT)
 @VLD.add_format(DF.DEFAULT, DF.DEFAULT, DF.DEFAULT)
@@ -29,7 +30,7 @@ class SoftmaxGradExt(Expander):
 
         ori_shape = x.shape
         if x.data_format == DF.FRAC_NZ:
-            ori_shape = infer_shape_from_fractalNz(ori_shape)
+            ori_shape = infer_shape_from_fractalnz(ori_shape)
         if not axis:
             axis = []
             for i, _ in enumerate(ori_shape):
diff --git a/mindspore/_extends/graph_kernel/expanders/square_sum_v1.py b/mindspore/_extends/graph_kernel/expanders/square_sum_v1.py
index 11f5aa35233..c65dceca15e 100644
--- a/mindspore/_extends/graph_kernel/expanders/square_sum_v1.py
+++ b/mindspore/_extends/graph_kernel/expanders/square_sum_v1.py
@@ -15,7 +15,7 @@
 """generate json desc for SquareSumV1"""
 from mindspore._extends.graph_kernel.model.model import DataFormat as DF
 from ._utils import Expander, ExpanderInfoValidator as VLD
-from ._utils import infer_shape_from_fractalNz, get_reduced_ori_shape, to_frac_z_axis
+from ._utils import infer_shape_from_fractalnz, get_reduced_ori_shape, to_frac_z_axis
 
 
 @VLD.add_format(DF.FRAC_NZ)
@@ -30,7 +30,7 @@ class SquareSumV1(Expander):
 
         ori_shape = x.shape
         if x.data_format == DF.FRAC_NZ:
-            ori_shape = infer_shape_from_fractalNz(ori_shape)
+            ori_shape = infer_shape_from_fractalnz(ori_shape)
         if not axis:
             axis = []
             for i, _ in enumerate(ori_shape):
diff --git a/mindspore/_extends/graph_kernel/model/graph_parallel.py b/mindspore/_extends/graph_kernel/model/graph_parallel.py
index d4a5cacd0e6..60aa5ea371d 100644
--- a/mindspore/_extends/graph_kernel/model/graph_parallel.py
+++ b/mindspore/_extends/graph_kernel/model/graph_parallel.py
@@ -17,6 +17,8 @@ from .model import PrimLib
 
 
 class ParalGain:
+    """Paral Gain"""
+
     def __init__(self, fusion_type, bottleneck, gain, block_assign, type_info):
         self.fusion_type = fusion_type
         self.bottleneck = bottleneck
@@ -41,7 +43,9 @@ class ScheduleAnalyzer:
         self.ops = graph.ops
         self.dom_op = [out.op for out in outputs]
 
-    def prod(self, shape):
+    @staticmethod
+    def prod(shape):
+        """Compute shape product"""
         res = shape[0]
         for i in range(1, len(shape)):
             res = res * shape[i]
@@ -254,7 +258,7 @@ class ScheduleAnalyzer:
         fusion_type = "block_fusion"
         type_info = None
 
-        activate_pipeline_optimization = False # Disable pipeline optimization for now.
+        activate_pipeline_optimization = False  # Disable pipeline optimization for now.
         if activate_pipeline_optimization:
             pipeline_info = ScheduleAnalyzer.pipeline_fusion_analyze(
                 blocks, op_sizes, exclude_gid)
@@ -287,4 +291,5 @@ def block_parallel_estimate(graphs):
 
 
 def parallel_estimate(graphs):
+    """Estimate parallel gain"""
     return block_parallel_estimate(graphs)
diff --git a/mindspore/_extends/graph_kernel/model/graph_split.py b/mindspore/_extends/graph_kernel/model/graph_split.py
index 363401992eb..f267b928de6 100644
--- a/mindspore/_extends/graph_kernel/model/graph_split.py
+++ b/mindspore/_extends/graph_kernel/model/graph_split.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 # ===========================================================================
 """Cost model splitter"""
-import os
 from functools import reduce as prod_reduce
 from mindspore import log as logger
 from .model import PrimLib, Graph, Tensor, Operator
@@ -39,20 +38,24 @@ class GraphSplitByPattern:
         def sync(self, x, y):
             """sync from y to x"""
             for i in self.alive:
-                if self.map[y][i] and not self.map[x][i]:
-                    self.map[x][i] = True
+                self._link(self.map[y][i], x, i)
+
+        def _link(self, cond, f, t):
+            """link from `f` to `t`"""
+            if cond:
+                self.map[f][t] = True
 
         def fuse(self, x, y):
             """fuse y to x"""
             for i in self.alive:
+                # i is the succeeding node of y, links the x's previous nodes to i
                 if self.map[y][i] and not self.map[x][i]:
                     for pre in self.alive:
-                        if self.map[pre][x] and not self.map[pre][i]:
-                            self.map[pre][i] = True
+                        self._link(self.map[pre][x], pre, i)
+                # i is the previous node of y, link i to x's succeeding nodes
                 if self.map[i][y] and not self.map[i][x]:
                     for suc in self.alive:
-                        if self.map[x][suc] and not self.map[i][suc]:
-                            self.map[i][suc] = True
+                        self._link(self.map[x][suc], i, suc)
             self.alive.remove(y)
 
     class Area:
@@ -67,6 +70,10 @@ class GraphSplitByPattern:
                 self.stitch_ops = set()
                 self.stitch_atomic_ops = set()
 
+            def has_stitch_op(self):
+                """check stitch_op exists"""
+                return self.stitch_ops or self.stitch_atomic_ops
+
         def __init__(self, init_op, is_output, unique_id, reach_tab, recompute_ops=None):
             self.pattern = PrimLib.iter_type(init_op) if init_op is not None else PrimLib.UNKNOWN
             self.ops = [] if init_op is None else [init_op]
@@ -286,31 +293,35 @@ class GraphSplitByPattern:
 
     def fuse(self, selector):
         """Fuse areas"""
-        changed = False
-        while True:
+        def _fuse_area():
             for dominant in self.areas:
                 result = selector(dominant)
-                if result is not None and result[0]:
-                    fuse_areas, is_forward = result
-                    fuse_areas = self.limit_area_size(dominant, fuse_areas)
-                    if not fuse_areas:
-                        continue
-                    if is_forward:
-                        for area in fuse_areas:
-                            dominant.fuse(area)
-                            self.set_area_map(area.ops, dominant)
-                            self.areas.remove(area)
-                    else:
-                        forward_area = dominant
-                        for area in fuse_areas:
-                            area.fuse(forward_area)
-                            self.set_area_map(forward_area.ops, area)
-                            self.areas.remove(forward_area)
-                            forward_area = area
-                    changed = True
-                    break
-            else:
-                return changed
+                if result is None or not result[0]:
+                    continue
+                fuse_areas, is_forward = result
+                fuse_areas = self.limit_area_size(dominant, fuse_areas)
+                if not fuse_areas:
+                    continue
+                if is_forward:
+                    for area in fuse_areas:
+                        dominant.fuse(area)
+                        self.set_area_map(area.ops, dominant)
+                        self.areas.remove(area)
+                else:
+                    forward_area = dominant
+                    for area in fuse_areas:
+                        area.fuse(forward_area)
+                        self.set_area_map(forward_area.ops, area)
+                        self.areas.remove(forward_area)
+                        forward_area = area
+                return True
+            return False
+
+        changed, do_again = False, True
+        while do_again:
+            do_again = _fuse_area()
+            changed = changed or do_again
+        return changed
 
     def fuse_recom(self, selector):
         """Fuse recompute area to its user"""
@@ -348,21 +359,6 @@ class GraphSplitByPattern:
             graphmodes.append("basic" if area.mode == self.Area.MODE_BASIC else "composite")
         return subgraphs, graphmodes
 
-    def dump_subgraphs(self, subgraphs):
-        """Dump subgraphs"""
-        if os.environ.get("ENABLE_SUBGRAPHS", "off") == "on":
-            subgraphs_str = "subgraphs:\nlen: " + str(len(subgraphs)) + "\n"
-            for i, sub in enumerate(subgraphs):
-                subgraphs_str += str("============") + str(i) + "\n"
-                subgraphs_str += str(sub)
-            dirname = 'subgraphs'
-            if not os.path.exists(dirname):
-                os.makedirs(dirname)
-            graphname = self.graph.name
-            filename = dirname + '/' + graphname + '.log'
-            with os.fdopen(os.open(filename, os.O_RDWR | os.O_CREAT), 'w+') as f:
-                f.write(subgraphs_str)
-
     def pattern_fuse(self, fuse_func=None):
         """fuse Areas by pattern repeatedly"""
         del fuse_func
@@ -376,34 +372,38 @@ class GraphSplitByPattern:
         # Note: after this function, the input output relation is not maintained.
         self.split_output_reshapes()
         subgraphs, graphmodes = self.to_subgraphs()
-        self.dump_subgraphs(subgraphs)
         return subgraphs, graphmodes
 
     def split_output_reshapes(self):
-        """Force split the output reshapes into other new """
+        """Force split the output Reshapes into other new area"""
+        def _remove_output_reshape(reshape_ops, other_ops):
+            def _run():
+                for op in reshape_ops:
+                    if any([to_op in other_ops for to_op in op.output.to_ops]):
+                        reshape_ops.remove(op)
+                        other_ops.append(op)
+                        return True
+                return False
+            while _run():
+                pass
+
         new_areas = []
         for area in self.areas:
-            out_reshape_ops = [op for op in area.ops if PrimLib.iter_type(op) == PrimLib.RESHAPE]
-            remain_ops = [op for op in area.ops if op not in out_reshape_ops]
-            if not remain_ops or not out_reshape_ops:
+            reshape_ops = [op for op in area.ops if PrimLib.iter_type(op) == PrimLib.RESHAPE]
+            other_ops = [op for op in area.ops if op not in reshape_ops]
+            if not other_ops or not reshape_ops:
                 continue
-            changed = True
-            while changed:
-                changed = False
-                for op in out_reshape_ops:
-                    if any([to_op in remain_ops for to_op in op.output.to_ops]):
-                        out_reshape_ops.remove(op)
-                        remain_ops.append(op)
-                        changed = True
-                        break
-            if out_reshape_ops:
-                for op in out_reshape_ops:
-                    a = self.Area(op, False, 0, self.reach_tab)
-                    self.set_default_mode(a)
-                    new_areas.append(a)
-                area.ops = remain_ops
-                if len(remain_ops) == 1:
-                    self.set_default_mode(area)
+            # remove the output reshape from "reshape_ops" and add it into "other_ops"
+            _remove_output_reshape(reshape_ops, other_ops)
+            if not reshape_ops:
+                continue
+            for op in reshape_ops:
+                a = self.Area(op, False, 0, self.reach_tab)
+                self.set_default_mode(a)
+                new_areas.append(a)
+            area.ops = other_ops
+            if len(other_ops) == 1:
+                self.set_default_mode(area)
         if new_areas:
             self.areas += new_areas
 
@@ -472,8 +472,8 @@ class GraphSplitByPattern:
                 region_ops.append(op)
                 return False, None, weight, True
             # region fails to grow
-            MAX_WEIGHT = 20
-            if weight > MAX_WEIGHT or len(op.inputs) > 1 or PrimLib.iter_type(op) > PrimLib.BROADCAST:
+            max_weight = 20
+            if weight > max_weight or len(op.inputs) > 1 or PrimLib.iter_type(op) > PrimLib.BROADCAST:
                 return False, None, weight, False
             # region grows successfully
             weight = weight + 1
@@ -486,7 +486,7 @@ class GraphSplitByPattern:
             cheap_regions = []
             for output in outputs:
                 #  tensor should have user other than user_area to be fused
-                if output.para_type != Tensor.PARA_OUTPUT and len(output.to_ops) < 2:
+                if len(output.to_ops) < 2:
                     continue
                 region_ops = []
                 grow = True
@@ -533,14 +533,7 @@ class GraphSplitByPattern:
         """find recompute regions and copy them out to new Areas"""
         def do_recompute_fuse():
             """split the unfusing pattern by add recompute area"""
-            recompute_suc = False
-            orig_areas = []
-            orig_areas.extend(self.areas)
-            for dom in orig_areas:
-                if dom not in self.areas or not dom.out_relations:
-                    continue
-                cheap_regions = self.find_cheap_regions(dom)
-                dom_changed = False
+            def recompute_cheap_region(dom):
                 for cheap_region in cheap_regions:
                     user_areas = self.select_user_area(cheap_region[-1].output)
                     if not user_areas:
@@ -550,12 +543,17 @@ class GraphSplitByPattern:
                         self.pattern_fuse(self.fuse_recom)
                         self.clear_recompute()
                         if self.recom_res:
-                            recompute_suc = True
-                            # Copy region at most once for this dom
-                            dom_changed = True
-                            break
-                    if dom_changed:
-                        break
+                            return True
+                return False
+            recompute_suc = False
+            orig_areas = []
+            orig_areas.extend(self.areas)
+            for dom in orig_areas:
+                if dom not in self.areas or not dom.out_relations:
+                    continue
+                cheap_regions = self.find_cheap_regions(dom)
+                if recompute_cheap_region(dom):
+                    recompute_suc = True
             return recompute_suc
 
         if self.enable_recompute:
@@ -563,9 +561,6 @@ class GraphSplitByPattern:
                 self.pattern_fuse()
 
 
-use_poly_reduce = True
-
-
 class GraphSplitGpu(GraphSplitByPattern):
     """Graph splitter"""
     BORADCAST_FUSE_DEPTH = 20
@@ -616,7 +611,7 @@ class GraphSplitGpu(GraphSplitByPattern):
             return fused, True
 
         def _broadcast_pat_exclude(dom, a, r):
-            if use_poly_reduce and a.pattern == PrimLib.REDUCE:
+            if a.pattern == PrimLib.REDUCE:
                 return dom.pattern > PrimLib.ELEMWISE or r > PrimLib.ELEMWISE
             return a.pattern > PrimLib.REDUCE or r > PrimLib.BROADCAST
 
@@ -641,34 +636,14 @@ class GraphSplitGpu(GraphSplitByPattern):
                 fused.append(a)
             return fused, False
 
-        def _check_reduce_exclude(dom):
-            if use_poly_reduce:
-                return False
-            # exclude large all-reduce
-            if len(dom.ops[0].inputs[0].shape) == len(dom.ops[0].attrs["reduce_axis"]) and \
-                    dom.ops[0].inputs[0].get_size() > 10000:
-                return True
-
-            # exclude multi output
-            for a in dom.in_relations.keys():
-                if len(a.out_relations) > 1:
-                    return True
-                if any([op.output.para_type == Tensor.PARA_OUTPUT for op in a.ops]):
-                    return True
-            return False
-
         def _reduce_pat_exclude(_, a, r):
             if len(a.ops) > self.REDUCE_FUSE_DEPTH:
                 return True
-            if use_poly_reduce:
-                return a.pattern > PrimLib.ELEMWISE or r > PrimLib.REDUCE or r == PrimLib.BROADCAST
-            return a.pattern > PrimLib.BROADCAST or r > PrimLib.REDUCE
+            return a.pattern > PrimLib.ELEMWISE or r > PrimLib.REDUCE or r == PrimLib.BROADCAST
 
         def _reduce_depth(dom):
             if dom.pattern != PrimLib.REDUCE or len(dom.in_relations) != 1:
                 return None
-            if _check_reduce_exclude(dom):
-                return None
             a, r = list(dom.in_relations.items())[0]
             if dom.ops[0].inputs[0].dtype == "float16" and a.is_output and len(a.ops) >= 10 and \
                     _is_atomic_add_available(dom):
@@ -681,8 +656,6 @@ class GraphSplitGpu(GraphSplitByPattern):
         def _reduce_width(dom):
             if dom.pattern != PrimLib.REDUCE:
                 return None
-            if _check_reduce_exclude(dom):
-                return None
             fused = []
             for a, r in dom.in_relations.items():
                 if dom.ops[0].inputs[0].dtype == "float16" and a.is_output and len(a.ops) >= 10 and \
@@ -763,16 +736,16 @@ class GraphSplitGpu(GraphSplitByPattern):
 
         def _may_stitch(dom, a, r):
             if a.pattern <= PrimLib.REDUCE and r <= PrimLib.BROADCAST and dom.check_acyclic(a):
-                if _reduce_nums(a.ops) < 2:
-                    dom_outs = [op.output for op in dom.ops]
-                    a_ins = [op_input for op in a.ops for op_input in op.inputs]
-                    a_outs = [op.output for op in a.ops]
-                    a_final_outs = [tensor for tensor in a_outs if tensor not in a_ins]
-                    stitch_tensors = [tensor for tensor in dom_outs if tensor in a_ins]
-                    if _same_stitch_axis(stitch_tensors, a_final_outs):
-                        for tensor in stitch_tensors:
-                            if _tensor_size(tensor) >= 1024 * 1024:
-                                return True
+                if _reduce_nums(a.ops) >= 2:
+                    return False
+                dom_outs = [op.output for op in dom.ops]
+                a_ins = [op_input for op in a.ops for op_input in op.inputs]
+                a_outs = [op.output for op in a.ops]
+                a_final_outs = [tensor for tensor in a_outs if tensor not in a_ins]
+                stitch_tensors = [tensor for tensor in dom_outs if tensor in a_ins]
+                if not _same_stitch_axis(stitch_tensors, a_final_outs):
+                    return False
+                return any([_tensor_size(tensor) >= 1024 * 1024 for tensor in stitch_tensors])
             return False
 
         def _reduce_stitch(dom):
@@ -785,14 +758,15 @@ class GraphSplitGpu(GraphSplitByPattern):
 
             fused = []
             for a, r in dom.out_relations.items():
-                if _may_stitch(dom, a, r):
-                    if a.pattern == PrimLib.REDUCE:
-                        if a.ops[0].attrs['reduce_axis'] == dom.ops[0].attrs['reduce_axis']:
-                            dom.stitch_info.stitch_ops.add(dom.ops[0].output.name)
-                            fused.append(a)
-                    elif a.pattern == PrimLib.BROADCAST:
+                if not _may_stitch(dom, a, r):
+                    continue
+                if a.pattern == PrimLib.REDUCE:
+                    if a.ops[0].attrs['reduce_axis'] == dom.ops[0].attrs['reduce_axis']:
                         dom.stitch_info.stitch_ops.add(dom.ops[0].output.name)
                         fused.append(a)
+                elif a.pattern == PrimLib.BROADCAST:
+                    dom.stitch_info.stitch_ops.add(dom.ops[0].output.name)
+                    fused.append(a)
             return fused, False
 
         def _transpose(dom):
@@ -804,6 +778,16 @@ class GraphSplitGpu(GraphSplitByPattern):
                     fused.append(a)
             return fused, True
 
+        def _strided_slice(dom):
+            if dom.dom_op().prim != "StridedSlice":
+                return None
+            fused = []
+            for a, _ in dom.in_relations.items():
+                if a.pattern <= PrimLib.BROADCAST and a.check_acyclic(dom) and \
+                        len(a.out_relations) == 1 and not a.is_output:
+                    fused.append(a)
+            return fused, True
+
         def _fuse_loop():
             changed = True
             while changed:
@@ -814,10 +798,10 @@ class GraphSplitGpu(GraphSplitByPattern):
                 changed = self.fuse(_reduce_width) or changed
                 changed = self.fuse(_broadcast_depth) or changed
                 changed = self.fuse(_broadcast_width) or changed
-                if use_poly_reduce:
-                    changed = self.fuse(_reduce_output) or changed
-                    if enable_stitch_fusion:
-                        changed = self.fuse(_reduce_stitch) or changed
+                changed = self.fuse(_strided_slice) or changed
+                changed = self.fuse(_reduce_output) or changed
+                if enable_stitch_fusion:
+                    changed = self.fuse(_reduce_stitch) or changed
             self.fuse(_transpose)
 
         def _fuse_once(fuse_func):
@@ -825,9 +809,8 @@ class GraphSplitGpu(GraphSplitByPattern):
                     fuse_func(_reduce_depth) or fuse_func(_reduce_width) or fuse_func(_broadcast_depth) or \
                     fuse_func(_broadcast_width):
                 return
-            if use_poly_reduce:
-                if fuse_func(_reduce_output) or (enable_stitch_fusion and fuse_func(_reduce_stitch)):
-                    return
+            if fuse_func(_reduce_output) or (enable_stitch_fusion and fuse_func(_reduce_stitch)):
+                return
             fuse_func(_transpose)
             return
 
diff --git a/mindspore/_extends/graph_kernel/model/model.py b/mindspore/_extends/graph_kernel/model/model.py
index 4dcec3e1466..06a1c18a2ad 100644
--- a/mindspore/_extends/graph_kernel/model/model.py
+++ b/mindspore/_extends/graph_kernel/model/model.py
@@ -216,6 +216,7 @@ class PrimLib:
         'Transpose': Prim(OPAQUE),
         'Tile': Prim(BROADCAST),
         'BroadcastTo': Prim(BROADCAST),
+        'StridedSlice': Prim(OPAQUE),
         'MatMul': Prim(OPAQUE),
         'TransData': Prim(OPAQUE),
         'BatchMatMul': Prim(OPAQUE),
@@ -421,14 +422,13 @@ class Graph:
             for t in op.inputs:
                 if t not in inputs and t.op not in self.ops:
                     inputs.append(t)
-            if op.output not in outputs:
-                if op.output.para_type == Tensor.PARA_OUTPUT or not op.output.to_ops:
-                    outputs.append(op.output)
-                else:
-                    for d in op.output.to_ops:
-                        if d not in self.ops:
-                            outputs.append(op.output)
-                            break
+            if op.output in outputs:
+                continue
+            if op.output.para_type == Tensor.PARA_OUTPUT or not op.output.to_ops:
+                outputs.append(op.output)
+                continue
+            if any([succ not in self.ops for succ in op.output.to_ops]):
+                outputs.append(op.output)
         if self.inputs:
             inputs = self.inputs
 
diff --git a/mindspore/_extends/graph_kernel/model/model_builder.py b/mindspore/_extends/graph_kernel/model/model_builder.py
index 68c6b0f7cf5..e23efd54992 100644
--- a/mindspore/_extends/graph_kernel/model/model_builder.py
+++ b/mindspore/_extends/graph_kernel/model/model_builder.py
@@ -28,11 +28,13 @@ class GraphBuilder:
             self.graph = Graph(name, [])
 
         def set_input(self, *para):
+            """set input to graph inputs"""
             for t in para:
                 t.para_type = Tensor.PARA_INPUT
                 self.graph.inputs.append(t)
 
         def set_output(self, *para):
+            """set output to graph inputs"""
             for t in para:
                 t.para_type = Tensor.PARA_OUTPUT
                 self.graph.outputs.append(t)
@@ -50,6 +52,8 @@ class GraphBuilder:
     def graph_scope(self, name):
         """The graph scope to be processed"""
         class GraphScope:
+            """Graph Scope"""
+
             def __init__(self, gb):
                 self.gb = gb
 
@@ -77,7 +81,6 @@ class GraphBuilder:
         """Create a new Value"""
         if name in (None, ''):
             name = self._alloc_tensor_name()
-
         v = Value(name, dtype, value)
         return v
 
@@ -105,6 +108,7 @@ class GraphBuilder:
         return output
 
     def get(self):
+        """Get graphs"""
         return self.graphs
 
 
@@ -123,34 +127,14 @@ class CompositeGraph:
 
     def load(self, desc):
         """Load Graph from json"""
-        def _attr_of(op, inputs, output):
-            def _get_axis_while_none(input_shape, output_shape):
-                red_axis = []
-                if len(output_shape) == len(input_shape):
-                    for i, s in enumerate(output_shape):
-                        if s == 1 and input_shape[i] > 1:
-                            red_axis.append(i)
-                else:
-                    red_axis = list(range(len(output_shape)))
-                return red_axis
-
+        def _attr_of(op):
+            if not op['attr']:
+                return dict()
             attr = {}
-            if op['name'] in ('ReduceSum', 'ReduceMax', 'ReduceMin'):
-                for a in op['attr']:
-                    if a['name'] == 'axis':
-                        red_axis, dim_size = [], len(inputs[0].shape)
-                        if not a['value']:
-                            red_axis = _get_axis_while_none(inputs[0].shape, output.shape)
-                        else:
-                            if isinstance(a['value'], int):
-                                a['value'] = [a['value']]
-                            for i in a['value']:
-                                red_axis.append(i if i >= 0 else dim_size + i)
-                        attr['reduce_axis'] = red_axis
-                    if a['name'] == "reduce_output_fuse":
-                        attr['reduce_output_fuse'] = a['value']
-            elif op['attr']:
-                for a in op['attr']:
+            for a in op['attr']:
+                if a['name'] == 'axis' and op['name'] in ('ReduceSum', 'ReduceMax', 'ReduceMin'):
+                    attr['reduce_axis'] = a['value']
+                else:
                     attr[a['name']] = a['value']
             return attr
 
@@ -166,7 +150,6 @@ class CompositeGraph:
                     'shape'], out_desc['data_type'], out_desc['format']
                 self.tensors[name] = builder.tensor(
                     shape, dtype, data_format, name=name, para_type=Tensor.PARA_OUTPUT)
-            cur_fusion = None
             for op in desc['op_desc']:
                 inputs = [self.tensors[d['tensor_name']] for x in op['input_desc'] for d in x if 'value' not in d]
                 out_desc = op['output_desc']
@@ -177,25 +160,17 @@ class CompositeGraph:
                     inputs[1].para_type = Tensor.PARA_OUTPUT
                     output = inputs[2]
                     self.tensors[name] = output
-                else:
-                    output = self.tensors.get(name, None)
-                    if not output:
-                        output = builder.tensor(
-                            shape, dtype, data_format, name=name)
-                        self.tensors[name] = output
-                    builder.op(op['name'], output, inputs,
-                               attrs=_attr_of(op, inputs, output))
-                if 'fusion' in op:
-                    if cur_fusion is None:
-                        cur_fusion = output
-                    else:
-                        cur_fusion.add_buddy(output)
-                        if op['fusion'].endswith('_end'):
-                            cur_fusion = None
+                    continue
+                output = self.tensors.get(name, None)
+                if not output:
+                    output = builder.tensor(shape, dtype, data_format, name=name)
+                    self.tensors[name] = output
+                builder.op(op['name'], output, inputs, attrs=_attr_of(op))
         self.graph = builder.get()[0]
         self.desc = desc
 
     def add_stitch_info(self, subgraph, desc):
+        """add stitch info to desc"""
         if subgraph.stitch_info and subgraph.stitch_info.stitch_ops:
             buffer_stitch = {'stitch_op': list(subgraph.stitch_info.stitch_ops)}
             if subgraph.stitch_info.stitch_atomic_ops:
@@ -204,6 +179,7 @@ class CompositeGraph:
         return desc
 
     def add_recompute_ops(self, subgraph, desc):
+        """add recompute ops to desc"""
         if subgraph.recompute_ops:
             desc['recompute_ops'] = [op.output.name for op in subgraph.recompute_ops]
         return desc
@@ -227,43 +203,40 @@ class CompositeGraph:
         inputs, outputs = subgraph.deduce_parameters()
         graph_ops = set(subgraph.ops)
         inplace_assign, inplace_assign_z = self._pre_dump(outputs)
-        for key in self.desc:
+
+        def dump_output(t):
+            if t.name in inplace_assign:
+                z = inplace_assign_z if inplace_assign_z is not None else self.tensors[t.name]
+                return {'data_type': z.dtype, 'shape': z.shape, 'tensor_name': inplace_assign[t.name]}
+            return {'data_type': t.dtype, 'shape': t.shape, 'tensor_name': t.name}
+
+        def dump_op_desc(d):
+            if d['name'] == 'InplaceAssign':
+                y = d['input_desc'][1][0]['tensor_name']
+                if self.tensors[y].op in graph_ops:
+                    z, fake = (inplace_assign_z, False) if inplace_assign_z is not None else (self.tensors[y], True)
+                    inplace_desc = copy.deepcopy(d)
+                    inplace_desc['attr'] = {'name': 'fake_output', 'value': fake}
+                    z_desc, out_desc = inplace_desc['input_desc'][2][0], inplace_desc['output_desc'][0]
+                    z_desc['shape'] = z.shape
+                    z_desc['data_type'] = z.dtype
+                    z_desc['tensor_name'] = z.name
+                    out_desc['shape'] = z.shape
+                    out_desc['data_type'] = z.dtype
+                    return inplace_desc
+            op = self.tensors[d['output_desc'][0]['tensor_name']].op
+            if op in graph_ops or op in subgraph.recompute_ops:
+                return d
+            return None
+
+        for key in self.desc.keys():
             if key == 'input_desc':
-                desc[key] = [
-                    [{'data_type': t.dtype, 'shape': t.shape, 'tensor_name': t.name}] for t in inputs]
+                desc[key] = [[{'data_type': t.dtype, 'shape': t.shape, 'tensor_name': t.name}] for t in inputs]
             elif key == 'output_desc':
-                out_desc = []
-                for t in outputs:
-                    if t.name in inplace_assign:
-                        z = inplace_assign_z if inplace_assign_z is not None else self.tensors[t.name]
-                        out_desc.append(
-                            {'data_type': z.dtype, 'shape': z.shape, 'tensor_name': inplace_assign[t.name]})
-                    else:
-                        out_desc.append(
-                            {'data_type': t.dtype, 'shape': t.shape, 'tensor_name': t.name})
-                desc[key] = out_desc
+                desc[key] = list(map(dump_output, outputs))
             elif key == 'op_desc':
-                op_desc = []
-                for d in self.desc[key]:
-                    if d['name'] == 'InplaceAssign':
-                        y = d['input_desc'][1][0]['tensor_name']
-                        if self.tensors[y].op in graph_ops:
-                            z, fake = (inplace_assign_z, False) if inplace_assign_z is not None else (
-                                self.tensors[y], True)
-                            inplace_desc = copy.deepcopy(d)
-                            inplace_desc['attr'] = {'name': 'fake_output', 'value': fake}
-                            z_desc, out_desc = inplace_desc['input_desc'][2][0], inplace_desc['output_desc'][0]
-                            z_desc['shape'] = z.shape
-                            z_desc['data_type'] = z.dtype
-                            z_desc['tensor_name'] = z.name
-                            out_desc['shape'] = z.shape
-                            out_desc['data_type'] = z.dtype
-                            op_desc.append(inplace_desc)
-                    else:
-                        op = self.tensors[d['output_desc'][0]['tensor_name']].op
-                        if op in graph_ops or op in subgraph.recompute_ops:
-                            op_desc.append(d)
-                desc[key] = op_desc
+                op_desc = map(dump_op_desc, self.desc[key])
+                desc[key] = [d for d in op_desc if d is not None]
             elif key == 'op':
                 desc[key] = subgraph.name
             else:
diff --git a/mindspore/_extends/graph_kernel/model/op_infer.py b/mindspore/_extends/graph_kernel/model/op_infer.py
index 5bbb1e8d2cb..bf442d07a2c 100644
--- a/mindspore/_extends/graph_kernel/model/op_infer.py
+++ b/mindspore/_extends/graph_kernel/model/op_infer.py
@@ -16,7 +16,7 @@
 
 import copy
 import sys
-from functools import reduce
+from functools import reduce as prod_reduce
 from .model import GraphKernelUnsupportedException as GKException
 from .model import PrimLib, DataFormat as DF
 
@@ -101,22 +101,24 @@ class OpInfer:
 
 class _Elemwise(OpInfer):
     """Common infer for elementwise operators"""
-
-    def _broadcast_shape(self, shapes):
+    @staticmethod
+    def broadcast_shape(shapes):
         """deduce broadcast shape using same rules as numpy"""
         dim_size = max([len(shape) for shape in shapes])
         align_shapes = [[1] * (dim_size - len(shape)) + shape for shape in shapes]
         out_shape = [1] * dim_size
         for i in range(dim_size):
             for align_shape in align_shapes:
-                if align_shape[i] > 1:
-                    if out_shape[i] == 1:
-                        out_shape[i] = align_shape[i]
-                    if out_shape[i] != align_shape[i]:
-                        raise GKException("shape broadcast failed!")
+                if align_shape[i] == 1:
+                    continue
+                if out_shape[i] == 1:
+                    out_shape[i] = align_shape[i]
+                elif out_shape[i] != align_shape[i]:
+                    raise GKException("shape broadcast failed!")
         return out_shape
 
-    def _to_nz(self, default_shape):
+    @staticmethod
+    def defaultformat_to_nz(default_shape):
         """default format shape to fractal_Nz format shape"""
         if len(default_shape) not in (1, 2):
             raise GKException("shape is too long!")
@@ -142,17 +144,17 @@ class _Elemwise(OpInfer):
         """returns the output shape with broadcast"""
 
         # in case all inputs are default format/NHWC/NCHW
-        is_default = [input.data_format in (DF.DEFAULT, DF.NHWC, DF.NCHW) for input in self.inputs]
+        is_default = [op_input.data_format in (DF.DEFAULT, DF.NHWC, DF.NCHW) for op_input in self.inputs]
         if all(is_default):
-            return self._broadcast_shape([input.shape for input in self.inputs])
+            return self.broadcast_shape([op_input.shape for op_input in self.inputs])
 
         # in case formats are fractal_nz, default_fromat/NHWC/HCHW(optional)
-        is_default_frac_nz = [input.data_format in (DF.DEFAULT, DF.NHWC, DF.NCHW, DF.FRAC_NZ)
-                              for input in self.inputs]
+        is_default_frac_nz = [op_input.data_format in (DF.DEFAULT, DF.NHWC, DF.NCHW, DF.FRAC_NZ)
+                              for op_input in self.inputs]
         if all(is_default_frac_nz):
-            nz_shapes = [self._to_nz(input.shape) if input.data_format != DF.FRAC_NZ else input.shape
-                         for input in self.inputs]
-            return self._broadcast_shape(nz_shapes)
+            nz_shapes = [self.defaultformat_to_nz(op_input.shape) if op_input.data_format != DF.FRAC_NZ
+                         else op_input.shape for op_input in self.inputs]
+            return self.broadcast_shape(nz_shapes)
 
         raise GKException("Only support default and fractal_nz")
 
@@ -214,9 +216,11 @@ class _Reshape(OpInfer):
 
 
 class Reshape(_Reshape):
+    """Reshape op infer"""
+
     def _check_shape(self):
-        size_before_reshape = reduce(lambda x, y: x * y, self.inputs[0].shape)
-        size_after_reshape = reduce(lambda x, y: x * y, self.attrs["shape"])
+        size_before_reshape = prod_reduce(lambda x, y: x * y, self.inputs[0].shape)
+        size_after_reshape = prod_reduce(lambda x, y: x * y, self.attrs["shape"])
         if size_before_reshape != size_after_reshape:
             raise GKException("The shape product before and after reshaping should be equal")
 
@@ -225,11 +229,15 @@ class Reshape(_Reshape):
 
 
 class Cast(_Elemwise):
+    """Cast op infer"""
+
     def _infer_type(self):
         return self.attrs["dst_type"]
 
 
 class InplaceAssign(_Elemwise):
+    """InplaceAssign op infer"""
+
     def _infer_shape(self):
         return self.inputs[2].shape
 
@@ -241,6 +249,8 @@ class InplaceAssign(_Elemwise):
 
 
 class BroadcastTo(OpInfer):
+    """BroadcastTo op infer"""
+
     def _infer_shape(self):
         return self.attrs["shape"]
 
@@ -256,6 +266,8 @@ class _CompareOp(_Elemwise):
 
 
 class CImag(OpInfer):
+    """CImag op infer"""
+
     def _check_type(self):
         if self.inputs[0].dtype != "complex64":
             raise GKException(
@@ -266,6 +278,8 @@ class CImag(OpInfer):
 
 
 class CReal(OpInfer):
+    """CReal op infer"""
+
     def _check_type(self):
         if self.inputs[0].dtype != "complex64":
             raise GKException(
@@ -276,6 +290,8 @@ class CReal(OpInfer):
 
 
 class Complex(OpInfer):
+    """Complex op infer"""
+
     def _check_type(self):
         if self.inputs[0].dtype != "float32":
             raise GKException(
@@ -288,26 +304,28 @@ class Complex(OpInfer):
 
 
 class Less(_CompareOp):
-    pass
+    """Less op infer"""
 
 
 class LessEqual(_CompareOp):
-    pass
+    """LessEqual op infer"""
 
 
 class Equal(_CompareOp):
-    pass
+    """Equal op infer"""
 
 
 class Greater(_CompareOp):
-    pass
+    """Greater op infer"""
 
 
 class GreaterEqual(_CompareOp):
-    pass
+    """GreaterEqual op infer"""
 
 
 class Select(_Elemwise):
+    """Select op infer"""
+
     def _check_type(self):
         if self.inputs[0].dtype != "bool":
             raise GKException("Select's input[0] should be a bool condition but got {}".format(self.inputs[0].dtype))
@@ -319,6 +337,7 @@ class Select(_Elemwise):
 
 
 def check_format_any(formats, checked_format):
+    """Check whether input format in formats list"""
     if not isinstance(formats, (list, tuple)):
         raise GKException("formats {} should be list or tuple, but got {}.".format(formats, type(formats)))
     if checked_format not in formats:
@@ -326,11 +345,13 @@ def check_format_any(formats, checked_format):
 
 
 def check_nd(data, nd):
+    """Check whether data are nd format"""
     if not isinstance(data, (list, tuple)) or len(data) != nd:
         raise GKException("input should be {}D list or tuple, but got {}.".format(nd, data))
 
 
 def conv_had_pad(pad_list, pad_mode):
+    """Check whether conv need to add pad"""
     if not isinstance(pad_list, (list, tuple)) or len(pad_list) != 4:
         raise GKException("pad_list should be 4D list or tuple, but got {}".format(pad_list))
     if pad_list[0] != pad_list[1] or pad_list[2] != pad_list[3]:
diff --git a/mindspore/_extends/graph_kernel/splitter.py b/mindspore/_extends/graph_kernel/splitter.py
index 87b7da1260a..027a588c22b 100644
--- a/mindspore/_extends/graph_kernel/splitter.py
+++ b/mindspore/_extends/graph_kernel/splitter.py
@@ -57,11 +57,11 @@ def _dump_split_info(flags, graph_json, graph_desc, subgraphs, graph_mode):
         return
     utils.create_dir(utils.GRAPH_KERNEL_DUMP_PATH)
     filename = os.path.join(utils.GRAPH_KERNEL_DUMP_PATH, "graph_kernel_split_mode.txt")
-    with open(filename, "a+") as f:
+    with os.fdopen(os.open(filename, os.O_WRONLY | os.O_CREAT), "a+") as f:
         f.write("********** main graph: {} **********\n".format(graph_desc.name))
         f.write("input json:\n{}\n".format(graph_json))
         f.write("graph desc:\n{}\n".format(str(graph_desc)))
-        if len(subgraphs) > 1:
+        if len(subgraphs) > 1 or subgraphs[0].stitch_info.has_stitch_op():
             for i, g in enumerate(subgraphs):
                 f.write("-------- subgraph {}, mode: {} --------\n".format(i, graph_mode[i]))
                 f.write("{}\n".format(str(g)))
diff --git a/mindspore/_extends/graph_kernel/utils.py b/mindspore/_extends/graph_kernel/utils.py
index ed9a32ab44f..7d4cc7ae9ae 100644
--- a/mindspore/_extends/graph_kernel/utils.py
+++ b/mindspore/_extends/graph_kernel/utils.py
@@ -26,3 +26,5 @@ def create_dir(pathname):
         os.mkdir(pathname)
     except OSError:
         pass
+    finally:
+        pass
diff --git a/mindspore/_extends/parallel_compile/tbe_compiler/tbe_adapter.py b/mindspore/_extends/parallel_compile/tbe_compiler/tbe_adapter.py
index 5f20341d0b3..1d56c3b7f17 100644
--- a/mindspore/_extends/parallel_compile/tbe_compiler/tbe_adapter.py
+++ b/mindspore/_extends/parallel_compile/tbe_compiler/tbe_adapter.py
@@ -32,7 +32,7 @@ from te_fusion.parallel_compilation import init_multi_process_env, start_ga_mult
     get_finished_compilation_task
 
 from .tbe_helper import get_soc_info, assemble_op_args, get_compute_op_list, get_options_info, get_fuzz_build_info, \
-    BuildType, adjust_custom_op_info, pack_op_args
+    BuildType, adjust_custom_op_info, pack_op_args, get_module_name
 from .tbe_job import TbeJob, JobStatus
 
 PLATFORM_FLAG = ["Ascend310", "Ascend910", "Hi3796CV300ES", "Ascend710", "Ascend610", "Hi3796CV300CS", "SD3403"]
@@ -242,7 +242,7 @@ def check_support(job: TbeJob):
     op_func_name = compute_op_info["func_name"]
     if op_func_name in ("resize_nearest_neighbor_v2_grad_d", "resize_bilinear_v2_grad"):
         attrs.pop(-2)
-    op_module_name = compute_op_info["module_name"]
+    op_module_name = get_module_name(compute_op_info)
     py_module_path = compute_op_info["py_module_path"]
     _normalize_module_name(op_module_name, py_module_path)
     func_name = "check_supported"
@@ -281,7 +281,7 @@ def select_op_format(job: TbeJob):
     compute_op_info = compute_op_info_list[0]
     adjust_custom_op_info(compute_op_info)
     inputs, outputs, attrs = assemble_op_args(compute_op_info)
-    op_module_name = compute_op_info["module_name"]
+    op_module_name = get_module_name(compute_op_info)
     py_module_path = compute_op_info["py_module_path"]
     _normalize_module_name(op_module_name, py_module_path)
     op_func_name = "op_select_format"
@@ -317,7 +317,7 @@ def _pre_build_compute_op_info(compute_op, job):
     if l1_size != -1:
         set_L1_info("op_L1_space", -1)
     inputs, outputs, attrs = assemble_op_args(compute_op)
-    op_module_name = compute_op["module_name"]
+    op_module_name = get_module_name(compute_op)
     py_module_path = compute_op["py_module_path"]
     op_func_name = compute_op["func_name"]
     op_type = compute_op["type"]
@@ -340,8 +340,8 @@ def _pre_build_compute_op_info(compute_op, job):
         job.info("OpType {} support op_impl_mode, current op_impl_mode:{}".format(op_type, op_impl_mode))
     options = get_options_info(job.content)
     dispatch_prebuild_task(job.source_id, job.id, l1_size, op_module_name, op_type, op_func_name, unknown_shape,
-                           (inputs, outputs, attrs, options), int64_mode, dynamic_compile_static, job.rl_tune_switch,
-                           job.rl_tune_list, job.pass_list, job.op_tune_switch, job.op_tune_list)
+                           (inputs, outputs, attrs, options), int64_mode, dynamic_compile_static, unknown_shape,
+                           job.rl_tune_switch, job.rl_tune_list, job.pass_list, job.op_tune_switch, job.op_tune_list)
 
 
 def get_prebuild_output(op_name):
@@ -391,7 +391,7 @@ def build_single_pre_op(job: TbeJob):
     inputs, outputs, attrs = assemble_op_args(compute_op_info)
     op_type = compute_op_info["type"]
     l1_size = job.content["l1_size"]
-    op_module_name = compute_op_info["module_name"]
+    op_module_name = get_module_name(compute_op_info)
     op_kernel_name = compute_op_info["op_name"]
     py_module_path = compute_op_info["py_module_path"]
     op_func_name = compute_op_info["func_name"]
@@ -404,9 +404,9 @@ def build_single_pre_op(job: TbeJob):
     fuzz_build_info = get_fuzz_build_info(job.content)
     dispatch_single_op_compile_task(job.source_id, job.id, l1_size, op_module_name, op_type, op_func_name,
                                     op_kernel_name, unknown_shape, (inputs, outputs, attrs, options), int64_mode,
-                                    None, None, dynamic_compile_static, op_pattern, json.dumps(fuzz_build_info),
-                                    job.rl_tune_switch, job.rl_tune_list, job.pass_list, job.op_tune_switch,
-                                    job.op_tune_list)
+                                    None, None, dynamic_compile_static, unknown_shape, op_pattern,
+                                    json.dumps(fuzz_build_info), job.rl_tune_switch, job.rl_tune_list, job.pass_list,
+                                    job.op_tune_switch, job.op_tune_list)
     return True
 
 
@@ -487,7 +487,7 @@ def rl_tune_single_op(job: TbeJob):
     inputs, outputs, attrs = assemble_op_args(compute_op_info)
     op_type = compute_op_info["type"]
     l1_size = job.content["l1_size"]
-    op_module_name = compute_op_info["module_name"]
+    op_module_name = get_module_name(compute_op_info)
     op_kernel_name = compute_op_info["op_name"]
     full_name = compute_op_info["name"]
     py_module_path = compute_op_info["py_module_path"]
@@ -503,7 +503,7 @@ def rl_tune_single_op(job: TbeJob):
     device_id = job.content["SocInfo"]["deviceId"]
     try:
         build_single_op_from_c(op_module_name, op_func_name, op_type, "build", unknown_shape,
-                               (inputs, outputs, attrs), int64_mode, dynamic_compile_static, op_pattern,
+                               (inputs, outputs, attrs), int64_mode, dynamic_compile_static, unknown_shape, op_pattern,
                                auto_tiling_mode, device_id, json.dumps(fuzz_build_info))
     # pylint: disable=broad-except
     except Exception:
@@ -547,7 +547,7 @@ def rl_tune_fusion_op(job: TbeJob):
     compute_op_list = get_compute_op_list(job.content)
     op_module_names_str = ""
     for op in compute_op_list:
-        op_module_names_str = op_module_names_str + "," + op["module_name"]
+        op_module_names_str = op_module_names_str + "," + get_module_name(op)
     op_module_names_str = op_module_names_str[1:]
     from schedule_search.rl_online_tune import dispatch_fusion_tune_task
     res = dispatch_fusion_tune_task(job.source_id, job.id, l1_size, base_kernel, op_kernel_name, op_module_names_str,
diff --git a/mindspore/_extends/parallel_compile/tbe_compiler/tbe_helper.py b/mindspore/_extends/parallel_compile/tbe_compiler/tbe_helper.py
index 015c67e7806..806051f9eb0 100644
--- a/mindspore/_extends/parallel_compile/tbe_compiler/tbe_helper.py
+++ b/mindspore/_extends/parallel_compile/tbe_compiler/tbe_helper.py
@@ -179,8 +179,6 @@ def get_options_info(job_content):
     options["op_debug_level"] = job_content["SocInfo"]["op_debug_level"]
     options["op_impl_mode"] = job_content["SocInfo"]["op_impl_mode"]
     options["op_debug_dir"] = job_content["SocInfo"]["op_debug_dir"]
-    options["op_compiler_cache_dir"] = job_content["SocInfo"]["op_compiler_cache_dir"]
-    options["op_compiler_cache_mode"] = job_content["SocInfo"]["op_compiler_cache_mode"]
     options["mdl_bank_path"] = job_content["SocInfo"]["op_debug_level"]
     options["op_bank_path"] = job_content["SocInfo"]["op_bank_path"]
     options["deviceId"] = job_content["SocInfo"]["deviceId"]
@@ -220,6 +218,19 @@ def get_func_names(job_content):
     return func_names
 
 
+def get_module_name(compute_op_info):
+    """
+    get compute_op_info
+    :param compute_op_info:
+    :return:
+    """
+    unknown_shape = compute_op_info["unknown_shape"]
+    op_module_name = compute_op_info["module_name"]
+    if unknown_shape:
+        op_module_name = op_module_name.split(".")[0] + ".dynamic." + op_module_name.split(".")[-1]
+    return op_module_name
+
+
 def adjust_custom_op_info(compute_op_info):
     """
     adjust custom op info
diff --git a/mindspore/_extends/parallel_compile/tbe_compiler/tbe_job.py b/mindspore/_extends/parallel_compile/tbe_compiler/tbe_job.py
index 8100257dbd5..ce609d06147 100644
--- a/mindspore/_extends/parallel_compile/tbe_compiler/tbe_job.py
+++ b/mindspore/_extends/parallel_compile/tbe_compiler/tbe_job.py
@@ -71,12 +71,13 @@ def _get_message(msg, args):
 class TbeJob:
     """ Tbe compilation job """
 
-    def __init__(self, source_id, job_id, job_type, content, json_str, sys_info):
+    def __init__(self, source_id, job_id, job_type, content, fusion_op_name, json_str, sys_info):
         self.source_id = source_id
         self.id = job_id
         self.type = JobType(job_type)
         self.status = JobStatus.JOB_INITIAL
         self.content = content
+        self.fusion_op_name = fusion_op_name
         self.result = ""
         self.process_info = []
         self.json_string = json_str
@@ -149,8 +150,8 @@ class TbeJob:
         result["source_id"] = self.source_id
         result["job_id"] = self.id
         result["job_type"] = self.type.value
+        result["fusion_op_name"] = self.fusion_op_name
         result["result"] = self.result
-        self.debug("Resp result:{}".format(json.dumps(result)))
         process_info = []
         for info in self.process_info:
             msg = {"index": info.index, "level": info.level.value, "message": info.info}
diff --git a/mindspore/_extends/parallel_compile/tbe_compiler/tbe_job_manager.py b/mindspore/_extends/parallel_compile/tbe_compiler/tbe_job_manager.py
index 9b1a2a9342d..e2e6e7895a8 100644
--- a/mindspore/_extends/parallel_compile/tbe_compiler/tbe_job_manager.py
+++ b/mindspore/_extends/parallel_compile/tbe_compiler/tbe_job_manager.py
@@ -102,8 +102,9 @@ class TbeJobManager:
             source_id = job_json["source_id"]
             job_type = job_json["job_type"]
             sys_info = self._get_job_sys_info()
-            job = TbeJob(source_id, job_id, job_type, job_json["job_content"], job_str, sys_info)
-            job.debug("Req job string: {}".format(job_str))
+            fusion_op_name = "NA" if "fusion_op_name" not in job_json["job_content"] else job_json["job_content"][
+                "fusion_op_name"]
+            job = TbeJob(source_id, job_id, job_type, job_json["job_content"], fusion_op_name, job_str, sys_info)
             post_job(self._all_jobs, job)
             if not self.tbe_initialize and job.type != JobType.INITIALIZE_JOB:
                 job.error(
@@ -115,6 +116,7 @@ class TbeJobManager:
             return res
         # pylint: disable=broad-except
         except Exception:
+            # pylint: disable=no-value-for-parameter
             sys_info = self._get_job_sys_info()
             job = TbeJob(-1, -1, "", None, job_str, sys_info) if job is None else job
             job.status = JobStatus.JOB_FAILED
@@ -261,9 +263,6 @@ class TbeJobManager:
             return self.add_to_finished_jobs(query_job, JobStatus.JOB_SUCCESS)
         target_job = get_job(self._running_jobs, target_source_id, target_job_id)
         if target_job:
-            query_job.debug("Found job in Running jobs, source_id:{}, job_id:{}".format(target_source_id,
-                                                                                        target_job_id))
-            target_job.debug("Be Queried")
             query_job.result = target_job.get_result()
             return self.add_to_finished_jobs(query_job, JobStatus.JOB_SUCCESS)
         target_job = get_job(self._all_jobs, target_source_id, target_job_id)
diff --git a/mindspore/_extends/remote/kernel_build_server.py b/mindspore/_extends/remote/kernel_build_server.py
index da042e95a28..72f589f385f 100644
--- a/mindspore/_extends/remote/kernel_build_server.py
+++ b/mindspore/_extends/remote/kernel_build_server.py
@@ -16,7 +16,6 @@
 import os
 from mindspore import log as logger
 from mindspore._extends.parallel_compile.akg_compiler.akg_process import create_akg_parallel_process
-from mindspore._extends.parallel_compile.akg_compiler.compiler import run_compiler as akg_compile_single
 
 
 class Messager:
@@ -146,9 +145,7 @@ class AkgBuilder():
 
     def handle(self, messager, arg):
         """Handle message about akg"""
-        if arg == 'AKG/PID':
-            messager.send_res(os.getpid())
-        elif arg == 'AKG/START':
+        if arg == 'AKG/START':
             messager.send_ack()
             process_num_str = messager.get_message()
             messager.send_ack()
@@ -173,17 +170,8 @@ class AkgBuilder():
                 else:
                     messager.send_ack(False)
                     break
-        elif arg == 'AKG/COMPILE':
-            messager.send_ack()
-            json = messager.get_message()
-            try:
-                akg_compile_single(json, self.attrs)
-            except ValueError:
-                messager.send_ack(False)
-                messager.exit()
-            finally:
-                pass
-            messager.send_ack()
+        else:
+            raise RuntimeError("Unknown message type: %s" % arg)
 
 
 def get_logger():
diff --git a/mindspore/ccsrc/CMakeLists.txt b/mindspore/ccsrc/CMakeLists.txt
index 444d08a5edd..33ebdc3887f 100644
--- a/mindspore/ccsrc/CMakeLists.txt
+++ b/mindspore/ccsrc/CMakeLists.txt
@@ -297,20 +297,14 @@ if(MODE_ASCEND_ALL)
                 ${ASCEND_DRIVER_BACK_PATH})
     find_library(DATATRANSFER datatransfer HINTS ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}
                 ${ASCEND_DRIVER_BACK_PATH})
-    find_library(PROFILING msprofiler_fwkacl ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
+    find_library(PROFILING msprofiler ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
     find_library(ACL ascendcl ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
     find_library(PLATFORM platform ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
     find_library(OPTILING optiling ${ASCEND_OPP_PATH} ${ASCEND_TOOLKIT_OPP_PATH})
     find_library(OPT_FEATURE opt_feature ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
 
-    add_library(ms_profile SHARED
-                ${CMAKE_CURRENT_SOURCE_DIR}/runtime/device/ascend/profiling/profiling_callback_register.cc)
-    set_target_properties(ms_profile PROPERTIES LINKER_LANGUAGE CXX)
-    target_link_options(ms_profile PRIVATE -Wl,-init,common_log_init)
-    target_link_libraries(ms_profile -Wl,--start-group -Wl,--whole-archive ${PROFILING} -Wl,--no-whole-archive
-                          mindspore::protobuf -Wl,--end-group)
     target_link_libraries(mindspore ${RUNTIME_LIB} ${TSDCLIENT} ${DATATRANSFER} ${ERROR_MANAGER} -Wl,--no-as-needed
-      ${OPTILING} ${PLATFORM} ${ACL} ${OPT_FEATURE})
+      ${OPTILING} ${PLATFORM} ${ACL} ${OPT_FEATURE} ${PROFILING})
     target_link_libraries(mindspore -Wl,--start-group proto_input mindspore::protobuf -Wl,--end-group)
 elseif(CMAKE_SYSTEM_NAME MATCHES "Windows")
     target_link_libraries(mindspore -Wl,--start-group proto_input mindspore::protobuf mindspore::sentencepiece
@@ -325,7 +319,7 @@ endif()
 set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
 set_property(SOURCE "pipeline/jit/init.cc" PROPERTY
             COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_PIPELINE)
-pybind11_add_module(_c_expression NO_EXTRAS "pipeline/jit/init.cc")
+pybind11_add_module(_c_expression NO_EXTRAS "pipeline/jit/init.cc" NO_EXTRAS)
 
 MESSAGE(STATUS "operation system is ${CMAKE_SYSTEM}")
 if(CMAKE_SYSTEM_NAME MATCHES "Linux")
@@ -375,9 +369,6 @@ else()
         proto_input -Wl,--no-whole-archive)
     target_link_libraries(_c_expression PRIVATE mindspore::pybind11_module)
     target_link_libraries(_c_expression PRIVATE mindspore_gvar)
-    if(MODE_ASCEND_ALL)
-        target_link_libraries(_c_expression PRIVATE -Wl,--no-as-needed ms_profile)
-    endif()
 endif()
 
 if(USE_GLOG)
diff --git a/mindspore/ccsrc/backend/kernel_compiler/CMakeLists.txt b/mindspore/ccsrc/backend/kernel_compiler/CMakeLists.txt
index 954402e5c9e..d2174ece35e 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/CMakeLists.txt
+++ b/mindspore/ccsrc/backend/kernel_compiler/CMakeLists.txt
@@ -36,6 +36,7 @@ if(ENABLE_CPU)
         "cpu/ps/*.cc"
         "cpu/quantum/*.cc"
         "cpu/pyfunc/*.cc"
+        "cpu/rl/*.cc"
     )
 
     if(NOT ENABLE_MPI)
@@ -84,6 +85,7 @@ if(NOT ENABLE_CPU OR WIN32)
     list(REMOVE_ITEM CPU_SRC_LIST "cpu/fl/get_model_kernel.cc")
     list(REMOVE_ITEM CPU_SRC_LIST "cpu/fl/start_fl_job_kernel.cc")
     list(REMOVE_ITEM CPU_SRC_LIST "cpu/fl/update_model_kernel.cc")
+    list(REMOVE_ITEM CPU_SRC_LIST "cpu/fl/push_metrics_kernel.cc")
 endif()
 
 if(ENABLE_GPU)
diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.cc b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.cc
index 500be4de4ad..0d53e84abbd 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.cc
@@ -197,17 +197,37 @@ int32_t AkgKernelPool::Init(const std::vector<JsonNodePair> &build_args) {
 }
 
 AkgKernelPool::~AkgKernelPool() {
-  // Detach shared memory
-  auto ret = shmdt(reinterpret_cast<void *>(kernel_lists_[0]));
-  if (ret < 0) {
-    MS_LOG(EXCEPTION) << "Shared_mem detach failed, errno:" << strerror(errno);
-  }
+  {
+    LockMng lock(fd_);
+    if (!lock.locked_) {
+      MS_LOG(EXCEPTION) << "Failed to acquire lock.";
+    }
 
-  // Realse shared_memroy
-  if (is_creator_) {
-    ret = shmctl(shm_id_, IPC_RMID, nullptr);
+    struct shmid_ds buf;
+    auto ret = shmctl(shm_id_, IPC_STAT, &buf);
+    if (ret == -1) {
+      MS_LOG(EXCEPTION) << "Failed to get the info of shared memory, errno:" << strerror(errno);
+    }
+
+    bool need_delete_by_last = false;
+
+    // if the creator exits unexpectedly and fails to delete the shm, the last process will try to delete the shm
+    if (((buf.shm_perm.mode & SHM_DEST) == 0) && (buf.shm_nattch == 1)) {
+      need_delete_by_last = true;
+    }
+
+    // Detach shared memory
+    ret = shmdt(reinterpret_cast<void *>(kernel_lists_[0]));
     if (ret < 0) {
-      MS_LOG(EXCEPTION) << "Realse shared_mem failed, errno:" << strerror(errno);
+      MS_LOG(EXCEPTION) << "Shared_mem detach failed, errno:" << strerror(errno);
+    }
+
+    // Realse shared_memroy
+    if (is_creator_ || need_delete_by_last) {
+      ret = shmctl(shm_id_, IPC_RMID, nullptr);
+      if (ret < 0) {
+        MS_LOG(EXCEPTION) << "Realse shared_mem failed, errno:" << strerror(errno);
+      }
     }
   }
 
@@ -354,35 +374,6 @@ int32_t AkgKernelPool::Wait() {
   return -1;
 }
 
-std::vector<std::string> AkgKernelBuilder::GetNotCachedKernelJsons(const std::vector<JsonNodePair> &build_args) {
-  // Remove cached nodes, gether unique nodes, and collect repeated nodes which need postprecess.
-  std::vector<std::string> jsons;
-  std::unordered_set<std::string> kernel_name_set;
-  for (const auto &[json_generator, anf_node] : build_args) {
-    MS_EXCEPTION_IF_NULL(anf_node);
-    auto kernel_name = json_generator.kernel_name();
-    MS_LOG(DEBUG) << "Akg start compile op: " << kernel_name;
-
-    auto cached_kernel_pack = AkgSearchCache(kernel_name);
-    if (cached_kernel_pack != nullptr) {
-      MS_LOG(DEBUG) << "Use cached kernel, kernel_name[" << kernel_name << "], fullname_with_scope["
-                    << anf_node->fullname_with_scope() << "].";
-      AkgSetKernelMod(cached_kernel_pack, json_generator, anf_node);
-      continue;
-    }
-
-    if (kernel_name_set.count(kernel_name) != 0) {
-      repeat_nodes_.push_back({json_generator, anf_node});
-      continue;
-    }
-    kernel_name_set.insert(kernel_name);
-    auto kernel_json = json_generator.kernel_json_str();
-    AkgSaveJsonInfo(kernel_name, kernel_json);
-    jsons.push_back(kernel_json);
-  }
-  return jsons;
-}
-
 std::vector<JsonNodePair> AkgKernelBuilder::GetNotCachedKernels(const std::vector<JsonNodePair> &build_args) {
   std::unordered_set<std::string> kernel_name_set;
   std::vector<JsonNodePair> new_build_args;
@@ -432,8 +423,8 @@ bool AkgKernelBuilder::HandleRepeatNodes() {
                     << anf_node->fullname_with_scope() << "].";
       return false;
     }
-    MS_LOG(INFO) << "Use just compiled kernel, kernel_name[" << kernel_name << "], fullname_with_scope["
-                 << anf_node->fullname_with_scope() << "].";
+    MS_LOG(DEBUG) << "Use just compiled kernel, kernel_name[" << kernel_name << "], fullname_with_scope["
+                  << anf_node->fullname_with_scope() << "].";
     AkgSetKernelMod(cached_kernel_pack, json_generator, anf_node);
   }
   return true;
@@ -555,7 +546,7 @@ bool AkgKernelBuilder::AkgKernelParallelBuild(const std::vector<AnfNodePtr> &anf
   }
 
   if (json_and_node.empty()) {
-    MS_LOG(DEBUG) << "There is no kernel needed to be compiled.";
+    MS_LOG(INFO) << "There is no akg kernel to be compiled.";
     return true;
   }
 
diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.h b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.h
index 9f9958f1464..615687f0ae7 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.h
@@ -47,7 +47,6 @@ class AkgKernelBuilder {
   bool AkgKernelParallelBuild(const std::vector<AnfNodePtr> &anf_nodes);
 
  private:
-  std::vector<std::string> GetNotCachedKernelJsons(const std::vector<JsonNodePair> &build_args);
   std::vector<JsonNodePair> GetNotCachedKernels(const std::vector<JsonNodePair> &build_args);
   std::vector<std::string> GetKernelJsonsByHashId(const std::vector<JsonNodePair> &build_args,
                                                   std::set<size_t> fetched_ids);
@@ -91,7 +90,6 @@ class AkgKernelPool {
   int32_t UpdateAndWait(const std::set<size_t> &ids);
 
   constexpr inline static size_t kMaxKernelNum_{1000};
-  constexpr inline static key_t kSharedMemKey_{0x57565845};
 
   // allocate memory for todo_list, doing_list, done_list
   constexpr inline static size_t kListNum_{3};
diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_json_decoder.cc b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_json_decoder.cc
index 9c20203eeb7..9b7bf47b2a7 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_json_decoder.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_json_decoder.cc
@@ -15,12 +15,6 @@
  */
 #include "backend/kernel_compiler/akg/akg_kernel_json_decoder.h"
 
-#include <algorithm>
-#include <memory>
-#include <sstream>
-#include <string>
-#include <map>
-#include <vector>
 #include "backend/kernel_compiler/akg/akg_kernel_json_generator.h"
 #include "backend/kernel_compiler/common_utils.h"
 #include "backend/session/anf_runtime_algorithm.h"
diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_json_generator.cc b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_json_generator.cc
index 39b2445bbfd..8314467b302 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_json_generator.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_json_generator.cc
@@ -16,12 +16,6 @@
 
 #include "backend/kernel_compiler/akg/akg_kernel_json_generator.h"
 
-#include <algorithm>
-#include <functional>
-#include <map>
-#include <set>
-#include <sstream>
-#include <tuple>
 #if ENABLE_GPU
 #include <cuda.h>
 #endif
diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_metadata.cc b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_metadata.cc
index f3567428d35..22243fcf9db 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_metadata.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_metadata.cc
@@ -15,7 +15,6 @@
  */
 
 #include "backend/kernel_compiler/akg/akg_kernel_metadata.h"
-#include <memory>
 #include "backend/session/anf_runtime_algorithm.h"
 #include "backend/kernel_compiler/oplib/oplib.h"
 #include "backend/kernel_compiler/common_utils.h"
diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/ascend/akg_ascend_kernel_build.cc b/mindspore/ccsrc/backend/kernel_compiler/akg/ascend/akg_ascend_kernel_build.cc
index 4f0b619848c..c5c39589ff9 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/akg/ascend/akg_ascend_kernel_build.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/akg/ascend/akg_ascend_kernel_build.cc
@@ -16,13 +16,6 @@
 
 #include "backend/kernel_compiler/akg/ascend/akg_ascend_kernel_build.h"
 
-#include <algorithm>
-#include <map>
-#include <memory>
-#include <string>
-#include <unordered_set>
-#include <utility>
-#include <vector>
 #include "ir/dtype.h"
 #include "ir/func_graph.h"
 #include "backend/kernel_compiler/common_utils.h"
@@ -34,11 +27,11 @@
 namespace mindspore {
 namespace kernel {
 KernelPackPtr AkgAscendKernelBuilder::AkgSearchCache(const std::string &kernel_name) {
-  return tbe::TbeUtils::SearchCache(kernel_name, kProcessorAiCore);
+  return tbe::TbeUtils::SearchCache(kernel_name, true);
 }
 
 KernelPackPtr AkgAscendKernelBuilder::AkgInsertCache(const std::string &kernel_name) {
-  return tbe::TbeUtils::InsertCache(kernel_name, kProcessorAiCore);
+  return tbe::TbeUtils::InsertCache(kernel_name, kProcessorAiCore, true);
 }
 
 void AkgAscendKernelBuilder::AkgSetKernelMod(const KernelPackPtr &kernel_pack,
diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/gpu/akg_gpu_kernel_build.cc b/mindspore/ccsrc/backend/kernel_compiler/akg/gpu/akg_gpu_kernel_build.cc
index 47d5c0f31ba..32539661829 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/akg/gpu/akg_gpu_kernel_build.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/akg/gpu/akg_gpu_kernel_build.cc
@@ -49,6 +49,5 @@ void AkgGpuKernelBuilder::AkgSetKernelMod(const KernelPackPtr &kernel_pack,
 void AkgGpuKernelBuilder::AkgSaveJsonInfo(const string &kernel_name, const string &kernel_json) {
   kernel::SaveJsonInfo(kernel_name, kernel_json, kernel::KernelMeta::GetInstance()->kernel_meta_path());
 }
-
 }  // namespace kernel
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/gpu/akg_gpu_kernel_mod.cc b/mindspore/ccsrc/backend/kernel_compiler/akg/gpu/akg_gpu_kernel_mod.cc
index 0971bdcf42b..4ed0d553340 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/akg/gpu/akg_gpu_kernel_mod.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/akg/gpu/akg_gpu_kernel_mod.cc
@@ -15,8 +15,7 @@
  */
 
 #include "backend/kernel_compiler/akg/gpu/akg_gpu_kernel_mod.h"
-#include <fstream>
-#include <algorithm>
+
 #include "nlohmann/json.hpp"
 #include "utils/ms_utils.h"
 
@@ -126,7 +125,7 @@ bool GpuKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vect
                        [](const AddressPtr &output) -> void * { return reinterpret_cast<void *>(&(output->addr)); });
   if (!workspace.empty()) {
     (void)std::transform(std::begin(workspace), std::end(workspace), std::back_inserter(runtimeargs),
-                         [](const AddressPtr &addr) -> void * { return addr->addr; });
+                         [](const AddressPtr &addr) -> void * { return reinterpret_cast<void *>(&(addr->addr)); });
   }
   result = cuLaunchKernel(kernel_addr, thread_info[0], thread_info[1], thread_info[2], thread_info[3], thread_info[4],
                           thread_info[5], 0, reinterpret_cast<CUstream>(stream_ptr),
diff --git a/mindspore/ccsrc/backend/kernel_compiler/common_utils.cc b/mindspore/ccsrc/backend/kernel_compiler/common_utils.cc
index b9124449dd8..ee0c753c409 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/common_utils.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/common_utils.cc
@@ -970,5 +970,39 @@ size_t CalOffset(const std::vector<int64_t> &start, const std::vector<int64_t> &
   }
   return offset;
 }
+
+size_t UnitSizeInBytes(const mindspore::TypeId &t) {
+  size_t bytes = 0;
+  switch (t) {
+    case kNumberTypeBool:
+    case kNumberTypeInt8:
+    case kNumberTypeUInt8:
+      bytes = sizeof(int8_t);
+      break;
+    case kNumberTypeInt16:
+    case kNumberTypeUInt16:
+    case kNumberTypeFloat16:
+      bytes = sizeof(int16_t);
+      break;
+    case kNumberTypeInt:
+    case kNumberTypeUInt:
+    case kNumberTypeInt32:
+    case kNumberTypeUInt32:
+    case kNumberTypeFloat:
+    case kNumberTypeFloat32:
+      bytes = sizeof(int32_t);
+      break;
+    case kNumberTypeUInt64:
+    case kNumberTypeInt64:
+    case kNumberTypeFloat64:
+      bytes = sizeof(int64_t);
+      break;
+    default:
+      MS_LOG(EXCEPTION) << "Invalid types " << t;
+      break;
+  }
+
+  return bytes;
+}
 }  // namespace kernel
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/common_utils.h b/mindspore/ccsrc/backend/kernel_compiler/common_utils.h
index 507517954bd..7ad2cade9dc 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/common_utils.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/common_utils.h
@@ -143,6 +143,7 @@ size_t CalOffset(const std::vector<int64_t> &start, const std::vector<int64_t> &
 std::vector<int64_t> CalDimOffset(const std::vector<int64_t> &input_shape);
 size_t GetCopySize(const std::vector<int64_t> &dim_offset, const std::vector<int64_t> &start,
                    const std::vector<int64_t> &stop);
+size_t UnitSizeInBytes(const mindspore::TypeId &t);
 }  // namespace kernel
 }  // namespace mindspore
 
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/adam_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/adam_cpu_kernel.cc
index b2a851136e5..76e3b9de885 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/adam_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/adam_cpu_kernel.cc
@@ -83,7 +83,7 @@ void AdamCPUKernel::LaunchAdamNnacl(const std::vector<kernel::AddressPtr> &input
       MS_LOG(EXCEPTION) << "AdamFp32 failed.";
     }
   };
-  CPUKernelUtils::ParallelForAutoSearch(task, lens, &parallel_search_info_);
+  ParallelLaunchAutoSearch(task, lens, this, &parallel_search_info_);
 }
 
 void AdamCPUKernel::InitKernel(const CNodePtr &kernel_node) {
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_cpu_kernel.cc
index 2ee4d031018..1fcc52d078f 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_cpu_kernel.cc
@@ -19,6 +19,7 @@
 #include "runtime/device/cpu/cpu_device_address.h"
 #include "nnacl/fp32/power_fp32.h"
 #include "nnacl/fp32/sub_fp32.h"
+#include "nnacl/fp32/mul_fp32.h"
 
 namespace mindspore {
 namespace kernel {
@@ -54,7 +55,7 @@ void ArithmeticCPUKernel<T>::Sub(const T *input1, const T *input2, T *out) {
       auto task = [&](size_t start, size_t end) {
         ElementSub(input1 + start, input2 + start, out + start, end - start);
       };
-      CPUKernelUtils::ParallelFor(task, output_size_, MAX_SUB_SERIAL_SIZE);
+      ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
       return;
     }
     if (op_para.in_elements_num0_ == 1 || op_para.in_elements_num1_ == 1) {
@@ -65,7 +66,7 @@ void ArithmeticCPUKernel<T>::Sub(const T *input1, const T *input2, T *out) {
           ElementOptSub(input1 + start, input2, out + start, end - start, &op_para);
         }
       };
-      CPUKernelUtils::ParallelFor(task, output_size_, MAX_SUB_SERIAL_SIZE);
+      ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
       return;
     }
   }
@@ -84,6 +85,26 @@ void ArithmeticCPUKernel<T>::Sub(const T *input1, const T *input2, T *out) {
 
 template <typename T>
 void ArithmeticCPUKernel<T>::Mul(const T *input1, const T *input2, T *out) {
+  if constexpr (std::is_same_v<T, float>) {
+    if (input_shape1_ == input_shape2_) {
+      auto task = [&](size_t start, size_t end) {
+        ElementMul(input1 + start, input2 + start, out + start, end - start);
+      };
+      ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
+      return;
+    }
+    if (op_para.in_elements_num0_ == 1 || op_para.in_elements_num1_ == 1) {
+      auto task = [&](size_t start, size_t end) {
+        if (op_para.in_elements_num0_ == 1) {
+          ElementOptMul(input1, input2 + start, out + start, end - start, &op_para);
+        } else {
+          ElementOptMul(input1 + start, input2, out + start, end - start, &op_para);
+        }
+      };
+      ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
+      return;
+    }
+  }
   BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
   auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
     auto iter = base_iter;
@@ -128,21 +149,21 @@ void ArithmeticCPUKernel<T>::RealDiv(const T *input1, const T *input2, T *out) {
     auto task = [&](size_t start, size_t end) {
       ElementRealDiv<T>(input1 + start, input2 + start, out + start, end - start, 1, 1);
     };
-    CPUKernelUtils::ParallelFor(task, output_size_, MAX_DIV_SERIAL_SIZE);
+    ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
     return;
   }
   if (op_para.in_elements_num0_ == 1) {
     auto task = [&](size_t start, size_t end) {
       ElementRealDiv<T>(input1, input2 + start, out + start, end - start, 0, 1);
     };
-    CPUKernelUtils::ParallelFor(task, output_size_, MAX_DIV_SERIAL_SIZE);
+    ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
     return;
   }
   if (op_para.in_elements_num1_ == 1) {
     auto task = [&](size_t start, size_t end) {
       ElementRealDiv<T>(input1 + start, input2, out + start, end - start, 1, 0);
     };
-    CPUKernelUtils::ParallelFor(task, output_size_, MAX_DIV_SERIAL_SIZE);
+    ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
     return;
   }
 
@@ -339,7 +360,7 @@ void ArithmeticCPUKernel<T>::SquaredDifference(const T *input1, const T *input2,
       iter.GenNextPos();
     }
   };
-  CPUKernelUtils::ParallelFor(task, output_size_);
+  ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
 }
 
 template <typename T>
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_cpu_kernel.h
index ab67b3134d8..61d4172da08 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_cpu_kernel.h
@@ -77,6 +77,8 @@ MS_REG_CPU_KERNEL_T(RealDiv, KernelAttr(), ArithmeticCPUKernel, int64_t);
 MS_REG_CPU_KERNEL_T(Div, KernelAttr(), ArithmeticCPUKernel, int32_t);
 MS_REG_CPU_KERNEL_T(Div, KernelAttr(), ArithmeticCPUKernel, float);
 MS_REG_CPU_KERNEL_T(Div, KernelAttr(), ArithmeticCPUKernel, int64_t);
+MS_REG_CPU_KERNEL_T(Mul, KernelAttr(), ArithmeticCPUKernel, float);
+MS_REG_CPU_KERNEL_T(Mul, KernelAttr(), ArithmeticCPUKernel, int32_t);
 MS_REG_CPU_KERNEL_T(
   FloorDiv, KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
   ArithmeticCPUKernel, int64_t);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.cc
index 583a986dba0..4aa1e06122a 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.cc
@@ -20,6 +20,7 @@
 #include <map>
 #include "backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.h"
 #include "runtime/device/cpu/cpu_device_address.h"
+#include "nnacl/fp32/exp_fp32.h"
 
 namespace mindspore {
 namespace kernel {
@@ -31,7 +32,15 @@ void Square(const T *in, T *out, size_t size) {
       out[i] = in[i] * in[i];
     }
   };
-  CPUKernelUtils::ParallelFor(task, size, MAX_SQUARE_SERIAL_SIZE);
+  ParallelLaunch(task, size, MAX_SQUARE_SERIAL_SIZE);
+}
+
+template <typename T>
+void Exp(const T *in, T *out, size_t size) {
+  if constexpr (std::is_same_v<T, float>) {
+    auto task = [&in, &out](size_t start, size_t end) { ExpFp32(in + start, out + start, end - start); };
+    ParallelLaunch(task, size, MAX_EXP_SERIAL_SIZE);
+  }
 }
 
 template <typename T>
@@ -57,7 +66,7 @@ void Neg(const T *in, T *out, size_t size) {
       out[i] = -in[i];
     }
   };
-  CPUKernelUtils::ParallelFor(task, size, MAX_NEG_SERIAL_SIZE);
+  ParallelLaunch(task, size, MAX_NEG_SERIAL_SIZE);
 }
 
 template <typename T>
@@ -262,6 +271,7 @@ void Identity(const T *in, T *out, size_t size) {
 static const std::map<std::string, OperateType> kArithmeticOpTypeMap = {{prim::kPrimNeg->name(), NEG},
                                                                         {prim::kPrimSquare->name(), SQUARE},
                                                                         {prim::kPrimOnesLike->name(), ONESLIKE},
+                                                                        {prim::kPrimExp->name(), EXP},
                                                                         {prim::kPrimZerosLike->name(), ZEROSLIKE},
                                                                         {prim::kPrimLogicalNot->name(), LOGICALNOT},
                                                                         {prim::kPrimSign->name(), SIGN},
@@ -324,17 +334,29 @@ void ArithmeticSelfCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs
   T *output = reinterpret_cast<T *>(outputs[0]->addr);
   size_t lens = outputs[0]->size > 0 ? static_cast<size_t>(outputs[0]->size / sizeof(T)) : 1;
   static const std::map<OperateType, std::function<void(const T *in, T *out, size_t size)>> kArithmeticOpFuncMap = {
-    {SQUARE, Square<T>},     {SIGN, Sign<T>},
-    {NEG, Neg<T>},           {LOGICALNOT, LogicalNot<T>},
-    {ONESLIKE, OnesLike<T>}, {ZEROSLIKE, ZerosLike<T>},
-    {FLOOR, Floor<T>},       {RECIPROCAL, Reciprocal<T>},
-    {GELU, Gelu<T>},         {SIN, Sin<T>},
-    {COS, Cos<T>},           {TAN, Tan<T>},
-    {ASIN, Asin<T>},         {ACOS, ACos<T>},
-    {ATAN, Atan<T>},         {SINH, Sinh<T>},
-    {COSH, Cosh<T>},         {ASINH, Asinh<T>},
-    {ACOSH, Acosh<T>},       {ATANH, Atanh<T>},
-    {RINT, Rint<T>},         {ROUND, Round<T>}};
+    {SQUARE, Square<T>},
+    {SIGN, Sign<T>},
+    {NEG, Neg<T>},
+    {LOGICALNOT, LogicalNot<T>},
+    {ONESLIKE, OnesLike<T>},
+    {ZEROSLIKE, ZerosLike<T>},
+    {FLOOR, Floor<T>},
+    {RECIPROCAL, Reciprocal<T>},
+    {GELU, Gelu<T>},
+    {SIN, Sin<T>},
+    {COS, Cos<T>},
+    {TAN, Tan<T>},
+    {ASIN, Asin<T>},
+    {ACOS, ACos<T>},
+    {ATAN, Atan<T>},
+    {SINH, Sinh<T>},
+    {COSH, Cosh<T>},
+    {ASINH, Asinh<T>},
+    {ACOSH, Acosh<T>},
+    {ATANH, Atanh<T>},
+    {RINT, Rint<T>},
+    {ROUND, Round<T>},
+    {EXP, Exp<T>}};
   if (kArithmeticOpFuncMap.find(operate_type_) != kArithmeticOpFuncMap.end()) {
     kArithmeticOpFuncMap.at(operate_type_)(input, output, lens);
   } else {
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.h
index bdede9b4631..dc91f3d7608 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.h
@@ -20,8 +20,9 @@
 #include "backend/kernel_compiler/cpu/cpu_kernel.h"
 #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
 
-const float MAX_NEG_SERIAL_SIZE = 20000;
-const float MAX_SQUARE_SERIAL_SIZE = 20000;
+const float MAX_NEG_SERIAL_SIZE = 5000;
+const float MAX_SQUARE_SERIAL_SIZE = 5000;
+const float MAX_EXP_SERIAL_SIZE = 15000;
 
 namespace mindspore {
 namespace kernel {
@@ -58,6 +59,10 @@ class IdentityCPUKernel : public ArithmeticSelfCPUKernel {
 
 MS_REG_CPU_KERNEL(Square, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
                   ArithmeticSelfCPUKernel);
+MS_REG_CPU_KERNEL(Square, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
+                  ArithmeticSelfCPUKernel);
+MS_REG_CPU_KERNEL(Exp, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
+                  ArithmeticSelfCPUKernel);
 MS_REG_CPU_KERNEL(Neg, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
                   ArithmeticSelfCPUKernel);
 MS_REG_CPU_KERNEL(Neg, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/bias_add_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/bias_add_cpu_kernel.cc
index a5ac37a794a..7aed780948a 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/bias_add_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/bias_add_cpu_kernel.cc
@@ -90,7 +90,7 @@ bool BiasAddCPUKernel::Launch(const std::vector<AddressPtr> &inputs, const std::
         ElementAdd(src_addr + n_offset, bias_addr, output_addr + n_offset, input_shape_[1]);
       }
     };
-    CPUKernelUtils::ParallelForAutoSearch(task, input_shape_[0], &parallel_search_info_);
+    ParallelLaunchAutoSearch(task, input_shape_[0], this, &parallel_search_info_);
   }
   return true;
 }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/bias_add_grad_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/bias_add_grad_cpu_kernel.cc
index 95eedcb086a..5ab2aed2c72 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/bias_add_grad_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/bias_add_grad_cpu_kernel.cc
@@ -55,7 +55,7 @@ bool BiasAddGradCPUKernel::Launch(const std::vector<AddressPtr> &inputs, const s
     auto task = [&](size_t start, size_t end) {
       ReduceSumDim2Axis0(end - start, input_shape_[1], input_shape_[0], input_addr + start, output_addr + start);
     };
-    CPUKernelUtils::ParallelForAutoSearch(task, input_shape_[1], &parallel_search_info_);
+    ParallelLaunchAutoSearch(task, input_shape_[1], this, &parallel_search_info_);
   }
   return true;
 }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/concat_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/concat_cpu_kernel.cc
index cc7c6639b7c..e767d822e3f 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/concat_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/concat_cpu_kernel.cc
@@ -74,7 +74,7 @@ bool ConcatCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, c
       }
     }
   };
-  CPUKernelUtils::ParallelForAutoSearch(task, before_axis, &parallel_search_info_);
+  ParallelLaunchAutoSearch(task, before_axis, this, &parallel_search_info_);
   return true;
 }
 
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.cc
index fa6e4f36d67..70f7dffc68c 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.cc
@@ -138,6 +138,77 @@ void CPUKernelUtils::ParallelForAutoSearch(const CTask &task, size_t count, Para
   }
 }
 
+ActorThreadPool *GetActorMgrInnerThreadPool() {
+  auto actor_manager = ActorMgr::GetActorMgrRef();
+  auto thread_pool = actor_manager->GetActorThreadPool();
+  // Init thread_pool if env is windows or ascend, in case that it won't be init in graph_scheduler.
+  if (thread_pool == nullptr) {
+    const size_t kMaxThreadNum = 23;
+    size_t max_thread_num = std::thread::hardware_concurrency() - 1;
+    if (max_thread_num < 1) {
+      max_thread_num = 1;
+    }
+    max_thread_num = max_thread_num < kMaxThreadNum ? max_thread_num : kMaxThreadNum;
+    actor_manager->Initialize(true, 0, max_thread_num);
+    thread_pool = actor_manager->GetActorThreadPool();
+    MS_EXCEPTION_IF_NULL(thread_pool);
+  }
+  return thread_pool;
+}
+
+// Use threadpool of mindrt
+void ParallelLaunch(const CTask &task, size_t count, float block_size, Content content) {
+  auto thread_pool = GetActorMgrInnerThreadPool();
+  size_t kernel_thread_num = thread_pool->GetKernelThreadNum();
+  if (kernel_thread_num == 0) {
+    MS_LOG(EXCEPTION) << "Actor inner pool has been init, but kernel thread is 0!";
+  }
+
+  size_t thread_num = count < block_size * kernel_thread_num ? std::ceil(count / block_size) : kernel_thread_num;
+  size_t once_compute_size = (count + thread_num - 1) / thread_num;
+  size_t task_num = count / once_compute_size;
+  if (count % once_compute_size != 0) {
+    task_num += 1;
+  }
+  auto func = [&](void *, int task_id, float, float) {
+    size_t start = task_id * once_compute_size;
+    size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size);
+    task(start, end);
+    return common::SUCCESS;
+  };
+  thread_pool->ParallelLaunch(func, content, task_num);
+}
+
+void ParallelLaunchAutoSearch(const CTask &task, size_t count, Content content,
+                              ParallelSearchInfo *parallel_search_info) {
+  const size_t MAX_POW = 6;
+  const size_t AVG_COUNT = 5;
+  size_t current_pow = parallel_search_info->search_count / AVG_COUNT;
+  if (current_pow < MAX_POW) {
+    if (parallel_search_info->search_count % AVG_COUNT == 0) {
+      parallel_search_info->tmp_sum_cost_time = 0;
+    }
+    float block_size = static_cast<float>(count) / std::pow(2.0f, current_pow);
+    double start_time = GetTime();
+    ParallelLaunch(task, count, block_size, content);
+    double cost_time = GetTime() - start_time;
+    parallel_search_info->tmp_sum_cost_time += cost_time;
+    parallel_search_info->search_count++;
+    if (parallel_search_info->search_count % AVG_COUNT == 0) {
+      double avg_time = parallel_search_info->tmp_sum_cost_time / AVG_COUNT;
+      if (parallel_search_info->min_cost_time > avg_time) {
+        parallel_search_info->min_cost_time = avg_time;
+        parallel_search_info->best_block_size = block_size;
+        parallel_search_info->best_pow = current_pow;
+      } else if (current_pow - parallel_search_info->best_pow >= 2) {
+        parallel_search_info->search_count = AVG_COUNT * MAX_POW;
+      }
+    }
+  } else {
+    ParallelLaunch(task, count, parallel_search_info->best_block_size, content);
+  }
+}
+
 std::vector<size_t> CPUKernelUtils::FlatShapeByAxis(const std::vector<size_t> &shape, int axis) {
   if (axis < 0) {
     axis = axis + SizeToInt(shape.size());
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.h
index b85568f505e..c3bd29f7e65 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.h
@@ -25,6 +25,8 @@
 #include "backend/session/anf_runtime_algorithm.h"
 #include "backend/kernel_compiler/common_utils.h"
 #include "ir/anf.h"
+#include "runtime/framework/graph_scheduler.h"
+#include "actor/actormgr.h"
 
 using mindspore::kernel::Address;
 using mindspore::kernel::AddressPtr;
@@ -62,6 +64,7 @@ const char DELTA[] = "delta";
 const char SORTED[] = "sorted";
 const char ADJ_ST[] = "adjoint_st";
 const char ADJ_dT[] = "adjoint_dt";
+const char PERIODS[] = "periods";
 
 enum OperateType {
   ADD = 0,
@@ -119,6 +122,7 @@ enum OperateType {
   ATAN2,
   RINT,
   ROUND,
+  EXP,
   IDENTITY,
 };
 
@@ -152,6 +156,19 @@ class CPUKernel : public kernel::KernelMod {
   std::vector<size_t> output_size_list_;
   std::vector<size_t> workspace_size_list_;
   ParallelSearchInfo parallel_search_info_;
+
+  template <typename T>
+  inline T *GetDeviceAddress(const std::vector<AddressPtr> &addr_list, size_t index) {
+    if (index >= addr_list.size()) {
+      MS_LOG(EXCEPTION) << "Address index(" << index << ") out of range(" << addr_list.size() << ")";
+    }
+
+    if ((addr_list[index] == nullptr) || (addr_list[index]->addr == nullptr) || (addr_list[index]->size == 0)) {
+      MS_LOG(EXCEPTION) << "The device address is empty, address index: " << index;
+    }
+
+    return reinterpret_cast<T *>(addr_list[index]->addr);
+  }
 };
 
 class CPUKernelUtils {
@@ -209,6 +226,12 @@ class TransposeIterator {
   std::vector<size_t> axes_;
   size_t pos_{0};
 };
+
+ActorThreadPool *GetActorMgrInnerThreadPool();
+void ParallelLaunch(const CTask &task, size_t count, float block_size = 128.0, Content content = nullptr);
+void ParallelLaunchAutoSearch(const CTask &task, size_t count, Content content,
+                              ParallelSearchInfo *parallel_search_info);
+
 }  // namespace kernel
 }  // namespace mindspore
 
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/crop_and_resize_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/crop_and_resize_cpu_kernel.cc
index cc8d37147ec..ec0c396b2e8 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/crop_and_resize_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/crop_and_resize_cpu_kernel.cc
@@ -144,8 +144,7 @@ bool CropAndResizeCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &in
         const int bottom_y_index = ceilf(target_y);
         const int left_x_index = floorf(target_x);
         const int right_x_index = ceilf(target_x);
-        const float y_lerp = target_y - top_y_index;
-        const float x_lerp = target_x - left_x_index;
+
         const float top_left = static_cast<float>(
           input_image[((box_index * input_height_ + top_y_index) * input_width_ + left_x_index) * channel_ +
                       pos_channel]);
@@ -158,9 +157,9 @@ bool CropAndResizeCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &in
         const float bottom_right = static_cast<float>(
           input_image[((box_index * input_height_ + bottom_y_index) * input_width_ + right_x_index) * channel_ +
                       pos_channel]);
-        const float top = top_left + (top_right - top_left) * x_lerp;
-        const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
-        output[pos] = top + (bottom - top) * y_lerp;
+        const float top = top_left + (top_right - top_left) * (target_x - left_x_index);
+        const float bottom = bottom_left + (bottom_right - bottom_left) * (target_x - left_x_index);
+        output[pos] = top + (bottom - top) * (target_y - top_y_index);
       } else if (method_ == 3) {
         int y1h = static_cast<int>(y1 * input_height_);
         int x1w = static_cast<int>(x1 * input_width_);
@@ -170,36 +169,37 @@ bool CropAndResizeCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &in
         int h = ((y2h - y1h + 1) > 1) ? y2h - y1h + 1 : 1;
 
         float y_point = (pos_y + 0.5) * (h / static_cast<float>(final_height_)) - 0.5;
-        int top_y_index = floorf(y_point);
-        top_y_index = std::min(std::max(0, top_y_index), h - 1);
-
-        int bottom_y_index = ceilf(y_point);
-        bottom_y_index = std::min(std::max(0, bottom_y_index), h - 1);
+        int top_y_index = std::min(std::max(0, static_cast<int>(floorf(y_point))), h - 1);
+        int bottom_y_index = std::min(std::max(0, static_cast<int>(ceilf(y_point))), h - 1);
 
         float x_point = (pos_x + 0.5) * (w / static_cast<float>(final_width_)) - 0.5;
-        int left_x_index = floorf(x_point);
-        left_x_index = std::min(std::max(0, left_x_index), w - 1);
-
-        int right_x_index = ceilf(x_point);
-        right_x_index = std::min(std::max(0, right_x_index), w - 1);
+        int left_x_index = std::min(std::max(0, static_cast<int>(floorf(x_point))), w - 1);
+        int right_x_index = std::min(std::max(0, static_cast<int>(ceilf(x_point))), w - 1);
 
         const float y_lerp = y_point - top_y_index;
         const float x_lerp = x_point - left_x_index;
-        const int y_top_index = box_index * input_height_ + y1h + top_y_index;
-        const int y_bottom_index = box_index * input_height_ + y1h + bottom_y_index;
 
-        const float top_left =
-          static_cast<float>(input_image[(y_top_index * input_width_ + x1w + left_x_index) * channel_ + pos_channel]);
-        const float top_right =
-          static_cast<float>(input_image[(y_top_index * input_width_ + x1w + right_x_index) * channel_ + pos_channel]);
+        const int y_top_index = std::max(0, y1h + top_y_index);
+        const int y_bottom_index = std::max(0, y1h + bottom_y_index);
+        const int x_left_index = std::max(0, x1w + left_x_index);
+        const int x_right_index = std::max(0, x1w + right_x_index);
+
+        const float top_left = static_cast<float>(
+          input_image[((box_index * input_height_ + y_top_index) * input_width_ + x_left_index) * channel_ +
+                      pos_channel]);
+        const float top_right = static_cast<float>(
+          input_image[((box_index * input_height_ + y_top_index) * input_width_ + x_right_index) * channel_ +
+                      pos_channel]);
         const float bottom_left = static_cast<float>(
-          input_image[(y_bottom_index * input_width_ + x1w + left_x_index) * channel_ + pos_channel]);
+          input_image[((box_index * input_height_ + y_bottom_index) * input_width_ + x_left_index) * channel_ +
+                      pos_channel]);
         const float bottom_right = static_cast<float>(
-          input_image[(y_bottom_index * input_width_ + x1w + right_x_index) * channel_ + pos_channel]);
+          input_image[((box_index * input_height_ + y_bottom_index) * input_width_ + x_right_index) * channel_ +
+                      pos_channel]);
+
+        output[pos] = top_left * (1 - y_lerp) * (1 - x_lerp) + bottom_right * y_lerp * x_lerp +
+                      top_right * (1 - y_lerp) * x_lerp + bottom_left * y_lerp * (1 - x_lerp);
 
-        float ret = top_left * (1 - y_lerp) * (1 - x_lerp) + bottom_right * y_lerp * x_lerp +
-                    top_right * (1 - y_lerp) * x_lerp + bottom_left * y_lerp * (1 - x_lerp);
-        output[pos] = ret;
       } else {
         // Nearest Neighbour
         const int closest_x_index = roundf(target_x);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/crop_and_resize_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/crop_and_resize_cpu_kernel.h
index 62c43c35317..0b0e2bae110 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/crop_and_resize_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/crop_and_resize_cpu_kernel.h
@@ -35,15 +35,14 @@ class CropAndResizeCPUKernel : public CPUKernel {
               const std::vector<AddressPtr> &outputs) override;
 
  private:
-  int method_;
-  float extrapolation_value_;
-  int input_crop_size_;
-  int output_size_;
-  int input_height_;
-  int input_width_;
-  int final_height_;
-  int final_width_;
-  int channel_;
+  int method_{1};
+  float extrapolation_value_{0.0};
+  int output_size_{0};
+  int input_height_{0};
+  int input_width_{0};
+  int final_height_{0};
+  int final_width_{0};
+  int channel_{0};
 };
 
 MS_REG_CPU_KERNEL_T(CropAndResize,
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/eltwise_grad_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/eltwise_grad_cpu_kernel.cc
index 926d8e172ef..3a320c8263b 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/eltwise_grad_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/eltwise_grad_cpu_kernel.cc
@@ -259,9 +259,9 @@ bool EltWiseGradCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inpu
   const auto input1 = reinterpret_cast<T *>(inputs[1]->addr);
   auto output = reinterpret_cast<T *>(outputs[0]->addr);
 
-  CPUKernelUtils::ParallelForAutoSearch(
+  ParallelLaunchAutoSearch(
     std::bind(elt_map.at(kernel_name_), this, input0, input1, output, std::placeholders::_1, std::placeholders::_2),
-    outputs[0]->size / sizeof(T), &parallel_search_info_);
+    outputs[0]->size / sizeof(T), this, &parallel_search_info_);
   return true;
 }
 }  // namespace kernel
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/fl/fused_pull_weight_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/fl/fused_pull_weight_kernel.h
index 4548d728803..07505a2b248 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/fl/fused_pull_weight_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/fl/fused_pull_weight_kernel.h
@@ -30,7 +30,7 @@
 
 namespace mindspore {
 namespace kernel {
-// The duration between two downloading requests when return code is ResponseCode_SucNotReady.
+// The duration between two PullWeights requests when return code is ResponseCode_SucNotReady.
 constexpr int kRetryDurationOfPullWeights = 200;
 template <typename T>
 class FusedPullWeightKernel : public CPUKernel {
@@ -51,19 +51,17 @@ class FusedPullWeightKernel : public CPUKernel {
     MS_EXCEPTION_IF_NULL(fbb);
 
     total_iteration_++;
+    uint64_t step_num_per_iteration = fl::worker::FLWorker::GetInstance().worker_step_num_per_iteration();
     // The worker has to train kWorkerTrainStepNum standalone iterations before it communicates with server.
-    if (total_iteration_ % fl::worker::FLWorker::GetInstance().worker_step_num_per_iteration() !=
-        fl::kTrainBeginStepNum) {
+    MS_LOG(INFO) << "Try to pull weights. Local step number: " << total_iteration_
+                 << ", step number needs to run per iteration: " << step_num_per_iteration;
+    if (step_num_per_iteration != fl::kOneStepPerIteration &&
+        total_iteration_ % step_num_per_iteration != fl::kTrainBeginStepNum) {
       return true;
     }
 
     fl_iteration_++;
-    if (fl_iteration_ > ps::PSContext::instance()->fl_iteration_num()) {
-      MS_LOG(INFO) << ps::PSContext::instance()->fl_iteration_num() << " iterations are completed.";
-      fl_iteration_ = 1;
-    }
-
-    MS_LOG(INFO) << "Start pulling weight for federated learning iteration " << fl_iteration_;
+    MS_LOG(INFO) << "Launching pulling weight for federated learning iteration " << fl_iteration_;
     if (!BuildPullWeightReq(fbb)) {
       MS_LOG(EXCEPTION) << "Building request for FusedPullWeight failed.";
       return false;
@@ -73,11 +71,16 @@ class FusedPullWeightKernel : public CPUKernel {
     const schema::ResponsePullWeight *pull_weight_rsp = nullptr;
     int retcode = schema::ResponseCode_SucNotReady;
     while (retcode == schema::ResponseCode_SucNotReady) {
+      if (!fl::worker::FLWorker::GetInstance().running()) {
+        MS_LOG(WARNING) << "Worker has finished.";
+        return true;
+      }
       if (!fl::worker::FLWorker::GetInstance().SendToServer(
             0, fbb->GetBufferPointer(), fbb->GetSize(), ps::core::TcpUserCommand::kPullWeight, &pull_weight_rsp_msg)) {
-        MS_LOG(WARNING) << "Sending request for FusedPullWeight to server 0 failed. This iteration is dropped.";
-        fl::worker::FLWorker::GetInstance().SetIterationRunning();
-        return true;
+        MS_LOG(WARNING) << "Sending request for FusedPullWeight to server 0 failed. Retry later.";
+        retcode = schema::ResponseCode_SucNotReady;
+        std::this_thread::sleep_for(std::chrono::milliseconds(kRetryDurationOfPullWeights));
+        continue;
       }
       MS_EXCEPTION_IF_NULL(pull_weight_rsp_msg);
 
@@ -88,6 +91,8 @@ class FusedPullWeightKernel : public CPUKernel {
         fl_iteration_ = pull_weight_rsp->iteration();
         MS_LOG(DEBUG) << "Server is not ready for downloading yet. Reason: " << pull_weight_rsp->reason()->str()
                       << ". Retry later.";
+        // Recreate fbb to avoid memory leak of FlatBuffers.
+        fbb = std::make_shared<fl::FBBuilder>();
         if (!BuildPullWeightReq(fbb)) {
           MS_LOG(EXCEPTION) << "Building request for FusedDownloadWeightsByKeys failed.";
           return false;
@@ -116,7 +121,7 @@ class FusedPullWeightKernel : public CPUKernel {
         return false;
       }
     }
-    MS_LOG(INFO) << "Pull weights for " << weight_full_names_ << " succeed. Iteration: " << fl_iteration_;
+    MS_LOG(INFO) << "Pull weights for " << weight_full_names_ << " success. Iteration: " << fl_iteration_;
     fl::worker::FLWorker::GetInstance().SetIterationRunning();
     return true;
   }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/fl/fused_push_weight_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/fl/fused_push_weight_kernel.h
index e9590764c89..eb4175556ca 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/fl/fused_push_weight_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/fl/fused_push_weight_kernel.h
@@ -28,7 +28,7 @@
 
 namespace mindspore {
 namespace kernel {
-// The duration between two uploading requests when return code is ResponseCode_SucNotReady.
+// The duration between two PushWeights requests when return code is ResponseCode_SucNotReady.
 constexpr int kRetryDurationOfPushWeights = 200;
 template <typename T>
 class FusedPushWeightKernel : public CPUKernel {
@@ -49,19 +49,17 @@ class FusedPushWeightKernel : public CPUKernel {
     MS_EXCEPTION_IF_NULL(fbb);
 
     total_iteration_++;
+    uint64_t step_num_per_iteration = fl::worker::FLWorker::GetInstance().worker_step_num_per_iteration();
     // The worker has to train kWorkerTrainStepNum standalone iterations before it communicates with server.
-    if (total_iteration_ % fl::worker::FLWorker::GetInstance().worker_step_num_per_iteration() !=
-        fl::kTrainBeginStepNum) {
+    MS_LOG(INFO) << "Try to push weights. Local step number: " << total_iteration_
+                 << ", step number needs to run per iteration: " << step_num_per_iteration;
+    if (step_num_per_iteration != fl::kOneStepPerIteration &&
+        total_iteration_ % step_num_per_iteration != fl::kTrainEndStepNum) {
       return true;
     }
 
     fl_iteration_++;
-    if (fl_iteration_ > ps::PSContext::instance()->fl_iteration_num()) {
-      MS_LOG(INFO) << ps::PSContext::instance()->fl_iteration_num() << " iterations are completed.";
-      fl_iteration_ = 1;
-    }
-
-    MS_LOG(INFO) << "Start pushing weight for federated learning iteration " << fl_iteration_;
+    MS_LOG(INFO) << "Launching pushing weight for federated learning iteration " << fl_iteration_;
     if (!BuildPushWeightReq(fbb, inputs)) {
       MS_LOG(EXCEPTION) << "Building request for FusedPushWeight failed.";
       return false;
@@ -73,13 +71,17 @@ class FusedPushWeightKernel : public CPUKernel {
       const schema::ResponsePushWeight *push_weight_rsp = nullptr;
       int retcode = schema::ResponseCode_SucNotReady;
       while (retcode == schema::ResponseCode_SucNotReady) {
+        if (!fl::worker::FLWorker::GetInstance().running()) {
+          MS_LOG(WARNING) << "Worker has finished.";
+          return true;
+        }
         if (!fl::worker::FLWorker::GetInstance().SendToServer(i, fbb->GetBufferPointer(), fbb->GetSize(),
                                                               ps::core::TcpUserCommand::kPushWeight,
                                                               &push_weight_rsp_msg)) {
-          MS_LOG(WARNING) << "Sending request for FusedPushWeight to server " << i
-                          << " failed. This iteration is dropped.";
-          fl::worker::FLWorker::GetInstance().SetIterationCompleted();
-          return true;
+          MS_LOG(WARNING) << "Sending request for FusedPushWeight to server " << i << " failed.";
+          retcode = schema::ResponseCode_SucNotReady;
+          std::this_thread::sleep_for(std::chrono::milliseconds(kRetryDurationOfPushWeights));
+          continue;
         }
         MS_EXCEPTION_IF_NULL(push_weight_rsp_msg);
 
@@ -105,8 +107,7 @@ class FusedPushWeightKernel : public CPUKernel {
       }
     }
 
-    MS_LOG(INFO) << "Push weights for " << weight_full_names_ << " succeed. Iteration: " << fl_iteration_;
-    fl::worker::FLWorker::GetInstance().SetIterationCompleted();
+    MS_LOG(INFO) << "Push weights for " << weight_full_names_ << " success. Iteration: " << fl_iteration_;
     return true;
   }
 
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_cpu_kernel.h
index 0a0991ee613..c38ca6a2d6b 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_cpu_kernel.h
@@ -52,6 +52,26 @@ MS_REG_CPU_KERNEL_T(
   MaskedSelect,
   KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeInt32),
   MaskedSelectCPUKernel, int);
+
+MS_REG_CPU_KERNEL_T(
+  MaskedSelect,
+  KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeInt16),
+  MaskedSelectCPUKernel, int16_t);
+
+MS_REG_CPU_KERNEL_T(
+  MaskedSelect,
+  KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeInt64),
+  MaskedSelectCPUKernel, int64_t);
+
+MS_REG_CPU_KERNEL_T(
+  MaskedSelect,
+  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeFloat16),
+  MaskedSelectCPUKernel, float16);
+
+MS_REG_CPU_KERNEL_T(
+  MaskedSelect,
+  KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeFloat64),
+  MaskedSelectCPUKernel, double);
 }  // namespace kernel
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MASKED_SELECTED_CPU_KERNEL_H_
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_grad_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_grad_cpu_kernel.h
index 44fcdd4622f..80c0b37adfd 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_grad_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_grad_cpu_kernel.h
@@ -58,6 +58,38 @@ MS_REG_CPU_KERNEL_T(MaskedSelectGrad,
                       .AddInputAttr(kNumberTypeInt32)
                       .AddOutputAttr(kNumberTypeInt32),
                     MaskedSelectGradCPUKernel, int);
+
+MS_REG_CPU_KERNEL_T(MaskedSelectGrad,
+                    KernelAttr()
+                      .AddInputAttr(kNumberTypeFloat16)
+                      .AddInputAttr(kNumberTypeBool)
+                      .AddInputAttr(kNumberTypeFloat16)
+                      .AddOutputAttr(kNumberTypeFloat16),
+                    MaskedSelectGradCPUKernel, float16);
+
+MS_REG_CPU_KERNEL_T(MaskedSelectGrad,
+                    KernelAttr()
+                      .AddInputAttr(kNumberTypeFloat64)
+                      .AddInputAttr(kNumberTypeBool)
+                      .AddInputAttr(kNumberTypeFloat64)
+                      .AddOutputAttr(kNumberTypeFloat64),
+                    MaskedSelectGradCPUKernel, double);
+
+MS_REG_CPU_KERNEL_T(MaskedSelectGrad,
+                    KernelAttr()
+                      .AddInputAttr(kNumberTypeInt16)
+                      .AddInputAttr(kNumberTypeBool)
+                      .AddInputAttr(kNumberTypeInt16)
+                      .AddOutputAttr(kNumberTypeInt16),
+                    MaskedSelectGradCPUKernel, int16_t);
+
+MS_REG_CPU_KERNEL_T(MaskedSelectGrad,
+                    KernelAttr()
+                      .AddInputAttr(kNumberTypeInt64)
+                      .AddInputAttr(kNumberTypeBool)
+                      .AddInputAttr(kNumberTypeInt64)
+                      .AddOutputAttr(kNumberTypeInt64),
+                    MaskedSelectGradCPUKernel, int64_t);
 }  // namespace kernel
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MASKED_SELECTED_GRAD_CPU_KERNEL_H_
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/mirror_pad_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/mirror_pad_cpu_kernel.cc
index 622f3bb6dce..d54978d47f9 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/mirror_pad_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/mirror_pad_cpu_kernel.cc
@@ -86,6 +86,8 @@ bool MirrorPadCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, c
     LaunchKernel<float16>(inputs, outputs);
   } else if (dtype_ == kNumberTypeFloat32) {
     LaunchKernel<float>(inputs, outputs);
+  } else if (dtype_ == kNumberTypeFloat64) {
+    LaunchKernel<double>(inputs, outputs);
   } else if (dtype_ == kNumberTypeInt32) {
     LaunchKernel<int>(inputs, outputs);
   } else {
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/mirror_pad_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/mirror_pad_cpu_kernel.h
index c0a13bc6365..f586220d699 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/mirror_pad_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/mirror_pad_cpu_kernel.h
@@ -74,6 +74,11 @@ MS_REG_CPU_KERNEL(
   KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat32),
   MirrorPadCPUKernel);
 
+MS_REG_CPU_KERNEL(
+  MirrorPad,
+  KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat64),
+  MirrorPadCPUKernel);
+
 MS_REG_CPU_KERNEL(
   MirrorPad, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt32),
   MirrorPadCPUKernel);
@@ -88,6 +93,11 @@ MS_REG_CPU_KERNEL(
   KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32),
   MirrorPadCPUKernel);
 
+MS_REG_CPU_KERNEL(
+  MirrorPad,
+  KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat64),
+  MirrorPadCPUKernel);
+
 MS_REG_CPU_KERNEL(
   MirrorPad, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
   MirrorPadCPUKernel);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/mirror_pad_grad_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/mirror_pad_grad_cpu_kernel.cc
index 9b7d2665e99..0d0fdf8e068 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/mirror_pad_grad_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/mirror_pad_grad_cpu_kernel.cc
@@ -110,6 +110,8 @@ bool MirrorPadGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &input
     LaunchKernel<float16>(inputs, workspace, outputs);
   } else if (dtype_ == kNumberTypeFloat32) {
     LaunchKernel<float>(inputs, workspace, outputs);
+  } else if (dtype_ == kNumberTypeFloat64) {
+    LaunchKernel<double>(inputs, workspace, outputs);
   } else if (dtype_ == kNumberTypeInt32) {
     LaunchKernel<int>(inputs, workspace, outputs);
   } else {
@@ -130,6 +132,8 @@ void MirrorPadGradCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
     InitWorkspaceSize<float16>();
   } else if (dtype_ == kNumberTypeFloat32) {
     InitWorkspaceSize<float>();
+  } else if (dtype_ == kNumberTypeFloat64) {
+    InitWorkspaceSize<double>();
   } else if (dtype_ == kNumberTypeInt32) {
     InitWorkspaceSize<int>();
   }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/mirror_pad_grad_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/mirror_pad_grad_cpu_kernel.h
index 57eff40b55c..96fa1584a75 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/mirror_pad_grad_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/mirror_pad_grad_cpu_kernel.h
@@ -90,6 +90,11 @@ MS_REG_CPU_KERNEL(
   KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat32),
   MirrorPadGradCPUKernel);
 
+MS_REG_CPU_KERNEL(
+  MirrorPadGrad,
+  KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat64),
+  MirrorPadGradCPUKernel);
+
 MS_REG_CPU_KERNEL(
   MirrorPadGrad,
   KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt32),
@@ -105,6 +110,11 @@ MS_REG_CPU_KERNEL(
   KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32),
   MirrorPadGradCPUKernel);
 
+MS_REG_CPU_KERNEL(
+  MirrorPadGrad,
+  KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat64),
+  MirrorPadGradCPUKernel);
+
 MS_REG_CPU_KERNEL(
   MirrorPadGrad,
   KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/eltwise_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/eltwise_cpu_kernel.h
index cd695e2a9e6..e44638ca240 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/eltwise_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/eltwise_cpu_kernel.h
@@ -52,8 +52,6 @@ MS_REG_CPU_KERNEL(Sigmoid, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutp
                   EltWiseCPUKernel);
 MS_REG_CPU_KERNEL(Sqrt, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
                   EltWiseCPUKernel);
-MS_REG_CPU_KERNEL(Square, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
-                  EltWiseCPUKernel);
 MS_REG_CPU_KERNEL(Tanh, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
                   EltWiseCPUKernel);
 MS_REG_CPU_KERNEL(Softplus, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.cc
index e59303a646c..21ac41deb38 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.cc
@@ -111,22 +111,16 @@ bool MKLCPUKernel::BinaryBroadCast(std::vector<size_t> *src0_shape, std::vector<
 }
 
 dnnl::memory::format_tag MKLCPUKernel::GetDefaultFormatTag(const dnnl::memory::dims &dims) const {
-  dnnl::memory::format_tag mem_tag;
-  auto dim_size = dims.size();
-  if (dim_size == 5) {
-    mem_tag = dnnl::memory::format_tag::abcde;
-  } else if (dim_size == 4) {
-    mem_tag = dnnl::memory::format_tag::abcd;
-  } else if (dim_size == 3) {
-    mem_tag = dnnl::memory::format_tag::abc;
-  } else if (dim_size == 2) {
-    mem_tag = dnnl::memory::format_tag::ab;
-  } else if (dim_size == 1) {
-    mem_tag = dnnl::memory::format_tag::a;
-  } else {
-    MS_LOG(EXCEPTION) << "Kernel dims invalid " << dim_size;
+  static const std::vector<dnnl::memory::format_tag> tag_vec = {
+    dnnl::memory::format_tag::a,      dnnl::memory::format_tag::ab,    dnnl::memory::format_tag::abc,
+    dnnl::memory::format_tag::abcd,   dnnl::memory::format_tag::abcde, dnnl::memory::format_tag::abcdef,
+    dnnl::memory::format_tag::abcdefg};
+
+  auto rank = dims.size();
+  if (rank > tag_vec.size()) {
+    MS_LOG(EXCEPTION) << "The kernel does not support construct " << rank << "-D tensor dnnl memory format_tag.";
   }
-  return mem_tag;
+  return tag_vec[rank - 1];
 }
 
 dnnl::memory::desc MKLCPUKernel::GetDefaultMemDesc(const std::vector<size_t> &shape) {
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/mul_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/mul_cpu_kernel.h
index b17d84b7d0e..1a965e17301 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/mul_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/mul_cpu_kernel.h
@@ -36,9 +36,6 @@ class MulCPUKernel : public MKLCPUKernel {
  private:
   bool need_swap_{false};
 };
-
-MS_REG_CPU_KERNEL(Mul, KernelAttr(), MulCPUKernel);
-MS_REG_CPU_KERNEL_T(Mul, KernelAttr(), ArithmeticCPUKernel, int32_t);
 }  // namespace kernel
 }  // namespace mindspore
 
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/CMakeLists.txt b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/CMakeLists.txt
index 1b4f1e4d969..f73dde8b708 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/CMakeLists.txt
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/CMakeLists.txt
@@ -45,7 +45,7 @@ if(MSLITE_STRING_KERNEL)
             ${KERNEL_SRC_INFER_STRING}
             )
 endif()
-if(MSLITE_CONTROL_TENSORLIST)
+if(MSLITE_CONTROLFLOW_TENSORLIST)
     file(GLOB KERNEL_SRC_INFER_CONTROL_TENSORLIST
             ${NNACL_DIR}/infer/control/*.c
             )
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32Opt.S b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32Opt.S
index 62880ea15a7..7dda0cfa8e4 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32Opt.S
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32Opt.S
@@ -29,10 +29,28 @@ asm_function MatmulFloatNeon64Opt
 
     mov x21, #48 // sizeof(float) * 12
     mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth
+    cmp x9, #3 // c4
+    beq C4Stride
     cbnz x9, NoC8Steps
     mov x11, x2
     mov x21, #32
     mul x16, x6, x21 // row * 8 * sizeof(float)
+    b NoC8Steps
+C4Stride:
+    mov x18, #48 // 12 * sizeof(float)
+    mov x22, #4
+    mul x8, x8, x22 // stride * sizeof(float), in c4 stride == row
+    mul x8, x8, x22 // col stride
+    // col >= 4 , block stride 192, otherwise 12 * 4 * col
+    cmp x7, #4
+    bge C4StrideCommon
+    mul x18, x18, x7 // block stride
+    b LoopRowStart
+C4StrideCommon:
+    mov x18, #192 // block stride
+
+    b LoopRowStart
+
 NoC8Steps:
     cmp x9, #2
     bne NoWinoSteps
@@ -46,10 +64,14 @@ NoWinoSteps:
     mul x8, x8, x21
 
 LoopRowStart:
+    cmp x9, #3
+    bne RowStart
+    mov x20, x2
+RowStart:
     cmp x6, #4
     ble LoopRow4
     cmp x6, #8
-    blt LoopRow8
+    ble LoopRow8
 
 LoopRow:
     mov x14, x1 // reload rhs ptr
@@ -58,7 +80,12 @@ LoopRow:
 
     LoopCol:
         cbz x9, NoReloadDst
+        cmp x9, #3
+        beq C4ReloadDst
         mov x11, x2
+        b NoReloadDst
+    C4ReloadDst:
+        mov x11, x20
     NoReloadDst:
         mov x10, x0 // reload lhs ptr
         mov x19, x5 // reload depth
@@ -192,7 +219,7 @@ LoopRow:
             fmin v29.4s, v29.4s, v2.4s
             fmin v30.4s, v30.4s, v2.4s
             fmin v31.4s, v31.4s, v2.4s
-        
+
         Relu:
             dup v3.4s, wzr
             fmax v8.4s, v8.4s, v3.4s
@@ -324,7 +351,12 @@ LoopRow8:
 
     LoopCol8:
         cbz x9, NoReloadDst8
+        cmp x9, #3
+        beq C4ReloadDst8
         mov x11, x2
+        b NoReloadDst8
+    C4ReloadDst8:
+        mov x11, x20
     NoReloadDst8:
         mov x10, x0 // reload lhs ptr
         mov x19, x5 // reload depth
@@ -426,7 +458,7 @@ LoopRow8:
             fmin v21.4s, v21.4s, v2.4s
             fmin v22.4s, v22.4s, v2.4s
             fmin v23.4s, v23.4s, v2.4s
-        
+
         Relu8:
             dup v3.4s, wzr
             fmax v8.4s, v8.4s, v3.4s
@@ -529,7 +561,12 @@ LoopRow4:
 
     LoopCol4:
         cbz x9, NoReloadDst4
+        cmp x9, #3
+        beq C4ReloadDst4
         mov x11, x2
+        b NoReloadDst4
+    C4ReloadDst4:
+        mov x11, x20
     NoReloadDst4:
         mov x10, x0 // reload lhs ptr
         mov x19, x5 // reload depth
@@ -599,7 +636,7 @@ LoopRow4:
             fmin v13.4s, v13.4s, v2.4s
             fmin v14.4s, v14.4s, v2.4s
             fmin v15.4s, v15.4s, v2.4s
-        
+
         Relu4:
             dup v3.4s, wzr
             fmax v8.4s, v8.4s, v3.4s
@@ -669,6 +706,8 @@ LoopRow4:
         Write:
             cmp x9, #2
             beq WriteWino
+            cmp x9, #3
+            beq WriteC4
             cbz x9, WriteC8
             cmp x13, #1
             beq Write1
@@ -1102,6 +1141,508 @@ LoopRow4:
             beq WriteEnd
             st1 {v30.4s, v31.4s}, [x11], x8
             add x11, x11, #32
+            b WriteEnd
+        WriteC4:
+            cmp x13, #1
+            beq C4Write1
+            cmp x13, #2
+            beq C4Write2
+            cmp x13, #3
+            beq C4Write3
+            cmp x13, #4
+            beq C4Write4
+            cmp x13, #5
+            beq C4Write5
+            cmp x13, #6
+            beq C4Write6
+            cmp x13, #7
+            beq C4Write7
+            b C4Write8
+        C4Write1:
+           // add x20, x11, x8
+            str s8, [x11], #4
+            cmp x6, #1
+            beq WriteEnd
+            str s10, [x11], #4
+            cmp x6, #2
+            beq WriteEnd
+            str s12, [x11], #4
+            cmp x6, #3
+            beq WriteEnd
+            str s14, [x11], #4
+            cmp x6, #4
+            beq WriteEnd
+            str s16, [x11], #4
+            cmp x6, #5
+            beq WriteEnd
+            str s18, [x11], #4
+            cmp x6, #6
+            beq WriteEnd
+            str s20, [x11], #4
+            cmp x6, #7
+            beq WriteEnd
+            str s22, [x11], #4
+            cmp x6, #8
+            beq WriteEnd
+            str s24, [x11], #4
+            cmp x6, #9
+            beq WriteEnd
+            str s26, [x11], #4
+            cmp x6, #10
+            beq WriteEnd
+            str s28, [x11], #4
+            cmp x6, #11
+            beq WriteEnd
+            str s30, [x11], #4
+            b WriteEnd
+        C4Write2:
+           // add x20, x11, x8
+            st1 {v8.2s}, [x11], #8
+            cmp x6, #1
+            beq WriteEnd
+            st1 {v10.2s}, [x11], #8
+            cmp x6, #2
+            beq WriteEnd
+            st1 {v12.2s}, [x11], #8
+            cmp x6, #3
+            beq WriteEnd
+            st1 {v14.2s}, [x11], #8
+            cmp x6, #4
+            beq WriteEnd
+            st1 {v16.2s}, [x11], #8
+            cmp x6, #5
+            beq WriteEnd
+            st1 {v18.2s}, [x11], #8
+            cmp x6, #6
+            beq WriteEnd
+            st1 {v20.2s}, [x11], #8
+            cmp x6, #7
+            beq WriteEnd
+            st1 {v22.2s}, [x11], #8
+            cmp x6, #8
+            beq WriteEnd
+            st1 {v24.2s}, [x11], #8
+            cmp x6, #9
+            beq WriteEnd
+            st1 {v26.2s}, [x11], #8
+            cmp x6, #10
+            beq WriteEnd
+            st1 {v28.2s}, [x11], #8
+            cmp x6, #11
+            beq WriteEnd
+            st1 {v30.2s}, [x11], #8
+            b WriteEnd
+        C4Write3:
+           // add x20, x11, x8
+            add x19, x11, #8
+            st1 {v8.2s}, [x11]
+            add x11, x11, #12
+            st1 {v8.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #1
+            beq WriteEnd
+            st1 {v10.2s}, [x11]
+            add x11, x11, #12
+            st1 {v10.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #2
+            beq WriteEnd
+            st1 {v12.2s}, [x11]
+            add x11, x11, #12
+            st1 {v12.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #3
+            beq WriteEnd
+            st1 {v14.2s}, [x11]
+            add x11, x11, #12
+            st1 {v14.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #4
+            beq WriteEnd
+            st1 {v16.2s}, [x11]
+            add x11, x11, #12
+            st1 {v16.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #5
+            beq WriteEnd
+            st1 {v18.2s}, [x11]
+            add x11, x11, #12
+            st1 {v18.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #6
+            beq WriteEnd
+            st1 {v20.2s}, [x11]
+            add x11, x11, #12
+            st1 {v20.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #7
+            beq WriteEnd
+            st1 {v22.2s}, [x11]
+            add x11, x11, #12
+            st1 {v22.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #8
+            beq WriteEnd
+            st1 {v24.2s}, [x11]
+            add x11, x11, #12
+            st1 {v24.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #9
+            beq WriteEnd
+            st1 {v26.2s}, [x11]
+            add x11, x11, #12
+            st1 {v26.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #10
+            beq WriteEnd
+            st1 {v28.2s}, [x11]
+            add x11, x11, #12
+            st1 {v28.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #11
+            beq WriteEnd
+            st1 {v30.2s}, [x11]
+            add x11, x11, #12
+            st1 {v30.s}[2], [x19]
+            add x19, x19, #12
+            b WriteEnd
+
+        C4Write4:
+            add x20, x11, x8
+            st1 {v8.4s}, [x11], #16
+            cmp x6, #1
+            beq WriteEnd
+            st1 {v10.4s}, [x11], #16
+            cmp x6, #2
+            beq WriteEnd
+            st1 {v12.4s}, [x11], #16
+            cmp x6, #3
+            beq WriteEnd
+            st1 {v14.4s}, [x11], #16
+            cmp x6, #4
+            beq WriteEnd
+            st1 {v16.4s}, [x11], #16
+            cmp x6, #5
+            beq WriteEnd
+            st1 {v18.4s}, [x11], #16
+            cmp x6, #6
+            beq WriteEnd
+            st1 {v20.4s}, [x11], #16
+            cmp x6, #7
+            beq WriteEnd
+            st1 {v22.4s}, [x11], #16
+            cmp x6, #8
+            beq WriteEnd
+            st1 {v24.4s}, [x11], #16
+            cmp x6, #9
+            beq WriteEnd
+            st1 {v26.4s}, [x11], #16
+            cmp x6, #10
+            beq WriteEnd
+            st1 {v28.4s}, [x11], #16
+            cmp x6, #11
+            beq WriteEnd
+            st1 {v30.4s}, [x11], #16
+            b WriteEnd
+        C4Write5:
+            add x19, x11, #16
+            st1 {v8.4s}, [x11]
+            add x11, x11, #20
+            str s9, [x19]
+            add x19, x19, #20
+            cmp x6, #1
+            beq WriteEnd
+
+            st1 {v10.4s}, [x11]
+            add x11, x11, #20
+            str s11, [x19]
+            add x19, x19, #20
+            cmp x6, #2
+            beq WriteEnd
+
+            st1 {v12.4s}, [x11]
+            add x11, x11, #20
+            str s13, [x19]
+            add x19, x19, #20
+            cmp x6, #3
+            beq WriteEnd
+
+            st1 {v14.4s}, [x11]
+            add x11, x11, #20
+            str s15, [x19]
+            add x19, x19, #20
+            cmp x6, #4
+            beq WriteEnd
+
+            st1 {v16.4s}, [x11]
+            add x11, x11, #20
+            str s17, [x19]
+            add x19, x19, #20
+            cmp x6, #5
+            beq WriteEnd
+
+            st1 {v18.4s}, [x11]
+            add x11, x11, #20
+            str s19, [x19]
+            add x19, x19, #20
+            cmp x6, #6
+            beq WriteEnd
+
+            st1 {v20.4s}, [x11]
+            add x11, x11, #20
+            str s21, [x19]
+            add x19, x19, #20
+            cmp x6, #7
+            beq WriteEnd
+
+            st1 {v22.4s}, [x11]
+            add x11, x11, #20
+            str s23, [x19]
+            add x19, x19, #20
+            cmp x6, #8
+            beq WriteEnd
+
+            st1 {v24.4s}, [x11]
+            add x11, x11, #20
+            str s25, [x19]
+            add x19, x19, #20
+            cmp x6, #9
+            beq WriteEnd
+
+            st1 {v26.4s}, [x11]
+            add x11, x11, #20
+            str s27, [x19]
+            add x19, x19, #20
+            cmp x6, #10
+            beq WriteEnd
+
+            st1 {v28.4s}, [x11]
+            add x11, x11, #20
+            str s29, [x19]
+            add x19, x19, #20
+            cmp x6, #11
+            beq WriteEnd
+
+            st1 {v30.4s}, [x11]
+            str s31, [x19]
+            b WriteEnd
+        C4Write6:
+            add x19, x11, #16
+            st1 {v8.4s}, [x11]
+            add x11, x11, #24
+            st1 {v9.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #1
+            beq WriteEnd
+
+            st1 {v10.4s}, [x11]
+            add x11, x11, #24
+            st1 {v11.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #2
+            beq WriteEnd
+
+            st1 {v12.4s}, [x11]
+            add x11, x11, #24
+            st1 {v13.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #3
+            beq WriteEnd
+
+            st1 {v14.4s}, [x11]
+            add x11, x11, #24
+            st1 {v15.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #4
+            beq WriteEnd
+
+            st1 {v16.4s}, [x11]
+            add x11, x11, #24
+            st1 {v17.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #5
+            beq WriteEnd
+
+            st1 {v18.4s}, [x11]
+            add x11, x11, #24
+            st1 {v19.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #6
+            beq WriteEnd
+
+            st1 {v20.4s}, [x11]
+            add x11, x11, #24
+            st1 {v21.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #7
+            beq WriteEnd
+
+            st1 {v22.4s}, [x11]
+            add x11, x11, #24
+            st1 {v23.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #8
+            beq WriteEnd
+
+            st1 {v24.4s}, [x11]
+            add x11, x11, #24
+            st1 {v25.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #9
+            beq WriteEnd
+
+            st1 {v26.4s}, [x11]
+            add x11, x11, #24
+            st1 {v27.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #10
+            beq WriteEnd
+
+            st1 {v28.4s}, [x11]
+            add x11, x11, #24
+            st1 {v29.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #11
+            beq WriteEnd
+
+            st1 {v30.4s}, [x11]
+            st1 {v31.2s}, [x19]
+            b WriteEnd
+        C4Write7:
+            add x19, x11, #16
+            add x16, x11, #24
+            mov x10, #28
+            st1 {v8.4s}, [x11], x10
+            st1 {v9.2s}, [x19], x10
+            st1 {v9.s}[2], [x16], x10
+            cmp x6, #1
+            beq WriteEnd
+
+            st1 {v10.4s}, [x11], x10
+            st1 {v11.2s}, [x19], x10
+            st1 {v11.s}[2], [x16], x10
+            cmp x6, #2
+            beq WriteEnd
+
+            st1 {v12.4s}, [x11], x10
+            st1 {v13.2s}, [x19], x10
+            st1 {v13.s}[2], [x16], x10
+            cmp x6, #3
+            beq WriteEnd
+
+            st1 {v14.4s}, [x11], x10
+            st1 {v15.2s}, [x19], x10
+            st1 {v15.s}[2], [x16], x10
+            cmp x6, #4
+            beq WriteEnd
+
+            st1 {v16.4s}, [x11], x10
+            st1 {v17.2s}, [x19], x10
+            st1 {v17.s}[2], [x16], x10
+            cmp x6, #5
+            beq WriteEnd
+
+            st1 {v18.4s}, [x11], x10
+            st1 {v19.2s}, [x19], x10
+            st1 {v19.s}[2], [x16], x10
+            cmp x6, #6
+            beq WriteEnd
+
+            st1 {v20.4s}, [x11], x10
+            st1 {v21.2s}, [x19], x10
+            st1 {v21.s}[2], [x16], x10
+            cmp x6, #7
+            beq WriteEnd
+
+            st1 {v22.4s}, [x11], x10
+            st1 {v23.2s}, [x19], x10
+            st1 {v23.s}[2], [x16], x10
+            cmp x6, #8
+            beq WriteEnd
+
+            st1 {v24.4s}, [x11], x10
+            st1 {v25.2s}, [x19], x10
+            st1 {v25.s}[2], [x16], x10
+            cmp x6, #9
+            beq WriteEnd
+
+            st1 {v26.4s}, [x11], x10
+            st1 {v27.2s}, [x19], x10
+            st1 {v27.s}[2], [x16], x10
+            cmp x6, #10
+            beq WriteEnd
+
+            st1 {v28.4s}, [x11], x10
+            st1 {v29.2s}, [x19], x10
+            st1 {v29.s}[2], [x16], x10
+            cmp x6, #11
+            beq WriteEnd
+
+            st1 {v30.4s}, [x11]
+            st1 {v31.2s}, [x19]
+            st1 {v31.s}[2], [x16]
+            b WriteEnd
+        C4Write8:
+            add x19, x11, x8
+            add x20, x19, x8
+            st1 {v8.4s}, [x11], #16
+            st1 {v9.4s}, [x19], #16
+            cmp x6, #1
+            beq WriteEnd
+
+            st1 {v10.4s}, [x11], #16
+            st1 {v11.4s}, [x19], #16
+            cmp x6, #2
+            beq WriteEnd
+
+            st1 {v12.4s}, [x11], #16
+            st1 {v13.4s}, [x19], #16
+            cmp x6, #3
+            beq WriteEnd
+
+            st1 {v14.4s}, [x11], #16
+            st1 {v15.4s}, [x19], #16
+            cmp x6, #4
+            beq WriteEnd
+
+            st1 {v16.4s}, [x11], #16
+            st1 {v17.4s}, [x19], #16
+            cmp x6, #5
+            beq WriteEnd
+
+            st1 {v18.4s}, [x11], #16
+            st1 {v19.4s}, [x19], #16
+            cmp x6, #6
+            beq WriteEnd
+
+            st1 {v20.4s}, [x11], #16
+            st1 {v21.4s}, [x19], #16
+            cmp x6, #7
+            beq WriteEnd
+
+            st1 {v22.4s}, [x11], #16
+            st1 {v23.4s}, [x19], #16
+            cmp x6, #8
+            beq WriteEnd
+
+            st1 {v24.4s}, [x11], #16
+            st1 {v25.4s}, [x19], #16
+            cmp x6, #9
+            beq WriteEnd
+
+            st1 {v26.4s}, [x11], #16
+            st1 {v27.4s}, [x19], #16
+            cmp x6, #10
+            beq WriteEnd
+
+            st1 {v28.4s}, [x11], #16
+            st1 {v29.4s}, [x19], #16
+            cmp x6, #11
+            beq WriteEnd
+
+            st1 {v30.4s}, [x11]
+            st1 {v31.4s}, [x19]
+            b WriteEnd
 
         WriteEnd:
             subs x13, x13, #8 // rhs col - 8
@@ -1115,11 +1656,16 @@ LoopRow4:
 LoopColEnd:
         add x0, x0, x17
         cbz x9, C8DstStep
+        cmp x9, #3
+        beq C4DstStep
         mov x21, #4
         mul x21, x21, x7
         sub x11, x11, x21
         mov x2, x11
         b NoDstStep
+    C4DstStep:
+        add x2, x2, x18
+        b NoDstStep
     C8DstStep:
         add x2, x2, #384
         mov x11, x2
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow12.S b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow12.S
index f006a74b68d..eae7a436fea 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow12.S
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow12.S
@@ -29,10 +29,27 @@ asm_function MatmulFloatNeon64OptRow12
 
     mov x21, #48 // sizeof(float) * 12
     mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth
+    cmp x9, #3 // c4
+    beq C4Stride
     cbnz x9, NoC8Steps
     mov x11, x2
     mov x21, #32
     mul x16, x6, x21 // row * 8 * sizeof(float)
+    b NoC8Steps
+C4Stride:
+    mov x18, #48 // 12 * sizeof(float)
+    mov x22, #4
+    mul x8, x8, x22 // stride * sizeof(float), in c4 stride == row
+    mul x8, x8, x22 // col stride
+    // col >= 4 , block stride 192, otherwise 12 * 4 * col
+    cmp x7, #4
+    bge C4StrideCommon
+    mul x18, x18, x7 // block stride
+    b LoopRowStart
+C4StrideCommon:
+    mov x18, #192 // block stride
+    b LoopRowStart
+
 NoC8Steps:
     cmp x9, #2
     bne NoWinoSteps
@@ -45,6 +62,10 @@ NoWinoSteps:
     mov x21, #4
     mul x8, x8, x21
 
+LoopRowStart:
+    cmp x9, #3
+    bne LoopRow
+    mov x20, x2
 LoopRow:
     mov x14, x1 // reload rhs ptr
     mov x13, x7 // reload rhs col
@@ -52,7 +73,12 @@ LoopRow:
 
     LoopCol:
         cbz x9, NoReloadDst
+        cmp x9, #3
+        beq C4ReloadDst
         mov x11, x2
+        b NoReloadDst
+    C4ReloadDst:
+        mov x11, x20
     NoReloadDst:
         mov x10, x0 // reload lhs ptr
         mov x19, x5 // reload depth
@@ -186,7 +212,7 @@ LoopRow:
             fmin v29.4s, v29.4s, v2.4s
             fmin v30.4s, v30.4s, v2.4s
             fmin v31.4s, v31.4s, v2.4s
-        
+
         Relu:
             dup v3.4s, wzr
             fmax v8.4s, v8.4s, v3.4s
@@ -312,6 +338,8 @@ LoopRow:
         Write:
             cmp x9, #2
             beq WriteWino
+            cmp x9, #3
+            beq WriteC4
             cbz x9, WriteC8
             cmp x13, #1
             beq Write1
@@ -370,7 +398,7 @@ LoopRow:
             str s26, [x11]
             cmp x6, #10
             beq WriteEnd
-            add x11, x11, x8
+add x11, x11, x8
             str s28, [x11]
             cmp x6, #11
             beq WriteEnd
@@ -745,7 +773,458 @@ LoopRow:
             beq WriteEnd
             st1 {v30.4s, v31.4s}, [x11], x8
             add x11, x11, #32
-
+            b WriteEnd
+        WriteC4:
+            cmp x13, #1
+            beq C4Write1
+            cmp x13, #2
+            beq C4Write2
+            cmp x13, #3
+            beq C4Write3
+            cmp x13, #4
+            beq C4Write4
+            cmp x13, #5
+            beq C4Write5
+            cmp x13, #6
+            beq C4Write6
+            cmp x13, #7
+            beq C4Write7
+            b C4Write8
+        C4Write1:
+            str s8, [x11], #4
+            cmp x6, #1
+            beq WriteEnd
+            str s10, [x11], #4
+            cmp x6, #2
+            beq WriteEnd
+            str s12, [x11], #4
+            cmp x6, #3
+            beq WriteEnd
+            str s14, [x11], #4
+            cmp x6, #4
+            beq WriteEnd
+            str s16, [x11], #4
+            cmp x6, #5
+            beq WriteEnd
+            str s18, [x11], #4
+            cmp x6, #6
+            beq WriteEnd
+            str s20, [x11], #4
+            cmp x6, #7
+            beq WriteEnd
+            str s22, [x11], #4
+            cmp x6, #8
+            beq WriteEnd
+            str s24, [x11], #4
+            cmp x6, #9
+            beq WriteEnd
+            str s26, [x11], #4
+            cmp x6, #10
+            beq WriteEnd
+            str s28, [x11], #4
+            cmp x6, #11
+            beq WriteEnd
+            str s30, [x11], #4
+            b WriteEnd
+        C4Write2:
+            st1 {v8.2s}, [x11], #8
+            cmp x6, #1
+            beq WriteEnd
+            st1 {v10.2s}, [x11], #8
+            cmp x6, #2
+            beq WriteEnd
+            st1 {v12.2s}, [x11], #8
+            cmp x6, #3
+            beq WriteEnd
+            st1 {v14.2s}, [x11], #8
+            cmp x6, #4
+            beq WriteEnd
+            st1 {v16.2s}, [x11], #8
+            cmp x6, #5
+            beq WriteEnd
+            st1 {v18.2s}, [x11], #8
+            cmp x6, #6
+            beq WriteEnd
+            st1 {v20.2s}, [x11], #8
+            cmp x6, #7
+            beq WriteEnd
+            st1 {v22.2s}, [x11], #8
+            cmp x6, #8
+            beq WriteEnd
+            st1 {v24.2s}, [x11], #8
+            cmp x6, #9
+            beq WriteEnd
+            st1 {v26.2s}, [x11], #8
+            cmp x6, #10
+            beq WriteEnd
+            st1 {v28.2s}, [x11], #8
+            cmp x6, #11
+            beq WriteEnd
+            st1 {v30.2s}, [x11], #8
+            b WriteEnd
+        C4Write3:
+            add x19, x11, #8
+            st1 {v8.2s}, [x11]
+            add x11, x11, #12
+            st1 {v8.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #1
+            beq WriteEnd
+            st1 {v10.2s}, [x11]
+            add x11, x11, #12
+            st1 {v10.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #2
+            beq WriteEnd
+            st1 {v12.2s}, [x11]
+            add x11, x11, #12
+            st1 {v12.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #3
+            beq WriteEnd
+            st1 {v14.2s}, [x11]
+            add x11, x11, #12
+            st1 {v14.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #4
+            beq WriteEnd
+            st1 {v16.2s}, [x11]
+            add x11, x11, #12
+            st1 {v16.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #5
+            beq WriteEnd
+            st1 {v18.2s}, [x11]
+            add x11, x11, #12
+            st1 {v18.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #6
+            beq WriteEnd
+            st1 {v20.2s}, [x11]
+            add x11, x11, #12
+            st1 {v20.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #7
+            beq WriteEnd
+            st1 {v22.2s}, [x11]
+            add x11, x11, #12
+            st1 {v22.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #8
+            beq WriteEnd
+            st1 {v24.2s}, [x11]
+            add x11, x11, #12
+            st1 {v24.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #9
+            beq WriteEnd
+            st1 {v26.2s}, [x11]
+            add x11, x11, #12
+            st1 {v26.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #10
+            beq WriteEnd
+            st1 {v28.2s}, [x11]
+            add x11, x11, #12
+            st1 {v28.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #11
+            beq WriteEnd
+            st1 {v30.2s}, [x11]
+            add x11, x11, #12
+            st1 {v30.s}[2], [x19]
+            add x19, x19, #12
+            b WriteEnd
+        C4Write4:
+            st1 {v8.4s}, [x11], #16
+            cmp x6, #1
+            beq WriteEnd
+            st1 {v10.4s}, [x11], #16
+            cmp x6, #2
+            beq WriteEnd
+            st1 {v12.4s}, [x11], #16
+            cmp x6, #3
+            beq WriteEnd
+            st1 {v14.4s}, [x11], #16
+            cmp x6, #4
+            beq WriteEnd
+            st1 {v16.4s}, [x11], #16
+            cmp x6, #5
+            beq WriteEnd
+            st1 {v18.4s}, [x11], #16
+            cmp x6, #6
+            beq WriteEnd
+            st1 {v20.4s}, [x11], #16
+            cmp x6, #7
+            beq WriteEnd
+            st1 {v22.4s}, [x11], #16
+            cmp x6, #8
+            beq WriteEnd
+            st1 {v24.4s}, [x11], #16
+            cmp x6, #9
+            beq WriteEnd
+            st1 {v26.4s}, [x11], #16
+            cmp x6, #10
+            beq WriteEnd
+            st1 {v28.4s}, [x11], #16
+            cmp x6, #11
+            beq WriteEnd
+            st1 {v30.4s}, [x11], #16
+            b WriteEnd
+        C4Write5:
+            add x19, x11, #16
+            st1 {v8.4s}, [x11]
+            add x11, x11, #20
+            str s9, [x19]
+            add x19, x19, #20
+            cmp x6, #1
+            beq WriteEnd
+            st1 {v10.4s}, [x11]
+            add x11, x11, #20
+            str s11, [x19]
+            add x19, x19, #20
+            cmp x6, #2
+            beq WriteEnd
+            st1 {v12.4s}, [x11]
+            add x11, x11, #20
+            str s13, [x19]
+            add x19, x19, #20
+            cmp x6, #3
+            beq WriteEnd
+            st1 {v14.4s}, [x11]
+            add x11, x11, #20
+            str s15, [x19]
+            add x19, x19, #20
+            cmp x6, #4
+            beq WriteEnd
+            st1 {v16.4s}, [x11]
+            add x11, x11, #20
+            str s17, [x19]
+            add x19, x19, #20
+            cmp x6, #5
+            beq WriteEnd
+            st1 {v18.4s}, [x11]
+            add x11, x11, #20
+            str s19, [x19]
+            add x19, x19, #20
+            cmp x6, #6
+            beq WriteEnd
+            st1 {v20.4s}, [x11]
+            add x11, x11, #20
+            str s21, [x19]
+            add x19, x19, #20
+            cmp x6, #7
+            beq WriteEnd
+            st1 {v22.4s}, [x11]
+            add x11, x11, #20
+            str s23, [x19]
+            add x19, x19, #20
+            cmp x6, #8
+            beq WriteEnd
+            st1 {v24.4s}, [x11]
+            add x11, x11, #20
+            str s25, [x19]
+            add x19, x19, #20
+            cmp x6, #9
+            beq WriteEnd
+            st1 {v26.4s}, [x11]
+            add x11, x11, #20
+            str s27, [x19]
+            add x19, x19, #20
+            cmp x6, #10
+            beq WriteEnd
+            st1 {v28.4s}, [x11]
+            add x11, x11, #20
+            str s29, [x19]
+            add x19, x19, #20
+            cmp x6, #11
+            beq WriteEnd
+            st1 {v30.4s}, [x11]
+            str s31, [x19]
+            b WriteEnd
+        C4Write6:
+            add x19, x11, #16
+            st1 {v8.4s}, [x11]
+            add x11, x11, #24
+            st1 {v9.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #1
+            beq WriteEnd
+            st1 {v10.4s}, [x11]
+            add x11, x11, #24
+            st1 {v11.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #2
+            beq WriteEnd
+            st1 {v12.4s}, [x11]
+            add x11, x11, #24
+            st1 {v13.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #3
+            beq WriteEnd
+            st1 {v14.4s}, [x11]
+            add x11, x11, #24
+            st1 {v15.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #4
+            beq WriteEnd
+            st1 {v16.4s}, [x11]
+            add x11, x11, #24
+            st1 {v17.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #5
+            beq WriteEnd
+            st1 {v18.4s}, [x11]
+            add x11, x11, #24
+            st1 {v19.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #6
+            beq WriteEnd
+            st1 {v20.4s}, [x11]
+            add x11, x11, #24
+            st1 {v21.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #7
+            beq WriteEnd
+            st1 {v22.4s}, [x11]
+            add x11, x11, #24
+            st1 {v23.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #8
+            beq WriteEnd
+            st1 {v24.4s}, [x11]
+            add x11, x11, #24
+            st1 {v25.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #9
+            beq WriteEnd
+            st1 {v26.4s}, [x11]
+            add x11, x11, #24
+            st1 {v27.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #10
+            beq WriteEnd
+            st1 {v28.4s}, [x11]
+            add x11, x11, #24
+            st1 {v29.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #11
+            beq WriteEnd
+            st1 {v30.4s}, [x11]
+            st1 {v31.2s}, [x19]
+            b WriteEnd
+        C4Write7:
+            add x19, x11, #16
+            add x16, x11, #24
+            mov x10, #28
+            st1 {v8.4s}, [x11], x10
+            st1 {v9.2s}, [x19], x10
+            st1 {v9.s}[2], [x16], x10
+            cmp x6, #1
+            beq WriteEnd
+            st1 {v10.4s}, [x11], x10
+            st1 {v11.2s}, [x19], x10
+            st1 {v11.s}[2], [x16], x10
+            cmp x6, #2
+            beq WriteEnd
+            st1 {v12.4s}, [x11], x10
+            st1 {v13.2s}, [x19], x10
+            st1 {v13.s}[2], [x16], x10
+            cmp x6, #3
+            beq WriteEnd
+            st1 {v14.4s}, [x11], x10
+            st1 {v15.2s}, [x19], x10
+            st1 {v15.s}[2], [x16], x10
+            cmp x6, #4
+            beq WriteEnd
+            st1 {v16.4s}, [x11], x10
+            st1 {v17.2s}, [x19], x10
+            st1 {v17.s}[2], [x16], x10
+            cmp x6, #5
+            beq WriteEnd
+            st1 {v18.4s}, [x11], x10
+            st1 {v19.2s}, [x19], x10
+            st1 {v19.s}[2], [x16], x10
+            cmp x6, #6
+            beq WriteEnd
+            st1 {v20.4s}, [x11], x10
+            st1 {v21.2s}, [x19], x10
+            st1 {v21.s}[2], [x16], x10
+            cmp x6, #7
+            beq WriteEnd
+            st1 {v22.4s}, [x11], x10
+            st1 {v23.2s}, [x19], x10
+            st1 {v23.s}[2], [x16], x10
+            cmp x6, #8
+            beq WriteEnd
+            st1 {v24.4s}, [x11], x10
+            st1 {v25.2s}, [x19], x10
+            st1 {v25.s}[2], [x16], x10
+            cmp x6, #9
+            beq WriteEnd
+            st1 {v26.4s}, [x11], x10
+            st1 {v27.2s}, [x19], x10
+            st1 {v27.s}[2], [x16], x10
+            cmp x6, #10
+            beq WriteEnd
+            st1 {v28.4s}, [x11], x10
+            st1 {v29.2s}, [x19], x10
+            st1 {v29.s}[2], [x16], x10
+            cmp x6, #11
+            beq WriteEnd
+            st1 {v30.4s}, [x11]
+            st1 {v31.2s}, [x19]
+            st1 {v31.s}[2], [x16]
+            b WriteEnd
+        C4Write8:
+            add x19, x11, x8
+            add x20, x19, x8
+            st1 {v8.4s}, [x11], #16
+            st1 {v9.4s}, [x19], #16
+            cmp x6, #1
+            beq WriteEnd
+            st1 {v10.4s}, [x11], #16
+            st1 {v11.4s}, [x19], #16
+            cmp x6, #2
+            beq WriteEnd
+            st1 {v12.4s}, [x11], #16
+            st1 {v13.4s}, [x19], #16
+            cmp x6, #3
+            beq WriteEnd
+            st1 {v14.4s}, [x11], #16
+            st1 {v15.4s}, [x19], #16
+            cmp x6, #4
+            beq WriteEnd
+            st1 {v16.4s}, [x11], #16
+            st1 {v17.4s}, [x19], #16
+            cmp x6, #5
+            beq WriteEnd
+            st1 {v18.4s}, [x11], #16
+            st1 {v19.4s}, [x19], #16
+            cmp x6, #6
+            beq WriteEnd
+            st1 {v20.4s}, [x11], #16
+            st1 {v21.4s}, [x19], #16
+            cmp x6, #7
+            beq WriteEnd
+            st1 {v22.4s}, [x11], #16
+            st1 {v23.4s}, [x19], #16
+            cmp x6, #8
+            beq WriteEnd
+            st1 {v24.4s}, [x11], #16
+            st1 {v25.4s}, [x19], #16
+            cmp x6, #9
+            beq WriteEnd
+            st1 {v26.4s}, [x11], #16
+            st1 {v27.4s}, [x19], #16
+            cmp x6, #10
+            beq WriteEnd
+            st1 {v28.4s}, [x11], #16
+            st1 {v29.4s}, [x19], #16
+            cmp x6, #11
+            beq WriteEnd
+            st1 {v30.4s}, [x11]
+            st1 {v31.4s}, [x19]
         WriteEnd:
             subs x13, x13, #8 // rhs col - 8
             bgt LoopCol
@@ -753,11 +1232,16 @@ LoopRow:
 LoopColEnd:
         add x0, x0, x17
         cbz x9, C8DstStep
+        cmp x9, #3
+        beq C4DstStep
         mov x21, #4
         mul x21, x21, x7
         sub x11, x11, x21
         mov x2, x11
         b NoDstStep
+    C4DstStep:
+        add x2, x2, x18
+        b NoDstStep
     C8DstStep:
         add x2, x2, #384
         mov x11, x2
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow4.S b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow4.S
index c2a2cde9157..eaa9e47db50 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow4.S
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow4.S
@@ -28,11 +28,29 @@ asm_function MatmulFloatNeon64OptRow4
     ldr x9, [sp, #8]
 
     mov x21, #48 // sizeof(float) * 12
+
     mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth
+    cmp x9, #3 // c4
+    beq C4Stride
     cbnz x9, NoC8Steps
     mov x11, x2
     mov x21, #32
     mul x16, x6, x21 // row * 8 * sizeof(float)
+    b NoC8Steps
+C4Stride:
+    mov x18, #16 // 4 * sizeof(float)
+    mov x22, #4
+    mul x8, x8, x22 // stride * sizeof(float), in c4 stride == row
+    mul x8, x8, x22 // col stride
+    // col >= 4 , block stride 64, otherwise 4 * 4 * col
+    cmp x7, #4
+    bge C4StrideCommon
+    mul x18, x18, x7 // block stride
+    b LoopRowStart
+C4StrideCommon:
+    mov x18, #64 // block stride
+    b LoopRowStart
+
 NoC8Steps:
     cmp x9, #2
     bne NoWinoSteps
@@ -45,6 +63,10 @@ NoWinoSteps:
     mov x21, #4
     mul x8, x8, x21
 
+LoopRowStart:
+    cmp x9, #3
+    bne LoopRow4
+    mov x20, x2
 LoopRow4:
     mov x14, x1 // reload rhs ptr
     mov x13, x7 // reload rhs col
@@ -52,7 +74,12 @@ LoopRow4:
 
     LoopCol4:
         cbz x9, NoReloadDst4
+        cmp x9, #3
+        beq C4ReloadDst4
         mov x11, x2
+        b NoReloadDst4
+    C4ReloadDst4:
+        mov x11, x20
     NoReloadDst4:
         mov x10, x0 // reload lhs ptr
         mov x19, x5 // reload depth
@@ -194,6 +221,8 @@ LoopRow4:
         Write:
             cmp x9, #2
             beq WriteWino
+            cmp x9, #3
+            beq WriteC4
             cbz x9, WriteC8
             cmp x13, #1
             beq Write1
@@ -369,7 +398,168 @@ LoopRow4:
             beq WriteEnd
             st1 {v14.4s, v15.4s}, [x11], x8
             add x11, x11, #32
-
+            b WriteEnd
+        WriteC4:
+            cmp x13, #1
+            beq C4Write1
+            cmp x13, #2
+            beq C4Write2
+            cmp x13, #3
+            beq C4Write3
+            cmp x13, #4
+            beq C4Write4
+            cmp x13, #5
+            beq C4Write5
+            cmp x13, #6
+            beq C4Write6
+            cmp x13, #7
+            beq C4Write7
+            b C4Write8
+        C4Write1:
+            str s8, [x11], #4
+            cmp x6, #1
+            beq WriteEnd
+            str s10, [x11], #4
+            cmp x6, #2
+            beq WriteEnd
+            str s12, [x11], #4
+            cmp x6, #3
+            beq WriteEnd
+            str s14, [x11], #4
+            b WriteEnd
+        C4Write2:
+            st1 {v8.2s}, [x11], #8
+            cmp x6, #1
+            beq WriteEnd
+            st1 {v10.2s}, [x11], #8
+            cmp x6, #2
+            beq WriteEnd
+            st1 {v12.2s}, [x11], #8
+            cmp x6, #3
+            beq WriteEnd
+            st1 {v14.2s}, [x11], #8
+            b WriteEnd
+        C4Write3:
+            add x19, x11, #8
+            st1 {v8.2s}, [x11]
+            add x11, x11, #12
+            st1 {v8.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #1
+            beq WriteEnd
+            st1 {v10.2s}, [x11]
+            add x11, x11, #12
+            st1 {v10.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #2
+            beq WriteEnd
+            st1 {v12.2s}, [x11]
+            add x11, x11, #12
+            st1 {v12.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #3
+            beq WriteEnd
+            st1 {v14.2s}, [x11]
+            st1 {v14.s}[2], [x19]
+            b WriteEnd
+        C4Write4:
+            st1 {v8.4s}, [x11], #16
+            cmp x6, #1
+            beq WriteEnd
+            st1 {v10.4s}, [x11], #16
+            cmp x6, #2
+            beq WriteEnd
+            st1 {v12.4s}, [x11], #16
+            cmp x6, #3
+            beq WriteEnd
+            st1 {v14.4s}, [x11], #16
+            b WriteEnd
+        C4Write5:
+            add x19, x11, #16
+            st1 {v8.4s}, [x11]
+            add x11, x11, #20
+            str s9, [x19]
+            add x19, x19, #20
+            cmp x6, #1
+            beq WriteEnd
+            st1 {v10.4s}, [x11]
+            add x11, x11, #20
+            str s11, [x19]
+            add x19, x19, #20
+            cmp x6, #2
+            beq WriteEnd
+            st1 {v12.4s}, [x11]
+            add x11, x11, #20
+            str s13, [x19]
+            add x19, x19, #20
+            cmp x6, #3
+            beq WriteEnd
+            st1 {v14.4s}, [x11]
+            str s15, [x19]
+            b WriteEnd
+        C4Write6:
+            add x19, x11, #16
+            st1 {v8.4s}, [x11]
+            add x11, x11, #24
+            st1 {v9.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #1
+            beq WriteEnd
+            st1 {v10.4s}, [x11]
+            add x11, x11, #24
+            st1 {v11.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #2
+            beq WriteEnd
+            st1 {v12.4s}, [x11]
+            add x11, x11, #24
+            st1 {v13.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #3
+            beq WriteEnd
+            st1 {v14.4s}, [x11]
+            st1 {v15.2s}, [x19]
+            b WriteEnd
+        C4Write7:
+            add x19, x11, #16
+            add x16, x11, #24
+            mov x10, #28
+            st1 {v8.4s}, [x11], x10
+            st1 {v9.2s}, [x19], x10
+            st1 {v9.s}[2], [x16], x10
+            cmp x6, #1
+            beq WriteEnd
+            st1 {v10.4s}, [x11], x10
+            st1 {v11.2s}, [x19], x10
+            st1 {v11.s}[2], [x16], x10
+            cmp x6, #2
+            beq WriteEnd
+            st1 {v12.4s}, [x11], x10
+            st1 {v13.2s}, [x19], x10
+            st1 {v13.s}[2], [x16], x10
+            cmp x6, #3
+            beq WriteEnd
+            st1 {v14.4s}, [x11], x10
+            st1 {v15.2s}, [x19], x10
+            st1 {v15.s}[2], [x16], x10
+            b WriteEnd
+        C4Write8:
+            add x19, x11, x8
+            add x20, x19, x8
+            st1 {v8.4s}, [x11], #16
+            st1 {v9.4s}, [x19], #16
+            cmp x6, #1
+            beq WriteEnd
+            st1 {v10.4s}, [x11], #16
+            st1 {v11.4s}, [x19], #16
+            cmp x6, #2
+            beq WriteEnd
+            st1 {v12.4s}, [x11], #16
+            st1 {v13.4s}, [x19], #16
+            cmp x6, #3
+            beq WriteEnd
+            st1 {v14.4s}, [x11], #16
+            st1 {v15.4s}, [x19], #16
         WriteEnd:
             subs x13, x13, #8 // rhs col - 8
             bgt LoopCol4
@@ -378,11 +568,16 @@ LoopRow4:
 LoopColEnd:
         add x0, x0, x17
         cbz x9, C8DstStep
+        cmp x9, #3
+        beq C4DstStep
         mov x21, #4
         mul x21, x21, x7
         sub x11, x11, x21
         mov x2, x11
         b NoDstStep
+    C4DstStep:
+        add x2, x2, x18
+        b NoDstStep
     C8DstStep:
         add x2, x2, #384
         mov x11, x2
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow8.S b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow8.S
index c59a2f78ef9..c6dc3191259 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow8.S
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm64/MatmulFp32OptRow8.S
@@ -29,10 +29,27 @@ asm_function MatmulFloatNeon64OptRow8
 
     mov x21, #48 // sizeof(float) * 12
     mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth
+    cmp x9, #3 // c4
+    beq C4Stride
     cbnz x9, NoC8Steps
     mov x11, x2
     mov x21, #32
     mul x16, x6, x21 // row * 8 * sizeof(float)
+    b NoC8Steps
+C4Stride:
+    mov x18, #32 // 8 * sizeof(float)
+    mov x22, #4
+    mul x8, x8, x22 // stride * sizeof(float), in c4 stride == row
+    mul x8, x8, x22 // col stride
+    // col >= 4 , block stride 128, otherwise 8 * 4 * col
+    cmp x7, #4
+    bge C4StrideCommon
+    mul x18, x18, x7 // block stride
+    b LoopRowStart
+C4StrideCommon:
+    mov x18, #128 // block stride
+    b LoopRowStart
+
 NoC8Steps:
     cmp x9, #2
     bne NoWinoSteps
@@ -45,6 +62,10 @@ NoWinoSteps:
     mov x21, #4
     mul x8, x8, x21
 
+LoopRowStart:
+    cmp x9, #3
+    bne LoopRow8
+    mov x20, x2
 LoopRow8:
     mov x14, x1 // reload rhs ptr
     mov x13, x7 // reload rhs col
@@ -52,7 +73,12 @@ LoopRow8:
 
     LoopCol8:
         cbz x9, NoReloadDst8
+        cmp x9, #3
+        beq C4ReloadDst8
         mov x11, x2
+        b NoReloadDst8
+    C4ReloadDst8:
+        mov x11, x20
     NoReloadDst8:
         mov x10, x0 // reload lhs ptr
         mov x19, x5 // reload depth
@@ -254,6 +280,8 @@ LoopRow8:
         Write:
             cmp x9, #2
             beq WriteWino
+            cmp x9, #3
+            beq WriteC4
             cbz x9, WriteC8
             cmp x13, #1
             beq Write1
@@ -557,7 +585,312 @@ LoopRow8:
             beq WriteEnd
             st1 {v22.4s, v23.4s}, [x11], x8
             add x11, x11, #32
-
+            b WriteEnd
+        WriteC4:
+            cmp x13, #1
+            beq C4Write1
+            cmp x13, #2
+            beq C4Write2
+            cmp x13, #3
+            beq C4Write3
+            cmp x13, #4
+            beq C4Write4
+            cmp x13, #5
+            beq C4Write5
+            cmp x13, #6
+            beq C4Write6
+            cmp x13, #7
+            beq C4Write7
+            b C4Write8
+        C4Write1:
+            str s8, [x11], #4
+            cmp x6, #1
+            beq WriteEnd
+            str s10, [x11], #4
+            cmp x6, #2
+            beq WriteEnd
+            str s12, [x11], #4
+            cmp x6, #3
+            beq WriteEnd
+            str s14, [x11], #4
+            cmp x6, #4
+            beq WriteEnd
+            str s16, [x11], #4
+            cmp x6, #5
+            beq WriteEnd
+            str s18, [x11], #4
+            cmp x6, #6
+            beq WriteEnd
+            str s20, [x11], #4
+            cmp x6, #7
+            beq WriteEnd
+            str s22, [x11], #4
+            b WriteEnd
+        C4Write2:
+            st1 {v8.2s}, [x11], #8
+            cmp x6, #1
+            beq WriteEnd
+            st1 {v10.2s}, [x11], #8
+            cmp x6, #2
+            beq WriteEnd
+            st1 {v12.2s}, [x11], #8
+            cmp x6, #3
+            beq WriteEnd
+            st1 {v14.2s}, [x11], #8
+            cmp x6, #4
+            beq WriteEnd
+            st1 {v16.2s}, [x11], #8
+            cmp x6, #5
+            beq WriteEnd
+            st1 {v18.2s}, [x11], #8
+            cmp x6, #6
+            beq WriteEnd
+            st1 {v20.2s}, [x11], #8
+            cmp x6, #7
+            beq WriteEnd
+            st1 {v22.2s}, [x11], #8
+            b WriteEnd
+        C4Write3:
+            add x19, x11, #8
+            st1 {v8.2s}, [x11]
+            add x11, x11, #12
+            st1 {v8.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #1
+            beq WriteEnd
+            st1 {v10.2s}, [x11]
+            add x11, x11, #12
+            st1 {v10.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #2
+            beq WriteEnd
+            st1 {v12.2s}, [x11]
+            add x11, x11, #12
+            st1 {v12.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #3
+            beq WriteEnd
+            st1 {v14.2s}, [x11]
+            add x11, x11, #12
+            st1 {v14.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #4
+            beq WriteEnd
+            st1 {v16.2s}, [x11]
+            add x11, x11, #12
+            st1 {v16.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #5
+            beq WriteEnd
+            st1 {v18.2s}, [x11]
+            add x11, x11, #12
+            st1 {v18.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #6
+            beq WriteEnd
+            st1 {v20.2s}, [x11]
+            add x11, x11, #12
+            st1 {v20.s}[2], [x19]
+            add x19, x19, #12
+            cmp x6, #7
+            beq WriteEnd
+            st1 {v22.2s}, [x11]
+            st1 {v22.s}[2], [x19]
+            b WriteEnd
+        C4Write4:
+            st1 {v8.4s}, [x11], #16
+            cmp x6, #1
+            beq WriteEnd
+            st1 {v10.4s}, [x11], #16
+            cmp x6, #2
+            beq WriteEnd
+            st1 {v12.4s}, [x11], #16
+            cmp x6, #3
+            beq WriteEnd
+            st1 {v14.4s}, [x11], #16
+            cmp x6, #4
+            beq WriteEnd
+            st1 {v16.4s}, [x11], #16
+            cmp x6, #5
+            beq WriteEnd
+            st1 {v18.4s}, [x11], #16
+            cmp x6, #6
+            beq WriteEnd
+            st1 {v20.4s}, [x11], #16
+            cmp x6, #7
+            beq WriteEnd
+            st1 {v22.4s}, [x11], #16
+            b WriteEnd
+        C4Write5:
+            add x19, x11, #16
+            st1 {v8.4s}, [x11]
+            add x11, x11, #20
+            str s9, [x19]
+            add x19, x19, #20
+            cmp x6, #1
+            beq WriteEnd
+            st1 {v10.4s}, [x11]
+            add x11, x11, #20
+            str s11, [x19]
+            add x19, x19, #20
+            cmp x6, #2
+            beq WriteEnd
+            st1 {v12.4s}, [x11]
+            add x11, x11, #20
+            str s13, [x19]
+            add x19, x19, #20
+            cmp x6, #3
+            beq WriteEnd
+            st1 {v14.4s}, [x11]
+            add x11, x11, #20
+            str s15, [x19]
+            add x19, x19, #20
+            cmp x6, #4
+            beq WriteEnd
+            st1 {v16.4s}, [x11]
+            add x11, x11, #20
+            str s17, [x19]
+            add x19, x19, #20
+            cmp x6, #5
+            beq WriteEnd
+            st1 {v18.4s}, [x11]
+            add x11, x11, #20
+            str s19, [x19]
+            add x19, x19, #20
+            cmp x6, #6
+            beq WriteEnd
+            st1 {v20.4s}, [x11]
+            add x11, x11, #20
+            str s21, [x19]
+            add x19, x19, #20
+            cmp x6, #7
+            beq WriteEnd
+            st1 {v22.4s}, [x11]
+            str s23, [x19]
+            b WriteEnd
+        C4Write6:
+            add x19, x11, #16
+            st1 {v8.4s}, [x11]
+            add x11, x11, #24
+            st1 {v9.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #1
+            beq WriteEnd
+            st1 {v10.4s}, [x11]
+            add x11, x11, #24
+            st1 {v11.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #2
+            beq WriteEnd
+            st1 {v12.4s}, [x11]
+            add x11, x11, #24
+            st1 {v13.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #3
+            beq WriteEnd
+            st1 {v14.4s}, [x11]
+            add x11, x11, #24
+            st1 {v15.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #4
+            beq WriteEnd
+            st1 {v16.4s}, [x11]
+            add x11, x11, #24
+            st1 {v17.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #5
+            beq WriteEnd
+            st1 {v18.4s}, [x11]
+            add x11, x11, #24
+            st1 {v19.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #6
+            beq WriteEnd
+            st1 {v20.4s}, [x11]
+            add x11, x11, #24
+            st1 {v21.2s}, [x19]
+            add x19, x19, #24
+            cmp x6, #7
+            beq WriteEnd
+            st1 {v22.4s}, [x11]
+            st1 {v23.2s}, [x19]
+            b WriteEnd
+        C4Write7:
+            add x19, x11, #16
+            add x16, x11, #24
+            mov x10, #28
+            st1 {v8.4s}, [x11], x10
+            st1 {v9.2s}, [x19], x10
+            st1 {v9.s}[2], [x16], x10
+            cmp x6, #1
+            beq WriteEnd
+            st1 {v10.4s}, [x11], x10
+            st1 {v11.2s}, [x19], x10
+            st1 {v11.s}[2], [x16], x10
+            cmp x6, #2
+            beq WriteEnd
+            st1 {v12.4s}, [x11], x10
+            st1 {v13.2s}, [x19], x10
+            st1 {v13.s}[2], [x16], x10
+            cmp x6, #3
+            beq WriteEnd
+            st1 {v14.4s}, [x11], x10
+            st1 {v15.2s}, [x19], x10
+            st1 {v15.s}[2], [x16], x10
+            cmp x6, #4
+            beq WriteEnd
+            st1 {v16.4s}, [x11], x10
+            st1 {v17.2s}, [x19], x10
+            st1 {v17.s}[2], [x16], x10
+            cmp x6, #5
+            beq WriteEnd
+            st1 {v18.4s}, [x11], x10
+            st1 {v19.2s}, [x19], x10
+            st1 {v19.s}[2], [x16], x10
+            cmp x6, #6
+            beq WriteEnd
+            st1 {v20.4s}, [x11], x10
+            st1 {v21.2s}, [x19], x10
+            st1 {v21.s}[2], [x16], x10
+            cmp x6, #7
+            beq WriteEnd
+            st1 {v22.4s}, [x11], x10
+            st1 {v23.2s}, [x19], x10
+            st1 {v23.s}[2], [x16], x10
+            b WriteEnd
+        C4Write8:
+            add x19, x11, x8
+            add x20, x19, x8
+            st1 {v8.4s}, [x11], #16
+            st1 {v9.4s}, [x19], #16
+            cmp x6, #1
+            beq WriteEnd
+            st1 {v10.4s}, [x11], #16
+            st1 {v11.4s}, [x19], #16
+            cmp x6, #2
+            beq WriteEnd
+            st1 {v12.4s}, [x11], #16
+            st1 {v13.4s}, [x19], #16
+            cmp x6, #3
+            beq WriteEnd
+            st1 {v14.4s}, [x11], #16
+            st1 {v15.4s}, [x19], #16
+            cmp x6, #4
+            beq WriteEnd
+            st1 {v16.4s}, [x11], #16
+            st1 {v17.4s}, [x19], #16
+            cmp x6, #5
+            beq WriteEnd
+            st1 {v18.4s}, [x11], #16
+            st1 {v19.4s}, [x19], #16
+            cmp x6, #6
+            beq WriteEnd
+            st1 {v20.4s}, [x11], #16
+            st1 {v21.4s}, [x19], #16
+            cmp x6, #7
+            beq WriteEnd
+            st1 {v22.4s}, [x11], #16
+            st1 {v23.4s}, [x19], #16
         WriteEnd:
             subs x13, x13, #8 // rhs col - 8
             bgt LoopCol8
@@ -565,11 +898,16 @@ LoopRow8:
 LoopColEnd:
         add x0, x0, x17
         cbz x9, C8DstStep
+        cmp x9, #3
+        beq C4DstStep
         mov x21, #4
         mul x21, x21, x7
         sub x11, x11, x21
         mov x2, x11
         b NoDstStep
+    C4DstStep:
+        add x2, x2, x18
+        b NoDstStep
     C8DstStep:
         add x2, x2, #384
         mov x11, x2
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/gather_base.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/gather_base.c
index 786d7130528..3888270cccd 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/gather_base.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/gather_base.c
@@ -29,12 +29,14 @@ int Gather(const void *input, int outer_size, int inner_size, int limit, const i
     int8_t *int8_out_m = int8_out + inner_size * m * indices_element_size * data_size;
 
     for (int i = 0; i < indices_element_size; ++i) {
-      if (indices[i] < 0 || indices[i] >= limit) {
-        printf("[ERROR] [%s:%d] %s] indices[%d]:%d is out of range [%d, %d)\n", __FILE__, __LINE__, __func__, i,
-               indices[i], 0, limit);
+      int index = indices[i];
+      if (index < -limit || indices[i] >= limit) {
         return NNACL_ERR;
       }
-      memcpy(int8_out_m + i * inner_size * data_size, int8_in_m + indices[i] * inner_size * data_size,
+      if (indices[i] < 0) {
+        index = limit + indices[i];
+      }
+      memcpy(int8_out_m + i * inner_size * data_size, int8_in_m + index * inner_size * data_size,
              data_size * inner_size);
     }
   }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/slice_base.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/slice_base.c
index 5773c6d74c7..f8e536ad504 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/slice_base.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/slice_base.c
@@ -43,7 +43,7 @@ void PadSliceParameterTo8D(SliceParameter *param) {
   param->param_length_ = DIMENSION_8D;
 }
 
-void DoSlice(const void *input, void *output, SliceParameter *param, int thread_id, int data_size) {
+void DoSlice(const void *input, void *output, const SliceParameter *param, int thread_id, int data_size) {
   int8_t *int8_in = (int8_t *)input;
   int8_t *int8_out = (int8_t *)output;
 
@@ -94,14 +94,14 @@ void DoSlice(const void *input, void *output, SliceParameter *param, int thread_
   }
 }
 
-static bool WhetherCopyByAxis(int begin[], int end[], const int shape[], int dim) {
+static bool WhetherCopyByAxis(const int begin[], const int end[], const int shape[], int dim) {
   for (int i = dim + 1; i < DIMENSION_8D; ++i) {
     if (begin[i] != 0 || end[i] != shape[i]) return false;
   }
   return true;
 }
 
-void DoSliceNoParallel(const void *input, void *output, SliceParameter *param, int data_size) {
+void DoSliceNoParallel(const void *input, void *output, const SliceParameter *param, int data_size) {
   int8_t *int8_in = (int8_t *)input;
   int8_t *int8_out = (int8_t *)output;
 
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/slice_base.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/slice_base.h
index bfe91f02da6..4c11ff2f57a 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/slice_base.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/slice_base.h
@@ -25,8 +25,8 @@ extern "C" {
 #endif
 void PadSliceParameterTo8D(SliceParameter *param);
 
-void DoSlice(const void *input, void *output, SliceParameter *param, int thread_id, int data_size);
-void DoSliceNoParallel(const void *input, void *output, SliceParameter *param, int data_size);
+void DoSlice(const void *input, void *output, const SliceParameter *param, int thread_id, int data_size);
+void DoSliceNoParallel(const void *input, void *output, const SliceParameter *param, int data_size);
 #ifdef __cplusplus
 }
 #endif
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/split_base.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/split_base.c
index 9f7f70bab58..f822b94155b 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/split_base.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/split_base.c
@@ -20,12 +20,12 @@
 #include "nnacl/errorcode.h"
 
 int DoSplit(void *in_data, void **out_data, const int *input_shape, int offset, int num_unit,
-            SplitParameter *split_param, int data_size) {
+            const SplitParameter *split_param, int data_size) {
   int8_t *int8_in = (int8_t *)in_data;
 
   int num_split = split_param->num_split_;
   int *split_sizes = split_param->split_sizes_;
-  int *strides = split_param->strides_;
+  const int *strides = split_param->strides_;
   int split_dim = split_param->split_dim_;
   int in_stride = strides[split_dim];
 
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/split_base.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/split_base.h
index c6b554ae6a9..5f497f20b8a 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/split_base.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/split_base.h
@@ -24,7 +24,7 @@
 extern "C" {
 #endif
 int DoSplit(void *in_data, void **out_data, const int *input_shape, int offset, int num_unit,
-            SplitParameter *split_param, int data_size);
+            const SplitParameter *split_param, int data_size);
 #ifdef __cplusplus
 }
 #endif
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/split_with_over_lap_base.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/split_with_over_lap_base.c
index 0426bac74f8..012894c9d6a 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/split_with_over_lap_base.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/split_with_over_lap_base.c
@@ -18,7 +18,7 @@
 #include <string.h>
 #include "nnacl/errorcode.h"
 
-int DoSplitWithOverlapParallel(char *in_data, char **out_data, int slice_idx, SplitWithOverlapParameter *param,
+int DoSplitWithOverlapParallel(char *in_data, char **out_data, int slice_idx, const SplitWithOverlapParameter *param,
                                const int *start_indices, const int *end_indices) {
   if (in_data == NULL || out_data == NULL) {
     return NNACL_NULL_PTR;
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/split_with_over_lap_base.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/split_with_over_lap_base.h
index 2bd32cc9c8d..fe236160ae4 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/split_with_over_lap_base.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/split_with_over_lap_base.h
@@ -23,7 +23,7 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-int DoSplitWithOverlapParallel(char *in_data, char **out_data, int slice_idx, SplitWithOverlapParameter *param,
+int DoSplitWithOverlapParallel(char *in_data, char **out_data, int slice_idx, const SplitWithOverlapParameter *param,
                                const int *start_indices, const int *end_indices);
 #ifdef __cplusplus
 }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/unstack_base.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/unstack_base.c
index d0e5a25fb7e..57ea8c5891e 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/unstack_base.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/unstack_base.c
@@ -16,7 +16,7 @@
 
 #include "nnacl/base/unstack_base.h"
 
-void Unstack(const void *input, void **output, UnstackParameter *para, int data_size) {
+void Unstack(const void *input, void **output, const UnstackParameter *para, int data_size) {
   const int8_t *in_addr = (int8_t *)input;
   for (int j = 0; j < para->num_; j++) {
     int8_t *out_addr = (int8_t *)output[j];
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/unstack_base.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/unstack_base.h
index d4915a4823c..df6ba0c7a0a 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/unstack_base.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/unstack_base.h
@@ -24,7 +24,7 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-void Unstack(const void *input, void **output, UnstackParameter *para, int data_size);
+void Unstack(const void *input, void **output, const UnstackParameter *para, int data_size);
 #ifdef __cplusplus
 }
 #endif
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/conv_parameter.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/conv_parameter.h
index 450db005b76..2946e6878aa 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/conv_parameter.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/conv_parameter.h
@@ -54,6 +54,7 @@ typedef struct ConvParameter {
   int channel_multiplie_;
   int output_padding_w_;
   int output_padding_h_;
+  int out_format_;
 } ConvParameter;
 
 typedef struct SlidingWindowParam {
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/arithmetic_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/arithmetic_fp16.c
index 2e8f301e543..8e6f86b7c7a 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/arithmetic_fp16.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/arithmetic_fp16.c
@@ -69,7 +69,7 @@ int ElementMulFp16(const float16_t *input0, const float16_t *input1, float16_t *
 }
 
 int ElementOptMulFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                      ArithmeticParameter *param) {
+                      const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
@@ -123,7 +123,7 @@ int ElementMulReluFp16(const float16_t *input0, const float16_t *input1, float16
 }
 
 int ElementOptMulReluFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                          ArithmeticParameter *param) {
+                          const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
@@ -180,7 +180,7 @@ int ElementMulRelu6Fp16(const float16_t *input0, const float16_t *input1, float1
 }
 
 int ElementOptMulRelu6Fp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                           ArithmeticParameter *param) {
+                           const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
@@ -239,7 +239,7 @@ int ElementAddFp16(const float16_t *input0, const float16_t *input1, float16_t *
 }
 
 int ElementOptAddFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                      ArithmeticParameter *param) {
+                      const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
@@ -299,7 +299,7 @@ int ElementAddReluFp16(const float16_t *input0, const float16_t *input1, float16
 }
 
 int ElementOptAddReluFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                          ArithmeticParameter *param) {
+                          const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
@@ -365,7 +365,7 @@ int ElementAddRelu6Fp16(const float16_t *input0, const float16_t *input1, float1
 }
 
 int ElementOptAddRelu6Fp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                           ArithmeticParameter *param) {
+                           const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
@@ -418,7 +418,7 @@ int ElementSubFp16(const float16_t *input0, const float16_t *input1, float16_t *
 }
 
 int ElementOptSubFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                      ArithmeticParameter *param) {
+                      const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
@@ -470,7 +470,7 @@ int ElementSubReluFp16(const float16_t *input0, const float16_t *input1, float16
 }
 
 int ElementOptSubReluFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                          ArithmeticParameter *param) {
+                          const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
@@ -527,7 +527,7 @@ int ElementSubRelu6Fp16(const float16_t *input0, const float16_t *input1, float1
 }
 
 int ElementOptSubRelu6Fp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                           ArithmeticParameter *param) {
+                           const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
@@ -581,7 +581,7 @@ int ElementDivFp16(const float16_t *input0, const float16_t *input1, float16_t *
 }
 
 int ElementOptDivFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                      ArithmeticParameter *param) {
+                      const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
@@ -641,7 +641,7 @@ int ElementDivReluFp16(const float16_t *input0, const float16_t *input1, float16
 }
 
 int ElementOptDivReluFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                          ArithmeticParameter *param) {
+                          const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
@@ -704,7 +704,7 @@ int ElementDivRelu6Fp16(const float16_t *input0, const float16_t *input1, float1
 }
 
 int ElementOptDivRelu6Fp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                           ArithmeticParameter *param) {
+                           const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
@@ -755,7 +755,7 @@ int ElementFloorModFp16(const float16_t *input0, const float16_t *input1, float1
 }
 
 int ElementOptFloorModFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                           ArithmeticParameter *param) {
+                           const ArithmeticParameter *param) {
   if (param->in_elements_num1_ == 1) {
     for (int i = 0; i < element_size; ++i) {
       NNACL_ASSERT(input1[0] != 0);
@@ -778,7 +778,7 @@ int ElementFloorDivFp16(const float16_t *input0, const float16_t *input1, float1
   return NNACL_OK;
 }
 int ElementOptFloorDivFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                           ArithmeticParameter *param) {
+                           const ArithmeticParameter *param) {
   if (param->in_elements_num1_ == 1) {
     for (int i = 0; i < element_size; ++i) {
       NNACL_ASSERT(input1[0] != 0);
@@ -814,7 +814,7 @@ int ElementLogicalAndFp16(const float16_t *input0, const float16_t *input1, floa
 }
 
 int ElementOptLogicalAndFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                             ArithmeticParameter *param) {
+                             const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
@@ -875,7 +875,7 @@ int ElementLogicalOrFp16(const float16_t *input0, const float16_t *input1, float
 }
 
 int ElementOptLogicalOrFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                            ArithmeticParameter *param) {
+                            const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
@@ -922,7 +922,7 @@ int ElementSquaredDifferenceFp16(const float16_t *input0, const float16_t *input
 }
 
 int ElementOptSquaredDifferenceFp16(const float16_t *input0, const float16_t *input1, float16_t *output,
-                                    int element_size, ArithmeticParameter *param) {
+                                    int element_size, const ArithmeticParameter *param) {
   ElementOptSubFp16(input0, input1, output, element_size, param);
   return ElementMulFp16(output, output, output, element_size);
 }
@@ -944,7 +944,7 @@ int ElementMaximumFp16(const float16_t *input0, const float16_t *input1, float16
 }
 
 int ElementOptMaximumFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                          ArithmeticParameter *param) {
+                          const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
@@ -993,7 +993,7 @@ int ElementMinimumFp16(const float16_t *input0, const float16_t *input1, float16
 }
 
 int ElementOptMinimumFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                          ArithmeticParameter *param) {
+                          const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
@@ -1042,7 +1042,7 @@ int ElementNotEqualFp16(const float16_t *input0, const float16_t *input1, uint8_
 }
 
 int ElementOptNotEqualFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size,
-                           ArithmeticParameter *param) {
+                           const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
@@ -1091,7 +1091,7 @@ int ElementEqualFp16(const float16_t *input0, const float16_t *input1, uint8_t *
 }
 
 int ElementOptEqualFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size,
-                        ArithmeticParameter *param) {
+                        const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
@@ -1140,7 +1140,7 @@ int ElementLessFp16(const float16_t *input0, const float16_t *input1, uint8_t *o
 }
 
 int ElementOptLessFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size,
-                       ArithmeticParameter *param) {
+                       const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
@@ -1189,7 +1189,7 @@ int ElementLessEqualFp16(const float16_t *input0, const float16_t *input1, uint8
 }
 
 int ElementOptLessEqualFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size,
-                            ArithmeticParameter *param) {
+                            const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
@@ -1238,7 +1238,7 @@ int ElementGreaterFp16(const float16_t *input0, const float16_t *input1, uint8_t
 }
 
 int ElementOptGreaterFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size,
-                          ArithmeticParameter *param) {
+                          const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
@@ -1287,7 +1287,7 @@ int ElementGreaterEqualFp16(const float16_t *input0, const float16_t *input1, ui
 }
 
 int ElementOptGreaterEqualFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size,
-                               ArithmeticParameter *param) {
+                               const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/arithmetic_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/arithmetic_fp16.h
index 813e48c7079..e8a629c389a 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/arithmetic_fp16.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/arithmetic_fp16.h
@@ -31,55 +31,55 @@ void TileDimensionsFp16(const float16_t *data0, const float16_t *data1, float16_
                         ArithmeticParameter *param);
 
 int ElementOptMulFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                      ArithmeticParameter *param);
+                      const ArithmeticParameter *param);
 int ElementOptMulReluFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                          ArithmeticParameter *param);
+                          const ArithmeticParameter *param);
 int ElementOptMulRelu6Fp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                           ArithmeticParameter *param);
+                           const ArithmeticParameter *param);
 int ElementOptAddFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                      ArithmeticParameter *param);
+                      const ArithmeticParameter *param);
 int ElementOptAddReluFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                          ArithmeticParameter *param);
+                          const ArithmeticParameter *param);
 int ElementOptAddRelu6Fp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                           ArithmeticParameter *param);
+                           const ArithmeticParameter *param);
 int ElementOptSubFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                      ArithmeticParameter *param);
+                      const ArithmeticParameter *param);
 int ElementOptSubReluFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                          ArithmeticParameter *param);
+                          const ArithmeticParameter *param);
 int ElementOptSubRelu6Fp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                           ArithmeticParameter *param);
+                           const ArithmeticParameter *param);
 int ElementOptDivFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                      ArithmeticParameter *param);
+                      const ArithmeticParameter *param);
 int ElementOptDivReluFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                          ArithmeticParameter *param);
+                          const ArithmeticParameter *param);
 int ElementOptDivRelu6Fp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                           ArithmeticParameter *param);
+                           const ArithmeticParameter *param);
 int ElementOptFloorModFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                           ArithmeticParameter *param);
+                           const ArithmeticParameter *param);
 int ElementOptFloorDivFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                           ArithmeticParameter *param);
+                           const ArithmeticParameter *param);
 int ElementOptLogicalAndFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                             ArithmeticParameter *param);
+                             const ArithmeticParameter *param);
 int ElementOptLogicalOrFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                            ArithmeticParameter *param);
+                            const ArithmeticParameter *param);
 int ElementOptSquaredDifferenceFp16(const float16_t *input0, const float16_t *input1, float16_t *output,
-                                    int element_size, ArithmeticParameter *param);
+                                    int element_size, const ArithmeticParameter *param);
 int ElementOptMaximumFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                          ArithmeticParameter *param);
+                          const ArithmeticParameter *param);
 int ElementOptMinimumFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
-                          ArithmeticParameter *param);
+                          const ArithmeticParameter *param);
 int ElementOptNotEqualFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size,
-                           ArithmeticParameter *param);
+                           const ArithmeticParameter *param);
 int ElementOptEqualFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size,
-                        ArithmeticParameter *param);
+                        const ArithmeticParameter *param);
 int ElementOptLessFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size,
-                       ArithmeticParameter *param);
+                       const ArithmeticParameter *param);
 int ElementOptLessEqualFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size,
-                            ArithmeticParameter *param);
+                            const ArithmeticParameter *param);
 int ElementOptGreaterFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size,
-                          ArithmeticParameter *param);
+                          const ArithmeticParameter *param);
 int ElementOptGreaterEqualFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size,
-                               ArithmeticParameter *param);
+                               const ArithmeticParameter *param);
 
 int ElementMulFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size);
 int ElementMulReluFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/arithmetic_self_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/arithmetic_self_fp16.c
index be3c5f0b0be..55507760ec7 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/arithmetic_self_fp16.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/arithmetic_self_fp16.c
@@ -16,21 +16,21 @@
 #include <math.h>
 #include "nnacl/fp16/arithmetic_self_fp16.h"
 
-int ElementAbsFp16(float16_t *input, float16_t *output, int element_size) {
+int ElementAbsFp16(const float16_t *input, float16_t *output, int element_size) {
   for (int i = 0; i < element_size; i++) {
     output[i] = fabsf(input[i]);
   }
   return NNACL_OK;
 }
 
-int ElementCosFp16(float16_t *input, float16_t *output, int element_size) {
+int ElementCosFp16(const float16_t *input, float16_t *output, int element_size) {
   for (int i = 0; i < element_size; i++) {
     output[i] = cosf(input[i]);
   }
   return NNACL_OK;
 }
 
-int ElementLogFp16(float16_t *input, float16_t *output, int element_size) {
+int ElementLogFp16(const float16_t *input, float16_t *output, int element_size) {
   for (int i = 0; i < element_size; i++) {
     if (input[i] <= 0) {
       return NNACL_ERRCODE_LOG_NEGATIVE_OR_ZERO;
@@ -40,14 +40,14 @@ int ElementLogFp16(float16_t *input, float16_t *output, int element_size) {
   return NNACL_OK;
 }
 
-int ElementSquareFp16(float16_t *input, float16_t *output, int element_size) {
+int ElementSquareFp16(const float16_t *input, float16_t *output, int element_size) {
   for (int i = 0; i < element_size; i++) {
     output[i] = input[i] * input[i];
   }
   return NNACL_OK;
 }
 
-int ElementSqrtFp16(float16_t *input, float16_t *output, int element_size) {
+int ElementSqrtFp16(const float16_t *input, float16_t *output, int element_size) {
   for (int i = 0; i < element_size; i++) {
     if (input[i] < 0) {
       return NNACL_ERRCODE_SQRT_NEGATIVE;
@@ -57,56 +57,56 @@ int ElementSqrtFp16(float16_t *input, float16_t *output, int element_size) {
   return NNACL_OK;
 }
 
-int ElementRsqrtFp16(float16_t *input, float16_t *output, int element_size) {
+int ElementRsqrtFp16(const float16_t *input, float16_t *output, int element_size) {
   for (int i = 0; i < element_size; i++) {
     output[i] = 1.f / sqrtf(input[i]);
   }
   return NNACL_OK;
 }
 
-int ElementSinFp16(float16_t *input, float16_t *output, int element_size) {
+int ElementSinFp16(const float16_t *input, float16_t *output, int element_size) {
   for (int i = 0; i < element_size; i++) {
     output[i] = sinf(input[i]);
   }
   return NNACL_OK;
 }
 
-int ElementLogicalNotFp16(float16_t *input, float16_t *output, int element_size) {
+int ElementLogicalNotFp16(const float16_t *input, float16_t *output, int element_size) {
   for (int i = 0; i < element_size; i++) {
     output[i] = (float)(!((bool)(input[i])));
   }
   return NNACL_OK;
 }
 
-int ElementRoundFp16(float16_t *input, float16_t *output, int element_size) {
+int ElementRoundFp16(const float16_t *input, float16_t *output, int element_size) {
   for (int i = 0; i < element_size; i++) {
     output[i] = roundf(input[i]);
   }
   return NNACL_OK;
 }
 
-int ElementFloorFp16(float16_t *input, float16_t *output, int element_size) {
+int ElementFloorFp16(const float16_t *input, float16_t *output, int element_size) {
   for (int i = 0; i < element_size; i++) {
     output[i] = floorf(input[i]);
   }
   return NNACL_OK;
 }
 
-int ElementCeilFp16(float16_t *input, float16_t *output, int number) {
+int ElementCeilFp16(const float16_t *input, float16_t *output, int number) {
   for (int i = 0; i < number; ++i) {
     output[i] = ceilf(input[i]);
   }
   return NNACL_OK;
 }
 
-int ElementNegativeFp16(float16_t *input, float16_t *output, int element_size) {
+int ElementNegativeFp16(const float16_t *input, float16_t *output, int element_size) {
   for (int i = 0; i < element_size; ++i) {
     output[i] = -input[i];
   }
   return NNACL_OK;
 }
 
-int ElementReciprocalFp16(float16_t *input, float16_t *output, int element_size) {
+int ElementReciprocalFp16(const float16_t *input, float16_t *output, int element_size) {
   for (int i = 0; i < element_size; ++i) {
     if (input[i] == 0.0f) {
       return NNACL_ERR;
@@ -116,7 +116,7 @@ int ElementReciprocalFp16(float16_t *input, float16_t *output, int element_size)
   return NNACL_OK;
 }
 
-int ElementErfFp16(float16_t *input, float16_t *output, int element_size) {
+int ElementErfFp16(const float16_t *input, float16_t *output, int element_size) {
   for (int i = 0; i < element_size; i++) {
     output[i] = erff(input[i]);
   }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/arithmetic_self_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/arithmetic_self_fp16.h
index 58ad411aa29..65c4d171474 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/arithmetic_self_fp16.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/arithmetic_self_fp16.h
@@ -23,33 +23,33 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-int ElementAbsFp16(float16_t *input, float16_t *output, int element_size);
+int ElementAbsFp16(const float16_t *input, float16_t *output, int element_size);
 
-int ElementCosFp16(float16_t *input, float16_t *output, int element_size);
+int ElementCosFp16(const float16_t *input, float16_t *output, int element_size);
 
-int ElementLogFp16(float16_t *input, float16_t *output, int element_size);
+int ElementLogFp16(const float16_t *input, float16_t *output, int element_size);
 
-int ElementSquareFp16(float16_t *input, float16_t *output, int element_size);
+int ElementSquareFp16(const float16_t *input, float16_t *output, int element_size);
 
-int ElementSqrtFp16(float16_t *input, float16_t *output, int element_size);
+int ElementSqrtFp16(const float16_t *input, float16_t *output, int element_size);
 
-int ElementRsqrtFp16(float16_t *input, float16_t *output, int element_size);
+int ElementRsqrtFp16(const float16_t *input, float16_t *output, int element_size);
 
-int ElementSinFp16(float16_t *input, float16_t *output, int element_size);
+int ElementSinFp16(const float16_t *input, float16_t *output, int element_size);
 
-int ElementLogicalNotFp16(float16_t *input, float16_t *output, int element_size);
+int ElementLogicalNotFp16(const float16_t *input, float16_t *output, int element_size);
 
-int ElementRoundFp16(float16_t *input, float16_t *output, int element_size);
+int ElementRoundFp16(const float16_t *input, float16_t *output, int element_size);
 
-int ElementFloorFp16(float16_t *input, float16_t *output, int element_size);
+int ElementFloorFp16(const float16_t *input, float16_t *output, int element_size);
 
-int ElementCeilFp16(float16_t *input, float16_t *output, int number);
+int ElementCeilFp16(const float16_t *input, float16_t *output, int number);
 
-int ElementNegativeFp16(float16_t *input, float16_t *output, int element_size);
+int ElementNegativeFp16(const float16_t *input, float16_t *output, int element_size);
 
-int ElementReciprocalFp16(float16_t *input, float16_t *output, int element_size);
+int ElementReciprocalFp16(const float16_t *input, float16_t *output, int element_size);
 
-int ElementErfFp16(float16_t *input, float16_t *output, int element_size);
+int ElementErfFp16(const float16_t *input, float16_t *output, int element_size);
 #ifdef __cplusplus
 }
 #endif
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/batchnorm_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/batchnorm_fp16.c
index 0395bbaecc6..a2693e2a52e 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/batchnorm_fp16.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/batchnorm_fp16.c
@@ -17,7 +17,7 @@
 #include "nnacl/fp16/batchnorm_fp16.h"
 #include <math.h>
 
-void BatchNormFp16(const float16_t *input, const void *mean, const void *variance, BatchNormParameter *param,
+void BatchNormFp16(const float16_t *input, const void *mean, const void *variance, const BatchNormParameter *param,
                    int task_id, float16_t *output) {
   int units_per_thread = UP_DIV(param->unit_, param->op_parameter_.thread_num_);
   int completed_units = task_id * units_per_thread;
@@ -36,7 +36,7 @@ void BatchNormFp16(const float16_t *input, const void *mean, const void *varianc
 }
 
 void FusedBatchNormFp16(const void *input, const void *scale, const void *offset, const void *mean,
-                        const void *variance, BatchNormParameter *param, int task_id, void *output) {
+                        const void *variance, const BatchNormParameter *param, int task_id, void *output) {
   int units_per_thread = UP_DIV(param->unit_, param->op_parameter_.thread_num_);
   int completed_units = task_id * units_per_thread;
   int cur_unit = MSMIN(units_per_thread, param->unit_ - completed_units);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/batchnorm_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/batchnorm_fp16.h
index 678a1ae6598..bc9450badae 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/batchnorm_fp16.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/batchnorm_fp16.h
@@ -22,10 +22,10 @@
 extern "C" {
 #endif
 
-void BatchNormFp16(const float16_t *input, const void *mean, const void *variance, BatchNormParameter *param,
+void BatchNormFp16(const float16_t *input, const void *mean, const void *variance, const BatchNormParameter *param,
                    int task_id, float16_t *output);
 void FusedBatchNormFp16(const void *input, const void *scale, const void *offset, const void *mean,
-                        const void *variance, BatchNormParameter *param, int task_id, void *output);
+                        const void *variance, const BatchNormParameter *param, int task_id, void *output);
 void FusedBatchNormFp16MeanVar(const float16_t *input, float16_t *run_mean, float16_t *run_var,
                                const BatchNormParameter *param, float16_t *save_mean, float16_t *save_var);
 #ifdef __cplusplus
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/conv_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/conv_fp16.c
index 01fbe9e2fc5..02ede43d008 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/conv_fp16.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/conv_fp16.c
@@ -20,8 +20,9 @@
 #include "nnacl/fp16/matmul_fp16.h"
 
 // fp16 convolution common (im2col+gemm)
-void ConvFp16(float16_t *input_data, float16_t *packed_input, float16_t *packed_weight, float16_t *bias_data,
-              float16_t *col_major_input, float16_t *output_data, int task_id, ConvParameter *conv_param) {
+void ConvFp16(const float16_t *input_data, float16_t *packed_input, const float16_t *packed_weight,
+              const float16_t *bias_data, float16_t *col_major_input, float16_t *output_data, int task_id,
+              const ConvParameter *conv_param) {
 #ifdef ENABLE_ARM64
   const int tile_n = 16;
 #else
@@ -59,10 +60,55 @@ void ConvFp16(float16_t *input_data, float16_t *packed_input, float16_t *packed_
   }
 }
 
+void ConvOutNc8hw8Fp16(const float16_t *input_data, float16_t *packed_input, const float16_t *packed_weight,
+                       const float16_t *bias_data, float16_t *col_major_input, float16_t *output_data, int task_id,
+                       const ConvParameter *conv_param) {
+#ifdef ENABLE_ARM64
+  const int tile_n = 16;
+#else
+  const int tile_n = 12;
+#endif
+  int output_hw = conv_param->output_h_ * conv_param->output_w_;
+  int input_block = UP_DIV(output_hw, tile_n);
+  int block_per_thread = UP_DIV(input_block, conv_param->thread_num_);
+  int start_block = block_per_thread * task_id;
+  int end_block = MSMIN(start_block + block_per_thread, input_block);
+  if (start_block >= end_block) {
+    return;
+  }
+  int weight_block = UP_DIV(conv_param->output_channel_, C8NUM);
+  int deep = conv_param->kernel_h_ * conv_param->kernel_w_ * conv_param->input_channel_;
+  packed_input += deep * tile_n * task_id;
+  col_major_input += deep * tile_n * task_id;
+  size_t input_size = deep * tile_n * sizeof(float16_t);
+
+  for (int b = 0; b < conv_param->input_batch_; b++) {
+    int in_offset = b * conv_param->input_channel_ * conv_param->input_h_ * conv_param->input_w_;
+    for (int i = start_block; i < end_block; i++) {
+      int real_in_row = (i != input_block - 1) ? tile_n : output_hw - i * tile_n;
+      memset(packed_input, 0, input_size);
+      Im2ColPackUnitFp16(input_data + in_offset, conv_param, packed_input, real_in_row, i * tile_n);
+#ifdef ENABLE_ARM64
+      RowMajor2Col16MajorFp16Opt(packed_input, col_major_input, tile_n, deep);
+#else
+      RowMajor2Col12MajorFp16Opt(packed_input, col_major_input, tile_n, deep);
+#endif
+      for (int j = 0; j < weight_block; j++) {
+        int real_weight_row = (j != weight_block - 1) ? C8NUM : conv_param->output_channel_ - j * C8NUM;
+        int weight_offset = j * C8NUM * deep;
+        int bias_offset = j * real_weight_row;
+        int out_offset = j * output_hw * C8NUM + i * tile_n * real_weight_row;
+        MatMulFp16(col_major_input, packed_weight + weight_offset, output_data + out_offset, bias_data + bias_offset,
+                   conv_param->act_type_, deep, real_in_row, real_weight_row, real_weight_row, OutType_Nhwc);
+      }
+    }
+  }
+}
+
 // fp16 convolution winograd
-void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const float16_t *bias_data,
-                      float16_t *output_data, TmpBufferAddressFp16 *buffer_list, int task_id, ConvParameter *conv_param,
-                      InputTransFp16Func in_func, OutputTransFp16Func out_func) {
+void ConvWinogardFp16(const float16_t *input_data, const float16_t *trans_weight, const float16_t *bias_data,
+                      float16_t *output_data, TmpBufferAddressFp16 *buffer_list, int task_id,
+                      const ConvParameter *conv_param, InputTransFp16Func in_func, OutputTransFp16Func out_func) {
 #ifdef ENABLE_ARM64
   const int tile_num = 16;
 #else
@@ -116,8 +162,13 @@ void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const floa
       }
 
       // step 4 : output transform
-      WinogradOutputTransformFp16(gemm_out + task_id * gemm_out_offset, output_data + out_batch_offset, bias_data,
-                                  cal_num, out_tile_index, out_w_block, conv_param, out_func);
+      if (conv_param->out_format_ != NNACL_NC4HW4) {  // nc4hw4
+        WinogradOutputNHWCTransformFp16(gemm_out + task_id * gemm_out_offset, output_data + out_batch_offset, bias_data,
+                                        cal_num, out_tile_index, out_w_block, conv_param, out_func);
+      } else {
+        WinogradOutputNC4HW4TransformFp16(gemm_out + task_id * gemm_out_offset, output_data + out_batch_offset,
+                                          bias_data, cal_num, out_tile_index, out_w_block, conv_param, out_func);
+      }
     }
   }
 }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/conv_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/conv_fp16.h
index 34d97fb75a7..1d7ec585596 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/conv_fp16.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/conv_fp16.h
@@ -29,13 +29,18 @@ extern "C" {
 #endif
 
 // fp16 convolution common (im2col+gemm)
-void ConvFp16(float16_t *input_data, float16_t *packed_input, float16_t *packed_weight, float16_t *bias_data,
-              float16_t *col_major_input, float16_t *output_data, int task_id, ConvParameter *conv_param);
+void ConvFp16(const float16_t *input_data, float16_t *packed_input, const float16_t *packed_weight,
+              const float16_t *bias_data, float16_t *col_major_input, float16_t *output_data, int task_id,
+              const ConvParameter *conv_param);
+
+void ConvOutNc8hw8Fp16(const float16_t *input_data, float16_t *packed_input, const float16_t *packed_weight,
+                       const float16_t *bias_data, float16_t *col_major_input, float16_t *output_data, int task_id,
+                       const ConvParameter *conv_param);
 
 // fp16 convolution winograd
-void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const float16_t *bias_data,
-                      float16_t *output_data, TmpBufferAddressFp16 *buffer_list, int task_id, ConvParameter *conv_param,
-                      InputTransFp16Func in_func, OutputTransFp16Func out_func);
+void ConvWinogardFp16(const float16_t *input_data, const float16_t *trans_weight, const float16_t *bias_data,
+                      float16_t *output_data, TmpBufferAddressFp16 *buffer_list, int task_id,
+                      const ConvParameter *conv_param, InputTransFp16Func in_func, OutputTransFp16Func out_func);
 
 #ifdef __cplusplus
 }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/crop_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/crop_fp16.c
index f014f03a424..7beeac172ca 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/crop_fp16.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/crop_fp16.c
@@ -20,7 +20,7 @@
 
 #include "nnacl/crop_parameter.h"
 
-void Fp16Crop(const float16_t *input, float16_t *output, int task_id, CropParameter *para) {
+void Fp16Crop(const float16_t *input, float16_t *output, int task_id, const CropParameter *para) {
   int input_dim = para->input_dim_;
   switch (input_dim) {
     case 1:
@@ -40,7 +40,7 @@ void Fp16Crop(const float16_t *input, float16_t *output, int task_id, CropParame
   }
 }
 
-void Fp16Crop1D(const float16_t *input, float16_t *output, int task_id, CropParameter *para) {
+void Fp16Crop1D(const float16_t *input, float16_t *output, int task_id, const CropParameter *para) {
   const int out_batch = para->out_shape_[0];
   const int thread_count = para->thread_count_;
   int64_t task_id_stride = thread_count > 1 ? UP_DIV(out_batch, thread_count) : out_batch;
@@ -57,7 +57,7 @@ void Fp16Crop1D(const float16_t *input, float16_t *output, int task_id, CropPara
   memcpy(out_ptr, in_ptr, sizeof(float16_t) * out_dist_stride);
 }
 
-void Fp16Crop2D(const float16_t *input, float16_t *output, int task_id, CropParameter *para) {
+void Fp16Crop2D(const float16_t *input, float16_t *output, int task_id, const CropParameter *para) {
   const int in_height = para->in_shape_[1];
   const int out_batch = para->out_shape_[0];
   const int out_height = para->out_shape_[1];
@@ -79,7 +79,7 @@ void Fp16Crop2D(const float16_t *input, float16_t *output, int task_id, CropPara
   }
 }
 
-void Fp16Crop3D(const float16_t *input, float16_t *output, int task_id, CropParameter *para) {
+void Fp16Crop3D(const float16_t *input, float16_t *output, int task_id, const CropParameter *para) {
   const int in_height = para->in_shape_[1];
   const int in_width = para->in_shape_[2];
 
@@ -113,7 +113,7 @@ void Fp16Crop3D(const float16_t *input, float16_t *output, int task_id, CropPara
   }
 }
 
-void Fp16Crop4D(const float16_t *input, float16_t *output, int task_id, CropParameter *para) {
+void Fp16Crop4D(const float16_t *input, float16_t *output, int task_id, const CropParameter *para) {
   const int in_height = para->in_shape_[1];
   const int in_width = para->in_shape_[2];
   const int in_channel = para->in_shape_[3];
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/crop_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/crop_fp16.h
index 2bae96ca4f4..0186190a493 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/crop_fp16.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/crop_fp16.h
@@ -23,11 +23,11 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-void Fp16Crop(const float16_t *input, float16_t *output, int task_id, CropParameter *para);
-void Fp16Crop1D(const float16_t *input, float16_t *output, int task_id, CropParameter *para);
-void Fp16Crop2D(const float16_t *input, float16_t *output, int task_id, CropParameter *para);
-void Fp16Crop3D(const float16_t *input, float16_t *output, int task_id, CropParameter *para);
-void Fp16Crop4D(const float16_t *input, float16_t *output, int task_id, CropParameter *para);
+void Fp16Crop(const float16_t *input, float16_t *output, int task_id, const CropParameter *para);
+void Fp16Crop1D(const float16_t *input, float16_t *output, int task_id, const CropParameter *para);
+void Fp16Crop2D(const float16_t *input, float16_t *output, int task_id, const CropParameter *para);
+void Fp16Crop3D(const float16_t *input, float16_t *output, int task_id, const CropParameter *para);
+void Fp16Crop4D(const float16_t *input, float16_t *output, int task_id, const CropParameter *para);
 #ifdef __cplusplus
 }
 #endif
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/deconv_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/deconv_fp16.c
index 4ef8f232357..8bb192700aa 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/deconv_fp16.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/deconv_fp16.c
@@ -17,7 +17,7 @@
 #include "nnacl/fp16/deconv_fp16.h"
 
 int DeConvPostFp16(const float16_t *src, float16_t *tmp, const float16_t *bias, float16_t *dst, int output_channel,
-                   ConvParameter *conv_param) {
+                   const ConvParameter *conv_param) {
   /* row8x8-major(ih*iw x oc*kh*kw)  ->  row8-major(oh*ow x oc) */
   size_t input_plane = conv_param->input_w_ * conv_param->input_h_;
   size_t kernel_plane = conv_param->kernel_w_ * conv_param->kernel_h_;
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/deconv_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/deconv_fp16.h
index b1de538b19f..5390238c3b5 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/deconv_fp16.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/deconv_fp16.h
@@ -28,7 +28,7 @@ extern "C" {
 #endif
 
 int DeConvPostFp16(const float16_t *src, float16_t *tmp, const float16_t *bias, float16_t *dst, int output_channel,
-                   ConvParameter *conv_param);
+                   const ConvParameter *conv_param);
 
 #ifdef __cplusplus
 }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/deconv_winograd_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/deconv_winograd_fp16.c
index 3f327b6fe39..044b6ff8643 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/deconv_winograd_fp16.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/deconv_winograd_fp16.c
@@ -17,10 +17,10 @@
 #include "nnacl/fp16/deconv_winograd_fp16.h"
 #include "nnacl/base/minimal_filtering_generator.h"
 
-void DeConvWgInputPackFp16(float16_t *src_ptr, float16_t *dst_ptr, int channel, int stride) {
+void DeConvWgInputPackFp16(const float16_t *src_ptr, float16_t *dst_ptr, int channel, int stride) {
   int ic4div = channel / C4NUM;
   int ic4mod = channel % C4NUM;
-  float16_t *src = src_ptr;
+  const float16_t *src = src_ptr;
   float16_t *dst = dst_ptr;
 
   for (int ic = 0; ic < ic4div; ic++) {
@@ -172,10 +172,10 @@ void DeConvWgMergeFp16(const float16_t *src, float16_t *dst, size_t src_stride,
   return;
 }
 
-void DeConvWgCalWgFp16(float16_t *tile_in, float16_t *tile_out, float16_t *weight_buf, float16_t *tmp_buf,
-                       float16_t *at_buf, float16_t *a_mid_buf, float16_t *trans_a_buf, bool *transferred,
-                       float16_t *bt_buf, float16_t *b_tmp_buf, int unit_size, int w_start, int h_start,
-                       ConvParameter *conv_param, DeConvParam *deconv_param) {
+void DeConvWgCalWgFp16(const float16_t *tile_in, float16_t *tile_out, const float16_t *weight_buf, float16_t *tmp_buf,
+                       const float16_t *at_buf, float16_t *a_mid_buf, float16_t *trans_a_buf, bool *transferred,
+                       const float16_t *bt_buf, float16_t *b_tmp_buf, int unit_size, int w_start, int h_start,
+                       const ConvParameter *conv_param, const DeConvParam *deconv_param) {
   int winograd_plane = unit_size * unit_size;
   if (!transferred[unit_size]) {
     WinogradTransLeftFp16(tile_in, at_buf, a_mid_buf, DECONV_WINOGRAD_DEFAULT_UNIT, unit_size,
@@ -188,7 +188,7 @@ void DeConvWgCalWgFp16(float16_t *tile_in, float16_t *tile_out, float16_t *weigh
   for (int index = 0; index < winograd_plane; index++) {
     float16_t *src = trans_a_buf + index * DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->ic_up4_;
     float16_t *dst = tmp_buf + index * deconv_param->oc_up4_ * DECONV_WINOGRAD_DEFAULT_TILE;
-    float16_t *weight = weight_buf + index * deconv_param->ic_up4_ * deconv_param->oc_up4_;
+    const float16_t *weight = weight_buf + index * deconv_param->ic_up4_ * deconv_param->oc_up4_;
     TiledC4MatmulFp16(dst, src, weight, DECONV_WINOGRAD_DEFAULT_TILE * C4NUM, deconv_param->ic_div4_,
                       deconv_param->oc_div4_);
   }
@@ -213,15 +213,16 @@ void DeConvWgCalWgFp16(float16_t *tile_in, float16_t *tile_out, float16_t *weigh
   return;
 }
 
-void DeConvWgCalCommFp16(float16_t *tile_in, float16_t *tile_out, float16_t *weight, float16_t *tmp_buf, int h_start,
-                         int w_start, int h_size, int w_size, ConvParameter *conv_param, DeConvParam *deconv_param) {
+void DeConvWgCalCommFp16(const float16_t *tile_in, float16_t *tile_out, const float16_t *weight, float16_t *tmp_buf,
+                         int h_start, int w_start, int h_size, int w_size, const ConvParameter *conv_param,
+                         const DeConvParam *deconv_param) {
   int count = deconv_param->oc_div4_ * w_size * h_size;
   int in_stride = DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->ic_up4_;
   int out_stride = DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->oc_up4_;
 
   for (int hi = 0; hi < DECONV_WINOGRAD_DEFAULT_UNIT; hi++) {
     for (int wi = 0; wi < DECONV_WINOGRAD_DEFAULT_UNIT; wi++) {
-      float16_t *src_in = tile_in + (wi + hi * DECONV_WINOGRAD_DEFAULT_UNIT) * in_stride;
+      const float16_t *src_in = tile_in + (wi + hi * DECONV_WINOGRAD_DEFAULT_UNIT) * in_stride;
       TiledC4MatmulFp16(tmp_buf, src_in, weight, DECONV_WINOGRAD_DEFAULT_TILE * 4, deconv_param->ic_div4_, count);
 
       for (int uhi = 0; uhi < h_size; uhi++) {
@@ -238,8 +239,8 @@ void DeConvWgCalCommFp16(float16_t *tile_in, float16_t *tile_out, float16_t *wei
   return;
 }
 
-int PackDeConvWgDataFp16(float16_t *nhwc_weight, DeConvComputeUnit *unit, ConvParameter *conv_param,
-                         DeConvParam *deconv_param) {
+int PackDeConvWgDataFp16(const float16_t *nhwc_weight, DeConvComputeUnit *unit, const ConvParameter *conv_param,
+                         const DeConvParam *deconv_param) {
   int tmp_kernel_plane = unit->w_size_ * unit->h_size_;
   int output_channel = conv_param->output_channel_;
   int size = conv_param->input_channel_ * output_channel * tmp_kernel_plane;
@@ -248,13 +249,13 @@ int PackDeConvWgDataFp16(float16_t *nhwc_weight, DeConvComputeUnit *unit, ConvPa
     return NNACL_NULL_PTR;
   }
   for (int ic = 0; ic < conv_param->input_channel_; ic++) {
-    float16_t *src_ic = nhwc_weight + deconv_param->kernel_plane_ * output_channel * ic;
+    const float16_t *src_ic = nhwc_weight + deconv_param->kernel_plane_ * output_channel * ic;
     float16_t *dst_ic = current_unit_weight + tmp_kernel_plane * output_channel * ic;
     for (int uhi = 0; uhi < unit->h_size_; uhi++) {
       for (int uwi = 0; uwi < unit->w_size_; uwi++) {
         int src_h_offset = unit->h_start_ + uhi * conv_param->stride_h_;
         int src_w_offset = unit->w_start_ + uwi * conv_param->stride_w_;
-        float16_t *src_hw = src_ic + (src_h_offset * conv_param->kernel_w_ + src_w_offset) * output_channel;
+        const float16_t *src_hw = src_ic + (src_h_offset * conv_param->kernel_w_ + src_w_offset) * output_channel;
         float16_t *dst_hw = dst_ic + (uhi * unit->w_size_ + uwi) * output_channel;
         memcpy(dst_hw, src_hw, output_channel * sizeof(float16_t));
       }
@@ -340,8 +341,8 @@ int PackDeConvWgDataFp16(float16_t *nhwc_weight, DeConvComputeUnit *unit, ConvPa
   return NNACL_OK;
 }
 
-void DeconvWgFp16(float16_t *nhwc_input_, float16_t *tile_in, float16_t *tile_out, int start_index, int calculate_count,
-                  ConvParameter *conv_param, DeConvParam *deconv_param, int task_id) {
+void DeconvWgFp16(const float16_t *nhwc_input_, float16_t *tile_in, float16_t *tile_out, int start_index,
+                  int calculate_count, const ConvParameter *conv_param, DeConvParam *deconv_param, int task_id) {
   /* pack tile input */
   int tile_in_unit_stride = deconv_param->ic_up4_ * DECONV_WINOGRAD_DEFAULT_TILE;
   float16x4_t zero = vdup_n_f16(0.0f);
@@ -366,7 +367,7 @@ void DeconvWgFp16(float16_t *nhwc_input_, float16_t *tile_in, float16_t *tile_ou
           continue;
         }
 
-        float16_t *src = nhwc_input_ + (w_index + h_index * conv_param->input_w_) * conv_param->input_channel_;
+        const float16_t *src = nhwc_input_ + (w_index + h_index * conv_param->input_w_) * conv_param->input_channel_;
         DeConvWgInputPackFp16(src, dst, conv_param->input_channel_, DECONV_WINOGRAD_DEFAULT_TILE * C4NUM);
       }
     }
@@ -402,8 +403,8 @@ void DeconvWgFp16(float16_t *nhwc_input_, float16_t *tile_in, float16_t *tile_ou
   return;
 }
 
-void DeconvWgPostFp16(float16_t *tile_out, float16_t *nc4hw4_output, ConvParameter *conv_param,
-                      DeConvParam *deconv_param, int calculate_count, int tile_index) {
+void DeconvWgPostFp16(const float16_t *tile_out, float16_t *nc4hw4_output, const ConvParameter *conv_param,
+                      const DeConvParam *deconv_param, int calculate_count, int tile_index) {
   /* merge */
   int src_unit_stride = deconv_param->oc_up4_ * DECONV_WINOGRAD_DEFAULT_TILE;
 
@@ -411,7 +412,7 @@ void DeconvWgPostFp16(float16_t *tile_out, float16_t *nc4hw4_output, ConvParamet
   int dst_stride = conv_param->output_w_ * conv_param->output_h_ * C4NUM;
 
   for (int index = 0; index < calculate_count; ++index) {
-    float16_t *src_start = tile_out + index * C4NUM;
+    const float16_t *src_start = tile_out + index * C4NUM;
 
     int plane_index = tile_index * DECONV_WINOGRAD_DEFAULT_TILE + index;
     int w_unit_index = plane_index % deconv_param->in_tile_w_count_;
@@ -427,7 +428,7 @@ void DeconvWgPostFp16(float16_t *tile_out, float16_t *nc4hw4_output, ConvParamet
 
     for (int hi = merge_h_start; hi < merge_h_end; hi++) {
       for (int wi = merge_w_start; wi < merge_w_end; wi++) {
-        float16_t *src = src_start + (hi * deconv_param->out_tile_w_ + wi) * src_unit_stride;
+        const float16_t *src = src_start + (hi * deconv_param->out_tile_w_ + wi) * src_unit_stride;
         float16_t *dst = dst_start + (hi * conv_param->output_w_ + wi) * C4NUM;
         DeConvWgMergeFp16(src, dst, src_stride, dst_stride, deconv_param->oc_div4_);
       }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/deconv_winograd_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/deconv_winograd_fp16.h
index cfe9a40e5a8..96d631c148b 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/deconv_winograd_fp16.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/deconv_winograd_fp16.h
@@ -23,14 +23,14 @@
 extern "C" {
 #endif
 
-int PackDeConvWgDataFp16(float16_t *nhwc_weight, DeConvComputeUnit *unit, ConvParameter *conv_param,
-                         DeConvParam *deconv_param);
+int PackDeConvWgDataFp16(const float16_t *nhwc_weight, DeConvComputeUnit *unit, const ConvParameter *conv_param,
+                         const DeConvParam *deconv_param);
 
-void DeconvWgFp16(float16_t *nhwc_input_, float16_t *tile_in, float16_t *tile_out, int start_index, int calculate_count,
-                  ConvParameter *conv_param, DeConvParam *deconv_param, int task_id);
+void DeconvWgFp16(const float16_t *nhwc_input_, float16_t *tile_in, float16_t *tile_out, int start_index,
+                  int calculate_count, const ConvParameter *conv_param, DeConvParam *deconv_param, int task_id);
 
-void DeconvWgPostFp16(float16_t *tile_out, float16_t *nc4hw4_output, ConvParameter *conv_param,
-                      DeConvParam *deconv_param, int calculate_count, int tile_index);
+void DeconvWgPostFp16(const float16_t *tile_out, float16_t *nc4hw4_output, const ConvParameter *conv_param,
+                      const DeConvParam *deconv_param, int calculate_count, int tile_index);
 
 void TiledC4MatmulFp16(float16_t *dst, const float16_t *src, const float16_t *weight, size_t ic4, size_t cal_num,
                        size_t oc4);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/instance_norm_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/instance_norm_fp16.c
index 00a8ff0be6c..c4329a21577 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/instance_norm_fp16.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/instance_norm_fp16.c
@@ -20,24 +20,26 @@
 
 int InstanceNormFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *gamma_data,
                      const float16_t *beta_data, const InstanceNormParameter *param, size_t task_id) {
-  if (src_data == NULL || dst_data == NULL) {
-    return NNACL_NULL_PTR;
-  }
-  int channel_step = UP_DIV(param->channel_, param->op_parameter_.thread_num_);
+  NNACL_CHECK_NULL_RETURN_ERR(src_data);
+  NNACL_CHECK_NULL_RETURN_ERR(dst_data);
+  NNACL_CHECK_NULL_RETURN_ERR(param->op_parameter_.thread_num_);
+  int channel = param->channel_;
+  int hw_plane = param->inner_size_;
+  int channel_step = UP_DIV(channel, param->op_parameter_.thread_num_);
   int channel_begin = task_id * channel_step;
-  int channel_end = MSMIN(channel_begin + channel_step, param->channel_);
+  int channel_end = MSMIN(channel_begin + channel_step, channel);
 
   for (int b = 0; b < param->batch_; b++) {
-    const float16_t *src_b = src_data + b * param->channel_ * param->inner_size_;
-    float16_t *dst_b = dst_data + b * param->channel_ * param->inner_size_;
+    const float16_t *src_b = src_data + b * channel * hw_plane;
+    float16_t *dst_b = dst_data + b * channel * hw_plane;
     for (int c = channel_begin; c < channel_end; c++) {
-      const float16_t *src = src_b + c * param->inner_size_;
-      float16_t *dst = dst_b + c * param->inner_size_;
+      const float16_t *src = src_b + c * hw_plane;
+      float16_t *dst = dst_b + c * hw_plane;
       float mean = 0.0f;
       float square_mean = 0.0f;
 
       int index = 0;
-      for (; index <= param->inner_size_ - C8NUM; index += C8NUM) {
+      for (; index <= hw_plane - C8NUM; index += C8NUM) {
         float16x8_t srcv = vld1q_f16(src + index);
         float16x8_t squarev = vmulq_f16(srcv, srcv);
 
@@ -49,19 +51,19 @@ int InstanceNormFp16(const float16_t *src_data, float16_t *dst_data, const float
         float32x4_t square_f32 = vcvt_f32_f16(square2);
         square_mean += MS_ADDVQ_F32(square_f32);
       }
-      for (; index < param->inner_size_; index++) {
+      for (; index < hw_plane; index++) {
         mean += src[index];
         square_mean += src[index] * src[index];
       }
 
-      mean /= (float)param->inner_size_;
-      square_mean /= (float)param->inner_size_;
+      mean /= (float)hw_plane;
+      square_mean /= (float)hw_plane;
       const float deno = 1 / sqrtf(square_mean - mean * mean + param->epsilon_);
 
       index = 0;
       float16x8_t meanv = vdupq_n_f16(mean);
       float16x8_t denov = vdupq_n_f16(deno);
-      for (; index <= param->inner_size_ - C8NUM; index += C8NUM) {
+      for (; index <= hw_plane - C8NUM; index += C8NUM) {
         float16x8_t srcv = vld1q_f16(src + index);
         float16x8_t outv = vsubq_f16(srcv, meanv);
         outv = vmulq_f16(outv, denov);
@@ -72,7 +74,7 @@ int InstanceNormFp16(const float16_t *src_data, float16_t *dst_data, const float
         outv = vaddq_f16(outv, betav);
         vst1q_f16(dst + index, outv);
       }
-      for (; index < param->inner_size_; index++) {
+      for (; index < hw_plane; index++) {
         dst[index] = (src[index] - mean) * deno;
         dst[index] = dst[index] * gamma_data[c] + beta_data[c];
       }
@@ -80,3 +82,75 @@ int InstanceNormFp16(const float16_t *src_data, float16_t *dst_data, const float
   }
   return NNACL_OK;
 }
+
+int InstanceNormNC8HW8Fp16(const float16_t *src_data, float16_t *dst_data, const float16_t *gamma_data,
+                           const float16_t *beta_data, const InstanceNormParameter *param, size_t task_id) {
+  NNACL_CHECK_NULL_RETURN_ERR(src_data);
+  NNACL_CHECK_NULL_RETURN_ERR(dst_data);
+  NNACL_CHECK_NULL_RETURN_ERR(param->op_parameter_.thread_num_);
+  int channel = param->channel_;
+  int hw_plane = param->inner_size_;
+  int channel_step = UP_DIV(UP_DIV(channel, C8NUM), param->op_parameter_.thread_num_) * C8NUM;
+  int channel_begin = (int)(task_id)*channel_step;
+  int channel_end = MSMIN(channel_begin + channel_step, channel);
+  int c8_down = channel_end / C8NUM * C8NUM;
+  int c_res = channel_end - c8_down;
+  float32x4_t hw_plane_4 = vdupq_n_f32(hw_plane);
+  for (int b = 0; b < param->batch_; b++) {
+    const float16_t *src_b = src_data + b * channel * hw_plane;
+    float16_t *dst_b = dst_data + b * channel * hw_plane;
+    int c = channel_begin;
+    for (; c < c8_down; c += C8NUM) {
+      const float16_t *src = src_b + c * hw_plane;
+      float16_t *dst = dst_b + c;
+      float32x4_t mean1 = vdupq_n_f32(0.0f);
+      float32x4_t mean2 = vdupq_n_f32(0.0f);
+      float32x4_t square_mean1 = vdupq_n_f32(0.0f);
+      float32x4_t square_mean2 = vdupq_n_f32(0.0f);
+      for (int index = 0; index < hw_plane; ++index) {
+        float16x8_t srcv = vld1q_f16(src + index * C8NUM);
+        float32x4_t srcv1 = vcvt_f32_f16(vget_low_f16(srcv));
+        float32x4_t srcv2 = vcvt_f32_f16(vget_high_f16(srcv));
+        mean1 = vaddq_f32(mean1, srcv1);
+        mean2 = vaddq_f32(mean2, srcv2);
+        square_mean1 = vaddq_f32(square_mean1, vmulq_f32(srcv1, srcv1));
+        square_mean2 = vaddq_f32(square_mean2, vmulq_f32(srcv2, srcv2));
+      }
+      float16x8_t mean =
+        vcombine_f16(vcvt_f16_f32(MS_DIVQ_F32(mean1, hw_plane_4)), vcvt_f16_f32(MS_DIVQ_F32(mean2, hw_plane_4)));
+      float16x8_t square_mean = vcombine_f16(vcvt_f16_f32(MS_DIVQ_F32(square_mean1, hw_plane_4)),
+                                             vcvt_f16_f32(MS_DIVQ_F32(square_mean2, hw_plane_4)));
+      float16x8_t deno =
+        vaddq_f16(vsubq_f16(square_mean, vmulq_f16(mean, mean)), vdupq_n_f16(param->epsilon_));  // question
+      deno = 1 / MS_SQRTFX8_F16(deno);                                                           // question
+
+      float16x8_t gammav = vmulq_f16(vld1q_f16(gamma_data + c), deno);  // deno * gamma_data[c]
+      float16x8_t betav = vld1q_f16(beta_data + c);
+      for (int index = 0; index < hw_plane; ++index) {
+        float16x8_t srcv = vld1q_f16(src + index * C8NUM);
+        float16x8_t outv = vsubq_f16(srcv, mean);
+        outv = vmulq_f16(outv, gammav);
+        outv = vaddq_f16(outv, betav);
+        vst1q_f16(dst + index * channel, outv);
+      }
+    }
+    for (; c < channel_end; ++c) {
+      const float16_t *src = src_b + c8_down * hw_plane + c;
+      float16_t *dst = dst_b + c;
+      float mean = 0.0f;
+      float square_mean = 0.0f;
+      for (int index = 0; index < hw_plane; ++index) {
+        float16_t tmp = src[index * c_res];
+        mean += tmp;
+        square_mean += tmp * tmp;
+      }
+      mean /= (float)hw_plane;
+      square_mean /= (float)hw_plane;
+      const float deno = gamma_data[c] / sqrtf(square_mean - mean * mean + param->epsilon_);
+      for (int index = 0; index < hw_plane; ++index) {
+        dst[index * channel] = (src[index * c_res] - mean) * deno + beta_data[c];
+      }
+    }
+  }
+  return NNACL_OK;
+}
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/instance_norm_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/instance_norm_fp16.h
index 5b743f2d74e..92ded955a69 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/instance_norm_fp16.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/instance_norm_fp16.h
@@ -23,6 +23,8 @@ extern "C" {
 
 int InstanceNormFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *gamma_data,
                      const float16_t *beta_data, const InstanceNormParameter *param, size_t task_id);
+int InstanceNormNC8HW8Fp16(const float16_t *src_data, float16_t *dst_data, const float16_t *gamma_data,
+                           const float16_t *beta_data, const InstanceNormParameter *param, size_t task_id);
 #ifdef __cplusplus
 }
 #endif
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/log_softmax_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/log_softmax_fp16.c
index c75362a331e..55a1050129c 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/log_softmax_fp16.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/log_softmax_fp16.c
@@ -46,10 +46,10 @@ void LogSoftmaxLastAxisFp16(const float16_t *src, float16_t *dst, float16_t *exp
 
 // output = (input - reduce_max(input, axis)) - log(reduce_sum(exp(input - reduce_max(input, axis)), axis))
 void LogSoftmaxFp16(const float16_t *input_ptr, float16_t *output_ptr, float16_t *sum_data,
-                    SoftmaxParameter *parameter) {
+                    const SoftmaxParameter *parameter) {
   int axis = parameter->axis_;
   int n_dim = parameter->n_dim_;
-  int *input_shape = parameter->input_shape_;
+  const int *input_shape = parameter->input_shape_;
   int inner_size = 1;
   int outter_size = 1;
 
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/log_softmax_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/log_softmax_fp16.h
index 5485ca7f6a8..14cd0346550 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/log_softmax_fp16.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/log_softmax_fp16.h
@@ -27,7 +27,7 @@ extern "C" {
 #endif
 void LogSoftmaxLastAxisFp16(const float16_t *src, float16_t *dst, float16_t *exp_data, int batch, int channel);
 void LogSoftmaxFp16(const float16_t *input_ptr, float16_t *output_ptr, float16_t *sum_data,
-                    SoftmaxParameter *parameter);
+                    const SoftmaxParameter *parameter);
 #ifdef __cplusplus
 }
 #endif
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/lstm_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/lstm_fp16.c
index 30f122f0cca..630726d08d5 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/lstm_fp16.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/lstm_fp16.c
@@ -131,7 +131,7 @@ int ElementOptMulAccFp16(const float16_t *input0, const float16_t input1, float1
   return NNACL_OK;
 }
 
-void UpdataStateFp16(float16_t *cell_state, float16_t *forget_gate, const float16_t *input_gate,
+void UpdataStateFp16(float16_t *cell_state, const float16_t *forget_gate, const float16_t *input_gate,
                      const float16_t *cell_gate, float16_t *state_buffer, int batch, int hidden_size,
                      float16_t zoneout) {
   if (!(zoneout >= -FLT_EPSILON && zoneout <= FLT_EPSILON)) {  // zoneout * old_cell_state
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pack_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pack_fp16.c
index 4e1e12ca110..be0438c7fdf 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pack_fp16.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pack_fp16.c
@@ -37,8 +37,8 @@ void PackWeightConvDw3x3Fp16(const void *src, void *dst, int channel) {
 }
 #endif
 
-void Im2ColPackUnitFp16(float16_t *input_data, ConvParameter *conv_param, float16_t *packed_input, int real_cal_num,
-                        int block_index) {
+void Im2ColPackUnitFp16(const float16_t *input_data, const ConvParameter *conv_param, float16_t *packed_input,
+                        int real_cal_num, int block_index) {
   // input format : nhwc
   int kernel_h = conv_param->kernel_h_;
   int kernel_w = conv_param->kernel_w_;
@@ -92,7 +92,8 @@ void PackHWCToWHCFp16(const float16_t *src, float16_t *dst, int height, int widt
   }
 }
 
-void PackWeightToC8Fp16(const float16_t *origin_weight_data, float16_t *packed_weight_data, ConvParameter *conv_param) {
+void PackWeightToC8Fp16(const float16_t *origin_weight_data, float16_t *packed_weight_data,
+                        const ConvParameter *conv_param) {
   // origin weight format : ohwi
   int input_channel = conv_param->input_channel_;
   int ic8 = UP_DIV(input_channel, C8NUM);
@@ -116,7 +117,8 @@ void PackWeightToC8Fp16(const float16_t *origin_weight_data, float16_t *packed_w
   }
 }
 
-void PackWeightToC4Fp16(const float16_t *origin_weight_data, float16_t *packed_weight_data, ConvParameter *conv_param) {
+void PackWeightToC4Fp16(const float16_t *origin_weight_data, float16_t *packed_weight_data,
+                        const ConvParameter *conv_param) {
   // origin weight format : ohwi
   int input_channel = conv_param->input_channel_;
   int ic8 = UP_DIV(input_channel, C8NUM);
@@ -395,7 +397,7 @@ void PackNC4HW4ToNCHWFp16(const void *src, void *dst, int batch, int plane, int
   }
 }
 
-void PackNCHWFp32ToNC8HW8Fp16(float *src, float16_t *dst, int batch, int plane, int channel) {
+void PackNCHWFp32ToNC8HW8Fp16(const float *src, float16_t *dst, int batch, int plane, int channel) {
   int c8 = UP_DIV(channel, C8NUM);
   for (int b = 0; b < batch; b++) {
     int src_offset = b * plane * channel;
@@ -414,7 +416,7 @@ void PackNCHWFp32ToNC8HW8Fp16(float *src, float16_t *dst, int batch, int plane,
   }
 }
 
-void PackNCHWFp16ToNC8HW8Fp16(float16_t *src, float16_t *dst, int batch, int plane, int channel) {
+void PackNCHWFp16ToNC8HW8Fp16(const float16_t *src, float16_t *dst, int batch, int plane, int channel) {
   int c8 = UP_DIV(channel, C8NUM);
   for (int b = 0; b < batch; b++) {
     int src_offset = b * plane * channel;
@@ -433,14 +435,31 @@ void PackNCHWFp16ToNC8HW8Fp16(float16_t *src, float16_t *dst, int batch, int pla
   }
 }
 
-void PackNHWCFp32ToNHWC8Fp16(float *src, float16_t *dst, int batch, int plane, int channel) {
+#ifdef ENABLE_DEBUG
+void PackNC8HW8ToNHWCFp16(const float16_t *src, float16_t *dst, int batch, int plane, int channel) {
+  int block = UP_DIV(channel, C8NUM);
+  int last_block_idx = block - 1;
+  int last_src_col = channel - last_block_idx * C8NUM;
+  for (size_t i = 0; i < block; i++) {
+    size_t src_col = (i != last_block_idx) ? C8NUM : last_src_col;
+    float16_t *dst_cur = dst + i * C8NUM;
+    for (size_t j = 0; j < plane; j++) {
+      memcpy(dst_cur, src, src_col * sizeof(float16_t));
+      src += src_col;
+      dst_cur += channel;
+    }
+  }
+}
+#endif
+
+void PackNHWCFp32ToNHWC8Fp16(const float *src, float16_t *dst, int batch, int plane, int channel) {
   int c8_channel = UP_DIV(channel, C8NUM) * C8NUM;
   for (int b = 0; b < batch; b++) {
     float16_t *dst_batch = dst + b * plane * c8_channel;
-    float *src_batch = src + b * plane * channel;
+    const float *src_batch = src + b * plane * channel;
     for (int i = 0; i < plane; i++) {
       float16_t *dst_plane = dst_batch + i * c8_channel;
-      float *src_plane = src_batch + i * channel;
+      const float *src_plane = src_batch + i * channel;
       for (int c = 0; c < channel; c++) {
         dst_plane[c] = (float16_t)(src_plane[c]);
       }
@@ -448,7 +467,7 @@ void PackNHWCFp32ToNHWC8Fp16(float *src, float16_t *dst, int batch, int plane, i
   }
 }
 
-void PackNHWCFp32ToC8HWN8Fp16(float *src, float16_t *dst, int batch, int plane, int channel) {
+void PackNHWCFp32ToC8HWN8Fp16(const float *src, float16_t *dst, int batch, int plane, int channel) {
   for (int n = 0; n < batch; n++) {
     for (int hw = 0; hw < plane; hw++) {
       for (int c = 0; c < channel; c++) {
@@ -463,7 +482,7 @@ void PackNHWCFp32ToC8HWN8Fp16(float *src, float16_t *dst, int batch, int plane,
   return;
 }
 
-void PackNHWCFp16ToC8HWN8Fp16(float16_t *src, float16_t *dst, int batch, int plane, int channel) {
+void PackNHWCFp16ToC8HWN8Fp16(const float16_t *src, float16_t *dst, int batch, int plane, int channel) {
   for (int n = 0; n < batch; n++) {
     for (int hw = 0; hw < plane; hw++) {
       for (int c = 0; c < channel; c++) {
@@ -478,13 +497,13 @@ void PackNHWCFp16ToC8HWN8Fp16(float16_t *src, float16_t *dst, int batch, int pla
   return;
 }
 
-void PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, int channel) {
+void PackNHWC8Fp16ToNHWCFp32(const float16_t *src, float *dst, int batch, int plane, int channel) {
   int c8_channel = UP_DIV(channel, C8NUM) * C8NUM;
   for (int b = 0; b < batch; b++) {
-    float16_t *src_batch = src + b * plane * c8_channel;
+    const float16_t *src_batch = src + b * plane * c8_channel;
     float *dst_batch = dst + b * plane * channel;
     for (int i = 0; i < plane; i++) {
-      float16_t *src_plane = src_batch + i * c8_channel;
+      const float16_t *src_plane = src_batch + i * c8_channel;
       float *dst_plane = dst_batch + i * channel;
       for (int c = 0; c < channel; c++) {
         dst_plane[c] = (float16_t)(src_plane[c]);
@@ -493,13 +512,13 @@ void PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, i
   }
 }
 
-void PackNHWC8ToNHWCFp16(float16_t *src, float16_t *dst, int batch, int plane, int channel) {
+void PackNHWC8ToNHWCFp16(const float16_t *src, float16_t *dst, int batch, int plane, int channel) {
   int c8_channel = UP_DIV(channel, C8NUM) * C8NUM;
   for (int b = 0; b < batch; b++) {
-    float16_t *src_batch = src + b * plane * c8_channel;
+    const float16_t *src_batch = src + b * plane * c8_channel;
     float16_t *dst_batch = dst + b * plane * channel;
     for (int i = 0; i < plane; i++) {
-      float16_t *src_plane = src_batch + i * c8_channel;
+      const float16_t *src_plane = src_batch + i * c8_channel;
       float16_t *dst_plane = dst_batch + i * channel;
       memcpy(dst_plane, src_plane, channel * sizeof(float16_t));
     }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pack_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pack_fp16.h
index 7d23a40701f..ed699135289 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pack_fp16.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pack_fp16.h
@@ -24,14 +24,16 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-void Im2ColPackUnitFp16(float16_t *input_data, ConvParameter *conv_param, float16_t *packed_input, int real_cal_num,
-                        int block_index);
+void Im2ColPackUnitFp16(const float16_t *input_data, const ConvParameter *conv_param, float16_t *packed_input,
+                        int real_cal_num, int block_index);
 
-void PackWeightToC8Fp16(const float16_t *origin_weight_data, float16_t *packed_weight_data, ConvParameter *conv_param);
+void PackWeightToC8Fp16(const float16_t *origin_weight_data, float16_t *packed_weight_data,
+                        const ConvParameter *conv_param);
 
 void PackHWCToWHCFp16(const float16_t *src, float16_t *dst, int height, int width, int channel);
 
-void PackWeightToC4Fp16(const float16_t *origin_weight_data, float16_t *packed_weight_data, ConvParameter *conv_param);
+void PackWeightToC4Fp16(const float16_t *origin_weight_data, float16_t *packed_weight_data,
+                        const ConvParameter *conv_param);
 
 void PackNHWCToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int channel);
 
@@ -55,21 +57,21 @@ void PackNC4HW4ToNHWCFp16(const void *src, void *dst, int batch, int plane, int
 
 void PackNC4HW4ToNCHWFp16(const void *src, void *dst, int batch, int plane, int channel);
 
-void PackNC8HW8ToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel);
+void PackNCHWFp32ToNC8HW8Fp16(const float *src, float16_t *dst, int batch, int plane, int channel);
 
-void PackNCHWFp32ToNC8HW8Fp16(float *src, float16_t *dst, int batch, int plane, int channel);
+void PackNCHWFp16ToNC8HW8Fp16(const float16_t *src, float16_t *dst, int batch, int plane, int channel);
 
-void PackNCHWFp16ToNC8HW8Fp16(float16_t *src, float16_t *dst, int batch, int plane, int channel);
+void PackNC8HW8ToNHWCFp16(const float16_t *src, float16_t *dst, int batch, int plane, int channel);
 
-void PackNHWCFp32ToNHWC8Fp16(float *src, float16_t *dst, int batch, int plane, int channel);
+void PackNHWCFp32ToNHWC8Fp16(const float *src, float16_t *dst, int batch, int plane, int channel);
 
-void PackNHWCFp32ToC8HWN8Fp16(float *src, float16_t *dst, int batch, int plane, int channel);
+void PackNHWCFp32ToC8HWN8Fp16(const float *src, float16_t *dst, int batch, int plane, int channel);
 
-void PackNHWCFp16ToC8HWN8Fp16(float16_t *src, float16_t *dst, int batch, int plane, int channel);
+void PackNHWCFp16ToC8HWN8Fp16(const float16_t *src, float16_t *dst, int batch, int plane, int channel);
 
-void PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, int channel);
+void PackNHWC8Fp16ToNHWCFp32(const float16_t *src, float *dst, int batch, int plane, int channel);
 
-void PackNHWC8ToNHWCFp16(float16_t *src, float16_t *dst, int batch, int plane, int channel);
+void PackNHWC8ToNHWCFp16(const float16_t *src, float16_t *dst, int batch, int plane, int channel);
 
 #ifdef ENABLE_ARM82_A32
 void Transpose8x8A32Fp16(const float16_t *src, float16_t *dst, size_t src_stride, size_t dst_stride);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pad_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pad_fp16.c
index e0d69be8409..1edd751469a 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pad_fp16.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pad_fp16.c
@@ -18,7 +18,7 @@
 #include "nnacl/common_func.h"
 
 void PadFp16(const float16_t *input_data, float16_t *output_data, const int *input_shape, const int *output_shape,
-             const int *paddings, const int tid, const int thread_num) {
+             const int *paddings, int tid, int thread_num) {
   int in[DEFAULT_PAD_NDIMS], out[DEFAULT_PAD_NDIMS];
   for (in[0] = 0; in[0] < input_shape[0]; in[0]++) {
     out[0] = in[0] + paddings[0];
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pad_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pad_fp16.h
index e41db9528d6..5725aed83fb 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pad_fp16.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pad_fp16.h
@@ -22,7 +22,7 @@
 extern "C" {
 #endif
 void PadFp16(const float16_t *input_data, float16_t *output_data, const int *input_shape, const int *output_shape,
-             const int *paddings, const int tid, const int thread_num);
+             const int *paddings, int tid, int thread_num);
 void MirrorPadFp16(const float16_t *input_data, float16_t *output_data, const int *input_shape,
                    const PadParameter *pad_param, int begin, int end);
 #ifdef __cplusplus
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pooling_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pooling_fp16.c
index 5c831ac3ff3..d9b4921c5fb 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pooling_fp16.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pooling_fp16.c
@@ -17,8 +17,8 @@
 #include <float.h>
 #include "nnacl/errorcode.h"
 
-int AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id,
-                   float16_t min, float16_t max) {
+int AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, const PoolingParameter *pooling_param,
+                   int task_id, float16_t min, float16_t max) {
   int win_w = pooling_param->window_w_;
   int win_h = pooling_param->window_h_;
   int channel = pooling_param->input_channel_;
@@ -134,8 +134,8 @@ int AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingPar
   return NNACL_OK;
 }
 
-void MaxPoolingC8Fp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, float16_t min,
-                      float16_t max, int in_batch_offset, int out_plane_offset, int real_win_h_start,
+void MaxPoolingC8Fp16(const float16_t *input_ptr, float16_t *output_ptr, const PoolingParameter *pooling_param,
+                      float16_t min, float16_t max, int in_batch_offset, int out_plane_offset, int real_win_h_start,
                       int real_win_h_end, int real_win_w_start, int real_win_w_end, int in_h_index, int in_w_index) {
   int channel = pooling_param->input_channel_;
   int in_w = pooling_param->input_w_;
@@ -178,8 +178,8 @@ void MaxPoolingC8Fp16(const float16_t *input_ptr, float16_t *output_ptr, Pooling
   }  // c8 loop
 }
 
-void MaxPoolingC4Fp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, float16_t min,
-                      float16_t max, int in_batch_offset, int out_plane_offset, int real_win_h_start,
+void MaxPoolingC4Fp16(const float16_t *input_ptr, float16_t *output_ptr, const PoolingParameter *pooling_param,
+                      float16_t min, float16_t max, int in_batch_offset, int out_plane_offset, int real_win_h_start,
                       int real_win_h_end, int real_win_w_start, int real_win_w_end, int in_h_index, int in_w_index) {
   int channel = pooling_param->input_channel_;
   int in_w = pooling_param->input_w_;
@@ -224,8 +224,8 @@ void MaxPoolingC4Fp16(const float16_t *input_ptr, float16_t *output_ptr, Pooling
 #endif
   }  // c4 loop
 }
-void MaxPoolingC1Fp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, float16_t min,
-                      float16_t max, int in_batch_offset, int out_plane_offset, int real_win_h_start,
+void MaxPoolingC1Fp16(const float16_t *input_ptr, float16_t *output_ptr, const PoolingParameter *pooling_param,
+                      float16_t min, float16_t max, int in_batch_offset, int out_plane_offset, int real_win_h_start,
                       int real_win_h_end, int real_win_w_start, int real_win_w_end, int in_h_index, int in_w_index) {
   int channel = pooling_param->input_channel_;
   int in_w = pooling_param->input_w_;
@@ -249,8 +249,8 @@ void MaxPoolingC1Fp16(const float16_t *input_ptr, float16_t *output_ptr, Pooling
   }  // channel_res loop
 }
 
-void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id,
-                    float16_t min, float16_t max) {
+void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, const PoolingParameter *pooling_param,
+                    int task_id, float16_t min, float16_t max) {
   int stride_w = pooling_param->stride_w_;
   int stride_h = pooling_param->stride_h_;
   int pad_w = pooling_param->pad_l_;
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pooling_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pooling_fp16.h
index d20ca72457f..d671248d384 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pooling_fp16.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/pooling_fp16.h
@@ -23,11 +23,11 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-int AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id,
-                   float16_t min, float16_t max);
+int AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, const PoolingParameter *pooling_param,
+                   int task_id, float16_t min, float16_t max);
 
-void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id,
-                    float16_t min, float16_t max);
+void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, const PoolingParameter *pooling_param,
+                    int task_id, float16_t min, float16_t max);
 #ifdef __cplusplus
 }
 #endif
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/quant_dtype_cast_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/quant_dtype_cast_fp16.c
index d1c18e5bdf3..af7a76f59de 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/quant_dtype_cast_fp16.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/quant_dtype_cast_fp16.c
@@ -18,7 +18,7 @@
 #include "nnacl/fp16/quant_dtype_cast_fp16.h"
 #include "nnacl/errorcode.h"
 
-int DoDequantizeInt8ToFp16(int8_t *quant_values, float16_t *real_values, float scale, int32_t zp, int size) {
+int DoDequantizeInt8ToFp16(const int8_t *quant_values, float16_t *real_values, float scale, int32_t zp, int size) {
   if (quant_values == NULL || real_values == NULL) {
     return NNACL_PARAM_INVALID;
   }
@@ -29,7 +29,7 @@ int DoDequantizeInt8ToFp16(int8_t *quant_values, float16_t *real_values, float s
   return NNACL_OK;
 }
 
-int DoQuantizeFp16ToInt8(float16_t *real_values, int8_t *quant_values, float scale, int32_t zp, int size) {
+int DoQuantizeFp16ToInt8(const float16_t *real_values, int8_t *quant_values, float scale, int32_t zp, int size) {
   if (quant_values == NULL || real_values == NULL) {
     return NNACL_PARAM_INVALID;
   }
@@ -51,7 +51,7 @@ int DoQuantizeFp16ToInt8(float16_t *real_values, int8_t *quant_values, float sca
   return NNACL_OK;
 }
 
-int DoDequantizeUInt8ToFp16(uint8_t *quant_values, float16_t *real_values, float scale, int32_t zp, int size) {
+int DoDequantizeUInt8ToFp16(const uint8_t *quant_values, float16_t *real_values, float scale, int32_t zp, int size) {
   uint8_t zp_ = (uint8_t)zp;
   if (quant_values == NULL || real_values == NULL) {
     return NNACL_PARAM_INVALID;
@@ -63,7 +63,7 @@ int DoDequantizeUInt8ToFp16(uint8_t *quant_values, float16_t *real_values, float
   return NNACL_OK;
 }
 
-int DoQuantizeFp16ToUInt8(float16_t *real_values, uint8_t *quant_values, float scale, int32_t zp, int size) {
+int DoQuantizeFp16ToUInt8(const float16_t *real_values, uint8_t *quant_values, float scale, int32_t zp, int size) {
   if (quant_values == NULL || real_values == NULL) {
     return NNACL_PARAM_INVALID;
   }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/quant_dtype_cast_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/quant_dtype_cast_fp16.h
index f9a612526b4..08f9036cb4d 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/quant_dtype_cast_fp16.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/quant_dtype_cast_fp16.h
@@ -23,11 +23,11 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-int DoDequantizeInt8ToFp16(int8_t *quant_values, float16_t *real_values, float scale, int32_t zp, int size);
-int DoQuantizeFp16ToInt8(float16_t *real_values, int8_t *quant_values, float scale, int32_t zp, int size);
+int DoDequantizeInt8ToFp16(const int8_t *quant_values, float16_t *real_values, float scale, int32_t zp, int size);
+int DoQuantizeFp16ToInt8(const float16_t *real_values, int8_t *quant_values, float scale, int32_t zp, int size);
 
-int DoDequantizeUInt8ToFp16(uint8_t *quant_values, float16_t *real_values, float scale, int32_t zp, int size);
-int DoQuantizeFp16ToUInt8(float16_t *real_values, uint8_t *quant_values, float scale, int32_t zp, int size);
+int DoDequantizeUInt8ToFp16(const uint8_t *quant_values, float16_t *real_values, float scale, int32_t zp, int size);
+int DoQuantizeFp16ToUInt8(const float16_t *real_values, uint8_t *quant_values, float scale, int32_t zp, int size);
 #ifdef __cplusplus
 }
 #endif
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/reduce_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/reduce_fp16.c
index e77d040399f..3e163f89e39 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/reduce_fp16.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/reduce_fp16.c
@@ -18,11 +18,14 @@
 #include "nnacl/fp16/reduce_fp16.h"
 #include "nnacl/errorcode.h"
 
-int ReduceMeanFp16(const int outer_size, const int inner_size, const int axis_size, const float16_t *src_data,
-                   float16_t *dst_data, const int tid, const int thread_num) {
+int ReduceMeanFp16(int outer_size, int inner_size, int axis_size, const float16_t *src_data, float16_t *dst_data,
+                   int tid, int thread_num) {
   if (src_data == NULL || dst_data == NULL) {
     return NNACL_NULL_PTR;
   }
+  if (axis_size == 0) {
+    return NNACL_ERR;
+  }
   int i, j, k;
   for (j = tid; j < outer_size; j += thread_num) {
     const float16_t *outer_src = src_data + j * axis_size * inner_size;
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/reduce_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/reduce_fp16.h
index f11b6751f7e..0d6a99fc43c 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/reduce_fp16.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/reduce_fp16.h
@@ -22,8 +22,8 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-int ReduceMeanFp16(const int outer_size, const int inner_size, const int axis_size, const float16_t *src_data,
-                   float16_t *dst_data, const int tid, const int thread_num);
+int ReduceMeanFp16(int outer_size, int inner_size, int axis_size, const float16_t *src_data, float16_t *dst_data,
+                   int tid, int thread_num);
 int ReduceMaxFp16(int outer_size, int inner_size, int axis_size, const float16_t *src_data, float16_t *dst_data,
                   int tid, int thread_num);
 int ReduceSumFp16(int outer_size, int inner_size, int axis_size, const float16_t *src_data, float16_t *dst_data,
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/scale_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/scale_fp16.c
index 954540de6b1..e1b50c2303e 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/scale_fp16.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/scale_fp16.c
@@ -16,8 +16,8 @@
 
 #include "nnacl/fp16/scale_fp16.h"
 
-void Fp16ScaleInner(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start,
-                    int outer_end, int axis_size, int inner_size) {
+void Fp16ScaleInner(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
+                    int outer_start, int outer_end, int axis_size, int inner_size) {
   for (int out = outer_start; out < outer_end; out++) {
     int out_offset = out * axis_size * inner_size;
     for (int i = 0; i < axis_size; i++) {
@@ -42,8 +42,8 @@ void Fp16ScaleInner(float16_t *in_data, float16_t *out_data, float16_t *scale, f
   }
 }
 
-void Fp16ScaleAxis(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start,
-                   int outer_end, int axis_size) {
+void Fp16ScaleAxis(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
+                   int outer_start, int outer_end, int axis_size) {
   for (int out = outer_start; out < outer_end; out++) {
     int out_offset = out * axis_size;
     int index = 0;
@@ -64,8 +64,8 @@ void Fp16ScaleAxis(float16_t *in_data, float16_t *out_data, float16_t *scale, fl
   }
 }
 
-void DoScaleFp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id,
-                 ScaleParameter *scale_param) {
+void DoScaleFp16(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
+                 int task_id, const ScaleParameter *scale_param) {
   int outer_step = UP_DIV(scale_param->outer_size_, scale_param->op_parameter_.thread_num_);
   int outer_start = task_id * outer_step;
   int outer_end = MSMIN(outer_start + outer_step, scale_param->outer_size_);
@@ -78,8 +78,8 @@ void DoScaleFp16(float16_t *in_data, float16_t *out_data, float16_t *scale, floa
   }
 }
 
-void Fp16ScaleInnerRelu(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start,
-                        int outer_end, int axis_size, int inner_size) {
+void Fp16ScaleInnerRelu(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
+                        int outer_start, int outer_end, int axis_size, int inner_size) {
 #ifdef ENABLE_NEON
   float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
 #endif
@@ -108,8 +108,8 @@ void Fp16ScaleInnerRelu(float16_t *in_data, float16_t *out_data, float16_t *scal
   }
 }
 
-void Fp16ScaleAxisRelu(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start,
-                       int outer_end, int axis_size) {
+void Fp16ScaleAxisRelu(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
+                       int outer_start, int outer_end, int axis_size) {
 #ifdef ENABLE_NEON
   float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
 #endif
@@ -135,8 +135,8 @@ void Fp16ScaleAxisRelu(float16_t *in_data, float16_t *out_data, float16_t *scale
   }
 }
 
-void Fp16DoScaleRelu(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id,
-                     ScaleParameter *scale_param) {
+void Fp16DoScaleRelu(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
+                     int task_id, const ScaleParameter *scale_param) {
   int outer_step = UP_DIV(scale_param->outer_size_, scale_param->op_parameter_.thread_num_);
   int outer_start = task_id * outer_step;
   int outer_end = MSMIN(outer_start + outer_step, scale_param->outer_size_);
@@ -149,8 +149,8 @@ void Fp16DoScaleRelu(float16_t *in_data, float16_t *out_data, float16_t *scale,
   }
 }
 
-void Fp16ScaleInnerRelu6(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start,
-                         int outer_end, int axis_size, int inner_size) {
+void Fp16ScaleInnerRelu6(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
+                         int outer_start, int outer_end, int axis_size, int inner_size) {
 #ifdef ENABLE_NEON
   float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
   float16x8_t bounds = {6, 6, 6, 6, 6, 6, 6, 6};
@@ -180,8 +180,8 @@ void Fp16ScaleInnerRelu6(float16_t *in_data, float16_t *out_data, float16_t *sca
   }
 }
 
-void Fp16ScaleAxisRelu6(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start,
-                        int outer_end, int axis_size) {
+void Fp16ScaleAxisRelu6(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
+                        int outer_start, int outer_end, int axis_size) {
 #ifdef ENABLE_NEON
   float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
   float16x8_t bounds = {6, 6, 6, 6, 6, 6, 6, 6};
@@ -208,8 +208,8 @@ void Fp16ScaleAxisRelu6(float16_t *in_data, float16_t *out_data, float16_t *scal
   }
 }
 
-void DoScaleRelu6Fp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id,
-                      ScaleParameter *scale_param) {
+void DoScaleRelu6Fp16(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
+                      int task_id, const ScaleParameter *scale_param) {
   int outer_step = UP_DIV(scale_param->outer_size_, scale_param->op_parameter_.thread_num_);
   int outer_start = task_id * outer_step;
   int outer_end = MSMIN(outer_start + outer_step, scale_param->outer_size_);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/scale_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/scale_fp16.h
index 98208d3819e..638dfe8be1e 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/scale_fp16.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/scale_fp16.h
@@ -24,12 +24,12 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-void DoScaleFp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id,
-                 ScaleParameter *scale_param);
-void Fp16DoScaleRelu(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id,
-                     ScaleParameter *scale_param);
-void DoScaleRelu6Fp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id,
-                      ScaleParameter *scale_param);
+void DoScaleFp16(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
+                 int task_id, const ScaleParameter *scale_param);
+void Fp16DoScaleRelu(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
+                     int task_id, const ScaleParameter *scale_param);
+void DoScaleRelu6Fp16(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
+                      int task_id, const ScaleParameter *scale_param);
 #ifdef __cplusplus
 }
 #endif
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/softmax_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/softmax_fp16.c
index ee6432a1ff6..68daca53a66 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/softmax_fp16.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/softmax_fp16.c
@@ -91,10 +91,11 @@ void SoftmaxLastAxisFp16(const float16_t *src, float16_t *dst, int batch, int ch
 }
 
 // output = exp(input) / reduce_sum(exp(input), axis)
-void SoftmaxFp16(const float16_t *input_ptr, float16_t *output_ptr, float16_t *sum_data, SoftmaxParameter *parameter) {
+void SoftmaxFp16(const float16_t *input_ptr, float16_t *output_ptr, float16_t *sum_data,
+                 const SoftmaxParameter *parameter) {
   int axis = parameter->axis_;
   int n_dim = parameter->n_dim_;
-  int *input_shape = parameter->input_shape_;
+  const int *input_shape = parameter->input_shape_;
   int inner_size = 1;
   int outter_size = 1;
 
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/softmax_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/softmax_fp16.h
index 3de8e7133e7..8c522a08803 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/softmax_fp16.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/softmax_fp16.h
@@ -25,7 +25,8 @@
 extern "C" {
 #endif
 void SoftmaxNormFp16(const float16_t *src, float16_t *dst, int batch, int channel);
-void SoftmaxFp16(const float16_t *input_ptr, float16_t *output_ptr, float16_t *sum_data, SoftmaxParameter *parameter);
+void SoftmaxFp16(const float16_t *input_ptr, float16_t *output_ptr, float16_t *sum_data,
+                 const SoftmaxParameter *parameter);
 void SoftmaxLastAxisFp16(const float16_t *src, float16_t *dst, int batch, int channel);
 #ifdef __cplusplus
 }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/transpose_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/transpose_fp16.c
index efd8eed3dcc..39304518019 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/transpose_fp16.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/transpose_fp16.c
@@ -18,8 +18,8 @@
 #include <string.h>
 #include "nnacl/errorcode.h"
 
-void Fp16TransposeDim2(const float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
-                       const int *output_shape) {
+void Fp16TransposeDim2(const float16_t *in_data, float16_t *out_data, const int *strides, const int *out_strides,
+                       const int *perm, const int *output_shape) {
   const int stride0 = strides[perm[0]];
   const int stride1 = strides[perm[1]];
   const int output0 = output_shape[0];
@@ -33,8 +33,8 @@ void Fp16TransposeDim2(const float16_t *in_data, float16_t *out_data, int *strid
   }
 }
 
-void Fp16TransposeDim3(const float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
-                       const int *output_shape) {
+void Fp16TransposeDim3(const float16_t *in_data, float16_t *out_data, const int *strides, const int *out_strides,
+                       const int *perm, const int *output_shape) {
   const int stride0 = strides[perm[0]];
   const int stride1 = strides[perm[1]];
   const int stride2 = strides[perm[2]];
@@ -56,8 +56,8 @@ void Fp16TransposeDim3(const float16_t *in_data, float16_t *out_data, int *strid
   }
 }
 
-void Fp16TransposeDim4(const float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
-                       const int *output_shape) {
+void Fp16TransposeDim4(const float16_t *in_data, float16_t *out_data, const int *strides, const int *out_strides,
+                       const int *perm, const int *output_shape) {
   const int stride0 = strides[perm[0]];
   const int stride1 = strides[perm[1]];
   const int stride2 = strides[perm[2]];
@@ -88,8 +88,8 @@ void Fp16TransposeDim4(const float16_t *in_data, float16_t *out_data, int *strid
   }
 }
 
-void Fp16TransposeDim5(const float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
-                       const int *output_shape) {
+void Fp16TransposeDim5(const float16_t *in_data, float16_t *out_data, const int *strides, const int *out_strides,
+                       const int *perm, const int *output_shape) {
   const int stride0 = strides[perm[0]];
   const int stride1 = strides[perm[1]];
   const int stride2 = strides[perm[2]];
@@ -127,8 +127,8 @@ void Fp16TransposeDim5(const float16_t *in_data, float16_t *out_data, int *strid
   }
 }
 
-void Fp16TransposeDim6(const float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm,
-                       const int *output_shape) {
+void Fp16TransposeDim6(const float16_t *in_data, float16_t *out_data, const int *strides, const int *out_strides,
+                       const int *perm, const int *output_shape) {
   const int stride0 = strides[perm[0]];
   const int stride1 = strides[perm[1]];
   const int stride2 = strides[perm[2]];
@@ -174,10 +174,10 @@ void Fp16TransposeDim6(const float16_t *in_data, float16_t *out_data, int *strid
 }
 
 void TransposeDimsFp16(const float16_t *in_data, float16_t *out_data, const int *output_shape,
-                       TransposeParameter *param, int task_id, int thread_num) {
-  int *perm = param->perm_;
-  int *strides = param->strides_;
-  int *out_strides = param->out_strides_;
+                       const TransposeParameter *param, int task_id, int thread_num) {
+  const int *perm = param->perm_;
+  const int *strides = param->strides_;
+  const int *out_strides = param->out_strides_;
   int num_axes = param->num_axes_;
   size_t data_size = (*out_strides) * output_shape[0];
   size_t offset_size = UP_DIV(data_size, thread_num);
@@ -202,13 +202,14 @@ void TransposeDimsFp16(const float16_t *in_data, float16_t *out_data, const int
   }
 }
 
-int DoTransposeFp16(const float16_t *in_data, float16_t *out_data, const int *output_shape, TransposeParameter *param) {
+int DoTransposeFp16(const float16_t *in_data, float16_t *out_data, const int *output_shape,
+                    const TransposeParameter *param) {
   if (in_data == NULL || out_data == NULL) {
     return NNACL_ERR;
   }
-  int *perm = param->perm_;
-  int *strides = param->strides_;
-  int *out_strides = param->out_strides_;
+  const int *perm = param->perm_;
+  const int *strides = param->strides_;
+  const int *out_strides = param->out_strides_;
   int data_size = param->data_num_ * sizeof(float16_t);
   int num_axes = param->num_axes_;
 
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/transpose_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/transpose_fp16.h
index d9434510b6a..a4b10b7e988 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/transpose_fp16.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/transpose_fp16.h
@@ -25,8 +25,9 @@
 extern "C" {
 #endif
 void TransposeDimsFp16(const float16_t *in_data, float16_t *out_data, const int *output_shape,
-                       TransposeParameter *param, int task_id, int thread_num);
-int DoTransposeFp16(const float16_t *in_data, float16_t *out_data, const int *output_shape, TransposeParameter *param);
+                       const TransposeParameter *param, int task_id, int thread_num);
+int DoTransposeFp16(const float16_t *in_data, float16_t *out_data, const int *output_shape,
+                    const TransposeParameter *param);
 #ifdef __cplusplus
 }
 #endif
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/winograd_transform_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/winograd_transform_fp16.c
index b0aa3383ec9..6177749a14f 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/winograd_transform_fp16.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/winograd_transform_fp16.c
@@ -18,7 +18,7 @@
 
 // fp16 common winograd
 void WinogradInputTransformFp16(const float16_t *input_data, float16_t *trans_input, float16_t *tmp_data, int cal_num,
-                                int out_tile_index, int out_w_block_num, ConvParameter *conv_param,
+                                int out_tile_index, int out_w_block_num, const ConvParameter *conv_param,
                                 InputTransFp16Func func) {
 #ifdef ENABLE_ARM64
   const int tile_num = 16;
@@ -125,9 +125,9 @@ void WinogradInputTransformFp16(const float16_t *input_data, float16_t *trans_in
   }  // cal_tile_num loop
 }
 
-void WinogradOutputTransformFp16(const float16_t *gemm_out, float16_t *tmp_out_data, const float16_t *bias_data,
-                                 int cal_num, int out_tile_index, int output_unit_num, ConvParameter *conv_param,
-                                 OutputTransFp16Func func) {
+void WinogradOutputNHWCTransformFp16(const float16_t *gemm_out, float16_t *tmp_out_data, const float16_t *bias_data,
+                                     int cal_num, int out_tile_index, int output_unit_num,
+                                     const ConvParameter *conv_param, OutputTransFp16Func func) {
   int output_unit = conv_param->output_unit_;
   int output_w = conv_param->output_w_;
   int output_h = conv_param->output_h_;
@@ -166,9 +166,51 @@ void WinogradOutputTransformFp16(const float16_t *gemm_out, float16_t *tmp_out_d
   }
 }
 
-int WinogradWeightTransformFp16(const float16_t *weight_data, float16_t *winograd_data, float *matrix_g,
-                                float *matrix_gt, int oc_block, int input_unit, int kernel_unit, int filter_channel,
-                                int filter_batch, bool pack) {
+void WinogradOutputNC4HW4TransformFp16(const float16_t *gemm_out, float16_t *tmp_out_data, const float16_t *bias_data,
+                                       int cal_num, int out_tile_index, int output_unit_num,
+                                       const ConvParameter *conv_param, OutputTransFp16Func func) {
+  int output_unit = conv_param->output_unit_;
+  int output_w = conv_param->output_w_;
+  int output_h = conv_param->output_h_;
+  int plane = output_w * output_h;
+  int output_channel = conv_param->output_channel_;
+  int oc8 = UP_DIV(output_channel, C8NUM);
+  int input_unit = conv_param->input_unit_;
+  if (output_unit_num == 0) {
+    return;
+  }
+  for (int i = 0; i < cal_num; i++) {
+    int dst_x_s = out_tile_index % output_unit_num;
+    int dst_y_s = out_tile_index / output_unit_num;
+    int r_w = output_w - dst_x_s * output_unit;
+    r_w = r_w > output_unit ? output_unit : r_w;
+    int r_h = output_h - dst_y_s * output_unit;
+    r_h = r_h > output_unit ? output_unit : r_h;
+    int tmp_ix = dst_x_s * output_unit;
+    dst_x_s = tmp_ix > output_w ? output_w : tmp_ix;
+    int tmp_iy = dst_y_s * output_unit;
+    dst_y_s = tmp_iy > output_h ? output_h : tmp_iy;
+
+    int src_tile_offset = i * oc8 * C8NUM * input_unit * input_unit;
+    int dst_tile_offset = dst_x_s + dst_y_s * output_w;
+
+    for (int j = 0; j < oc8; j++) {
+      int r_c = output_channel - j * C8NUM;
+      r_c = r_c > C8NUM ? C8NUM : r_c;
+      int src_oc8_offset = src_tile_offset + j * input_unit * input_unit * C8NUM;
+      int dst_oc8_offset = (dst_tile_offset + plane * j) * C8NUM;
+      const float16_t *src_ptr = gemm_out + src_oc8_offset;
+      const float16_t *bias_ptr = bias_data + j * C8NUM;
+      float16_t *dst_ptr = tmp_out_data + dst_oc8_offset;
+      func(src_ptr, dst_ptr, bias_ptr, C8NUM, output_w, r_c, r_w, r_h, r_c);
+    }
+    out_tile_index++;
+  }
+}
+
+int WinogradWeightTransformFp16(const float16_t *weight_data, float16_t *winograd_data, const float *matrix_g,
+                                const float *matrix_gt, int oc_block, int input_unit, int kernel_unit,
+                                int filter_channel, int filter_batch, bool pack) {
   // original weight format : ohwi
   int oc_block_num = UP_DIV(filter_batch, oc_block);
   int block_stride = filter_channel * oc_block;
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/winograd_transform_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/winograd_transform_fp16.h
index e217d64b482..e99fcde1ced 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/winograd_transform_fp16.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/winograd_transform_fp16.h
@@ -30,17 +30,21 @@ extern "C" {
 #endif
 // fp16 common winograd
 void WinogradInputTransformFp16(const float16_t *input_data, float16_t *trans_input, float16_t *tmp_data, int cal_num,
-                                int out_tile_index, int out_w_block_num, ConvParameter *conv_param,
+                                int out_tile_index, int out_w_block_num, const ConvParameter *conv_param,
                                 InputTransFp16Func func);
 
-void WinogradOutputTransformFp16(const float16_t *gemm_out, float16_t *tmp_out_data, const float16_t *bias_data,
-                                 int cal_num, int out_tile_index, int output_unit_num, ConvParameter *conv_param,
-                                 OutputTransFp16Func func);
+void WinogradOutputNHWCTransformFp16(const float16_t *gemm_out, float16_t *tmp_out_data, const float16_t *bias_data,
+                                     int cal_num, int out_tile_index, int output_unit_num,
+                                     const ConvParameter *conv_param, OutputTransFp16Func func);
+
+void WinogradOutputNC4HW4TransformFp16(const float16_t *gemm_out, float16_t *tmp_out_data, const float16_t *bias_data,
+                                       int cal_num, int out_tile_index, int output_unit_num,
+                                       const ConvParameter *conv_param, OutputTransFp16Func func);
 
 // fp16 winograd weight trans
-int WinogradWeightTransformFp16(const float16_t *weight_data, float16_t *winograd_data, float *matrix_g,
-                                float *matrix_gt, int oc_block, int input_unit, int kernel_unit, int filter_channel,
-                                int filter_batch, bool pack);
+int WinogradWeightTransformFp16(const float16_t *weight_data, float16_t *winograd_data, const float *matrix_g,
+                                const float *matrix_gt, int oc_block, int input_unit, int kernel_unit,
+                                int filter_channel, int filter_batch, bool pack);
 
 #ifdef __cplusplus
 }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/winograd_utils_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/winograd_utils_fp16.c
index 745a4285ad1..0c46bd323d2 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/winograd_utils_fp16.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/winograd_utils_fp16.c
@@ -20,63 +20,6 @@
 #define MIN_UNIT_FP16 2
 #define MAX_UNIT_FP16 4
 
-void GeneralInputTransformUnitFp16(const float16_t *src_data, float16_t *dst_data, float16_t *matrix_b,
-                                   float16_t *matrix_bt, int src_step, int dst_step, int in_unit) {
-  int len = in_unit * in_unit;
-  if (len > MAX_LEN) return;
-  float16x8_t src[MAX_LEN];
-  float16x8_t t[MAX_LEN];
-  float16x8_t m[MAX_LEN];
-  float16x8_t vec_b[MAX_LEN];
-  float16x8_t vec_bt[MAX_LEN];
-  for (int i = 0; i < len; i++) {
-    src[i] = vld1q_f16(src_data + i * src_step);
-    vec_b[i] = vdupq_n_f16(matrix_b[i]);
-    vec_bt[i] = vdupq_n_f16(matrix_bt[i]);
-  }
-  MatrixMultiplyVecFp16(vec_bt, src, t, NULL, in_unit, in_unit, in_unit);
-  MatrixMultiplyVecFp16(t, vec_b, m, NULL, in_unit, in_unit, in_unit);
-  for (int i = 0; i < len; i++) {
-    int dst_step_offset = i * dst_step;
-    vst1_f16(dst_data + dst_step_offset, vget_low_f16(m[i]));
-    vst1_f16(dst_data + dst_step_offset + 64, vget_high_f16(m[i]));
-  }
-}
-
-void GeneralOutputTransformUnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data,
-                                    float16_t *matrix_a, float16_t *matrix_at, int src_step, int dst_step, int in_unit,
-                                    int out_unit) {
-  int src_len = in_unit * in_unit;
-  if (src_len > MAX_LEN) {
-    return;
-  }
-  float16x8_t src[MAX_LEN];
-  float16x8_t t[MAX_LEN];
-  float16x8_t m[MAX_LEN];
-  float16x8_t vec_a[MAX_LEN];
-  float16x8_t vec_at[MAX_LEN];
-  int tmp_len = in_unit * out_unit;
-  if (tmp_len > MAX_LEN) return;
-
-  for (int i = 0; i < tmp_len; i++) {
-    vec_a[i] = vdupq_n_f16(matrix_a[i]);
-    vec_at[i] = vdupq_n_f16(matrix_at[i]);
-  }
-  for (int i = 0; i < src_len; i++) {
-    src[i] = vld1q_f16(src_data + i * src_step);
-  }
-  MatrixMultiplyVecFp16(vec_at, src, t, NULL, out_unit, in_unit, in_unit);
-  MatrixMultiplyVecFp16(t, vec_a, m, bias_data, out_unit, in_unit, out_unit);
-
-  for (int i = 0; i < out_unit; i++) {
-    int dst_k_offset = i * dst_step * C8NUM;
-    int m_k_offset = i * out_unit;
-    for (int j = 0; j < out_unit; j++) {
-      vst1q_f16(dst_data + dst_k_offset + j * C8NUM, m[m_k_offset + j]);
-    }
-  }
-}
-
 static InputTransFp16Func InputTransFp16FuncList[] = {
   NULL, NULL, NULL, NULL, InputTransform4x4UnitFp16, NULL, InputTransform6x6UnitFp16, NULL, InputTransform8x8UnitFp16};
 
@@ -2943,7 +2886,7 @@ void OutputTransform8x7Relu6UnitFp16(const float16_t *src_data, float16_t *dst_d
   }
 }
 
-int SelectOutputUnitFp16(ConvParameter *conv_param) {
+int SelectOutputUnitFp16(const ConvParameter *conv_param) {
   int kernel_h = conv_param->kernel_h_;
   int kernel_w = conv_param->kernel_w_;
   int in_c = conv_param->input_channel_;
@@ -2980,7 +2923,7 @@ int SelectOutputUnitFp16(ConvParameter *conv_param) {
   return unit;
 }
 
-void CheckIfUseWinogradFp16(bool *use_winograd, int *output_unit, ConvParameter *conv_param) {
+void CheckIfUseWinogradFp16(bool *use_winograd, int *output_unit, const ConvParameter *conv_param) {
   if (conv_param->kernel_w_ == conv_param->kernel_h_ && conv_param->dilation_h_ == 1 && conv_param->dilation_w_ == 1 &&
       conv_param->stride_h_ == 1 && conv_param->stride_w_ == 1) {
     *output_unit = SelectOutputUnitFp16(conv_param);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/winograd_utils_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/winograd_utils_fp16.h
index f177e005bbc..dfae3fb1182 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/winograd_utils_fp16.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/winograd_utils_fp16.h
@@ -32,13 +32,6 @@ typedef void (*InputTransFp16Func)(const float16_t *src_data, float16_t *dst_dat
 typedef void (*OutputTransFp16Func)(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data,
                                     int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c);
 
-void GeneralInputTransformUnitFp16(const float16_t *src_data, float16_t *dst_data, float16_t *matrix_b,
-                                   float16_t *matrix_bt, int src_step, int dst_step, int in_unit);
-
-void GeneralOutputTransformUnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data,
-                                    float16_t *matrix_a, float16_t *matrix_at, int src_step, int dst_step, int in_unit,
-                                    int out_unit);
-
 #define Load16DataFp16                           \
   src[0] = vld1q_f16(src_data + 0 * src_step);   \
   src[1] = vld1q_f16(src_data + 1 * src_step);   \
@@ -492,9 +485,9 @@ void OutputTransform8x7ReluUnitFp16(const float16_t *src_data, float16_t *dst_da
 void OutputTransform8x7Relu6UnitFp16(const float16_t *src_data, float16_t *dst_data, const float16_t *bias_data,
                                      int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c);
 
-int SelectOutputUnitFp16(ConvParameter *conv_param);
+int SelectOutputUnitFp16(const ConvParameter *conv_param);
 
-void CheckIfUseWinogradFp16(bool *use_winograd, int *output_unit, ConvParameter *conv_param);
+void CheckIfUseWinogradFp16(bool *use_winograd, int *output_unit, const ConvParameter *conv_param);
 #ifdef __cplusplus
 }
 #endif
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_common_fp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_common_fp32.c
index 960946d4336..5d3b9688d48 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_common_fp32.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_common_fp32.c
@@ -103,6 +103,61 @@ void ConvFp32(const float *input_data, float *packed_input, const float *packed_
   }
 }
 
+#ifdef ENABLE_ARM64
+void ConvFp32OutNC4HW4(const float *input_data, float *packed_input, const float *packed_weight, const float *bias_data,
+                       float *col_major_input, float *output_data, int task_id, const ConvParameter *conv_param) {
+  if (conv_param->thread_num_ == 0) {
+    return;
+  }
+  int output_hw = conv_param->output_h_ * conv_param->output_w_;
+  Row2ColMajorFuncPtr Row2ColMajor = NULL;
+  int cal_num = 0;
+  MatmulFloatOptFuncPtr MatmulFloatOpt = NULL;
+  if (output_hw <= C4NUM) {
+    cal_num = C4NUM;
+    Row2ColMajor = RowMajor2Col4Major;
+    MatmulFloatOpt = MatmulFloatNeon64OptRow4;
+  } else if (output_hw <= C8NUM) {
+    cal_num = C8NUM;
+    Row2ColMajor = RowMajor2Col8Major;
+    MatmulFloatOpt = MatmulFloatNeon64OptRow8;
+  } else {
+    cal_num = C12NUM;
+    Row2ColMajor = RowMajor2Col12Major;
+    MatmulFloatOpt = MatmulFloatNeon64OptRow12;
+  }
+
+  int block_per_thread = UP_DIV(UP_DIV(output_hw, cal_num), conv_param->thread_num_);
+  int start_block = block_per_thread * task_id;
+  int start_hw = start_block * cal_num;
+  int end_hw = MSMIN(output_hw, (start_block + block_per_thread) * cal_num);
+  if (start_hw >= end_hw) {
+    return;
+  }
+  int out_stride = MSMIN(conv_param->output_channel_, C4NUM) * cal_num;
+  int deep = conv_param->kernel_h_ * conv_param->kernel_w_ * conv_param->input_channel_;
+  packed_input += task_id * deep * cal_num;
+  col_major_input += task_id * deep * cal_num;
+  size_t input_size = deep * cal_num * sizeof(float);
+
+  for (int b = 0; b < conv_param->input_batch_; b++) {
+    int out_channel = conv_param->output_channel_;
+    int in_offset = b * conv_param->input_channel_ * conv_param->input_h_ * conv_param->input_w_;
+    int out_offset = b * out_channel * output_hw + start_hw * MSMIN(out_channel, C4NUM);
+    for (int i = start_hw; i < end_hw; i += cal_num, out_offset += out_stride) {
+      int real_cal_row = MSMIN(output_hw - i, cal_num);
+      memset(packed_input, 0, input_size);
+      Im2ColPackUnitFp32(input_data + in_offset, conv_param, packed_input, real_cal_row, i);
+      Row2ColMajor(packed_input, col_major_input, cal_num, deep);
+      float *gemm_output = output_data + out_offset;
+
+      MatmulFloatOpt(col_major_input, packed_weight, gemm_output, bias_data, conv_param->act_type_, deep, real_cal_row,
+                     out_channel, output_hw, OutType_NC4HW4);
+    }
+  }
+}
+#endif
+
 #ifdef ENABLE_AVX
 void SWBorder(float *dst, const float *src, const float *weight, const float *bias, int top, int bottom, int left,
               int right, const ConvParameter *conv_param, const SlidingWindowParam *sw_param, const SWConvKernel kernel,
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_common_fp32.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_common_fp32.h
index 507ac4dd6a9..ae8b581a9d3 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_common_fp32.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_common_fp32.h
@@ -29,6 +29,10 @@ typedef void (*Row2ColMajorFuncPtr)(const float *src_ptr, float *dst_ptr, int ro
 #ifdef ENABLE_ARM64
 typedef void (*MatmulFloatOptFuncPtr)(const float *a, const float *b, float *c, const float *bias, int act_type,
                                       int depth, int row, int col, size_t stride, size_t write_mode);
+
+// common convolution output C4HW4, if out_channel mod 4 remains, just output real channel, no zeros padded.
+void ConvFp32OutNC4HW4(const float *input_data, float *packed_input, const float *packed_weight, const float *bias_data,
+                       float *col_major_input, float *output_data, int task_id, const ConvParameter *conv_param);
 #endif
 
 // fp32 convolution common (im2col+gemm)
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_depthwise_fp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_depthwise_fp32.c
index 621abed1dcc..0f1cf3e9b3c 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_depthwise_fp32.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_depthwise_fp32.c
@@ -356,14 +356,6 @@ bool CheckConvDwUse3X3(const ConvParameter *conv_param) {
 }
 
 #if defined(ENABLE_ARM) || (defined(ENABLE_SSE) && !defined(ENABLE_AVX))
-bool CheckConvDw1DWinograd(const ConvParameter *conv_param, int thread_num) {
-  return conv_param->kernel_h_ == 3 && conv_param->kernel_w_ == 3 && conv_param->stride_w_ == 1 &&
-         conv_param->stride_h_ == 1 && conv_param->dilation_h_ == 1 && conv_param->dilation_w_ == 1 &&
-         conv_param->pad_u_ == 1 && conv_param->pad_d_ == 1 && conv_param->pad_l_ == 1 && conv_param->pad_r_ == 1 &&
-         conv_param->input_channel_ == conv_param->output_channel_ && conv_param->output_w_ >= 4 &&
-         conv_param->output_h_ >= thread_num * 4;  // better had more than 4 rows for each thread
-}
-
 static void ConvDw3x3RowLeft(const float *src, float *line, int lw, int channel) {
   MS_FLOAT32X4 v0, v1, v2, v3;
   v0 = MS_MOVQ_F32(0.0f);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_depthwise_fp32.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_depthwise_fp32.h
index dd9dd8ebd32..ad991393978 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_depthwise_fp32.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_depthwise_fp32.h
@@ -18,6 +18,7 @@
 #define MINDSPORE_NNACL_FP32_CONV_DEPTHWISE_H_
 
 #include "nnacl/conv_parameter.h"
+#include "nnacl/base/conv_common_base.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -123,8 +124,6 @@ void ConvDw3x3Line(float *dst, float **lines, const float *weight, const float *
                    bool relu, bool relu6);
 void ConvDw3x3(float *output_data, float *buffer, const float *input_data, const float *weight_data,
                const float *bias_data, const ConvParameter *conv_param, int start_oh, int end_oh);
-
-bool CheckConvDw1DWinograd(const ConvParameter *conv_param, int thread_num);
 #endif
 
 void ConvDwFp32IndirectRow(float *output, float **input, const float *weights, const float *bias, int channels,
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_winograd_fp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_winograd_fp32.c
index 5ab8297e44c..a9c8bb16e8f 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_winograd_fp32.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_winograd_fp32.c
@@ -86,8 +86,13 @@ void ConvWinogardFp32(const float *input_data, const float *trans_weight, const
 
       // step 4 : output transform
       float *output_ptr = output_data + out_batch_offset;
-      WinogradOutputTransform(dst_ptr, output_ptr, bias_data, cal_num, out_tile_index, out_w_block, conv_param,
-                              out_func);
+      if (conv_param->out_format_ != NNACL_NC4HW4) {  // nc4hw4
+        WinogradOutputNHWCTransform(dst_ptr, output_ptr, bias_data, cal_num, out_tile_index, out_w_block, conv_param,
+                                    out_func);
+      } else {
+        WinogradOutputNC4HW4Transform(dst_ptr, output_ptr, bias_data, cal_num, out_tile_index, out_w_block, conv_param,
+                                      out_func);
+      }
     }
   }
 }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/exp_fp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/exp_fp32.c
index a6e997b94ec..43584af697b 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/exp_fp32.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/exp_fp32.c
@@ -26,6 +26,12 @@ void ExpFp32(const float *src, float *dst, int num) {
   for (; i < count; i += C4NUM) {
     simd_exp(vld1q_f32(src + i), dst + i);
   }
+#endif
+#ifdef ENABLE_AVX
+  int count = (num / C8NUM) * C8NUM;
+  for (; i < count; i += C8NUM) {
+    simd_exp_avx(_mm256_loadu_ps(src + i), dst + i);
+  }
 #endif
   for (; i < num; ++i) {
     single_exp(src[i], dst + i);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/instance_norm_fp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/instance_norm_fp32.c
index 9ef31a5a0f9..250814770e8 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/instance_norm_fp32.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/instance_norm_fp32.c
@@ -21,14 +21,11 @@
 
 int InstanceNorm(const float *src_data, float *dst_data, const float *gamma_data, const float *beta_data,
                  const InstanceNormParameter *param, size_t task_id) {
-  if (src_data == NULL || dst_data == NULL) {
-    return NNACL_NULL_PTR;
-  }
-  if (param->op_parameter_.thread_num_ == 0) {
-    return NNACL_PARAM_INVALID;
-  }
+  NNACL_CHECK_NULL_RETURN_ERR(src_data);
+  NNACL_CHECK_NULL_RETURN_ERR(dst_data);
+  NNACL_CHECK_NULL_RETURN_ERR(param->op_parameter_.thread_num_)
   int channel_step = UP_DIV(param->channel_, param->op_parameter_.thread_num_);
-  int channel_begin = task_id * channel_step;
+  int channel_begin = (int)(task_id)*channel_step;
   int channel_end = MSMIN(channel_begin + channel_step, param->channel_);
 
   for (int b = 0; b < param->batch_; b++) {
@@ -114,3 +111,74 @@ int InstanceNorm(const float *src_data, float *dst_data, const float *gamma_data
   }
   return NNACL_OK;
 }
+
+int InstanceNormNC4HW4(const float *src_data, float *dst_data, const float *gamma_data, const float *beta_data,
+                       const InstanceNormParameter *param, size_t task_id) {
+  NNACL_CHECK_NULL_RETURN_ERR(src_data);
+  NNACL_CHECK_NULL_RETURN_ERR(dst_data);
+  NNACL_CHECK_NULL_RETURN_ERR(param->op_parameter_.thread_num_);
+  int channel = param->channel_;
+  int hw_plane = param->inner_size_;
+  int channel_step = UP_DIV(UP_DIV(channel, C4NUM), param->op_parameter_.thread_num_) * C4NUM;
+  int channel_begin = (int)(task_id)*channel_step;
+  int channel_end = MSMIN(channel_begin + channel_step, channel);
+#if defined(ENABLE_SSE) || defined(ENABLE_ARM)
+  int c4_down = channel_end / C4NUM * C4NUM;
+  MS_FLOAT32X4 hw_planev = MS_MOVQ_F32((float)(hw_plane));
+#endif
+  for (int b = 0; b < param->batch_; b++) {
+    const float *src_b = src_data + b * channel * hw_plane;
+    float *dst_b = dst_data + b * channel * hw_plane;
+    int c = channel_begin;
+#if defined(ENABLE_ARM) || defined(ENABLE_SSE)
+    for (; c < c4_down; c += C4NUM) {
+      const float *src = src_b + c * hw_plane;
+      float *dst = dst_b + c;
+      MS_FLOAT32X4 mean = MS_MOVQ_F32(0.0f);
+      MS_FLOAT32X4 square_mean = MS_MOVQ_F32(0.0f);
+      for (int index = 0; index < hw_plane; ++index) {
+        MS_FLOAT32X4 srcv = MS_LDQ_F32(src + index * C4NUM);
+        MS_FLOAT32X4 squarev = MS_MULQ_F32(srcv, srcv);
+        mean = MS_ADDQ_F32(mean, srcv);
+        square_mean = MS_ADDQ_F32(square_mean, squarev);
+      }
+      mean = MS_DIVQ_F32(mean, hw_planev);
+      square_mean = MS_DIVQ_F32(square_mean, hw_planev);
+      MS_FLOAT32X4 deno =
+        MS_ADDQ_F32(MS_SUBQ_F32(square_mean, MS_MULQ_F32(mean, mean)), MS_MOVQ_F32(param->epsilon_));  // question
+      deno = MS_DIVQ_F32(MS_MOVQ_F32(1.0f), MS_SQRTFX4_F32(deno));
+
+      MS_FLOAT32X4 gammav = MS_MULQ_F32(MS_LDQ_F32(gamma_data + c), deno);  // deno * gamma_data[c]
+      MS_FLOAT32X4 betav = MS_LDQ_F32(beta_data + c);
+      for (int index = 0; index < hw_plane; ++index) {
+        MS_FLOAT32X4 srcv = MS_LDQ_F32(src + index * C4NUM);
+        MS_FLOAT32X4 outv = MS_SUBQ_F32(srcv, mean);
+        outv = MS_MULQ_F32(outv, gammav);
+        outv = MS_ADDQ_F32(outv, betav);
+        MS_STQ_F32(dst + index * channel, outv);
+      }
+    }
+#endif
+    for (; c < channel_end; ++c) {
+      int c4_down_loop = c / C4NUM * C4NUM;
+      int c4_mod = c % C4NUM;
+      int c_res = MSMIN(channel_end - c4_down_loop, C4NUM);
+      const float *src = src_b + c4_down_loop * hw_plane + c4_mod;
+      float *dst = dst_b + c;
+      float mean = 0.0f;
+      float square_mean = 0.0f;
+      for (int index = 0; index < hw_plane; ++index) {
+        float tmp = src[index * c_res];
+        mean += tmp;
+        square_mean += tmp * tmp;
+      }
+      mean /= (float)hw_plane;
+      square_mean /= (float)hw_plane;
+      const float deno = gamma_data[c] / sqrtf(square_mean - mean * mean + param->epsilon_);
+      for (int index = 0; index < hw_plane; ++index) {
+        dst[index * channel] = (src[index * c_res] - mean) * deno + beta_data[c];
+      }
+    }
+  }
+  return NNACL_OK;
+}
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/instance_norm_fp32.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/instance_norm_fp32.h
index b0bf3bf64cc..509fc6481ef 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/instance_norm_fp32.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/instance_norm_fp32.h
@@ -25,6 +25,8 @@ extern "C" {
 
 int InstanceNorm(const float *src_data, float *dst_data, const float *gamma_data, const float *beta_data,
                  const InstanceNormParameter *param, size_t task_id);
+int InstanceNormNC4HW4(const float *src_data, float *dst_data, const float *gamma_data, const float *beta_data,
+                       const InstanceNormParameter *param, size_t task_id);
 #ifdef __cplusplus
 }
 #endif
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/winograd_transform.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/winograd_transform.c
index e23023dfa2f..ccde50e3dc5 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/winograd_transform.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/winograd_transform.c
@@ -95,9 +95,9 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float *
   }  // cal_tile_num loop
 }
 
-void WinogradOutputTransform(const float *gemm_out, float *out_data, const float *bias_data, int cal_num,
-                             int out_tile_index, int output_unit_num, const ConvParameter *conv_param,
-                             OutputTransFunc func) {
+void WinogradOutputNHWCTransform(const float *gemm_out, float *out_data, const float *bias_data, int cal_num,
+                                 int out_tile_index, int output_unit_num, const ConvParameter *conv_param,
+                                 OutputTransFunc func) {
   int output_unit = conv_param->output_unit_;
   int output_w = conv_param->output_w_;
   int output_h = conv_param->output_h_;
@@ -137,3 +137,47 @@ void WinogradOutputTransform(const float *gemm_out, float *out_data, const float
     out_tile_index++;
   }
 }
+
+void WinogradOutputNC4HW4Transform(const float *gemm_out, float *out_data, const float *bias_data, int cal_num,
+                                   int out_tile_index, int output_unit_num, const ConvParameter *conv_param,
+                                   OutputTransFunc func) {
+  int output_unit = conv_param->output_unit_;
+  int output_w = conv_param->output_w_;
+  int output_h = conv_param->output_h_;
+  int output_plane = output_w * output_h;
+  int output_channel = conv_param->output_channel_;
+  int oc4 = UP_DIV(output_channel, C4NUM);
+  int oc8 = UP_DIV(output_channel, C8NUM);
+  int input_unit = conv_param->input_unit_;
+  NNACL_CHECK_ZERO_RETURN(output_unit_num);
+
+  for (int i = 0; i < cal_num; i++) {
+    int dst_x_s = out_tile_index % output_unit_num;
+    int dst_y_s = out_tile_index / output_unit_num;
+    int r_w = output_w - dst_x_s * output_unit;
+    r_w = r_w > output_unit ? output_unit : r_w;
+    int r_h = output_h - dst_y_s * output_unit;
+    r_h = r_h > output_unit ? output_unit : r_h;
+    int tmp_ix = dst_x_s * output_unit;
+    dst_x_s = tmp_ix > output_w ? output_w : tmp_ix;
+    int tmp_iy = dst_y_s * output_unit;
+    dst_y_s = tmp_iy > output_h ? output_h : tmp_iy;
+
+    int src_tile_offset = i * oc8 * C8NUM * input_unit * input_unit;
+    int dst_tile_offset = dst_x_s + dst_y_s * output_w;
+
+    for (int j = 0; j < oc4; j++) {
+      int c8_block = j / 2;
+      int c8_res = j % 2;
+      int r_c = output_channel - j * C4NUM;
+      r_c = r_c > C4NUM ? C4NUM : r_c;
+      int src_oc4_offset = src_tile_offset + c8_block * input_unit * input_unit * C8NUM + c8_res * C4NUM;
+      int dst_oc4_offset = (dst_tile_offset + output_plane * j) * C4NUM;
+      const float *src_ptr = gemm_out + src_oc4_offset;
+      const float *bias_ptr = bias_data + j * C4NUM;
+      float *dst_ptr = out_data + dst_oc4_offset;
+      func(src_ptr, dst_ptr, bias_ptr, C8NUM, output_w, r_c, r_w, r_h, r_c);
+    }
+    out_tile_index++;
+  }
+}
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/winograd_transform.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/winograd_transform.h
index 9bfc99ebcbb..ac44f169fb4 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/winograd_transform.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/winograd_transform.h
@@ -32,9 +32,13 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float *
                             int out_tile_index, int out_w_block_num, const ConvParameter *conv_param,
                             InputTransFunc func);
 
-void WinogradOutputTransform(const float *gemm_out, float *out_data, const float *bias_data, int cal_num,
-                             int out_tile_index, int output_unit_num, const ConvParameter *conv_param,
-                             OutputTransFunc func);
+void WinogradOutputNHWCTransform(const float *gemm_out, float *out_data, const float *bias_data, int cal_num,
+                                 int out_tile_index, int output_unit_num, const ConvParameter *conv_param,
+                                 OutputTransFunc func);
+
+void WinogradOutputNC4HW4Transform(const float *gemm_out, float *out_data, const float *bias_data, int cal_num,
+                                   int out_tile_index, int output_unit_num, const ConvParameter *conv_param,
+                                   OutputTransFunc func);
 
 #ifdef __cplusplus
 }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/winograd_utils.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/winograd_utils.c
index 99b7272beb3..6d7695a9fbc 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/winograd_utils.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/winograd_utils.c
@@ -16,165 +16,22 @@
 #include "nnacl/fp32/winograd_utils.h"
 #include "nnacl/intrinsics/ms_simd_instructions.h"
 #include "nnacl/base/minimal_filtering_generator.h"
+#include "nnacl/base/conv_common_base.h"
 #include "nnacl/errorcode.h"
 
-#define MIN_UNIT 2
-#define MAX_UNIT 8
-
 static InputTransFunc InputTransFuncList[] = {
   NULL, NULL, NULL, NULL, InputTransform4x4Unit, NULL, InputTransform6x6Unit, NULL, InputTransform8x8Unit};
 
-static OutputTransFunc OutputTransFuncList4[] = {NULL, NULL, OutputTransform4x2Unit, OutputTransform4x3Unit};
-
-static OutputTransFunc OutputTransFuncReluList4[] = {NULL, NULL, OutputTransform4x2ReluUnit,
-                                                     OutputTransform4x3ReluUnit};
-static OutputTransFunc OutputTransFuncRelu6List4[] = {NULL, NULL, OutputTransform4x2Relu6Unit,
-                                                      OutputTransform4x3Relu6Unit};
-
-static OutputTransFunc OutputTransFuncList6[] = {
-  NULL, NULL, OutputTransform6x2Unit, OutputTransform6x3Unit, OutputTransform6x4Unit, OutputTransform6x5Unit};
-
-static OutputTransFunc OutputTransFuncReluList6[] = {NULL,
-                                                     NULL,
-                                                     OutputTransform6x2ReluUnit,
-                                                     OutputTransform6x3ReluUnit,
-                                                     OutputTransform6x4ReluUnit,
-                                                     OutputTransform6x5ReluUnit};
-
-static OutputTransFunc OutputTransFuncRelu6List6[] = {NULL,
-                                                      NULL,
-                                                      OutputTransform6x2Relu6Unit,
-                                                      OutputTransform6x3Relu6Unit,
-                                                      OutputTransform6x4Relu6Unit,
-                                                      OutputTransform6x5Relu6Unit};
-
-static OutputTransFunc OutputTransFuncList8[] = {NULL,
-                                                 NULL,
-                                                 OutputTransform8x2Unit,
-                                                 OutputTransform8x3Unit,
-                                                 OutputTransform8x4Unit,
-                                                 OutputTransform8x5Unit,
-                                                 OutputTransform8x6Unit,
-                                                 OutputTransform8x7Unit};
-
-static OutputTransFunc OutputTransFuncReluList8[] = {NULL,
-                                                     NULL,
-                                                     OutputTransform8x2ReluUnit,
-                                                     OutputTransform8x3ReluUnit,
-                                                     OutputTransform8x4ReluUnit,
-                                                     OutputTransform8x5ReluUnit,
-                                                     OutputTransform8x6ReluUnit,
-                                                     OutputTransform8x7ReluUnit};
-
-static OutputTransFunc OutputTransFuncRelu6List8[] = {NULL,
-                                                      NULL,
-                                                      OutputTransform8x2Relu6Unit,
-                                                      OutputTransform8x3Relu6Unit,
-                                                      OutputTransform8x4Relu6Unit,
-                                                      OutputTransform8x5Relu6Unit,
-                                                      OutputTransform8x6Relu6Unit,
-                                                      OutputTransform8x7Relu6Unit};
-
-void GeneralInputTransformUnit(const float *src_data, float *dst_data, const float *matrix_b, const float *matrix_bt,
-                               int src_step, int dst_step, int in_unit) {
-  int len = in_unit * in_unit;
-  if (len > MAX_LEN) return;
-#if defined(ENABLE_ARM) || defined(ENABLE_SSE)
-  MS_FLOAT32X4 src[MAX_LEN];
-  MS_FLOAT32X4 t[MAX_LEN];
-  MS_FLOAT32X4 m[MAX_LEN];
-  MS_FLOAT32X4 vec_b[MAX_LEN];
-  MS_FLOAT32X4 vec_bt[MAX_LEN];
-  for (int i = 0; i < len; i++) {
-    src[i] = MS_LDQ_F32(src_data + i * src_step);
-    vec_b[i] = MS_MOVQ_F32(matrix_b[i]);
-    vec_bt[i] = MS_MOVQ_F32(matrix_bt[i]);
-  }
-  MatrixMultiplyVec(vec_bt, src, t, NULL, in_unit, in_unit, in_unit);
-  MatrixMultiplyVec(t, vec_b, m, NULL, in_unit, in_unit, in_unit);
-  for (int i = 0; i < len; i++) {
-    MS_STQ_F32(dst_data + i * dst_step, m[i]);
-  }
-#else
-  float src[MAX_LEN];
-  float t[MAX_LEN];
-  float m[MAX_LEN];
-  for (int i = 0; i < C4NUM; ++i) {
-    for (int j = 0; j < len; ++j) {
-      src[j] = src_data[i + j * src_step];
-    }
-    MatrixMultiply(matrix_bt, src, t, in_unit, in_unit, in_unit);
-    MatrixMultiply(t, matrix_b, m, in_unit, in_unit, in_unit);
-    for (int k = 0; k < len; ++k) {
-      dst_data[i + k * dst_step] = m[k];
-    }
-  }
-#endif
-}
-
-void GeneralOutputTransformUnit(const float *src_data, float *dst_data, const float *bias_data, const float *matrix_a,
-                                const float *matrix_at, int src_step, int dst_step, int in_unit, int out_unit) {
-  int src_len = in_unit * in_unit;
-  if (src_len > MAX_LEN) {
-    return;
-  }
-#if defined(ENABLE_ARM) || defined(ENABLE_SSE)
-  MS_FLOAT32X4 src[MAX_LEN];
-  MS_FLOAT32X4 t[MAX_LEN];
-  MS_FLOAT32X4 m[MAX_LEN];
-  MS_FLOAT32X4 vec_a[MAX_LEN];
-  MS_FLOAT32X4 vec_at[MAX_LEN];
-  int tmp_len = in_unit * out_unit;
-  if (tmp_len > MAX_LEN) {
-    return;
-  }
-  if (out_unit * out_unit > MAX_LEN) {
-    return;
-  }
-
-  for (int i = 0; i < tmp_len; i++) {
-    vec_a[i] = MS_MOVQ_F32(matrix_a[i]);
-    vec_at[i] = MS_MOVQ_F32(matrix_at[i]);
-  }
-  for (int i = 0; i < src_len; i++) {
-    src[i] = MS_LDQ_F32(src_data + i * src_step);
-  }
-  MatrixMultiplyVec(vec_at, src, t, NULL, out_unit, in_unit, in_unit);
-  MatrixMultiplyVec(t, vec_a, m, bias_data, out_unit, in_unit, out_unit);
-  if ((out_unit - 1) * out_unit + out_unit - 1 > MAX_LEN) {
-    return;
-  }
-  for (int i = 0; i < out_unit; i++) {
-    int dst_k_offset = i * dst_step * C4NUM;
-    int m_k_offset = i * out_unit;
-    for (int j = 0; j < out_unit; j++) {
-      MS_STQ_F32(dst_data + dst_k_offset + j * C4NUM, m[m_k_offset + j]);
-    }
-  }
-#else
-  float src[MAX_LEN];
-  float t[MAX_LEN];
-  float m[MAX_LEN];
-  for (int i = 0; i < C4NUM; ++i) {
-    // load source data
-    for (int j = 0; j < src_len; ++j) {
-      src[j] = src_data[i + j * src_step];
-    }
-    // AT * x * A
-    MatrixMultiply(matrix_at, src, t, out_unit, in_unit, in_unit);
-    MatrixMultiply(t, matrix_a, m, out_unit, in_unit, out_unit);
-
-    // store output
-    for (int k = 0; k < out_unit; ++k) {
-      int dst_k_offset = k * dst_step * C4NUM;
-      int m_k_offset = k * out_unit;
-      for (int j = 0; j < out_unit; ++j) {
-        dst_data[i + dst_k_offset + j * C4NUM] = m[j + m_k_offset] + bias_data[i];
-      }
-    }
-  }
-#endif
-}
+static OutputTransFunc OutputTransFuncList[] = {
+  OutputTransform4x2Unit,      OutputTransform4x3Unit,      OutputTransform4x2ReluUnit,  OutputTransform4x3ReluUnit,
+  OutputTransform4x2Relu6Unit, OutputTransform4x3Relu6Unit, OutputTransform6x2Unit,      OutputTransform6x3Unit,
+  OutputTransform6x4Unit,      OutputTransform6x5Unit,      OutputTransform6x2ReluUnit,  OutputTransform6x3ReluUnit,
+  OutputTransform6x4ReluUnit,  OutputTransform6x5ReluUnit,  OutputTransform6x2Relu6Unit, OutputTransform6x3Relu6Unit,
+  OutputTransform6x4Relu6Unit, OutputTransform6x5Relu6Unit, OutputTransform8x2Unit,      OutputTransform8x3Unit,
+  OutputTransform8x4Unit,      OutputTransform8x5Unit,      OutputTransform8x6Unit,      OutputTransform8x7Unit,
+  OutputTransform8x2ReluUnit,  OutputTransform8x3ReluUnit,  OutputTransform8x4ReluUnit,  OutputTransform8x5ReluUnit,
+  OutputTransform8x6ReluUnit,  OutputTransform8x7ReluUnit,  OutputTransform8x2Relu6Unit, OutputTransform8x3Relu6Unit,
+  OutputTransform8x4Relu6Unit, OutputTransform8x5Relu6Unit, OutputTransform8x6Relu6Unit, OutputTransform8x7Relu6Unit};
 
 InputTransFunc GetInputTransFunc(int input_unit) { return InputTransFuncList[input_unit]; }
 
@@ -431,33 +288,23 @@ void InputTransform8x8Unit(const float *src_data, float *dst_data, int src_step,
 }
 
 OutputTransFunc GetOutputTransFunc(int input_unit, int output_unit, ActType act_type) {
-  if (input_unit == 4 && output_unit < 4) {
-    if (act_type == ActType_Relu) {
-      return OutputTransFuncReluList4[output_unit];
-    } else if (act_type == ActType_Relu6) {
-      return OutputTransFuncRelu6List4[output_unit];
-    } else {
-      return OutputTransFuncList4[output_unit];
-    }
-  } else if (input_unit == 6 && output_unit < 6) {
-    if (act_type == ActType_Relu) {
-      return OutputTransFuncReluList6[output_unit];
-    } else if (act_type == ActType_Relu6) {
-      return OutputTransFuncRelu6List6[output_unit];
-    } else {
-      return OutputTransFuncList6[output_unit];
-    }
-  } else if (input_unit == 8 && output_unit < 8) {
-    if (act_type == ActType_Relu) {
-      return OutputTransFuncReluList8[output_unit];
-    } else if (act_type == ActType_Relu6) {
-      return OutputTransFuncRelu6List8[output_unit];
-    } else {
-      return OutputTransFuncList8[output_unit];
-    }
-  } else {
+  if (!CheckWinogradInputOutputUnit(input_unit, output_unit)) {
     return NULL;
   }
+  int in_index = (input_unit - 4) / 2;
+  int index = 0;
+  for (int i = 0; i < in_index; i++) {
+    index += ((i * 2 + 4) - 2) * 3;
+  }
+  int act_index;
+  if (act_type == ActType_Relu) {
+    act_index = 1;
+  } else if (act_type == ActType_Relu6) {
+    act_index = 2;
+  } else {
+    act_index = 0;
+  }
+  return OutputTransFuncList[index + (input_unit - 2) * act_index + output_unit - 2];
 }
 
 void OutputTransform4x2Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step,
@@ -3849,57 +3696,3 @@ void OutputTransform8x7Relu6Unit(const float *src_data, float *dst_data, const f
   }
 }
 #endif
-
-// Reference to the paper "Fast Algorithms for Convolutional Neural Networks"
-// Utilize cost model to compute performance gain.
-// If the gain is greater than got from Im2col, winograd algorithm will be chosen.
-int SelectOutputUnit(const ConvParameter *conv_param) {
-  int kernel_h = conv_param->kernel_h_;
-  int kernel_w = conv_param->kernel_w_;
-  int in_c = conv_param->input_channel_;
-  int out_w = conv_param->output_w_;
-  int out_h = conv_param->output_h_;
-  int out_c = conv_param->output_channel_;
-  if (conv_param->op_parameter_.thread_num_ == 0) {
-    return NNACL_PARAM_INVALID;
-  }
-  int unit2 = UP_DIV(out_w * out_h, C12NUM * conv_param->op_parameter_.thread_num_);
-  int max_out_unit = (int)(sqrtf((float)unit2));
-  max_out_unit = max_out_unit < MAX_UNIT ? max_out_unit : MAX_UNIT;
-  max_out_unit = max_out_unit > MIN_UNIT ? max_out_unit : MIN_UNIT;
-
-  int unit = 0;
-  float max_rate = 0.0f;
-  float common_cost = (float)out_h * out_w * in_c * out_c * kernel_h * kernel_w;
-
-  for (int i = MIN_UNIT; i <= max_out_unit; ++i) {
-    int input_unit = i + kernel_w - 1;
-    if (!GetOutputTransFunc(input_unit, i, ActType_No)) {
-      continue;
-    }
-    float penalty = ((float)input_unit * input_unit) / ((float)kernel_h * kernel_w) * 0.12f;
-    float wino_cost = ((2 + out_c) * (float)input_unit * input_unit * in_c + ((float)input_unit + i) * i * out_c) *
-                      UP_DIV(out_w, i) * UP_DIV(out_h, i);
-    float reduce_rate = common_cost / wino_cost - penalty;
-    if (reduce_rate > max_rate) {
-      max_rate = reduce_rate;
-      unit = i;
-    }
-  }
-  if (max_rate < 1.0f) {
-    return 1;
-  }
-  // If output_unit is 1, then it is conventional convolution
-  return unit;
-}
-
-bool CheckIfUseWinograd(int *output_unit, const ConvParameter *conv_param) {
-  if (conv_param->kernel_w_ == conv_param->kernel_h_ && conv_param->dilation_h_ == 1 && conv_param->dilation_w_ == 1 &&
-      conv_param->stride_h_ == 1 && conv_param->stride_w_ == 1 && conv_param->input_channel_ != 1) {
-    *output_unit = SelectOutputUnit(conv_param);
-    if (*output_unit > 1) {
-      return true;
-    }
-  }
-  return false;
-}
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/winograd_utils.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/winograd_utils.h
index 39c0b270ec5..539ba6a42df 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/winograd_utils.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/winograd_utils.h
@@ -31,12 +31,6 @@ typedef void (*InputTransFunc)(const float *src_data, float *dst_data, int src_s
 typedef void (*OutputTransFunc)(const float *src_data, float *dst_data, const float *bias_data, int src_step,
                                 int dst_step, int out_c, int r_w, int r_h, int r_c);
 
-void GeneralInputTransformUnit(const float *src_data, float *dst_data, const float *matrix_b, const float *matrix_bt,
-                               int src_step, int dst_step, int in_unit);
-
-void GeneralOutputTransformUnit(const float *src_data, float *dst_data, const float *bias_data, const float *matrix_a,
-                                const float *matrix_at, int src_step, int dst_step, int in_unit, int out_unit);
-
 #define Load16Data                                \
   src[0] = MS_LDQ_F32(src_data + 0 * src_step);   \
   src[1] = MS_LDQ_F32(src_data + 1 * src_step);   \
@@ -308,7 +302,6 @@ void OutputTransform8x7Relu6Unit(const float *src_data, float *dst_data, const f
 
 int SelectOutputUnit(const ConvParameter *conv_param);
 
-bool CheckIfUseWinograd(int *output_unit, const ConvParameter *conv_param);
 #ifdef __cplusplus
 }
 #endif
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/pooling_grad.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/pooling_grad.c
index 0c97397bdff..7f91923fffb 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/pooling_grad.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/pooling_grad.c
@@ -170,7 +170,7 @@ void MaxPoolingGrad(const float *input_ptr, const float *dy_ptr, float *output_p
           float delta = dyPtr[idx];
           for (int kh = kh_s; kh < kh_e; kh++) {
             int xh = yh * stride_h + kh - pad_h;
-            for (int kw = kw_e; kw < kw_s; kw++) {
+            for (int kw = kw_s; kw < kw_e; kw++) {
               int xw = yw * stride_w + kw - pad_w;
               int val_idx = (xw + in_w * xh) * channel + ic;
               float val = inPtr[val_idx];
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/broadcast_to_infer.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/broadcast_to_infer.c
index 31e36427ad0..8212f291c4b 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/broadcast_to_infer.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/broadcast_to_infer.c
@@ -62,6 +62,19 @@ int GetShapeByType(const TensorC *shape_tensor, size_t shape_size, int32_t *dst_
   return NNACL_OK;
 }
 
+int CheckShape(const int *input_shape, const int *dst_shape, const int input_shape_index, const int dst_shape_index) {
+  if (dst_shape[dst_shape_index] < 0) {
+    return NNACL_ERR;
+  }
+  if (input_shape_index >= 0) {
+    int input_shape_i = input_shape[input_shape_index];
+    if (input_shape_i != dst_shape[dst_shape_index] && input_shape_i != 1 && dst_shape[dst_shape_index] != 1) {
+      return NNACL_ERR;
+    }
+  }
+  return NNACL_OK;
+}
+
 int BroadcastToInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
                           OpParameter *parameter) {
   int ret = CheckAugmentNull(inputs, inputs_size, outputs, outputs_size, parameter);
@@ -112,21 +125,12 @@ int BroadcastToInferShape(const TensorC *const *inputs, size_t inputs_size, Tens
   size_t input_shape_size = input->shape_size_;
   int shape[MAX_SHAPE_SIZE];
   int input_shape_index = (int)(input_shape_size)-1;
-  if (input_shape_size > dst_shape_size) {
-    return NNACL_ERR;
-  }
 
   for (int i = (int)(dst_shape_size)-1; i >= 0; --i) {
-    if (dst_shape[i] < 0) {
+    if (CheckShape(input_shape, dst_shape, input_shape_index, i) != NNACL_OK) {
       return NNACL_ERR;
     }
-    if (input_shape_index >= 0) {
-      int dim = input_shape[input_shape_index];
-      if (dim != dst_shape[i] && dim != 1) {
-        return NNACL_ERR;
-      }
-    }
-    shape[i] = dst_shape[i];
+    shape[i] = dst_shape[i] == 1 ? input_shape[input_shape_index] : dst_shape[i];
     --input_shape_index;
   }
   SetShapeArray(outputs[0], shape, dst_shape_size);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/common_infer.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/common_infer.c
index 30c75d62d1e..580641d182b 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/common_infer.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/common_infer.c
@@ -18,7 +18,7 @@
 #include <string.h>
 #include "nnacl/infer/infer_register.h"
 
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
 int MallocTensorListData(TensorListC *tensor_list, TypeIdC dtype, const vvector *tensor_shape) {
   // This function will create a new tensors_
   // Your must to set shape(param2: tensor_shape) and data_type_(tensors_data_type_ = param1: dtype) of each tensor in
@@ -418,7 +418,7 @@ bool InferFlag(const TensorC *const *inputs, size_t inputs_size) {
     if (inputs[i] == NULL) {
       return false;
     }
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
     if (inputs[i]->data_type_ == kObjectTypeTensorType) {
       TensorListC *input_tensor_list = (TensorListC *)inputs[i];
       if (input_tensor_list->shape_value_ == -1) {
@@ -431,7 +431,7 @@ bool InferFlag(const TensorC *const *inputs, size_t inputs_size) {
           return false;
         }
       }
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
     }
 #endif
   }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/common_infer.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/common_infer.h
index 63e95a1203e..641cf1326c7 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/common_infer.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/common_infer.h
@@ -138,7 +138,7 @@ typedef struct vvector {
   size_t size_;      // number of shapes
 } vvector;
 
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
 typedef struct TensorListC {
   bool is_ready_;
   int data_type_;
@@ -160,7 +160,7 @@ typedef struct VectorC {
   size_t per_malloc_size_;
 } VectorC;
 
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
 int MallocTensorListData(TensorListC *tensor_list, TypeIdC dtype, const vvector *tensor_shape);
 int TensorListMergeShape(int *element_shape, size_t *element_shape_size, const int *tmp, size_t tmp_size);
 bool TensorListIsFullyDefined(const int *shape, size_t shape_size);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/conv2d_infer.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/conv2d_infer.c
index 4193630893e..e7805d26857 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/conv2d_infer.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/conv2d_infer.c
@@ -16,7 +16,7 @@
 #include "nnacl/infer/conv2d_infer.h"
 #include "nnacl/infer/infer_register.h"
 
-void ConvInferShape(int input_h, int input_w, int *output_h, int *output_w, ConvParameter *param) {
+int ConvInferShape(int input_h, int input_w, int *output_h, int *output_w, ConvParameter *param) {
   int kernel_w = param->kernel_w_;
   int kernel_h = param->kernel_h_;
   int stride_w = param->stride_w_;
@@ -52,6 +52,12 @@ void ConvInferShape(int input_h, int input_w, int *output_h, int *output_w, Conv
     *output_w = ((input_w) + param->pad_l_ + param->pad_r_ - kernel_width) / stride_w + 1;
     *output_h = ((input_h) + param->pad_u_ + param->pad_d_ - kernel_height) / stride_h + 1;
   }
+
+  if (param->kernel_h_ > input_h + param->pad_u_ + param->pad_d_ ||
+      param->kernel_w_ > input_w + param->pad_l_ + param->pad_r_) {
+    return NNACL_PARAM_INVALID;
+  }
+  return NNACL_OK;
 }
 
 int Conv2dInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
@@ -89,9 +95,13 @@ int Conv2dInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC *
   if (param->stride_h_ == 0 || param->stride_w_ == 0) {
     return NNACL_PARAM_INVALID;
   }
+
   param->kernel_h_ = param->kernel_h_ != -1 ? param->kernel_h_ : weight_tensor->shape_[1];
   param->kernel_w_ = param->kernel_w_ != -1 ? param->kernel_w_ : weight_tensor->shape_[2];
-  ConvInferShape(input_h, input_w, &output_h, &output_w, param);
+  int ret = ConvInferShape(input_h, input_w, &output_h, &output_w, param);
+  if (ret != NNACL_OK) {
+    return ret;
+  }
 
   int out_shape[MAX_SHAPE_SIZE];
   size_t out_shape_size = 0;
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/custom_extract_features_infer.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/custom_extract_features_infer.c
deleted file mode 100644
index 656b938a1f4..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/custom_extract_features_infer.c
+++ /dev/null
@@ -1,49 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "nnacl/infer/custom_extract_features_infer.h"
-#include "nnacl/infer/infer_register.h"
-
-int CustomExtractFeaturesInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs,
-                                    size_t outputs_size, OpParameter *parameter) {
-  int check_ret = CheckAugmentWithMinSize(inputs, inputs_size, outputs, outputs_size, parameter, 1, 2);
-  if (check_ret != NNACL_OK) {
-    return check_ret;
-  }
-
-  const TensorC *input = inputs[0];
-  TensorC *output0 = outputs[0];
-  TensorC *output1 = outputs[1];
-
-  output0->data_type_ = kNumberTypeInt32;
-  output0->format_ = input->format_;
-  output1->data_type_ = kNumberTypeFloat32;
-  output1->format_ = input->format_;
-
-  if (input->data_ == NULL) {
-    return NNACL_INFER_INVALID;
-  }
-  int string_num = *((const int32_t *)(input->data_));
-
-  int res = (string_num == 0 ? 1 : string_num);
-  output0->shape_size_ = 1;
-  output0->shape_[0] = res;
-  output1->shape_size_ = 1;
-  output1->shape_[0] = res;
-  return NNACL_OK;
-}
-
-REG_INFER(CustomExtractFeatures, PrimType_CustomExtractFeatures, CustomExtractFeaturesInferShape)
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/custom_extract_features_infer.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/custom_extract_features_infer.h
deleted file mode 100644
index 8890561c805..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/custom_extract_features_infer.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef MINDSPORE_NNACL_CUSTOM_EXTRACT_FEATURES_INFER_H
-#define MINDSPORE_NNACL_CUSTOM_EXTRACT_FEATURES_INFER_H
-
-#include "nnacl/infer/common_infer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int CustomExtractFeaturesInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs,
-                                    size_t outputs_size, OpParameter *parameter);
-
-#ifdef __cplusplus
-}
-#endif
-#endif  // MINDSPORE_NNACL_CUSTOM_EXTRACT_FEATURES_INFER_H
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/custom_normalize_infer.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/custom_normalize_infer.c
deleted file mode 100644
index 00c6ea1c552..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/custom_normalize_infer.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "nnacl/infer/custom_normalize_infer.h"
-#include "nnacl/infer/infer_register.h"
-
-int CustomNormalizeInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
-                              OpParameter *parameter) {
-  int check_ret = CheckAugmentWithMinSize(inputs, inputs_size, outputs, outputs_size, parameter, 1, 1);
-  if (check_ret != NNACL_OK) {
-    return check_ret;
-  }
-
-  const TensorC *input = inputs[0];
-  TensorC *output = outputs[0];
-
-  SetDataTypeFormat(output, input);
-
-  if (input->data_ == NULL) {
-    return NNACL_INFER_INVALID;
-  }
-  int string_num = *((const int32_t *)(input->data_));  // also look custom_extract_features
-
-  output->shape_size_ = 1;
-  output->shape_[0] = (string_num == 0 ? 1 : string_num);
-  return NNACL_OK;
-}
-
-REG_INFER(CustomNormalize, PrimType_CustomNormalize, CustomNormalizeInferShape)
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/custom_normalize_infer.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/custom_normalize_infer.h
deleted file mode 100644
index 63558b5b443..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/custom_normalize_infer.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef MINDSPORE_NNACL_CUSTOM_NORMALIZE_INFER_H
-#define MINDSPORE_NNACL_CUSTOM_NORMALIZE_INFER_H
-
-#include "nnacl/infer/common_infer.h"
-#include "nnacl/softmax_parameter.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int CustomNormalizeInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
-                              OpParameter *parameter);
-
-#ifdef __cplusplus
-}
-#endif
-#endif  // MINDSPORE_NNACL_CUSTOM_NORMALIZE_INFER_H
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/custom_predict_infer.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/custom_predict_infer.c
deleted file mode 100644
index 8c2b84463e7..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/custom_predict_infer.c
+++ /dev/null
@@ -1,43 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "nnacl/infer/custom_predict_infer.h"
-#include "nnacl/infer/infer_register.h"
-
-int CustomPredictInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
-                            OpParameter *parameter) {
-  int check_ret = CheckAugmentWithMinSize(inputs, inputs_size, outputs, outputs_size, parameter, 1, 2);
-  if (check_ret != NNACL_OK) {
-    return check_ret;
-  }
-
-  const TensorC *input = inputs[0];
-  TensorC *output0 = outputs[0];
-  TensorC *output1 = outputs[1];
-
-  CustomPredictParameter *param = (CustomPredictParameter *)parameter;
-  output0->shape_size_ = 1;
-  output0->shape_[0] = param->output_num;
-  output0->data_type_ = kNumberTypeInt32;
-  output0->format_ = input->format_;
-  output1->shape_size_ = 1;
-  output1->shape_[0] = param->output_num;
-  output1->data_type_ = kNumberTypeFloat32;
-  output1->format_ = input->format_;
-  return NNACL_OK;
-}
-
-REG_INFER(CustomPredict, PrimType_CustomPredict, CustomPredictInferShape)
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/custom_predict_infer.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/custom_predict_infer.h
deleted file mode 100644
index e78ec87c2b4..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/custom_predict_infer.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef MINDSPORE_NNACL_CUSTOM_PREDICT_INFER_H
-#define MINDSPORE_NNACL_CUSTOM_PREDICT_INFER_H
-
-#include "nnacl/infer/common_infer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct CustomPredictParameter {
-  OpParameter op_parameter_;
-  int output_num;
-} CustomPredictParameter;
-
-int CustomPredictInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
-                            OpParameter *parameter);
-
-#ifdef __cplusplus
-}
-#endif
-#endif  // MINDSPORE_NNACL_CUSTOM_PREDICT_INFER_H
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/hashtable_lookup_infer.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/hashtable_lookup_infer.c
deleted file mode 100644
index 4c1be9c280b..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/hashtable_lookup_infer.c
+++ /dev/null
@@ -1,45 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "nnacl/infer/hashtable_lookup_infer.h"
-#include "nnacl/infer/infer_register.h"
-
-int HashtableLoopupInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
-                              OpParameter *parameter) {
-  int check_ret = CheckAugmentWithMinSize(inputs, inputs_size, outputs, outputs_size, parameter, 2, 2);
-  if (check_ret != NNACL_OK) {
-    return check_ret;
-  }
-
-  const TensorC *input = inputs[0];
-  const TensorC *values = inputs[2];
-  TensorC *output = outputs[0];
-  TensorC *hits = outputs[1];
-
-  output->data_type_ = values->data_type_;
-  output->format_ = input->format_;
-  hits->shape_size_ = 1;
-  hits->shape_[0] = GetDimensionSize(input, 0);
-  hits->data_type_ = kNumberTypeUInt8;
-  hits->format_ = input->format_;
-
-  if (input->data_ == NULL) {
-    return NNACL_INFER_INVALID;
-  }
-  return NNACL_OK;
-}
-
-REG_INFER(HashtableLookup, PrimType_HashtableLookup, HashtableLoopupInferShape)
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/hashtable_lookup_infer.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/hashtable_lookup_infer.h
deleted file mode 100644
index 7e0c0349725..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/hashtable_lookup_infer.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef MINDSPORE_NNACL_HASHTABLE_LOOKUP_INFER_H
-#define MINDSPORE_NNACL_HASHTABLE_LOOKUP_INFER_H
-
-#include "nnacl/infer/common_infer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int HashtableLoopupInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
-                              OpParameter *parameter);
-
-#ifdef __cplusplus
-}
-#endif
-#endif  // MINDSPORE_NNACL_HASHTABLE_LOOKUP_INFER_H
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/lsh_projection_infer.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/lsh_projection_infer.c
deleted file mode 100644
index 09fd4837663..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/lsh_projection_infer.c
+++ /dev/null
@@ -1,52 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "nnacl/infer/lsh_projection_infer.h"
-#include "nnacl/infer/infer_register.h"
-
-int LshProjectionInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
-                            OpParameter *parameter) {
-  int check_ret = CheckAugmentNullSizeInputTwo(inputs, inputs_size, outputs, outputs_size, parameter, 2, 3, 1);
-  if (check_ret != NNACL_OK) {
-    return check_ret;
-  }
-
-  const TensorC *in_hash = inputs[0];
-  if (in_hash->shape_size_ != 2 || GetDimensionSize(in_hash, 1) > 32) {
-    return NNACL_ERR;
-  }
-  TensorC *out_tensor = outputs[0];
-  out_tensor->data_type_ = kNumberTypeInt32;
-  out_tensor->format_ = Format_NHWC;
-
-  int out_shape[MAX_SHAPE_SIZE] = {0};
-  size_t out_shape_size = 0;
-  LshProjectionParameter *param = (LshProjectionParameter *)parameter;
-  switch (param->lsh_type_) {
-    case LshProjectionType_SPARSE:
-      ShapePush(out_shape, &out_shape_size, GetDimensionSize(in_hash, 0));
-      break;
-    case LshProjectionType_DENSE:
-      ShapePush(out_shape, &out_shape_size, GetDimensionSize(in_hash, 0) * GetDimensionSize(in_hash, 1));
-      break;
-    default:
-      return NNACL_ERR;
-  }
-  SetShapeArray(out_tensor, out_shape, out_shape_size);
-  return NNACL_OK;
-}
-
-REG_INFER(LshProjection, PrimType_LshProjection, LshProjectionInferShape)
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/lsh_projection_infer.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/lsh_projection_infer.h
deleted file mode 100644
index 24017cf7932..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/lsh_projection_infer.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef MINDSPORE_NNACL_LSH_PROJECTION_INFER_H
-#define MINDSPORE_NNACL_LSH_PROJECTION_INFER_H
-
-#include "nnacl/infer/common_infer.h"
-#include "nnacl/lsh_projection_parameter.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int LshProjectionInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
-                            OpParameter *parameter);
-
-#ifdef __cplusplus
-}
-#endif
-#endif  // MINDSPORE_NNACL_LSH_PROJECTION_INFER_H
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/matmul_infer.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/matmul_infer.c
index a252684f19b..f2cbd0870c9 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/matmul_infer.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/matmul_infer.c
@@ -17,9 +17,11 @@
 #include "nnacl/infer/matmul_infer.h"
 #include "nnacl/infer/infer_register.h"
 
+#define MIN_SHAPE_SIZE 2
+
 int CheckMatmulInputShape(int *a_shape, size_t a_shape_size, int *b_shape, size_t b_shape_size,
                           const MatMulParameter *param) {
-  if (a_shape_size < 2 || b_shape_size < 2) {
+  if (a_shape_size < MIN_SHAPE_SIZE || b_shape_size < MIN_SHAPE_SIZE) {
     return NNACL_PARAM_INVALID;
   }
   for (size_t i = 0; i < (a_shape_size - 2) && i < (b_shape_size - 2); ++i) {
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/merge_infer.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/merge_infer.c
deleted file mode 100644
index 502159149f4..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/merge_infer.c
+++ /dev/null
@@ -1,90 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "nnacl/infer/merge_infer.h"
-#include <string.h>
-#include "nnacl/infer/infer_register.h"
-
-bool MergeAbleToInfer(const TensorC *const *inputs, size_t inputs_size) {
-  for (size_t i = 0; i < inputs_size; i++) {
-    if (!inputs[i]->is_ready_) {
-      return false;
-    }
-  }
-  return true;
-}
-
-int MergeInfer(TensorC **inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size) {
-  for (size_t i = 0; i < inputs_size; i++) {
-    outputs[i] = inputs[i];
-    inputs[i] = NULL;
-  }
-  return NNACL_OK;
-}
-
-void MergeDataTypeInfer(TensorC **inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size) {
-  for (size_t i = 0; i < outputs_size; i++) {
-    if (inputs[i]->data_type_ == kObjectTypeTensorType) {
-      TensorListC *input_tensor_list = (TensorListC *)inputs[i];
-      if (input_tensor_list->tensors_data_type_ != kTypeUnknown) {
-        outputs[i] = inputs[i];
-        inputs[i] = NULL;
-      } else {
-        outputs[i] = inputs[i + outputs_size];
-        inputs[i + outputs_size] = NULL;
-      }
-    } else {
-      if (inputs[i]->data_type_ != kTypeUnknown) {
-        outputs[i] = inputs[i];
-        inputs[i] = NULL;
-      } else {
-        outputs[i] = inputs[i + outputs_size];
-        inputs[i + outputs_size] = NULL;
-      }
-    }
-  }
-}
-
-int MergeInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
-                    OpParameter *parameter) {
-  for (size_t i = 0; i < inputs_size; i++) {
-    if (inputs[i] == NULL) {
-      return NNACL_NULL_PTR;
-    }
-  }
-  if (inputs_size != 2 * outputs_size) {
-    return NNACL_ERR;
-  }
-
-  const TensorC *const *left_part_inputs = inputs;
-  size_t left_part_inputs_size = inputs_size / 2;
-
-  const TensorC *const *right_part_inputs = inputs + left_part_inputs_size;
-  size_t right_part_inputs_size = inputs_size / 2;
-
-  if (MergeAbleToInfer(left_part_inputs, left_part_inputs_size)) {
-    return MergeInfer((TensorC **)left_part_inputs, left_part_inputs_size, outputs, outputs_size);
-  }
-
-  if (MergeAbleToInfer(right_part_inputs, right_part_inputs_size)) {
-    return MergeInfer((TensorC **)right_part_inputs, right_part_inputs_size, outputs, outputs_size);
-  }
-
-  MergeDataTypeInfer((struct TensorC **)inputs, inputs_size, outputs, outputs_size);
-  return NNACL_INFER_INVALID;
-}
-
-REG_INFER(Merge, PrimType_Merge, MergeInferShape)
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/merge_infer.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/merge_infer.h
deleted file mode 100644
index 372138d0f18..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/merge_infer.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef MINDSPORE_NNACL_MERGE_INFER_H
-#define MINDSPORE_NNACL_MERGE_INFER_H
-
-#include "nnacl/infer/common_infer.h"
-#include "nnacl/softmax_parameter.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int MergeInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
-                    OpParameter *parameter);
-
-#ifdef __cplusplus
-}
-#endif
-#endif  // MINDSPORE_NNACL_MERGE_INFER_H
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/select_infer.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/select_infer.c
index 9708755ea64..0d441afbbd3 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/select_infer.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/select_infer.c
@@ -34,7 +34,7 @@ int SelectInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC *
     TensorC *output = outputs[i];
     SetDataTypeFormat(output, input);
     if (input->data_type_ == kObjectTypeTensorType) {
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
       TensorListC *input_tensorlist = (TensorListC *)(input);
       TensorListC *output_tensorlist = (TensorListC *)(output);
       output_tensorlist->element_shape_size_ = input_tensorlist->element_shape_size_;
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/skip_gram_infer.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/skip_gram_infer.c
deleted file mode 100644
index 3c533fbf6e2..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/skip_gram_infer.c
+++ /dev/null
@@ -1,37 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "nnacl/infer/skip_gram_infer.h"
-#include "nnacl/infer/infer_register.h"
-
-int SkipGramInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
-                       OpParameter *parameter) {
-  int check_ret = CheckAugmentNullSize(inputs, inputs_size, outputs, outputs_size, parameter, 1, 1);
-  if (check_ret != NNACL_OK) {
-    return check_ret;
-  }
-
-  const TensorC *input = inputs[0];
-  TensorC *output = outputs[0];
-
-  SetDataTypeFormat(output, input);
-  if (input->data_ == NULL) {
-    return NNACL_INFER_INVALID;
-  }
-  return NNACL_OK;
-}
-
-REG_INFER(SkipGram, PrimType_SkipGram, SkipGramInferShape)
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/skip_gram_infer.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/skip_gram_infer.h
deleted file mode 100644
index 7af14f57ff2..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/skip_gram_infer.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef MINDSPORE_NNACL_SKIP_GRAM_INFER_H
-#define MINDSPORE_NNACL_SKIP_GRAM_INFER_H
-
-#include "nnacl/infer/common_infer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int SkipGramInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
-                       OpParameter *parameter);
-
-#ifdef __cplusplus
-}
-#endif
-#endif  // MINDSPORE_NNACL_SKIP_GRAM_INFER_H
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/switch_infer.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/switch_infer.c
deleted file mode 100644
index bff16a8d271..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/switch_infer.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "nnacl/infer/switch_infer.h"
-#include <string.h>
-#include "nnacl/infer/infer_register.h"
-
-int SwitchInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
-                     OpParameter *parameter) {
-  for (size_t i = 0; i < inputs_size; i++) {
-    if (inputs[i] == NULL) {
-      return NNACL_NULL_PTR;
-    }
-  }
-  if (outputs_size < 1 || 2 * (inputs_size - 1) != outputs_size) {
-    return NNACL_ERR;
-  }
-
-  for (size_t i = 0; i < outputs_size / 2; i++) {
-    outputs[i] = (TensorC *)inputs[i + 1];
-    if (inputs[i + 1]->data_type_ == kObjectTypeTensorType) {
-      TensorListC *input = (TensorListC *)inputs[i + 1];
-      TensorListC *mirror_tensorlist = (TensorListC *)malloc(sizeof(TensorListC));  // free in infer_manager
-      if (mirror_tensorlist == NULL) {
-        return NNACL_ERR;  // memory that has been applied will be free in infer_manager
-      }
-      memcpy(mirror_tensorlist, input, sizeof(TensorListC));
-
-      TensorC *tensor_buffer = (TensorC *)malloc(input->element_num_ * sizeof(TensorC));
-      if (tensor_buffer == NULL) {
-        free(mirror_tensorlist);
-        return NNACL_ERR;
-      }
-      memcpy(tensor_buffer, input->tensors_, input->element_num_ * sizeof(TensorC));
-      mirror_tensorlist->tensors_ = tensor_buffer;
-      outputs[i + outputs_size / 2] = (TensorC *)(mirror_tensorlist);
-    } else {
-      TensorC *mirror_tensor = (TensorC *)malloc(sizeof(TensorC));
-      if (mirror_tensor == NULL) {
-        return NNACL_ERR;
-      }
-      memcpy(mirror_tensor, inputs[i + 1], sizeof(TensorC));
-      outputs[i + outputs_size / 2] = mirror_tensor;
-    }
-  }
-  bool infer_flag = InferFlag(inputs, inputs_size);
-  for (size_t i = 0; i < outputs_size / 2; i++) {
-    *((const TensorC **)inputs + i + 1) = NULL;
-  }
-  if (!infer_flag) {
-    return NNACL_INFER_INVALID;
-  }
-  return NNACL_OK;
-}
-
-REG_INFER(Switch, PrimType_Switch, SwitchInferShape)
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/switch_infer.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/switch_infer.h
deleted file mode 100644
index bac22b3a16c..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/switch_infer.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef MINDSPORE_NNACL_SWITCH_INFER_H
-#define MINDSPORE_NNACL_SWITCH_INFER_H
-
-#include "nnacl/infer/common_infer.h"
-#include "nnacl/softmax_parameter.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int SwitchInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
-                     OpParameter *parameter);
-
-#ifdef __cplusplus
-}
-#endif
-#endif  // MINDSPORE_NNACL_SWITCH_INFER_H
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensor_array_infer.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensor_array_infer.c
deleted file mode 100644
index 620612308fc..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensor_array_infer.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "nnacl/infer/tensor_array_infer.h"
-#include "nnacl/infer/infer_register.h"
-#include "nnacl/tensor_array_parameter.h"
-
-int TensorArrayInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
-                          OpParameter *parameter) {
-#ifdef Debug
-  int check_ret = CheckAugmentNullSize(inputs, inputs_size, outputs, outputs_size, parameter, 1, 1);
-  if (check_ret != NNACL_OK) {
-    return check_ret;
-  }
-#endif
-
-  TensorC *output = outputs[0];
-
-  if (!InferFlag(inputs, inputs_size)) {
-    return NNACL_INFER_INVALID;
-  }
-
-  TensorArrayParameter *param = (TensorArrayParameter *)parameter;
-  if (param == NULL) {
-    return NNACL_NULL_PTR;
-  }
-
-  output->data_type_ = param->data_type_;
-  SetShapeArray(output, param->element_shape_, param->element_shape_size_);
-
-  return NNACL_OK;
-}
-
-REG_INFER(TensorArray, PrimType_TensorArray, TensorArrayInferShape)
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensor_array_infer.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensor_array_infer.h
deleted file mode 100644
index 08966118ece..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensor_array_infer.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef MINDSPORE_NNACL_TENSOR_ARRAY_INFER_H_
-#define MINDSPORE_NNACL_TENSOR_ARRAY_INFER_H_
-
-#include "nnacl/infer/common_infer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int TensorArrayInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
-                          OpParameter *parameter);
-
-#ifdef __cplusplus
-}
-#endif
-#endif  // MINDSPORE_NNACL_TENSOR_ARRAY_INFER_H_
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensor_array_read_infer.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensor_array_read_infer.c
deleted file mode 100644
index f7945d99f97..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensor_array_read_infer.c
+++ /dev/null
@@ -1,44 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "nnacl/infer/tensor_array_read_infer.h"
-#include "nnacl/infer/infer_register.h"
-#include "nnacl/tensor_array_parameter.h"
-
-int TensorArrayReadInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
-                              OpParameter *parameter) {
-#ifdef Debug
-  // { prim, handle, index } -> node
-  int check_ret = CheckAugmentNullSize(inputs, inputs_size, outputs, outputs_size, parameter, 3, 1);
-  if (check_ret != NNACL_OK) {
-    return check_ret;
-  }
-#endif
-
-  TensorC *handle = (TensorC *)inputs[0];
-  TensorC *output = outputs[0];
-
-  if (!InferFlag(inputs, inputs_size)) {
-    return NNACL_INFER_INVALID;
-  }
-
-  output->data_type_ = handle->data_type_;
-  SetShapeArray(output, handle->shape_, handle->shape_size_);
-
-  return NNACL_OK;
-}
-
-REG_INFER(TensorArrayRead, PrimType_TensorArrayRead, TensorArrayReadInferShape)
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensor_array_read_infer.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensor_array_read_infer.h
deleted file mode 100644
index 55b69d51852..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensor_array_read_infer.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef MINDSPORE_NNACL_TENSOR_ARRAY_READ_INFER_H_
-#define MINDSPORE_NNACL_TENSOR_ARRAY_READ_INFER_H_
-
-#include "nnacl/infer/common_infer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int TensorArrayReadInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
-                              OpParameter *parameter);
-
-#ifdef __cplusplus
-}
-#endif
-#endif  // MINDSPORE_NNACL_TENSOR_ARRAY_READ_INFER_H_
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensor_array_write_infer.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensor_array_write_infer.c
deleted file mode 100644
index 8bb166bb4be..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensor_array_write_infer.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "nnacl/infer/tensor_array_write_infer.h"
-#include "nnacl/infer/infer_register.h"
-#include "nnacl/tensor_array_parameter.h"
-
-int TensorArrayWriteInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
-                               OpParameter *parameter) {
-#ifdef Debug
-  // { handle, index, value, flow_in } -> empty
-  int check_ret = CheckAugmentNullSize(inputs, inputs_size, outputs, outputs_size, parameter, 4, 1);
-  if (check_ret != NNACL_OK) {
-    return check_ret;
-  }
-#endif
-
-  TensorC *handle = (TensorC *)inputs[0];
-  TensorC *value = (TensorC *)inputs[2];
-
-  if (!InferFlag(inputs, inputs_size)) {
-    return NNACL_INFER_INVALID;
-  }
-
-  TensorArrayParameter *param = (TensorArrayParameter *)parameter;
-  if (param == NULL) {
-    return NNACL_NULL_PTR;
-  }
-
-  if (handle->shape_size_ != value->shape_size_) {
-    return NNACL_INFER_INVALID;
-  }
-
-  for (int i = 0; i < handle->shape_size_; ++i) {
-    if (handle->shape_[i] != value->shape_[i]) {
-      return NNACL_INFER_INVALID;
-    }
-  }
-
-  return NNACL_OK;
-}
-
-REG_INFER(TensorArrayWrite, PrimType_TensorArrayWrite, TensorArrayWriteInferShape)
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensor_array_write_infer.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensor_array_write_infer.h
deleted file mode 100644
index 1cb811ac678..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensor_array_write_infer.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef MINDSPORE_NNACL_TENSOR_ARRAY_WRITE_INFER_H_
-#define MINDSPORE_NNACL_TENSOR_ARRAY_WRITE_INFER_H_
-
-#include "nnacl/infer/common_infer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int TensorArrayWriteInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
-                               OpParameter *parameter);
-
-#ifdef __cplusplus
-}
-#endif
-#endif  // MINDSPORE_NNACL_TENSOR_ARRAY_WRITE_INFER_H_
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_fromtensor_infer.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_fromtensor_infer.c
deleted file mode 100644
index d2cf972edb9..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_fromtensor_infer.c
+++ /dev/null
@@ -1,78 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "nnacl/infer/tensorlist_fromtensor_infer.h"
-#include "nnacl/infer/infer_register.h"
-
-int TensorListFromTensorInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs,
-                                   size_t outputs_size, OpParameter *parameter) {
-  int check_ret = CheckAugmentWithMinSize(inputs, inputs_size, outputs, outputs_size, parameter, 2, 1);
-  if (check_ret != NNACL_OK) {
-    return check_ret;
-  }
-
-  TensorListC *output = (TensorListC *)(outputs[0]);
-  const TensorC *input0 = inputs[0];
-  output->data_type_ = kObjectTypeTensorType;
-  output->format_ = Format_NHWC;
-  output->tensors_data_type_ = input0->data_type_;
-
-  if (!InferFlag(inputs, inputs_size)) {
-    return NNACL_INFER_INVALID;
-  }
-
-  if (input0->shape_size_ < 1) {
-    return NNACL_ERR;
-  }
-  int dim0 = input0->shape_[0];
-  if (dim0 < 0) {
-    return NNACL_ERR;
-  }
-  const TensorC *input1 = inputs[1];
-  if (input1->data_ == NULL) {
-    return NNACL_NULL_PTR;
-  }
-  int *ele_shape_ptr = (int *)(input1->data_);
-
-  vvector tensor_shape;
-  tensor_shape.size_ = dim0;
-  tensor_shape.shape_ = (int **)malloc(tensor_shape.size_ * sizeof(int *));
-  if (tensor_shape.shape_ == NULL) {
-    return NNACL_NULL_PTR;
-  }
-  tensor_shape.shape_size_ = (int *)malloc(tensor_shape.size_ * sizeof(int));
-  if (tensor_shape.shape_size_ == NULL) {
-    free(tensor_shape.shape_);
-    return NNACL_NULL_PTR;
-  }
-
-  for (size_t i = 0; i < dim0; i++) {
-    tensor_shape.shape_[i] = (int *)(input0->shape_ + 1);
-    tensor_shape.shape_size_[i] = input0->shape_size_ - 1;
-  }
-
-  ShapeSet(output->element_shape_, &(output->element_shape_size_), ele_shape_ptr, GetElementNum(input1));
-  output->element_num_ = dim0;
-  int ret = MallocTensorListData(output, input0->data_type_, &tensor_shape);
-  if (ret != NNACL_OK) {
-    return NNACL_ERR;
-  }
-  free(tensor_shape.shape_);
-  free(tensor_shape.shape_size_);
-  return NNACL_OK;
-}
-
-REG_INFER(TensorListFromTensor, PrimType_TensorListFromTensor, TensorListFromTensorInferShape)
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_fromtensor_infer.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_fromtensor_infer.h
deleted file mode 100644
index f9d9a091675..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_fromtensor_infer.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef MINDSPORE_NNACL_TENSORLIST_FROMTENSOR_INFER_H
-#define MINDSPORE_NNACL_TENSORLIST_FROMTENSOR_INFER_H
-
-#include "nnacl/infer/common_infer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int TensorListFromTensorInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs,
-                                   size_t outputs_size, OpParameter *parameter);
-
-#ifdef __cplusplus
-}
-#endif
-#endif  // MINDSPORE_NNACL_TENSORLIST_FROMTENSOR_INFER_H
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_getitem_infer.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_getitem_infer.c
deleted file mode 100644
index d0312871aeb..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_getitem_infer.c
+++ /dev/null
@@ -1,95 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "nnacl/infer/tensorlist_getitem_infer.h"
-#include "nnacl/infer/infer_register.h"
-
-int TensorListGetItemInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs,
-                                size_t outputs_size, OpParameter *parameter) {
-  int check_ret = CheckAugmentWithMinSize(inputs, inputs_size, outputs, outputs_size, parameter, 2, 1);
-  if (check_ret != NNACL_OK) {
-    return check_ret;
-  }
-
-  TensorListC *input0 = (TensorListC *)(inputs[0]);
-  const TensorC *get_index = inputs[1];
-  if (get_index->data_ == NULL) {
-    return NNACL_INFER_INVALID;
-  }
-  if (GetElementNum(get_index) != 1) {
-    return NNACL_ERR;
-  }
-  TensorC *output = outputs[0];
-  if (!InferFlag(inputs, inputs_size) || input0->element_num_ == 0) {
-    return NNACL_INFER_INVALID;
-  }
-  int index = ((int *)(get_index->data_))[0];
-  if (index < 0 || index > (input0->element_num_ - 1)) {
-    return NNACL_ERR;
-  }
-  TensorC *tensor_index = &input0->tensors_[index];
-
-  if (tensor_index->data_type_ != kTypeUnknown) {
-    output->data_type_ = tensor_index->data_type_;
-  } else {
-    output->data_type_ = input0->tensors_data_type_;
-  }
-  output->format_ = input0->tensors_[index].format_;
-
-  if (!InferFlag(inputs, inputs_size)) {
-    return NNACL_INFER_INVALID;
-  }
-
-  if (tensor_index->data_type_ != kTypeUnknown) {
-    ShapeSet(output->shape_, &(output->shape_size_), tensor_index->shape_, tensor_index->shape_size_);
-  } else {
-    const TensorC *input2 = inputs[2];
-    if (input2->data_ == NULL) {
-      return NNACL_NULL_PTR;
-    }
-    int *ele_shape_data = (int *)(input2->data_);
-    int element_shape[MAX_SHAPE_SIZE] = {0};
-    size_t element_shape_size = 0;
-    for (int i = 0; i < GetElementNum(input2); ++i) {
-      ShapePush(element_shape, &element_shape_size, ele_shape_data[i]);
-    }
-    int status =
-      TensorListMergeShape(element_shape, &element_shape_size, input0->element_shape_, input0->element_shape_size_);
-    if (status != NNACL_OK) {
-      return NNACL_ERR;
-    }
-    if (!TensorListIsFullyDefined(element_shape, element_shape_size)) {
-      for (int i = 0; i < input0->element_num_; ++i) {
-        TensorC *input = &input0->tensors_[i];
-        if (input->data_type_ != kTypeUnknown) {
-          status = TensorListMergeShape(element_shape, &element_shape_size, input->shape_, input->shape_size_);
-          if (status != NNACL_OK) {
-            return NNACL_ERR;
-          }
-        }
-      }
-    }
-    if (!TensorListIsFullyDefined(element_shape, element_shape_size)) {  // the pre is the same judge condition
-      return NNACL_ERR;
-    }
-
-    SetShapeArray(output, element_shape, element_shape_size);
-  }
-
-  return NNACL_OK;
-}
-
-REG_INFER(TensorListGetItem, PrimType_TensorListGetItem, TensorListGetItemInferShape)
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_getitem_infer.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_getitem_infer.h
deleted file mode 100644
index 107fdd46e33..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_getitem_infer.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef MINDSPORE_NNACL_TENSORLIST_GETITEM_INFER_H
-#define MINDSPORE_NNACL_TENSORLIST_GETITEM_INFER_H
-
-#include "nnacl/infer/common_infer.h"
-#include "nnacl/tensorlist_parameter.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int TensorListGetItemInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs,
-                                size_t outputs_size, OpParameter *parameter);
-
-#ifdef __cplusplus
-}
-#endif
-#endif  // MINDSPORE_NNACL_TENSORLIST_GETITEM_INFER_H
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_reserve_infer.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_reserve_infer.c
deleted file mode 100644
index 6827db30cb9..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_reserve_infer.c
+++ /dev/null
@@ -1,85 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "nnacl/infer/tensorlist_reserve_infer.h"
-#include "nnacl/infer/infer_register.h"
-#include "nnacl/tensorlist_parameter.h"
-
-int TensorListReserveInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs,
-                                size_t outputs_size, OpParameter *parameter) {
-  int check_ret = CheckAugmentWithMinSize(inputs, inputs_size, outputs, outputs_size, parameter, 2, 1);
-  if (check_ret != NNACL_OK) {
-    return check_ret;
-  }
-
-  TensorListParameter *reserve_param = (TensorListParameter *)parameter;
-  const TensorC *input0 = inputs[0];
-  int ele_shape_type = input0->data_type_;
-  if (ele_shape_type != kNumberTypeInt && ele_shape_type != kNumberTypeInt32) {
-    return NNACL_ERR;
-  }
-
-  TensorListC *output = (TensorListC *)(outputs[0]);
-  output->data_type_ = kObjectTypeTensorType;
-  output->format_ = Format_NHWC;
-  output->tensors_data_type_ = reserve_param->element_dtype_;
-
-  if (input0->data_ == NULL) {
-    return NNACL_INFER_INVALID;
-  }
-  int *ele_shape_ptr = (int *)(input0->data_);
-
-  const TensorC *input1 = inputs[1];
-  int num_ele_type = input1->data_type_;
-  if (num_ele_type != kNumberTypeInt && ele_shape_type != kNumberTypeInt32) {
-    return NNACL_ERR;
-  }
-  if (input1->data_ == NULL) {
-    return NNACL_INFER_INVALID;
-  }
-  if (GetElementNum(input1) != 1) {
-    return NNACL_ERR;
-  }
-  int num_elements = ((int *)(input1->data_))[0];
-  ShapeSet(output->element_shape_, &(output->element_shape_size_), ele_shape_ptr, GetElementNum(input0));
-  output->element_num_ = num_elements;
-
-  vvector tmp_shape;
-  tmp_shape.size_ = num_elements;
-  tmp_shape.shape_ = (int **)malloc(tmp_shape.size_ * sizeof(int *));
-  if (tmp_shape.shape_ == NULL) {
-    return NNACL_NULL_PTR;
-  }
-  tmp_shape.shape_size_ = (int *)malloc(tmp_shape.size_ * sizeof(int));
-  if (tmp_shape.shape_size_ == NULL) {
-    free(tmp_shape.shape_);
-    return NNACL_NULL_PTR;
-  }
-
-  for (size_t i = 0; i < num_elements; i++) {
-    tmp_shape.shape_size_[i] = 0;
-    tmp_shape.shape_[i] = NULL;
-  }
-  int ret = MallocTensorListData(output, kTypeUnknown, &tmp_shape);
-  if (ret != NNACL_OK) {
-    return NNACL_ERR;
-  }
-  free(tmp_shape.shape_size_);
-  free(tmp_shape.shape_);
-  return NNACL_OK;
-}
-
-REG_INFER(TensorListReserve, PrimType_TensorListReserve, TensorListReserveInferShape)
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_reserve_infer.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_reserve_infer.h
deleted file mode 100644
index f1c5ce4cd59..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_reserve_infer.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef MINDSPORE_NNACL_TENSORLIST_RESERVE_INFER_H
-#define MINDSPORE_NNACL_TENSORLIST_RESERVE_INFER_H
-
-#include "nnacl/infer/common_infer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int TensorListReserveInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs,
-                                size_t outputs_size, OpParameter *parameter);
-
-#ifdef __cplusplus
-}
-#endif
-#endif  // MINDSPORE_NNACL_TENSORLIST_RESERVE_INFER_H
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_setitem_infer.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_setitem_infer.c
deleted file mode 100644
index 495f0609523..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_setitem_infer.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "nnacl/infer/tensorlist_setitem_infer.h"
-#include "nnacl/infer/infer_register.h"
-
-int PreJudge(const TensorC *get_index, TensorListC *input0, const TensorC *value_tensor) {
-  if (get_index->data_ == NULL) {
-    return NNACL_INFER_INVALID;
-  }
-
-  if (get_index->data_type_ != kNumberTypeInt && get_index->data_type_ != kNumberTypeInt32) {
-    return NNACL_ERR;
-  }
-  if (GetElementNum(get_index) != 1) {
-    return NNACL_ERR;
-  }
-  if (get_index->data_ == NULL) {
-    return NNACL_NULL_PTR;
-  }
-  return NNACL_OK;
-}
-
-int TensorListSetItemInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs,
-                                size_t outputs_size, OpParameter *parameter) {
-  int check_ret = CheckAugmentWithMinSize(inputs, inputs_size, outputs, outputs_size, parameter, 3, 1);
-  if (check_ret != NNACL_OK) {
-    return check_ret;
-  }
-
-  TensorListC *input0 = (TensorListC *)(inputs[0]);
-  const TensorC *get_index = inputs[1];
-  const TensorC *value_tensor = inputs[2];
-  TensorListC *output0 = (TensorListC *)(outputs[0]);
-  output0->data_type_ = input0->data_type_;
-  output0->format_ = input0->format_;
-  output0->tensors_data_type_ = value_tensor->data_type_;
-
-  if (!InferFlag(inputs, inputs_size)) {
-    return NNACL_INFER_INVALID;
-  }
-
-  int judge_ret = PreJudge(get_index, input0, value_tensor);
-  if (judge_ret != NNACL_OK) {
-    return judge_ret;
-  }
-
-  int index = ((int *)(get_index->data_))[0];
-  output0->max_elements_num_ = input0->max_elements_num_;
-
-  if (input0->element_num_ == 0 && input0->element_shape_size_ == 0 && index == 0) {
-    ShapeSet(input0->element_shape_, &(input0->element_shape_size_), value_tensor->shape_, value_tensor->shape_size_);
-    ShapeSet(output0->element_shape_, &(output0->element_shape_size_), value_tensor->shape_, value_tensor->shape_size_);
-  } else {
-    ShapeSet(output0->element_shape_, &(output0->element_shape_size_), input0->element_shape_,
-             input0->element_shape_size_);
-  }
-
-  vvector out_shape;
-  out_shape.size_ = 0;
-  out_shape.shape_ = (int **)malloc((input0->element_num_ + 1) * sizeof(int *));
-  if (out_shape.shape_ == NULL) {
-    return NNACL_NULL_PTR;
-  }
-  out_shape.shape_size_ = (int *)malloc((input0->element_num_ + 1) * sizeof(int));
-  if (out_shape.shape_size_ == NULL) {
-    free(out_shape.shape_);
-    return NNACL_NULL_PTR;
-  }
-
-  if (index == 0 && input0->element_num_ == 0) {  // uninitialized tensorlist
-    out_shape.shape_[out_shape.size_] = (int *)(value_tensor->shape_);
-    out_shape.shape_size_[out_shape.size_] = value_tensor->shape_size_;
-    out_shape.size_++;
-    output0->element_num_ = 1;
-  } else {
-    output0->element_num_ = input0->element_num_;
-    for (int i = 0; i < input0->element_num_; ++i) {
-      TensorC *src_ptr = &input0->tensors_[i];
-      if (src_ptr->data_type_ != kTypeUnknown) {
-        out_shape.shape_[out_shape.size_] = src_ptr->shape_;
-        out_shape.shape_size_[out_shape.size_] = src_ptr->shape_size_;
-        out_shape.size_++;
-      } else {
-        out_shape.shape_[out_shape.size_] = NULL;
-        out_shape.shape_size_[out_shape.size_] = 0;
-        out_shape.size_++;
-      }
-    }
-  }
-
-  if (input0->tensors_data_type_ == kTypeUnknown) {
-    input0->tensors_data_type_ = value_tensor->data_type_;
-  }
-
-  out_shape.shape_[index] = (int *)(value_tensor->shape_);
-  out_shape.shape_size_[index] = value_tensor->shape_size_;
-  int ret = MallocTensorListData(output0, input0->tensors_data_type_, &out_shape);
-  if (ret != NNACL_OK) {
-    return NNACL_ERR;
-  }
-  free(out_shape.shape_);
-  free(out_shape.shape_size_);
-  return NNACL_OK;
-}
-
-REG_INFER(TensorListSetItem, PrimType_TensorListSetItem, TensorListSetItemInferShape)
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_setitem_infer.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_setitem_infer.h
deleted file mode 100644
index a73773c5d9a..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_setitem_infer.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef MINDSPORE_NNACL_TENSORLIST_SETITEM_INFER_H
-#define MINDSPORE_NNACL_TENSORLIST_SETITEM_INFER_H
-
-#include "nnacl/infer/common_infer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int TensorListSetItemInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs,
-                                size_t outputs_size, OpParameter *parameter);
-
-#ifdef __cplusplus
-}
-#endif
-#endif  // MINDSPORE_NNACL_TENSORLIST_SETITEM_INFER_H
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_stack_infer.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_stack_infer.c
deleted file mode 100644
index 07634be77b6..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_stack_infer.c
+++ /dev/null
@@ -1,94 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "nnacl/infer/tensorlist_stack_infer.h"
-#include "nnacl/infer/infer_register.h"
-
-int TensorListStackInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
-                              OpParameter *parameter) {
-  int check_ret = CheckAugmentWithMinSize(inputs, inputs_size, outputs, outputs_size, parameter, 2, 1);
-  if (check_ret != NNACL_OK) {
-    return check_ret;
-  }
-
-  TensorC *output = outputs[0];
-  if (inputs[0]->data_type_ != kObjectTypeTensorType) {
-    return NNACL_INPUT_TENSOR_ERROR;
-  }
-  TensorListC *input0 = (TensorListC *)(inputs[0]);
-  output->data_type_ = input0->tensors_data_type_;
-  output->format_ = input0->format_;
-  if (!InferFlag(inputs, inputs_size)) {
-    return NNACL_INFER_INVALID;
-  }
-  if (input0->element_num_ == 0) {
-    return NNACL_INFER_INVALID;
-  }
-  const TensorC *ele_shape = inputs[1];  // element shape
-  if (ele_shape->data_ == NULL) {
-    return NNACL_NULL_PTR;
-  }
-  int *ele_shape_ptr = (int *)(ele_shape->data_);
-  int output_shape[MAX_SHAPE_SIZE] = {0};
-  size_t output_shape_size = 0;
-  if (ele_shape_ptr[0] == -1) {
-    if (input0->element_shape_size_ > MAX_SHAPE_SIZE) {
-      return NNACL_ERR;
-    }
-    for (int i = 0; i < input0->element_shape_size_; i++) {
-      ShapePush(output_shape, &output_shape_size, input0->element_shape_[i]);
-    }
-  } else {
-    int ele_shape_num = GetElementNum(ele_shape);
-    if (ele_shape_num > MAX_SHAPE_SIZE) {
-      return NNACL_ERR;
-    }
-    for (int i = 0; i < ele_shape_num; ++i) {
-      ShapePush(output_shape, &output_shape_size, ele_shape_ptr[i]);
-    }
-  }
-
-  int status =
-    TensorListMergeShape(output_shape, &output_shape_size, input0->element_shape_, input0->element_shape_size_);
-  if (status == NNACL_ERR) {
-    return NNACL_ERR;
-  }
-  if (!TensorListIsFullyDefined(output_shape, output_shape_size)) {
-    return NNACL_ERR;
-  }
-  if (!TensorListIsFullyDefined(input0->element_shape_, input0->element_shape_size_)) {
-    for (int i = 0; i < input0->element_num_; ++i) {
-      TensorC *tensor_ele = &input0->tensors_[i];
-      if (tensor_ele->data_type_ != kTypeUnknown) {
-        status = TensorListMergeShape(output_shape, &output_shape_size, tensor_ele->shape_, tensor_ele->shape_size_);
-        if (status == NNACL_ERR) {
-          return NNACL_ERR;
-        }
-      }
-    }
-  }
-  if (output_shape_size >= MAX_SHAPE_SIZE) {
-    return NNACL_ERR;
-  }
-  int ret = ShapeInsert(output_shape, &output_shape_size, 0, input0->element_num_);
-  if (ret != NNACL_OK) {
-    return NNACL_ERR;
-  }
-  SetShapeArray(output, output_shape, output_shape_size);
-  return NNACL_OK;
-}
-
-REG_INFER(TensorListStack, PrimType_TensorListStack, TensorListStackInferShape)
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_stack_infer.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_stack_infer.h
deleted file mode 100644
index ad991d66a62..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tensorlist_stack_infer.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef MINDSPORE_NNACL_TENSORLIST_STACK_INFER_H
-#define MINDSPORE_NNACL_TENSORLIST_STACK_INFER_H
-
-#include "nnacl/infer/common_infer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int TensorListStackInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
-                              OpParameter *parameter);
-
-#ifdef __cplusplus
-}
-#endif
-#endif  // MINDSPORE_NNACL_TENSORLIST_STACK_INFER_H
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tile_infer.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tile_infer.c
index 77609e8b1a2..62f089a56c7 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tile_infer.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/tile_infer.c
@@ -56,13 +56,14 @@ int TileInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **o
   TileParameter *param = (TileParameter *)parameter;
 
   size_t multiples_size = 0;
-  int data_num = GetElementNum(inputs[1]);
-  if (data_num > (int)(input->shape_size_) || input->shape_size_ > MAX_SHAPE_SIZE) {
+  int input1_shape_size = inputs[1]->shape_size_;
+  if (input1_shape_size > (int)(input->shape_size_) || input->shape_size_ > MAX_SHAPE_SIZE) {
     return NNACL_INPUT_TENSOR_ERROR;
   }
-  if (data_num > MAX_TILE_DIM_SIZE) {
+  if (input1_shape_size > MAX_TILE_DIM_SIZE) {
     return NNACL_ERR;
   }
+  int data_num = GetElementNum(inputs[1]);
   multiples_size = (size_t)(data_num);
   if (inputs[1]->data_type_ != kNumberTypeInt && inputs[1]->data_type_ != kNumberTypeInt32) {
     return NNACL_INPUT_TENSOR_ERROR;
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/where_infer.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/where_infer.c
index 4c05f58bced..faea34b373a 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/where_infer.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/where_infer.c
@@ -71,7 +71,6 @@ int WhereInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **
     temp += 1;
     if (temp == input0->shape_size_) {
       SetShapeTensor(output, input);
-      output->data_type_ = input->data_type_;
       return NNACL_OK;
     }
   }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/add_int8.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/add_int8.c
index 73313cf679c..35357fcf237 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/add_int8.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/add_int8.c
@@ -78,7 +78,7 @@ void AddInt8OutputRounding(int32x4_t *out1, int32x4_t *out2, int32x4_t *out3, in
 }
 #endif
 
-void AddInt8(const int8_t *input0, const int8_t *input1, int8_t *output, int size, AddQuantParameter *params) {
+void AddInt8(const int8_t *input0, const int8_t *input1, int8_t *output, int size, const AddQuantParameter *params) {
   int in0_left_shift = (1 << params->left_shift_) * (1 << params->in0_args_.left_shift_);
   int in1_left_shift = (1 << params->left_shift_) * (1 << params->in1_args_.left_shift_);
   int index = 0;
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/add_int8.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/add_int8.h
index e971f1134d6..cdd9e2c753e 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/add_int8.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/add_int8.h
@@ -50,7 +50,7 @@ typedef struct AddQuantParameter {
 extern "C" {
 #endif
 
-void AddInt8(const int8_t *input0, const int8_t *input1, int8_t *output, int size, AddQuantParameter *params);
+void AddInt8(const int8_t *input0, const int8_t *input1, int8_t *output, int size, const AddQuantParameter *params);
 
 void AddOptInt8(const int8_t *ptr_in, const int8_t element_in, int8_t *output, int size,
                 const AddQuantParameter *params, const AddQuantQrgs *ptr_args, const AddQuantQrgs *ele_args);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/conv3x3_int8.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/conv3x3_int8.c
index 6ad20cade63..92994d8c6f1 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/conv3x3_int8.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/conv3x3_int8.c
@@ -867,9 +867,9 @@ void Conv3x3Int8Gemm(int32_t *dst, const int16_t *src, const int16_t *weight, in
 }
 
 // int8 convolution 3x3
-void Conv3x3Int8(int16_t *input_data, int16_t *transed_weight, const int32_t *bias_data, int8_t *output_data,
-                 int16_t *tile_buffer, int16_t *block_unit_buffer, int32_t *tmp_dst_buffer, int8_t *tmp_out,
-                 int task_id, const ConvParameter *conv_param) {
+void Conv3x3Int8(const int16_t *input_data, const int16_t *transed_weight, const int32_t *bias_data,
+                 int8_t *output_data, int16_t *tile_buffer, int16_t *block_unit_buffer, int32_t *tmp_dst_buffer,
+                 int8_t *tmp_out, int task_id, const ConvParameter *conv_param) {
   int ic8 = UP_DIV(conv_param->input_channel_, C8NUM);
   int out_w_block = UP_DIV(conv_param->output_w_, OUPUT_UNIT);
   int out_h_block = UP_DIV(conv_param->output_h_, OUPUT_UNIT);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/conv3x3_int8.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/conv3x3_int8.h
index b857833fa78..c296bd00ff4 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/conv3x3_int8.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/conv3x3_int8.h
@@ -37,9 +37,9 @@ extern "C" {
 void Conv3x3Int8FilterTransform(const int16_t *weight_data, int16_t *trans_weight, int iC8, int output_channel,
                                 int kernel_plane);
 
-void Conv3x3Int8(int16_t *input_data, int16_t *transed_weight, const int32_t *bias_data, int8_t *output_data,
-                 int16_t *tile_buffer, int16_t *block_unit_buffer, int32_t *tmp_dst_buffer, int8_t *tmp_out,
-                 int task_id, const ConvParameter *conv_param);
+void Conv3x3Int8(const int16_t *input_data, const int16_t *transed_weight, const int32_t *bias_data,
+                 int8_t *output_data, int16_t *tile_buffer, int16_t *block_unit_buffer, int32_t *tmp_dst_buffer,
+                 int8_t *tmp_out, int task_id, const ConvParameter *conv_param);
 
 #ifdef __cplusplus
 }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/deconv_int8.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/deconv_int8.c
index cfa160e0ac5..f5407f84ee2 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/deconv_int8.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/deconv_int8.c
@@ -130,9 +130,9 @@ void DeConvPackInputSum(const int8_t *src, int32_t *dst, int32_t filter_zp, size
   return;
 }
 
-int DeConvInt8(const int8_t *input, const int8_t *weight, int32_t *output, int32_t *weight_sum, int32_t *input_sum,
-               size_t act_row, size_t act_col, size_t act_deep, ConvParameter *conv_param,
-               MATMUL_OPT_R4_FUNC matmul_func) {
+int DeConvInt8(const int8_t *input, const int8_t *weight, int32_t *output, const int32_t *weight_sum,
+               const int32_t *input_sum, size_t act_row, size_t act_col, size_t act_deep,
+               const ConvParameter *conv_param, MATMUL_OPT_R4_FUNC matmul_func) {
   if (matmul_func != NULL) {
     matmul_func(input, weight, output, act_row, act_col, act_deep, input_sum, weight_sum);
   } else {
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/deconv_int8.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/deconv_int8.h
index f4a27a700cc..22070c2ad5d 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/deconv_int8.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/deconv_int8.h
@@ -34,9 +34,9 @@ void DeConvPackInputSum(const int8_t *src, int32_t *dst, int32_t filter_zp, size
 void DeConvWeightTransInt8(const int8_t *src, int8_t *dst, int input_channel, int output_channel, int plane,
                            bool support_optimize_);
 
-int DeConvInt8(const int8_t *input, const int8_t *weight, int32_t *output, int32_t *weight_sum, int32_t *input_sum,
-               size_t act_row, size_t act_col, size_t act_deep, ConvParameter *conv_param,
-               MATMUL_OPT_R4_FUNC matmul_func);
+int DeConvInt8(const int8_t *input, const int8_t *weight, int32_t *output, const int32_t *weight_sum,
+               const int32_t *input_sum, size_t act_row, size_t act_col, size_t act_deep,
+               const ConvParameter *conv_param, MATMUL_OPT_R4_FUNC matmul_func);
 int DeConvPostInt8(const int32_t *src, const int32_t *bias, int32_t *tmp, int8_t *out, int output_channel,
                    ConvParameter *conv_param, bool support_optimize);
 #ifdef __cplusplus
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/matmul_int8.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/matmul_int8.c
index ac2c3b04d13..a75433c3e13 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/matmul_int8.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/matmul_int8.c
@@ -17,7 +17,7 @@
 #include "nnacl/int8/matmul_int8.h"
 #include "nnacl/int8/fixed_point.h"
 
-void RowMajor2Row2x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
+void RowMajor2Row2x16MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
   int col16 = UP_ROUND(col, C16NUM);
   for (int r = 0; r < row; r++) {
     int rd2 = r / C2NUM;
@@ -32,7 +32,7 @@ void RowMajor2Row2x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int co
   }
 }
 
-void RowMajor2Col16x2MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
+void RowMajor2Col16x2MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
   int row16 = UP_ROUND(row, C16NUM);
   int stride = sizeof(int8_t) * C16NUM * C2NUM;
   for (int r = 0; r < row; ++r) {
@@ -60,9 +60,9 @@ void RowMajor2Row8x4MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, i
   }
 }
 
-void MatrixPack4x16UnitInt8(int8_t *src, int8_t *dst, int row, int col, int stride) {
+void MatrixPack4x16UnitInt8(const int8_t *src, int8_t *dst, int row, int col, int stride) {
   for (int r = 0; r < row; r++) {
-    int8_t *src_r = src + r * stride;
+    const int8_t *src_r = src + r * stride;
     int8_t *dst_r = dst + r * C16NUM;
     memcpy(dst_r, src_r, col * sizeof(int8_t));
   }
@@ -196,9 +196,9 @@ void MatMulInt8_16x4(const int8_t *a, const int8_t *b, int *dst, int row_4, int
 }
 
 void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16,
-                      size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
-                      int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
-                      bool peroc) {
+                      size_t stride, const int32_t *input_sum, const int32_t *bias, const int32_t *left_shift,
+                      const int32_t *right_shift, const int32_t *multiplier, int32_t output_zp, int32_t mini,
+                      int32_t maxi, bool peroc) {
   /* support per-layer && weight per-channel */
   /*  row4x16-major * row16x2-major => (int8)row-major*/
   for (int r = 0; r < row; r++) {
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/matmul_int8.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/matmul_int8.h
index 87424e20098..0e11ff24d4b 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/matmul_int8.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/matmul_int8.h
@@ -48,12 +48,12 @@ void MatMulInt8_8x8_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row,
 
 /* 4x16 16x2 -> 4x2 */
 /* arm32 conv1x1 */
-void RowMajor2Row2x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
-void RowMajor2Col16x2MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
+void RowMajor2Row2x16MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
+void RowMajor2Col16x2MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
 void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16,
-                      size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
-                      int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
-                      bool peroc);
+                      size_t stride, const int32_t *input_sum, const int32_t *bias, const int32_t *left_shift,
+                      const int32_t *right_shift, const int32_t *multiplier, int32_t output_zp, int32_t mini,
+                      int32_t maxi, bool peroc);
 
 /* 4x4 4x16 -> 4x16 */
 /* optimize conv1x1 */
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/pack_int8.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/pack_int8.c
index ea1b4c45731..993d82bad62 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/pack_int8.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/pack_int8.c
@@ -807,7 +807,7 @@ void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int8_
   }
 }
 
-void PackInputToC8Int8(const int8_t *input_data, int16_t *packed_input, ConvParameter *conv_param) {
+void PackInputToC8Int8(const int8_t *input_data, int16_t *packed_input, const ConvParameter *conv_param) {
   int in_batch = conv_param->input_batch_;
   int in_channel = conv_param->input_channel_;
   int in_h = conv_param->input_h_;
@@ -961,7 +961,7 @@ void PackDepthwiseInt8Input(const int8_t *src, int16_t *dst, const ConvParameter
 }
 
 void PackDepthwiseInt8Weight(const int8_t *origin_weight, int16_t *packed_weight_, int plane, int channel,
-                             ConvQuantArg *quant_qrg) {
+                             const ConvQuantArg *quant_qrg) {
   int weight_zp = quant_qrg->filter_quant_args_[0].zp_;
   for (int c = 0; c < channel; c++) {
     if (quant_qrg->per_channel_ & FILTER_PER_CHANNEL) {
@@ -980,7 +980,7 @@ void PackDepthwiseInt8Weight(const int8_t *origin_weight, int16_t *packed_weight
 }
 
 void PackDeconvDepthwiseInt8Weight(const int8_t *origin_weight, int16_t *packed_weight_, int plane, int channel,
-                                   ConvQuantArg *quant_qrg) {
+                                   const ConvQuantArg *quant_qrg) {
   int weight_zp = quant_qrg->filter_quant_args_[0].zp_;
   for (int c = 0; c < channel; c++) {
     if (quant_qrg->per_channel_ & FILTER_PER_CHANNEL) {
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/pack_int8.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/pack_int8.h
index ecac124e689..0a974b70d98 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/pack_int8.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/pack_int8.h
@@ -39,7 +39,7 @@ void PackNHWCToNCHWInt8(const void *src, void *dst, int batch, int plane, int ch
 void PackInputSum16x4Int8(const int8_t *input, int32_t *input_sum, const int32_t *filter_zp,
                           const ConvParameter *conv_param);
 void PackInputSum16x4PerLayer(const int8_t *src, int32_t *dst, int32_t filter_zp, size_t row4, size_t col16);
-void PackInputToC8Int8(const int8_t *input_data, int16_t *packed_input, ConvParameter *conv_param);
+void PackInputToC8Int8(const int8_t *input_data, int16_t *packed_input, const ConvParameter *conv_param);
 void PackWeightToC8Int8(const int8_t *origin_weight_data, int16_t *packed_weight_data, const ConvParameter *conv_param);
 void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int8_t *matmul_input, int real_cal_num,
                            int block_index, const int32_t *filter_zp, int32_t *input_sum,
@@ -52,9 +52,9 @@ void PreSum4x16Int8Peroc(const int8_t *src, int32_t *sum, const int32_t *zp, siz
 
 void PackDepthwiseInt8Input(const int8_t *src, int16_t *dst, const ConvParameter *conv_param);
 void PackDepthwiseInt8Weight(const int8_t *origin_weight, int16_t *packed_weight_, int plane, int channel,
-                             ConvQuantArg *quant_qrg);
+                             const ConvQuantArg *quant_qrg);
 void PackDeconvDepthwiseInt8Weight(const int8_t *origin_weight, int16_t *packed_weight_, int plane, int channel,
-                                   ConvQuantArg *quant_qrg);
+                                   const ConvQuantArg *quant_qrg);
 
 #ifdef __cplusplus
 }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/power_int8.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/power_int8.c
index 6932500b70e..593391cbf89 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/power_int8.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/power_int8.c
@@ -16,7 +16,7 @@
 
 #include "nnacl/int8/power_int8.h"
 
-int PowerInt8(const int8_t *input, const int8_t *exp_ptr, int8_t *output, int count, PowerParameter *param) {
+int PowerInt8(const int8_t *input, const int8_t *exp_ptr, int8_t *output, int count, const PowerParameter *param) {
   double input_scale = param->quant_arg_.in_args_.scale_;
   int input_zp = param->quant_arg_.in_args_.zp_;
   double output_scale = param->quant_arg_.out_args_.scale_;
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/power_int8.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/power_int8.h
index be86ea03291..e36db54a420 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/power_int8.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/power_int8.h
@@ -24,7 +24,8 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-int PowerInt8(const int8_t *input_ptr, const int8_t *exp_ptr, int8_t *output_ptr, int count, PowerParameter *parameter);
+int PowerInt8(const int8_t *input_ptr, const int8_t *exp_ptr, int8_t *output_ptr, int count,
+              const PowerParameter *parameter);
 #ifdef __cplusplus
 }
 #endif
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/unsqueeze_int8.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/unsqueeze_int8.c
index e5f8c0aab59..1a67043e64a 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/unsqueeze_int8.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/unsqueeze_int8.c
@@ -17,7 +17,7 @@
 #include "nnacl/int8/unsqueeze_int8.h"
 #include "nnacl/unsqueeze_parameter.h"
 
-int Int8Unsqueeze(const int8_t *input_ptr, int8_t *output_ptr, UnSqueezeParameter *para_, size_t data_size,
+int Int8Unsqueeze(const int8_t *input_ptr, int8_t *output_ptr, const UnSqueezeParameter *para_, size_t data_size,
                   int task_id) {
   float output_scale = para_->quant_arg.out_quant_args_.scale_;
   int8_t output_zp = para_->quant_arg.out_quant_args_.zp_;
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/unsqueeze_int8.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/unsqueeze_int8.h
index 0fe040d9522..6943f18c20c 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/unsqueeze_int8.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/unsqueeze_int8.h
@@ -24,7 +24,7 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-int Int8Unsqueeze(const int8_t *input_ptr, int8_t *output_ptr, UnSqueezeParameter *para_, size_t data_size,
+int Int8Unsqueeze(const int8_t *input_ptr, int8_t *output_ptr, const UnSqueezeParameter *para_, size_t data_size,
                   int task_id);
 #ifdef __cplusplus
 }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions.h
index 929fdb8fde0..a7d865d771b 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions.h
@@ -145,6 +145,15 @@ static inline float32x4_t vrecp(float32x4_t v) {
 #endif
 
 #if defined(ENABLE_ARM) || defined(ENABLE_SSE)
+static inline MS_FLOAT32X4 MS_SQRTFX4_F32(MS_FLOAT32X4 src) {
+  MS_FLOAT32X4 dst;
+  dst[0] = sqrtf(src[0]);
+  dst[1] = sqrtf(src[1]);
+  dst[2] = sqrtf(src[2]);
+  dst[3] = sqrtf(src[3]);
+  return dst;
+}
+
 #define LOAD128X8_F32(src, input_ptr, num)               \
   MS_FLOAT32X4 src##1 = MS_LDQ_F32(input_ptr + 0 * num); \
   MS_FLOAT32X4 src##2 = MS_LDQ_F32(input_ptr + 1 * num); \
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions_fp16.h
index 8a8fcb833b8..1bceec9ed5e 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions_fp16.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions_fp16.h
@@ -125,6 +125,28 @@ static inline float16x8_t MS_ERFX8_F16(float16x8_t src) {
   return dst;
 }
 
+static inline float16x8_t MS_SQRTFX8_F16(float16x8_t src) {
+  float16x8_t dst;
+  dst[0] = sqrtf(src[0]);
+  dst[1] = sqrtf(src[1]);
+  dst[2] = sqrtf(src[2]);
+  dst[3] = sqrtf(src[3]);
+  dst[4] = sqrtf(src[4]);
+  dst[5] = sqrtf(src[5]);
+  dst[6] = sqrtf(src[6]);
+  dst[7] = sqrtf(src[7]);
+  return dst;
+}
+
+static inline float16x4_t MS_SQRTFX4_F16(float16x4_t src) {
+  float16x4_t dst;
+  dst[0] = sqrtf(src[0]);
+  dst[1] = sqrtf(src[1]);
+  dst[2] = sqrtf(src[2]);
+  dst[3] = sqrtf(src[3]);
+  return dst;
+}
+
 static inline float32x4_t MS_VMLAL_F16(float16x4_t x, float16x4_t dy, float32x4_t sum) {
   float32x4_t x_fp32 = MS_CVT_F32_F16(x);
   float32x4_t dy_fp32 = MS_CVT_F32_F16(dy);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/matmul_parameter.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/matmul_parameter.h
index d11feea2207..cb8963151e0 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/matmul_parameter.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/matmul_parameter.h
@@ -33,7 +33,7 @@ typedef void (*MATMUL_OPT_DP_FUNC)(const int8_t *a, const int8_t *b, int8_t *dst
                                    int32_t output_zp, int32_t mini, int32_t maxi, size_t per_channel,
                                    const int *filter_zp);
 
-typedef enum OutType { OutType_C8 = 0, OutType_Nhwc = 1, OutType_TileC8 = 2 } OutType;
+typedef enum OutType { OutType_C8 = 0, OutType_Nhwc = 1, OutType_TileC8 = 2, OutType_NC4HW4 = 3 } OutType;
 
 typedef struct MatMulParameter {
   // Primitive parameter
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/op_base.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/op_base.h
index 52241ba13c8..a9a09f96705 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/op_base.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/op_base.h
@@ -76,6 +76,7 @@
 #define MAX_AXIS_SIZE 6
 #define MAX_LEN 256
 #define FLT16_MAX 65504
+#define NNACL_NC4HW4 13
 
 #ifndef ENABLE_HIGH_PERFORMANCE
 #define CHECK_NULL_RETURN(ptr)                       \
@@ -90,7 +91,7 @@
   do {                                                             \
     if ((size1) < (size2)) {                                       \
       MS_LOG(ERROR) << #size1 << " must not less than " << #size2; \
-      return RET_ERROR;                                            \
+      return lite::RET_ERROR;                                      \
     }                                                              \
   } while (0);
 
@@ -108,11 +109,19 @@
     }                                \
   } while (0);
 
+#define NNACL_CHECK_NULL_RETURN_ERR(ptr) \
+  do {                                   \
+    if ((ptr) == NULL) {                 \
+      return NNACL_NULL_PTR;             \
+    }                                    \
+  } while (0);
+
 #else
 #define CHECK_NULL_RETURN(ptr)
 #define CHECK_LESS_RETURN(size1, size2)
 #define NNACL_CHECK_ZERO_RETURN_ERR(val)
 #define NNACL_CHECK_ZERO_RETURN(val)
+#define NNACL_CHECK_NULL_RETURN_ERR(ptr)
 #endif
 
 typedef enum LiteDataType {
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/one_hot_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/one_hot_cpu_kernel.cc
index 20e97786151..c496365d6cc 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/one_hot_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/one_hot_cpu_kernel.cc
@@ -66,7 +66,7 @@ bool OneHotCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, cons
       }
     }
   };
-  CPUKernelUtils::ParallelForAutoSearch(task, elem_num, &parallel_search_info_);
+  ParallelLaunchAutoSearch(task, elem_num, this, &parallel_search_info_);
 
   return true;
 }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/pad_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/pad_cpu_kernel.cc
index cd2783e7264..5188ed3890c 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/pad_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/pad_cpu_kernel.cc
@@ -63,6 +63,8 @@ bool PadCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const s
     LaunchKernel<float16>(inputs, outputs);
   } else if (dtype_ == kNumberTypeFloat32) {
     LaunchKernel<float>(inputs, outputs);
+  } else if (dtype_ == kNumberTypeFloat64) {
+    LaunchKernel<double>(inputs, outputs);
   } else if (dtype_ == kNumberTypeInt32) {
     LaunchKernel<int>(inputs, outputs);
   } else {
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/pyfunc/py_func_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/pyfunc/py_func_cpu_kernel.cc
index 0d81da83a2e..c790b4d926c 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/pyfunc/py_func_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/pyfunc/py_func_cpu_kernel.cc
@@ -135,7 +135,8 @@ void ScalarToRawMemory(const py::object &obj, const TypePtr &type, const Address
 void ArrayToRawMemory(const py::array &array, const AddressPtr &address) {
   if (static_cast<unsigned int>(array.flags()) & pybind11::detail::npy_api::NPY_ARRAY_C_CONTIGUOUS_) {
     const py::buffer_info &buf_info = array.request();
-    CHECK_RET_WITH_EXCEPT(memcpy_s(address->addr, address->size, buf_info.ptr, buf_info.size), EOK, "memcpy failed.");
+    CHECK_RET_WITH_EXCEPT(memcpy_s(address->addr, address->size, buf_info.ptr, buf_info.size * buf_info.itemsize), EOK,
+                          "memcpy failed.");
   } else {
     // Transform numpy array to row major buffer.
     Py_buffer pybuf;
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/random_choice_with_mask_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/random_choice_with_mask_cpu_kernel.cc
index 921f2811cbb..7e17cd7cf6e 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/random_choice_with_mask_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/random_choice_with_mask_cpu_kernel.cc
@@ -151,8 +151,6 @@ bool RandomChoiceWithMaskCPUKernel::Launch(const std::vector<kernel::AddressPtr>
     return false;
   }
 
-  std::mt19937 gen(seedc);
-  std::uniform_int_distribution<> dis(0, non_zero_num - 1);
   int *mask_dim = new (std::nothrow) int[output_length];
   if (mask_dim == nullptr) {
     MS_LOG(EXCEPTION) << "Malloc memory failed!";
@@ -163,8 +161,12 @@ bool RandomChoiceWithMaskCPUKernel::Launch(const std::vector<kernel::AddressPtr>
   (void)memset_s(mask_dim, output_length, 0X00, output_length);
   (void)memset_s(tmp_output, output_length, 0X00, output_length);
 
+  std::vector<int32_t> all_nums(non_zero_num);
+  std::iota(begin(all_nums), end(all_nums), 0);
+  shuffle(all_nums.begin(), all_nums.end(), std::default_random_engine(seedc));
+
   for (int32_t i = 0; i < output_non_zero_length; i++) {
-    int32_t mean = dis(gen);
+    int32_t mean = all_nums[i];
     tmp_output[i] = input_dim[mean];
     mask_dim[i] = 1;
   }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/random_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/random_cpu_kernel.cc
index c34643fd79b..d9622a23f8e 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/random_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/random_cpu_kernel.cc
@@ -27,17 +27,7 @@ void StandardNormal(float *output, std::normal_distribution<float> distribution,
   }
 }
 
-void LaunchStandardNormal(int seed, int seed2, const std::vector<AddressPtr> &outputs) {
-  unsigned int RNG_seed;
-  std::random_device rd;
-  if (seed2 != 0) {
-    RNG_seed = IntToUint(seed2);
-  } else if (seed != 0) {
-    RNG_seed = IntToUint(seed);
-  } else {
-    RNG_seed = rd();
-  }
-
+void LaunchStandardNormal(unsigned int seed, const std::vector<AddressPtr> &outputs) {
   auto output = reinterpret_cast<float *>(outputs[0]->addr);
   // multithreading
   size_t lens = outputs[0]->size / sizeof(float);
@@ -58,7 +48,7 @@ void LaunchStandardNormal(int seed, int seed2, const std::vector<AddressPtr> &ou
   std::normal_distribution<float> distribution;
   while (start < lens) {
     // avoid different threads using the same seed to generate the same random number
-    std::default_random_engine random_generator(++RNG_seed);
+    std::default_random_engine random_generator(++seed);
     size_t end = (start + once_compute_size) > lens ? lens : (start + once_compute_size);
     threads.emplace_back(std::thread(StandardNormal, output, distribution, random_generator, start, end));
     start += once_compute_size;
@@ -68,6 +58,63 @@ void LaunchStandardNormal(int seed, int seed2, const std::vector<AddressPtr> &ou
   }
 }
 
+void LaunchUniformInt(unsigned int seed, const std::vector<AddressPtr> &inputs,
+                      const std::vector<AddressPtr> &outputs) {
+  if (inputs.size() != 3) {
+    MS_LOG(EXCEPTION) << "Expect input number 3, actual got input number " << inputs.size();
+  }
+  if (outputs.size() != 1) {
+    MS_LOG(EXCEPTION) << "Expect output number 1, actual got output number " << outputs.size();
+  }
+  // Init min/max values.
+  int min_val = reinterpret_cast<int *>(inputs[1]->addr)[0];
+  int max_val = reinterpret_cast<int *>(inputs[2]->addr)[0];
+  if (max_val <= min_val) {
+    MS_LOG(EXCEPTION) << "Invalid min/max values: (" << min_val << "/" << max_val << ")";
+  }
+
+  // Init output address.
+  auto output = reinterpret_cast<int *>(outputs[0]->addr);
+  MS_EXCEPTION_IF_NULL(output);
+
+  // Init sample number.
+  size_t num_sample = outputs[0]->size / sizeof(int);
+
+  // Init random int generator.
+  std::mt19937 gen(seed);
+  std::uniform_int_distribution<> distrib(min_val, max_val - 1);
+
+  // Generate random int values.
+  for (size_t i = 0; i < num_sample; ++i) {
+    output[i] = distrib(gen);
+  }
+}
+
+void LaunchUniformReal(unsigned int seed, const std::vector<AddressPtr> &inputs,
+                       const std::vector<AddressPtr> &outputs) {
+  if (inputs.size() != 1) {
+    MS_LOG(EXCEPTION) << "Expect input number 1, actual got input number " << inputs.size();
+  }
+  if (outputs.size() != 1) {
+    MS_LOG(EXCEPTION) << "Expect output number 1, actual got output number " << outputs.size();
+  }
+  // Init output address.
+  auto output = reinterpret_cast<float *>(outputs[0]->addr);
+  MS_EXCEPTION_IF_NULL(output);
+
+  // Init sample number.
+  size_t num_sample = outputs[0]->size / sizeof(int);
+
+  // Init random real generator.
+  std::mt19937 gen(seed);
+  std::uniform_real_distribution<> distrib(0.0, 1.0);
+
+  // Generate random real values.
+  for (size_t i = 0; i < num_sample; ++i) {
+    output[i] = distrib(gen);
+  }
+}
+
 void RandomCPUKernel::InitKernel(const CNodePtr &kernel_node) {
   MS_EXCEPTION_IF_NULL(kernel_node);
   std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
@@ -94,8 +141,22 @@ void RandomCPUKernel::InitKernel(const CNodePtr &kernel_node) {
 
 bool RandomCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                              const std::vector<kernel::AddressPtr> &outputs) {
+  unsigned int RNG_seed = 0;
+  std::random_device rd;
+  if (seed2_ != 0) {
+    RNG_seed = IntToUint(seed2_);
+  } else if (seed_ != 0) {
+    RNG_seed = IntToUint(seed_);
+  } else {
+    RNG_seed = rd();
+  }
+
   if (random_op_type_ == RANDOM_OP_NORMAL) {
-    LaunchStandardNormal(seed_, seed2_, outputs);
+    LaunchStandardNormal(RNG_seed, outputs);
+  } else if (random_op_type_ == RANDOM_OP_UNIFORM_INT) {
+    LaunchUniformInt(RNG_seed, inputs, outputs);
+  } else if (random_op_type_ == RANDOM_OP_UNIFORM_REAL) {
+    LaunchUniformReal(RNG_seed, inputs, outputs);
   } else {
     MS_LOG(EXCEPTION) << "Random operation " << random_op_type_ << " is not supported.";
   }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/random_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/random_cpu_kernel.h
index fa8d9d32089..f27d7c97adb 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/random_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/random_cpu_kernel.h
@@ -45,6 +45,15 @@ class RandomCPUKernel : public CPUKernel {
 
 MS_REG_CPU_KERNEL(StandardNormal, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32),
                   RandomCPUKernel);
+MS_REG_CPU_KERNEL(UniformInt,
+                  KernelAttr()
+                    .AddInputAttr(kNumberTypeInt32)
+                    .AddInputAttr(kNumberTypeInt32)
+                    .AddInputAttr(kNumberTypeInt32)
+                    .AddOutputAttr(kNumberTypeInt32),
+                  RandomCPUKernel)
+MS_REG_CPU_KERNEL(UniformReal, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32),
+                  RandomCPUKernel)
 }  // namespace kernel
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_RANDOM_CPU_KERNEL_H_
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.cc
index bb7675be4a5..8666358700c 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.cc
@@ -139,7 +139,7 @@ bool ReduceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, c
             }
           }
         };
-        CPUKernelUtils::ParallelForAutoSearch(task, output_size, &parallel_search_info_);
+        ParallelLaunchAutoSearch(task, output_size, this, &parallel_search_info_);
         return true;
       }
     }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/resize_bilinear_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/resize_bilinear_cpu_kernel.cc
index f3c6eb22988..72c7bc9639e 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/resize_bilinear_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/resize_bilinear_cpu_kernel.cc
@@ -20,7 +20,6 @@
 
 namespace mindspore {
 namespace kernel {
-
 void ResizeBilinearCPUKernel::InitKernel(const CNodePtr &kernel_node) {
   CheckParam(kernel_node);
   shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/resize_bilinear_grad_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/resize_bilinear_grad_cpu_kernel.cc
index 47ab3dc339a..1156b830d61 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/resize_bilinear_grad_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/resize_bilinear_grad_cpu_kernel.cc
@@ -20,7 +20,6 @@
 
 namespace mindspore {
 namespace kernel {
-
 void ResizeBilinearGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
   CheckParam(kernel_node);
   shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/resize_bilinear_grad_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/resize_bilinear_grad_cpu_kernel.h
index be87ceb50cb..6241f1890c8 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/resize_bilinear_grad_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/resize_bilinear_grad_cpu_kernel.h
@@ -42,8 +42,8 @@ class ResizeBilinearGradCPUKernel : public CPUKernel {
   void CheckParam(const CNodePtr &kernel_node);
   TypeId dtype_{kTypeUnknown};
   bool align_corners_ = false;
-  float height_scale;
-  float width_scale;
+  float height_scale = 1.;
+  float width_scale = 1.;
   std::vector<size_t> size_;
   std::vector<size_t> shape_;
 };
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/resize_nearest_neighbor_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/resize_nearest_neighbor_cpu_kernel.cc
index 286d4556929..c97d586e924 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/resize_nearest_neighbor_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/resize_nearest_neighbor_cpu_kernel.cc
@@ -20,7 +20,6 @@
 
 namespace mindspore {
 namespace kernel {
-
 void ResizeNearestNeighborCPUKernel::InitKernel(const CNodePtr &kernel_node) {
   CheckParam(kernel_node);
   std::vector<size_t> input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/resize_nearest_neighbor_grad_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/resize_nearest_neighbor_grad_cpu_kernel.cc
index f1ab2bf3446..294a8a0854c 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/resize_nearest_neighbor_grad_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/resize_nearest_neighbor_grad_cpu_kernel.cc
@@ -20,7 +20,6 @@
 
 namespace mindspore {
 namespace kernel {
-
 void ResizeNearestNeighborGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
   CheckParam(kernel_node);
   std::vector<size_t> input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/searchsorted_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/searchsorted_cpu_kernel.cc
index 2a987be45d2..d08b161dcc4 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/searchsorted_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/searchsorted_cpu_kernel.cc
@@ -103,7 +103,7 @@ void SearchSortedCPUKernel<S, T>::CheckParam(const std::vector<AddressPtr> &inpu
       }
     }
   };
-  CPUKernelUtils::ParallelFor(task, list_count);
+  CPUKernelUtils::ParallelFor(task, IntToSize(list_count));
 }
 }  // namespace kernel
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/slice_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/slice_cpu_kernel.cc
index 4f7de54837f..156f06495cd 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/slice_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/slice_cpu_kernel.cc
@@ -113,7 +113,7 @@ bool SliceCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const
       auto dst = static_cast<int8_t *>(output_addr) + data_size_ * slice_param_.size_[1] * start;
       SliceSimpleDim2(src, dst, &slice_param_, data_size_, end - start);
     };
-    CPUKernelUtils::ParallelForAutoSearch(task, slice_param_.size_[0], &parallel_search_info_);
+    ParallelLaunchAutoSearch(task, slice_param_.size_[0], this, &parallel_search_info_);
     return true;
   }
   DoSliceNoParallel(input_addr, output_addr, &slice_param_, data_size_);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/split_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/split_cpu_kernel.cc
index 338ff4b405c..79996c8c0d9 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/split_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/split_cpu_kernel.cc
@@ -67,7 +67,7 @@ void SplitCPUKernel<T>::LaunchSplit(T *input, T **output, size_t size) {
     (void)DoSplit(input, reinterpret_cast<void **>(output), &input_shape_[0], SizeToInt(start), SizeToInt(end - start),
                   &param, SizeToInt(sizeof(T)));
   };
-  CPUKernelUtils::ParallelForAutoSearch(task, param.split_count_ * param.num_split_, &parallel_search_info_);
+  ParallelLaunchAutoSearch(task, param.split_count_ * param.num_split_, this, &parallel_search_info_);
   return;
 }
 
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/tensoradd_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/tensoradd_cpu_kernel.cc
index de0902f1895..0a5e63934c8 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/tensoradd_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/tensoradd_cpu_kernel.cc
@@ -42,7 +42,7 @@ bool TensorAddCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs
         output_addr[i] = input_addr_a[i] + input_addr_b[i];
       }
     };
-    CPUKernelUtils::ParallelForAutoSearch(task, output_size, &parallel_search_info_);
+    ParallelLaunchAutoSearch(task, output_size, this, &parallel_search_info_);
   } else {  // Broadcast
     BroadcastIterator base_iter(input_shape_a_, input_shape_b_, output_shape_);
     auto task = [&base_iter, output_addr, input_addr_a, input_addr_b](size_t start, size_t end) {
@@ -53,7 +53,7 @@ bool TensorAddCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs
         iter.GenNextPos();
       }
     };
-    CPUKernelUtils::ParallelForAutoSearch(task, output_size, &parallel_search_info_);
+    ParallelLaunchAutoSearch(task, output_size, this, &parallel_search_info_);
   }
   return true;
 }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/tile_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/tile_cpu_kernel.cc
index cfe83ba839b..7d090861462 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/tile_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/tile_cpu_kernel.cc
@@ -112,7 +112,7 @@ void TileCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const st
 
   if (one_dim_tile_) {
     auto task = [&](size_t start, size_t end) { TileSimple(x_addr, y_addr, start, end, &tile_parameter_); };
-    CPUKernelUtils::ParallelForAutoSearch(task, tile_parameter_.fast_outer_size_, &parallel_search_info_);
+    ParallelLaunchAutoSearch(task, tile_parameter_.fast_outer_size_, this, &parallel_search_info_);
     return;
   }
 
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cu b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cu
index 3bf8624d37f..e10f1e3df25 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cu
@@ -369,6 +369,64 @@ struct SquaredDifferenceFunc {
   }
 };
 
+template <typename T>
+struct TruncateDivFunc {
+  __device__ __forceinline__ T operator()(const T &lhs, const T &rhs) {
+    T res = static_cast<T>(static_cast<double>(lhs) / static_cast<double>(rhs));
+    return res;
+  }
+};
+
+template <>
+struct TruncateDivFunc<half> {
+  __device__ __forceinline__ half operator()(const half &lhs, const half &rhs) {
+    float res = __half2float(lhs) / __half2float(rhs);
+    return __float2half_rn(res);
+  }
+};
+
+template <>
+struct TruncateDivFunc<half2> {
+  __device__ __host__ __forceinline__ half2 operator()(const half2 &lhs, const half2 &rhs) {
+    float2 l = __half22float2(lhs);
+    float2 r = __half22float2(rhs);
+    float2 res;
+    res.x = l.x / r.x;
+    res.y = l.y / r.y;
+    return __float22half2_rn(res);
+  }
+};
+
+template <typename T>
+struct TruncateModFunc {
+  __device__ __forceinline__ T operator()(const T &lhs, const T &rhs) {
+    T res = static_cast<T>(lhs - static_cast<int>(lhs / rhs) * rhs);
+    return res;
+  }
+};
+
+template <>
+struct TruncateModFunc<half> {
+  __device__ __forceinline__ half operator()(const half &lhs, const half &rhs) {
+    float l = __half2float(lhs);
+    float r = __half2float(rhs);
+    float res = l - static_cast<int>(l / r) * r;
+    return __float2half_rn(res);
+  }
+};
+
+template <>
+struct TruncateModFunc<half2> {
+  __device__ __host__ __forceinline__ half2 operator()(const half2 &lhs, const half2 &rhs) {
+    float2 l = __half22float2(lhs);
+    float2 r = __half22float2(rhs);
+    float2 res;
+    res.x = l.x - static_cast<int>(l.x / r.x) * r.x;
+    res.y = l.y - static_cast<int>(l.y / r.y) * r.y;
+    return __float22half2_rn(res);
+  }
+};
+
 template <typename T>
 struct Atan2Func {
   __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { return atan2f(lhs, rhs); }
@@ -494,6 +552,10 @@ void ElewiseArithKernel(const int &nums, enum BroadcastOpType op, const T *x0, c
       return ElewiseArithKernel<T, DivNoNanFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
     case BROADCAST_TYPE_SQUARED_DIFFERENCE:
       return ElewiseArithKernel<T, SquaredDifferenceFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
+    case BROADCAST_TYPE_TRUNCATEDIV:
+      return ElewiseArithKernel<T, TruncateDivFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
+    case BROADCAST_TYPE_TRUNCATEMOD:
+      return ElewiseArithKernel<T, TruncateModFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
     case BROADCAST_TYPE_MOD:
       return ElewiseArithKernel<T, ModFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
     case BROADCAST_TYPE_FLOORMOD:
@@ -779,6 +841,16 @@ void BroadcastArith(const std::vector<size_t> &x0_dims, const std::vector<size_t
         x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
         x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
         y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
+    case BROADCAST_TYPE_TRUNCATEDIV:
+      return BroadcastArithKernel<T, TruncateDivFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
+        x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
+        x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
+        y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
+    case BROADCAST_TYPE_TRUNCATEMOD:
+      return BroadcastArithKernel<T, TruncateModFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
+        x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
+        x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
+        y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
     case BROADCAST_TYPE_MOD:
       return BroadcastArithKernel<T, ModFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
         x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh
index 397961dfd31..87384cdd89c 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh
@@ -46,6 +46,8 @@ enum BroadcastOpType {
   BROADCAST_TYPE_NOT_EQUAL = 20,
   BROADCAST_TYPE_LOGICAL_AND = 21,
   BROADCAST_TYPE_LOGICAL_OR = 22,
+  BROADCAST_TYPE_TRUNCATEDIV = 23,
+  BROADCAST_TYPE_TRUNCATEMOD = 24,
   BROADCAST_TYPE_INVALID = 0xffffffff,
 };
 
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/neighbor_list/neighbor_list_new_impl.cu b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/neighbor_list/neighbor_list_new_impl.cu
deleted file mode 100644
index e9b1cd06133..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/neighbor_list/neighbor_list_new_impl.cu
+++ /dev/null
@@ -1,457 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "backend/kernel_compiler/gpu/cuda_impl/sponge/neighbor_list/neighbor_list_new_impl.cuh"
-#include <stdio.h>
-#include <vector>
-
-__device__ __host__ VECTOR operator-(const VECTOR &vecb) {
-  VECTOR vec;
-  vec.x = -vecb.x;
-  vec.y = -vecb.y;
-  vec.z = -vecb.z;
-  return vec;
-}
-
-__device__ __host__ VECTOR Get_Periodic_Displacement(const VECTOR vec_a, const VECTOR vec_b, const VECTOR box_length) {
-  VECTOR dr;
-  // dr = vec_a - vec_b;
-  dr.x = vec_a.x - vec_b.x;
-  dr.y = vec_a.y - vec_b.y;
-  dr.x = vec_a.z - vec_b.z;
-
-  dr.x = dr.x - floorf(dr.x / box_length.x + 0.5) * box_length.x;
-  dr.y = dr.y - floorf(dr.y / box_length.y + 0.5) * box_length.y;
-  dr.z = dr.z - floorf(dr.z / box_length.z + 0.5) * box_length.z;
-  return dr;
-}
-
-__global__ void Copy_List(const int element_numbers, const int *origin_list, int *list) {
-  int i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (i < element_numbers) {
-    list[i] = origin_list[i];
-  }
-}
-__global__ void Copy_List(const int element_numbers, const float *origin_list, float *list) {
-  int i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (i < element_numbers) {
-    list[i] = origin_list[i];
-  }
-}
-
-__global__ void Crd_To_Uint_Crd(const int atom_numbers, float *scale_factor, const VECTOR *crd,
-                                UNSIGNED_INT_VECTOR *uint_crd) {
-  int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (atom_i < atom_numbers) {
-    INT_VECTOR tempi;
-    VECTOR temp = crd[atom_i];
-
-    temp.x *= scale_factor[0];
-    temp.y *= scale_factor[1];
-    temp.z *= scale_factor[2];
-
-    tempi.int_x = temp.x;
-    tempi.int_y = temp.y;
-    tempi.int_z = temp.z;
-
-    uint_crd[atom_i].uint_x = (tempi.int_x << 2);
-    uint_crd[atom_i].uint_y = (tempi.int_y << 2);
-    uint_crd[atom_i].uint_z = (tempi.int_z << 2);
-  }
-}
-
-__global__ void Vector_Translation(const int vector_numbers, VECTOR *vec_list, const VECTOR translation_vec) {
-  int i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (i < vector_numbers) {
-    vec_list[i].x = vec_list[i].x + translation_vec.x;
-    vec_list[i].y = vec_list[i].y + translation_vec.y;
-    vec_list[i].z = vec_list[i].z + translation_vec.z;
-  }
-}
-__global__ void Vector_Translation(const int vector_numbers, VECTOR *vec_list, const VECTOR *translation_vec) {
-  int i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (i < vector_numbers) {
-    vec_list[i].x = vec_list[i].x + translation_vec[0].x;
-    vec_list[i].y = vec_list[i].y + translation_vec[0].y;
-    vec_list[i].z = vec_list[i].z + translation_vec[0].z;
-  }
-}
-__global__ void Crd_Periodic_Map(const int atom_numbers, VECTOR *crd, const float *box_length) {
-  int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (atom_i < atom_numbers) {
-    if (crd[atom_i].x >= 0) {
-      if (crd[atom_i].x < box_length[0]) {
-      } else {
-        crd[atom_i].x = crd[atom_i].x - box_length[0];
-      }
-    } else {
-      crd[atom_i].x = crd[atom_i].x + box_length[0];
-    }
-
-    if (crd[atom_i].y >= 0) {
-      if (crd[atom_i].y < box_length[1]) {
-      } else {
-        crd[atom_i].y = crd[atom_i].y - box_length[1];
-      }
-    } else {
-      crd[atom_i].y = crd[atom_i].y + box_length[1];
-    }
-    if (crd[atom_i].z >= 0) {
-      if (crd[atom_i].z < box_length[2]) {
-      } else {
-        crd[atom_i].z = crd[atom_i].z - box_length[2];
-      }
-    } else {
-      crd[atom_i].z = crd[atom_i].z + box_length[2];
-    }
-  }
-}
-
-__global__ void Clear_Grid_Bucket(const int grid_numbers, int *atom_numbers_in_grid_bucket, GRID_BUCKET *bucket) {
-  int grid_serial = blockDim.x * blockIdx.x + threadIdx.x;
-  if (grid_serial < grid_numbers) {
-    GRID_BUCKET bucket_i = bucket[grid_serial];
-    for (int i = 0; i < atom_numbers_in_grid_bucket[grid_serial]; i = i + 1) {
-      bucket_i.atom_serial[i] = -1;
-    }
-    atom_numbers_in_grid_bucket[grid_serial] = 0;
-  }
-}
-
-__global__ void Find_Atom_In_Grid_Serial(const int atom_numbers, const float *grid_length_inverse, const VECTOR *crd,
-                                         const int *grid_N, const int gridxy, int *atom_in_grid_serial) {
-  int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (atom_i < atom_numbers) {
-    int Nx = static_cast<float>(crd[atom_i].x) * grid_length_inverse[0];  // crd.x must < boxlength.x
-    int Ny = static_cast<float>(crd[atom_i].y) * grid_length_inverse[1];
-    int Nz = static_cast<float>(crd[atom_i].z) * grid_length_inverse[2];
-    Nx = Nx & ((Nx - grid_N[0]) >> 31);
-    Ny = Ny & ((Ny - grid_N[1]) >> 31);
-    Nz = Nz & ((Nz - grid_N[2]) >> 31);
-    atom_in_grid_serial[atom_i] = Nz * gridxy + Ny * grid_N[0] + Nx;
-  }
-}
-
-__global__ void Put_Atom_In_Grid_Bucket(const int atom_numbers, const int *atom_in_grid_serial, GRID_BUCKET *bucket,
-                                        int *atom_numbers_in_grid_bucket) {
-  int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (atom_i < atom_numbers) {
-    int grid_serial = atom_in_grid_serial[atom_i];
-    GRID_BUCKET bucket_i = bucket[grid_serial];
-    int a = atom_numbers_in_grid_bucket[grid_serial];
-    atomicCAS(&bucket_i.atom_serial[a], -1, atom_i);
-    if (bucket_i.atom_serial[a] != atom_i) {
-      while (true) {
-        a = a + 1;
-        atomicCAS(&bucket_i.atom_serial[a], -1, atom_i);
-        if (bucket_i.atom_serial[a] == atom_i) {
-          atomicAdd(&atom_numbers_in_grid_bucket[grid_serial], 1);
-          break;
-        }
-      }
-    } else {
-      atomicAdd(&atom_numbers_in_grid_bucket[grid_serial], 1);
-    }
-  }
-}
-__global__ void Find_atom_neighbors(const int atom_numbers, const UNSIGNED_INT_VECTOR *uint_crd,
-                                    const float *uint_dr_to_dr_cof, const int *atom_in_grid_serial,
-                                    const GRID_POINTER *gpointer, const GRID_BUCKET *bucket,
-                                    const int *atom_numbers_in_grid_bucket, NEIGHBOR_LIST *nl,
-                                    const float cutoff_skin_square) {
-  int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (atom_i < atom_numbers) {
-    int grid_serial = atom_in_grid_serial[atom_i];
-    int grid_serial2;
-    int atom_numbers_in_nl_lin = 0;
-    int atom_j;
-    int int_x;
-    int int_y;
-    int int_z;
-    UNSIGNED_INT_VECTOR uint_crd_i = uint_crd[atom_i];
-    NEIGHBOR_LIST nl_i = nl[atom_i];
-    GRID_POINTER gpointer_i = gpointer[grid_serial];
-    VECTOR dr;
-    float dr2;
-    for (int grid_cycle = 0; grid_cycle < 125; grid_cycle = grid_cycle + 1) {
-      grid_serial2 = gpointer_i.grid_serial[grid_cycle];
-      GRID_BUCKET bucket_i = bucket[grid_serial2];
-      for (int i = 0; i < atom_numbers_in_grid_bucket[grid_serial2]; i = i + 1) {
-        atom_j = bucket_i.atom_serial[i];
-        if (atom_j > atom_i) {
-          int_x = uint_crd[atom_j].uint_x - uint_crd_i.uint_x;
-          int_y = uint_crd[atom_j].uint_y - uint_crd_i.uint_y;
-          int_z = uint_crd[atom_j].uint_z - uint_crd_i.uint_z;
-          dr.x = uint_dr_to_dr_cof[0] * int_x;
-          dr.y = uint_dr_to_dr_cof[1] * int_y;
-          dr.z = uint_dr_to_dr_cof[2] * int_z;
-          dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
-          if (dr2 < cutoff_skin_square) {
-            nl_i.atom_serial[atom_numbers_in_nl_lin] = atom_j;
-            atom_numbers_in_nl_lin = atom_numbers_in_nl_lin + 1;
-          }
-        }
-      }
-    }  // 124 grid cycle
-    nl[atom_i].atom_numbers = atom_numbers_in_nl_lin;
-  }
-}
-
-__global__ void Is_need_refresh_neighbor_list_cuda(const int atom_numbers, const VECTOR *crd, const VECTOR *old_crd,
-                                                   const float half_skin_square, int *need_refresh_flag) {
-  int i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (i < atom_numbers) {
-    VECTOR r1 = crd[i];
-    VECTOR r2 = old_crd[i];
-    r1.x = r1.x - r2.x;
-    r1.y = r1.y - r2.y;
-    r1.z = r1.z - r2.z;
-    float r1_2 = r1.x * r1.x + r1.y * r1.y + r1.z * r1.z;
-    if (r1_2 > half_skin_square) {
-      atomicExch(&need_refresh_flag[0], 1);
-    }
-  }
-}
-
-__global__ void Is_need_refresh_neighbor_list_cuda(const int atom_numbers, const VECTOR *crd, const VECTOR *old_crd,
-                                                   const VECTOR *box_length, const float half_skin_square,
-                                                   int *need_refresh_flag) {
-  int i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (i < atom_numbers) {
-    VECTOR r1 = crd[i];
-    VECTOR r2 = old_crd[i];
-    r1 = Get_Periodic_Displacement(r1, r2, box_length[0]);
-    float r1_2 = r1.x * r1.x + r1.y * r1.y + r1.z * r1.z;
-    if (r1_2 > half_skin_square) {
-      atomicExch(&need_refresh_flag[0], 1);
-    }
-  }
-}
-
-__global__ void Delete_Excluded_Atoms_Serial_In_Neighbor_List(const int atom_numbers, NEIGHBOR_LIST *nl,
-                                                              const int *excluded_list_start, const int *excluded_list,
-                                                              const int *excluded_atom_numbers) {
-  int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (atom_i < atom_numbers) {
-    int excluded_number = excluded_atom_numbers[atom_i];
-    if (excluded_number > 0) {
-      int list_start = excluded_list_start[atom_i];
-      int atom_min = excluded_list[list_start];
-      int list_end = list_start + excluded_number;
-      int atom_max = excluded_list[list_end - 1];
-      NEIGHBOR_LIST nl_i = nl[atom_i];
-      int atomnumbers_in_nl_lin = nl_i.atom_numbers;
-      int atom_j;
-      int excluded_atom_numbers_lin = list_end - list_start;
-      int excluded_atom_numbers_count = 0;
-      for (int i = 0; i < atomnumbers_in_nl_lin; i = i + 1) {
-        atom_j = nl_i.atom_serial[i];
-        if (atom_j < atom_min || atom_j > atom_max) {
-          continue;
-        } else {
-          for (int j = list_start; j < list_end; j = j + 1) {
-            if (atom_j == excluded_list[j]) {
-              atomnumbers_in_nl_lin = atomnumbers_in_nl_lin - 1;
-              nl_i.atom_serial[i] = nl_i.atom_serial[atomnumbers_in_nl_lin];
-              excluded_atom_numbers_count = excluded_atom_numbers_count + 1;
-              i = i - 1;
-            }
-          }
-          if (excluded_atom_numbers_count < excluded_atom_numbers_lin) {
-          } else {
-            break;
-          }  // break
-        }    // in the range of excluded min to max
-      }      // cycle for neighbors
-      nl[atom_i].atom_numbers = atomnumbers_in_nl_lin;
-    }  // if need excluded
-  }
-}
-
-void Refresh_Neighbor_List(int *refresh_sign, const int thread, const int atom_numbers, VECTOR *crd, VECTOR *old_crd,
-                           UNSIGNED_INT_VECTOR *uint_crd, float *crd_to_uint_crd_cof, float *uint_dr_to_dr_cof,
-                           int *atom_in_grid_serial, const float skin, float *box_length, const GRID_POINTER *gpointer,
-                           GRID_BUCKET *bucket, int *atom_numbers_in_grid_bucket, NEIGHBOR_LIST *d_nl,
-                           int *excluded_list_start, int *excluded_list, int *excluded_numbers,
-                           float cutoff_skin_square, int grid_numbers, float *grid_length_inverse, int *grid_N, int Nxy,
-                           cudaStream_t stream) {
-  std::vector<int> h_refresh_sign(1);
-  cudaMemcpyAsync(h_refresh_sign.data(), refresh_sign, sizeof(int), cudaMemcpyDeviceToHost, stream);
-  if (h_refresh_sign[0] == 1) {
-    Clear_Grid_Bucket<<<ceilf(static_cast<float>(grid_numbers) / thread), thread, 0, stream>>>(
-      grid_numbers, atom_numbers_in_grid_bucket, bucket);
-
-    Crd_Periodic_Map<<<ceilf(static_cast<float>(atom_numbers) / thread), thread, 0, stream>>>(atom_numbers, crd,
-                                                                                              box_length);
-
-    Find_Atom_In_Grid_Serial<<<ceilf(static_cast<float>(atom_numbers) / thread), thread, 0, stream>>>(
-      atom_numbers, grid_length_inverse, crd, grid_N, Nxy, atom_in_grid_serial);
-
-    Copy_List<<<ceilf(static_cast<float>(3. * atom_numbers) / thread), thread, 0, stream>>>(
-      3 * atom_numbers, reinterpret_cast<float *>(crd), reinterpret_cast<float *>(old_crd));
-
-    Put_Atom_In_Grid_Bucket<<<ceilf(static_cast<float>(atom_numbers) / thread), thread, 0, stream>>>(
-      atom_numbers, atom_in_grid_serial, bucket, atom_numbers_in_grid_bucket);
-
-    Crd_To_Uint_Crd<<<ceilf(static_cast<float>(atom_numbers) / thread), thread, 0, stream>>>(
-      atom_numbers, crd_to_uint_crd_cof, crd, uint_crd);
-
-    Find_atom_neighbors<<<ceilf(static_cast<float>(atom_numbers) / thread), thread, 0, stream>>>(
-      atom_numbers, uint_crd, uint_dr_to_dr_cof, atom_in_grid_serial, gpointer, bucket, atom_numbers_in_grid_bucket,
-      d_nl, cutoff_skin_square);
-
-    Delete_Excluded_Atoms_Serial_In_Neighbor_List<<<ceilf(static_cast<float>(atom_numbers) / thread), thread, 0,
-                                                    stream>>>(atom_numbers, d_nl, excluded_list_start, excluded_list,
-                                                              excluded_numbers);
-    h_refresh_sign[0] = 0;
-  }
-}
-
-__global__ void construct_neighbor_list_kernel(int atom_numbers, int max_neighbor_numbers, int *nl_atom_numbers,
-                                               int *nl_atom_serial, NEIGHBOR_LIST *nl) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < atom_numbers; i += gridDim.x * blockDim.x) {
-    nl[i].atom_numbers = nl_atom_numbers[i];
-    nl[i].atom_serial = nl_atom_serial + i * max_neighbor_numbers;
-  }
-}
-
-void Construct_Neighbor_List(int atom_numbers, int max_neighbor_numbers, int *nl_atom_numbers, int *nl_atom_serial,
-                             NEIGHBOR_LIST *nl, cudaStream_t stream) {
-  construct_neighbor_list_kernel<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(
-    atom_numbers, max_neighbor_numbers, nl_atom_numbers, nl_atom_serial, nl);
-}
-
-__global__ void copy_neighbor_list_atom_number(int atom_numbers, int max_neighbor_numbers, NEIGHBOR_LIST *nl,
-                                               int *nl_atom_numbers, int *nl_atom_serial) {
-  int i, j;
-  for (i = blockIdx.x * blockDim.x + threadIdx.x; i < atom_numbers; i += gridDim.x * blockDim.x) {
-    nl_atom_numbers[i] = nl[i].atom_numbers;
-    for (j = blockIdx.y * blockDim.y + threadIdx.y; j < max_neighbor_numbers; j += gridDim.y * blockDim.y) {
-      if (j < nl_atom_numbers[i]) {
-        nl_atom_serial[i * max_neighbor_numbers + j] = nl[i].atom_serial[j];
-      } else {
-        nl_atom_serial[i * max_neighbor_numbers + j] = 0;
-      }
-    }
-  }
-}
-
-__global__ void Reset_List(const int element_numbers, int *list, const int replace_element) {
-  int i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (i < element_numbers) {
-    list[i] = replace_element;
-  }
-}
-
-__global__ void Reset_List(const int element_numbers, float *list, const float replace_element) {
-  int i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (i < element_numbers) {
-    list[i] = replace_element;
-  }
-}
-
-void CopyNeighborListAtomNumber(int atom_numbers, int max_neighbor_numbers, NEIGHBOR_LIST *nl, int *nl_atom_numbers,
-                                int *nl_atom_serial, cudaStream_t stream) {
-  copy_neighbor_list_atom_number<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(
-    atom_numbers, max_neighbor_numbers, nl, nl_atom_numbers, nl_atom_serial);
-}
-
-void Refresh_Neighbor_List_No_Check(int grid_numbers, int atom_numbers, float skin, int Nxy, float cutoff_skin_square,
-                                    int *grid_N, float *box_length, int *atom_numbers_in_grid_bucket,
-                                    float *grid_length_inverse, int *atom_in_grid_serial, GRID_BUCKET *bucket,
-                                    VECTOR *crd, VECTOR *old_crd, float *crd_to_uint_crd_cof,
-                                    UNSIGNED_INT_VECTOR *uint_crd, float *uint_dr_to_dr_cof, GRID_POINTER *gpointer,
-                                    NEIGHBOR_LIST *d_nl, int *excluded_list_start, int *excluded_list,
-                                    int *excluded_numbers, cudaStream_t stream) {
-  Clear_Grid_Bucket<<<ceilf(static_cast<float>(grid_numbers) / 32), 32, 0, stream>>>(
-    grid_numbers, atom_numbers_in_grid_bucket, bucket);
-
-  Crd_Periodic_Map<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(atom_numbers, crd, box_length);
-
-  Find_Atom_In_Grid_Serial<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
-    atom_numbers, grid_length_inverse, crd, grid_N, Nxy, atom_in_grid_serial);
-  cudaMemcpyAsync(old_crd, crd, sizeof(VECTOR) * atom_numbers, cudaMemcpyDeviceToDevice, stream);
-
-  Put_Atom_In_Grid_Bucket<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
-    atom_numbers, atom_in_grid_serial, bucket, atom_numbers_in_grid_bucket);
-
-  Crd_To_Uint_Crd<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(atom_numbers, crd_to_uint_crd_cof,
-                                                                                   crd, uint_crd);
-
-  Find_atom_neighbors<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
-    atom_numbers, uint_crd, uint_dr_to_dr_cof, atom_in_grid_serial, gpointer, bucket, atom_numbers_in_grid_bucket, d_nl,
-    cutoff_skin_square);
-
-  Delete_Excluded_Atoms_Serial_In_Neighbor_List<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
-    atom_numbers, d_nl, excluded_list_start, excluded_list, excluded_numbers);
-}
-
-__global__ void Mul_half(float *src, float *dst) {
-  int index = threadIdx.x;
-  if (index < 3) {
-    dst[index] = src[index] * 0.5;
-  }
-}
-
-__global__ void Mul_quarter(float *src, float *dst) {
-  int index = threadIdx.x;
-  if (index < 3) {
-    dst[index] = src[index] * 0.25;
-  }
-}
-
-int refresh_count = 0;
-
-void Neighbor_List_Update_New(int grid_numbers, int atom_numbers, int *d_refresh_count, int refresh_interval,
-                              int not_first_time, float skin, int Nxy, float cutoff_square,
-                              float cutoff_with_skin_square, int *grid_N, float *box_length,
-                              int *atom_numbers_in_grid_bucket, float *grid_length_inverse, int *atom_in_grid_serial,
-                              GRID_BUCKET *bucket, float *crd, float *old_crd, float *crd_to_uint_crd_cof,
-                              float *half_crd_to_uint_crd_cof, unsigned int *uint_crd, float *uint_dr_to_dr_cof,
-                              GRID_POINTER *gpointer, NEIGHBOR_LIST *d_nl, int *excluded_list_start, int *excluded_list,
-                              int *excluded_numbers, float half_skin_square, int *is_need_refresh_neighbor_list,
-                              int forced_update, int forced_check, cudaStream_t stream) {
-  if (forced_update) {
-    Mul_quarter<<<1, 3, 0, stream>>>(crd_to_uint_crd_cof, half_crd_to_uint_crd_cof);
-    Refresh_Neighbor_List_No_Check(
-      grid_numbers, atom_numbers, skin, Nxy, cutoff_square, grid_N, box_length, atom_numbers_in_grid_bucket,
-      grid_length_inverse, atom_in_grid_serial, bucket, reinterpret_cast<VECTOR *>(crd),
-      reinterpret_cast<VECTOR *>(old_crd), half_crd_to_uint_crd_cof, reinterpret_cast<UNSIGNED_INT_VECTOR *>(uint_crd),
-      uint_dr_to_dr_cof, gpointer, d_nl, excluded_list_start, excluded_list, excluded_numbers, stream);
-
-  } else if (refresh_interval > 0 && !forced_check) {
-    if (refresh_count % refresh_interval == 0) {
-      Mul_quarter<<<1, 3, 0, stream>>>(crd_to_uint_crd_cof, half_crd_to_uint_crd_cof);
-      Refresh_Neighbor_List_No_Check(grid_numbers, atom_numbers, skin, Nxy, cutoff_square, grid_N, box_length,
-                                     atom_numbers_in_grid_bucket, grid_length_inverse, atom_in_grid_serial, bucket,
-                                     reinterpret_cast<VECTOR *>(crd), reinterpret_cast<VECTOR *>(old_crd),
-                                     half_crd_to_uint_crd_cof, reinterpret_cast<UNSIGNED_INT_VECTOR *>(uint_crd),
-                                     uint_dr_to_dr_cof, gpointer, d_nl, excluded_list_start, excluded_list,
-                                     excluded_numbers, stream);
-    }
-    refresh_count += 1;
-  } else {
-    Is_need_refresh_neighbor_list_cuda<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(
-      atom_numbers, reinterpret_cast<VECTOR *>(crd), reinterpret_cast<VECTOR *>(old_crd),
-      reinterpret_cast<VECTOR *>(box_length), half_skin_square, is_need_refresh_neighbor_list);
-    Mul_quarter<<<1, 3, 0, stream>>>(crd_to_uint_crd_cof, half_crd_to_uint_crd_cof);
-    Refresh_Neighbor_List(is_need_refresh_neighbor_list, 32, atom_numbers, reinterpret_cast<VECTOR *>(crd),
-                          reinterpret_cast<VECTOR *>(old_crd), reinterpret_cast<UNSIGNED_INT_VECTOR *>(uint_crd),
-                          half_crd_to_uint_crd_cof, uint_dr_to_dr_cof, atom_in_grid_serial, skin, box_length, gpointer,
-                          bucket, atom_numbers_in_grid_bucket, d_nl, excluded_list_start, excluded_list,
-                          excluded_numbers, cutoff_with_skin_square, grid_numbers, grid_length_inverse, grid_N, Nxy,
-                          stream);
-  }
-}
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/neighbor_list/neighbor_list_new_impl.cuh b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/neighbor_list/neighbor_list_new_impl.cuh
deleted file mode 100644
index 646857a0f68..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/neighbor_list/neighbor_list_new_impl.cuh
+++ /dev/null
@@ -1,62 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NEIGHBOR_LIST_NEW_IMPL_H_
-#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NEIGHBOR_LIST_NEW_IMPL_H_
-
-struct VECTOR {
-  float x;
-  float y;
-  float z;
-};
-struct INT_VECTOR {
-  int int_x;
-  int int_y;
-  int int_z;
-};
-struct UNSIGNED_INT_VECTOR {
-  unsigned int uint_x;
-  unsigned int uint_y;
-  unsigned int uint_z;
-};
-struct NEIGHBOR_LIST {
-  int atom_numbers;
-  int *atom_serial;
-};
-struct GRID_BUCKET {
-  int *atom_serial;
-};
-struct GRID_POINTER {
-  int *grid_serial;
-};
-
-void Construct_Neighbor_List(int grid_numbers, int max_neighbor_numbers, int *nl_atom_numbers, int *nl_atom_serial,
-                             NEIGHBOR_LIST *nl, cudaStream_t stream);
-
-void CopyNeighborListAtomNumber(int atom_numbers, int max_neighbor_numbers, NEIGHBOR_LIST *nl, int *nl_atom_numbers,
-                                int *nl_atom_serial, cudaStream_t stream);
-
-void Neighbor_List_Update_New(int grid_numbers, int atom_numbers, int *d_refresh_count, int refresh_interval,
-                              int not_first_time, float skin, int Nxy, float cutoff_square,
-                              float cutoff_with_skin_square, int *grid_N, float *box_length,
-                              int *atom_numbers_in_grid_bucket, float *grid_length_inverse, int *atom_in_grid_serial,
-                              GRID_BUCKET *bucket, float *crd, float *old_crd, float *crd_to_uint_crd_cof,
-                              float *half_crd_to_uint_crd_cof, unsigned int *uint_crd, float *uint_dr_to_dr_cof,
-                              GRID_POINTER *gpointer, NEIGHBOR_LIST *d_nl, int *excluded_list_start, int *excluded_list,
-                              int *excluded_numbers, float half_skin_square, int *is_need_refresh_neighbor_list,
-                              int forced_update, int forced_check, cudaStream_t stream);
-
-#endif
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_init_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_init_kernel.cc
index ae6153dbe35..551d36aaf52 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_init_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_init_kernel.cc
@@ -17,6 +17,7 @@
 #include "backend/kernel_compiler/gpu/data/dataset_init_kernel.h"
 #include <algorithm>
 #include "backend/kernel_compiler/gpu/data/dataset_utils.h"
+#include "backend/kernel_compiler/common_utils.h"
 #include "runtime/device/gpu/gpu_buffer_mgr.h"
 #include "runtime/device/gpu/gpu_memory_allocator.h"
 #include "utils/convert_utils.h"
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.cc
index 0bcfdbc13a6..db72eafaa67 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.cc
@@ -21,6 +21,7 @@
 #include <vector>
 #include <algorithm>
 #include "backend/kernel_compiler/gpu/data/dataset_utils.h"
+#include "backend/kernel_compiler/common_utils.h"
 #include "profiler/device/gpu/gpu_profiling.h"
 #include "runtime/device/gpu/gpu_buffer_mgr.h"
 #include "runtime/device/gpu/gpu_common.h"
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_utils.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_utils.cc
index f3e1414a834..2534030bff4 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_utils.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_utils.cc
@@ -20,40 +20,6 @@
 
 namespace mindspore {
 namespace kernel {
-size_t UnitSizeInBytes(const mindspore::TypeId &t) {
-  size_t bytes = 0;
-  switch (t) {
-    case kNumberTypeBool:
-    case kNumberTypeInt8:
-    case kNumberTypeUInt8:
-      bytes = sizeof(int8_t);
-      break;
-    case kNumberTypeInt16:
-    case kNumberTypeUInt16:
-    case kNumberTypeFloat16:
-      bytes = sizeof(int16_t);
-      break;
-    case kNumberTypeInt:
-    case kNumberTypeUInt:
-    case kNumberTypeInt32:
-    case kNumberTypeUInt32:
-    case kNumberTypeFloat:
-    case kNumberTypeFloat32:
-      bytes = sizeof(int32_t);
-      break;
-    case kNumberTypeUInt64:
-    case kNumberTypeInt64:
-    case kNumberTypeFloat64:
-      bytes = sizeof(int64_t);
-      break;
-    default:
-      MS_LOG(EXCEPTION) << "Invalid types " << t;
-      break;
-  }
-
-  return bytes;
-}
-
 int ElementNums(const std::vector<int> &shape) {
   if (shape.size() == 0) {
     return 0;
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_utils.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_utils.h
index a892cbfd7e3..4010a7c87c1 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_utils.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_utils.h
@@ -21,7 +21,6 @@
 #include "ir/dtype/type.h"
 namespace mindspore {
 namespace kernel {
-size_t UnitSizeInBytes(const mindspore::TypeId &t);
 int ElementNums(const std::vector<int> &shape);
 void GetShapeAndType(const CNodePtr &kernel_node, std::vector<std::vector<int>> *shapes, std::vector<TypePtr> *types);
 }  // namespace kernel
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/debug/print_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/debug/print_gpu_kernel.h
index 384a562398b..edcb2916868 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/debug/print_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/debug/print_gpu_kernel.h
@@ -27,7 +27,7 @@
 #include "ir/tensor.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
-#include "backend/kernel_compiler/gpu/data/dataset_utils.h"
+#include "backend/kernel_compiler/common_utils.h"
 
 using mindspore::tensor::Tensor;
 
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.cc
index 932e07f5a45..0382749ca52 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.cc
@@ -149,6 +149,14 @@ MS_REG_GPU_KERNEL_ONE(
   NotEqual,
   KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeBool),
   BroadcastOpGpuKernel, float)
+MS_REG_GPU_KERNEL_ONE(
+  TruncateDiv,
+  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
+  BroadcastOpGpuKernel, float)
+MS_REG_GPU_KERNEL_ONE(
+  TruncateMod,
+  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
+  BroadcastOpGpuKernel, float)
 
 // fp16
 MS_REG_GPU_KERNEL_ONE(
@@ -223,6 +231,14 @@ MS_REG_GPU_KERNEL_ONE(
   NotEqual,
   KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeBool),
   BroadcastOpGpuKernel, half)
+MS_REG_GPU_KERNEL_ONE(
+  TruncateDiv,
+  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
+  BroadcastOpGpuKernel, half)
+MS_REG_GPU_KERNEL_ONE(
+  TruncateMod,
+  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
+  BroadcastOpGpuKernel, half)
 
 // int32
 MS_REG_GPU_KERNEL_ONE(
@@ -280,6 +296,14 @@ MS_REG_GPU_KERNEL_ONE(
 MS_REG_GPU_KERNEL_ONE(
   NotEqual, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeBool),
   BroadcastOpGpuKernel, int)
+MS_REG_GPU_KERNEL_ONE(
+  TruncateDiv,
+  KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
+  BroadcastOpGpuKernel, int)
+MS_REG_GPU_KERNEL_ONE(
+  TruncateMod,
+  KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
+  BroadcastOpGpuKernel, int)
 
 // int64
 MS_REG_GPU_KERNEL_ONE(
@@ -351,6 +375,12 @@ MS_REG_GPU_KERNEL_ONE(
 MS_REG_GPU_KERNEL_ONE(
   Mul, KernelAttr().AddInputAttr(kNumberTypeInt8).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
   BroadcastOpGpuKernel, int8_t)
+MS_REG_GPU_KERNEL_ONE(
+  TruncateDiv, KernelAttr().AddInputAttr(kNumberTypeInt8).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
+  BroadcastOpGpuKernel, int8_t)
+MS_REG_GPU_KERNEL_ONE(
+  TruncateMod, KernelAttr().AddInputAttr(kNumberTypeInt8).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
+  BroadcastOpGpuKernel, int8_t)
 
 // uint32
 MS_REG_GPU_KERNEL_ONE(
@@ -380,6 +410,14 @@ MS_REG_GPU_KERNEL_ONE(
 MS_REG_GPU_KERNEL_ONE(
   Mul, KernelAttr().AddInputAttr(kNumberTypeUInt8).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
   BroadcastOpGpuKernel, uint8_t)
+MS_REG_GPU_KERNEL_ONE(
+  TruncateDiv,
+  KernelAttr().AddInputAttr(kNumberTypeUInt8).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
+  BroadcastOpGpuKernel, uint8_t)
+MS_REG_GPU_KERNEL_ONE(
+  TruncateMod,
+  KernelAttr().AddInputAttr(kNumberTypeUInt8).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
+  BroadcastOpGpuKernel, uint8_t)
 
 // int16
 MS_REG_GPU_KERNEL_ONE(
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.h
index cbc93e1e55d..c561fee08ad 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.h
@@ -165,11 +165,22 @@ class BroadcastOpGpuKernel : public GpuKernel {
     }
 
     static const std::map<std::string, BroadcastOpType> kBroadcastArithmetricTypeMap = {
-      {"Maximum", BROADCAST_TYPE_MAXIMUM},   {"Minimum", BROADCAST_TYPE_MINIMUM},   {"Pow", BROADCAST_TYPE_POWER},
-      {"RealDiv", BROADCAST_TYPE_REALDIV},   {"Mul", BROADCAST_TYPE_MUL},           {"Sub", BROADCAST_TYPE_SUB},
-      {"Add", BROADCAST_TYPE_ADD},           {"FloorDiv", BROADCAST_TYPE_FLOORDIV}, {"AbsGrad", BROADCAST_TYPE_ABSGRAD},
-      {"Div", BROADCAST_TYPE_DIV},           {"DivNoNan", BROADCAST_TYPE_DIVNONAN}, {"Mod", BROADCAST_TYPE_MOD},
-      {"FloorMod", BROADCAST_TYPE_FLOORMOD}, {"Atan2", BROADCAST_TYPE_ATAN2},
+      {"Maximum", BROADCAST_TYPE_MAXIMUM},
+      {"Minimum", BROADCAST_TYPE_MINIMUM},
+      {"Pow", BROADCAST_TYPE_POWER},
+      {"RealDiv", BROADCAST_TYPE_REALDIV},
+      {"Mul", BROADCAST_TYPE_MUL},
+      {"Sub", BROADCAST_TYPE_SUB},
+      {"Add", BROADCAST_TYPE_ADD},
+      {"FloorDiv", BROADCAST_TYPE_FLOORDIV},
+      {"AbsGrad", BROADCAST_TYPE_ABSGRAD},
+      {"Div", BROADCAST_TYPE_DIV},
+      {"DivNoNan", BROADCAST_TYPE_DIVNONAN},
+      {"Mod", BROADCAST_TYPE_MOD},
+      {"FloorMod", BROADCAST_TYPE_FLOORMOD},
+      {"Atan2", BROADCAST_TYPE_ATAN2},
+      {"TruncateDiv", BROADCAST_TYPE_TRUNCATEDIV},
+      {"TruncateMod", BROADCAST_TYPE_TRUNCATEMOD},
     };
 
     iter = kBroadcastArithmetricTypeMap.find(kernel_name);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/neighbor_list/neighbor_list_update_new_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/neighbor_list/neighbor_list_update_new_kernel.cc
deleted file mode 100644
index dcd61375060..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/neighbor_list/neighbor_list_update_new_kernel.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "backend/kernel_compiler/gpu/sponge/neighbor_list/neighbor_list_update_new_kernel.h"
-
-namespace mindspore {
-namespace kernel {
-MS_REG_GPU_KERNEL_TWO(NeighborListUpdate,
-                      KernelAttr()
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddInputAttr(kNumberTypeFloat32)
-                        .AddInputAttr(kNumberTypeFloat32)
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddInputAttr(kNumberTypeFloat32)
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddInputAttr(kNumberTypeFloat32)
-                        .AddInputAttr(kNumberTypeFloat32)
-                        .AddInputAttr(kNumberTypeUInt32)
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddInputAttr(kNumberTypeFloat32)
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddOutputAttr(kNumberTypeFloat32),
-                      NeighborListUpdateNewGpuKernel, int, float)
-}  // namespace kernel
-}  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/neighbor_list/neighbor_list_update_new_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/neighbor_list/neighbor_list_update_new_kernel.h
deleted file mode 100644
index 1e2357a090a..00000000000
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/neighbor_list/neighbor_list_update_new_kernel.h
+++ /dev/null
@@ -1,178 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_NEIGHBOR_LIST_UPDATE_NEW_KERNEL_H_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_NEIGHBOR_LIST_UPDATE_NEW_KERNEL_H_
-
-#include <cuda_runtime_api.h>
-#include <vector>
-#include <string>
-#include <map>
-#include "backend/kernel_compiler/gpu/gpu_kernel.h"
-#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
-#include "runtime/device/gpu/cuda_common.h"
-#include "backend/kernel_compiler/gpu/cuda_impl/sponge/neighbor_list/neighbor_list_new_impl.cuh"
-
-namespace mindspore {
-namespace kernel {
-template <typename T, typename T1>
-class NeighborListUpdateNewGpuKernel : public GpuKernel {
- public:
-  NeighborListUpdateNewGpuKernel() : skin(2.0), cutoff(9.0), max_atom_in_grid_numbers(64), max_neighbor_numbers(800) {}
-  ~NeighborListUpdateNewGpuKernel() override = default;
-  bool Init(const CNodePtr &kernel_node) override {
-    grid_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "grid_numbers"));
-    atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
-    refresh_interval = static_cast<int>(GetAttr<int64_t>(kernel_node, "refresh_interval"));
-    not_first_time = static_cast<int>(GetAttr<int64_t>(kernel_node, "not_first_time"));
-    nxy = static_cast<int>(GetAttr<int64_t>(kernel_node, "nxy"));
-    excluded_atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "excluded_atom_numbers"));
-
-    cutoff_square = static_cast<float>(GetAttr<float>(kernel_node, "cutoff_square"));
-    half_skin_square = static_cast<float>(GetAttr<float>(kernel_node, "half_skin_square"));
-    cutoff_with_skin = static_cast<float>(GetAttr<float>(kernel_node, "cutoff_with_skin"));
-    half_cutoff_with_skin = static_cast<float>(GetAttr<float>(kernel_node, "half_cutoff_with_skin"));
-    cutoff_with_skin_square = static_cast<float>(GetAttr<float>(kernel_node, "cutoff_with_skin_square"));
-    forced_update = static_cast<int>(GetAttr<int64_t>(kernel_node, "forced_update"));
-    forced_check = static_cast<int>(GetAttr<int64_t>(kernel_node, "forced_check"));
-    h_bucket.resize(grid_numbers);
-    h_gpointer.resize(grid_numbers);
-    InitSizeLists();
-    return true;
-  }
-
-  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
-  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
-  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
-
-  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspaces,
-              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
-    auto atom_numbers_in_grid_bucket = GetDeviceAddress<int>(inputs, 0);
-    auto bucket = GetDeviceAddress<int>(inputs, 1);
-    auto crd = GetDeviceAddress<float>(inputs, 2);
-    auto box_length = GetDeviceAddress<float>(inputs, 3);
-    auto grid_n = GetDeviceAddress<int>(inputs, 4);
-    auto grid_length_inverse = GetDeviceAddress<float>(inputs, 5);
-    auto atom_in_grid_serial = GetDeviceAddress<int>(inputs, 6);
-    auto old_crd = GetDeviceAddress<float>(inputs, 7);
-    auto crd_to_uint_crd_cof = GetDeviceAddress<float>(inputs, 8);
-    auto uint_crd = GetDeviceAddress<unsigned int>(inputs, 9);
-    auto gpointer = GetDeviceAddress<int>(inputs, 10);
-    auto nl_atom_numbers = GetDeviceAddress<int>(inputs, 11);
-    auto nl_atom_serial = GetDeviceAddress<int>(inputs, 12);
-    auto uint_dr_to_dr_cof = GetDeviceAddress<float>(inputs, 13);
-    auto excluded_list_start = GetDeviceAddress<int>(inputs, 14);
-    auto excluded_list = GetDeviceAddress<int>(inputs, 15);
-    auto excluded_numbers = GetDeviceAddress<int>(inputs, 16);
-    auto need_refresh_flag = GetDeviceAddress<int>(inputs, 17);
-    auto d_refresh_count = GetDeviceAddress<int>(inputs, 18);
-
-    GRID_BUCKET *d_bucket = reinterpret_cast<GRID_BUCKET *>(GetDeviceAddress<int>(workspaces, 0));
-    GRID_POINTER *d_gpointer = reinterpret_cast<GRID_POINTER *>(GetDeviceAddress<int>(workspaces, 1));
-    NEIGHBOR_LIST *nl = GetDeviceAddress<NEIGHBOR_LIST>(workspaces, 2);
-    float *half_crd_to_uint_crd_cof = GetDeviceAddress<float>(workspaces, 3);
-
-    // std::vector<GRID_BUCKET> h_bucket(grid_numbers);
-    for (size_t i = 0; i < h_bucket.size(); i += 1) {
-      h_bucket[i].atom_serial = bucket + i * max_atom_in_grid_numbers;
-    }
-    // std::vector<GRID_POINTER> h_gpointer(grid_numbers);
-    for (size_t i = 0; i < h_gpointer.size(); i += 1) {
-      h_gpointer[i].grid_serial = gpointer + i * 125;
-    }
-
-    cudaMemcpyAsync(d_bucket, h_bucket.data(), sizeof(GRID_BUCKET) * grid_numbers, cudaMemcpyHostToDevice,
-                    reinterpret_cast<cudaStream_t>(stream_ptr));
-    cudaMemcpyAsync(d_gpointer, h_gpointer.data(), sizeof(GRID_POINTER) * grid_numbers, cudaMemcpyHostToDevice,
-                    reinterpret_cast<cudaStream_t>(stream_ptr));
-    Construct_Neighbor_List(atom_numbers, max_neighbor_numbers, nl_atom_numbers, nl_atom_serial, nl,
-                            reinterpret_cast<cudaStream_t>(stream_ptr));
-
-    Neighbor_List_Update_New(grid_numbers, atom_numbers, d_refresh_count, refresh_interval, not_first_time, skin, nxy,
-                             cutoff_square, cutoff_with_skin_square, grid_n, box_length, atom_numbers_in_grid_bucket,
-                             grid_length_inverse, atom_in_grid_serial, d_bucket, crd, old_crd, crd_to_uint_crd_cof,
-                             half_crd_to_uint_crd_cof, uint_crd, uint_dr_to_dr_cof, d_gpointer, nl, excluded_list_start,
-                             excluded_list, excluded_numbers, half_skin_square, need_refresh_flag, forced_update,
-                             forced_check, reinterpret_cast<cudaStream_t>(stream_ptr));
-    CopyNeighborListAtomNumber(atom_numbers, max_neighbor_numbers, nl, nl_atom_numbers, nl_atom_serial,
-                               reinterpret_cast<cudaStream_t>(stream_ptr));
-    return true;
-  }
-
- protected:
-  void InitSizeLists() override {
-    input_size_list_.push_back(sizeof(int) * grid_numbers);
-    input_size_list_.push_back(sizeof(int) * max_atom_in_grid_numbers * grid_numbers);
-    input_size_list_.push_back(sizeof(VECTOR) * atom_numbers);
-    input_size_list_.push_back(sizeof(VECTOR));
-
-    input_size_list_.push_back(sizeof(INT_VECTOR));
-    input_size_list_.push_back(sizeof(VECTOR));
-    input_size_list_.push_back(sizeof(int) * atom_numbers);
-
-    input_size_list_.push_back(sizeof(VECTOR) * atom_numbers);
-    input_size_list_.push_back(sizeof(VECTOR));
-    input_size_list_.push_back(sizeof(UNSIGNED_INT_VECTOR) * atom_numbers);
-
-    input_size_list_.push_back(sizeof(int) * grid_numbers * 125);
-    input_size_list_.push_back(sizeof(int) * atom_numbers);
-    input_size_list_.push_back(sizeof(int) * atom_numbers * max_neighbor_numbers);
-    input_size_list_.push_back(sizeof(VECTOR));
-
-    input_size_list_.push_back(sizeof(int) * atom_numbers);
-    input_size_list_.push_back(sizeof(int) * excluded_atom_numbers);
-    input_size_list_.push_back(sizeof(int) * atom_numbers);
-
-    input_size_list_.push_back(sizeof(int));
-    input_size_list_.push_back(sizeof(int));
-
-    workspace_size_list_.push_back(sizeof(GRID_BUCKET) * grid_numbers);
-    workspace_size_list_.push_back(sizeof(GRID_POINTER) * grid_numbers);
-    workspace_size_list_.push_back(sizeof(NEIGHBOR_LIST) * atom_numbers);
-    workspace_size_list_.push_back(sizeof(float) * 3);
-
-    output_size_list_.push_back(sizeof(float));
-  }
-
- private:
-  float skin;
-  float cutoff;
-  int not_first_time;
-  int atom_numbers;
-  int grid_numbers;
-  int refresh_interval;
-  int nxy;
-  int max_atom_in_grid_numbers;
-  int max_neighbor_numbers;
-  int excluded_atom_numbers;
-  float half_skin_square;
-  float cutoff_square;
-  float cutoff_with_skin;
-  float half_cutoff_with_skin;
-  float cutoff_with_skin_square;
-  int forced_update;
-  int forced_check;
-
-  std::vector<size_t> input_size_list_;
-  std::vector<size_t> output_size_list_;
-  std::vector<size_t> workspace_size_list_;
-  std::vector<GRID_BUCKET> h_bucket;
-  std::vector<GRID_POINTER> h_gpointer;
-};
-}  // namespace kernel
-}  // namespace mindspore
-
-#endif
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/trt/trt_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/trt/trt_kernel.cc
index 1495acab375..f1317d2aafc 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/trt/trt_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/trt/trt_kernel.cc
@@ -17,6 +17,7 @@
 
 #include "backend/kernel_compiler/gpu/data/dataset_utils.h"
 #include "backend/kernel_compiler/gpu/trt/trt_utils.h"
+#include "backend/kernel_compiler/common_utils.h"
 #include "runtime/device/gpu/trt_loader.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/backend/kernel_compiler/hccl/hccl_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/hccl/hccl_kernel.cc
index 77edb57eaba..e71eab88cdf 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/hccl/hccl_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/hccl/hccl_kernel.cc
@@ -197,7 +197,8 @@ const std::vector<size_t> &HcclKernel::GetWorkspaceSizeList() const {
   MS_EXCEPTION_IF_NULL(context_ptr);
   bool is_task_sink = context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
   auto mode = context_ptr->get_param<int>(MS_CTX_EXECUTION_MODE);
-  if (!workspace_size_list_.empty() || hccl_data_type_list_.empty() || (!is_task_sink && mode == kGraphMode)) {
+  if (!workspace_size_list_.empty() || hccl_data_type_list_.empty() || (!is_task_sink && mode == kGraphMode) ||
+      mode == kPynativeMode) {
     return workspace_size_list_;
   }
   workspace_size_list_.emplace_back(
diff --git a/mindspore/ccsrc/backend/kernel_compiler/host/dynamic_broadcast_gradient_args_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/host/dynamic_broadcast_gradient_args_kernel.cc
index b215f43684b..93f0101e122 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/host/dynamic_broadcast_gradient_args_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/host/dynamic_broadcast_gradient_args_kernel.cc
@@ -129,7 +129,9 @@ std::vector<int64_t> GetInputShape(const CNodePtr &cnode, size_t index) {
   std::vector<int64_t> x{SizeToLong(x_num)};
 
   auto x_shape_value = std::make_shared<tensor::Tensor>(type_x, x);
-  x_shape_value->set_device_address(address_x);
+  // The second parameter must be false, otherwise the device address cannot be released and allocated, and the
+  // address size will be wrong in the dynamic shape scenario.
+  x_shape_value->set_device_address(address_x, false);
   x_shape_value->data_sync();
 
   auto x_value = reinterpret_cast<int64_t *>(x_shape_value->data_c());
diff --git a/mindspore/ccsrc/backend/kernel_compiler/kash/kernel_pack.cc b/mindspore/ccsrc/backend/kernel_compiler/kash/kernel_pack.cc
index 88442c17511..0319ec04995 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/kash/kernel_pack.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/kash/kernel_pack.cc
@@ -197,7 +197,7 @@ void KernelPack::ParseKernelJson(const nlohmann::json &js) {
   kernel_json_info_.sha256 = js["sha256"];
 }
 
-bool KernelPack::LoadKernelMeta(const std::string &json_f, const std::string &processor) {
+bool KernelPack::LoadKernelMeta(const std::string &json_f) {
   if (json_f.length() <= strlen(kJsonSuffix)) {
     MS_LOG(ERROR) << "please check json path.";
     return false;
diff --git a/mindspore/ccsrc/backend/kernel_compiler/kernel.h b/mindspore/ccsrc/backend/kernel_compiler/kernel.h
index 2f7b79fb716..ec7acfc0178 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/kernel.h
@@ -128,7 +128,7 @@ class KernelPack {
   KernelPack() : json_(nullptr), kernel_(nullptr) {}
   KernelPack(const KernelPack &) = default;
   KernelJsonInfo kernel_json_info() const;
-  bool LoadKernelMeta(const std::string &json_f, const std::string &processor);
+  bool LoadKernelMeta(const std::string &json_f);
   bool ReadFromJsonFile(const std::string &json_f, const std::string &processor);
   const FlexArray *GetJson() const { return json_; }
   const FlexArray *GetKernel() const { return kernel_; }
@@ -185,11 +185,22 @@ class KernelMod {
   void set_unique_name(const std::string &unique_name) { unique_name_ = unique_name; }
   void set_fullname(const std::string &fullname) { fullname_ = fullname; }
   void set_is_monad(bool is_monad) { is_monad_ = is_monad; }
+  void set_inputs_addr(const std::vector<AddressPtr> &addr) { inputs_addr_ = addr; }
+  void set_workspaces_addr(const std::vector<AddressPtr> &addr) { workspaces_addr_ = addr; }
+  void set_outputs_addr(const std::vector<AddressPtr> &addr) { outputs_addr_ = addr; }
+  const std::vector<AddressPtr> &GetInputsAddr() { return inputs_addr_; }
+  const std::vector<AddressPtr> &GetWorkSpacesAddr() { return workspaces_addr_; }
+  const std::vector<AddressPtr> &GetOutputsAddr() { return outputs_addr_; }
 
  protected:
   std::string unique_name_;
   std::string fullname_;
   bool is_monad_{false};
+
+ private:
+  std::vector<AddressPtr> inputs_addr_;
+  std::vector<AddressPtr> workspaces_addr_;
+  std::vector<AddressPtr> outputs_addr_;
 };
 using KernelModPtr = std::shared_ptr<KernelMod>;
 }  // namespace kernel
diff --git a/mindspore/ccsrc/backend/kernel_compiler/kernel_fusion.cc b/mindspore/ccsrc/backend/kernel_compiler/kernel_fusion.cc
index 379f7ed16a8..7d2602c45be 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/kernel_fusion.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/kernel_fusion.cc
@@ -92,7 +92,7 @@ std::map<int64_t, KernelModPtr> KernelFusion(const std::vector<FusionScopeInfo>
       continue;
     }
     // search cache
-    auto kernel_pack = TbeUtils::SearchCache(json_name, tbe::kProcessorAiCore);
+    auto kernel_pack = TbeUtils::SearchCache(json_name);
     if (kernel_pack != nullptr && ((!offline_tune.empty() && offline_tune != "true") || tune_mode == "NO_TUNE")) {
       auto kernel_mod = build_manger->GenKernelMod(input_size_list, output_size_list, kernel_pack);
       if (kernel_mod != nullptr) {
diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_convert_utils.cc b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_convert_utils.cc
index 26c708c81ce..d65dd78d428 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_convert_utils.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_convert_utils.cc
@@ -27,31 +27,43 @@ namespace mindspore {
 namespace kernel {
 namespace tbe {
 const std::unordered_map<std::string, TypeId> type_str_id_maps = {
-  {"float", TypeId::kNumberTypeFloat32},   {"float16", TypeId::kNumberTypeFloat16},
-  {"float32", TypeId::kNumberTypeFloat32}, {"float64", TypeId::kNumberTypeFloat64},
-  {"int", TypeId::kNumberTypeInt},         {"int8", TypeId::kNumberTypeInt8},
-  {"int16", TypeId::kNumberTypeInt16},     {"int32", TypeId::kNumberTypeInt32},
-  {"int64", TypeId::kNumberTypeInt64},     {"uint", TypeId::kNumberTypeUInt},
-  {"uint8", TypeId::kNumberTypeUInt8},     {"uint16", TypeId::kNumberTypeUInt16},
-  {"uint32", TypeId::kNumberTypeUInt32},   {"uint64", TypeId::kNumberTypeUInt64},
-  {"bool", TypeId::kNumberTypeBool},       {"", TypeId::kMetaTypeNone},
+  {"float", TypeId::kNumberTypeFloat32},
+  {"float16", TypeId::kNumberTypeFloat16},
+  {"float32", TypeId::kNumberTypeFloat32},
+  {"float64", TypeId::kNumberTypeFloat64},
+  {"int", TypeId::kNumberTypeInt},
+  {"int8", TypeId::kNumberTypeInt8},
+  {"int16", TypeId::kNumberTypeInt16},
+  {"int32", TypeId::kNumberTypeInt32},
+  {"int64", TypeId::kNumberTypeInt64},
+  {"uint", TypeId::kNumberTypeUInt},
+  {"uint8", TypeId::kNumberTypeUInt8},
+  {"uint16", TypeId::kNumberTypeUInt16},
+  {"uint32", TypeId::kNumberTypeUInt32},
+  {"uint64", TypeId::kNumberTypeUInt64},
+  {"bool", TypeId::kNumberTypeBool},
+  {"int4", TypeId::kNumberTypeInt4},
+  {"", TypeId::kMetaTypeNone},
 };
 
 const std::map<TypeId, std::string> type_id_str_maps = {
-  {TypeId::kNumberTypeFloat32, "float32"}, {TypeId::kNumberTypeFloat16, "float16"},
-  {TypeId::kNumberTypeFloat, "float"},     {TypeId::kNumberTypeFloat64, "float64"},
-  {TypeId::kNumberTypeInt, "int"},         {TypeId::kNumberTypeInt8, "int8"},
-  {TypeId::kNumberTypeInt16, "int16"},     {TypeId::kNumberTypeInt32, "int32"},
-  {TypeId::kNumberTypeInt64, "int64"},     {TypeId::kNumberTypeUInt, "uint"},
-  {TypeId::kNumberTypeUInt8, "uint8"},     {TypeId::kNumberTypeUInt16, "uint16"},
-  {TypeId::kNumberTypeUInt32, "uint32"},   {TypeId::kNumberTypeUInt64, "uint64"},
-  {TypeId::kNumberTypeBool, "int8"},       {TypeId::kMetaTypeNone, ""},
-};
-
-const std::map<std::string, std::string> type_str_maps = {
-  {"Float32", "float32"}, {"Float16", "float16"}, {"Int8", "int8"},   {"Int16", "int16"},
-  {"UInt16", "uint16"},   {"UInt8", "uint8"},     {"Int32", "int32"}, {"UInt32", "uint32"},
-  {"Int64", "int64"},     {"UInt64", "uint64"},   {"Bool", "int8"},   {"Float64", "float64"},
+  {TypeId::kNumberTypeFloat32, "float32"},
+  {TypeId::kNumberTypeFloat16, "float16"},
+  {TypeId::kNumberTypeFloat, "float"},
+  {TypeId::kNumberTypeFloat64, "float64"},
+  {TypeId::kNumberTypeInt, "int"},
+  {TypeId::kNumberTypeInt8, "int8"},
+  {TypeId::kNumberTypeInt16, "int16"},
+  {TypeId::kNumberTypeInt32, "int32"},
+  {TypeId::kNumberTypeInt64, "int64"},
+  {TypeId::kNumberTypeUInt, "uint"},
+  {TypeId::kNumberTypeUInt8, "uint8"},
+  {TypeId::kNumberTypeUInt16, "uint16"},
+  {TypeId::kNumberTypeUInt32, "uint32"},
+  {TypeId::kNumberTypeUInt64, "uint64"},
+  {TypeId::kNumberTypeBool, "int8"},
+  {TypeId::kNumberTypeInt4, "int4"},
+  {TypeId::kMetaTypeNone, ""},
 };
 
 const std::unordered_map<std::string, size_t> type_nbyte_maps = {
@@ -59,6 +71,7 @@ const std::unordered_map<std::string, size_t> type_nbyte_maps = {
   {"int8", sizeof(int) / 4},      {"int16", sizeof(int) / 2},  {"int32", sizeof(int)},
   {"int64", sizeof(int) * 2},     {"uint8", sizeof(int) / 4},  {"uint16", sizeof(int) / 2},
   {"uint32", sizeof(int)},        {"uint64", sizeof(int) * 2}, {"bool", sizeof(char)},
+  {"int4", sizeof(int) / 4},
 };
 
 TypeId DtypeToTypeId(const std::string &dtypes) {
diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/fusion_tbe_json_creator.cc b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/fusion_tbe_json_creator.cc
index d46f5289ef3..70a3451af51 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/fusion_tbe_json_creator.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/fusion_tbe_json_creator.cc
@@ -42,7 +42,7 @@ bool FusionBuildTbeJsonCreator::GenJson(const FusionScopeInfo &fusion_scope_info
 
   std::vector<nlohmann::json> op_list_json;
   if (!GenOpListJson(fusion_scope_info, &op_list_json)) {
-    MS_LOG(ERROR) << "Generate fusion json failed.";
+    MS_LOG(WARNING) << "Fusion Error: generate fusion json failed.";
     return false;
   }
   (*fusion_json)[kJOpList] = op_list_json;
@@ -62,6 +62,10 @@ bool FusionBuildTbeJsonCreator::GenOpListJson(const FusionScopeInfo &fusion_scop
   MS_EXCEPTION_IF_NULL(fusion_json);
   MS_LOG(DEBUG) << "Start";
   if (!CheckInput(fusion_scope_info)) {
+    for (const auto &cnode : fusion_scope_info.compute_nodes) {
+      MS_LOG(WARNING) << "Fusion Error: check input failed, scope id: " << fusion_scope_info.scope_id
+                      << ", compute node: " << cnode->fullname_with_scope();
+    }
     return false;
   }
 
@@ -71,8 +75,8 @@ bool FusionBuildTbeJsonCreator::GenOpListJson(const FusionScopeInfo &fusion_scop
   for (const auto &compute_node : compute_nodes) {
     nlohmann::json compute_json;
     if (!GenComputeJson(compute_node, &compute_json)) {
-      MS_LOG(ERROR) << "Fusion Error: gen fusion compute json failed. node full name: "
-                    << compute_node->fullname_with_scope();
+      MS_LOG(WARNING) << "Fusion Error: gen fusion compute json failed. node full name: "
+                      << compute_node->fullname_with_scope();
       return false;
     }
     compute_json[kJOriName] = {fusion_scope_info.full_name};
@@ -99,7 +103,7 @@ bool FusionBuildTbeJsonCreator::CheckInput(const FusionScopeInfo &fusion_scope_i
     MS_EXCEPTION_IF_NULL(node);
     auto cnode = node->cast<CNodePtr>();
     if (cnode == nullptr) {
-      MS_LOG(ERROR) << "Fusion error: fusion compute node must be cnode, but the node is " << cnode->DebugString();
+      MS_LOG(WARNING) << "Fusion Error: fusion compute node must be cnode, but the node is " << cnode->DebugString();
       return false;
     }
     for (size_t i = 1; i < cnode->inputs().size(); ++i) {
@@ -111,8 +115,8 @@ bool FusionBuildTbeJsonCreator::CheckInput(const FusionScopeInfo &fusion_scope_i
     }
   }
   if (input_nodes.size() != input_size) {
-    MS_LOG(ERROR) << "Fusion error: fusion scope error, compute node input size:" << input_size
-                  << ", input nodes num:" << input_nodes.size();
+    MS_LOG(WARNING) << "Fusion Error: compute node input size: [ " << input_size
+                    << " ] is not equal to input nodes num: [ " << input_nodes.size() << " ].";
     return false;
   }
   MS_LOG(DEBUG) << "End";
@@ -218,19 +222,19 @@ bool FusionBuildTbeJsonCreator::GenInputsJson(const AnfNodePtr &anf_node, nlohma
 bool FusionBuildTbeJsonCreator::CheckDynamicInput(const CNodePtr &cnode) {
   MS_EXCEPTION_IF_NULL(cnode);
   if (!AnfAlgo::HasNodeAttr(kAttrDynInputSizes, cnode)) {
-    MS_LOG(ERROR) << "Fusion error: cnode [ " << AnfAlgo::GetCNodeName(cnode) << "] has not attr dyn_input_sizes.";
+    MS_LOG(WARNING) << "Fusion Error: cnode [ " << AnfAlgo::GetCNodeName(cnode) << "] has not attr dyn_input_sizes.";
     return false;
   }
   // for dynamic input number, dyn_input_sizes has the info of dynamic input num for each input.
   auto dyn_input_sizes = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(cnode, kAttrDynInputSizes);
   if (dyn_input_sizes.size() != 1) {
-    MS_LOG(ERROR) << "Fusion error: fusion build not support dynamic input size > 1";
+    MS_LOG(WARNING) << "Fusion Error: fusion build not support dynamic input size > 1";
     return false;
   }
   auto real_input_size = cnode->inputs().size() - 1;
   if (LongToSize(dyn_input_sizes[0]) != real_input_size) {
-    MS_LOG(ERROR) << "Fusion error: dyn_input_size" << dyn_input_sizes[0] << "not equal real_input_size"
-                  << real_input_size;
+    MS_LOG(WARNING) << "Fusion Error: dyn_input_size" << dyn_input_sizes[0] << "not equal real_input_size"
+                    << real_input_size;
     return false;
   }
   return true;
@@ -246,9 +250,9 @@ bool FusionBuildTbeJsonCreator::GenOutputsJson(const AnfNodePtr &anf_node, nlohm
   if (AnfAlgo::HasNodeAttr(kAttrOutputUsedNum, cnode)) {
     auto output_used_nums = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(anf_node, kAttrOutputUsedNum);
     if (output_used_nums.size() != output_size) {
-      MS_LOG(ERROR) << "Fusion error: [" << AnfAlgo::GetCNodeName(anf_node) << " ]'s output tenor num(" << output_size
-                    << ")"
-                    << " is not match output used num(" << output_used_nums.size() << ")";
+      MS_LOG(WARNING) << "Fusion Error: [" << AnfAlgo::GetCNodeName(anf_node) << " ]'s output tensor num("
+                      << output_size << ")"
+                      << " is not match output used num(" << output_used_nums.size() << ")";
       return false;
     }
     auto desc_output_index = GetDescOutputIndex(output_used_nums);
@@ -299,7 +303,8 @@ std::vector<size_t> FusionBuildTbeJsonCreator::GetDescOutputIndex(const std::vec
 
 bool FusionBuildTbeJsonCreator::AttrsJsonPostProcessing(const AnfNodePtr &anf_node, const OpInfoPtr &op_info_ptr,
                                                         nlohmann::json *attrs_json) {
-  tbe::TbeAdapter::CastAttrJsonPost(anf_node, attrs_json);
+  // just keep it
+  // tbe::TbeAdapter::CastAttrJsonPost(anf_node, attrs_json);
   return true;
 }
 }  // namespace mindspore::kernel
diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/single_tbe_json_creator.cc b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/single_tbe_json_creator.cc
index 2db7b0cea00..f784eebfcfd 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/single_tbe_json_creator.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/single_tbe_json_creator.cc
@@ -154,8 +154,8 @@ void SingleTbeJsonCreator::GenInputDescJson(const AnfNodePtr &anf_node, size_t r
   MS_EXCEPTION_IF_NULL(anf_node);
   MS_EXCEPTION_IF_NULL(input_desc);
   GenDesJsonCommon(input_desc);
-  auto shape = AnfAlgo::GetInputDeviceShape(anf_node, real_input_index);
-  auto ori_shape = AnfAlgo::GetPrevNodeOutputInferShape(anf_node, real_input_index);
+  auto shape = TbeJsonUtils::GetInputDeviceShapeForTbeBuild(anf_node, real_input_index);
+  auto ori_shape = TbeJsonUtils::GetInputOriShapeForTbeBuild(anf_node, real_input_index);
   if (shape.empty()) {
     shape.emplace_back(1);
   }
@@ -332,7 +332,7 @@ void SelectTbeJsonCreator::GenDescJson(const AnfNodePtr &anf_node, size_t node_o
   GenDesJsonCommon(output_desc);
   std::vector<int64_t> shape;
   std::vector<int64_t> ori_shape;
-  AnfAlgo::GetRealDynamicShape(AnfAlgo::GetOutputInferShape(anf_node, node_out_idx), NOT_NULL(&ori_shape));
+  ori_shape = TbeJsonUtils::GetOutputOriShapeForTbeBuild(anf_node, node_out_idx);
   if (ori_shape.empty()) {
     ori_shape.emplace_back(1);
   }
@@ -354,7 +354,7 @@ void SelectTbeJsonCreator::GenInputDescJson(const AnfNodePtr &anf_node, size_t r
                                             nlohmann::json *input_desc) {
   MS_EXCEPTION_IF_NULL(anf_node);
   GenDesJsonCommon(input_desc);
-  auto shape = AnfAlgo::GetPrevNodeOutputInferShape(anf_node, real_input_index);
+  auto shape = TbeJsonUtils::GetInputOriShapeForTbeBuild(anf_node, real_input_index);
   if (shape.empty()) {
     shape.emplace_back(1);
   }
@@ -386,7 +386,7 @@ void CheckTbeJsonCreator::GenDescJson(const AnfNodePtr &anf_node, size_t node_ou
   GenDesJsonCommon(output_desc);
   std::vector<int64_t> shape;
   std::vector<int64_t> ori_shape;
-  AnfAlgo::GetRealDynamicShape(AnfAlgo::GetOutputInferShape(anf_node, node_out_idx), NOT_NULL(&ori_shape));
+  ori_shape = TbeJsonUtils::GetOutputOriShapeForTbeBuild(anf_node, node_out_idx);
   if (ori_shape.empty()) {
     ori_shape.emplace_back(1);
   }
@@ -408,7 +408,7 @@ void CheckTbeJsonCreator::GenInputDescJson(const AnfNodePtr &anf_node, size_t re
                                            nlohmann::json *input_desc) {
   MS_EXCEPTION_IF_NULL(anf_node);
   GenDesJsonCommon(input_desc);
-  auto shape = AnfAlgo::GetPrevNodeOutputInferShape(anf_node, real_input_index);
+  auto shape = TbeJsonUtils::GetInputOriShapeForTbeBuild(anf_node, real_input_index);
   if (shape.empty()) {
     shape.emplace_back(1);
   }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/tbe_json_creator.cc b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/tbe_json_creator.cc
index f194b8f2a81..69cc855f2cd 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/tbe_json_creator.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/tbe_json_creator.cc
@@ -346,8 +346,8 @@ void TbeJsonCreator::GenDescJson(const AnfNodePtr &anf_node, size_t node_out_idx
   GenDesJsonCommon(output_desc);
   std::vector<int64_t> shape;
   std::vector<int64_t> ori_shape;
-  AnfAlgo::GetRealDynamicShape(AnfAlgo::GetOutputDeviceShape(anf_node, node_out_idx), NOT_NULL(&shape));
-  AnfAlgo::GetRealDynamicShape(AnfAlgo::GetOutputInferShape(anf_node, node_out_idx), NOT_NULL(&ori_shape));
+  shape = TbeJsonUtils::GetOutputDeviceShapeForTbeBuild(anf_node, node_out_idx);
+  ori_shape = TbeJsonUtils::GetOutputOriShapeForTbeBuild(anf_node, node_out_idx);
   if (shape.empty()) {
     shape.emplace_back(1);
   }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/tbe_json_utils.cc b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/tbe_json_utils.cc
index c0080a0a929..aaefc09b42e 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/tbe_json_utils.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/tbe_json_utils.cc
@@ -80,4 +80,48 @@ bool TbeJsonUtils::IsNeedChangeDefaultFormat(const AnfNodePtr &anf_node) {
          AnfAlgo::GetNodeAttr<std::string>(anf_node, kAttrFormat) == kOpFormat_NCDHW;
 }
 
+std::vector<int64_t> TbeJsonUtils::GetInputOriShapeForTbeBuild(const AnfNodePtr &anf_node, size_t real_idx) {
+  MS_EXCEPTION_IF_NULL(anf_node);
+  session::KernelWithIndex kernel_with_index = AnfAlgo::GetPrevNodeOutput(anf_node, real_idx);
+  return GetOutputOriShapeForTbeBuild(kernel_with_index.first, kernel_with_index.second);
+}
+
+std::vector<int64_t> TbeJsonUtils::GetInputDeviceShapeForTbeBuild(const AnfNodePtr &anf_node, size_t real_idx) {
+  MS_EXCEPTION_IF_NULL(anf_node);
+  std::vector<int64_t> shape;
+  session::KernelWithIndex kernel_with_index = AnfAlgo::GetPrevNodeOutput(anf_node, real_idx);
+  auto format = AnfAlgo::GetInputFormat(anf_node, real_idx);
+  shape = AnfAlgo::GetOutputDeviceShapeForTbeBuild(kernel_with_index.first, kernel_with_index.second, format);
+  if (shape.empty()) {
+    shape.emplace_back(1);
+  }
+  return shape;
+}
+
+std::vector<int64_t> TbeJsonUtils::GetOutputOriShapeForTbeBuild(const AnfNodePtr &anf_node, size_t real_idx) {
+  MS_EXCEPTION_IF_NULL(anf_node);
+  std::vector<int64_t> shape;
+  auto out_shape = AnfAlgo::GetOutputDetailShape(anf_node, real_idx);
+  MS_EXCEPTION_IF_NULL(out_shape);
+  if (out_shape->isa<abstract::Shape>()) {
+    auto shape_ptr = out_shape->cast<abstract::ShapePtr>();
+    MS_EXCEPTION_IF_NULL(shape_ptr);
+    shape = shape_ptr->shape();
+  }
+  if (shape.empty()) {
+    shape.emplace_back(1);
+  }
+  return shape;
+}
+
+std::vector<int64_t> TbeJsonUtils::GetOutputDeviceShapeForTbeBuild(const AnfNodePtr &anf_node, size_t real_idx) {
+  MS_EXCEPTION_IF_NULL(anf_node);
+  std::vector<int64_t> shape;
+  auto format = AnfAlgo::GetOutputFormat(anf_node, real_idx);
+  shape = AnfAlgo::GetOutputDeviceShapeForTbeBuild(anf_node, real_idx, format);
+  if (shape.empty()) {
+    shape.emplace_back(1);
+  }
+  return shape;
+}
 }  // namespace mindspore::kernel
diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/tbe_json_utils.h b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/tbe_json_utils.h
index 645c21a5aa9..6e49f1d135f 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/tbe_json_utils.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/tbe_json_utils.h
@@ -108,6 +108,11 @@ class TbeJsonUtils {
   static bool GetOutputsRealNum(const AnfNodePtr &anf_node, const std::vector<OpIOInfoPtr> &outputs_ptr,
                                 std::vector<size_t> *outputs_num);
   static bool IsNeedChangeDefaultFormat(const AnfNodePtr &anf_node);
+  // just for generate json for ascend op build, it will be deleted after unify size_t and int64_t.
+  static std::vector<int64_t> GetInputOriShapeForTbeBuild(const AnfNodePtr &anf_node, size_t real_idx);
+  static std::vector<int64_t> GetInputDeviceShapeForTbeBuild(const AnfNodePtr &anf_node, size_t real_idx);
+  static std::vector<int64_t> GetOutputOriShapeForTbeBuild(const AnfNodePtr &anf_node, size_t real_idx);
+  static std::vector<int64_t> GetOutputDeviceShapeForTbeBuild(const AnfNodePtr &anf_node, size_t real_idx);
 };
 
 }  // namespace mindspore::kernel
diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_build.cc b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_build.cc
index 9defc6f8b61..606b240809c 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_build.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_build.cc
@@ -265,8 +265,8 @@ bool TbeKernelJsonCreator::GenTbeSingleKernelJson(const std::shared_ptr<mindspor
   (*kernel_json)[kJSocInfo] = soc_info_json;
   (*kernel_json)[kJOpInfo] = op_info_json;
 
-  MS_LOG(DEBUG) << "Operate type:" << creater_type_ << ", full scope name is :" << anf_node->fullname_with_scope()
-                << ", json info name is : " << json_name_ << ", kernel json:" << kernel_json->dump();
+  MS_LOG(INFO) << "Operate type:" << creater_type_ << ", full scope name is :" << anf_node->fullname_with_scope()
+               << ", json info name is : " << json_name_ << ", kernel json:" << kernel_json->dump();
 
   return true;
 }
@@ -884,6 +884,113 @@ bool TbeKernelBuild::GetIOSize(const nlohmann::json &kernel_json, std::vector<si
   return true;
 }
 
+void GetRealInputSize(const nlohmann::json &input_json, const AnfNodePtr &anf_node, size_t i,
+                      std::vector<size_t> *input_size_list, size_t *size_i) {
+  for (size_t j = 0; j < input_json[kJShape].size(); ++j) {
+    if (input_json[kJShape][j] == -1) {
+      auto input_max_shape = AnfAlgo::GetInputMaxShape(anf_node, i);
+      if (j >= input_max_shape.size()) {
+        MS_LOG(EXCEPTION) << "Invalid Dynamic Shape Max Shape";
+      }
+      MS_LOG(INFO) << "Change -1 Shape to Max Shape:" << input_max_shape[j];
+      (*size_i) *= input_max_shape[j];
+      continue;
+    }
+    (*size_i) *= static_cast<size_t>(input_json[kJShape][j]);
+  }
+  std::string dtype = input_json[kJDtype];
+  size_t nbyte = tbe::GetDtypeNbyte(dtype);
+  (*size_i) *= nbyte;
+  input_size_list->push_back((*size_i));
+}
+
+void GetInputSizeList2(const nlohmann::json &input_json, std::vector<size_t> *input_size_list,
+                       const AnfNodePtr &anf_node) {
+  for (size_t i = 0; i < input_json.size(); i++) {
+    if (input_json[i].is_array()) {
+      for (size_t m = 0; m < input_json[i].size(); m++) {
+        size_t size_i = 1;
+        if (input_json[i][m][kJValid] == false) {
+          std::string input_name = input_json[i][m][kJName];
+          continue;
+        }
+        GetRealInputSize(input_json[i][m], anf_node, i, input_size_list, &size_i);
+      }
+    } else {
+      size_t size_i = 1;
+      if (input_json[i][kJValid] == false) {
+        std::string input_name = input_json[i][kJName];
+        continue;
+      }
+      GetRealInputSize(input_json[i], anf_node, i, input_size_list, &size_i);
+    }
+  }
+}
+
+void GetRealOutputSize(const nlohmann::json &output_json, const AnfNodePtr &anf_node, size_t i,
+                       std::vector<size_t> *output_size_list, size_t *size_i) {
+  for (size_t j = 0; j < output_json[kJShape].size(); ++j) {
+    if (output_json[kJShape][j] == -1) {
+      auto output_max_shape = AnfAlgo::GetOutputMaxShape(anf_node, i);
+      if (j >= output_max_shape.size()) {
+        MS_LOG(EXCEPTION) << "Invalid Dynamic Shape Max Shape";
+      }
+      MS_LOG(INFO) << "Change -1 Shape to Max Shape:" << output_max_shape[j];
+      (*size_i) *= output_max_shape[j];
+      continue;
+    }
+    (*size_i) *= static_cast<size_t>(output_json[kJShape][j]);
+  }
+  std::string dtype = output_json[kJDtype];
+  size_t nbyte = tbe::GetDtypeNbyte(dtype);
+  (*size_i) *= nbyte;
+  output_size_list->push_back((*size_i));
+}
+
+void GetOutputSizeList2(const nlohmann::json &output_json, std::vector<size_t> *output_size_list,
+                        const AnfNodePtr &anf_node) {
+  for (size_t i = 0; i < output_json.size(); i++) {
+    if (output_json[i].is_array()) {
+      for (size_t m = 0; m < output_json[i].size(); m++) {
+        size_t size_i = 1;
+        if (output_json[i][m][kJValid] == false) {
+          std::string output_name = output_json[i][m][kJName];
+          MS_LOG(INFO) << "Output name:" << output_name << " is optional, valid is false.";
+          continue;
+        }
+        GetRealOutputSize(output_json[i][m], anf_node, i, output_size_list, &size_i);
+      }
+    } else {
+      size_t size_i = 1;
+      if (output_json[i][kJValid] == false) {
+        std::string output_name = output_json[i][kJName];
+        MS_LOG(INFO) << "Output name:" << output_name << " is optional, valid is false.";
+        continue;
+      }
+      GetRealOutputSize(output_json[i], anf_node, i, output_size_list, &size_i);
+    }
+  }
+}
+
+bool TbeKernelBuild::GetIOSize2(const nlohmann::json &kernel_json, std::vector<size_t> *input_size_list,
+                                std::vector<size_t> *output_size_list, const AnfNodePtr &anf_node) {
+  if (input_size_list == nullptr || output_size_list == nullptr) {
+    MS_LOG(ERROR) << "Input size or output size is nullptr";
+    return false;
+  }
+  input_size_list->clear();
+  output_size_list->clear();
+  auto op_list = kernel_json["op_list"];
+  for (size_t i = 0; i < op_list.size(); i++) {
+    auto op_info = op_list[i];
+    if (op_info["type"] != "Data") {
+      GetInputSizeList2(op_info["input_desc"], input_size_list, anf_node);
+      GetOutputSizeList2(op_info["output_desc"], output_size_list, anf_node);
+    }
+  }
+  return true;
+}
+
 bool TbeKernelBuild::GenFusionScopeJson(const std::vector<mindspore::AnfNodePtr> &input_nodes,
                                         const std::vector<mindspore::AnfNodePtr> &compute_nodes,
                                         nlohmann::json *fusion_json, std::string *fusion_kernel_name) {
diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_build.h b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_build.h
index 00e630ce1fa..4f8d49c4361 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_build.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_build.h
@@ -37,6 +37,8 @@ class TbeKernelBuild {
   enum FusionDataType { kFusionNormal = 0, kFusionAddN, kFusionReLUGradV2, kFusionAdd };
 
  public:
+  static bool GetIOSize2(const nlohmann::json &kernel_json, std::vector<size_t> *input_size_list,
+                         std::vector<size_t> *output_size_list, const AnfNodePtr &anf_node);
   static bool GetIOSize(const nlohmann::json &kernel_json, std::vector<size_t> *input_size_list,
                         std::vector<size_t> *output_size_list, const AnfNodePtr &anf_node);
   // Ub Fuison
diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_parallel_build.cc b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_parallel_build.cc
index 6197194a8ef..da36895548b 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_parallel_build.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_parallel_build.cc
@@ -21,6 +21,7 @@
 #include <vector>
 #include <string>
 #include "utils/ms_context.h"
+#include "backend/kernel_compiler/common_utils.h"
 #include "backend/kernel_compiler/tbe/tbe_adapter.h"
 #include "backend/kernel_compiler/tbe/tbe_kernel_build.h"
 #include "backend/kernel_compiler/tbe/tbe_kernel_mod.h"
@@ -29,6 +30,7 @@
 #include "backend/kernel_compiler/tbe/tbe_utils.h"
 #include "backend/kernel_compiler/tbe/tbe_dynaminc_shape_util.h"
 #include "utils/trace_base.h"
+#include "utils/json_operation_utils.h"
 
 namespace mindspore {
 namespace kernel {
@@ -56,7 +58,6 @@ bool TbeOpParallelBuild(const std::vector<AnfNodePtr> &anf_nodes) {
     if (AnfAlgo::GetKernelMod(anf_node) != nullptr) {
       continue;
     }
-    const std::string &processor = tbe::GetProcessor(anf_node);
     nlohmann::json kernel_json;
     TbeKernelJsonCreator creator(SINGLE_BUILD);
     if (!creator.GenTbeSingleKernelJson(anf_node, &kernel_json)) {
@@ -70,7 +71,7 @@ bool TbeOpParallelBuild(const std::vector<AnfNodePtr> &anf_nodes) {
     (void)TbeKernelBuild::GetIOSize(kernel_json, &input_size_list, &output_size_list, anf_node);
     // search cache
     const std::string &json_name = creator.json_name();
-    if (build_manger->SearchInCache(json_name, processor, input_size_list, output_size_list, anf_node.get()) &&
+    if (build_manger->SearchInCache(json_name, input_size_list, output_size_list, anf_node.get()) &&
         ((!offline_tune.empty() && offline_tune != "true") || tune_mode == "NO_TUNE")) {
       continue;
     }
@@ -106,10 +107,24 @@ bool TbeOpParallelBuild(const std::vector<AnfNodePtr> &anf_nodes) {
 
 ParallelBuildManager::~ParallelBuildManager() { ResetTaskInfo(); }
 
+void ParallelBuildManager::SavePreBuildTaskInfo(int32_t task_id, const AnfNodePtr &anf_node,
+                                                const std::string &json_name) {
+  MS_LOG(DEBUG) << "SavePreBuildTaskInfo, task id: " << task_id;
+  struct KernelBuildTaskInfo task_info;
+  task_info.node = anf_node;
+  task_info.json_name = json_name;
+  if (anf_node == nullptr) {
+    task_info.processor = tbe::kProcessorAiCore;
+  } else {
+    task_info.processor = tbe::GetProcessor(anf_node);
+  }
+  pre_build_task_map_[task_id] = task_info;
+}
+
 void ParallelBuildManager::SaveTaskInfo(int32_t task_id, const mindspore::AnfNodePtr &anf_node,
                                         const std::string &json_name, const std::vector<size_t> &input_size_list,
                                         const std::vector<size_t> &output_size_list, int64_t scope_id) {
-  MS_LOG(INFO) << "SaveTaskInfo, task id: " << task_id;
+  MS_LOG(DEBUG) << "SaveTaskInfo, task id: " << task_id;
   struct KernelBuildTaskInfo task_info;
   task_info.node = anf_node;
   task_info.json_name = json_name;
@@ -130,28 +145,23 @@ bool ParallelBuildManager::IsAllTaskFinish() const {
 }
 
 void ParallelBuildManager::PreTaskFinishProcess(int32_t task_id, const std::string &pre_build_result) {
-  auto task_iter = pre_task_map_.find(task_id);
-  if (task_iter == pre_task_map_.end()) {
+  MS_LOG(DEBUG) << "can find pre task_id : " << task_id << " result:" << pre_build_result;
+  auto task_iter = pre_build_task_map_.find(task_id);
+  if (task_iter == pre_build_task_map_.end()) {
     MS_EXCEPTION(ArgumentError) << "can find pre task_id:" << task_id;
   }
-  auto node = task_iter->second;
-  auto builder =
-    std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>(AnfAlgo::GetSelectKernelBuildInfo(node));
-  std::string start_flag = "fusion_pattern_start";
-  std::string end_flag = "fusion_pattern_end";
-  auto start = pre_build_result.find(start_flag);
-  auto end = pre_build_result.find(end_flag);
-  if (start != std::string::npos && end != std::string::npos && end >= start) {
-    std::string result = pre_build_result.substr(start + start_flag.size(), end - start - start_flag.size());
-    if (result.empty()) {
-      (void)pre_task_map_.erase(task_iter);
-      return;
-    }
-    transform(result.begin(), result.end(), result.begin(), ::toupper);
-    AnfAlgo::SetNodeAttr(kAttrFusionType, MakeValue(result), node);
-    AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), node.get());
+  nlohmann::json result;
+  if (!ParseJson(pre_build_result, &result)) {
+    MS_LOG(EXCEPTION) << "Parse prebuild result error.";
   }
-  (void)pre_task_map_.erase(task_iter);
+  auto fusion_name = GetJsonValue<std::string>(result, "op_pattern");
+  auto fusion_type = kernel::GetFusionTypeByName(fusion_name);
+  auto output_data_desc = GetJsonValue<nlohmann::json>(result, "op_params");
+
+  auto node = task_iter->second.node;
+  AnfAlgo::SetFusionType(node, fusion_type);
+  AnfAlgo::SetOutputDataDesc(node, {output_data_desc});
+  (void)pre_build_task_map_.erase(task_iter);
 }
 
 std::pair<int32_t, KernelModPtr> ParallelBuildManager::TaskFinishProcess(int32_t task_id, const std::string &build_ret,
@@ -176,9 +186,25 @@ std::pair<int32_t, KernelModPtr> ParallelBuildManager::TaskFinishProcess(int32_t
   auto kernel_mod = GenKernelMod(task_iter->second.input_size_list, task_iter->second.output_size_list, kernel_pack);
   MS_EXCEPTION_IF_NULL(kernel_mod);
   if (set_kernel_mod) {
-    AnfAlgo::SetKernelMod(kernel_mod, task_iter->second.node.get());
-    AnfAlgo::SetNodeAttr(kAttrCompileInfo, MakeValue(build_ret), task_iter->second.node);
-    MS_LOG(INFO) << "Set Node Attr compile_info:" << build_ret;
+    auto cur_node = task_iter->second.node;
+    MS_EXCEPTION_IF_NULL(cur_node);
+    if (AnfAlgo::IsDynamicShape(cur_node) && (build_ret.empty() || build_ret.find("vars") == std::string::npos)) {
+      MS_LOG(EXCEPTION) << "Build failed. The build result of dynamic shape op [" << AnfAlgo::GetCNodeName(cur_node)
+                        << "] should not be empty, or can not find key ['vars'] in the result. build_res:[" << build_ret
+                        << "].";
+    }
+    AnfAlgo::SetKernelMod(kernel_mod, cur_node.get());
+    MS_LOG(INFO) << json_name << ": save compile info to json file, compile_info:" << build_ret;
+    std::string old_build = common::GetEnv("MS_OLD_BUILD_PROCESS");
+    if (!old_build.empty()) {
+      AnfAlgo::SetNodeAttr(kAttrCompileInfo, MakeValue(build_ret), cur_node);
+    } else {
+      bool save_flag = true;
+      TbeUtils::SaveCompileInfo(json_name, build_ret, &save_flag);
+      if (!save_flag) {
+        MS_LOG(EXCEPTION) << "Save json file failed, compile_info:" << build_ret;
+      }
+    }
   }
   auto ret = std::make_pair(task_iter->second.scope_id, kernel_mod);
   (void)task_map_.erase(task_iter);
@@ -213,8 +239,8 @@ void ParallelBuildManager::SaveSameFusionOpInfo(const int64_t scope_id, const st
 
 bool ParallelBuildManager::GenSameOpKernelMod() const {
   for (const auto &task_info : same_op_list_) {
-    bool ret = SearchInCache(task_info.json_name, task_info.processor, task_info.input_size_list,
-                             task_info.output_size_list, task_info.node.get());
+    bool ret =
+      SearchInCache(task_info.json_name, task_info.input_size_list, task_info.output_size_list, task_info.node.get());
     if (!ret) {
       MS_LOG(INFO) << "can't find " << task_info.json_name << " in cache.";
       return false;
@@ -226,7 +252,7 @@ bool ParallelBuildManager::GenSameOpKernelMod() const {
 bool ParallelBuildManager::GenSameFusionOpKernelMod(std::map<int64_t, KernelModPtr> *kernel_mode_ret) const {
   bool ret = true;
   for (const auto &task_info : same_op_list_) {
-    auto kernel_pack = TbeUtils::SearchCache(task_info.json_name, tbe::kProcessorAiCore);
+    auto kernel_pack = TbeUtils::SearchCache(task_info.json_name);
     if (kernel_pack != nullptr) {
       auto kernel_mode = GenKernelMod(task_info.input_size_list, task_info.output_size_list, kernel_pack);
       if (kernel_mode != nullptr) {
@@ -240,10 +266,9 @@ bool ParallelBuildManager::GenSameFusionOpKernelMod(std::map<int64_t, KernelModP
   return ret;
 }
 
-bool ParallelBuildManager::SearchInCache(const std::string &json_name, const std::string &processor,
-                                         const std::vector<size_t> &input_size_list,
+bool ParallelBuildManager::SearchInCache(const std::string &json_name, const std::vector<size_t> &input_size_list,
                                          const std::vector<size_t> &output_size_list, mindspore::AnfNode *node) const {
-  auto cached_kernel_pack = TbeUtils::SearchCache(json_name, processor);
+  auto cached_kernel_pack = TbeUtils::SearchCache(json_name);
   if (cached_kernel_pack != nullptr) {
     auto kernel_mod_ptr = GenKernelMod(input_size_list, output_size_list, cached_kernel_pack);
     MS_EXCEPTION_IF_NULL(kernel_mod_ptr);
@@ -272,18 +297,19 @@ int ParallelBuildManager::StartCompileOp(const nlohmann::json &kernel_json) {
   return AscendKernelBuildClient::Instance().TbeStart(kernel_json.dump(), tune_mode);
 }
 
+std::string ParallelBuildManager::ProcessTbeJob(const nlohmann::json &kernel_json) {
+  return AscendKernelBuildClient::Instance().TbeSendJob(kernel_json.dump());
+}
+
 bool ParallelBuildManager::WaitOne(int *task_id, std::string *task_result, std::string *pre_build_result) {
   MS_EXCEPTION_IF_NULL(task_id);
   return AscendKernelBuildClient::Instance().TbeWait(task_id, task_result, pre_build_result);
 }
 
 void ParallelBuildManager::ResetTaskInfo() noexcept {
-  if (task_map_.empty()) {
-    MS_LOG(INFO) << "All tasks are compiled success.";
-    return;
-  }
   task_map_.clear();
   same_op_list_.clear();
+  pre_build_task_map_.clear();
 }
 
 AnfNodePtr ParallelBuildManager::GetAnfNodeByTaskID(int32_t task_id) {
diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_parallel_build.h b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_parallel_build.h
index bf71cece3c9..858981fd801 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_parallel_build.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_parallel_build.h
@@ -46,15 +46,15 @@ class ParallelBuildManager {
   void SaveTaskInfo(int32_t task_id, const AnfNodePtr &anf_node, const std::string &json_name,
                     const std::vector<size_t> &input_size_list, const std::vector<size_t> &output_size_list,
                     int64_t scope_id = 0);
+  void SavePreBuildTaskInfo(int32_t task_id, const AnfNodePtr &anf_node, const std::string &json_name);
   void SaveSameOpInfo(const AnfNodePtr &anf_node, const std::string &json_name,
                       const std::vector<size_t> &input_size_list, const std::vector<size_t> &output_size_list);
   void SaveSameFusionOpInfo(const int64_t scope_id, const std::string &json_name, const std::string &processor,
                             const std::vector<size_t> &input_size_list, const std::vector<size_t> &output_size_list);
   bool GenSameOpKernelMod() const;
   bool GenSameFusionOpKernelMod(std::map<int64_t, KernelModPtr> *kernel_mode_ret) const;
-  bool SearchInCache(const std::string &json_name, const std::string &processor,
-                     const std::vector<size_t> &input_size_list, const std::vector<size_t> &output_size_list,
-                     AnfNode *node) const;
+  bool SearchInCache(const std::string &json_name, const std::vector<size_t> &input_size_list,
+                     const std::vector<size_t> &output_size_list, AnfNode *node) const;
   bool IsAllTaskFinish() const;
   void PreTaskFinishProcess(int32_t task_id, const std::string &pre_build_result);
   std::pair<int32_t, KernelModPtr> TaskFinishProcess(int32_t task_id, const std::string &build_ret,
@@ -64,12 +64,14 @@ class ParallelBuildManager {
 
   // Interactive with real backend, who could be implemented by Python.
   static int StartCompileOp(const nlohmann::json &kernel_json);
+  static std::string ProcessTbeJob(const nlohmann::json &kernel_json);
   static bool WaitOne(int *task_id, std::string *task_result, std::string *build_result);
   void ResetTaskInfo() noexcept;
   AnfNodePtr GetAnfNodeByTaskID(int32_t task_id);
 
  private:
   std::map<int32_t, AnfNodePtr> pre_task_map_;
+  std::map<int32_t, KernelBuildTaskInfo> pre_build_task_map_;
   std::map<int32_t, KernelBuildTaskInfo> task_map_;
   std::vector<KernelBuildTaskInfo> same_op_list_;
 };
diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_select/tbe_kernel_select.cc b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_select/tbe_kernel_select.cc
index 99f3884873f..f3ef5a95733 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_select/tbe_kernel_select.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_select/tbe_kernel_select.cc
@@ -25,6 +25,7 @@
 #include "backend/kernel_compiler/tbe/tbe_convert_utils.h"
 #include "backend/kernel_compiler/tbe/tbe_dynaminc_shape_util.h"
 #include "backend/kernel_compiler/tbe/tbe_kernel_build.h"
+#include "backend/kernel_compiler/tbe/ascend_kernel_compile.h"
 #include "backend/kernel_compiler/tbe/tbe_kernel_select/common_utils.h"
 #include "backend/kernel_compiler/tbe/tbe_kernel_select/tbe_kernel_broadcast_selecter.h"
 #include "backend/kernel_compiler/tbe/tbe_kernel_select/tbe_kernel_reduce_selecter.h"
@@ -34,6 +35,7 @@
 #include "backend/session/kernel_build_client.h"
 #include "nlohmann/json.hpp"
 #include "utils/convert_utils_base.h"
+#include "utils/json_operation_utils.h"
 
 namespace mindspore::kernel {
 constexpr auto kName = "name";
@@ -258,13 +260,24 @@ bool TbeKernelSelect::TbeCheckSupported(const KernelBuildInfoIter &kernel_build_
   // replace kernel_info with current kernel info
   auto kernel_build_info_tmp = AnfAlgo::GetSelectKernelBuildInfo(cnode_ptr_);
   AnfAlgo::SetSelectKernelBuildInfo(*kernel_build_info_iter, cnode_ptr_.get());
-  nlohmann::json kernel_json;
-  TbeKernelJsonCreator creator(CHECK_SUPPORTED);
-  bool ret = creator.GenTbeSingleKernelJson(cnode_ptr_, &kernel_json);
-  if (!ret) {
-    MS_LOG(EXCEPTION) << "Gen tbe single kernel json for check support failed.";
+  std::string old_build = common::GetEnv("MS_OLD_BUILD_PROCESS");
+  bool ret = true;
+  if (!old_build.empty()) {
+    nlohmann::json kernel_json;
+    TbeKernelJsonCreator creator(CHECK_SUPPORTED);
+    ret = creator.GenTbeSingleKernelJson(cnode_ptr_, &kernel_json);
+    if (!ret) {
+      MS_LOG(EXCEPTION) << "Gen tbe single kernel json for check support failed.";
+    }
+    ret = AscendKernelBuildClient::Instance().CheckSupported(kernel_json.dump());
+  } else {
+    auto build_manager = kernel::ascend::AscendKernelCompileManager::GetInstance();
+    MS_EXCEPTION_IF_NULL(build_manager);
+    if (!build_manager->AscendOpCheckSupported(cnode_ptr_)) {
+      MS_LOG(WARNING) << "Tbe check supported failed";
+      ret = false;
+    }
   }
-  ret = AscendKernelBuildClient::Instance().CheckSupported(kernel_json.dump());
   AnfAlgo::SetSelectKernelBuildInfo(kernel_build_info_tmp, cnode_ptr_.get());
   return ret;
 }
@@ -416,19 +429,28 @@ std::vector<std::string> TbeKernelSelect::SplitStrToVec(const std::string &op_se
 }
 
 std::string TbeKernelSelect::OpSelectFormat() {
-  nlohmann::json kernel_json;
   std::string res_json_str;
-  TbeKernelJsonCreator creator(OP_SELECT_FORMAT);
-  bool ret = creator.GenTbeSingleKernelJson(cnode_ptr_, &kernel_json);
-  if (!ret) {
-    MS_LOG(EXCEPTION) << "GenTbeSingleKernelJson failed.";
-  }
-  res_json_str = AscendKernelBuildClient::Instance().SelectFormat(kernel_json.dump());
-  if (res_json_str.empty()) {
-    MS_LOG(EXCEPTION) << "Op select format error, input args: " << kernel_json.dump();
-  }
-  if (res_json_str.find("TBEException") != std::string::npos) {
-    MS_LOG(EXCEPTION) << "Dynamic op select failed: " << res_json_str << ", input args: " << kernel_json.dump();
+  std::string old_build = common::GetEnv("MS_OLD_BUILD_PROCESS");
+  if (!old_build.empty()) {
+    nlohmann::json kernel_json;
+    TbeKernelJsonCreator creator(OP_SELECT_FORMAT);
+    bool ret = creator.GenTbeSingleKernelJson(cnode_ptr_, &kernel_json);
+    if (!ret) {
+      MS_LOG(EXCEPTION) << "GenTbeSingleKernelJson failed.";
+    }
+    res_json_str = AscendKernelBuildClient::Instance().SelectFormat(kernel_json.dump());
+    if (res_json_str.empty()) {
+      MS_LOG(EXCEPTION) << "Op select format error, input args: " << kernel_json.dump();
+    }
+    if (res_json_str.find("TBEException") != std::string::npos) {
+      MS_LOG(EXCEPTION) << "Dynamic op select failed: " << res_json_str << ", input args: " << kernel_json.dump();
+    }
+  } else {
+    MS_LOG(INFO) << "Format select for node:[" << AnfAlgo::GetCNodeName(cnode_ptr_) << ", "
+                 << cnode_ptr_->fullname_with_scope() << "].";
+    auto build_manager = kernel::ascend::AscendKernelCompileManager::GetInstance();
+    MS_EXCEPTION_IF_NULL(build_manager);
+    res_json_str = build_manager->AscendOpSelectFormat(cnode_ptr_);
   }
   return res_json_str;
 }
@@ -471,7 +493,10 @@ void TbeKernelSelect::CreateNewOpInfo(const mindspore::kernel::OpInfo &op_info,
   MS_EXCEPTION_IF_NULL(op_info_new);
   auto op_seclect_json = OpSelectFormat();
   if (!op_seclect_json.empty()) {
-    nlohmann::json json_obj = nlohmann::json::parse(op_seclect_json);
+    nlohmann::json json_obj;
+    if (!ParseJson(op_seclect_json, &json_obj)) {
+      MS_LOG(EXCEPTION) << "Parse op_select_json error.";
+    }
     if (!json_obj.is_object()) {
       MS_LOG(EXCEPTION) << "JsonStr is not an object, the jsonStr is:" << op_seclect_json;
     }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_utils.cc b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_utils.cc
index 13262d4e7e6..97992894d31 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_utils.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_utils.cc
@@ -30,90 +30,184 @@
 #include "utils/ms_utils.h"
 #include "utils/ms_context.h"
 #include "ir/dtype/type.h"
+#include "runtime/dev.h"
+#include "runtime/device/ascend/lic_manager.h"
 #include "backend/session/anf_runtime_algorithm.h"
 #include "backend/kernel_compiler/tbe/tbe_convert_utils.h"
+#include "mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/tbe_json_creator.h"
+#include "mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/single_tbe_json_creator.h"
 #include "securec/include/securec.h"
+#include "utils/json_operation_utils.h"
+#include "mindspore/ccsrc/debug/common.h"
 
 namespace mindspore {
 namespace kernel {
 namespace tbe {
-constexpr auto kCceKernelMeta = "./kernel_meta/";
+constexpr auto kCceKernelMeta = "kernel_meta/";
 constexpr auto kJsonSuffix = ".json";
 constexpr auto kInfoSuffix = ".info";
+constexpr auto kSOC_VERSION = "SOC_VERSION";
+constexpr auto kBuildRes = "build_result";
+constexpr auto kTUNE_BANK_PATH = "TUNE_BANK_PATH";
+constexpr auto kTUNE_DUMP_PATH = "TUNE_DUMP_PATH";
+constexpr auto kJRlTuneSwitch = "rl_tune_switch";
+constexpr auto kJRlTuneList = "rl_tune_list";
+constexpr auto kJOpTuneSwitch = "op_tune_switch";
+constexpr auto kJOpTuneList = "op_tune_list";
+constexpr auto kJPassList = "pass_list";
+constexpr auto kRankID = "RANK_ID";
+constexpr auto kCOMPILER_OP_LEVEL = "MS_COMPILER_OP_LEVEL";
+constexpr auto kCOMPILER_CACHE_PATH = "MS_COMPILER_CACHE_PATH";
 
 uintptr_t KernelManager::kernel_stub_gen_ = 0;
 std::unordered_map<string, KernelMetaPtr> KernelManager::info_table_ = {};
 
+void TbeUtils::GenLicInfo(nlohmann::json *lic_info_json) {
+  MS_EXCEPTION_IF_NULL(lic_info_json);
+  (*lic_info_json)[kJRlTuneSwitch] = LicManager::GetInstance().GetRlTuneSwitch();
+  (*lic_info_json)[kJRlTuneList] = LicManager::GetInstance().GetRlTuneList();
+  (*lic_info_json)[kJOpTuneSwitch] = LicManager::GetInstance().GetOpTuneSwitch();
+  (*lic_info_json)[kJOpTuneList] = LicManager::GetInstance().GetOpTuneList();
+  (*lic_info_json)[kJPassList] = LicManager::GetInstance().GetPassSwitch();
+}
+
+std::string TbeUtils::GetBankPath() {
+  // tune bank path
+  auto save_path = common::GetEnv(kTUNE_BANK_PATH);
+  char real_path[PATH_MAX] = {0};
+  if (!save_path.empty()) {
+    if (realpath(save_path.c_str(), real_path)) {
+      save_path = real_path;
+      return save_path;
+    }
+    MS_LOG(EXCEPTION) << "Invalid env TUNE_BANK_PATH, path : " << save_path;
+  }
+  return "";
+}
+
+std::string TbeUtils::GetTuneDumpPath() {
+  // tune dump path
+  auto save_path = common::GetEnv(kTUNE_DUMP_PATH);
+  char real_path[PATH_MAX] = {0};
+  if (!save_path.empty()) {
+    if (realpath(save_path.c_str(), real_path)) {
+      save_path = real_path;
+      return save_path;
+    }
+    MS_LOG(EXCEPTION) << "Invalid env kTUNE_DUMP_PATH, path : " << save_path;
+  }
+  return "";
+}
+
+std::string TbeUtils::GetOpDebugPath() {
+  auto old_build = common::GetEnv("MS_OLD_BUILD_PROCESS");
+  auto config_path = Common::CommonFuncForConfigPath("./", common::GetEnv(kCOMPILER_CACHE_PATH));
+  if (!old_build.empty()) {
+    if (config_path[config_path.length() - 1] == '/') {
+      return config_path;
+    }
+    return config_path + "/";
+  } else {
+    std::string rank_id_str = common::GetEnv(kRankID);
+    if (rank_id_str.empty()) {
+      MS_LOG(DEBUG) << "Using the default value: 0";
+      rank_id_str = "0";
+    }
+    if (config_path[config_path.length() - 1] == '/') {
+      return config_path + "rank_" + rank_id_str + "/";
+    }
+    return config_path + "/" + "rank_" + rank_id_str + "/";
+  }
+}
+
+std::string GetOpDebugLevel() {
+  const std::set<std::string> exp = {"0", "1"};
+  std::string op_debug_level = "0";
+  auto env_level = common::GetEnv(kCOMPILER_OP_LEVEL);
+  if (!env_level.empty()) {
+    if (exp.find(env_level) == exp.end()) {
+      MS_LOG(WARNING) << "Invalid COMPILER_OP_LEVEL env:" << env_level
+                      << ", the value should be 0 or 1, now using the default value 0";
+    } else {
+      op_debug_level = env_level;
+    }
+  }
+  return op_debug_level;
+}
+
 void TbeUtils::GenSocInfo(nlohmann::json *soc_info_json) {
+  auto context_ptr = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context_ptr);
   MS_EXCEPTION_IF_NULL(soc_info_json);
   std::list<int64_t> list;
   (*soc_info_json)["coreNum"] = "";
   (*soc_info_json)["coreType"] = "";
+  (*soc_info_json)["op_impl_mode"] = "";
+  (*soc_info_json)["vector_fp_ceiling"] = "";
+  (*soc_info_json)["op_impl_mode_list"] = list;
+  (*soc_info_json)["l2Mode"] = "2";
   (*soc_info_json)["l1Fusion"] = "false";
   (*soc_info_json)["l2Fusion"] = "false";
-  (*soc_info_json)["l2Mode"] = "2";
-  (*soc_info_json)["op_debug_level"] = "";
-  (*soc_info_json)["op_impl_mode"] = "";
-  (*soc_info_json)["op_impl_mode_list"] = list;
+  (*soc_info_json)["op_bank_update"] = false;
+  (*soc_info_json)["socVersion"] = GetSocVersion();
+  (*soc_info_json)["offlineTune"] = CheckOfflineTune();
+  (*soc_info_json)["op_debug_dir"] = GetOpDebugPath();
+  (*soc_info_json)["op_debug_level"] = GetOpDebugLevel();
+  (*soc_info_json)["autoTilingMode"] = context_ptr->get_param<std::string>(MS_CTX_TUNE_MODE);
+  (*soc_info_json)["deviceId"] = std::to_string(context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID));
+  (*soc_info_json)["op_bank_path"] = Common::CommonFuncForConfigPath("", common::GetEnv("OP_BANK_PATH"));
+  (*soc_info_json)["mdl_bank_path"] = Common::CommonFuncForConfigPath("", common::GetEnv("MDL_BANK_PATH"));
 }
 
 void TbeUtils::SaveJsonInfo(const std::string &json_name, const std::string &info) {
-  char real_path[PATH_MAX] = {0};
-  std::string path = kCceKernelMeta + json_name + kInfoSuffix;
-  if (path.size() >= PATH_MAX) {
-    MS_LOG(ERROR) << "File path: " << path << "is too long.";
+  auto config_path = TbeUtils::GetOpDebugPath();
+  std::string path = config_path + kCceKernelMeta + json_name + kInfoSuffix;
+  auto realpath = Common::GetRealPath(path);
+  if (!realpath.has_value()) {
+    MS_LOG(WARNING) << "Get real path failed, invalid path: " << realpath.value();
     return;
   }
-  std::ifstream fin(path);
-  if (fin) {
-    MS_LOG(INFO) << "Json file exist(" << path << "), no need to create.";
-    return;
-  }
-  std::ofstream file_write;
-  file_write.open(path);
+  ChangeFileMode(realpath.value(), S_IWUSR);
+  std::ofstream file_write(realpath.value());
   if (!file_write.is_open()) {
-    MS_LOG(WARNING) << "Create info file failed(" << path << ").";
+    MS_LOG(WARNING) << "Create info file failed(" << realpath.value() << ").";
     return;
   }
   file_write << info << std::endl;
   file_write.close();
-  if (realpath(path.c_str(), real_path) == nullptr) {
-    MS_LOG(WARNING) << "Get realpath failed(" << path << ").";
-    return;
-  }
-  MS_LOG(INFO) << "real path is: " << real_path;
-  if (chmod(real_path, S_IRUSR) == -1) {
-    MS_LOG(INFO) << "modify file: " << real_path << "to read only fail.";
-  }
+  file_write.clear();
+  ChangeFileMode(realpath.value(), S_IRUSR);
 }
 
 void TbeUtils::LoadCache() {
   static bool has_load = false;
   if (!has_load) {
     auto bin_map = KernelMeta::GetInstance();
-    if (!bin_map->ReadIndex(kCceKernelMeta)) {
-      MS_LOG(INFO) << "Cache initialize failed[" << kCceKernelMeta << "]";
+    auto config_path = TbeUtils::GetOpDebugPath();
+    auto path = config_path + kCceKernelMeta;
+    if (!bin_map->ReadIndex(path)) {
+      MS_LOG(INFO) << "Cache initialize failed[" << path << "]";
     }
     has_load = true;
   }
 }
 
-KernelPackPtr TbeUtils::SearchCache(const std::string &kernel_name, const std::string &processor) {
+KernelPackPtr TbeUtils::SearchCache(const std::string &kernel_name, const bool is_akg) {
   // search cache.
   KernelMeta *bin_map = KernelMeta::GetInstance();
   if (bin_map == nullptr) {
     MS_LOG(INFO) << "kernel cache is invalid.";
     return nullptr;
   }
-  return bin_map->GetKernelPack(kernel_name, processor);
+  return bin_map->GetKernelPack(kernel_name, is_akg);
 }
 
-KernelPackPtr TbeUtils::InsertCache(const std::string &kernel_name, const std::string &processor) {
+KernelPackPtr TbeUtils::InsertCache(const std::string &kernel_name, const std::string &processor, const bool is_akg) {
   MS_LOG(INFO) << "kernel name:  " << kernel_name << ", processr:" << processor;
   if (processor != kProcessorAiCore) {
     MS_LOG(EXCEPTION) << "process type should be aicore, actually is: " << processor;
   }
-  return SearchCache(kernel_name, processor);
+  return SearchCache(kernel_name, is_akg);
 }
 
 int KernelManager::BinaryRegister(const mindspore::kernel::FlexArray &kernel_buffer, void **module, const string &magic,
@@ -245,7 +339,123 @@ bool KernelMeta::ReadIndex(const std::string &bin_dir) {
   return true;
 }
 
-KernelPackPtr KernelMeta::GetKernelPack(const std::string &kernel_name, const std::string &processor) {
+void TbeUtils::GetCompileInfo(const AnfNodePtr &node, std::string *compile_info, bool *get_flag) {
+  MS_EXCEPTION_IF_NULL(node);
+  MS_LOG(INFO) << "Get compile info from json file start. [" << node->fullname_with_scope() << "]";
+  auto json_creator = std::make_shared<kernel::BuildTbeJsonCreator>();
+  MS_EXCEPTION_IF_NULL(json_creator);
+  nlohmann::json kernel_json;
+  if (!json_creator->GenJson(node, &kernel_json)) {
+    MS_LOG(WARNING) << "Gen kernel json failed [" << node->fullname_with_scope() << "]";
+    *get_flag = false;
+    return;
+  }
+  auto json_name = json_creator->GetJsonName();
+  auto config_path = TbeUtils::GetOpDebugPath();
+  std::string path = config_path + kCceKernelMeta + json_name + kJsonSuffix;
+  if (path.size() > PATH_MAX) {
+    MS_LOG(WARNING) << "File path: " << path << "is too long.";
+    *get_flag = false;
+    return;
+  }
+  nlohmann::json read_new_json;
+  std::ifstream file(path.c_str());
+  std::string ori_file = std::string((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+  if (!ParseJson(ori_file, &read_new_json)) {
+    MS_LOG(EXCEPTION) << "Parse compile info error.";
+  }
+  *compile_info = read_new_json[kBuildRes].dump();
+  file.close();
+  file.clear();
+  MS_LOG(INFO) << "Get compile info from json file success";
+}
+
+void TbeUtils::SaveCompileInfo(const std::string &json_name, const std::string &build_res, bool *save_flag) {
+  MS_LOG(INFO) << "Save compile info to json file start. [" << json_name << "], value: " << build_res;
+  auto config_path = TbeUtils::GetOpDebugPath();
+  std::string path = config_path + kCceKernelMeta + json_name + kJsonSuffix;
+  if (path.size() > PATH_MAX) {
+    MS_LOG(WARNING) << "File path: " << path << "is too long.";
+    *save_flag = false;
+    return;
+  }
+  nlohmann::json save_new_json;
+  std::ifstream file(path.c_str());
+  std::string ori_file = std::string((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+  if (!ParseJson(ori_file, &save_new_json)) {
+    MS_LOG(EXCEPTION) << "Parse compile info error.";
+  }
+  file.close();
+  file.clear();
+  if (build_res.empty()) {
+    save_new_json[kBuildRes] = build_res;
+  } else {
+    save_new_json[kBuildRes] = nlohmann::json::parse(build_res);
+  }
+  std::ofstream file_write;
+  file_write.open(path);
+  if (!file_write.is_open()) {
+    MS_LOG(WARNING) << "Create info file failed. [" << path << "]";
+    *save_flag = false;
+    return;
+  }
+  const int indent = 4;
+  auto info = save_new_json.dump(indent);
+  file_write << info << std::endl;
+  file_write.close();
+  file_write.clear();
+  MS_LOG(INFO) << "Save compile info to json file success";
+}
+
+bool TbeUtils::CheckOfflineTune() {
+  bool offline = false;
+  std::string offline_tune = common::GetEnv("ENABLE_TUNE_DUMP");
+  if (!offline_tune.empty()) {
+    for (size_t j = 0; j < offline_tune.length(); j++) {
+      offline_tune[j] = tolower(offline_tune[j]);
+    }
+    if (!(offline_tune == "true" || offline_tune == "false")) {
+      MS_LOG(EXCEPTION) << "The value of ENABLE_TUNE_DUMP must be 'true' or 'false'";
+    }
+    offline = (offline_tune == "true");
+  }
+  return offline;
+}
+
+std::string TbeUtils::GetSocVersion() {
+  // Get default soc version.
+  static std::string version;
+  if (version.empty()) {
+    const int kSocVersionLen = 50;
+    char soc_version[kSocVersionLen] = {0};
+    auto ret = rtGetSocVersion(soc_version, kSocVersionLen);
+    if (ret != RT_ERROR_NONE) {
+      MS_LOG(EXCEPTION) << "GetSocVersion failed.";
+    }
+    // Get soc version from env value.
+    const char *soc_version_env = nullptr;
+    std::string str_soc_version_env = common::GetEnv(kSOC_VERSION);
+    if (!str_soc_version_env.empty()) {
+      soc_version_env = common::SafeCStr(str_soc_version_env);
+    }
+    if (soc_version_env != nullptr) {
+      if (std::strcmp(soc_version, soc_version_env) != 0) {
+        MS_LOG(DEBUG) << "Detected the env SOC_VERSION, so the SocVersion will be changed to " << str_soc_version_env
+                      << ".";
+        ret = rtSetSocVersion(soc_version_env);
+        if (ret != RT_ERROR_NONE) {
+          MS_LOG(EXCEPTION) << "SetSocVersion failed, errorno: " << ret;
+        }
+        version = soc_version_env;
+        return soc_version_env;
+      }
+    }
+    version = soc_version;
+  }
+  return version;
+}
+
+KernelPackPtr KernelMeta::GetKernelPack(const std::string &kernel_name, const bool is_akg) {
   KernelPackPtr ret = nullptr;
   // 1. pack has been created
   auto kernel_pack_iter = kernel_pack_map_.find(kernel_name);
@@ -253,10 +463,11 @@ KernelPackPtr KernelMeta::GetKernelPack(const std::string &kernel_name, const st
     ret = kernel_pack_iter->second;
   } else {
     // 2. kernel file has been create, but pack does not been created.
-    std::string cce_json = kCceKernelMeta;
-    (void)cce_json.append(kernel_name).append(kJsonSuffix);
+    auto config_path = TbeUtils::GetOpDebugPath();
+    std::string cce_json = is_akg ? ("./kernel_meta/" + kernel_name + kJsonSuffix)
+                                  : (config_path + kCceKernelMeta + kernel_name + kJsonSuffix);
     ret = std::make_shared<KernelPack>();
-    if (!ret->LoadKernelMeta(cce_json, processor)) {
+    if (!ret->LoadKernelMeta(cce_json)) {
       MS_LOG(INFO) << "Read cache json and bin file failed[" << cce_json << "]";
       return nullptr;
     }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_utils.h b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_utils.h
index 88c6baaea07..329721198a4 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_utils.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_utils.h
@@ -21,6 +21,7 @@
 #include <vector>
 #include <utility>
 #include <map>
+#include <tuple>
 #include <unordered_map>
 #include <nlohmann/json.hpp>
 
@@ -33,7 +34,6 @@ namespace kernel {
 namespace tbe {
 using std::string;
 using std::vector;
-
 class TbeUtils {
  public:
   TbeUtils() = default;
@@ -44,11 +44,28 @@ class TbeUtils {
 
   static void LoadCache();
 
+  static void GenLicInfo(nlohmann::json *lic_info_json);
+
   static void GenSocInfo(nlohmann::json *soc_info_json);
 
-  static KernelPackPtr SearchCache(const std::string &kernel_name, const std::string &processor);
+  static std::string GetSocVersion();
 
-  static KernelPackPtr InsertCache(const std::string &kernel_name, const std::string &processor);
+  static std::string GetOpDebugPath();
+
+  static std::string GetBankPath();
+
+  static std::string GetTuneDumpPath();
+
+  static void SaveCompileInfo(const std::string &json_name, const std::string &build_res, bool *save_flag);
+
+  static void GetCompileInfo(const AnfNodePtr &node, std::string *compile_info, bool *get_flag);
+
+  static bool CheckOfflineTune();
+
+  static KernelPackPtr SearchCache(const std::string &kernel_name, const bool is_akg = false);
+
+  static KernelPackPtr InsertCache(const std::string &kernel_name, const std::string &processor,
+                                   const bool is_akg = false);
 };
 
 struct KernelMetaInfo {
@@ -77,7 +94,7 @@ class KernelMeta {
  public:
   static KernelMeta *GetInstance();
   bool ReadIndex(const std::string &bin_dir);
-  KernelPackPtr GetKernelPack(const std::string &kernel_name, const std::string &processor);
+  KernelPackPtr GetKernelPack(const std::string &kernel_name, const bool is_akg = false);
 
  private:
   KernelMeta() = default;
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/ascend_backend_optimization.cc b/mindspore/ccsrc/backend/optimizer/ascend/ascend_backend_optimization.cc
index e7cb89eff63..39266199203 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/ascend_backend_optimization.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/ascend_backend_optimization.cc
@@ -38,6 +38,7 @@
 #include "backend/optimizer/ascend/ir_fusion/square_sum_fusion.h"
 #include "backend/optimizer/ascend/ir_fusion/clip_by_norm_no_div_square_sum_fusion.h"
 #include "backend/optimizer/ascend/ir_fusion/lamb_update_with_lr_rule_fusion.h"
+#include "backend/optimizer/ascend/ir_fusion/prelu_fusion.h"
 #include "backend/optimizer/ascend/ir_fusion/clip_by_value_fusion.h"
 #include "backend/optimizer/ascend/ir_fusion/confusion_softmax_grad_rule.h"
 #include "backend/optimizer/ascend/ir_fusion/lamb_next_mv_rule.h"
@@ -128,6 +129,7 @@
 #include "backend/optimizer/ascend/enhancer/add_placeholder_for_dynamic_gru.h"
 #include "backend/optimizer/ascend/enhancer/add_attr_for_3d_graph.h"
 #include "backend/optimizer/ascend/enhancer/split_n_optimizer.h"
+#include "backend/kernel_compiler/tbe/ascend_kernel_compile.h"
 #include "utils/ms_context.h"
 #include "utils/config_manager.h"
 #include "utils/context/graph_kernel_flags.h"
@@ -164,6 +166,7 @@ void AddAscendIRFusionRulesPass(PassManager *ir_fusion_pm) {
   ir_fusion_pm->AddPass(std::make_shared<ClipByNormNoDivSquareSumFusion>());
   ir_fusion_pm->AddPass(std::make_shared<SquareSumFusion>());
   ir_fusion_pm->AddPass(std::make_shared<ClipByValueFusion>());
+  ir_fusion_pm->AddPass(std::make_shared<PReluFusion>());
 }
 
 void AddAscendIRFusionPass(PassManager *ir_fusion_pm) {
@@ -322,8 +325,13 @@ void RunOpAscendBackendIRFusionOptimization(const std::shared_ptr<session::Kerne
   }
   auto optimizer = std::make_shared<GraphOptimizer>();
   auto ir_fusion_pm = std::make_shared<PassManager>("ir_fusion_pm");
+  ir_fusion_pm->AddPass(std::make_shared<InsertPlaceholderForDynamicRNN>());
+  ir_fusion_pm->AddPass(std::make_shared<DynamicGRUV2GradFission>());
+  ir_fusion_pm->AddPass(std::make_shared<InsertPlaceholderForDynamicGRUV2>());
+  ir_fusion_pm->AddPass(std::make_shared<DynamicRnnGradFissionV2>());
   ir_fusion_pm->AddPass(std::make_shared<SplitFission>());
   ir_fusion_pm->AddPass(std::make_shared<SplitVFission>());
+  ir_fusion_pm->AddPass(std::make_shared<ConcatFission>());
   ir_fusion_pm->AddPass(std::make_shared<BnSplit>());
   ir_fusion_pm->AddPass(std::make_shared<BnGradSplit>());
   ir_fusion_pm->AddPass(std::make_shared<LayerNormGradSplit>());
@@ -338,10 +346,6 @@ void RunOpAscendBackendIRFusionOptimization(const std::shared_ptr<session::Kerne
   ir_fusion_pm->AddPass(std::make_shared<AddnFission>());
   ir_fusion_pm->AddPass(std::make_shared<InsertPadForNMSWithMask>());
   ir_fusion_pm->AddPass(std::make_shared<TensorScatterUpdateFission>());
-  ir_fusion_pm->AddPass(std::make_shared<InsertPlaceholderForDynamicRNN>());
-  ir_fusion_pm->AddPass(std::make_shared<DynamicGRUV2GradFission>());
-  ir_fusion_pm->AddPass(std::make_shared<InsertPlaceholderForDynamicGRUV2>());
-  ir_fusion_pm->AddPass(std::make_shared<DynamicRnnGradFissionV2>());
   ir_fusion_pm->AddPass(std::make_shared<EraseVisitAttr>());
   ir_fusion_pm->AddPass(std::make_shared<BCEWithLogitsLossFission>());
 
@@ -382,6 +386,8 @@ void AscendBackendOptimization(const std::shared_ptr<session::KernelGraph> &kern
   // other optimization
   auto optimizer = std::make_shared<GraphOptimizer>();
   auto other_pm = std::make_shared<PassManager>("other_pm");
+  other_pm->AddPass(std::make_shared<SendFusion>());
+  other_pm->AddPass(std::make_shared<RecvFusion>());
   other_pm->AddPass(std::make_shared<AllReduceFusion>());
   other_pm->AddPass(std::make_shared<AllGatherFusion>());
   other_pm->AddPass(std::make_shared<ConcatOutputsForAllGather>());
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/ascend_helper.cc b/mindspore/ccsrc/backend/optimizer/ascend/ascend_helper.cc
index e0bb6aab830..95ea2527aa7 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/ascend_helper.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/ascend_helper.cc
@@ -197,7 +197,7 @@ AnfNodePtr AddTransOpNodeToGraph(const FuncGraphPtr &func_graph, const AnfNodePt
                                       : trans::IsNeedPadding(input_format, input_node_out_shape.size());
   if (!need_padding) {
     // don't need padding insert transdata only
-    trans_data = NewTransOpNode(func_graph, input_node, kernel_select, need_padding, prim::KPrimTransData->name());
+    trans_data = NewTransOpNode(func_graph, input_node, kernel_select, need_padding, prim::kPrimTransData->name());
     trans_node = trans_data;
   } else if (is_insert_input) {
     // if need padding & is input need insert a transdata
@@ -205,13 +205,13 @@ AnfNodePtr AddTransOpNodeToGraph(const FuncGraphPtr &func_graph, const AnfNodePt
     auto padding_shape = trans::PaddingShape(input_node_out_shape, AnfAlgo::GetInputFormat(node, insert_index),
                                              AnfAlgo::GetInputReshapeType(node, insert_index));
     auto reshape_node = CreateReshapeNode(func_graph, input_node, kernel_select, padding_shape);
-    trans_data = NewTransOpNode(func_graph, reshape_node, kernel_select, need_padding, prim::KPrimTransData->name());
+    trans_data = NewTransOpNode(func_graph, reshape_node, kernel_select, need_padding, prim::kPrimTransData->name());
     trans_node = trans_data;
     trans_data->set_abstract(input_node->abstract());
   } else {
     // if need padding & is output need insert a transdata
     // node -> transdata[padding shape] -> reshape[ori_shape]
-    trans_data = NewTransOpNode(func_graph, input_node, kernel_select, need_padding, prim::KPrimTransData->name());
+    trans_data = NewTransOpNode(func_graph, input_node, kernel_select, need_padding, prim::kPrimTransData->name());
     auto reshape_node = CreateReshapeNode(func_graph, trans_data, kernel_select, input_node_out_shape);
     trans_node = reshape_node;
   }
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/batchmatmul_fusedmuladd_fusion_pass.cc b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/batchmatmul_fusedmuladd_fusion_pass.cc
index 3a789bfb95c..31bcaeaa112 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/batchmatmul_fusedmuladd_fusion_pass.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/batchmatmul_fusedmuladd_fusion_pass.cc
@@ -28,12 +28,10 @@
 namespace mindspore {
 namespace opt {
 void BatchMatmulFusedMulAddFusionPass::MatchBatchMatmulFusedMulAdd(const CNodePtr &cnode,
-                                                                   const session::KernelGraph &kernel_graph,
+                                                                   const session::KernelGraph & /*kernel_graph*/,
                                                                    FusedNodeRecord *candidate_fusion) {
   MS_EXCEPTION_IF_NULL(cnode);
   MS_EXCEPTION_IF_NULL(candidate_fusion);
-  auto manager = kernel_graph.manager();
-  MS_EXCEPTION_IF_NULL(manager);
   auto batch_matmul = cnode->input(kIndex2);
   MS_EXCEPTION_IF_NULL(batch_matmul);
   if (batch_matmul->isa<CNode>() && AnfAlgo::CheckPrimitiveType(batch_matmul, prim::kPrimBatchMatMul)) {
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/bnupdate_eltwise_eltwise_fusion_pass.cc b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/bnupdate_eltwise_eltwise_fusion_pass.cc
index fa3629938ee..0d6d3279415 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/bnupdate_eltwise_eltwise_fusion_pass.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/bnupdate_eltwise_eltwise_fusion_pass.cc
@@ -33,8 +33,6 @@ void BnupdateEltwiseEltwiseFusionPass::MatchBnupdateAddRelu(const CNodePtr &cnod
                                                             FusedNodeRecord *candidate_fusion) {
   MS_EXCEPTION_IF_NULL(cnode);
   MS_EXCEPTION_IF_NULL(candidate_fusion);
-  auto manager = kernel_graph.manager();
-  MS_EXCEPTION_IF_NULL(manager);
   MS_EXCEPTION_IF_NULL(relu_input);
   auto add = relu_input->cast<CNodePtr>();
   MS_EXCEPTION_IF_NULL(add);
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/bnupdate_eltwise_fusion_pass.cc b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/bnupdate_eltwise_fusion_pass.cc
index cd5ddcdb660..7b914c4426c 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/bnupdate_eltwise_fusion_pass.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/bnupdate_eltwise_fusion_pass.cc
@@ -33,8 +33,6 @@ void BnupdateEltwiseFusionPass::MatchBnupdateDoubleOutputEltwise(const CNodePtr
                                                                  FusedNodeRecord *candidate_fusion) {
   MS_EXCEPTION_IF_NULL(cnode);
   MS_EXCEPTION_IF_NULL(candidate_fusion);
-  auto manager = kernel_graph.manager();
-  MS_EXCEPTION_IF_NULL(manager);
   MS_EXCEPTION_IF_NULL(eltwise_input);
   auto getitem = eltwise_input->cast<CNodePtr>();
   MS_EXCEPTION_IF_NULL(getitem);
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/conv2dbackprop_eltwise_fusion_pass.cc b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/conv2dbackprop_eltwise_fusion_pass.cc
index 05c29c4ab68..4465b11521f 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/conv2dbackprop_eltwise_fusion_pass.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/conv2dbackprop_eltwise_fusion_pass.cc
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace opt {
 void Conv2DBackpropEltwiseFusionPass::MatchConv2DBackpropInputEltwise(const CNodePtr &cnode,
-                                                                      const session::KernelGraph &,
+                                                                      const session::KernelGraph & /*kernel_graph*/,
                                                                       FusedNodeRecord *candidate_fusion) {
   MS_EXCEPTION_IF_NULL(cnode);
   MS_EXCEPTION_IF_NULL(candidate_fusion);
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/conv_bnreduce_fusion_pass.cc b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/conv_bnreduce_fusion_pass.cc
index 2c42b491b85..c4261dba50d 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/conv_bnreduce_fusion_pass.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/conv_bnreduce_fusion_pass.cc
@@ -33,8 +33,6 @@ void ConvBnReduceFusionPass::MatchConvBnreduce(const CNodePtr &cnode, const sess
                                                FusedNodeRecord *candidate_fusion) {
   MS_EXCEPTION_IF_NULL(cnode);
   MS_EXCEPTION_IF_NULL(candidate_fusion);
-  auto manager = kernel_graph.manager();
-  MS_EXCEPTION_IF_NULL(manager);
   auto conv = cnode->input(kIndex1);
   MS_EXCEPTION_IF_NULL(conv);
   if (conv->isa<CNode>() && AnfAlgo::GetCNodeName(conv) == prim::kPrimConv2D->name() &&
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/depthwiseconv_eltwise_fusion_pass.cc b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/depthwiseconv_eltwise_fusion_pass.cc
index cafa4366035..e8aa3fcbbbc 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/depthwiseconv_eltwise_fusion_pass.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/depthwiseconv_eltwise_fusion_pass.cc
@@ -29,12 +29,10 @@
 namespace mindspore {
 namespace opt {
 void DepthwiseConvEltwiseFusionPass::MatchDepthwiseConvRelu(const CNodePtr &cnode,
-                                                            const session::KernelGraph &kernel_graph,
+                                                            const session::KernelGraph & /*kernel_graph*/,
                                                             FusedNodeRecord *candidate_fusion, bool is_order) {
   MS_EXCEPTION_IF_NULL(cnode);
   MS_EXCEPTION_IF_NULL(candidate_fusion);
-  auto manager = kernel_graph.manager();
-  MS_EXCEPTION_IF_NULL(manager);
   if (is_order) {
     // DepthwiseConvolution--->Elemwise
     auto depthwise_conv = cnode->input(kIndex1);
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/fusion_base_pass.cc b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/fusion_base_pass.cc
index 343e79477e4..4a1ead9a83c 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/fusion_base_pass.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/fusion_base_pass.cc
@@ -32,9 +32,9 @@ bool FusionBasePass::CheckEltWiseNode(const session::KernelGraph &kernel_graph,
   }
   auto cnode = node->cast<CNodePtr>();
   MS_EXCEPTION_IF_NULL(cnode);
-  auto user_nodes = manager->node_users()[node];
+  size_t not_updatestate_nums = GetNotUpdateStateUserNums(kernel_graph, node);
   return AnfAlgo::GetKernelType(node) == KernelType::TBE_KERNEL &&
-         AnfAlgo::GetFusionType(node) == kernel::FusionType::ELEMWISE && user_nodes.size() == ELTWISE_USE &&
+         AnfAlgo::GetFusionType(node) == kernel::FusionType::ELEMWISE && not_updatestate_nums == ELTWISE_USE &&
          cnode->inputs().size() == ELTWISE_INPUT_SIZE;
 }
 
@@ -47,9 +47,9 @@ bool FusionBasePass::CheckDoubleInEltWiseNode(const session::KernelGraph &kernel
   }
   auto cnode = node->cast<CNodePtr>();
   MS_EXCEPTION_IF_NULL(cnode);
-  auto user_nodes = manager->node_users()[node];
+  size_t not_updatestate_nums = GetNotUpdateStateUserNums(kernel_graph, node);
   return AnfAlgo::GetKernelType(node) == KernelType::TBE_KERNEL &&
-         AnfAlgo::GetFusionType(node) == kernel::FusionType::ELEMWISE && user_nodes.size() == ELTWISE_USE &&
+         AnfAlgo::GetFusionType(node) == kernel::FusionType::ELEMWISE && not_updatestate_nums == ELTWISE_USE &&
          cnode->inputs().size() == ELTWISE_DOUBLE_IN_INPUT_SIZE;
 }
 
@@ -62,12 +62,27 @@ bool FusionBasePass::CheckMultiOutputEltWiseNode(const session::KernelGraph &ker
   }
   auto cnode = node->cast<CNodePtr>();
   MS_EXCEPTION_IF_NULL(cnode);
-  auto user_nodes = manager->node_users()[node];
+  size_t not_updatestate_nums = GetNotUpdateStateUserNums(kernel_graph, node);
   return AnfAlgo::GetKernelType(node) == KernelType::TBE_KERNEL &&
-         AnfAlgo::GetFusionType(node) == kernel::FusionType::ELEMWISE && user_nodes.size() == ELTWISE_MULTI_USE &&
+         AnfAlgo::GetFusionType(node) == kernel::FusionType::ELEMWISE && not_updatestate_nums == ELTWISE_MULTI_USE &&
          cnode->inputs().size() == ELTWISE_INPUT_SIZE;
 }
 
+size_t FusionBasePass::GetNotUpdateStateUserNums(const session::KernelGraph &kernel_graph, const AnfNodePtr &node) {
+  MS_EXCEPTION_IF_NULL(node);
+  auto manager = kernel_graph.manager();
+  MS_EXCEPTION_IF_NULL(manager);
+  auto user_nodes = manager->node_users()[node];
+  size_t not_updatestate_users = 0;
+  for (auto &user : user_nodes) {
+    auto user_node = user.first;
+    if (!AnfAlgo::CheckPrimitiveType(user_node, prim::kPrimUpdateState)) {
+      not_updatestate_users++;
+    }
+  }
+  return not_updatestate_users;
+}
+
 void FusionBasePass::SetRecordFusionId(const std::unordered_set<AnfNodePtr> &record) {
   auto id = fusion_id_allocator->AllocateFusionId();
   for (auto node : record) {
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/fusion_base_pass.h b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/fusion_base_pass.h
index de886357c8a..e9617da090e 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/fusion_base_pass.h
+++ b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/fusion_base_pass.h
@@ -70,6 +70,7 @@ class FusionBasePass : public Pass {
   bool CheckEltWiseNode(const session::KernelGraph &kernel_graph, const AnfNodePtr &node);
   bool CheckDoubleInEltWiseNode(const session::KernelGraph &kernel_graph, const AnfNodePtr &node);
   bool CheckMultiOutputEltWiseNode(const session::KernelGraph &kernel_graph, const AnfNodePtr &node);
+  size_t GetNotUpdateStateUserNums(const session::KernelGraph &kernel_graph, const AnfNodePtr &node);
   FusionIdAllocatorPtr fusion_id_allocator;
 };
 }  // namespace opt
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/matmul_confusiontranspose_fusion_pass.cc b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/matmul_confusiontranspose_fusion_pass.cc
index 1221fcaf24d..55c5ccb7bce 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/matmul_confusiontranspose_fusion_pass.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/matmul_confusiontranspose_fusion_pass.cc
@@ -28,12 +28,10 @@
 namespace mindspore {
 namespace opt {
 void MatmulConfusionTranposeFusionPass::MatchMatmulConfusionTranpose(const CNodePtr &cnode,
-                                                                     const session::KernelGraph &kernel_graph,
+                                                                     const session::KernelGraph & /*kernel_graph*/,
                                                                      FusedNodeRecord *candidate_fusion) {
   MS_EXCEPTION_IF_NULL(cnode);
   MS_EXCEPTION_IF_NULL(candidate_fusion);
-  auto manager = kernel_graph.manager();
-  MS_EXCEPTION_IF_NULL(manager);
   auto matmul = cnode->input(kIndex1);
   MS_EXCEPTION_IF_NULL(matmul);
   if (matmul->isa<CNode>() && (AnfAlgo::CheckPrimitiveType(matmul, prim::kPrimMatMul) ||
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/matmul_eltwise_fusion_pass.cc b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/matmul_eltwise_fusion_pass.cc
index 792937bf7d3..0d5616ae54a 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/matmul_eltwise_fusion_pass.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/matmul_eltwise_fusion_pass.cc
@@ -28,12 +28,10 @@
 namespace mindspore {
 namespace opt {
 void MatmulEltwiseFusionPass::MatchMatmulEltwise(const CNodePtr &cnode, const AnfNodePtr &relu_input,
-                                                 const session::KernelGraph &kernel_graph,
+                                                 const session::KernelGraph & /*kernel_graph*/,
                                                  FusedNodeRecord *candidate_fusion) {
   MS_EXCEPTION_IF_NULL(cnode);
   MS_EXCEPTION_IF_NULL(candidate_fusion);
-  auto manager = kernel_graph.manager();
-  MS_EXCEPTION_IF_NULL(manager);
   if (fusion_id_allocator->HasFusionIdAttr(relu_input)) {
     return;
   }
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/multi_output_fusion_pass.cc b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/multi_output_fusion_pass.cc
index 4142f297f25..c698377757f 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/multi_output_fusion_pass.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/multi_output_fusion_pass.cc
@@ -31,8 +31,6 @@ void MultiOutputFusionPass::MatchMultiOutputEltwise(const CNodePtr &cnode, const
                                                     FusedNodeRecord *candidate_fusion) {
   MS_EXCEPTION_IF_NULL(cnode);
   MS_EXCEPTION_IF_NULL(candidate_fusion);
-  auto manager = kernel_graph.manager();
-  MS_EXCEPTION_IF_NULL(manager);
   std::unordered_set<AnfNodePtr> record{cnode};
   auto eltwise_input = cnode->input(kIndex1);
   MS_EXCEPTION_IF_NULL(eltwise_input);
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/ub_pattern_fusion.cc b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/ub_pattern_fusion.cc
index d1c1037bf0e..9dd02f3b612 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/ub_pattern_fusion.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/ub_pattern_fusion.cc
@@ -21,6 +21,8 @@
 #include <memory>
 #include <string>
 #include <algorithm>
+#include "backend/kernel_compiler/tbe/tbe_convert_utils.h"
+#include "backend/kernel_compiler/tbe/ascend_kernel_compile.h"
 #include "backend/kernel_compiler/kernel_fusion.h"
 #include "debug/anf_ir_dump.h"
 #include "backend/session/anf_runtime_algorithm.h"
@@ -70,11 +72,11 @@ CNodePtr CreateFusionOp(const std::vector<AnfNodePtr> &inputs_list, const std::v
   MS_EXCEPTION_IF_NULL(fusion_op);
 
   std::vector<std::string> input_names;
-  for (uint8_t i = 0; i < inputs_list.size(); i++) {
+  for (size_t i = 0; i < inputs_list.size(); i++) {
     (void)input_names.emplace_back("input" + std::to_string(i));
   }
   std::vector<std::string> output_names;
-  for (uint8_t i = 0; i < outputs_list.size(); i++) {
+  for (size_t i = 0; i < outputs_list.size(); i++) {
     (void)output_names.emplace_back("output" + std::to_string(i));
   }
 
@@ -268,6 +270,22 @@ bool TupleGetitemNodeCompare(const AnfNodePtr &node1, const AnfNodePtr &node2) {
   return output_idx1 < output_idx2;
 }
 
+AnfNodePtr RemoveNodeFromUpdateState(session::KernelGraph *kernel_graph, const AnfNodePtr &node,
+                                     const AnfNodePtr &updatestate) {
+  MS_EXCEPTION_IF_NULL(kernel_graph);
+  MS_EXCEPTION_IF_NULL(node);
+  MS_EXCEPTION_IF_NULL(updatestate);
+  auto updatestate_cnode = updatestate->cast<CNodePtr>();
+  auto inputs = updatestate_cnode->inputs();
+  std::vector<AnfNodePtr> new_inputs;
+  std::copy_if(inputs.begin(), inputs.end(), std::back_inserter(new_inputs),
+               [node](const AnfNodePtr &input) { return node != input; });
+  auto new_updatestate = kernel_graph->NewCNode(new_inputs);
+  new_updatestate->set_scope(updatestate->scope());
+  new_updatestate->set_abstract(updatestate->abstract());
+  return new_updatestate;
+}
+
 void GetFusionScopeOutputNodeList(session::KernelGraph *kernel_graph,
                                   std::unordered_map<int64_t, BufferFusionInfo_t> *buffer_fusion_infos) {
   MS_EXCEPTION_IF_NULL(kernel_graph);
@@ -280,7 +298,15 @@ void GetFusionScopeOutputNodeList(session::KernelGraph *kernel_graph,
     const auto &fusion_info = buffer_fusion_info.second;
     for (const auto &node : fusion_info.anf_nodes) {
       if (AnfAlgo::GetOutputTensorNum(node) == 1) {
-        for (auto use_node : manager->node_users()[node]) {
+        auto use_nodes = manager->node_users()[node];
+        for (auto use_node : use_nodes) {
+          // Do not think of updatestate as real output,
+          // Ensuring normal fusion requires eliminating the node of the updatestate
+          if (AnfAlgo::CheckPrimitiveType(use_node.first, prim::kPrimUpdateState)) {
+            auto new_updatestate = RemoveNodeFromUpdateState(kernel_graph, node, use_node.first);
+            manager->Replace(use_node.first, new_updatestate);
+            continue;
+          }
           if (std::find(fusion_info.anf_nodes.begin(), fusion_info.anf_nodes.end(), use_node.first) ==
               fusion_info.anf_nodes.end()) {
             (*buffer_fusion_infos)[fusion_id].outputs_list.push_back(node);
@@ -290,7 +316,13 @@ void GetFusionScopeOutputNodeList(session::KernelGraph *kernel_graph,
       } else {
         int64_t prev_idx = 0;
         std::vector<AnfNodePtr> tuple_getitem_nodes;
-        for (auto &user : manager->node_users()[node]) {
+        auto users = manager->node_users()[node];
+        for (auto &user : users) {
+          if (AnfAlgo::CheckPrimitiveType(user.first, prim::kPrimUpdateState)) {
+            auto new_updatestate = RemoveNodeFromUpdateState(kernel_graph, node, user.first);
+            manager->Replace(user.first, new_updatestate);
+            continue;
+          }
           if (AnfAlgo::CheckPrimitiveType(user.first, prim::kPrimTupleGetItem)) {
             (void)tuple_getitem_nodes.emplace_back(user.first);
           }
@@ -432,7 +464,16 @@ bool UbPatternFusion::FuseBufferFusionPattern(session::KernelGraph *kernel_graph
         buffer_fusion_info.first, buffer_fusion_info.second.full_name, buffer_fusion_info.second.inputs_list,
         buffer_fusion_info.second.anf_nodes, buffer_fusion_info.second.outputs_list);
     });
-  auto kernel_mods = mindspore::kernel::KernelFusion(fusion_scope_infos);
+  std::map<int64_t, kernel::KernelModPtr> kernel_mods;
+  std::string old_build = common::GetEnv("MS_OLD_BUILD_PROCESS");
+  if (!old_build.empty()) {
+    kernel_mods = mindspore::kernel::KernelFusion(fusion_scope_infos);
+  } else if (!fusion_scope_infos.empty()) {
+    auto build_manager = kernel::ascend::AscendKernelCompileManager::GetInstance();
+    MS_EXCEPTION_IF_NULL(build_manager);
+    build_manager->ResetOldTask();
+    kernel_mods = build_manager->AscendFusionOpCompile(fusion_scope_infos);
+  }
   std::set<int64_t> fusion_ids;
   for (auto &buffer_fusion_info : buffer_fusion_infos) {
     MS_LOG(DEBUG) << "anf node size: " << buffer_fusion_info.second.anf_nodes.size()
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/format_type/convert_unsupported_transnode_to_aicpu.cc b/mindspore/ccsrc/backend/optimizer/ascend/format_type/convert_unsupported_transnode_to_aicpu.cc
index 22955f547d0..a0719876bd9 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/format_type/convert_unsupported_transnode_to_aicpu.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/format_type/convert_unsupported_transnode_to_aicpu.cc
@@ -34,7 +34,7 @@ const AnfNodePtr ConvertUnSupportNodeToAICPU::Process(const mindspore::FuncGraph
     return nullptr;
   }
   auto node_name = AnfAlgo::GetCNodeName(node);
-  if (node_name != prim::KPrimTransData->name() && node_name != prim::kPrimCast->name()) {
+  if (node_name != prim::kPrimTransData->name() && node_name != prim::kPrimCast->name()) {
     return nullptr;
   }
   auto kernel_builder_info = AnfAlgo::GetSelectKernelBuildInfo(node);
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/format_type/deal_ref_and_split_unsupported_transdata.cc b/mindspore/ccsrc/backend/optimizer/ascend/format_type/deal_ref_and_split_unsupported_transdata.cc
index b68c36a8354..6e6112f327c 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/format_type/deal_ref_and_split_unsupported_transdata.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/format_type/deal_ref_and_split_unsupported_transdata.cc
@@ -120,7 +120,7 @@ CNodePtr DealRefAndSpiltUnSupportedTransdata::AddAdditionalToRefOutput(const Fun
   // insert trans
   if (origin_format != cur_format && cur_shape.size() > 1) {
     auto kernel_select = std::make_shared<KernelSelect>();
-    final_node = NewTransOpNode(func_graph, final_node, kernel_select, false, prim::KPrimTransData->name());
+    final_node = NewTransOpNode(func_graph, final_node, kernel_select, false, prim::kPrimTransData->name());
     RefreshKernelBuildInfo(cur_format, origin_format, final_node, {}, cur_type);
     final_node = SplitTransdataIfNotSupported(func_graph, final_node);
     final_index = 0;
@@ -288,7 +288,7 @@ CNodePtr DealRefAndSpiltUnSupportedTransdata::SplitTransdataIfNotSupported(const
   builder_info_to_default->SetOutputsFormat({kOpFormat_DEFAULT});
   builder_info_to_special_foramt->SetInputsFormat({kOpFormat_DEFAULT});
   std::vector<AnfNodePtr> next_trans_node_inputs = {
-    NewValueNode(std::make_shared<Primitive>(prim::KPrimTransData->name())), cnode};
+    NewValueNode(std::make_shared<Primitive>(prim::kPrimTransData->name())), cnode};
   MS_EXCEPTION_IF_NULL(func_graph);
   auto next_trans_node = func_graph->NewCNode(next_trans_node_inputs);
   next_trans_node->set_abstract(cnode->abstract());
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/format_type/insert_transpose_for_dynamic_gru_v2.cc b/mindspore/ccsrc/backend/optimizer/ascend/format_type/insert_transpose_for_dynamic_gru_v2.cc
index 203ba3ee874..ca9da767f9b 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/format_type/insert_transpose_for_dynamic_gru_v2.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/format_type/insert_transpose_for_dynamic_gru_v2.cc
@@ -37,7 +37,7 @@ const BaseRef InsertTransposeForDynamicGRUV2::DefinePattern() const {
   MS_EXCEPTION_IF_NULL(X1);
   MS_EXCEPTION_IF_NULL(Xs);
   return VectorRef(
-    {prim::kPrimDynamicGRUV2, X1, VectorRef({prim::KPrimTransData, VectorRef({prim::kPrimReshape, X})}), Xs});
+    {prim::kPrimDynamicGRUV2, X1, VectorRef({prim::kPrimTransData, VectorRef({prim::kPrimReshape, X})}), Xs});
 }
 
 CNodePtr Insert(const FuncGraphPtr &func_graph, const CNodePtr &cnode) {
@@ -62,7 +62,7 @@ CNodePtr Insert(const FuncGraphPtr &func_graph, const CNodePtr &cnode) {
       RefreshKernelBuildInfo(input_format, kOpFormat_HWCN, new_transpose_node);
       // trans hwcn to output_format
       new_transdata_node =
-        NewTransOpNode(func_graph, new_transpose_node, kernel_select, false, prim::KPrimTransData->name());
+        NewTransOpNode(func_graph, new_transpose_node, kernel_select, false, prim::kPrimTransData->name());
       RefreshKernelBuildInfo(kOpFormat_HWCN, output_format, new_transdata_node, padding_axis);
       new_transdata_node->set_abstract(transdata_node->abstract());
       new_node = new_transdata_node;
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/ir_fission/dynamic_gru_v2_grad_fission.cc b/mindspore/ccsrc/backend/optimizer/ascend/ir_fission/dynamic_gru_v2_grad_fission.cc
index 100db27702e..e64934a1ce2 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/ir_fission/dynamic_gru_v2_grad_fission.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/ir_fission/dynamic_gru_v2_grad_fission.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
  */
 #include "backend/optimizer/ascend/ir_fission/dynamic_gru_v2_grad_fission.h"
 #include <vector>
+#include <map>
 #include <memory>
 #include <string>
 #include <algorithm>
@@ -26,70 +27,211 @@ namespace opt {
 namespace {
 constexpr size_t kDynamicGRUV2GradInputNum = 12;
 constexpr size_t kDynamicGRUV2GradOutputNum = 6;
-constexpr size_t kSplitVOutputNum = 2;
-constexpr size_t kGRUV2HiddenGradOutputNum = 3;
-constexpr size_t kConcatNum = 2;
+constexpr size_t kGRUV2HiddenGradCellOutputNum = 3;
 constexpr size_t kGateNum = 3;
 constexpr size_t k3Dims = 3;
+constexpr size_t kConcatNum = 2;
+constexpr size_t kSplitVOutputNum = 2;
+size_t t_size = 0;
+size_t batch_size = 0;
+size_t hidden_size = 0;
+size_t input_size = 0;
+TypeId dh_dtype = kNumberTypeFloat32;
 
-AnfNodePtr CreateGRUV2HiddenGradNode(const FuncGraphPtr &graph, const AnfNodePtr &node) {
-  MS_EXCEPTION_IF_NULL(graph);
-  MS_EXCEPTION_IF_NULL(node);
-  auto cnode = node->cast<CNodePtr>();
-  MS_EXCEPTION_IF_NULL(cnode);
-  const auto &dynamic_gru_v2_grad_inputs = cnode->inputs();
-  std::vector<AnfNodePtr> gru_v2_hidden_grad_inputs = {
-    NewValueNode(std::make_shared<Primitive>(kGRUV2HiddenGradOpName)),
-    dynamic_gru_v2_grad_inputs[kIndex3],
-    dynamic_gru_v2_grad_inputs[kIndex5],
-    dynamic_gru_v2_grad_inputs[kIndex6],
-    dynamic_gru_v2_grad_inputs[kIndex7],
-    dynamic_gru_v2_grad_inputs[kIndex8],
-    dynamic_gru_v2_grad_inputs[kIndex9],
-    dynamic_gru_v2_grad_inputs[kIndex10],
-    dynamic_gru_v2_grad_inputs[kIndex11],
-    dynamic_gru_v2_grad_inputs[kIndex12]};
+std::map<std::string, size_t> input_index = {
+  {"x", kIndex1},           {"weight_input", kIndex2}, {"weight_hidden", kIndex3},
+  {"y", kIndex4},           {"init_h", kIndex5},       {"h", kIndex6},
+  {"dy", kIndex7},          {"dh", kIndex8},           {"update", kIndex9},
+  {"reset", kIndex10},      {"new", kIndex11},         {"hidden_new", kIndex12},
+  {"seq_length", kIndex13}, {"mask", kIndex14}};
 
-  std::vector<AnfNodePtr> ori_outputs;
-  CreateMultipleOutputsOfAnfNode(graph, node, kDynamicGRUV2GradOutputNum, &ori_outputs);
-  auto gru_v2_hidden_grad_op = graph->NewCNode(gru_v2_hidden_grad_inputs);
-  MS_EXCEPTION_IF_NULL(gru_v2_hidden_grad_op);
-  auto h_dtype = AnfAlgo::GetOutputInferDataType(dynamic_gru_v2_grad_inputs[kIndex6], 0);
-  auto types = {h_dtype, h_dtype, h_dtype};
-  std::vector<size_t> dh_preh_shape = AnfAlgo::GetOutputInferShape(ori_outputs[kIndex5], 0);
-  std::vector<size_t> dgate_h_shape = {
-    AnfAlgo::GetOutputInferShape(dynamic_gru_v2_grad_inputs[kIndex6], 0)[kDim0],
-    AnfAlgo::GetOutputInferShape(dynamic_gru_v2_grad_inputs[kIndex6], 0)[kDim1],
-    kGateNum * AnfAlgo::GetOutputInferShape(dynamic_gru_v2_grad_inputs[kIndex6], 0)[kDim2]};
-  std::vector<size_t> dnx_t_shape = AnfAlgo::GetOutputInferShape(dynamic_gru_v2_grad_inputs[kIndex6], 0);
-  auto shapes = {dh_preh_shape, dgate_h_shape, dnx_t_shape};
-  AnfAlgo::SetOutputInferTypeAndShape(types, shapes, gru_v2_hidden_grad_op.get());
-  auto gate_order = AnfAlgo::GetNodeAttr<std::string>(cnode, "gate_order");
-  AnfAlgo::SetNodeAttr("gate_order", MakeValue(gate_order), gru_v2_hidden_grad_op);
-  return gru_v2_hidden_grad_op;
+std::map<std::string, size_t> output_index = {{"dw_input", kIndex0},  {"dw_hidden", kIndex1}, {"db_input", kIndex2},
+                                              {"db_hidden", kIndex3}, {"dx", kIndex4},        {"dh_prev", kIndex5}};
+
+std::map<std::string, size_t> hidden_grad_input_index = {
+  {"dh_pre_t", kIndex1}, {"h", kIndex2},     {"dy", kIndex3},  {"dh", kIndex4},
+  {"update", kIndex5},   {"reset", kIndex6}, {"new", kIndex7}, {"hidden_new", kIndex8}};
+
+std::map<std::string, size_t> hidden_grad_output_index = {
+  {"dh_prev", kIndex0}, {"dgate_h", kIndex1}, {"dnt_x", kIndex2}};
+
+AnfNodePtr CreateGRUV2HiddenGradCellNode(const FuncGraphPtr &func_graph, const CNodePtr &dynamic_gru_v2_grad_cnode,
+                                         const AnfNodePtr &last_gru_hidden_grad_node,
+                                         const AnfNodePtr &last_matmul_node, const std::string &gate_order,
+                                         const size_t cur_t) {
+  MS_EXCEPTION_IF_NULL(func_graph);
+  MS_EXCEPTION_IF_NULL(dynamic_gru_v2_grad_cnode);
+  const auto &dynamic_gru_v2_grad_inputs = dynamic_gru_v2_grad_cnode->inputs();
+  std::vector<AnfNodePtr> gru_v2_hidden_grad_cell_inputs = {
+    NewValueNode(std::make_shared<Primitive>(kGRUV2HiddenGradCellOpName))};
+  std::vector<AnfNodePtr> dynamic_gru_grad_outputs;
+  CreateMultipleOutputsOfAnfNode(func_graph, dynamic_gru_v2_grad_cnode, kDynamicGRUV2GradOutputNum,
+                                 &dynamic_gru_grad_outputs);
+  if (cur_t == 0) {
+    gru_v2_hidden_grad_cell_inputs.emplace_back(dynamic_gru_v2_grad_inputs[input_index["dh"]]);
+  } else {
+    MS_EXCEPTION_IF_NULL(last_gru_hidden_grad_node);
+    std::vector<AnfNodePtr> last_gru_hidden_grad_outputs;
+    CreateMultipleOutputsOfAnfNode(func_graph, last_gru_hidden_grad_node->cast<CNodePtr>(),
+                                   kGRUV2HiddenGradCellOutputNum, &last_gru_hidden_grad_outputs);
+    gru_v2_hidden_grad_cell_inputs.emplace_back(last_gru_hidden_grad_outputs[hidden_grad_output_index["dh_prev"]]);
+  }
+  if (cur_t < t_size - 1) {
+    gru_v2_hidden_grad_cell_inputs.emplace_back(dynamic_gru_v2_grad_inputs[input_index["h"]]);
+  } else {
+    gru_v2_hidden_grad_cell_inputs.emplace_back(dynamic_gru_v2_grad_inputs[input_index["init_h"]]);
+  }
+  gru_v2_hidden_grad_cell_inputs.emplace_back(dynamic_gru_v2_grad_inputs[input_index["dy"]]);
+  auto input_dh = dynamic_gru_v2_grad_inputs[input_index["dh"]];
+  dh_dtype = AnfAlgo::GetOutputInferDataType(input_dh, 0);
+  if (cur_t == 0) {
+    gru_v2_hidden_grad_cell_inputs.emplace_back(input_dh);
+  } else {
+    MS_EXCEPTION_IF_NULL(last_matmul_node);
+    gru_v2_hidden_grad_cell_inputs.emplace_back(last_matmul_node);
+  }
+  gru_v2_hidden_grad_cell_inputs.emplace_back(dynamic_gru_v2_grad_inputs[input_index["update"]]);
+  gru_v2_hidden_grad_cell_inputs.emplace_back(dynamic_gru_v2_grad_inputs[input_index["reset"]]);
+  gru_v2_hidden_grad_cell_inputs.emplace_back(dynamic_gru_v2_grad_inputs[input_index["new"]]);
+  gru_v2_hidden_grad_cell_inputs.emplace_back(dynamic_gru_v2_grad_inputs[input_index["hidden_new"]]);
+  auto gru_v2_hidden_grad_cell_op = func_graph->NewCNode(gru_v2_hidden_grad_cell_inputs);
+
+  std::vector<size_t> dh_prev_shape =
+    AnfAlgo::GetOutputInferShape(dynamic_gru_grad_outputs[output_index["dh_prev"]], 0);
+  std::vector<size_t> dgate_h_shape = {1, batch_size, kGateNum * hidden_size};
+  std::vector<size_t> dnt_x_shape = {1, batch_size, hidden_size};
+  AnfAlgo::SetOutputInferTypeAndShape({dh_dtype, dh_dtype, dh_dtype}, {dh_prev_shape, dgate_h_shape, dnt_x_shape},
+                                      gru_v2_hidden_grad_cell_op.get());
+  AnfAlgo::SetNodeAttr("t_state", MakeValue(SizeToLong(cur_t)), gru_v2_hidden_grad_cell_op);
+  AnfAlgo::SetNodeAttr("gate_order", MakeValue(gate_order), gru_v2_hidden_grad_cell_op);
+  return gru_v2_hidden_grad_cell_op;
 }
 
-AnfNodePtr CreateHSplitVDNode(const FuncGraphPtr &graph, const AnfNodePtr &node) {
-  MS_EXCEPTION_IF_NULL(graph);
-  MS_EXCEPTION_IF_NULL(node);
-  // SplitV
-  std::vector<AnfNodePtr> splitvd_input = {NewValueNode(std::make_shared<Primitive>(prim::kPrimSplitV->name())), node};
-  auto split_vd = graph->NewCNode(splitvd_input);
-  MS_EXCEPTION_IF_NULL(split_vd);
-  auto dtypes = {AnfAlgo::GetOutputInferDataType(node, 0), AnfAlgo::GetOutputInferDataType(node, 0)};
-  size_t t_size = AnfAlgo::GetOutputInferShape(node, 0)[kDim0];
-  size_t batch = AnfAlgo::GetOutputInferShape(node, 0)[kDim1];
-  size_t hidden_size = AnfAlgo::GetOutputInferShape(node, 0)[kDim2];
-  std::vector<size_t> shape = {t_size - IntToSize(1), batch, hidden_size};
-  std::vector<size_t> shape2 = {IntToSize(1), batch, hidden_size};
-  std::vector<std::vector<size_t>> shapes = {shape, shape2};
-  AnfAlgo::SetOutputInferTypeAndShape(dtypes, shapes, split_vd.get());
-  AnfAlgo::SetNodeAttr("split_dim", MakeValue(SizeToLong(kDim0)), split_vd);
-  AnfAlgo::SetNodeAttr("num_split", MakeValue(SizeToLong(kSplitVOutputNum)), split_vd);
-  std::vector<int64_t> size_splits = {SizeToLong(t_size - 1), SizeToLong(1)};
-  AnfAlgo::SetNodeAttr("size_splits", MakeValue(size_splits), split_vd);
-  AnfAlgo::SetNodeAttr("is_backend_insert", MakeValue(true), split_vd);
-  return split_vd;
+void AddTLoopNode(const FuncGraphPtr &func_graph, const CNodePtr &dynamic_gru_v2_grad_cnode,
+                  std::vector<std::vector<AnfNodePtr>> *result_nodes) {
+  MS_EXCEPTION_IF_NULL(func_graph);
+  MS_EXCEPTION_IF_NULL(dynamic_gru_v2_grad_cnode);
+  MS_EXCEPTION_IF_NULL(result_nodes);
+  std::string gate_order = "rzh";
+  if (AnfAlgo::HasNodeAttr("gate_order", dynamic_gru_v2_grad_cnode)) {
+    gate_order = AnfAlgo::GetNodeAttr<std::string>(dynamic_gru_v2_grad_cnode, "gate_order");
+  }
+  std::vector<AnfNodePtr> gru_hidden_grad_cells;
+  std::vector<AnfNodePtr> matmul_nodes;
+  AnfNodePtr last_hidden_grad_node = nullptr;
+  AnfNodePtr last_matmul_node = nullptr;
+  const auto &dynamic_gru_v2_grad_inputs = dynamic_gru_v2_grad_cnode->inputs();
+  for (size_t i = 0; i < t_size; ++i) {
+    // Create gru_hidden_grad_cell
+    auto gru_hidden_grad_cell_node = CreateGRUV2HiddenGradCellNode(
+      func_graph, dynamic_gru_v2_grad_cnode, last_hidden_grad_node, last_matmul_node, gate_order, i);
+    // add matmul node
+    std::vector<AnfNodePtr> matmul_inputs = {NewValueNode(std::make_shared<Primitive>(kBatchMatMulOpName))};
+    auto gru_hidden_grad_cnode = gru_hidden_grad_cell_node->cast<CNodePtr>();
+    std::vector<AnfNodePtr> hidden_grad_outputs;
+    CreateMultipleOutputsOfAnfNode(func_graph, gru_hidden_grad_cnode, kGRUV2HiddenGradCellOutputNum,
+                                   &hidden_grad_outputs);
+    auto dgate_h = hidden_grad_outputs[hidden_grad_output_index["dgate_h"]];
+    matmul_inputs.emplace_back(dgate_h);
+    auto weight_hidden = dynamic_gru_v2_grad_inputs[input_index["weight_hidden"]];
+    std::vector<AnfNodePtr> reshape_inputs = {NewValueNode(std::make_shared<Primitive>(prim::kPrimReshape->name())),
+                                              weight_hidden};
+    auto reshape = func_graph->NewCNode(reshape_inputs);
+    auto reshape_out_shape = {IntToSize(1), AnfAlgo::GetOutputInferShape(weight_hidden, 0)[0],
+                              AnfAlgo::GetOutputInferShape(weight_hidden, 0)[1]};
+    AnfAlgo::SetOutputInferTypeAndShape({dh_dtype}, {reshape_out_shape}, reshape.get());
+    matmul_inputs.emplace_back(reshape);
+    auto matmul_node = func_graph->NewCNode(matmul_inputs);
+    MS_EXCEPTION_IF_NULL(matmul_node);
+    std::vector<size_t> out_shape = {1, batch_size, hidden_size};
+    AnfAlgo::SetOutputInferTypeAndShape({dh_dtype}, {out_shape}, matmul_node.get());
+    AnfAlgo::SetNodeAttr("transpose_x1", MakeValue(false), matmul_node);
+    AnfAlgo::SetNodeAttr("transpose_x2", MakeValue(true), matmul_node);
+
+    last_hidden_grad_node = gru_hidden_grad_cell_node;
+    last_matmul_node = matmul_node;
+    gru_hidden_grad_cells.emplace_back(gru_hidden_grad_cell_node);
+    matmul_nodes.emplace_back(matmul_node);
+  }
+  // Add last GRUV2HiddenGradCell node
+  auto gru_hidden_grad_cell_node = CreateGRUV2HiddenGradCellNode(
+    func_graph, dynamic_gru_v2_grad_cnode, last_hidden_grad_node, last_matmul_node, gate_order, t_size);
+  gru_hidden_grad_cells.emplace_back(gru_hidden_grad_cell_node);
+  result_nodes->emplace_back(gru_hidden_grad_cells);
+  result_nodes->emplace_back(matmul_nodes);
+}
+
+AnfNodePtr AddTConcatNode(const FuncGraphPtr &func_graph, const std::vector<AnfNodePtr> &gru_hidden_grad_nodes,
+                          size_t concat_output_index) {
+  MS_EXCEPTION_IF_NULL(func_graph);
+  std::vector<AnfNodePtr> concat_inputs = {NewValueNode(std::make_shared<Primitive>(prim::kPrimConcat->name()))};
+  for (size_t i = 0; i < t_size; i++) {
+    auto gru_hidden_grad_node_i = gru_hidden_grad_nodes[t_size - 1 - i];
+    MS_EXCEPTION_IF_NULL(gru_hidden_grad_node_i);
+    std::vector<AnfNodePtr> gru_hidden_grad_node_outputs;
+    CreateMultipleOutputsOfAnfNode(func_graph, gru_hidden_grad_node_i, kGRUV2HiddenGradCellOutputNum,
+                                   &gru_hidden_grad_node_outputs);
+    concat_inputs.emplace_back(gru_hidden_grad_node_outputs[concat_output_index]);
+  }
+  auto concat_t_node = func_graph->NewCNode(concat_inputs);
+  auto out_dims = AnfAlgo::GetOutputInferShape(gru_hidden_grad_nodes[kIndex0], concat_output_index);
+  std::vector<size_t> concat_output_shape = {t_size, out_dims[kDim1], out_dims[kDim2]};
+  auto out_type = AnfAlgo::GetOutputInferDataType(gru_hidden_grad_nodes[kIndex0], concat_output_index);
+  AnfAlgo::SetOutputInferTypeAndShape({out_type}, {concat_output_shape}, concat_t_node.get());
+  AnfAlgo::SetNodeAttr(kAttrN, MakeValue(SizeToLong(t_size)), concat_t_node);
+  AnfAlgo::SetNodeAttr(kAttrDynInputSizes, MakeValue(std::vector<int64_t>{SizeToLong(t_size)}), concat_t_node);
+  AnfAlgo::SetNodeAttr(kAttrAxis, MakeValue(static_cast<int64_t>(0)), concat_t_node);
+  return concat_t_node;
+}
+
+std::vector<AnfNodePtr> AddGRUHiddenGradNode(const FuncGraphPtr &func_graph,
+                                             const CNodePtr &dynamic_gru_v2_grad_cnode) {
+  MS_EXCEPTION_IF_NULL(func_graph);
+  MS_EXCEPTION_IF_NULL(dynamic_gru_v2_grad_cnode);
+  std::vector<AnfNodePtr> result;
+  std::vector<std::vector<AnfNodePtr>> result_nodes;
+  // add loop t hidden grad nodes; [[hidden_grad_nodes] [matmul_nodes]]
+  AddTLoopNode(func_graph, dynamic_gru_v2_grad_cnode, &result_nodes);
+  if (result_nodes.empty() || result_nodes[0].empty()) {
+    MS_LOG(EXCEPTION) << "result_node is empty, DynamicGRUGrad fission failed.";
+  }
+  auto gru_hidden_grad_nodes = result_nodes[kIndex0];
+  result.emplace_back(gru_hidden_grad_nodes[gru_hidden_grad_nodes.size() - 1]);
+  if (t_size > 1) {
+    // add dnt_x concat node [t_size, batch_size, hidden_size]
+    auto dnt_x_concat_t_node = AddTConcatNode(func_graph, gru_hidden_grad_nodes, hidden_grad_output_index["dnt_x"]);
+    // add dgate_h concat node [t_size, batch_size, 3 * hidden_size]
+    auto dgate_h_concat_t_node = AddTConcatNode(func_graph, gru_hidden_grad_nodes, hidden_grad_output_index["dgate_h"]);
+    result.emplace_back(dgate_h_concat_t_node);
+    result.emplace_back(dnt_x_concat_t_node);
+  } else {
+    auto node = result_nodes[kIndex0][kIndex0];
+    result.emplace_back(node);
+    result.emplace_back(node);
+  }
+  return result;
+}
+
+AnfNodePtr AddHSplitNode(const FuncGraphPtr &func_graph, const CNodePtr &dynamic_gru_v2_grad_cnode) {
+  MS_EXCEPTION_IF_NULL(func_graph);
+  MS_EXCEPTION_IF_NULL(dynamic_gru_v2_grad_cnode);
+  auto input_h = dynamic_gru_v2_grad_cnode->input(input_index["h"]);
+  std::vector<AnfNodePtr> splitv_input = {NewValueNode(std::make_shared<Primitive>(prim::kPrimSplitV->name())),
+                                          input_h};
+  auto split_v = func_graph->NewCNode(splitv_input);
+  // Set infer data type and shape
+  auto dtypes = {AnfAlgo::GetOutputInferDataType(input_h, 0), AnfAlgo::GetOutputInferDataType(input_h, 0)};
+  std::vector<size_t> output1_shape = {t_size - 1, batch_size, hidden_size};
+  std::vector<size_t> output2_shape = {1, batch_size, hidden_size};
+  std::vector<int64_t> split_list = {SizeToLong(t_size - 1), 1};
+  std::vector<std::vector<size_t>> shapes = {output1_shape, output2_shape};
+  AnfAlgo::SetOutputInferTypeAndShape(dtypes, shapes, split_v.get());
+  // Set attr
+  AnfAlgo::SetNodeAttr(kAttrSplitDim, MakeValue(SizeToLong(0)), split_v);
+  AnfAlgo::SetNodeAttr(kAttrNumSplit, MakeValue(SizeToLong(kSplitVOutputNum)), split_v);
+  AnfAlgo::SetNodeAttr(kAttrSizeSplits, MakeValue(split_list), split_v);
+  AnfAlgo::SetNodeAttr("is_backend_insert", MakeValue(true), split_v);
+  return split_v;
 }
 
 AnfNodePtr CreateHReshape(const FuncGraphPtr &graph, const AnfNodePtr &node) {
@@ -111,115 +253,51 @@ AnfNodePtr CreateHReshape(const FuncGraphPtr &graph, const AnfNodePtr &node) {
   return reshape;
 }
 
-AnfNodePtr CreateHConcatDNode(const FuncGraphPtr &graph, const AnfNodePtr &node1, const AnfNodePtr &node2) {
-  MS_EXCEPTION_IF_NULL(graph);
-  MS_EXCEPTION_IF_NULL(node1);
-  MS_EXCEPTION_IF_NULL(node2);
-  std::vector<AnfNodePtr> ori_outputs;
-  CreateMultipleOutputsOfAnfNode(graph, node2, kSplitVOutputNum, &ori_outputs);
-  auto reshape = CreateHReshape(graph, node1);
-
-  std::vector<AnfNodePtr> concat_inputs = {NewValueNode(std::make_shared<Primitive>(prim::kPrimConcat->name())),
-                                           reshape, ori_outputs[kIndex0]};
-  auto concat_op = graph->NewCNode(concat_inputs);
-  MS_EXCEPTION_IF_NULL(concat_op);
-
-  std::vector<size_t> shape = {AnfAlgo::GetOutputInferShape(node2, 0)[kDim0] + 1,
-                               AnfAlgo::GetOutputInferShape(node2, 0)[kDim1],
-                               AnfAlgo::GetOutputInferShape(node2, 0)[kDim2]};
-  auto types = {AnfAlgo::GetOutputInferDataType(node2, 0)};
-  AnfAlgo::SetOutputInferTypeAndShape(types, {shape}, concat_op.get());
-  AnfAlgo::SetNodeAttr(kAttrN, MakeValue(SizeToLong(kConcatNum)), concat_op);
-  AnfAlgo::SetNodeAttr(kAttrDynInputSizes, MakeValue(std::vector<int64_t>{2}), concat_op);
-  AnfAlgo::SetNodeAttr(kAttrAxis, MakeValue(SizeToLong(0)), concat_op);
-  AnfAlgo::SetNodeAttr("is_backend_insert", MakeValue(true), concat_op);
-  return concat_op;
+AnfNodePtr AddHConcatNode(const FuncGraphPtr &func_graph, const CNodePtr &dynamic_gru_v2_grad_cnode,
+                          const AnfNodePtr &splitv) {
+  MS_EXCEPTION_IF_NULL(func_graph);
+  MS_EXCEPTION_IF_NULL(dynamic_gru_v2_grad_cnode);
+  MS_EXCEPTION_IF_NULL(splitv);
+  // Create node
+  std::vector<AnfNodePtr> splitv_outputs;
+  CreateMultipleOutputsOfAnfNode(func_graph, splitv, kSplitVOutputNum, &splitv_outputs);
+  if (splitv_outputs.size() != kSplitVOutputNum) {
+    MS_LOG(EXCEPTION) << "Create outputs of node " << splitv->DebugString() << " failed"
+                      << " trace: " << trace::DumpSourceLines(splitv);
+  }
+  std::vector<AnfNodePtr> concat_inputs = {NewValueNode(std::make_shared<Primitive>(prim::kPrimConcat->name()))};
+  auto init_h_reshape = CreateHReshape(func_graph, dynamic_gru_v2_grad_cnode->input(input_index["init_h"]));
+  concat_inputs.emplace_back(init_h_reshape);
+  concat_inputs.emplace_back(splitv_outputs[kIndex0]);
+  auto concat = func_graph->NewCNode(concat_inputs);
+  // Set infer data type and shape
+  std::vector<size_t> output_shape = {t_size, batch_size, hidden_size};
+  AnfAlgo::SetOutputInferTypeAndShape({AnfAlgo::GetOutputInferDataType(init_h_reshape, 0)}, {output_shape},
+                                      concat.get());
+  // Set attr
+  AnfAlgo::SetNodeAttr(kAttrN, MakeValue(SizeToLong(kConcatNum)), concat);
+  AnfAlgo::SetNodeAttr(kAttrDynInputSizes, MakeValue(std::vector<int64_t>{kConcatNum}), concat);
+  AnfAlgo::SetNodeAttr(kAttrAxis, MakeValue(SizeToLong(0)), concat);
+  AnfAlgo::SetNodeAttr("is_backend_insert", MakeValue(true), concat);
+  return concat;
 }
 
-AnfNodePtr CreateDgateHSplitVDNode(const FuncGraphPtr &graph, const AnfNodePtr &node) {
-  MS_EXCEPTION_IF_NULL(graph);
+AnfNodePtr AddDwhMatmulNode(const FuncGraphPtr &func_graph, const AnfNodePtr &dgate_h, const AnfNodePtr &node) {
+  MS_EXCEPTION_IF_NULL(func_graph);
+  MS_EXCEPTION_IF_NULL(dgate_h);
   MS_EXCEPTION_IF_NULL(node);
-  // SplitV
-  std::vector<AnfNodePtr> splitvd_input = {NewValueNode(std::make_shared<Primitive>(prim::kPrimSplitV->name())), node};
-  auto split_vd = graph->NewCNode(splitvd_input);
-  MS_EXCEPTION_IF_NULL(split_vd);
-  auto dtypes = {AnfAlgo::GetOutputInferDataType(node, 0), AnfAlgo::GetOutputInferDataType(node, 0)};
-  size_t t_size = AnfAlgo::GetOutputInferShape(node, 0)[kDim0];
-  size_t batch = AnfAlgo::GetOutputInferShape(node, 0)[kDim1];
-  size_t hidden_size = AnfAlgo::GetOutputInferShape(node, 0)[kDim2] / kGateNum;
-  std::vector<size_t> shape = {t_size, batch, hidden_size << 1};
-  std::vector<size_t> shape2 = {t_size, batch, hidden_size};
-  std::vector<std::vector<size_t>> shapes = {shape, shape2};
-  AnfAlgo::SetOutputInferTypeAndShape(dtypes, shapes, split_vd.get());
-  AnfAlgo::SetNodeAttr("split_dim", MakeValue(SizeToLong(kDim2)), split_vd);
-  AnfAlgo::SetNodeAttr("num_split", MakeValue(SizeToLong(kSplitVOutputNum)), split_vd);
-  std::vector<int64_t> size_splits = {SizeToLong(hidden_size + hidden_size), SizeToLong(hidden_size)};
-  AnfAlgo::SetNodeAttr("size_splits", MakeValue(size_splits), split_vd);
-  AnfAlgo::SetNodeAttr("is_backend_insert", MakeValue(true), split_vd);
-  return split_vd;
-}
-
-AnfNodePtr CreateDgateXConcatDNode(const FuncGraphPtr &graph, const AnfNodePtr &node1, const AnfNodePtr &node2) {
-  MS_EXCEPTION_IF_NULL(graph);
-  // node1: dgate_h_split
-  // node2: dnt_x
-  MS_EXCEPTION_IF_NULL(node1);
-  MS_EXCEPTION_IF_NULL(node2);
-  std::vector<AnfNodePtr> ori_outputs;
-  CreateMultipleOutputsOfAnfNode(graph, node1, kSplitVOutputNum, &ori_outputs);
-
-  // ConcatD
-  std::vector<AnfNodePtr> concat_inputs = {NewValueNode(std::make_shared<Primitive>(prim::kPrimConcat->name())),
-                                           ori_outputs[kIndex0], node2};
-  auto concat_op = graph->NewCNode(concat_inputs);
-  MS_EXCEPTION_IF_NULL(concat_op);
-  std::vector<size_t> shape = {
-    AnfAlgo::GetOutputInferShape(node2, 0)[kDim0], AnfAlgo::GetOutputInferShape(node2, 0)[kDim1],
-    AnfAlgo::GetOutputInferShape(node1, 0)[kDim2] + AnfAlgo::GetOutputInferShape(node2, 0)[kDim2]};
-  auto types = {AnfAlgo::GetOutputInferDataType(node2, 0)};
-  AnfAlgo::SetOutputInferTypeAndShape(types, {shape}, concat_op.get());
-  AnfAlgo::SetNodeAttr(kAttrN, MakeValue(SizeToLong(kConcatNum)), concat_op);
-  AnfAlgo::SetNodeAttr(kAttrDynInputSizes, MakeValue(std::vector<int64_t>{2}), concat_op);
-  AnfAlgo::SetNodeAttr(kAttrAxis, MakeValue(SizeToLong(kDim2)), concat_op);
-  AnfAlgo::SetNodeAttr("is_backend_insert", MakeValue(true), concat_op);
-  return concat_op;
-}
-
-AnfNodePtr CreateWBroadcastToDNode(const FuncGraphPtr &graph, const AnfNodePtr &node1, const AnfNodePtr &node2) {
-  MS_EXCEPTION_IF_NULL(graph);
-  // node1 : input node
-  // node2 : orign_input x
-  MS_EXCEPTION_IF_NULL(node1);
-  MS_EXCEPTION_IF_NULL(node2);
-  // BroadcastTo
-  std::vector<AnfNodePtr> braodcast_to_input = {NewValueNode(std::make_shared<Primitive>(kBroadcastToOpName)), node1};
-  auto broadcast_to_d = graph->NewCNode(braodcast_to_input);
-  MS_EXCEPTION_IF_NULL(broadcast_to_d);
-  size_t t_size = AnfAlgo::GetOutputInferShape(node2, 0)[kDim0];
-  size_t batch = AnfAlgo::GetOutputInferShape(node1, 0)[kDim0];
-  size_t gate_size = AnfAlgo::GetOutputInferShape(node1, 0)[kDim1];
-  std::vector<size_t> shape = {t_size, batch, gate_size};
-  auto type = {AnfAlgo::GetOutputInferDataType(node1, 0)};
-  AnfAlgo::SetOutputInferTypeAndShape(type, {shape}, broadcast_to_d.get());
-
-  std::vector<int64_t> attr_shape = {SizeToLong(t_size), SizeToLong(batch), SizeToLong(gate_size)};
-  AnfAlgo::SetNodeAttr(kAttrShape, MakeValue(attr_shape), broadcast_to_d);
-  AnfAlgo::SetNodeAttr("is_backend_insert", MakeValue(true), broadcast_to_d);
-  return broadcast_to_d;
-}
-
-AnfNodePtr CreateDhxBatchMatMul(const FuncGraphPtr &graph, const AnfNodePtr &node1, const AnfNodePtr &node2) {
-  MS_EXCEPTION_IF_NULL(graph);
-  MS_EXCEPTION_IF_NULL(node1);
-  MS_EXCEPTION_IF_NULL(node2);
   // BatchMatMul
-  std::vector<AnfNodePtr> matmul_inputs = {NewValueNode(std::make_shared<Primitive>(prim::kPrimBatchMatMul->name())),
-                                           node1, node2};
-  auto batch_matmul = graph->NewCNode(matmul_inputs);
-  MS_EXCEPTION_IF_NULL(batch_matmul);
-  std::vector<size_t> shape = {AnfAlgo::GetOutputInferShape(node1, 0)[kDim0],
-                               AnfAlgo::GetOutputInferShape(node1, 0)[kDim2],
-                               AnfAlgo::GetOutputInferShape(node2, 0)[kDim2]};
+  std::vector<AnfNodePtr> matmul_inputs = {NewValueNode(std::make_shared<Primitive>(prim::kPrimBatchMatMul->name()))};
+  matmul_inputs.emplace_back(node);
+  if (t_size == 1) {
+    std::vector<AnfNodePtr> dgate_h_outputs;
+    CreateMultipleOutputsOfAnfNode(func_graph, dgate_h, kGRUV2HiddenGradCellOutputNum, &dgate_h_outputs);
+    matmul_inputs.emplace_back(dgate_h_outputs[hidden_grad_output_index["dgate_h"]]);
+  } else {
+    matmul_inputs.emplace_back(dgate_h);
+  }
+  auto batch_matmul = func_graph->NewCNode(matmul_inputs);
+  std::vector<size_t> shape = {t_size, hidden_size, kGateNum * hidden_size};
   AnfAlgo::SetOutputInferTypeAndShape({kNumberTypeFloat16}, {shape}, batch_matmul.get());
   AnfAlgo::SetNodeAttr("transpose_x1", MakeValue(true), batch_matmul);
   AnfAlgo::SetNodeAttr("transpose_x2", MakeValue(false), batch_matmul);
@@ -227,7 +305,58 @@ AnfNodePtr CreateDhxBatchMatMul(const FuncGraphPtr &graph, const AnfNodePtr &nod
   return batch_matmul;
 }
 
-AnfNodePtr CreateDwhBatchMatMul(const FuncGraphPtr &graph, const AnfNodePtr &node1, const AnfNodePtr &node2) {
+AnfNodePtr CreateDgateHSplitVDNode(const FuncGraphPtr &func_graph, const AnfNodePtr &dgate_h) {
+  MS_EXCEPTION_IF_NULL(func_graph);
+  MS_EXCEPTION_IF_NULL(dgate_h);
+  std::vector<AnfNodePtr> splitvd_input = {NewValueNode(std::make_shared<Primitive>(prim::kPrimSplitV->name()))};
+  if (t_size == 1) {
+    std::vector<AnfNodePtr> dgate_h_outputs;
+    CreateMultipleOutputsOfAnfNode(func_graph, dgate_h, kGRUV2HiddenGradCellOutputNum, &dgate_h_outputs);
+    splitvd_input.emplace_back(dgate_h_outputs[hidden_grad_output_index["dgate_h"]]);
+  } else {
+    splitvd_input.emplace_back(dgate_h);
+  }
+  auto split_vd = func_graph->NewCNode(splitvd_input);
+  auto dtypes = {AnfAlgo::GetOutputInferDataType(dgate_h, 0), AnfAlgo::GetOutputInferDataType(dgate_h, 0)};
+  std::vector<size_t> shape = {t_size, batch_size, hidden_size << 1};
+  std::vector<size_t> shape2 = {t_size, batch_size, hidden_size};
+  std::vector<std::vector<size_t>> shapes = {shape, shape2};
+  AnfAlgo::SetOutputInferTypeAndShape(dtypes, shapes, split_vd.get());
+  AnfAlgo::SetNodeAttr("split_dim", MakeValue(SizeToLong(kDim2)), split_vd);
+  AnfAlgo::SetNodeAttr("num_split", MakeValue(SizeToLong(kSplitVOutputNum)), split_vd);
+  std::vector<int64_t> size_splits = {SizeToLong(hidden_size << 1), SizeToLong(hidden_size)};
+  AnfAlgo::SetNodeAttr("size_splits", MakeValue(size_splits), split_vd);
+  AnfAlgo::SetNodeAttr("is_backend_insert", MakeValue(true), split_vd);
+  return split_vd;
+}
+
+AnfNodePtr CreateDgateXConcatDNode(const FuncGraphPtr &func_graph, const AnfNodePtr &split, const AnfNodePtr &dnt_x) {
+  MS_EXCEPTION_IF_NULL(func_graph);
+  MS_EXCEPTION_IF_NULL(split);
+  MS_EXCEPTION_IF_NULL(dnt_x);
+  std::vector<AnfNodePtr> split_outputs;
+  CreateMultipleOutputsOfAnfNode(func_graph, split, kSplitVOutputNum, &split_outputs);
+  std::vector<AnfNodePtr> concat_inputs = {NewValueNode(std::make_shared<Primitive>(prim::kPrimConcat->name())),
+                                           split_outputs[kIndex0]};
+  if (t_size == 1) {
+    std::vector<AnfNodePtr> dnt_x_outputs;
+    CreateMultipleOutputsOfAnfNode(func_graph, dnt_x, kGRUV2HiddenGradCellOutputNum, &dnt_x_outputs);
+    concat_inputs.emplace_back(dnt_x_outputs[hidden_grad_output_index["dnt_x"]]);
+  } else {
+    concat_inputs.emplace_back(dnt_x);
+  }
+  auto concat_op = func_graph->NewCNode(concat_inputs);
+  std::vector<size_t> shape = {t_size, batch_size, kGateNum * hidden_size};
+  auto types = {AnfAlgo::GetOutputInferDataType(dnt_x, 0)};
+  AnfAlgo::SetOutputInferTypeAndShape(types, {shape}, concat_op.get());
+  AnfAlgo::SetNodeAttr(kAttrN, MakeValue(SizeToLong(kConcatNum)), concat_op);
+  AnfAlgo::SetNodeAttr(kAttrDynInputSizes, MakeValue(std::vector<int64_t>{kConcatNum}), concat_op);
+  AnfAlgo::SetNodeAttr(kAttrAxis, MakeValue(SizeToLong(kDim2)), concat_op);
+  AnfAlgo::SetNodeAttr("is_backend_insert", MakeValue(true), concat_op);
+  return concat_op;
+}
+
+AnfNodePtr CreateDwxBatchMatMul(const FuncGraphPtr &graph, const AnfNodePtr &node1, const AnfNodePtr &node2) {
   MS_EXCEPTION_IF_NULL(graph);
   MS_EXCEPTION_IF_NULL(node1);
   MS_EXCEPTION_IF_NULL(node2);
@@ -236,26 +365,57 @@ AnfNodePtr CreateDwhBatchMatMul(const FuncGraphPtr &graph, const AnfNodePtr &nod
                                            node1, node2};
   auto batch_matmul = graph->NewCNode(matmul_inputs);
   MS_EXCEPTION_IF_NULL(batch_matmul);
-  std::vector<size_t> shape = {AnfAlgo::GetOutputInferShape(node1, 0)[kDim0],
-                               AnfAlgo::GetOutputInferShape(node1, 0)[kDim1],
-                               AnfAlgo::GetOutputInferShape(node2, 0)[kDim1]};
-  AnfAlgo::SetOutputInferTypeAndShape({kNumberTypeFloat16}, {shape}, batch_matmul.get());
+  std::vector<size_t> shape = {t_size, input_size, kGateNum * hidden_size};
+  AnfAlgo::SetOutputInferTypeAndShape({dh_dtype}, {shape}, batch_matmul.get());
+  AnfAlgo::SetNodeAttr("transpose_x1", MakeValue(true), batch_matmul);
+  AnfAlgo::SetNodeAttr("transpose_x2", MakeValue(false), batch_matmul);
+  AnfAlgo::SetNodeAttr("is_backend_insert", MakeValue(true), batch_matmul);
+  return batch_matmul;
+}
+
+AnfNodePtr CreateDxtBatchMatMul(const FuncGraphPtr &func_graph, const AnfNodePtr &dgate_concat,
+                                const AnfNodePtr &weight_input, const AnfNodePtr &dx) {
+  MS_EXCEPTION_IF_NULL(func_graph);
+  MS_EXCEPTION_IF_NULL(dgate_concat);
+  MS_EXCEPTION_IF_NULL(weight_input);
+  MS_EXCEPTION_IF_NULL(dx);
+  std::vector<AnfNodePtr> matmul_inputs = {NewValueNode(std::make_shared<Primitive>(prim::kPrimBatchMatMul->name())),
+                                           dgate_concat, weight_input};
+  auto batch_matmul = func_graph->NewCNode(matmul_inputs);
+  MS_EXCEPTION_IF_NULL(batch_matmul);
+  AnfAlgo::SetOutputInferTypeAndShape({AnfAlgo::GetOutputInferDataType(dx, 0)}, {AnfAlgo::GetOutputInferShape(dx, 0)},
+                                      batch_matmul.get());
   AnfAlgo::SetNodeAttr("transpose_x1", MakeValue(false), batch_matmul);
   AnfAlgo::SetNodeAttr("transpose_x2", MakeValue(true), batch_matmul);
   AnfAlgo::SetNodeAttr("is_backend_insert", MakeValue(true), batch_matmul);
   return batch_matmul;
 }
 
-AnfNodePtr CreateDwReduceSumDNode(const FuncGraphPtr &graph, const AnfNodePtr &node, const AnfNodePtr &node2) {
+AnfNodePtr CreateWBroadcastToDNode(const FuncGraphPtr &graph, const AnfNodePtr &node) {
   MS_EXCEPTION_IF_NULL(graph);
   MS_EXCEPTION_IF_NULL(node);
+  // BroadcastTo
+  std::vector<AnfNodePtr> braodcast_to_input = {NewValueNode(std::make_shared<Primitive>(kBroadcastToOpName)), node};
+  auto broadcast_to_d = graph->NewCNode(braodcast_to_input);
+  std::vector<size_t> shape = {t_size, input_size, kGateNum * hidden_size};
+  auto type = {AnfAlgo::GetOutputInferDataType(node, 0)};
+  AnfAlgo::SetOutputInferTypeAndShape(type, {shape}, broadcast_to_d.get());
+  std::vector<int64_t> attr_shape = {SizeToLong(t_size), SizeToLong(input_size), SizeToLong(kGateNum * hidden_size)};
+  AnfAlgo::SetNodeAttr(kAttrShape, MakeValue(attr_shape), broadcast_to_d);
+  AnfAlgo::SetNodeAttr("is_backend_insert", MakeValue(true), broadcast_to_d);
+  return broadcast_to_d;
+}
+
+AnfNodePtr CreateDwReduceSumDNode(const FuncGraphPtr &graph, const AnfNodePtr &matmul, const AnfNodePtr &gru_grad) {
+  MS_EXCEPTION_IF_NULL(graph);
+  MS_EXCEPTION_IF_NULL(matmul);
+  MS_EXCEPTION_IF_NULL(gru_grad);
   // ReduceSumD for dw_x and dw_h
   std::vector<AnfNodePtr> reducesum_inputs = {NewValueNode(std::make_shared<Primitive>(prim::kPrimReduceSum->name())),
-                                              node};
+                                              matmul};
   auto reduce_sumd = graph->NewCNode(reducesum_inputs);
-  MS_EXCEPTION_IF_NULL(reduce_sumd);
-  auto types = {AnfAlgo::GetOutputInferDataType(node, 0)};
-  auto shapes = {AnfAlgo::GetOutputInferShape(node2, 0)};
+  auto types = {AnfAlgo::GetOutputInferDataType(gru_grad, 0)};
+  auto shapes = {AnfAlgo::GetOutputInferShape(gru_grad, 0)};
   AnfAlgo::SetOutputInferTypeAndShape(types, shapes, reduce_sumd.get());
   AnfAlgo::SetNodeAttr(kAttrAxis, MakeValue(std::vector<int64_t>{0}), reduce_sumd);
   AnfAlgo::SetNodeAttr("keep_dims", MakeValue(false), reduce_sumd);
@@ -272,9 +432,8 @@ AnfNodePtr CreateDbReduceSumDNode(const FuncGraphPtr &graph, const AnfNodePtr &n
                                               node};
   auto reduce_sumd = graph->NewCNode(reducesum_inputs);
   MS_EXCEPTION_IF_NULL(reduce_sumd);
-
-  auto types = {AnfAlgo::GetOutputInferDataType(node, 0)};
-  std::vector<size_t> shape = {kGateNum * AnfAlgo::GetOutputInferShape(node2, 0)[kDim1]};
+  std::vector<size_t> shape = {kGateNum * hidden_size};
+  auto types = {AnfAlgo::GetOutputInferDataType(node2, 0)};
   AnfAlgo::SetOutputInferTypeAndShape(types, {shape}, reduce_sumd.get());
   AnfAlgo::SetNodeAttr(kAttrAxis, MakeValue(std::vector<int64_t>{0, 1}), reduce_sumd);
   AnfAlgo::SetNodeAttr("keep_dims", MakeValue(false), reduce_sumd);
@@ -299,52 +458,76 @@ const AnfNodePtr DynamicGRUV2GradFission::Process(const FuncGraphPtr &func_graph
                  << kDynamicGRUV2GradInputNum << " inputs";
     return nullptr;
   }
+  if (AnfAlgo::IsDynamicShape(node)) {
+    MS_LOG(INFO) << "DynamicGRUV2Grad is dynamic shape, can not optimizer.";
+    return nullptr;
+  }
 
   // input_list of dynamic_gru_v2_grad
   const auto &ori_inputs = dynamic_gru_v2_grad_cnode->inputs();
-  // add gru_v2_gru_hidden
-  auto gru_v2_gru_hidden = CreateGRUV2HiddenGradNode(func_graph, dynamic_gru_v2_grad_cnode);
-  std::vector<AnfNodePtr> gru_hidden_outputs;
-  CreateMultipleOutputsOfAnfNode(func_graph, gru_v2_gru_hidden, kGRUV2HiddenGradOutputNum, &gru_hidden_outputs);
-  size_t step_num = AnfAlgo::GetOutputInferShape(ori_inputs[kIndex1], 0)[kDim0];
-  AnfNodePtr dwh_batch_matmul = nullptr;
-  if (step_num != 1) {
+  std::vector<AnfNodePtr> gru_grad_outputs;
+  CreateMultipleOutputsOfAnfNode(func_graph, dynamic_gru_v2_grad_cnode, kDynamicGRUV2GradOutputNum, &gru_grad_outputs);
+  auto input_h = ori_inputs[input_index["h"]];
+  auto input_x = ori_inputs[input_index["x"]];
+  t_size = AnfAlgo::GetOutputInferShape(input_h, 0)[kDim0];
+  batch_size = AnfAlgo::GetOutputInferShape(input_h, 0)[kDim1];
+  hidden_size = AnfAlgo::GetOutputInferShape(input_h, 0)[kDim2];
+  input_size = AnfAlgo::GetOutputInferShape(input_x, 0)[kDim2];
+  MS_LOG(INFO) << "For DynamicGRUV2Grad op, t_size: " << t_size << ", batch_size: " << batch_size
+               << ", hidden_size: " << hidden_size << ", input_size: " << input_size;
+  // add GRUHiddenGrad {dhPrevNode, dgateHConcatTNode, dntXConcatTNode}
+  std::vector<AnfNodePtr> gru_hidden_grad_nodes = AddGRUHiddenGradNode(func_graph, dynamic_gru_v2_grad_cnode);
+  AnfNodePtr dwh_matmul_node;
+  auto dgate_h = gru_hidden_grad_nodes[hidden_grad_output_index["dgate_h"]];
+  if (t_size != 1) {
     // split h
-    auto h_split = CreateHSplitVDNode(func_graph, ori_inputs[kIndex6]);
+    auto split = AddHSplitNode(func_graph, dynamic_gru_v2_grad_cnode);
     // concat(h, h_split)
-    auto h_concat = CreateHConcatDNode(func_graph, ori_inputs[kIndex5], h_split);
-    // batchmatmul(h_concat.T, dgate_h)
-    dwh_batch_matmul = CreateDhxBatchMatMul(func_graph, h_concat, gru_hidden_outputs[kIndex1]);
+    auto h_concat = AddHConcatNode(func_graph, dynamic_gru_v2_grad_cnode, split);
+    // add matmul(h_prev.T, dgate_h)
+    dwh_matmul_node = AddDwhMatmulNode(func_graph, dgate_h, h_concat);
   } else {
-    auto reshape = CreateHReshape(func_graph, ori_inputs[kIndex5]);
-    // batchmatmul(init_h.T, dgate_h)
-    dwh_batch_matmul = CreateDhxBatchMatMul(func_graph, reshape, gru_hidden_outputs[kIndex1]);
+    auto reshape = CreateHReshape(func_graph, ori_inputs[input_index["init_h"]]);
+    dwh_matmul_node = AddDwhMatmulNode(func_graph, dgate_h, reshape);
   }
-  // split dgate_h
-  auto dgate_h_split = CreateDgateHSplitVDNode(func_graph, gru_hidden_outputs[kIndex1]);
+  // split dgate_h to [dit, drt] and [dnt_h]
+  auto dgate_h_split = CreateDgateHSplitVDNode(func_graph, dgate_h);
   // concat(dgate_h_split[0], dnt_x) to dgate_x
-  auto dgate_x_concat = CreateDgateXConcatDNode(func_graph, dgate_h_split, gru_hidden_outputs[kIndex2]);
+  auto dgate_x_concat =
+    CreateDgateXConcatDNode(func_graph, dgate_h_split, gru_hidden_grad_nodes[hidden_grad_output_index["dnt_x"]]);
   // broadcast weight_input [input_size, 3 * hidden_size] to [t_size, input_size, 3 * hidden_size]
-  auto w_input_broadcast = CreateWBroadcastToDNode(func_graph, ori_inputs[kIndex2], ori_inputs[kIndex1]);
-  // batchmatmul(x.T, dgate_x_concat)
-  auto dwx_batch_matmul = CreateDhxBatchMatMul(func_graph, ori_inputs[kIndex1], dgate_x_concat);
+  auto w_input_broadcast = CreateWBroadcastToDNode(func_graph, ori_inputs[input_index["weight_input"]]);
   // batchmatmul(dgate_x_concat, w_input_broadcast.T)
-  auto dxt_batch_matmul = CreateDwhBatchMatMul(func_graph, dgate_x_concat, w_input_broadcast);
+  auto dxt_batch_matmul =
+    CreateDxtBatchMatMul(func_graph, dgate_x_concat, w_input_broadcast, gru_grad_outputs[output_index["dx"]]);
+  // batchmatmul(x.T, dgate_x_concat)
+  auto dwx_batch_matmul = CreateDwxBatchMatMul(func_graph, ori_inputs[input_index["x"]], dgate_x_concat);
   // reducesum dw_x and dw_h
-  auto dwx_reduce_sum = CreateDwReduceSumDNode(func_graph, dwx_batch_matmul, ori_inputs[kIndex2]);
-  auto dwh_reduce_sum = CreateDwReduceSumDNode(func_graph, dwh_batch_matmul, ori_inputs[kIndex3]);
+  auto dwx_reduce_sum =
+    CreateDwReduceSumDNode(func_graph, dwx_batch_matmul, gru_grad_outputs[output_index["dw_input"]]);
+  auto dwh_reduce_sum =
+    CreateDwReduceSumDNode(func_graph, dwh_matmul_node, gru_grad_outputs[output_index["dw_hidden"]]);
   // reducesum db_x and db_h
   auto dbx_reduce_sum = CreateDbReduceSumDNode(func_graph, dgate_x_concat, ori_inputs[kIndex5]);
-  auto dbh_reduce_sum = CreateDbReduceSumDNode(func_graph, gru_hidden_outputs[kIndex1], ori_inputs[kIndex5]);
+  AnfNodePtr dbh_reduce_sum;
+  if (t_size == 1) {
+    std::vector<AnfNodePtr> dbh_outputs;
+    CreateMultipleOutputsOfAnfNode(func_graph, dgate_h, kGRUV2HiddenGradCellOutputNum, &dbh_outputs);
+    dbh_reduce_sum = CreateDbReduceSumDNode(func_graph, dbh_outputs[kIndex1], ori_inputs[kIndex5]);
+  } else {
+    dbh_reduce_sum = CreateDbReduceSumDNode(func_graph, dgate_h, ori_inputs[kIndex5]);
+  }
+  std::vector<AnfNodePtr> dh_prev_outputs;
+  CreateMultipleOutputsOfAnfNode(func_graph, gru_hidden_grad_nodes[kIndex0], kGRUV2HiddenGradCellOutputNum,
+                                 &dh_prev_outputs);
   std::vector<AnfNodePtr> make_tuple_inputs = {NewValueNode(prim::kPrimMakeTuple),
                                                dwx_reduce_sum,
                                                dwh_reduce_sum,
                                                dbx_reduce_sum,
                                                dbh_reduce_sum,
                                                dxt_batch_matmul,
-                                               gru_hidden_outputs[kIndex0]};
+                                               dh_prev_outputs[kIndex0]};
   auto make_tuple = func_graph->NewCNode(make_tuple_inputs);
-  MS_EXCEPTION_IF_NULL(make_tuple);
   return make_tuple;
 }
 }  // namespace opt
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/ir_fission/dynamic_gru_v2_grad_fission.h b/mindspore/ccsrc/backend/optimizer/ascend/ir_fission/dynamic_gru_v2_grad_fission.h
index 0fef9617309..0a4026c89d8 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/ir_fission/dynamic_gru_v2_grad_fission.h
+++ b/mindspore/ccsrc/backend/optimizer/ascend/ir_fission/dynamic_gru_v2_grad_fission.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,7 +24,7 @@ namespace opt {
 class DynamicGRUV2GradFission : public PatternProcessPass {
  public:
   explicit DynamicGRUV2GradFission(bool multigraph = true)
-      : PatternProcessPass("dynamic_gru_grad_v2_fission", multigraph) {}
+      : PatternProcessPass("dynamic_gru_v2_grad_fission", multigraph) {}
   ~DynamicGRUV2GradFission() override = default;
   const BaseRef DefinePattern() const override;
   const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/ir_fission/transdata_split.cc b/mindspore/ccsrc/backend/optimizer/ascend/ir_fission/transdata_split.cc
index da6c440a3e8..36f51c2d82c 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/ir_fission/transdata_split.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/ir_fission/transdata_split.cc
@@ -53,7 +53,7 @@ bool TransDataSplit::IsFormatInvaild(const AnfNodePtr &node) const {
 
 const BaseRef TransDataSplit::DefinePattern() const {
   VarPtr X = std::make_shared<Var>();
-  return VectorRef({prim::KPrimTransData, X});
+  return VectorRef({prim::kPrimTransData, X});
 }
 
 // transdata cannot support frac_z to nchw need split transdata(frac_z-HWCN) and transpose(HWCN-NCHW)
@@ -75,7 +75,7 @@ CNodePtr TransDataSplit::DoSplit(const FuncGraphPtr &func_graph, const AnfNodePt
   if (output_format == kOpFormat_DEFAULT || output_format == kOpFormat_NCHW) {
     // trans input_format to hwcn
     new_transdata_node = NewTransOpNode(func_graph, AnfAlgo::GetInputNode(node->cast<CNodePtr>(), 0), kernel_select_,
-                                        false, prim::KPrimTransData->name());
+                                        false, prim::kPrimTransData->name());
     RefreshKernelBuildInfo(input_format, kOpFormat_HWCN, new_transdata_node, padding_axis);
     // trans hwcn to default_format
     new_transpose_node = NewTransOpNode(func_graph, new_transdata_node, kernel_select_, false,
@@ -93,7 +93,7 @@ CNodePtr TransDataSplit::DoSplit(const FuncGraphPtr &func_graph, const AnfNodePt
 
     // trans hwcn to output_format
     new_transdata_node =
-      NewTransOpNode(func_graph, new_transpose_node, kernel_select_, false, prim::KPrimTransData->name());
+      NewTransOpNode(func_graph, new_transpose_node, kernel_select_, false, prim::kPrimTransData->name());
     RefreshKernelBuildInfo(kOpFormat_HWCN, output_format, new_transdata_node, padding_axis);
     new_transdata_node->set_abstract(node->abstract());
     new_replace_node = new_transdata_node;
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/ir_fusion/batchnorm_to_bninfer.cc b/mindspore/ccsrc/backend/optimizer/ascend/ir_fusion/batchnorm_to_bninfer.cc
index 8e98f25cb1b..ca6b0a1bc03 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/ir_fusion/batchnorm_to_bninfer.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/ir_fusion/batchnorm_to_bninfer.cc
@@ -122,7 +122,7 @@ const AnfNodePtr BatchNorm2BNInfer::Process(const FuncGraphPtr &graph, const Anf
     return nullptr;
   }
   auto bn_infer = CreateBNInfer(graph, batchnorm, node);
-  TransferDepend(batchnorm, graph, bn_infer);
+  TransferDependOrUpdateState(batchnorm, graph, bn_infer);
   return bn_infer;
 }
 }  // namespace opt
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/ir_fusion/batchnormgrad_to_bninfergrad.cc b/mindspore/ccsrc/backend/optimizer/ascend/ir_fusion/batchnormgrad_to_bninfergrad.cc
index 117c4217c93..2a88d6fce1c 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/ir_fusion/batchnormgrad_to_bninfergrad.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/ir_fusion/batchnormgrad_to_bninfergrad.cc
@@ -125,7 +125,7 @@ const AnfNodePtr BatchNormGrad2BNInferGrad::Process(const FuncGraphPtr &graph, c
     return nullptr;
   }
   auto bn_infer_grad = CreateBNInferGrad(graph, batchnorm_grad, node);
-  TransferDepend(batchnorm_grad, graph, bn_infer_grad);
+  TransferDependOrUpdateState(batchnorm_grad, graph, bn_infer_grad);
   return bn_infer_grad;
 }
 }  // namespace opt
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/ir_fusion/transpose_transdata_fusion.cc b/mindspore/ccsrc/backend/optimizer/ascend/ir_fusion/transpose_transdata_fusion.cc
index 5215a76e44b..9473ee22bef 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/ir_fusion/transpose_transdata_fusion.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/ir_fusion/transpose_transdata_fusion.cc
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace opt {
 const BaseRef TransposeTransDataFusion::DefinePattern() const {
-  const auto prim_transdata = std::make_shared<Primitive>(prim::KPrimTransData->name());
+  const auto prim_transdata = std::make_shared<Primitive>(prim::kPrimTransData->name());
   VectorRef transpose({prim::kPrimTranspose, input_varptr_});
 
   return VectorRef({prim_transdata, transpose});
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/mindir/all_to_all_unify_mindir.cc b/mindspore/ccsrc/backend/optimizer/ascend/mindir/all_to_all_unify_mindir.cc
index 6b2d57b9a18..884f762f3f6 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/mindir/all_to_all_unify_mindir.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/mindir/all_to_all_unify_mindir.cc
@@ -27,6 +27,10 @@ namespace {
 constexpr size_t kCNodePrimitiveIdx = 0;
 constexpr size_t kAllToAllInputIdx = 1;
 
+inline int64_t NormalizeDim(const std::vector<size_t> &shape, int64_t dim) {
+  return dim < 0 ? SizeToLong(shape.size()) + dim : dim;
+}
+
 void ChangePrimitiveToAllToAllV(const AnfNodePtr &node) {
   MS_EXCEPTION_IF_NULL(node);
   auto neighbor_exchange = node->cast<CNodePtr>();
@@ -66,6 +70,7 @@ CNodePtr CreateSplitNode(const FuncGraphPtr &graph, const CNodePtr &all_to_all)
   MS_EXCEPTION_IF_NULL(split_v);
   auto dtype = AnfAlgo::GetOutputInferDataType(all_to_all_input, 0);
   auto shape = AnfAlgo::GetOutputInferShape(all_to_all_input, 0);
+  split_dim = NormalizeDim(shape, split_dim);
   if (SizeToLong(shape.size()) <= split_dim) {
     MS_LOG(EXCEPTION) << "Invalid split dim " << split_dim << " is over the shape size " << shape.size();
   }
@@ -133,6 +138,7 @@ CNodePtr CreateConcatNode(const FuncGraphPtr &graph, const CNodePtr &all_to_all,
   auto concat = graph->NewCNode(concat_input);
   MS_EXCEPTION_IF_NULL(concat);
   auto single_shape = AnfAlgo::GetOutputInferShape(all_to_all_v_outputs[0], 0);
+  concat_dim = NormalizeDim(single_shape, concat_dim);
   if (LongToSize(concat_dim) >= single_shape.size()) {
     MS_LOG(EXCEPTION) << "Invalid concat dim " << concat_dim << " is greater than shape size " << single_shape.size();
   }
diff --git a/mindspore/ccsrc/backend/optimizer/common/helper.cc b/mindspore/ccsrc/backend/optimizer/common/helper.cc
index a59499da83d..a07f9e023b1 100644
--- a/mindspore/ccsrc/backend/optimizer/common/helper.cc
+++ b/mindspore/ccsrc/backend/optimizer/common/helper.cc
@@ -916,21 +916,34 @@ ValueNodePtr MakeValueNode(const ValueNodePtr &value_node) {
   return new_value_node;
 }
 
-void TransferDepend(const CNodePtr &old_node, const FuncGraphPtr &graph, const CNodePtr &new_node) {
+void TransferDependOrUpdateState(const CNodePtr &old_node, const FuncGraphPtr &graph, const CNodePtr &new_node) {
   MS_EXCEPTION_IF_NULL(old_node);
   MS_EXCEPTION_IF_NULL(graph);
   auto manager = graph->manager();
   MS_EXCEPTION_IF_NULL(manager);
   // Find BatchNorm's output which is a Depend or UpdateState.
-  for (const auto &node_index : manager->node_users()[old_node]) {
+  auto node_users = manager->node_users()[old_node];
+  for (const auto &node_index : node_users) {
     AnfNodePtr output = node_index.first;
-    size_t index = IntToSize(node_index.second);
     MS_EXCEPTION_IF_NULL(output);
     if (AnfAlgo::CheckPrimitiveType(output, prim::kPrimDepend) ||
         AnfAlgo::CheckPrimitiveType(output, prim::kPrimUpdateState)) {
-      auto depend = output->cast<CNodePtr>();
-      MS_EXCEPTION_IF_NULL(depend);
-      depend->set_input(index, new_node);
+      auto output_cnode = output->cast<CNodePtr>();
+      MS_EXCEPTION_IF_NULL(output_cnode);
+      auto inputs = output_cnode->inputs();
+      std::vector<AnfNodePtr> new_inputs{output_cnode->input(0)};
+      for (size_t i = 1; i < inputs.size(); i++) {
+        auto input = inputs[i];
+        if (input == old_node) {
+          new_inputs.emplace_back(new_node);
+        } else {
+          new_inputs.emplace_back(input);
+        }
+      }
+      auto new_output = graph->NewCNode(new_inputs);
+      new_output->set_abstract(output->abstract());
+      new_output->set_scope(output->scope());
+      manager->Replace(output, new_output);
     }
   }
 }
diff --git a/mindspore/ccsrc/backend/optimizer/common/helper.h b/mindspore/ccsrc/backend/optimizer/common/helper.h
index 88537b50d8c..e298b4c1192 100644
--- a/mindspore/ccsrc/backend/optimizer/common/helper.h
+++ b/mindspore/ccsrc/backend/optimizer/common/helper.h
@@ -213,8 +213,8 @@ bool CheckSupportDataType(const AnfNodePtr &node, const std::set<TypeId> &suppor
 // Create a new value node of func graph,not kernel graph
 ValueNodePtr MakeValueNode(const ValueNodePtr &value_node);
 
-// Transfer depend to the new node
-void TransferDepend(const CNodePtr &old_node, const FuncGraphPtr &graph, const CNodePtr &new_node);
+// Transfer depend or updatestate to the new node
+void TransferDependOrUpdateState(const CNodePtr &old_node, const FuncGraphPtr &graph, const CNodePtr &new_node);
 
 AbstractBasePtr CppInferShape(const PrimitivePtr &prim, const AbstractBasePtrList &args_spec_list);
 
diff --git a/mindspore/ccsrc/backend/optimizer/common/node_pass.cc b/mindspore/ccsrc/backend/optimizer/common/node_pass.cc
index ded38fc7b81..9da7099886a 100644
--- a/mindspore/ccsrc/backend/optimizer/common/node_pass.cc
+++ b/mindspore/ccsrc/backend/optimizer/common/node_pass.cc
@@ -31,22 +31,24 @@ bool NodePass::Run(const FuncGraphPtr &func_graph) {
   manager->AddFuncGraph(func_graph);
 
   std::unordered_set<AnfNodePtr> seen_node;
-  std::deque<AnfNodePtr> todo{func_graph->output()};
+  std::deque<std::pair<AnfNodePtr, FuncGraphPtr>> todo{{func_graph->output(), func_graph}};
   bool changes = false;
   while (!todo.empty()) {
-    AnfNodePtr node = todo.front();
+    AnfNodePtr node = todo.front().first;
+    auto fg = todo.front().second;
+    manager->AddFuncGraph(fg);
     todo.pop_front();
     if (seen_node.count(node) > 0 || !manager->all_nodes().contains(node)) {
       continue;
     }
     (void)seen_node.insert(node);
     TraceGuard guard(std::make_shared<TraceOpt>(node->debug_info()));
-    AnfNodePtr new_node = Run(func_graph, node);
+    AnfNodePtr new_node = Run(fg, node);
     bool change = (new_node != nullptr);
     if (new_node != nullptr && new_node != node) {
       (void)manager->Replace(node, new_node);
       // if replaced node is end_goto, refresh relative params in kernel graph
-      auto kernel_graph = func_graph->cast<std::shared_ptr<session::KernelGraph>>();
+      auto kernel_graph = fg->cast<std::shared_ptr<session::KernelGraph>>();
       if (kernel_graph != nullptr && node->isa<CNode>()) {
         auto cnode = node->cast<CNodePtr>();
         MS_EXCEPTION_IF_NULL(cnode);
@@ -63,16 +65,18 @@ bool NodePass::Run(const FuncGraphPtr &func_graph) {
       auto const_func_graph = GetValueNode<FuncGraphPtr>(new_node);
       MS_EXCEPTION_IF_NULL(const_func_graph);
       if (!const_func_graph->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)) {
-        todo.push_back(const_func_graph->output());
+        todo.push_back({const_func_graph->output(), const_func_graph});
       }
     } else if (new_node && new_node->isa<CNode>()) {
       if (AnfAlgo::IsGraphKernel(new_node)) {
-        todo.push_back(new_node);
+        todo.push_back({new_node, func_graph});
       }
       auto cnode = new_node->cast<CNodePtr>();
       MS_EXCEPTION_IF_NULL(cnode);
       auto inputs = cnode->inputs();
-      (void)todo.insert(todo.end(), inputs.begin(), inputs.end());
+      std::for_each(inputs.begin(), inputs.end(), [&fg, &todo](AnfNodePtr &node) {
+        todo.emplace_back(std::pair<AnfNodePtr, FuncGraphPtr>(node, fg));
+      });
     }
     changes = changes || change;
   }
diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean.cc
index 2cbe882d595..e7367de351f 100644
--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean.cc
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean.cc
@@ -52,10 +52,10 @@ std::set<int64_t> GetUniqReduceAxes(const AnfNodePtr &node, bool is_ascend = fal
   auto axis_vec = GetReduceAxis(node);
   if (axis_vec.empty()) {
     for (size_t i = 0; i < src_shape_vec.size(); ++i) {
-      axis_vec.push_back(i);
+      axis_vec.emplace_back(i);
     }
   } else {
-    std::transform(axis_vec.begin(), axis_vec.end(), axis_vec.begin(), [&src_shape_vec](int64_t axis) -> int64_t {
+    (void)std::transform(axis_vec.begin(), axis_vec.end(), axis_vec.begin(), [&src_shape_vec](int64_t axis) -> int64_t {
       return axis < 0 ? axis + SizeToLong(src_shape_vec.size()) : axis;
     });
   }
@@ -81,7 +81,7 @@ bool HaveReduceInPredecessors(const AnfNodePtr &node) {
     }
 
     auto n_inputs = n->cast<CNodePtr>()->inputs();
-    std::for_each(n_inputs.cbegin() + 1, n_inputs.cend(), [&st](const AnfNodePtr &n) -> void { st.push(n); });
+    (void)std::for_each(n_inputs.cbegin() + 1, n_inputs.cend(), [&st](const AnfNodePtr &n) -> void { st.push(n); });
   }
 
   return false;
@@ -175,9 +175,9 @@ bool AtomicAddCheckerGPU::SuitableForAtomicAdd(const AnfNodePtr &node) {
   // For reduce whose last dim is reduced (including all-reduce),
   // it is suitable for atomic add only the reduce num is greater than or equal to 1024.
   if (axis_set.count(src_shape_vec.size() - 1) != 0) {
-    size_t reduce_size =
-      std::accumulate(axis_set.begin(), axis_set.end(), LongToSize(1),
-                      [&src_shape_vec](size_t size, int64_t axis) { return size * LongToSize(src_shape_vec[axis]); });
+    size_t reduce_size = std::accumulate(
+      axis_set.begin(), axis_set.end(), LongToSize(1),
+      [&src_shape_vec](size_t size, int64_t axis) { return size * LongToSize(src_shape_vec[LongToSize(axis)]); });
     return reduce_size >= 1024;
   }
 
@@ -212,8 +212,8 @@ bool AtomicAddCheckerAscend::SuitableForAtomicAdd(const AnfNodePtr &node) {
   }
 
   // If the non-reduce axis cannot make full use of multi-core, enable atomic addition
-  auto processor_core_num = 32;
-  auto start_non_reduce_dim = 1;
+  constexpr auto processor_core_num = 32LL;
+  auto start_non_reduce_dim = 1LL;
   for (size_t i = 0; i < src_shape_vec.size(); ++i) {
     auto dim = src_shape_vec[i];
     if (reduce_axis_set.count(i)) {
@@ -448,8 +448,8 @@ std::vector<std::pair<AnfNodePtr, int> > AtomicCleanInsertter::FindOriginCNodeUs
   std::vector<std::pair<AnfNodePtr, int> > reduce_user_nodes;
   if (real_output_num_ <= 1) {
     auto users = mng->node_users()[composite_node];
-    std::transform(users.cbegin(), users.cend(), std::back_inserter(reduce_user_nodes),
-                   [](const std::pair<AnfNodePtr, int> &pair) { return pair; });
+    (void)std::transform(users.cbegin(), users.cend(), std::back_inserter(reduce_user_nodes),
+                         [](const std::pair<AnfNodePtr, int> &pair) { return pair; });
   } else {
     std::vector<std::pair<AnfNodePtr, int> > getitem_user_nodes;
     auto users = mng->node_users()[composite_node];
@@ -491,7 +491,7 @@ std::vector<std::pair<AnfNodePtr, int> > AtomicCleanInsertter::FindOriginCNodeUs
     for (auto &pair : getitem_user_nodes) {
       // Directory to find real user.
       auto real_users = mng->node_users()[pair.first];
-      reduce_user_nodes.insert(reduce_user_nodes.end(), real_users.begin(), real_users.end());
+      (void)reduce_user_nodes.insert(reduce_user_nodes.end(), real_users.begin(), real_users.end());
     }
   }
 
@@ -513,7 +513,7 @@ void AtomicCleanInsertter::ProcessOriginCNodeUser(const KernelGraphPtr &main_gra
     auto user_cnode = user_node->cast<CNodePtr>();
     MS_EXCEPTION_IF_NULL(user_cnode);
     user_cnode->set_input(IntToSize(index), load_node);
-    to_process_order_.emplace_back(composite_node, user_node);
+    (void)to_process_order_.emplace_back(composite_node, user_node);
   }
 }
 
diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/add_stitch_atomic_clean_gpu.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/add_stitch_atomic_clean_gpu.cc
index ab181401fdb..aa693fb34dc 100644
--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/add_stitch_atomic_clean_gpu.cc
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/add_stitch_atomic_clean_gpu.cc
@@ -15,17 +15,7 @@
  */
 
 #include "backend/optimizer/graph_kernel/add_stitch_atomic_clean_gpu.h"
-#include <algorithm>
-#include <functional>
-#include <list>
-#include <map>
-#include <memory>
-#include <utility>
-#include <set>
-#include <stack>
-#include <string>
-#include <tuple>
-#include <vector>
+
 #include "base/core_ops.h"
 #include "ir/tensor.h"
 #include "utils/utils.h"
diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/axis_normalizer.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/axis_normalizer.cc
index 632818b5d1c..9fc4d8601b5 100644
--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/axis_normalizer.cc
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/axis_normalizer.cc
@@ -15,8 +15,6 @@
  */
 #include "backend/optimizer/graph_kernel/axis_normalizer.h"
 
-#include <algorithm>
-#include <vector>
 #include "ir/scalar.h"
 #include "backend/optimizer/graph_kernel/graph_kernel_helper.h"
 #include "backend/session/anf_runtime_algorithm.h"
@@ -71,6 +69,7 @@ bool AxisNormalizer::Process(const FuncGraphPtr &func_graph) const {
       }
       if (diff) {
         changed = true;
+        std::sort(axis_vec.begin(), axis_vec.end());
         SetNodeAttrSafely(kAttrAxis, MakeValue(axis_vec), node);
       }
     }
diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/cast_matmul_fusion.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/cast_matmul_fusion.cc
index d30f556ece1..dab57f8c39d 100644
--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/cast_matmul_fusion.cc
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/cast_matmul_fusion.cc
@@ -63,7 +63,7 @@ bool DoFuse(const FuncGraphPtr &func_graph) {
     if (cnode->size() != 4) {
       continue;
     }
-    auto cast_node = cnode->input(3);
+    auto cast_node = cnode->inputs().back();  // bias node
     if (!IsPrimitiveCNode(cast_node, prim::kPrimCast)) {
       continue;
     }
@@ -81,7 +81,7 @@ bool DoFuse(const FuncGraphPtr &func_graph) {
     // Cast is only used by matmul
     auto user_index_set = mng->node_users()[cast_node];
     if (user_index_set.size() == 1) {
-      mng->Replace(cast_node, (cast_node->cast<CNodePtr>())->input(1));
+      (void)mng->Replace(cast_node, (cast_node->cast<CNodePtr>())->input(1));
       UpdateBuildInfo(cnode, cast_node);
       changed = true;
       continue;
diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_cluster.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_cluster.cc
index 2d26a864548..cd129b72fc7 100644
--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_cluster.cc
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_cluster.cc
@@ -63,7 +63,7 @@ std::vector<PrimitivePtr> GetClusterableOpList() {
     prim::kPrimTranspose,
 #if ENABLE_D
     prim::kPrimMatMul,
-    prim::KPrimTransData,
+    prim::kPrimTransData,
     prim::kPrimBatchMatMul,
 #elif ENABLE_GPU
     prim::kPrimACos,
@@ -99,6 +99,7 @@ std::vector<PrimitivePtr> GetClusterableOpList() {
     prim::kPrimSelect,
     prim::kPrimSign,
     prim::kPrimSin,
+    prim::kPrimStridedSlice,
 #endif
   };
   const auto &flags = context::GraphKernelFlags::GetInstance();
@@ -158,7 +159,7 @@ class Graph {
         auto iter = node_idx_map.find(inp);
         if (iter != node_idx_map.end()) {
           // At the beginning, cluster_id is equal to node_id
-          inputs_.insert(iter->second);
+          (void)inputs_.insert(iter->second);
         }
       }
     }
@@ -169,8 +170,8 @@ class Graph {
       max_node_id_ = std::max(other_cluster->max_node_id_, max_node_id_);
       cluster_size_ += other_cluster->cluster_size_;
       basic_op_cnt_ += other_cluster->basic_op_cnt_;
-      std::for_each(other_cluster->inputs_.begin(), other_cluster->inputs_.end(),
-                    [this](size_t inp) { this->inputs_.insert(inp); });
+      (void)std::for_each(other_cluster->inputs_.begin(), other_cluster->inputs_.end(),
+                          [this](size_t inp) { (void)this->inputs_.insert(inp); });
       other_cluster->Clean();
     }
 
@@ -188,13 +189,13 @@ class Graph {
   Graph(const AnfNodePtrList &nodes, const std::unordered_map<AnfNodePtr, size_t> &node_idx_map) {
     clusters_.reserve(nodes.size());
     for (size_t i = 0; i < nodes.size(); i++) {
-      clusters_.emplace_back(i, nodes[i], node_idx_map);
+      (void)clusters_.emplace_back(i, nodes[i], node_idx_map);
     }
   }
   ~Graph() = default;
 
   // find the representative of the cluster
-  int Find(size_t node_id) {
+  size_t Find(size_t node_id) {
     size_t &pre_id = clusters_[node_id].cluster_id_;
     return (pre_id == clusters_[pre_id].cluster_id_) ? pre_id : (pre_id = Find(pre_id));
   }
@@ -221,7 +222,7 @@ class Graph {
   size_t GetClusterMaxNodeId(size_t cluster_id) { return clusters_[Find(cluster_id)].max_node_id_; }
 
   using VisitFunc = std::function<IncludeType(size_t)>;
-  void Dfs(size_t node_id, VisitFunc visitor) {
+  void Dfs(size_t node_id, const VisitFunc &visitor) {
     ++seen_;
     return DepthFirstSearch(Find(node_id), visitor);
   }
@@ -246,12 +247,12 @@ class Graph {
       size_t new_id = Find(*iter);
       if (new_id != *iter) {
         iter = inputs.erase(iter);
-        inputs.insert(new_id);
+        (void)inputs.insert(new_id);
       } else {
         ++iter;
       }
     }
-    inputs.erase(i);
+    (void)inputs.erase(i);
   }
 
   void DepthFirstSearch(size_t cluster_id, const VisitFunc &visitor) {
@@ -289,9 +290,9 @@ class CircleChecker {
         RemoveCircleNodesFromCandidates();
       }
     }
-    candidates->erase(std::remove_if(candidates->begin(), candidates->end(),
-                                     [this](size_t c) { return this->candidates_.count(c) == 0; }),
-                      candidates->end());
+    (void)candidates->erase(std::remove_if(candidates->begin(), candidates->end(),
+                                           [this](size_t c) { return this->candidates_.count(c) == 0; }),
+                            candidates->end());
   }
 
  private:
@@ -319,7 +320,7 @@ class CircleChecker {
         if (done.count(node_id) || acyclic_nodes_.count(node_id) || visited_circle_nodes.count(node_id)) {
           return EXCLUDE;
         }
-        done.insert(node_id);
+        (void)done.insert(node_id);
         if (candidates_.count(node_id)) {
           has_circle = true;
           circle_nodes_.push_back(node_id);
@@ -347,7 +348,7 @@ class CircleChecker {
   void RemoveCircleNodesFromCandidates() {
     auto remove_from_candidates = [this](size_t node_id) {
       if (candidates_.count(node_id)) {
-        candidates_.erase(node_id);
+        (void)candidates_.erase(node_id);
         return FOLLOW;
       }
       return EXCLUDE;
@@ -357,7 +358,6 @@ class CircleChecker {
     }
   }
 
- private:
   GraphPtr graph_;               // bind the global graph
   std::set<size_t> candidates_;  // bind the input candidates
   std::vector<size_t> circle_nodes_;
@@ -388,12 +388,12 @@ std::vector<size_t> GraphKernelCluster::FindCandidates(size_t basenode_id) {
 
 bool GraphKernelCluster::Process(const FuncGraphPtr &func_graph) {
   bool changed = false;
-  for (int i = nodes_.size() - 1; i >= 0; i--) {
+  for (int i = SizeToInt(nodes_.size()) - 1; i >= 0; i--) {
     // if the node has been clustered, it has tried to find its previous nodes, so it's unnecessary to try again.
-    if (graph_->GetSize(i) > 1) {
+    if (graph_->GetSize(IntToSize(i)) > 1) {
       continue;
     }
-    auto candidates = FindCandidates(i);
+    auto candidates = FindCandidates(IntToSize(i));
     CircleChecker(graph_).RemoveCircle(&candidates);
     RemoveWildGetitem(&candidates);
     if (candidates.empty()) continue;
@@ -425,11 +425,11 @@ bool GraphKernelCluster::Process(const FuncGraphPtr &func_graph) {
 void GraphKernelCluster::CreateFuncGraph(const FuncGraphPtr &func_graph, const std::vector<size_t> &nodes_id) {
   AnfNodePtrList old_nodes;
   AnfNodePtr new_node;
-  std::transform(nodes_id.begin(), nodes_id.end(), std::back_inserter(old_nodes),
-                 [this](size_t id) { return this->nodes_[id]; });
+  (void)std::transform(nodes_id.begin(), nodes_id.end(), std::back_inserter(old_nodes),
+                       [this](size_t id) { return this->nodes_[id]; });
   std::tie(new_node, std::ignore) = FuseNodesToSubGraph(old_nodes, func_graph, "fusion");
   std::shared_ptr<Pass> eliminate_getitem_pass = std::make_shared<opt::GetitemTuple>();
-  eliminate_getitem_pass->Run(AnfAlgo::GetCNodeFuncGraphPtr(new_node));
+  (void)eliminate_getitem_pass->Run(AnfAlgo::GetCNodeFuncGraphPtr(new_node));
   if (context::GraphKernelFlags::GetInstance().dump_as_text) {
     DumpClusterInfo(old_nodes, new_node);
   }
@@ -488,9 +488,9 @@ void GraphKernelCluster::RemoveWildGetitem(std::vector<size_t> *candidates) {
     ++iter;
   }
   if (changed) {
-    candidates->erase(std::remove_if(candidates->begin(), candidates->end(),
-                                     [&candidates_set](size_t c) { return candidates_set.count(c) == 0; }),
-                      candidates->end());
+    (void)candidates->erase(std::remove_if(candidates->begin(), candidates->end(),
+                                           [&candidates_set](size_t c) { return candidates_set.count(c) == 0; }),
+                            candidates->end());
   }
 }
 
diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_expander.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_expander.cc
index 2c30e4b02e1..89990496a31 100644
--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_expander.cc
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_expander.cc
@@ -35,6 +35,7 @@
 #include "pybind_api/ir/primitive_py.h"
 #include "runtime/device/kernel_info.h"
 #include "vm/segment_runner.h"
+#include "backend/optimizer/graph_kernel/expanders/expander_factory.h"
 
 namespace mindspore {
 namespace opt {
@@ -82,6 +83,7 @@ std::vector<PrimitivePtr> GetExpandOps() {
     prim::kPrimSigmoidGrad,
     prim::kPrimSigmoidCrossEntropyWithLogits,
     prim::kPrimSigmoidCrossEntropyWithLogitsGrad,
+    prim::kPrimSlice,
     prim::kPrimSoftmax,
     prim::kPrimSoftmaxCrossEntropyWithLogits,
     prim::kPrimSquaredDifference,
@@ -98,14 +100,14 @@ std::vector<PrimitivePtr> GetExpandOps() {
 }
 }  // namespace
 
-bool DefaultExpander::ExpandJsonInfo(const AnfNodePtr &node, nlohmann::json *kernel_json) {
+bool PyExpander::ExpandJsonInfo(const AnfNodePtr &node, nlohmann::json *kernel_json) {
   DumpOption dump_option;
   dump_option.extract_opinfo_from_anfnode = true;
   kernel::AkgKernelJsonGenerator json_generator(dump_option);
   return json_generator.CollectJson(node, kernel_json);
 }
 
-FuncGraphPtr DefaultExpander::CreateExpandFuncGraph(const CNodePtr &node) {
+FuncGraphPtr PyExpander::CreateExpandFuncGraph(const CNodePtr &node) {
   nlohmann::json kernel_json;
   if (!ExpandJsonInfo(node, &kernel_json)) {
     MS_LOG(ERROR) << "Expand json info to: " << node->DebugString(2) << " failed, ori_json:\n" << kernel_json.dump();
@@ -130,7 +132,36 @@ FuncGraphPtr DefaultExpander::CreateExpandFuncGraph(const CNodePtr &node) {
   return JsonDescToAnf(kernel_desc_str);
 }
 
-AnfNodePtr DefaultExpander::CreateExpandGraphKernel(const FuncGraphPtr &new_func_graph, const CNodePtr &old_node) {
+FuncGraphPtr DefaultExpander::CreateExpandFuncGraph(const CNodePtr &node) {
+  auto expander_ptr = expanders::OpExpanderFactory::Instance().GetExpander(AnfAlgo::GetCNodeName(node));
+  if (expander_ptr == nullptr) {
+    return PyExpander::CreateExpandFuncGraph(node);
+  }
+  expanders::BaseInfoList inputs(node->size() - 1);
+  expanders::BaseInfoList outputs(AnfAlgo::GetOutputTensorNum(node));
+  for (size_t i = 0; i < inputs.size(); i++) {
+    auto shape = AnfAlgo::GetInputDeviceShape(node, i);
+    std::transform(shape.begin(), shape.end(), std::back_inserter(inputs[i].shape), SizeToLong);
+    inputs[i].type = AnfAlgo::GetInputDeviceDataType(node, i);
+    inputs[i].format = AnfAlgo::GetInputFormat(node, i);
+  }
+  for (size_t i = 0; i < outputs.size(); i++) {
+    auto shape = AnfAlgo::GetOutputDeviceShape(node, i);
+    std::transform(shape.begin(), shape.end(), std::back_inserter(outputs[i].shape), SizeToLong);
+    outputs[i].type = AnfAlgo::GetOutputDeviceDataType(node, i);
+    outputs[i].format = AnfAlgo::GetOutputFormat(node, i);
+  }
+  auto &attrs = AnfAlgo::GetCNodePrimitive(node)->attrs();
+  try {
+    auto litegraph = expander_ptr->Run(inputs, outputs, attrs, kernel::GetStrProcessorFromContext());
+    return LiteGraph2AnfGraph(litegraph);
+  } catch (const graphkernel::GKException &e) {
+    MS_LOG(INFO) << e.what() << ", undo expanding this op";
+    return nullptr;
+  }
+}
+
+AnfNodePtr PyExpander::CreateExpandGraphKernel(const FuncGraphPtr &new_func_graph, const CNodePtr &old_node) {
   auto func_graph = old_node->func_graph();
   std::vector<AnfNodePtr> inputs(old_node->inputs().begin() + 1, old_node->inputs().end());
   AnfNodePtrList kernel_nodes;
@@ -145,7 +176,7 @@ AnfNodePtr DefaultExpander::CreateExpandGraphKernel(const FuncGraphPtr &new_func
   return graph_kernel_node;
 }
 
-AnfNodePtr DefaultExpander::Run(const AnfNodePtr &node) {
+AnfNodePtr PyExpander::Run(const AnfNodePtr &node) {
   auto cnode = node->cast<CNodePtr>();
   MS_EXCEPTION_IF_NULL(cnode);
   auto new_func_graph = CreateExpandFuncGraph(cnode);
@@ -193,10 +224,10 @@ bool GraphKernelExpander::DoExpand(const FuncGraphPtr &func_graph) {
       continue;
     }
 
-    MS_LOG(INFO) << "Expanding node: " << node->fullname_with_scope();
+    MS_LOG(DEBUG) << "Expanding node: " << node->fullname_with_scope();
     auto new_node = GetExpander(node)->Run(node);
     if (new_node == nullptr) {
-      MS_LOG(INFO) << "Skipped node: " << node->fullname_with_scope();
+      MS_LOG(DEBUG) << "Skipped node: " << node->fullname_with_scope();
       continue;
     }
     (void)mng->Replace(node, new_node);
@@ -204,6 +235,7 @@ bool GraphKernelExpander::DoExpand(const FuncGraphPtr &func_graph) {
   }
   return changed;
 }
+
 bool GraphKernelComplexExpander::CanExpand(const CNodePtr &node) const {
   bool has_complex = false;
   auto all_inputs_type = AnfAlgo::GetAllInputDeviceTypes(node);
diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_expander.h b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_expander.h
index fcb53e1cbd4..3721e18d5d3 100644
--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_expander.h
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_expander.h
@@ -30,7 +30,7 @@ class Expander {
 };
 using ExpanderPtr = std::shared_ptr<Expander>;
 
-class DefaultExpander : public Expander {
+class PyExpander : public Expander {
  public:
   AnfNodePtr Run(const AnfNodePtr &node) override;
 
@@ -39,6 +39,12 @@ class DefaultExpander : public Expander {
   virtual AnfNodePtr CreateExpandGraphKernel(const FuncGraphPtr &new_func_graph, const CNodePtr &old_node);
   virtual FuncGraphPtr CreateExpandFuncGraph(const CNodePtr &node);
 };
+
+class DefaultExpander : public PyExpander {
+ protected:
+  FuncGraphPtr CreateExpandFuncGraph(const CNodePtr &node) override;
+};
+
 class ComplexOpExpander : public DefaultExpander {
  protected:
   bool ExpandJsonInfo(const AnfNodePtr &node, nlohmann::json *kernel_json);
diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc
index d5a16a15b51..40ec3d2593d 100644
--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc
@@ -207,7 +207,7 @@ bool ConvertNonscalarTensorToParameter(const FuncGraphPtr &fg, AnfNodePtrList *i
         v_replace.begin(), v_replace.end(),
         [&tensor](const std::pair<tensor::TensorPtr, AnfNodePtrList> &vl) { return vl.first->ValueEqual(*tensor); });
       if (tensor_iter == v_replace.end()) {
-        v_replace.emplace_back(tensor, AnfNodePtrList{tnode});
+        (void)v_replace.emplace_back(tensor, AnfNodePtrList{tnode});
       } else {
         tensor_iter->second.push_back(tnode);
       }
@@ -801,16 +801,16 @@ void OpListFilter(std::vector<PrimitivePtr> *ops, const std::vector<std::string>
   auto new_prim = [](const std::string &name) { return std::make_shared<Primitive>(name); };
   if (!enable_ops_only.empty()) {
     ops->clear();
-    std::transform(enable_ops_only.begin(), enable_ops_only.end(), std::back_inserter(*ops), new_prim);
+    (void)std::transform(enable_ops_only.begin(), enable_ops_only.end(), std::back_inserter(*ops), new_prim);
   } else {
     if (!enable_ops.empty()) {
-      std::transform(enable_ops.begin(), enable_ops.end(), std::back_inserter(*ops), new_prim);
+      (void)std::transform(enable_ops.begin(), enable_ops.end(), std::back_inserter(*ops), new_prim);
     }
     if (!disable_ops.empty()) {
       auto iter = std::remove_if(ops->begin(), ops->end(), [&disable_ops](const PrimitivePtr &p) {
         return std::find(disable_ops.begin(), disable_ops.end(), p->name()) != disable_ops.end();
       });
-      ops->erase(iter, ops->end());
+      (void)ops->erase(iter, ops->end());
     }
   }
 }
diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_pass_manager.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_pass_manager.cc
index 6c718ee0cf1..591121f623e 100644
--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_pass_manager.cc
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_pass_manager.cc
@@ -15,7 +15,6 @@
  */
 #include "backend/optimizer/graph_kernel/graph_kernel_pass_manager.h"
 
-#include <string>
 #include <iomanip>
 
 #include "ir/anf.h"
diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_splitter.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_splitter.cc
index f8917fa0092..b1ce8202923 100644
--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_splitter.cc
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_splitter.cc
@@ -798,12 +798,12 @@ class CostModelSplitSchemer : public SplitSchemer {
       need_inline_.clear();
       return;
     } else if (split_plan_.size() == 1 && !NeedInline(0)) {
-      /*In this case, the CostModel decided to keep the whole graph unchanged.*/
+      // In this case, the CostModel decided to keep the whole graph unchanged.
       split_plan_.clear();
       need_inline_.clear();
       return;
     } else {
-      MS_LOG(INFO) << "CostModel split succeeded. The kernel is split to " << split_plan_.size() << " parts.";
+      MS_LOG(DEBUG) << "CostModel split succeeded. The kernel is split to " << split_plan_.size() << " parts.";
     }
     MapNodeGroup();
     GroupReturnNode();
@@ -894,11 +894,11 @@ class CostModelSplitSchemer : public SplitSchemer {
 };
 
 bool TrySplit(const CNodePtr &sub_root_cnode) {
-  MS_LOG(INFO) << "Split process node: " << sub_root_cnode->fullname_with_scope();
+  MS_LOG(DEBUG) << "Split process node: " << sub_root_cnode->fullname_with_scope();
   auto splitter = Splitter::MakeSplitter(sub_root_cnode, std::make_shared<CostModelSplitSchemer>());
   MS_EXCEPTION_IF_NULL(splitter);
   bool result = splitter->Split();
-  MS_LOG(INFO) << "Split node completed, result: " << result;
+  MS_LOG(DEBUG) << "Split node completed, result: " << result;
   return result;
 }
 }  // namespace
diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/insert_pad.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/insert_pad.cc
index 3ab84e5bc8c..a6de971063c 100644
--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/insert_pad.cc
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/insert_pad.cc
@@ -95,7 +95,7 @@ auto NotTransANotTransB = [](const vec &shape_a, const vec &shape_b, vec *pad_sh
 };
 
 bool IsAkgMatMul(size_t K, size_t M, size_t N) {
-  if (K > 4096 || M * N * K >= 3 * pow(10, 10)) {
+  if (K > 4096 || M * N * K >= 3e10) {
     return false;
   }
   return true;
@@ -148,13 +148,13 @@ std::tuple<bool, bool, bool> NeedPad(const CNodePtr &matmul, vec *pad_shape_a, v
 // Insert pad for A if left is true, insert pad for B if left is false
 void InsertPad(const CNodePtr &matmul, const FuncGraphPtr &func_graph, const FuncGraphManagerPtr &mng, bool left,
                const vec &pad_shape, const vec &tail_shape) {
-  int input_index = left ? 1 : 2;
+  size_t input_index = left ? 1 : 2;
   AnfNodePtrList pad_inp = {NewValueNode(opt::kPrimPadAkg), matmul->input(input_index)};
   auto pad_cnode = func_graph->NewCNode(pad_inp);
   func_graph->AddNode(pad_cnode);
 
   ShapeVector tail;
-  tail.insert(tail.begin(), tail_shape.begin(), tail_shape.end());
+  (void)tail.insert(tail.begin(), tail_shape.begin(), tail_shape.end());
   ShapeVector head(tail_shape.size(), 0);
 
   SetNodeAttrSafely("head", MakeValue(head), pad_cnode);
@@ -163,7 +163,7 @@ void InsertPad(const CNodePtr &matmul, const FuncGraphPtr &func_graph, const Fun
   std::vector<TypeId> pad_type = {AnfAlgo::GetPrevNodeOutputInferDataType(matmul, 0)};
 
   ShapeVector abs_shape;
-  abs_shape.insert(abs_shape.begin(), pad_shape.begin(), pad_shape.end());
+  (void)abs_shape.insert(abs_shape.begin(), pad_shape.begin(), pad_shape.end());
   auto abs_shape_ptr = std::make_shared<abstract::Shape>(abstract::Shape(abs_shape));
   auto abstract = std::make_shared<abstract::AbstractTensor>(TypeIdToType(pad_type[0]), abs_shape_ptr);
   pad_cnode->set_abstract(abstract);
@@ -188,12 +188,12 @@ void InsertUnpad(const CNodePtr &matmul, const FuncGraphPtr &func_graph, const F
   auto unpad_cnode = func_graph->NewCNode(unpad_inp);
   func_graph->AddNode(unpad_cnode);
   ShapeVector tail;
-  tail.insert(tail.begin(), tail_shape.begin(), tail_shape.end());
+  (void)tail.insert(tail.begin(), tail_shape.begin(), tail_shape.end());
   SetNodeAttrSafely("tail", MakeValue(tail), unpad_cnode);
   std::vector<TypeId> unpad_type = {AnfAlgo::GetOutputInferDataType(matmul, 0)};
 
   ShapeVector abs_shape;
-  abs_shape.insert(abs_shape.begin(), unpad_shape.begin(), unpad_shape.end());
+  (void)abs_shape.insert(abs_shape.begin(), unpad_shape.begin(), unpad_shape.end());
   auto abs_shape_ptr = std::make_shared<abstract::Shape>(abstract::Shape(abs_shape));
   auto abstract = std::make_shared<abstract::AbstractTensor>(TypeIdToType(unpad_type[0]), abs_shape_ptr);
   unpad_cnode->set_abstract(abstract);
@@ -207,7 +207,7 @@ void InsertUnpad(const CNodePtr &matmul, const FuncGraphPtr &func_graph, const F
     BuildSelectKernelBuildInfo(unpad_input_format, unpad_input_type, unpad_output_format, unpad_output_type);
   AnfAlgo::SetSelectKernelBuildInfo(graph_sel_info, unpad_cnode.get());
 
-  mng->Replace(matmul, unpad_cnode);
+  (void)mng->Replace(matmul, unpad_cnode);
 }
 
 // Update matmul's Abatract and BuildInfo as M or N is changed
@@ -239,13 +239,7 @@ bool InsertPadUnpad(const FuncGraphPtr &func_graph) {
     if (!AnfAlgo::CheckPrimitiveType(n, prim::kPrimMatMul)) continue;
     auto mm_cnode = n->cast<CNodePtr>();
     vec pad_shape_a, pad_shape_b, tail_shape_a, tail_shape_b, tail_shape_unpad, unpad_shape;
-    bool pad_K, pad_M, pad_N;
-    pad_shape_a.clear();
-    pad_shape_b.clear();
-    tail_shape_a.clear();
-    tail_shape_b.clear();
-    tail_shape_unpad.clear();
-    unpad_shape.clear();
+    bool pad_K{false}, pad_M{false}, pad_N{false};
     std::tie(pad_K, pad_M, pad_N) =
       NeedPad(mm_cnode, &pad_shape_a, &pad_shape_b, &unpad_shape, &tail_shape_a, &tail_shape_b, &tail_shape_unpad);
     if (!pad_K && !pad_M && !pad_N) continue;
diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/model/op_node.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/model/op_node.cc
index 0d8a073c0a5..1ec5b2f2b3c 100644
--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/model/op_node.cc
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/model/op_node.cc
@@ -144,7 +144,7 @@ NodePtr PrimOp::InferValue(const NodePtrList &inputs, const DAttrs &attrs, const
   for (auto i : inputs) {
     if (i->NodeType() != NType::Value) return nullptr;
   }
-  TypeId output_type = InferType(inputs, attrs);
+  TypeId output_type = this->type;
   tensor::TensorPtr res = nullptr;
   switch (output_type) {
     case TypeId::kNumberTypeUInt8: {
diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/parallel_cost_model.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/parallel_cost_model.cc
index c969216def4..922fb0705f7 100644
--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/parallel_cost_model.cc
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/parallel_cost_model.cc
@@ -16,8 +16,6 @@
 
 #include "backend/optimizer/graph_kernel/parallel_cost_model.h"
 
-#include <algorithm>
-
 #include "backend/kernel_compiler/akg/akg_kernel_json_generator.h"
 #include "backend/optimizer/graph_kernel/graph_kernel_helper.h"
 #include "pipeline/jit/parse/python_adapter.h"
diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/parallel_fusion.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/parallel_fusion.cc
index 95564759272..cdcb5aedc3c 100644
--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/parallel_fusion.cc
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/parallel_fusion.cc
@@ -16,22 +16,6 @@
 
 #include "backend/optimizer/graph_kernel/parallel_fusion.h"
 
-#include <algorithm>
-#include <cstddef>
-#include <list>
-#include <map>
-#include <memory>
-#include <queue>
-#include <set>
-#include <sstream>
-#include <stack>
-#include <string>
-#include <tuple>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-#include <cstdlib>
-
 #include "backend/optimizer/graph_kernel/graph_kernel_helper.h"
 #include "frontend/operator/ops.h"
 #include "ir/func_graph_cloner.h"
@@ -463,10 +447,10 @@ std::tuple<AnfNodePtrList, std::vector<int>> ParallelOpFusion::GetAvaliableNodes
   if (start >= node_limit) {
     MS_LOG(EXCEPTION) << "Index offset is exceed the limit of given nodes.";
   }
-  AnfNodePtrList target_nodes = {nodes[start]};
+  AnfNodePtrList target_nodes = {nodes[IntToSize(start)]};
   std::vector<int> valid_indices;
   std::vector<size_t> unused;
-  for (size_t i = start; i < used.size(); ++i) {
+  for (size_t i = IntToSize(start); i < used.size(); ++i) {
     if (!used[i] && excludes.count(i) == 0) {
       unused.push_back(i);
     }
@@ -593,7 +577,7 @@ std::tuple<std::vector<bool>, std::vector<ParallelInfo>> ParallelOpFusion::Searc
 
   std::map<AnfNodePtr, int> sorted_indices;
   for (size_t i = 0; i < candidates.size(); ++i) {
-    sorted_indices.emplace(candidates[i], i);
+    (void)sorted_indices.emplace(candidates[i], i);
   }
 
   return DoSearchInSortedCandidates(cs.size(), candidates, &origin_indices, &sorted_indices);
diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/raise_reduction_precision.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/raise_reduction_precision.cc
index eedf0b2810c..11e495f3bab 100644
--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/raise_reduction_precision.cc
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/raise_reduction_precision.cc
@@ -15,11 +15,6 @@
  */
 #include "backend/optimizer/graph_kernel/raise_reduction_precision.h"
 
-#include <vector>
-#include <string>
-#include <algorithm>
-#include <memory>
-
 #include "base/core_ops.h"
 #include "utils/utils.h"
 #include "backend/optimizer/common/helper.h"
diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/reorder_ops.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/reorder_ops.cc
index ef0632984eb..8cbf7d4ba24 100644
--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/reorder_ops.cc
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/reorder_ops.cc
@@ -31,7 +31,7 @@ namespace {
 bool IsTypeInsensitive(const CNodePtr &node) {
   // Nodes that will change the input data type will not seen as type insensitive nodes.
   static std::unordered_set<PrimitivePtr> type_insensitive_op_list{
-    prim::KPrimTransData, prim::kPrimTranspose, prim::kPrimExpandDims, prim::kPrimReshape,
+    prim::kPrimTransData, prim::kPrimTranspose, prim::kPrimExpandDims, prim::kPrimReshape,
     prim::kPrimSqueeze,   prim::kPrimTile,      prim::kPrimNeg,        prim::kPrimRelu,
     prim::kPrimMaximum,   prim::kPrimMinimum,   prim::kPrimSelect};
 
@@ -47,15 +47,12 @@ CastType GetCastType(const CNodePtr &node) {
   }
   TypeId input_type = AnfAlgo::GetInputDeviceDataType(node, 0);
   TypeId output_type = AnfAlgo::GetOutputDeviceDataType(node, 0);
-
   if (input_type == kNumberTypeFloat16 && output_type == kNumberTypeFloat32) {
     return CAST_UP;
   }
-
   if (input_type == kNumberTypeFloat32 && output_type == kNumberTypeFloat16) {
     return CAST_DOWN;
   }
-
   return CAST_OTHER;
 }
 
diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/shape_ops_splitter.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/shape_ops_splitter.cc
index 6edb851121f..a81e97c4201 100644
--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/shape_ops_splitter.cc
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/shape_ops_splitter.cc
@@ -64,7 +64,7 @@ void SplitNode(const AnfNodePtr &node, const FuncGraphManagerPtr &mng) {
     split_nodes.push_back(CloneCNode(node));
   }
 
-  int i = 0;
+  size_t i = 0;
   for (auto [user, indices] : users_info) {
     auto user_node = user->cast<CNodePtr>();
     MS_EXCEPTION_IF_NULL(user_node);
diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/split_umonad.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/split_umonad.cc
index e0bdbc4ceda..3ea1c87fb12 100644
--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/split_umonad.cc
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/split_umonad.cc
@@ -37,7 +37,7 @@ const BaseRef SplitAssign::DefinePattern() const {
 
 bool CanSplit(const AnfNodePtr &node) { return IsPrimitiveCNode(node, prim::kPrimAssign); }
 
-AnfNodePtr ProcessNode(const FuncGraphPtr &func_graph, const AnfNodePtr &node, int input_idx) {
+AnfNodePtr ProcessNode(const FuncGraphPtr &func_graph, const AnfNodePtr &node, size_t input_idx) {
   MS_EXCEPTION_IF_NULL(node);
   CNodePtr cnode = node->cast<CNodePtr>();
   MS_EXCEPTION_IF_NULL(cnode);
@@ -46,16 +46,14 @@ AnfNodePtr ProcessNode(const FuncGraphPtr &func_graph, const AnfNodePtr &node, i
   AbstractBasePtr original_abstract = cnode->abstract()->Clone();
   auto original_inputs = cnode->inputs();
 
-  int input_node_size = cnode->size() - 1;
   // Create depend node
-  AnfNodePtrList depend_inputs = {NewValueNode(prim::kPrimDepend), original_inputs[input_idx],
-                                  original_inputs[input_node_size]};
+  AnfNodePtrList depend_inputs = {NewValueNode(prim::kPrimDepend), original_inputs[input_idx], original_inputs.back()};
   auto depend_cnode = func_graph->NewCNode(depend_inputs);
   depend_cnode->set_abstract(original_inputs[input_idx]->abstract());
   depend_cnode->set_kernel_info(std::make_shared<device::KernelInfo>());
   // Create new node, delete U from inputs.
   AnfNodePtrList new_inputs = {cnode->input(0)};
-  for (int i = 1; i < input_node_size; i++) {
+  for (size_t i = 1; i + 1 < cnode->size(); i++) {
     if (i == input_idx) {
       new_inputs.push_back(depend_cnode);
     } else {
@@ -77,21 +75,12 @@ const AnfNodePtr SplitAssign::Process(const FuncGraphPtr &func_graph, const AnfN
 AnfNodePtr OpUMonadExpander::Run(const AnfNodePtr &node) {
   auto cnode = node->cast<CNodePtr>();
   MS_EXCEPTION_IF_NULL(cnode);
-
-  bool has_umonad = false;
-  for (unsigned int i = 1; i < cnode->size(); i++) {
-    if (HasAbstractUMonad(cnode->input(i))) {
-      has_umonad = true;
-      break;
-    }
-  }
-  if (has_umonad) {
+  // assume the UMonad node is the last input
+  if (cnode->size() > 1 && HasAbstractUMonad(cnode->inputs().back())) {
     auto new_node = ProcessNode(node->func_graph(), node, input_idx_);
     return DefaultExpander::Run(new_node);
   }
-
   return DefaultExpander::Run(node);
 }
-
 }  // namespace opt
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/split_umonad.h b/mindspore/ccsrc/backend/optimizer/graph_kernel/split_umonad.h
index f6d73e3797c..509049b03b6 100644
--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/split_umonad.h
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/split_umonad.h
@@ -30,12 +30,12 @@ class SplitAssign : public PatternProcessPass {
 
 class OpUMonadExpander : public DefaultExpander {
  public:
-  explicit OpUMonadExpander(int input_idx) : input_idx_(input_idx) {}
+  explicit OpUMonadExpander(size_t input_idx) : input_idx_(input_idx) {}
   ~OpUMonadExpander() = default;
   AnfNodePtr Run(const AnfNodePtr &node) override;
 
  private:
-  int input_idx_;
+  size_t input_idx_;
 };
 }  // namespace opt
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/update_state_formatter.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/update_state_formatter.cc
index 4fe79033ac2..dc2d8f1dcce 100644
--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/update_state_formatter.cc
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/update_state_formatter.cc
@@ -95,7 +95,7 @@ bool SpreadUpdateState::Run(const FuncGraphPtr &func_graph) {
       // Create a new UpdateState
       auto new_node = func_graph->NewCNode(node_inputs);
       new_node->set_abstract(node->abstract());
-      mng->Replace(node, new_node);
+      (void)mng->Replace(node, new_node);
       changed = true;
     }
   }
@@ -124,7 +124,7 @@ bool ShrinkUpdateState::Run(const FuncGraphPtr &func_graph) {
     auto new_node = func_graph->NewCNode(inputs);
     new_node->set_abstract(node->abstract());
     new_node->set_kernel_info(std::make_shared<device::KernelInfo>());
-    mng->Replace(node, new_node);
+    (void)mng->Replace(node, new_node);
     changed = true;
   }
   return changed;
diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/value_graph_binder.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/value_graph_binder.cc
index ef3e5bfa627..ff282850c86 100644
--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/value_graph_binder.cc
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/value_graph_binder.cc
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 #include "backend/optimizer/graph_kernel/value_graph_binder.h"
-#include <unordered_set>
+
 #include "frontend/optimizer/irpass.h"
 #include "backend/session/anf_runtime_algorithm.h"
 #include "backend/kernel_compiler/common_utils.h"
diff --git a/mindspore/ccsrc/backend/optimizer/pass/communication_op_fusion.cc b/mindspore/ccsrc/backend/optimizer/pass/communication_op_fusion.cc
index 06b7edafb68..6299a0204d9 100644
--- a/mindspore/ccsrc/backend/optimizer/pass/communication_op_fusion.cc
+++ b/mindspore/ccsrc/backend/optimizer/pass/communication_op_fusion.cc
@@ -32,6 +32,7 @@ namespace opt {
 namespace {
 constexpr auto kAttrDefaultGroup = "default_group";
 constexpr auto kAttrDefaultOp = "default_op";
+constexpr size_t kAlignSize = 2 << 9;
 
 kernel::KernelBuildInfoPtr GenerateKernelBuildInfo(const CommunicationOpInfo &communication_op_info, size_t start_index,
                                                    size_t end_index) {
@@ -139,6 +140,15 @@ bool CommunicationOpFusion::GetSplitSegments(const CommunicationOpInfo &communic
   size_t communication_op_node_size = communication_op_info.communication_op_nodes.size();
   MS_LOG(INFO) << "graph " << op_name_ << " node size " << communication_op_node_size;
 
+  if (op_name_ == kHcomSendOpName || op_name_ == kReceiveOpName) {
+    *segment_num = 1;
+    if (communication_op_node_size == 0) {
+      return false;
+    }
+    segment_index->emplace_back(communication_op_node_size - 1);
+    return true;
+  }
+
   auto parallel_context = parallel::ParallelContext::GetInstance();
   MS_EXCEPTION_IF_NULL(parallel_context);
   std::vector<uint32_t> split_indices;
@@ -155,8 +165,8 @@ bool CommunicationOpFusion::GetSplitSegments(const CommunicationOpInfo &communic
         MS_LOG(EXCEPTION) << "invalid " << op_name_ << " split index " << i << " " << index;
       }
       if (index >= communication_op_node_size) {
-        MS_LOG(WARNING) << op_name_ << "'s split index " << index << " is large than total gradient's number "
-                        << communication_op_node_size;
+        MS_LOG(WARNING) << op_name_ << "'s split index " << index
+                        << " is Greater than or equal to total gradient's number " << communication_op_node_size;
         continue;
       }
       segment_index->push_back(index);
@@ -329,6 +339,7 @@ AnfNodePtr CommunicationOpFusion::CreateFusedCommunicationOp(const FuncGraphPtr
   size_t output_num = node_num * rank_size_t;
   std::vector<TypeId> dtypes(output_num, AnfAlgo::GetOutputInferDataType(final_node, 0));
   std::vector<std::vector<size_t>> shapes;
+  int64_t fusion_total_size = 0;
   for (size_t i = 0; i < rank_size_t; ++i) {
     for (size_t idx = start_index; idx <= end_index; ++idx) {
       auto input_node = communication_op_info.communication_op_nodes[idx];
@@ -338,16 +349,27 @@ AnfNodePtr CommunicationOpFusion::CreateFusedCommunicationOp(const FuncGraphPtr
         shape[0] /= rank_size_t;
       }
       shapes.push_back(shape);
+      size_t tensor_size = AnfAlgo::GetOutputTensorMemSize(input_node, 0);
+      TypeId output_type = AnfAlgo::GetOutputDeviceDataType(input_node, 0);
+      size_t type_size = GetTypeByte(TypeIdToType(output_type));
+      tensor_size = (tensor_size / kAlignSize + 1) * kAlignSize / type_size;
+      fusion_total_size += static_cast<int64_t>(tensor_size);
     }
   }
   AnfAlgo::SetOutputInferTypeAndShape(dtypes, shapes, fused_node.get());
   auto kernel_build_info = GenerateKernelBuildInfo(communication_op_info, start_index, end_index);
   AnfAlgo::SetSelectKernelBuildInfo(kernel_build_info, fused_node.get());
-  AnfAlgo::CopyNodeAttr(kAttrFusion, final_node, fused_node);
-  AnfAlgo::CopyNodeAttr(kAttrOp, final_node, fused_node);
-  AnfAlgo::CopyNodeAttr(kAttrGroup, final_node, fused_node);
-  if (AnfAlgo::HasNodeAttr(kAttrRankSize, final_node)) {
-    AnfAlgo::CopyNodeAttr(kAttrRankSize, final_node, fused_node);
+  const std::vector<std::string> kHcclFusionAttrs = {kAttrFusion, kAttrGroup,    kAttrGroupBack,
+                                                     kAttrSrTag,  kAttrDestRank, kAttrSrcRank,
+                                                     kAttrDType,  kAttrOp,       kAttrRankSize};
+  for (const auto &attr : kHcclFusionAttrs) {
+    if (AnfAlgo::HasNodeAttr(attr, final_node)) {
+      AnfAlgo::CopyNodeAttr(attr, final_node, fused_node);
+    }
+  }
+  if (AnfAlgo::HasNodeAttr(kAttrShape, final_node)) {
+    std::vector<int64_t> fusion_total_shape{fusion_total_size};
+    AnfAlgo::SetNodeAttr(kAttrShape, MakeValue(fusion_total_shape), fused_node);
   }
   return fused_node;
 }
diff --git a/mindspore/ccsrc/backend/optimizer/pass/communication_op_fusion.h b/mindspore/ccsrc/backend/optimizer/pass/communication_op_fusion.h
index 446b214c1f7..1e7c902c9e1 100644
--- a/mindspore/ccsrc/backend/optimizer/pass/communication_op_fusion.h
+++ b/mindspore/ccsrc/backend/optimizer/pass/communication_op_fusion.h
@@ -51,6 +51,18 @@ class CommunicationOpFusion : public Pass {
   size_t groups_ = 1;
 };
 
+class SendFusion : public CommunicationOpFusion {
+ public:
+  explicit SendFusion(size_t groups = 1) : CommunicationOpFusion("send_fusion", kHcomSendOpName, groups) {}
+  ~SendFusion() override = default;
+};
+
+class RecvFusion : public CommunicationOpFusion {
+ public:
+  explicit RecvFusion(size_t groups = 1) : CommunicationOpFusion("recv_fusion", kReceiveOpName, groups) {}
+  ~RecvFusion() override = default;
+};
+
 class AllReduceFusion : public CommunicationOpFusion {
  public:
   explicit AllReduceFusion(size_t groups = 1) : CommunicationOpFusion("all_reduce_fusion", kAllReduceOpName, groups) {}
diff --git a/mindspore/ccsrc/backend/optimizer/pass/optimize_dependence.cc b/mindspore/ccsrc/backend/optimizer/pass/optimize_dependence.cc
index 4b04d4de543..9192f952384 100644
--- a/mindspore/ccsrc/backend/optimizer/pass/optimize_dependence.cc
+++ b/mindspore/ccsrc/backend/optimizer/pass/optimize_dependence.cc
@@ -166,7 +166,7 @@ std::vector<size_t> SearchTransDataAndCast(const CNodePtr &cnode) {
   for (size_t i = 1; i < cnode->size(); ++i) {
     auto &input = cnode->input(i);
     if (AnfAlgo::CheckPrimitiveType(input, prim::kPrimCast) ||
-        AnfAlgo::CheckPrimitiveType(input, prim::KPrimTransData) ||
+        AnfAlgo::CheckPrimitiveType(input, prim::kPrimTransData) ||
         AnfAlgo::CheckPrimitiveType(input, prim::kPrimMakeTuple)) {
       result.emplace_back(i);
     }
diff --git a/mindspore/ccsrc/backend/optimizer/somas/somas.cc b/mindspore/ccsrc/backend/optimizer/somas/somas.cc
index 018bbda8d62..f64c2966fbc 100644
--- a/mindspore/ccsrc/backend/optimizer/somas/somas.cc
+++ b/mindspore/ccsrc/backend/optimizer/somas/somas.cc
@@ -124,8 +124,8 @@ bool Somas::LoadSomasCache(const session::KernelGraph *graph) {
 
   bool ret = CalcSomasModelHash(graph);
   if (ret) {
-    std::string filename =
-      save_graphs_path_ + "/somas_meta/" + "somas_graph" + std::to_string(graph->graph_id()) + "_" + hash_id_ + ".json";
+    std::string filename = GetSaveGraphsPathName(
+      "/somas_meta/somas_graph" + std::to_string(graph->graph_id()) + "_" + hash_id_ + ".json", save_graphs_path_);
     ret = LoadSomasResult(graph, filename);
     if (ret) {
       MS_LOG(INFO) << "Load Somas Cache file " << filename << " Successfully.";
@@ -141,8 +141,8 @@ bool Somas::CalcSomasModelHash(const session::KernelGraph *graph) {
   auto model_str = SomasInfo(true);
   hash_id_ = std::to_string(std::hash<std::string>()(model_str));
   MS_LOG(INFO) << "Graph " << graph->graph_id() << "'s SOMAS Model hash id is " << hash_id_;
-  std::string filename =
-    save_graphs_path_ + "/somas_meta/" + "somas_graph" + std::to_string(graph->graph_id()) + "_" + hash_id_ + ".info";
+  std::string filename = GetSaveGraphsPathName(
+    "/somas_meta/somas_graph" + std::to_string(graph->graph_id()) + "_" + hash_id_ + ".info", save_graphs_path_);
   return Common::SaveStringToFile(filename, model_str);
 }
 
@@ -178,8 +178,8 @@ bool Somas::SaveSomasResult(const session::KernelGraph *graph) {
   }
   somas_json[kTensors] = tensors_json;
 
-  std::string filename =
-    save_graphs_path_ + "/somas_meta/" + "somas_graph" + std::to_string(graph->graph_id()) + "_" + hash_id_ + ".json";
+  std::string filename = GetSaveGraphsPathName(
+    "/somas_meta/somas_graph" + std::to_string(graph->graph_id()) + "_" + hash_id_ + ".json", save_graphs_path_);
   (void)Common::SaveStringToFile(filename, somas_json.dump());
   return true;
 }
@@ -364,12 +364,12 @@ bool Somas::InitSomasTensors(const session::KernelGraph *graph) {
 #endif
 
   if (save_graphs_) {
-    std::string file_path =
-      save_graphs_path_ + "/" + "somas_pre_processed_info_" + std::to_string(graph->graph_id()) + ".ir";
+    std::string file_path = GetSaveGraphsPathName(
+      "/somas_pre_processed_info_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path_);
     DumpSomasInfoIR(file_path);
 
     std::string offline_file_path =
-      save_graphs_path_ + "/" + "somas_offline_log_" + std::to_string(graph->graph_id()) + ".ir";
+      GetSaveGraphsPathName("/somas_offline_log_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path_);
     DumpOfflineIR(offline_file_path);
   }
 
@@ -687,7 +687,8 @@ void Somas::InitBasicInfo(const session::KernelGraph *graph) {
     save_graphs_path_ = ".";
   }
   if (save_graphs_) {
-    std::string file_path = save_graphs_path_ + "/" + "somas_initial_info_" + std::to_string(graph->graph_id()) + ".ir";
+    std::string file_path =
+      GetSaveGraphsPathName("/somas_initial_info_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path_);
     DumpSomasInfoIR(file_path);
   }
 }
diff --git a/mindspore/ccsrc/backend/optimizer/somas/somas_solver_pre.cc b/mindspore/ccsrc/backend/optimizer/somas/somas_solver_pre.cc
index 9b558d1f18d..6706ed99b11 100644
--- a/mindspore/ccsrc/backend/optimizer/somas/somas_solver_pre.cc
+++ b/mindspore/ccsrc/backend/optimizer/somas/somas_solver_pre.cc
@@ -212,7 +212,8 @@ void SomasSolverPre::TensorRelationLog(const std::vector<DynamicBitSet> *pConstr
   auto context_ptr = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(context_ptr);
   auto save_graphs_path = context_ptr->get_param<std::string>(MS_CTX_SAVE_GRAPHS_PATH);
-  std::string filename = save_graphs_path + "/" + "somas_tensor_relation_" + std::to_string(graph->graph_id()) + ".ir";
+  std::string filename =
+    GetSaveGraphsPathName("somas_tensor_relation_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path);
   std::ostringstream oss;
   for (size_t tid1 = 0; tid1 < pConstraints->size(); tid1++) {
     oss << 't' << tid1 << ' ';
@@ -232,7 +233,8 @@ void SomasSolverPre::SolverInputLog(const session::KernelGraph *graph, const Ten
   auto context_ptr = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(context_ptr);
   auto save_graphs_path = context_ptr->get_param<std::string>(MS_CTX_SAVE_GRAPHS_PATH);
-  std::string filename = save_graphs_path + "/" + "somas_solver_input_" + std::to_string(graph->graph_id()) + ".ir";
+  std::string filename =
+    GetSaveGraphsPathName("somas_solver_input_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path);
   std::ostringstream oss;
   for (auto &t : tensors) {
     oss << "T " << t.second->index_ << " " << t.second->size_ << " " << t.second->lifelong_ << std::endl;
@@ -264,7 +266,7 @@ void SomasSolverPre::SolverOutputLog(const session::KernelGraph *graph, const Te
   MS_EXCEPTION_IF_NULL(context_ptr);
   auto save_graphs_path = context_ptr->get_param<std::string>(MS_CTX_SAVE_GRAPHS_PATH);
   std::string out_filename =
-    save_graphs_path + "/" + "somas_solver_output_" + std::to_string(graph->graph_id()) + ".ir";
+    GetSaveGraphsPathName("somas_solver_output_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path);
   std::ostringstream oss;
   constexpr size_t contiguous_left = 1;
   constexpr size_t contiguous_mid = 2;
diff --git a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc
index 25d38c9bd0d..5d73de4d342 100644
--- a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc
+++ b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc
@@ -203,6 +203,9 @@ KernelWithIndex AnfRuntimeAlgorithm::VisitKernel(const AnfNodePtr &anf_node, siz
     auto input0 = cnode->input(0);
     MS_EXCEPTION_IF_NULL(input0);
     if (IsPrimitive(input0, prim::kPrimMakeTuple)) {
+      if (AnfAlgo::GetInputTensorNum(cnode) == 0) {
+        return std::make_pair(nullptr, 0);
+      }
       auto node = cnode->input(index + IntToSize(1));
       MS_EXCEPTION_IF_NULL(node);
       return VisitKernel(node, 0);
@@ -1723,7 +1726,7 @@ void AnfRuntimeAlgorithm::ReorderOptimizerExecList(NotNull<std::vector<CNodePtr>
 
     auto trans_data_func = [&](const CNodePtr &node) -> bool {
       MS_EXCEPTION_IF_NULL(node);
-      if (AnfAlgo::GetCNodeName(node) == prim::KPrimTransData->name()) {
+      if (AnfAlgo::GetCNodeName(node) == prim::kPrimTransData->name()) {
         auto kernel_index = AnfAlgo::VisitKernelWithReturnType(AnfAlgo::GetInputNode(node, 0), 0);
         MS_EXCEPTION_IF_NULL(kernel_index.first);
         if (kernel_index.first->isa<CNode>() && kOptOperatorSet.find(AnfAlgo::GetCNodeName(
@@ -2237,5 +2240,137 @@ bool AnfRuntimeAlgorithm::IsNodeInputContainMonad(const AnfNodePtr &node) {
   }
   return false;
 }
+
+void AnfRuntimeAlgorithm::CacheAddrForGraph(const KernelGraphPtr &kernel_graph) {
+  MS_EXCEPTION_IF_NULL(kernel_graph);
+  auto nodes = kernel_graph->execution_order();
+  for (auto &kernel : nodes) {
+    // Skip transpose kernel with "nop_op" attr which is not hidden or removed in PyNative infer scenario. Transpose
+    // kernel, which is not supposed to be executed, is generated in TransDataSplit to support specific Transdata.
+    // And hard code here should be removed after new Transdata programme is implemented in the foreseeable future.
+    if (HasNodeAttr("nop_op", kernel)) {
+      for (size_t idx = 0; idx < GetOutputTensorNum(kernel); idx += 1) {
+        auto real_input = GetRealInputIndex(kernel, idx);
+        auto device_address = GetPrevNodeMutableOutputAddr(kernel, real_input);
+        SetOutputAddr(device_address, idx, kernel.get());
+      }
+      continue;
+    }
+    auto kernel_mod = GetKernelMod(kernel);
+    MS_EXCEPTION_IF_NULL(kernel_mod);
+    if (GetCNodeName(kernel) == kAtomicAddrCleanOpName) {
+      CacheAddrForAtomicClean(kernel, kernel_mod);
+      continue;
+    }
+    CacheAddrForKernel(kernel, kernel_mod);
+  }
+}
+
+void AnfRuntimeAlgorithm::CacheAddrForKernel(const AnfNodePtr &node, kernel::KernelMod *kernel_mod) {
+  MS_EXCEPTION_IF_NULL(node);
+  MS_EXCEPTION_IF_NULL(kernel_mod);
+  std::vector<AddressPtr> kernel_inputs;
+  std::vector<AddressPtr> kernel_workspaces;
+  std::vector<AddressPtr> kernel_outputs;
+  auto cnode = node->cast<CNodePtr>();
+  MS_EXCEPTION_IF_NULL(cnode);
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  auto visit_nop_node = (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) != kPynativeMode);
+  size_t input_num = GetInputTensorNum(node);
+  for (size_t i = 0; i < input_num; ++i) {
+    auto op_name = GetCNodeName(cnode);
+    constexpr auto none_placeholder_index = 3;
+    if (op_name == kDynamicRNNOpName && i == none_placeholder_index) {
+      continue;
+    }
+    if (op_name == kDynamicGRUV2OpName) {
+      auto none_index = GetNodeAttr<std::vector<int64_t>>(cnode, "placeholder_index");
+      auto item = std::find(none_index.begin(), none_index.end(), i);
+      if (item != none_index.end()) {
+        continue;
+      }
+    }
+    auto real_input = GetRealInputIndex(node, i);
+    auto device_address = GetPrevNodeOutputAddr(node, real_input, visit_nop_node);
+    MS_EXCEPTION_IF_NULL(device_address);
+    kernel::AddressPtr input = std::make_shared<kernel::Address>();
+    MS_EXCEPTION_IF_NULL(input);
+    input->addr = const_cast<void *>(device_address->GetPtr());
+    MS_EXCEPTION_IF_NULL(input->addr);
+    input->size = device_address->GetSize();
+    kernel_inputs.emplace_back(input);
+  }
+  for (size_t i = 0; i < kernel_mod->GetOutputSizeList().size(); ++i) {
+    auto device_address = GetOutputAddr(node, i, visit_nop_node);
+    kernel::AddressPtr output = std::make_shared<kernel::Address>();
+    MS_EXCEPTION_IF_NULL(output);
+    output->addr = const_cast<void *>(device_address->GetPtr());
+    MS_EXCEPTION_IF_NULL(output->addr);
+    output->size = device_address->GetSize();
+    kernel_outputs.emplace_back(output);
+  }
+  for (size_t i = 0; i < kernel_mod->GetWorkspaceSizeList().size(); ++i) {
+    auto device_address = GetWorkspaceAddr(node, i);
+    kernel::AddressPtr workspace = std::make_shared<kernel::Address>();
+    MS_EXCEPTION_IF_NULL(workspace);
+    workspace->addr = const_cast<void *>(device_address->GetPtr());
+    MS_EXCEPTION_IF_NULL(workspace->addr);
+    workspace->size = device_address->GetSize();
+    kernel_workspaces.emplace_back(workspace);
+  }
+  kernel_mod->set_inputs_addr(kernel_inputs);
+  kernel_mod->set_workspaces_addr(kernel_workspaces);
+  kernel_mod->set_outputs_addr(kernel_outputs);
+}
+
+void AnfRuntimeAlgorithm::CacheAddrForAtomicClean(const AnfNodePtr &node, kernel::KernelMod *kernel_mod) {
+  MS_EXCEPTION_IF_NULL(node);
+  MS_EXCEPTION_IF_NULL(kernel_mod);
+  std::vector<AddressPtr> kernel_inputs;
+  auto cnode = node->cast<CNodePtr>();
+  MS_EXCEPTION_IF_NULL(cnode);
+  if (cnode->inputs().size() != 2) {
+    MS_LOG(EXCEPTION) << "Atomic Addr clean Node Input nodes not equal 2.";
+  }
+  MS_EXCEPTION_IF_NULL(cnode->inputs()[1]);
+  auto pre_node = (cnode->inputs()[1])->cast<CNodePtr>();
+  // set clean output address
+  if (HasNodeAttr(kAttrAtomicOutputIndexs, pre_node)) {
+#if defined(__APPLE__)
+    auto clean_output_indexes = GetNodeAttr<std::vector<int>>(pre_node, kAttrAtomicOutputIndexs);
+#else
+    auto clean_output_indexes = GetNodeAttr<std::vector<size_t>>(pre_node, kAttrAtomicOutputIndexs);
+#endif
+    for (auto index : clean_output_indexes) {
+      auto device_address = GetOutputAddr(pre_node, index);
+      kernel::AddressPtr input = std::make_shared<kernel::Address>();
+      MS_EXCEPTION_IF_NULL(input);
+      input->addr = const_cast<void *>(device_address->GetPtr());
+      MS_EXCEPTION_IF_NULL(input->addr);
+      input->size = device_address->GetSize();
+      kernel_inputs.emplace_back(input);
+    }
+    MS_LOG(DEBUG) << "AtomicAddClean clean output size:" << clean_output_indexes.size();
+  }
+  // set clean workspace address
+  if (HasNodeAttr(kAttrAtomicWorkspaceIndexs, pre_node)) {
+#if defined(__APPLE__)
+    auto clean_workspaces_indexes = GetNodeAttr<std::vector<int>>(pre_node, kAttrAtomicWorkspaceIndexs);
+#else
+    auto clean_workspaces_indexes = GetNodeAttr<std::vector<size_t>>(pre_node, kAttrAtomicWorkspaceIndexs);
+#endif
+    for (const auto &index : clean_workspaces_indexes) {
+      auto device_address = GetWorkspaceAddr(pre_node, index);
+      kernel::AddressPtr workspace = std::make_shared<kernel::Address>();
+      MS_EXCEPTION_IF_NULL(workspace);
+      workspace->addr = const_cast<void *>(device_address->GetPtr());
+      MS_EXCEPTION_IF_NULL(workspace->addr);
+      workspace->size = device_address->GetSize();
+      kernel_inputs.emplace_back(workspace);
+    }
+  }
+  kernel_mod->set_inputs_addr(kernel_inputs);
+}
 }  // namespace session
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h
index f75d7232828..1117534a6e8 100644
--- a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h
+++ b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h
@@ -43,7 +43,8 @@ using PrimitiveSet = std::unordered_set<PrimitivePtr, PrimitiveHasher, Primitive
 using AnfVisitFuncion = std::function<Any(const AnfNodePtr &node, int index)>;
 using DeviceAddress = device::DeviceAddress;
 using DeviceAddressPtr = device::DeviceAddressPtr;
-
+using Address = kernel::Address;
+using AddressPtr = kernel::AddressPtr;
 using KernelWithIndex = std::pair<AnfNodePtr, size_t>;
 struct KernelWithIndexCmp {
   bool operator()(const KernelWithIndex &key1, const KernelWithIndex &key2) const {
@@ -308,6 +309,10 @@ class AnfRuntimeAlgorithm {
   static bool IsControlOpExecInBackend(const AnfNodePtr &node);
 
   static bool IsNodeInputContainMonad(const AnfNodePtr &node);
+  // Save inputs/outputs/workspace address in kernel_mod.
+  static void CacheAddrForGraph(const KernelGraphPtr &kernel_graph);
+  static void CacheAddrForKernel(const AnfNodePtr &node, kernel::KernelMod *kernel_mod);
+  static void CacheAddrForAtomicClean(const AnfNodePtr &node, kernel::KernelMod *kernel_mod);
 };
 }  // namespace session
 using AnfAlgo = session::AnfRuntimeAlgorithm;
diff --git a/mindspore/ccsrc/backend/session/ascend_auto_monad.cc b/mindspore/ccsrc/backend/session/ascend_auto_monad.cc
index d723b21abbc..94810f76c93 100644
--- a/mindspore/ccsrc/backend/session/ascend_auto_monad.cc
+++ b/mindspore/ccsrc/backend/session/ascend_auto_monad.cc
@@ -91,7 +91,7 @@ void DumpExecuteOrder(NotNull<KernelGraphPtr> kg) {
     return;
   }
   std::string filename = "ascend_execute_order_" + std::to_string(kg->graph_id()) + ".dat";
-  auto filepath = pipeline::GetSaveGraphsPathName(filename);
+  auto filepath = GetSaveGraphsPathName(filename);
   if (filepath.size() >= PATH_MAX) {
     MS_LOG(ERROR) << "File path: " << filepath << " is too long.";
     return;
@@ -1735,7 +1735,7 @@ class ExecuteOrderGenerator {
                            return {p.first.first, {p.first.second, p.second.first, p.second.second}};
                          });
     auto validate_ref_parameter = [](AnfNodePtr node) -> AnfNodePtr {
-      if (node->isa<CNode>() && AnfAlgo::CheckPrimitiveType(node, prim::KPrimTransData)) {
+      if (node->isa<CNode>() && AnfAlgo::CheckPrimitiveType(node, prim::kPrimTransData)) {
         auto cnode = node->cast<CNodePtr>();
         MS_EXCEPTION_IF_NULL(cnode);
         auto first_input = cnode->input(kFirstDataInputIndex);
diff --git a/mindspore/ccsrc/backend/session/ascend_session.cc b/mindspore/ccsrc/backend/session/ascend_session.cc
index 5504bd5537a..491634a623d 100644
--- a/mindspore/ccsrc/backend/session/ascend_session.cc
+++ b/mindspore/ccsrc/backend/session/ascend_session.cc
@@ -381,7 +381,7 @@ void AscendSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_gra
         MS_LOG(EXCEPTION) << "SyncHostToDevice failed.";
       }
       if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode ||
-          AnfAlgo::IsParameterWeight(input_param)) {
+          AnfAlgo::IsParameterWeight(input_param) || kernel_graph->IsUpdatedParameter(input_param)) {
         tensor->set_device_address(device_address);
       }
       if (kernel_graph->IsUpdatedParameter(input_param)) {
@@ -523,30 +523,14 @@ void AscendSession::BuildGraphImpl(GraphId graph_id) {
   InitRuntimeResource();
   // multiple graph handle
   if (graph_id == final_graph_id_) {
-    if (!graph->executable()) {
-      return;
-    }
-    SetFinalGraphSummaryFlag(graph);
-    // OptChildGraphs
-    auto graph_order = GetGraphOrder(final_graph_id_);
-    auto &graph_type = GetGraphOrderType(final_graph_id_);
-    for (size_t i = 0; i < graph_order.size(); i++) {
-      if (!(graph_type[i] == BRANCH_END || graph_type[i] == BRANCH_START)) {
-        auto child_graph = GetGraph(graph_order[i]);
-        CompileChildGraph(child_graph);
-      }
-    }
-    SetSummaryNodes(graph.get());
-    // merge child graph
-    MergeGraphExecOrder();
-  } else {
-    auto single_graph = GetGraph(graph_id);
-    MS_EXCEPTION_IF_NULL(single_graph);
-    CompileChildGraph(single_graph);
-    // set the distinction label of single graph
-    single_graph->set_stream_distinction_label(graph_id);
-    single_graph->UpdateExecuteKernelStreamLabel();
+    MS_LOG(EXCEPTION) << "Unexpected graph id:" << graph_id << ", final_graph_id_:" << final_graph_id_;
   }
+  auto single_graph = GetGraph(graph_id);
+  MS_EXCEPTION_IF_NULL(single_graph);
+  CompileChildGraph(single_graph);
+  // set the distinction label of single graph
+  single_graph->set_stream_distinction_label(graph_id);
+  single_graph->UpdateExecuteKernelStreamLabel();
   // adjust execution order because  merge child graph and other special operations
   AdjustKernel(graph);
 #if ENABLE_CPU && ENABLE_D
@@ -568,6 +552,7 @@ void AscendSession::BuildGraphImpl(GraphId graph_id) {
   } else {
     // alloc memory, including static memory and dynamic memory
     MemoryAlloc(graph.get());
+    AnfAlgo::CacheAddrForGraph(graph);
     // generate and load task info to device if it is sink mode
     Load(graph);
   }
@@ -643,15 +628,12 @@ void AscendSession::RunOpHardwareOptimize(const std::shared_ptr<session::KernelG
   MS_LOG(INFO) << "HardwareOptimize Finish";
 }
 
-bool AscendSession::GraphCacheExist(const GraphInfo &graph_info) const {
-  return run_op_graphs_.find(graph_info) != run_op_graphs_.end();
-}
-
-void AscendSession::BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
-                                const std::vector<tensor::TensorPtr> &input_tensors,
-                                const std::vector<int64_t> &tensors_mask) {
-  if (GraphCacheExist(graph_info)) {
-    return;
+KernelGraphPtr AscendSession::BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
+                                          const std::vector<tensor::TensorPtr> &input_tensors,
+                                          const std::vector<int64_t> &tensors_mask) {
+  auto it = run_op_graphs_.find(graph_info);
+  if (it != run_op_graphs_.end()) {
+    return it->second;
   }
 
   const auto &graph = PreBuildOp(op_run_info, input_tensors, tensors_mask);
@@ -661,7 +643,11 @@ void AscendSession::BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &g
   // build kernel
   RunOpAdjustKernel(graph);
   BuildKernel(graph);
-  run_op_graphs_[graph_info] = graph;
+  auto enable_op_graph_cache = MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE);
+  if (enable_op_graph_cache) {
+    run_op_graphs_[graph_info] = graph;
+  }
+  return graph;
 }
 
 void AscendSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info,
@@ -669,7 +655,7 @@ void AscendSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_inf
                               const std::vector<int64_t> &tensors_mask) {
   MS_EXCEPTION_IF_NULL(input_tensors);
   MS_EXCEPTION_IF_NULL(op_run_info);
-  BuildOpImpl(*op_run_info, graph_info, *input_tensors, tensors_mask);
+  const auto &graph = BuildOpImpl(*op_run_info, graph_info, *input_tensors, tensors_mask);
   EraseValueNodeTensor(tensors_mask, input_tensors);
 
   // wait for allreduce
@@ -678,13 +664,11 @@ void AscendSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_inf
       tensor->WaitDevice();
     }
   }
-  // Run op
-  auto graph = run_op_graphs_[graph_info];
-  MS_EXCEPTION_IF_NULL(graph);
   // malloc mem
   RunOpRemoveNopNode(graph);
   RunOpMemoryAlloc(*input_tensors, graph.get());
   RunOpGenKernelEvent(graph.get());
+  AnfAlgo::CacheAddrForGraph(graph);
   // Build dynamic kernel
   if (op_run_info->is_dynamic_shape) {
     BuildDynamicKernel(graph);
@@ -806,7 +790,10 @@ void AscendSession::BuildOpsInGraph(const GraphId &graph_id, const std::map<AnfN
   // Record single op graphs in run_op_graphs_ so that these graphs can be reused in BuildOpImpl
   for (const auto &graph_item : single_op_graphs) {
     RunOpMemoryClear(graph_item.first.get());
-    run_op_graphs_[graph_item.second] = graph_item.first;
+    auto enable_op_graph_cache = MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE);
+    if (enable_op_graph_cache) {
+      run_op_graphs_[graph_item.second] = graph_item.first;
+    }
     MS_LOG(DEBUG) << "Pre build op finished, graph info: " << graph_item.second;
   }
   built_graph_id_.insert(graph_id);
@@ -863,9 +850,10 @@ void AscendSession::InitRuntimeResource() {
   if (!runtime_instance->Init()) {
     MS_LOG(EXCEPTION) << "Kernel runtime init error.";
   }
-  auto env_table_file = common::GetEnv("RANK_TABLE_FILE");
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
   auto env_rank_id = common::GetEnv("RANK_ID");
-  if (!(env_table_file.empty() || env_rank_id.empty())) {
+  if (ms_context->get_param<bool>(MS_CTX_ENABLE_HCCL) && !env_rank_id.empty()) {
     // get actual rank id if it's distribution training case.
     rank_id_ = GetRankId();
   }
@@ -1173,12 +1161,8 @@ void AscendSession::DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph)
 void AscendSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
   MS_LOG(DEBUG) << "Start!";
   MS_EXCEPTION_IF_NULL(kernel_graph);
-  bool finish = E2eDump::DumpData(kernel_graph.get(), rank_id_);
-  if (finish) {
-    MS_LOG(DEBUG) << "Finish!";
-  } else {
-    MS_LOG(ERROR) << "Dump Data failed!";
-  }
+  E2eDump::DumpData(kernel_graph.get(), rank_id_);
+  MS_LOG(DEBUG) << "Finish!";
 }
 
 void AscendSession::DumpAllGraphs(const std::vector<KernelGraphPtr> &all_graphs) {
diff --git a/mindspore/ccsrc/backend/session/ascend_session.h b/mindspore/ccsrc/backend/session/ascend_session.h
index 14ba03bb28f..0aeb2c86bc5 100644
--- a/mindspore/ccsrc/backend/session/ascend_session.h
+++ b/mindspore/ccsrc/backend/session/ascend_session.h
@@ -57,9 +57,9 @@ class AscendSession : public SessionBasic {
                         VectorRef *const outputs) override;
   void ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) override;
   void BuildGraphImpl(GraphId) override;
-  void BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
-                   const std::vector<tensor::TensorPtr> &input_tensors,
-                   const std::vector<int64_t> &tensors_mask) override;
+  KernelGraphPtr BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
+                             const std::vector<tensor::TensorPtr> &input_tensors,
+                             const std::vector<int64_t> &tensors_mask) override;
   void RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info, std::vector<tensor::TensorPtr> *input_tensors,
                  VectorRef *outputs, const std::vector<int64_t> &tensors_mask) override;
   void BuildOpsInGraph(const GraphId &graph_id, const std::map<AnfNodePtr, size_t> &parameter_index,
@@ -104,8 +104,6 @@ class AscendSession : public SessionBasic {
   const std::vector<GraphId> &GetGraphOrder(GraphId final_graph_id) const;
   // get graph order type vector by graph id
   const std::vector<GraphType> &GetGraphOrderType(GraphId final_graph_id) const;
-  // check if graph cache exist
-  bool GraphCacheExist(const GraphInfo &graph_info) const;
   // sync initial tensors' data to device
   void SyncInitialTenosrToDevice();
   void SetFinalGraphSummaryFlag(const std::shared_ptr<KernelGraph> &kernel_graph);
diff --git a/mindspore/ccsrc/backend/session/cpu_session.cc b/mindspore/ccsrc/backend/session/cpu_session.cc
index 8e6af7ea6d4..2bbfccaf5a9 100644
--- a/mindspore/ccsrc/backend/session/cpu_session.cc
+++ b/mindspore/ccsrc/backend/session/cpu_session.cc
@@ -212,21 +212,27 @@ void CPUSession::ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph)
   }
 }
 
-void CPUSession::BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
-                             const std::vector<tensor::TensorPtr> &input_tensors,
-                             const std::vector<int64_t> &tensors_mask) {
+KernelGraphPtr CPUSession::BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
+                                       const std::vector<tensor::TensorPtr> &input_tensors,
+                                       const std::vector<int64_t> &tensors_mask) {
   // Check if the graph cache exists.
-  if (run_op_graphs_.find(graph_info) != run_op_graphs_.end()) {
-    return;
+  auto it = run_op_graphs_.find(graph_info);
+  if (it != run_op_graphs_.end()) {
+    return it->second;
   }
+
   // Prepare the graph
-  auto kernel_graph = ConstructSingleOpGraph(op_run_info, input_tensors, tensors_mask);
+  const auto &kernel_graph = ConstructSingleOpGraph(op_run_info, input_tensors, tensors_mask);
   MS_EXCEPTION_IF_NULL(kernel_graph);
   SetKernelInfo(kernel_graph.get());
   Optimize(kernel_graph);
   BuildKernel(kernel_graph.get());
   ProcessCast(kernel_graph);
-  run_op_graphs_[graph_info] = kernel_graph;
+  auto enable_op_graph_cache = MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE);
+  if (enable_op_graph_cache) {
+    run_op_graphs_[graph_info] = kernel_graph;
+  }
+  return kernel_graph;
 }
 
 void CPUSession::SetOutputFlags(const VectorRef &base_ref) {
@@ -260,12 +266,8 @@ void CPUSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info,
                            const std::vector<int64_t> &tensors_mask) {
   MS_EXCEPTION_IF_NULL(input_tensors);
   MS_EXCEPTION_IF_NULL(op_run_info);
-  BuildOpImpl(*op_run_info, graph_info, *input_tensors, tensors_mask);
+  const auto &kernel_graph = BuildOpImpl(*op_run_info, graph_info, *input_tensors, tensors_mask);
   EraseValueNodeTensor(tensors_mask, input_tensors);
-
-  auto kernel_graph = run_op_graphs_[graph_info];
-  MS_EXCEPTION_IF_NULL(kernel_graph);
-
   // Remove reorder after PS feature finish adapting push/pull in auto_monad.
   auto execution_order = kernel_graph->execution_order();
   Reorder(&execution_order);
diff --git a/mindspore/ccsrc/backend/session/cpu_session.h b/mindspore/ccsrc/backend/session/cpu_session.h
index f86250e889c..0fc9eaafe42 100644
--- a/mindspore/ccsrc/backend/session/cpu_session.h
+++ b/mindspore/ccsrc/backend/session/cpu_session.h
@@ -43,9 +43,9 @@ class CPUSession : public SessionBasic {
   void ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) override;
   ParameterPtr CreateNewParameterFromParameter(const AnfNodePtr &anf, KernelGraph *graph) override;
   void Optimize(const std::shared_ptr<KernelGraph> &kernel_graph);
-  void BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
-                   const std::vector<tensor::TensorPtr> &input_tensors,
-                   const std::vector<int64_t> &tensors_mask) override;
+  KernelGraphPtr BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
+                             const std::vector<tensor::TensorPtr> &input_tensors,
+                             const std::vector<int64_t> &tensors_mask) override;
   void RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info, std::vector<tensor::TensorPtr> *input_tensors,
                  VectorRef *outputs, const std::vector<int64_t> &tensors_mask) override;
   void LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc
index 6f4c60987c4..ce7094cfc16 100644
--- a/mindspore/ccsrc/backend/session/gpu_session.cc
+++ b/mindspore/ccsrc/backend/session/gpu_session.cc
@@ -114,12 +114,12 @@ void GPUSession::Init(uint32_t device_id) {
   MS_EXCEPTION_IF_NULL(ms_context);
   ms_context->set_param<uint32_t>(MS_CTX_DEVICE_ID, device_id);
   if (collective_inited) {
-    rank_id_ = GetRankId();
     if (collective_handle_ != nullptr) {
       auto init_nccl_comm_funcptr =
         reinterpret_cast<InitNCCLComm>(dlsym(const_cast<void *>(collective_handle_), "InitNCCLComm"));
       MS_EXCEPTION_IF_NULL(init_nccl_comm_funcptr);
       (*init_nccl_comm_funcptr)();
+      rank_id_ = GetRankId();
     }
   }
 
@@ -601,16 +601,17 @@ void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const
   }
 }
 
-void GPUSession::BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
-                             const std::vector<tensor::TensorPtr> &input_tensors,
-                             const std::vector<int64_t> &tensors_mask) {
+KernelGraphPtr GPUSession::BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
+                                       const std::vector<tensor::TensorPtr> &input_tensors,
+                                       const std::vector<int64_t> &tensors_mask) {
   // Check if the graph cache exists.
-  if (run_op_graphs_.find(graph_info) != run_op_graphs_.end() &&
-      kOpCacheBlackList.find(op_run_info.op_name) == kOpCacheBlackList.end()) {
-    return;
+  auto it = run_op_graphs_.find(graph_info);
+  if (it != run_op_graphs_.end() && kOpCacheBlackList.find(op_run_info.op_name) == kOpCacheBlackList.end()) {
+    return it->second;
   }
+
   // Prepare the graph
-  auto kernel_graph = ConstructSingleOpGraph(op_run_info, input_tensors, tensors_mask);
+  const auto &kernel_graph = ConstructSingleOpGraph(op_run_info, input_tensors, tensors_mask);
   MS_EXCEPTION_IF_NULL(kernel_graph);
   RunOpOptimize(kernel_graph);
   SelectKernel(kernel_graph);
@@ -618,7 +619,11 @@ void GPUSession::BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &grap
   StartKernelRT();
   RunOpHideNopNode(kernel_graph);
   BuildKernel(kernel_graph);
-  run_op_graphs_[graph_info] = kernel_graph;
+  auto enable_op_graph_cache = MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE);
+  if (enable_op_graph_cache) {
+    run_op_graphs_[graph_info] = kernel_graph;
+  }
+  return kernel_graph;
 }
 
 void GPUSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info,
@@ -626,7 +631,7 @@ void GPUSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info,
                            const std::vector<int64_t> &tensors_mask) {
   MS_EXCEPTION_IF_NULL(input_tensors);
   MS_EXCEPTION_IF_NULL(op_run_info);
-  BuildOpImpl(*op_run_info, graph_info, *input_tensors, tensors_mask);
+  const auto &kernel_graph = BuildOpImpl(*op_run_info, graph_info, *input_tensors, tensors_mask);
   EraseValueNodeTensor(tensors_mask, input_tensors);
   // wait for allreduce
   for (auto &tensor : *input_tensors) {
@@ -636,7 +641,6 @@ void GPUSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info,
     }
   }
   // run op
-  auto kernel_graph = run_op_graphs_[graph_info];
   MS_EXCEPTION_IF_NULL(kernel_graph);
   RunOpRemoveNopNode(kernel_graph);
   RunOpAllocateMemory(*input_tensors, kernel_graph.get());
diff --git a/mindspore/ccsrc/backend/session/gpu_session.h b/mindspore/ccsrc/backend/session/gpu_session.h
index 45c04a4808d..c061bb41e79 100644
--- a/mindspore/ccsrc/backend/session/gpu_session.h
+++ b/mindspore/ccsrc/backend/session/gpu_session.h
@@ -45,9 +45,9 @@ class GPUSession : public SessionBasic {
   void PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
                         VectorRef *const outputs) override;
   void ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) override;
-  void BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
-                   const std::vector<tensor::TensorPtr> &input_tensors,
-                   const std::vector<int64_t> &tensors_mask) override;
+  KernelGraphPtr BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
+                             const std::vector<tensor::TensorPtr> &input_tensors,
+                             const std::vector<int64_t> &tensors_mask) override;
   void RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info, std::vector<tensor::TensorPtr> *input_tensors,
                  VectorRef *outputs, const std::vector<int64_t> &tensors_mask) override;
   std::shared_ptr<device::Bucket> CreateBucket(uint32_t bucket_id, uint32_t bucket_size) override;
diff --git a/mindspore/ccsrc/backend/session/kernel_build_client.cc b/mindspore/ccsrc/backend/session/kernel_build_client.cc
index 097c4564a5c..8affa1c0063 100644
--- a/mindspore/ccsrc/backend/session/kernel_build_client.cc
+++ b/mindspore/ccsrc/backend/session/kernel_build_client.cc
@@ -199,29 +199,5 @@ bool AscendKernelBuildClient::CheckSupported(const std::string &json) {
   }
   return true;
 }
-
-int GpuKernelBuildClient::AkgGetPid() {
-  auto res = SendRequest(kAkgPid);
-  if (res == kErr) {
-    MS_LOG(ERROR) << "AKG/PID failed, res: " << res;
-    return -1;
-  }
-  return std::stoi(res);
-}
-
-bool GpuKernelBuildClient::AkgCompileSingle(const std::string json) {
-  auto res = SendRequest(kAkgCompileOp);
-  if (res != kAck) {
-    MS_LOG(ERROR) << "AKG/COMPILE failed, res: " << res;
-    return false;
-  }
-  // Send single json data.
-  res = SendRequest(json);
-  if (res != kAck) {
-    MS_LOG(ERROR) << "AKG/COMPILE responds failed, res: " << res;
-    return false;
-  }
-  return true;
-}
 }  // namespace kernel
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/session/kernel_build_client.h b/mindspore/ccsrc/backend/session/kernel_build_client.h
index 7dc123f3bb3..be3c1441da4 100644
--- a/mindspore/ccsrc/backend/session/kernel_build_client.h
+++ b/mindspore/ccsrc/backend/session/kernel_build_client.h
@@ -141,7 +141,15 @@ class KernelBuildClient {
   std::shared_ptr<DuplexPipe> dp_;
 };
 
-static std::string GetScriptFilePath(const std::string cmd_env, const std::string &cmd_script) {
+static std::string GetScriptFilePath(const std::string cmd_env, const std::string &cmd_script,
+                                     const std::string &server_script) {
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  auto server_dir = ms_context->get_param<std::string>(MS_CTX_KERNEL_BUILD_SERVER_DIR);
+  if (!server_dir.empty()) {
+    return server_dir + server_script;
+  }
+
   std::string cmd = cmd_env;
   (void)cmd.append(1, ' ').append(cmd_script);
   FILE *fpipe = popen(cmd.c_str(), "r");
@@ -196,6 +204,8 @@ class AscendKernelBuildClient : public KernelBuildClient {
     "print('[~]' + path)"
     "\"";
 
+  constexpr inline static auto kServerScript = "kernel_build_server_ascend.py";
+
   // Receive the response from server
   constexpr inline static auto kFailed = "-1";
 
@@ -221,7 +231,7 @@ class AscendKernelBuildClient : public KernelBuildClient {
 
   std::string GetScript() override {
     auto env = GetPyExe();
-    return GetScriptFilePath(env, kGetPathScript);
+    return GetScriptFilePath(env, kGetPathScript, kServerScript);
   }
 
   // Before building.
@@ -259,9 +269,7 @@ class GpuKernelBuildClient : public KernelBuildClient {
     "print('[~]' + path)"
     "\"";
 
-  // Send building request to server
-  constexpr inline static auto kAkgPid = "AKG/PID";
-  constexpr inline static auto kAkgCompileOp = "AKG/COMPILE";  // Compile a single op
+  constexpr inline static auto kServerScript = "kernel_build_server_gpu.py";
 
   static GpuKernelBuildClient &Instance() {
     static GpuKernelBuildClient instance;
@@ -272,14 +280,9 @@ class GpuKernelBuildClient : public KernelBuildClient {
 
   std::string GetScript() override {
     auto env = GetPyExe();
-    return GetScriptFilePath(env, kGetPathScript);
+    return GetScriptFilePath(env, kGetPathScript, kServerScript);
   }
 
-  // Fetch pid(pid_t) from remote.
-  int AkgGetPid();
-  // Run AKG building.
-  bool AkgCompileSingle(const std::string json);
-
   GpuKernelBuildClient(const GpuKernelBuildClient &) = delete;
   GpuKernelBuildClient &operator=(const GpuKernelBuildClient &) = delete;
 
diff --git a/mindspore/ccsrc/backend/session/kernel_graph.cc b/mindspore/ccsrc/backend/session/kernel_graph.cc
index ee2dcab041f..069f50e80d8 100644
--- a/mindspore/ccsrc/backend/session/kernel_graph.cc
+++ b/mindspore/ccsrc/backend/session/kernel_graph.cc
@@ -581,7 +581,6 @@ ParameterPtr KernelGraph::NewParameter(const abstract::AbstractBasePtr &abstract
 ValueNodePtr KernelGraph::NewValueNode(const ValueNodePtr &value_node) {
   MS_EXCEPTION_IF_NULL(value_node);
   auto new_value_node = MakeValueNode(value_node)->cast<ValueNodePtr>();
-  new_value_node->set_func_graph(shared_from_this()->cast<FuncGraphPtr>());
   AnfAlgo::SetGraphId(graph_id_, new_value_node.get());
   return new_value_node;
 }
@@ -591,7 +590,6 @@ ValueNodePtr KernelGraph::NewValueNode(const AbstractBasePtr &abstract, const Va
   MS_EXCEPTION_IF_NULL(value);
   ValueNodePtr new_value_node = std::make_shared<ValueNode>(value);
   new_value_node->set_abstract(abstract);
-  new_value_node->set_func_graph(shared_from_this()->cast<FuncGraphPtr>());
   SetKernelInfoForNode(new_value_node);
   AnfAlgo::SetGraphId(graph_id(), new_value_node.get());
   return new_value_node;
@@ -696,9 +694,8 @@ AnfNodePtr KernelGraph::TransTupleToMakeTuple(const AnfNodePtr &node) {
   } else if (node->isa<ValueNode>()) {
     auto value_node = node->cast<ValueNodePtr>();
     MS_EXCEPTION_IF_NULL(value_node);
-    auto cur_graph = value_node->func_graph()->cast<KernelGraphPtr>();
-    auto make_tuple = cur_graph->TransValueNodeTuple(value_node->abstract(), value_node->value());
-    if (!cur_graph->RemoveValueNodeFromGraph(value_node)) {
+    auto make_tuple = TransValueNodeTuple(value_node->abstract(), value_node->value());
+    if (!RemoveValueNodeFromGraph(value_node)) {
       MS_LOG(WARNING) << "Failed to remove the value_node " << value_node->DebugString();
     }
     return make_tuple;
@@ -1362,7 +1359,9 @@ void KernelGraph::SetOptimizerFlag() {
         continue;
       }
       auto param = real_node->cast<ParameterPtr>();
-      if (AnfAlgo::IsParameterWeight(param)) {
+      auto abstract = param->abstract();
+      MS_EXCEPTION_IF_NULL(abstract);
+      if (abstract->isa<abstract::AbstractRef>()) {
         has_optimizer_ = true;
         (void)updated_parameters_.insert(param);
       }
@@ -1381,8 +1380,7 @@ KernelGraph::~KernelGraph() {
         kernel_mod->ReleaseResource();
       }
     }
-    device::KernelRuntimeManager::Instance().ClearGraphResource(graph_id_, *inputs_, graph_value_nodes_,
-                                                                execution_order_);
+    device::KernelRuntimeManager::Instance().ClearGraphResource(graph_id_);
   } catch (const std::exception &e) {
     MS_LOG(ERROR) << "KernelGraph call destructor failed: " << e.what();
   } catch (...) {
diff --git a/mindspore/ccsrc/backend/session/session_basic.cc b/mindspore/ccsrc/backend/session/session_basic.cc
index a204f11c6f4..a690cffe180 100644
--- a/mindspore/ccsrc/backend/session/session_basic.cc
+++ b/mindspore/ccsrc/backend/session/session_basic.cc
@@ -446,6 +446,38 @@ void UpdateGraphAquireGilAttr(const NotNull<KernelGraphPtr> &root_graph) {
   }
   return;
 }
+
+bool ExistGraphCaller(const AnfNodePtr &partial_node) {
+  MS_EXCEPTION_IF_NULL(partial_node);
+  auto partial_cnode = partial_node->cast<CNodePtr>();
+  MS_EXCEPTION_IF_NULL(partial_cnode);
+  auto partial_graph = GetValueNode<FuncGraphPtr>(partial_cnode->input(kFirstDataInputIndex));
+  MS_EXCEPTION_IF_NULL(partial_graph);
+  auto graph_nodes = TopoSort(partial_graph->get_return());
+  return std::any_of(graph_nodes.begin(), graph_nodes.end(), IsValueNode<FuncGraph>);
+}
+
+// 1. Convert the node to make_tuple if the node is a ValueNode<ValueTuple> and it's the input of 'return' node.
+// 2. Set the return of graph if node is "Return" node.
+void SetReturnNode(const AnfNodePtr &node, KernelGraph *graph) {
+  MS_EXCEPTION_IF_NULL(graph);
+  MS_EXCEPTION_IF_NULL(node);
+
+  if (AnfAlgo::CheckPrimitiveType(node, prim::kPrimReturn)) {
+    constexpr auto kReturnInputIdx = 1;
+    auto return_node = node->cast<CNodePtr>();
+    graph->set_return(return_node);
+    auto graph_output = return_node->input(kReturnInputIdx);
+    MS_EXCEPTION_IF_NULL(graph_output);
+
+    // If return's input is value node, then the graph has no kernel, and the pass 'trans tuple to make_tuple' cannot
+    // match this pattern because that pass begin with output node but return node. So we add transform value tuple
+    // to make_tuple here.
+    if (AnfAlgo::IsTupleOutput(graph_output) && graph_output->isa<ValueNode>()) {
+      return_node->set_input(kReturnInputIdx, graph->TransTupleToMakeTuple(graph_output));
+    }
+  }
+}
 }  // namespace
 
 GraphId SessionBasic::graph_sum_ = 0;
@@ -1463,9 +1495,7 @@ bool SessionBasic::CreateCNodeOfKernelGraph(const AnfNodePtr &node, KernelGraph
   new_cnode->set_fullname_with_scope(fullname);
   new_cnode->set_scope(cnode->scope());
   graph->FrontBackendlMapAdd(node, new_cnode);
-  if (AnfAlgo::CheckPrimitiveType(new_cnode, prim::kPrimReturn)) {
-    graph->set_return(new_cnode);
-  }
+  SetReturnNode(new_cnode, graph);
   return true;
 }
 
@@ -1958,7 +1988,8 @@ void SessionBasic::HandleInternalOutput(const AnfNodePtr &input_front_node, cons
   if (internal_output) {
     auto users = ExtendNodeUsers(front_func_graph_manager, front_node);
     for (auto &user : users) {
-      if (AnfAlgo::CheckPrimitiveType(user, prim::kPrimPartial) && kernel_target != kGPUDevice) {
+      if (AnfAlgo::CheckPrimitiveType(user, prim::kPrimPartial) && kernel_target != kGPUDevice &&
+          !ExistGraphCaller(user)) {
         auto partial_target = AddPartialParametersMap(user);
         if (partial_target != kNoTarget && partial_target != kernel_target) {
           unique_target = false;
@@ -2652,6 +2683,7 @@ uint32_t GetRankId() {
   uint32_t rank_id = 0;
   auto ms_context = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(ms_context);
+
   std::string world_group;
   std::string backend = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
   if (backend == kAscendDevice) {
@@ -2660,6 +2692,7 @@ uint32_t GetRankId() {
     world_group = kNcclWorldGroup;
   } else {
     MS_LOG(ERROR) << "Invalid backend: " << backend;
+    return rank_id;
   }
   if (!CommManager::GetInstance().GetRankID(world_group, &rank_id)) {
     MS_LOG(INFO) << "Failed to get rank id.";
diff --git a/mindspore/ccsrc/backend/session/session_basic.h b/mindspore/ccsrc/backend/session/session_basic.h
index ef3b137626e..d43c3fd337c 100644
--- a/mindspore/ccsrc/backend/session/session_basic.h
+++ b/mindspore/ccsrc/backend/session/session_basic.h
@@ -217,9 +217,11 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> {
                                 const std::vector<tensor::TensorPtr> &inputs, VectorRef *const outputs) {}
   virtual void ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) {}
   void RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs);
-  virtual void BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
-                           const std::vector<tensor::TensorPtr> &input_tensors,
-                           const std::vector<int64_t> &tensors_mask) {}
+  virtual KernelGraphPtr BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
+                                     const std::vector<tensor::TensorPtr> &input_tensors,
+                                     const std::vector<int64_t> &tensors_mask) {
+    return nullptr;
+  }
   virtual void RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info,
                          std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs,
                          const std::vector<int64_t> &tensors_mask) {}
diff --git a/mindspore/ccsrc/common/duplex_pipe.cc b/mindspore/ccsrc/common/duplex_pipe.cc
index c14e1f720c5..b2040ee040c 100644
--- a/mindspore/ccsrc/common/duplex_pipe.cc
+++ b/mindspore/ccsrc/common/duplex_pipe.cc
@@ -48,6 +48,8 @@ int DuplexPipe::Open(const std::initializer_list<std::string> &arg_list, bool ap
     close(fd2_[1]);
     DP_EXCEPTION << "fork failed, errno: " << errno;
   } else if (pid_ == 0) {  // Remote process
+    DP_INFO << "Remote process, id: " << getpid() << ", " << fd1_[0] << "/" << fd2_[1];
+    DP_INFO << "Execute: arg_list:" << arg_list;
     remote_stdout_ = dup(STDOUT_FILENO);
     remote_stdin_ = dup(STDIN_FILENO);
     close(fd1_[1]);
diff --git a/mindspore/ccsrc/common/trans.cc b/mindspore/ccsrc/common/trans.cc
index 59b95d4a0a2..21858105cc3 100644
--- a/mindspore/ccsrc/common/trans.cc
+++ b/mindspore/ccsrc/common/trans.cc
@@ -620,7 +620,27 @@ std::vector<int64_t> FracZDeviceShapeWithGroups(const std::vector<int64_t> &shap
   return device_shape;
 }
 
-std::vector<int64_t> TransShapeToFracNZ(const std::vector<int64_t> &shape) {
+std::vector<size_t> FracNZDeviceShape(const std::vector<size_t> &shape) {
+  if (shape.size() == 1 && (shape[0] == 1 || shape[0] % kCubeSize == 0)) {
+    // For [1] and [1024] shape we can trait it as NZ shape
+    return shape;
+  }
+  std::vector<size_t> device_shape;
+  if (shape.size() < 2) {
+    MS_LOG(EXCEPTION) << "Format FRACTAL_NZ is not support shape " << shape.size();
+  } else {
+    (void)std::copy(shape.begin(), shape.end() - 2, std::back_inserter(device_shape));
+  }
+  auto h1 = (shape[shape.size() - 2] - 1) / kCubeSize + 1;
+  auto w1 = (shape[shape.size() - 1] - 1) / kCubeSize + 1;
+  device_shape.push_back(w1);
+  device_shape.push_back(h1);
+  device_shape.push_back(kCubeSize);
+  device_shape.push_back(kCubeSize);
+  return device_shape;
+}
+
+std::vector<int64_t> FracNZDeviceDynamicShape(const std::vector<int64_t> &shape) {
   std::vector<int64_t> device_shape;
   if (shape.size() == 1 && (shape[0] == 1 || shape[0] % kCubeSize == 0)) {
     // For [1] and [1024] shape we can trait it as NZ shape
@@ -642,7 +662,21 @@ std::vector<int64_t> TransShapeToFracNZ(const std::vector<int64_t> &shape) {
   return device_shape;
 }
 
-std::vector<int64_t> TransShapeToFracNZLSTM(const std::vector<int64_t> &shape) {
+std::vector<size_t> FracNZLSTMDeviceShape(const std::vector<size_t> &shape) {
+  const size_t c0 = 4;
+  const size_t h = shape.at(kN) / c0;
+  const size_t i = shape.at(kC) - h;
+  const size_t first = DivCeil(i, kCubeSize) + DivCeil(h, kCubeSize);
+  const size_t second = c0 * DivCeil(h, kCubeSize);
+  std::vector<size_t> device_shape;
+  device_shape.push_back(first);
+  device_shape.push_back(second);
+  device_shape.push_back(kCubeSize);
+  device_shape.push_back(kCubeSize);
+  return device_shape;
+}
+
+std::vector<int64_t> FracNZLSTMDeviceDynamicShape(const std::vector<int64_t> &shape) {
   std::vector<int64_t> device_shape;
   const int64_t c0 = 4;
   const int64_t h_shape = shape.at(kN);
@@ -693,8 +727,8 @@ bool IsNeedPadding(const std::string &format, const size_t shape_size) {
   if (shape_size == 0) {
     return false;
   }
-  if (format == kOpFormat_DEFAULT || format == kOpFormat_FRAC_NZ || format == kOpFormat_ChannelLast ||
-      format == kOpFormat_NCHW) {
+  if (format == kOpFormat_DEFAULT || format == kOpFormat_NCHW ||
+      kNoPaddingFormatSet.find(format) != kNoPaddingFormatSet.end()) {
     return false;
   } else if (shape_size < kNchwDims) {
     return true;
@@ -799,7 +833,9 @@ std::vector<size_t> TransShapeToDevice(const std::vector<size_t> &shape, const s
                                                                     {kOpFormat_NCDHW, NcdhwDeviceShape},
                                                                     {kOpFormat_ChannelLast, ChannelLastDeviceShape},
                                                                     {kOpFormat_NDC1HWC0, Ndc1hwc0DeviceShape},
-                                                                    {kOpFormat_FRACTAL_Z_3D, Fracz3DDeviceShape}};
+                                                                    {kOpFormat_FRACTAL_Z_3D, Fracz3DDeviceShape},
+                                                                    {kOpFormat_FRAC_NZ, FracNZDeviceShape},
+                                                                    {kOpFormat_FRACTAL_ZN_LSTM, FracNZLSTMDeviceShape}};
 
   if (format == kOpFormat_ND || format == kOpFormat_DEFAULT) {
     return shape;
@@ -808,37 +844,8 @@ std::vector<size_t> TransShapeToDevice(const std::vector<size_t> &shape, const s
     return FracZDeviceShapeWithGroups(shape, groups);
   }
   auto temp_shape = shape;
-  std::vector<size_t> device_shape;
-  if (format == kOpFormat_FRAC_NZ) {
-    if (shape.size() == 1 && (shape[0] == 1 || shape[0] % kCubeSize == 0)) {
-      // For [1] and [1024] shape we can trait it as NZ shape
-      return shape;
-    }
-    if (shape.size() < 2) {
-      MS_LOG(EXCEPTION) << "Format" << format << " is not support shape " << shape.size();
-    } else {
-      (void)std::copy(shape.begin(), shape.end() - 2, std::back_inserter(device_shape));
-    }
-    auto h1 = (shape[shape.size() - 2] - 1) / kCubeSize + 1;
-    auto w1 = (shape[shape.size() - 1] - 1) / kCubeSize + 1;
-    device_shape.push_back(w1);
-    device_shape.push_back(h1);
-    device_shape.push_back(kCubeSize);
-    device_shape.push_back(kCubeSize);
-    return device_shape;
-  } else if (format == kOpFormat_FRACTAL_ZN_LSTM) {
-    const size_t c0 = 4;
-    const size_t h = shape.at(kN) / c0;
-    const size_t i = shape.at(kC) - h;
-    const size_t first = DivCeil(i, kCubeSize) + DivCeil(h, kCubeSize);
-    const size_t second = c0 * DivCeil(h, kCubeSize);
-    device_shape.push_back(first);
-    device_shape.push_back(second);
-    device_shape.push_back(kCubeSize);
-    device_shape.push_back(kCubeSize);
-    return device_shape;
-  }
-  if (format != kOpFormat_ChannelLast && shape.size() != kNchwDims && k3DFormatSet.find(format) == k3DFormatSet.end()) {
+  if (kNoPaddingFormatSet.find(format) == kNoPaddingFormatSet.end() && format != kOpFormat_FRACTAL_ZN_LSTM &&
+      shape.size() != kNchwDims && k3DFormatSet.find(format) == k3DFormatSet.end()) {
     MS_LOG(WARNING) << "Get Device Shape using a shape size is less than 4 ,should be Padding shape by Default firstly";
     temp_shape = PaddingShapeTo4dDefault(shape);
   }
@@ -867,7 +874,9 @@ std::vector<int64_t> TransShapeToDevice(const std::vector<int64_t> &shape, const
     {kOpFormat_NCDHW, NcdhwDeviceDynamicShape},
     {kOpFormat_ChannelLast, ChannelLastDeviceDynamicShape},
     {kOpFormat_NDC1HWC0, Ndc1hwc0DeviceDynamicShape},
-    {kOpFormat_FRACTAL_Z_3D, Fracz3DDeviceDynamicShape}};
+    {kOpFormat_FRACTAL_Z_3D, Fracz3DDeviceDynamicShape},
+    {kOpFormat_FRAC_NZ, FracNZDeviceDynamicShape},
+    {kOpFormat_FRACTAL_ZN_LSTM, FracNZLSTMDeviceDynamicShape}};
 
   if (format == kOpFormat_ND || format == kOpFormat_DEFAULT || format == kOpFormat_NCHW) {
     return shape;
@@ -876,12 +885,8 @@ std::vector<int64_t> TransShapeToDevice(const std::vector<int64_t> &shape, const
     return FracZDeviceShapeWithGroups(shape, groups);
   }
   auto temp_shape = shape;
-  if (format == kOpFormat_FRAC_NZ) {
-    return TransShapeToFracNZ(shape);
-  } else if (format == kOpFormat_FRACTAL_ZN_LSTM) {
-    return TransShapeToFracNZLSTM(shape);
-  }
-  if (format != kOpFormat_ChannelLast && shape.size() != kNchwDims && k3DFormatSet.find(format) == k3DFormatSet.end()) {
+  if (kNoPaddingFormatSet.find(format) == kNoPaddingFormatSet.end() && format != kOpFormat_FRACTAL_ZN_LSTM &&
+      shape.size() != kNchwDims && k3DFormatSet.find(format) == k3DFormatSet.end()) {
     MS_LOG(WARNING) << "Get Device Shape using a shape size is less than 4 ,should be Padding shape by Default firstly";
     temp_shape = PaddingShapeTo4dDefault(shape);
   }
@@ -1219,6 +1224,7 @@ bool NchwToNc1hwc04(const FormatArgs &args, void *result) {
   MS_LOG(DEBUG) << "Trans format from nchw to Nc1hwc04.";
   return NchwToNc1hwc0(args, result);
 }
+
 bool Nc1hwc04ToNchw(const FormatArgs &args, void *result) {
   MS_LOG(DEBUG) << "Trans format from Nc1hwc04 to nchw.";
   return Nc1hwc0ToNchw(args, result);
diff --git a/mindspore/ccsrc/debug/anf_ir_dump.cc b/mindspore/ccsrc/debug/anf_ir_dump.cc
index 4248dc7f5f4..22b6d1861c8 100644
--- a/mindspore/ccsrc/debug/anf_ir_dump.cc
+++ b/mindspore/ccsrc/debug/anf_ir_dump.cc
@@ -28,7 +28,6 @@
 #include "backend/session/anf_runtime_algorithm.h"
 #include "frontend/parallel/ops_info/operator_info.h"
 #include "pipeline/jit/base.h"
-#include "debug/common.h"
 #include "debug/trace.h"
 #include "utils/trace_base.h"
 
@@ -582,7 +581,7 @@ void DumpIR(const std::string &filename, const FuncGraphPtr &graph, bool dump_fu
   if (graph == nullptr) {
     return;
   }
-  auto path = pipeline::GetSaveGraphsPathName(Common::AddId(filename, ".ir"));
+  auto path = GetSaveGraphsPathName(Common::AddId(filename, ".ir"));
   if (!target_file.empty()) {
     path = target_file;
   }
diff --git a/mindspore/ccsrc/debug/anf_ir_dump.h b/mindspore/ccsrc/debug/anf_ir_dump.h
index 47831b071a1..2b9df92662a 100644
--- a/mindspore/ccsrc/debug/anf_ir_dump.h
+++ b/mindspore/ccsrc/debug/anf_ir_dump.h
@@ -20,6 +20,7 @@
 #include <vector>
 #include "ir/dtype/type.h"
 #include "ir/anf.h"
+#include "debug/common.h"
 
 namespace mindspore {
 enum LocDumpMode { kOff = 0, kTopStack = 1, kWholeStack = 2 };
diff --git a/mindspore/ccsrc/debug/anf_ir_utils.cc b/mindspore/ccsrc/debug/anf_ir_utils.cc
index aba493689ed..c4434583f30 100644
--- a/mindspore/ccsrc/debug/anf_ir_utils.cc
+++ b/mindspore/ccsrc/debug/anf_ir_utils.cc
@@ -632,7 +632,7 @@ void ExportIR(const std::string &filename, const FuncGraphPtr &func_graph) {
     return;
   }
 
-  auto filepath = pipeline::GetSaveGraphsPathName(Common::AddId(filename, ".dat"));
+  auto filepath = GetSaveGraphsPathName(Common::AddId(filename, ".dat"));
   auto real_filepath = Common::GetRealPath(filepath);
   if (!real_filepath.has_value()) {
     MS_LOG(ERROR) << "The export ir path: " << filepath << " is not illegal.";
diff --git a/mindspore/ccsrc/debug/common.cc b/mindspore/ccsrc/debug/common.cc
index 876eb32ecf9..2b0fb1ae0c6 100644
--- a/mindspore/ccsrc/debug/common.cc
+++ b/mindspore/ccsrc/debug/common.cc
@@ -26,6 +26,25 @@
 #include "utils/utils.h"
 
 namespace mindspore {
+std::string Common::CommonFuncForConfigPath(const std::string &default_path, const std::string &env_path) {
+  std::string res_path = default_path;
+  if (!env_path.empty()) {
+    char real_path[PATH_MAX] = {0};
+#if defined(SYSTEM_ENV_WINDOWS)
+    if (_fullpath(real_path, common::SafeCStr(env_path), PATH_MAX) == nullptr) {
+      MS_LOG(EXCEPTION) << "The dir " << env_path << " does not exist.";
+    }
+    return real_path;
+#else
+    if (realpath(env_path.c_str(), real_path)) {
+      return real_path;
+    }
+    MS_LOG(EXCEPTION) << "Invalid env path, path : " << env_path;
+#endif
+  }
+  return res_path;
+}
+
 std::optional<std::string> Common::GetRealPath(const std::string &input_path) {
   if (input_path.length() >= PATH_MAX) {
     MS_LOG(ERROR) << "The length of path: " << input_path << " exceeds limit: " << PATH_MAX;
@@ -303,7 +322,19 @@ struct GlogLogDirRegister {
     if (logtostderr != nullptr && log_dir != nullptr) {
       std::string logtostderr_str = std::string(logtostderr);
       std::string log_dir_str = std::string(log_dir);
-
+      const char *rank_id = std::getenv("RANK_ID");
+      const char *gpu_rank_id = std::getenv("OMPI_COMM_WORLD_RANK");
+      std::string rank = "0";
+      bool both_exist = false;
+      if (rank_id != nullptr && gpu_rank_id == nullptr) {
+        rank = std::string(rank_id);
+      } else if (rank_id == nullptr && gpu_rank_id != nullptr) {
+        rank = std::string(gpu_rank_id);
+      } else if (rank_id != nullptr && gpu_rank_id != nullptr) {
+        rank = std::string(rank_id);
+        both_exist = true;
+      }
+      log_dir_str += "/rank_" + rank + "/logs";
       auto real_log_dir_str = Common::GetRealPath(log_dir_str);
       // While 'GLOG_logtostderr' = 0, logs output to files. 'GLOG_log_dir' must be specified as the path of log files.
       // Here can not throw exception and use python to catch, because the PYBIND11_MODULE is not yet been initialed.
@@ -319,6 +350,10 @@ struct GlogLogDirRegister {
         MS_LOG(ERROR) << "The path of log files, which set by 'GLOG_log_dir', is invalid.";
         exit(EXIT_FAILURE);
       }
+      if (both_exist) {
+        MS_LOG(WARNING) << "Environment variables RANK_ID and OMPI_COMM_WORLD_RANK both exist, we will use RANK_ID to "
+                           "get rank id by default.";
+      }
     }
   }
 } _glog_log_dir_register;
diff --git a/mindspore/ccsrc/debug/common.h b/mindspore/ccsrc/debug/common.h
index eff9c2efe62..07b231d554c 100644
--- a/mindspore/ccsrc/debug/common.h
+++ b/mindspore/ccsrc/debug/common.h
@@ -20,6 +20,8 @@
 #include <string>
 #include <optional>
 #include "utils/contract.h"
+#include "utils/ms_context.h"
+#include "utils/comm_manager.h"
 
 namespace mindspore {
 static const int MAX_DIRECTORY_LENGTH = 1024;
@@ -39,9 +41,25 @@ class Common {
   static std::string AddId(const std::string &filename, const std::string &suffix);
   static bool SaveStringToFile(const std::string filename, const std::string string_info);
   static bool FileExists(const std::string &filepath);
+  static std::string CommonFuncForConfigPath(const std::string &default_path, const std::string &env_path);
 
  private:
   static bool IsEveryFilenameValid(const std::string &path, size_t length_limit, const std::string &error_message);
 };
+
+inline std::string GetSaveGraphsPathName(const std::string &file_name, const std::string &save_path = "") {
+  std::string save_graphs_path;
+  if (save_path.empty()) {
+    auto ms_context = MsContext::GetInstance();
+    MS_EXCEPTION_IF_NULL(ms_context);
+    save_graphs_path = ms_context->get_param<std::string>(MS_CTX_SAVE_GRAPHS_PATH);
+    if (save_graphs_path.empty()) {
+      save_graphs_path = ".";
+    }
+  } else {
+    save_graphs_path = save_path;
+  }
+  return save_graphs_path + "/rank_" + std::to_string(GetRank()) + "/ir_dump/" + file_name;
+}
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_DEBUG_COMMON_H_
diff --git a/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc b/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc
index 59f3864dc4c..36f0aff3c01 100644
--- a/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc
+++ b/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc
@@ -139,6 +139,9 @@ void DumpJsonParser::CopyJsonToDir(uint32_t rank_id) {
       const std::string file_path = realpath.value();
       ChangeFileMode(file_path, S_IWUSR);
       std::ofstream json_copy(file_path);
+      if (!json_copy.is_open()) {
+        MS_LOG(EXCEPTION) << "Json file " << file_path << "open failed!";
+      }
       json_copy << json_file.rdbuf();
       json_copy.close();
       ChangeFileMode(file_path, S_IRUSR);
@@ -166,6 +169,9 @@ void DumpJsonParser::CopyHcclJsonToDir(uint32_t rank_id) {
     const std::string file_path = realpath.value();
     ChangeFileMode(file_path, S_IWUSR);
     std::ofstream json_copy(file_path);
+    if (!json_copy.is_open()) {
+      MS_LOG(EXCEPTION) << "Json file " << file_path << "open failed!";
+    }
     json_copy << json_file.rdbuf();
     json_copy.close();
     ChangeFileMode(file_path, S_IRUSR);
@@ -188,6 +194,9 @@ void DumpJsonParser::CopyMSCfgJsonToDir(uint32_t rank_id) {
     const std::string file_path = realpath.value();
     ChangeFileMode(file_path, S_IWUSR);
     std::ofstream json_create(file_path);
+    if (!json_create.is_open()) {
+      MS_LOG(EXCEPTION) << "Json file " << file_path << "open failed!";
+    }
     json_create << ms_info;
     json_create.close();
     ChangeFileMode(file_path, S_IRUSR);
@@ -212,14 +221,17 @@ bool DumpJsonParser::DumpToFile(const std::string &filename, const void *data, s
   ChangeFileMode(file_path, S_IWUSR);
   std::ofstream fd(file_path, std::ios::out | std::ios::trunc | std::ios::binary);
   if (!fd.is_open()) {
-    MS_LOG(ERROR) << "Open file " << file_path << " failed."
-                  << " Errno:" << errno << " ErrInfo:" << strerror(errno);
-    return false;
+    MS_LOG(EXCEPTION) << "Open file " << file_path << " failed."
+                      << " Errno:" << errno << " ErrInfo:" << strerror(errno);
   }
   std::string npy_header = GenerateNpyHeader(shape, type);
   if (!npy_header.empty()) {
     fd << npy_header;
     (void)fd.write(reinterpret_cast<const char *>(data), SizeToLong(len));
+    if (fd.bad()) {
+      fd.close();
+      MS_LOG(EXCEPTION) << "Write mem to file " << file_path << " failed.";
+    }
     fd.close();
     ChangeFileMode(file_path, S_IRUSR);
   }
@@ -358,7 +370,7 @@ void DumpJsonParser::ParseIteration(const nlohmann::json &content) {
       MS_LOG(EXCEPTION) << "iteration only supports digits, {'-', '|'}, or just \"all\" but got: " << iteration_;
     }
   } else if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kCPUDevice) {
-    MS_LOG(WARNING) << "Dump not enabled. ";
+    MS_LOG(WARNING) << "Dump is not enabled. ";
   } else {
     MS_LOG(EXCEPTION) << "Dump Json Parse Failed. Async or E2E should be enabled. ";
   }
@@ -372,11 +384,11 @@ bool DumpJsonParser::IsDumpIter(uint32_t iteration) const {
   int start = 0;
   int end = iteration_.find("|");
   while (end != -1) {
-    std::string temp = iteration_.substr(start, end - start);
+    std::string temp = iteration_.substr(IntToSize(start), IntToSize(end - start));
     int range_idx = temp.find("-");
     if (range_idx != -1) {
-      uint32_t low_range = std::stoul(temp.substr(0, range_idx));
-      uint32_t high_range = std::stoul(temp.substr((range_idx + 1), -1));
+      uint32_t low_range = std::stoul(temp.substr(0, IntToSize(range_idx)));
+      uint32_t high_range = std::stoul(temp.substr(IntToSize(range_idx + 1), -1));
       if ((low_range <= iteration) && (iteration <= high_range)) {
         return true;
       }
@@ -386,10 +398,10 @@ bool DumpJsonParser::IsDumpIter(uint32_t iteration) const {
     start = end + 1;
     end = iteration_.find("|", start);
   }
-  std::string temp = iteration_.substr(start, end - start);
+  std::string temp = iteration_.substr(IntToSize(start), IntToSize(end - start));
   int range_idx = temp.find("-");
   if (range_idx != -1) {
-    uint32_t low_range = std::stoul(temp.substr(0, range_idx));
+    uint32_t low_range = std::stoul(temp.substr(0, IntToSize(range_idx)));
     uint32_t high_range = std::stoul(temp.substr((range_idx + 1), -1));
     if ((low_range <= iteration) && (iteration <= high_range)) {
       return true;
@@ -472,9 +484,9 @@ void DumpJsonParser::JsonConfigToString() {
   cur_config.append(" input_output:");
   cur_config.append(std::to_string(input_output_));
   cur_config.append("e2e_enable:");
-  cur_config.append(std::to_string(e2e_dump_enabled_));
+  cur_config.append(std::to_string(static_cast<int>(e2e_dump_enabled_)));
   cur_config.append(" async_dump_enable:");
-  cur_config.append(std::to_string(async_dump_enabled_));
+  cur_config.append(std::to_string(static_cast<int>(async_dump_enabled_)));
   MS_LOG(INFO) << cur_config;
 }
 
@@ -493,14 +505,14 @@ void DumpJsonParser::JudgeDumpEnabled() {
   }
 
   if (!async_dump_enabled_ && !e2e_dump_enabled_) {
-    MS_LOG(WARNING) << "Dump json parse failed. Dump not enabled";
+    MS_LOG(WARNING) << "Dump json parse failed. Dump is not enabled";
   }
   if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) != kCPUDevice) {
     auto device_id = context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
     if (support_devices_.find(device_id) == support_devices_.end()) {
       async_dump_enabled_ = false;
       e2e_dump_enabled_ = false;
-      MS_LOG(WARNING) << "Dump not enabled. device_id:" << device_id << " not support";
+      MS_LOG(WARNING) << "Dump is not enabled. device_id:" << device_id << " not support";
     }
   }
   JsonConfigToString();
@@ -541,9 +553,10 @@ std::string DumpJsonParser::GetOpOverflowBinPath(uint32_t graph_id) const {
   bin_path.append("rank_");
 
   uint32_t rank_id = 0;
-  auto env_table_file = common::GetEnv("RANK_TABLE_FILE");
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
   auto env_rank_id = common::GetEnv("RANK_ID");
-  if (!(env_table_file.empty() || env_rank_id.empty())) {
+  if (ms_context->get_param<bool>(MS_CTX_ENABLE_HCCL) && !env_rank_id.empty()) {
     // get actual rank id if it's distribution training case.
     if (!CommManager::GetInstance().GetRankID(kHcclWorldGroup, &rank_id)) {
       MS_LOG(INFO) << "Failed to get rank id.";
diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc
index 4aa7efecc1e..013c486b0e3 100644
--- a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc
+++ b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc
@@ -309,7 +309,7 @@ void E2eDump::DumpSetup(const session::KernelGraph *graph, uint32_t rank_id) {
   }
 }
 
-bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger) {
+void E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger) {
   MS_EXCEPTION_IF_NULL(graph);
   bool success = false;
   auto &dump_json_parser = DumpJsonParser::GetInstance();
@@ -379,7 +379,11 @@ bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, cons
     success = true;
   }
 
-  return success;
+  if (success) {
+    MS_LOG(DEBUG) << "Dump Data completed!";
+  } else {
+    MS_LOG(DEBUG) << "Dump has not occurred!";
+  }
 }
 
 bool E2eDump::DumpSingleNodeData(const CNodePtr &node, uint32_t graph_id, uint32_t rank_id, const Debugger *debugger) {
diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump.h b/mindspore/ccsrc/debug/data_dump/e2e_dump.h
index 1a906597f75..bacdde92509 100644
--- a/mindspore/ccsrc/debug/data_dump/e2e_dump.h
+++ b/mindspore/ccsrc/debug/data_dump/e2e_dump.h
@@ -36,7 +36,7 @@ class E2eDump {
   E2eDump() = default;
   ~E2eDump() = default;
   static void DumpSetup(const session::KernelGraph *graph, uint32_t rank_id);
-  static bool DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger = nullptr);
+  static void DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger = nullptr);
 
   static bool DumpParametersAndConstData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger);
 
diff --git a/mindspore/ccsrc/debug/debug_services.cc b/mindspore/ccsrc/debug/debug_services.cc
index 1f543f86376..3b75437b8b4 100644
--- a/mindspore/ccsrc/debug/debug_services.cc
+++ b/mindspore/ccsrc/debug/debug_services.cc
@@ -39,18 +39,18 @@ namespace mindspore {
 DebugServices::DebugServices() { tensor_loader_ = std::make_shared<TensorLoader>(); }
 
 DebugServices::DebugServices(const DebugServices &other) {
-  wp_id_cache = other.wp_id_cache;
-  net_name = other.net_name;
-  dump_dir = other.dump_dir;
-  is_sync_mode = other.is_sync_mode;
+  wp_id_cache_ = other.wp_id_cache_;
+  net_name_ = other.net_name_;
+  dump_dir_ = other.dump_dir_;
+  is_sync_mode_ = other.is_sync_mode_;
   tensor_loader_ = other.tensor_loader_;
-  watchpoint_table = other.watchpoint_table;
+  watchpoint_table_ = other.watchpoint_table_;
 }
 
 DebugServices &DebugServices::operator=(const DebugServices &other) {
   if (this != &other) {
     tensor_loader_ = other.tensor_loader_;
-    watchpoint_table = other.watchpoint_table;
+    watchpoint_table_ = other.watchpoint_table_;
   }
   return *this;
 }
@@ -74,12 +74,12 @@ void DebugServices::AddWatchpoint(
     watchpoint_item.check_node_graph_list = *check_node_graph_list;
   }
   watchpoint_item.parameter_list = parameter_list;
-  watchpoint_table[id] = watchpoint_item;
+  watchpoint_table_[id] = watchpoint_item;
 }
 
 void DebugServices::RemoveWatchpoint(unsigned int id) {
   std::lock_guard<std::mutex> lg(lock_);
-  watchpoint_table.erase(id);
+  watchpoint_table_.erase(id);
 }
 
 std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData> &tensor,
@@ -138,7 +138,7 @@ void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bo
   if (previous_iter_tensor_needed && tensor->GetIteration() >= 1) {
     // read data in offline mode
     std::vector<std::string> file_paths;
-    if (!is_sync_mode) {
+    if (!is_sync_mode_) {
       ConvertReadTensors(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                          std::vector<unsigned int>{tensor->GetDeviceId()},
                          std::vector<unsigned int>{tensor->GetIteration() - 1},
@@ -165,7 +165,7 @@ void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end,
                                           const std::string &tensor_name, const std::string &tensor_name_no_slot,
                                           bool *previous_iter_tensor_needed, std::string *const qualified_tensor_name,
                                           std::vector<watchpoint_t> *const watchpoints_to_check) {
-  for (auto w_table_item : watchpoint_table) {
+  for (auto w_table_item : watchpoint_table_) {
     auto wp = std::get<1>(w_table_item);
     // check ONLY init conditions on initial suspended state.
     // skip other conditions on initial suspended state
@@ -178,7 +178,7 @@ void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end,
     // if not a recheck, check only unanalyzed tensors
     if (!recheck) {
       wp_lock_.lock();
-      bool wp_cache_hit = wp_id_cache[tensor_name].count(wp.id);
+      bool wp_cache_hit = wp_id_cache_[tensor_name].count(wp.id);
       wp_lock_.unlock();
       if (wp_cache_hit) continue;
     }
@@ -200,7 +200,7 @@ void DebugServices::AddAnalyzedTensorToCache(const bool recheck, const unsigned
   // add analyzed tensor to cache
   if (!recheck) {
     wp_lock_.lock();
-    wp_id_cache[tensor_name].insert(id);
+    wp_id_cache_[tensor_name].insert(id);
     wp_lock_.unlock();
   }
 }
@@ -309,7 +309,7 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *const name, std::
                                      std::vector<unsigned int> *root_graph_id) {
   std::lock_guard<std::mutex> lg(lock_);
   auto t1 = std::chrono::high_resolution_clock::now();
-  if (watchpoint_table.empty()) return;
+  if (watchpoint_table_.empty()) return;
   // vector to store execution order of tensors hit
   std::vector<int> exec_order;
   int tensor_list_size = tensor_list->size();
@@ -505,7 +505,7 @@ void DebugServices::ConvertToHostFormat(const std::map<std::string, std::vector<
           }
         }
       }
-      closedir(d_handle);
+      (void)closedir(d_handle);
     }
   }
 }
@@ -556,7 +556,7 @@ void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, st
     std::string prefix_dump_file_name = dump_style_kernel_name;
     GetNodeNameWithoutScope(&prefix_dump_file_name);
 
-    std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id[i]) + "/" + net_name + "/" +
+    std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
                                     std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
 
     // search files in dir for the one that meets the filename prefix and read the file into memory
@@ -586,7 +586,7 @@ void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, st
           }
         }
       }
-      closedir(d);
+      (void)closedir(d);
     }
   }
   ConvertToHostFormat(dir_to_files_map, result_list);
@@ -627,7 +627,7 @@ void DebugServices::ConvertWatchPointNodes(const std::vector<std::tuple<std::str
           }
         }
       }
-      closedir(d);
+      (void)closedir(d);
     }
   }
   ConvertToHostFormat(dir_to_files_map, result_list);
@@ -753,7 +753,7 @@ void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std:
     std::string prefix_dump_to_check = dump_style_kernel_name;
     GetNodeNameWithoutScope(&prefix_dump_to_check);
 
-    std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id[i]) + "/" + net_name + "/" +
+    std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
                                     std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
 
     // search files in dir for the one that meets the filename prefix and read the file into memory
@@ -761,7 +761,7 @@ void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std:
     std::string type_name = "";
     std::vector<int64_t> shape;
     uint64_t data_size = 0;
-    if (is_sync_mode) {
+    if (is_sync_mode_) {
       std::string abspath = RealPath(specific_dump_dir);
       DIR *d = opendir(abspath.c_str());
       bool found_file = false;
@@ -786,8 +786,8 @@ void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std:
             matched_paths.push_back(full_path);
             found_file = true;
           }
-          closedir(d);
         }
+        (void)closedir(d);
       }
 
       if (found_file) {
@@ -857,7 +857,7 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
   // get a list of nodes and the devices they are on to monitor
   std::vector<std::shared_ptr<TensorData>> tensor_list;
   std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> device_and_graph_to_nodes;
-  for (auto w_table_item : watchpoint_table) {
+  for (auto w_table_item : watchpoint_table_) {
     auto wp = std::get<1>(w_table_item);
     unsigned int index = 0;
     for (auto check_node : wp.check_node_list) {
@@ -883,7 +883,7 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
     std::vector<std::tuple<std::string, bool>> wp_nodes = device_and_graph_item.second;
     std::vector<std::tuple<std::string, std::string>> proto_to_dump;
 
-    std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id) + "/" + net_name + "/" +
+    std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
                                     std::to_string(root_graph_id) + "/" + IterationString(iteration);
 
     // convert node names to dump style
@@ -903,11 +903,11 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
       proto_to_dump.push_back(std::tuple<std::string, std::string>(orig_name, dump_style_name));
     }
 
-    if (!is_sync_mode) {
+    if (!is_sync_mode_) {
       // convert all files in proto_to_dump to npy and add to pool of async file names
       ConvertWatchPointNodes(proto_to_dump, specific_dump_dir, async_file_pool);
     }
-    if (is_sync_mode) {
+    if (is_sync_mode_) {
       // search files in dir for the one that meets the filename prefix and read the file into memory
       std::string abspath = RealPath(specific_dump_dir);
       DIR *d = opendir(abspath.c_str());
@@ -940,7 +940,7 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
             }
           }
         }
-        closedir(d);
+        (void)closedir(d);
       }
     } else {
       GetTensorDataInfoAsync(proto_to_dump, specific_dump_dir, iteration, device_id, root_graph_id, *async_file_pool,
@@ -985,7 +985,7 @@ void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::
 #ifdef ONLINE_DBG_MODE
 bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const {
   bool ret = false;
-  for (auto w_table_item : watchpoint_table) {
+  for (auto w_table_item : watchpoint_table_) {
     auto check_node_list = std::get<1>(w_table_item).check_node_list;
     for (auto check_node : check_node_list) {
       std::string w_name = std::get<0>(check_node);
@@ -1049,17 +1049,17 @@ bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, boo
 }
 
 std::unordered_map<unsigned int, DebugServices::watchpoint_t> DebugServices::GetWatchpointTable() {
-  return watchpoint_table;
+  return watchpoint_table_;
 }
 
 void DebugServices::ResetLoadedTensors() {
-  wp_id_cache.clear();
+  wp_id_cache_.clear();
   MS_LOG(INFO) << "Resetting loaded tensors";
   tensor_loader_->MoveParametersCurrentToPrev();
   tensor_loader_->EmptyCurrentTensor();
   // will move parameters from previous to current map
   tensor_loader_->SwapCurrentPrev();
-  overflow_ops.clear();
+  overflow_ops_.clear();
 }
 
 #ifdef ONLINE_DBG_MODE
@@ -1093,7 +1093,7 @@ bool DebugServices::CheckOpOverflow(std::string node_name_to_find, unsigned int
   }
   overflow_bin_path = realpath.value();
 #else
-  overflow_bin_path = dump_dir + "/rank_" + std::to_string(device_id) + "/" + net_name + "/" +
+  overflow_bin_path = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
                       std::to_string(root_graph_id) + "/" + IterationString(iteration) + "/";
   overflow_bin_path = RealPath(overflow_bin_path);
 #endif
@@ -1101,10 +1101,10 @@ bool DebugServices::CheckOpOverflow(std::string node_name_to_find, unsigned int
   overflow_wp_lock_.lock();
 
   MS_LOG(INFO) << "Searching for overflow in node " << node_name_to_find;
-  auto found_overflows = overflow_ops.find(overflow_bin_path);
-  if (found_overflows != overflow_ops.end()) {
+  auto found_overflows = overflow_ops_.find(overflow_bin_path);
+  if (found_overflows != overflow_ops_.end()) {
     MS_LOG(INFO) << "Found already computed overflows for " << overflow_bin_path;
-    op_names = overflow_ops[overflow_bin_path];
+    op_names = overflow_ops_[overflow_bin_path];
   } else {
     std::map<std::pair<uint64_t, uint64_t>, std::string> task_stream_to_opname;
     std::vector<std::pair<uint64_t, uint64_t>> task_stream_hit;
@@ -1169,7 +1169,7 @@ bool DebugServices::CheckOpOverflow(std::string node_name_to_find, unsigned int
           infile.close();
         }
       }
-      closedir(d);
+      (void)closedir(d);
     }
 
     // find the op_names with an overflow hit
@@ -1181,7 +1181,7 @@ bool DebugServices::CheckOpOverflow(std::string node_name_to_find, unsigned int
       }
     }
 
-    overflow_ops[overflow_bin_path] = op_names;
+    overflow_ops_[overflow_bin_path] = op_names;
   }
 
   overflow_wp_lock_.unlock();
@@ -1303,17 +1303,17 @@ void DebugServices::MoveTensorCurrentToPrev(const std::string &tensor_name) {
   tensor_loader_->MoveTensorCurrentToPrev(tensor_name);
 }
 
-void DebugServices::SetNetName(std::string net_name) { this->net_name = net_name; }
+void DebugServices::SetNetName(std::string net_name) { this->net_name_ = net_name; }
 
-std::string DebugServices::GetNetName() { return net_name; }
+std::string DebugServices::GetNetName() { return net_name_; }
 
-void DebugServices::SetDumpDir(std::string dump_dir) { this->dump_dir = dump_dir; }
+void DebugServices::SetDumpDir(std::string dump_dir) { this->dump_dir_ = dump_dir; }
 
-std::string DebugServices::GetDumpDir() { return dump_dir; }
+std::string DebugServices::GetDumpDir() { return dump_dir_; }
 
-void DebugServices::SetSyncMode(bool is_sync_mode) { this->is_sync_mode = is_sync_mode; }
+void DebugServices::SetSyncMode(bool is_sync_mode) { this->is_sync_mode_ = is_sync_mode; }
 
-bool DebugServices::GetSyncMode() { return is_sync_mode; }
+bool DebugServices::GetSyncMode() { return is_sync_mode_; }
 
 #ifdef ONLINE_DBG_MODE
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/debug/debug_services.h b/mindspore/ccsrc/debug/debug_services.h
index d814b029589..9866475688e 100644
--- a/mindspore/ccsrc/debug/debug_services.h
+++ b/mindspore/ccsrc/debug/debug_services.h
@@ -332,13 +332,13 @@ class DebugServices {
   std::mutex overflow_wp_lock_;
 
   // to keep track of watchpoints that have been checked already for a tensor in current step
-  std::unordered_map<std::string, std::set<int32_t>> wp_id_cache;
-  std::unordered_map<unsigned int, watchpoint_t> watchpoint_table;
+  std::unordered_map<std::string, std::set<int32_t>> wp_id_cache_;
+  std::unordered_map<unsigned int, watchpoint_t> watchpoint_table_;
   // key is the iteration path, value is vector of op_names which have overflowed
-  std::unordered_map<std::string, std::vector<std::string>> overflow_ops;
-  std::string net_name;
-  std::string dump_dir;
-  bool is_sync_mode;
+  std::unordered_map<std::string, std::vector<std::string>> overflow_ops_;
+  std::string net_name_;
+  std::string dump_dir_;
+  bool is_sync_mode_;
 
   std::shared_ptr<TensorLoader> tensor_loader_;
 };
diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc
index bddc3c5a2ce..ced8b09f489 100644
--- a/mindspore/ccsrc/debug/debugger/debugger.cc
+++ b/mindspore/ccsrc/debug/debugger/debugger.cc
@@ -79,6 +79,7 @@ Debugger::Debugger()
       is_dataset_graph_(false),
       partial_memory_(false),
       initial_suspend_(true),
+      enable_heartbeat_(false),
       not_dataset_graph_sum_(0),
       version_("") {
   CheckDebuggerEnabledParam();
@@ -131,6 +132,7 @@ void Debugger::EnableDebugger() {
   // reset some of the class members
   num_step_ = 0;
   debugger_enabled_ = false;
+  enable_heartbeat_ = false;
   partial_memory_ = false;
   grpc_client_ = nullptr;
   debug_services_ = nullptr;
@@ -188,7 +190,7 @@ void Debugger::EnableDebugger() {
     // initialize grpc client
     grpc_client_ = std::make_unique<GrpcClient>(host, port);
     // initialize sending heartbeat
-    heartbeat_thread_ = std::make_unique<std::thread>([&]() { SendHeartbeat(heartbeat_period_second); });
+    heartbeat_thread_ = std::make_unique<std::thread>([=]() { SendHeartbeat(heartbeat_period_second); });
   }
   debug_services_ = std::make_unique<DebugServices>();
 }
@@ -582,17 +584,16 @@ GraphProto Debugger::GetGraphProto(const KernelGraphPtr &graph_ptr) const {
 }
 
 void Debugger::SendHeartbeat(int32_t period) {
-  bool heartbeat_enabled_ = true;
   int num_heartbeat_fail = 0;
   const int max_num_heartbeat_fail = 5;
-  const int retry_period = 500;
+  const int retry_milliseconds = 500;
 
   Heartbeat heartbeat;
   heartbeat.set_message("Debugger is alive");
   heartbeat.set_period(heartbeat_period_second);
 
-  bool run_ = CheckDebuggerEnabled() && heartbeat_enabled_;
-  while (run_) {
+  SetEnableHeartbeat(CheckDebuggerEnabled());
+  while (enable_heartbeat_) {
     EventReply reply = grpc_client_->SendHeartbeat(heartbeat);
 
     if (reply.status() != reply.OK) {
@@ -600,11 +601,11 @@ void Debugger::SendHeartbeat(int32_t period) {
       num_heartbeat_fail++;
       if (num_heartbeat_fail >= max_num_heartbeat_fail) {
         MS_LOG(ERROR) << "Maximum number of failure for SendHeartbeat reached : exiting training session.";
-        Exit();
-        run_ = false;
+        SetEnableHeartbeat(false);
+        break;
       } else {
         MS_LOG(ERROR) << "Number of consecutive SendHeartbeat fail:" << num_heartbeat_fail;
-        std::this_thread::sleep_for(std::chrono::milliseconds(retry_period));
+        std::this_thread::sleep_for(std::chrono::milliseconds(retry_milliseconds));
       }
     } else {
       std::this_thread::sleep_for(std::chrono::milliseconds(period * 1000));
@@ -943,9 +944,15 @@ std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &ten
   }
   return tensor_list;
 }
+
 void Debugger::Exit() {
   // clear resource before exit
-  // debugger will notify main thread to exit because main thread can only exit at step boundary
+  // debugger will notify main thread to exit because main thread can only exit at step boundary.
+  SetEnableHeartbeat(false);
+  if (heartbeat_thread_ && heartbeat_thread_->joinable()) {
+    heartbeat_thread_->join();
+    MS_LOG(INFO) << "Join Heartbeat thread.";
+  }
   pipeline::ExecutorPy::DebugTerminate(true);
 }
 
@@ -1136,6 +1143,8 @@ bool GetMiVersionMatched(const EventReply &reply) { return reply.version_matched
 
 bool Debugger::partial_memory() const { return partial_memory_; }
 
+void Debugger::SetEnableHeartbeat(bool enabled) { enable_heartbeat_ = enabled; }
+
 void Debugger::SetCurNode(const std::string &cur_name) {
   // access lock for public method
   std::lock_guard<std::mutex> a_lock(access_lock_);
@@ -1231,13 +1240,13 @@ void Debugger::LoadParametersAndConst() {
   if (!(debugger_enabled_ || CheckDebuggerDumpEnabled())) return;
   MS_EXCEPTION_IF_NULL(graph_ptr_);
   // load parameters
-  MS_LOG(INFO) << "Start to load Parameters!";
+  MS_LOG(INFO) << "Start to load Parameters for graph " << graph_ptr_->graph_id();
   const auto &parameters = graph_ptr_->inputs();
   for (auto &item : parameters) {
     LoadSingleAnfnode(item, PARAMETER_OUTPUT_INDEX);
   }
   // load value nodes
-  // get all constant avlues from the graph
+  // get all constant values from the graph
   MS_LOG(INFO) << "Start to load value nodes!";
   const auto value_nodes = graph_ptr_->graph_value_nodes();
   for (auto &item : value_nodes) {
@@ -1255,7 +1264,7 @@ void Debugger::LoadParametersAndConst(const KernelGraphPtr &graph) {
     LoadSingleAnfnode(item, PARAMETER_OUTPUT_INDEX);
   }
   // load value nodes
-  // get all constant avlues from the graph
+  // get all constant values from the graph
   MS_LOG(INFO) << "Start to load value nodes for graph " << graph->graph_id();
   const auto value_nodes = graph_ptr_->graph_value_nodes();
   for (auto &item : value_nodes) {
diff --git a/mindspore/ccsrc/debug/debugger/debugger.h b/mindspore/ccsrc/debug/debugger/debugger.h
index 9446f96b61d..07f6121fb7c 100644
--- a/mindspore/ccsrc/debug/debugger/debugger.h
+++ b/mindspore/ccsrc/debug/debugger/debugger.h
@@ -124,6 +124,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
 
   bool partial_memory() const;
 
+  void SetEnableHeartbeat(bool enabled);
+
   void SetCurNode(const std::string &cur_name);
 
   std::string run_level() const;
@@ -263,6 +265,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
   std::mutex access_lock_;
   // flag to keep track of the very first suspension of debugger
   bool initial_suspend_;
+  bool enable_heartbeat_;
 
   std::list<GraphProto> graph_proto_list_;
   std::list<KernelGraphPtr> graph_ptr_list_;
diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc b/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc
index eec6addc0fd..4a0075b341a 100644
--- a/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc
+++ b/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc
@@ -24,26 +24,26 @@ DbgServices::DbgServices(bool verbose) {
   if (dbg_log_path != NULL) {
     DbgLogger::verbose = true;
   }
-  debug_services = new DebugServices();
+  debug_services_ = new DebugServices();
 }
 
 DbgServices::DbgServices(const DbgServices &other) {
   MS_LOG(INFO) << "cpp DbgServices object is created via copy";
-  debug_services = new DebugServices(*other.debug_services);
+  debug_services_ = new DebugServices(*other.debug_services_);
 }
 
 DbgServices &DbgServices::operator=(const DbgServices &other) {
   MS_LOG(INFO) << "cpp DbgServices object is being assigned a different state";
   if (this != &other) {
-    delete debug_services;
-    debug_services = new DebugServices(*other.debug_services);
+    delete debug_services_;
+    debug_services_ = new DebugServices(*other.debug_services_);
   }
   return *this;
 }
 
 DbgServices::~DbgServices() {
   MS_LOG(INFO) << "cpp DbgServices object is deleted";
-  delete debug_services;
+  delete debug_services_;
 }
 
 std::string DbgServices::GetVersion() {
@@ -55,13 +55,13 @@ int32_t DbgServices::Initialize(std::string net_name, std::string dump_folder_pa
   MS_LOG(INFO) << "cpp DbgServices initialize network name " << net_name;
   MS_LOG(INFO) << "cpp DbgServices initialize dump folder path " << dump_folder_path;
   MS_LOG(INFO) << "cpp DbgServices initialize sync mode " << is_sync_mode;
-  if (debug_services == nullptr) {
+  if (debug_services_ == nullptr) {
     MS_LOG(EXCEPTION) << "Debugger services initialize failed as occur null pointer error,"
                       << "may be due to memory allocation failure, check as: top";
   }
-  debug_services->SetNetName(net_name);
-  debug_services->SetDumpDir(dump_folder_path);
-  debug_services->SetSyncMode(is_sync_mode);
+  debug_services_->SetNetName(net_name);
+  debug_services_->SetDumpDir(dump_folder_path);
+  debug_services_->SetSyncMode(is_sync_mode);
   return 0;
 }
 
@@ -149,15 +149,15 @@ int32_t DbgServices::AddWatchpoint(
       return DebugServices::parameter_t{parameter.name, parameter.disabled, parameter.value, parameter.hit};
     });
 
-  debug_services->AddWatchpoint(id, watch_condition, 0, check_node_list, parameter_list_backend,
-                                &check_node_device_list, &check_node_graph_list);
+  debug_services_->AddWatchpoint(id, watch_condition, 0, check_node_list, parameter_list_backend,
+                                 &check_node_device_list, &check_node_graph_list);
   MS_LOG(INFO) << "cpp end";
   return 0;
 }
 
 int32_t DbgServices::RemoveWatchpoint(unsigned int id) {
   MS_LOG(INFO) << "cpp DbgServices RemoveWatchpoint id " << id;
-  debug_services->RemoveWatchpoint(id);
+  debug_services_->RemoveWatchpoint(id);
   return 0;
 }
 
@@ -178,10 +178,10 @@ std::vector<watchpoint_hit_t> DbgServices::CheckWatchpoints(unsigned int iterati
 
   const bool init_dbg_suspend = (iteration == UINT_MAX);
 
-  tensor_list = debug_services->ReadNeededDumpedTensors(iteration, &file_paths);
+  tensor_list = debug_services_->ReadNeededDumpedTensors(iteration, &file_paths);
 
-  debug_services->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, &error_codes, overflow_ops,
-                                   file_paths, &tensor_list, init_dbg_suspend, true, true, &rank_id, &root_graph_id);
+  debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, &error_codes, overflow_ops,
+                                    file_paths, &tensor_list, init_dbg_suspend, true, true, &rank_id, &root_graph_id);
 
   std::vector<watchpoint_hit_t> hits;
   for (unsigned int i = 0; i < name.size(); i++) {
@@ -252,11 +252,11 @@ std::vector<tensor_data_t> DbgServices::ReadTensors(std::vector<tensor_info_t> i
   std::vector<std::string> file_paths;
   auto t1 = std::chrono::high_resolution_clock::now();
   // Convert the dumped data to npy format if it's async mode.
-  if (!debug_services->GetSyncMode()) {
-    debug_services->ConvertReadTensors(backend_name, slot, rank_id, iteration, root_graph_id, &file_paths);
+  if (!debug_services_->GetSyncMode()) {
+    debug_services_->ConvertReadTensors(backend_name, slot, rank_id, iteration, root_graph_id, &file_paths);
   }
-  debug_services->ReadDumpedTensor(backend_name, slot, rank_id, iteration, root_graph_id, is_output, file_paths,
-                                   &result_list);
+  debug_services_->ReadDumpedTensor(backend_name, slot, rank_id, iteration, root_graph_id, is_output, file_paths,
+                                    &result_list);
   auto t2 = std::chrono::high_resolution_clock::now();
   /* Getting number of milliseconds as a double. */
   std::chrono::duration<double, std::milli> ms_double = t2 - t1;
diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.h b/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.h
index c53e5a1efa4..5243c413a6e 100644
--- a/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.h
+++ b/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.h
@@ -119,7 +119,7 @@ struct tensor_data_t {
 
 class DbgServices {
  private:
-  DebugServices *debug_services;
+  DebugServices *debug_services_;
 
  public:
   explicit DbgServices(bool verbose = false);
diff --git a/mindspore/ccsrc/debug/draw.cc b/mindspore/ccsrc/debug/draw.cc
index 734ae7081af..769ee812a66 100644
--- a/mindspore/ccsrc/debug/draw.cc
+++ b/mindspore/ccsrc/debug/draw.cc
@@ -30,7 +30,6 @@
 #include "pipeline/jit/parse/resolve.h"
 #include "ir/tensor.h"
 #include "pipeline/jit/base.h"
-#include "debug/common.h"
 
 namespace mindspore {
 // namespace to support debug utils
@@ -189,7 +188,7 @@ void Draw(const std::string &filename, const FuncGraphPtr &func_graph) {
   const std::string dot_suffix = ".dot";
   const std::string filename_with_suffix =
     (filename.rfind(dot_suffix) != (filename.size() - dot_suffix.size())) ? (filename + dot_suffix) : filename;
-  const std::string filepath = pipeline::GetSaveGraphsPathName(Common::AddId(filename_with_suffix, dot_suffix));
+  const std::string filepath = GetSaveGraphsPathName(Common::AddId(filename_with_suffix, dot_suffix));
   auto real_filepath = Common::GetRealPath(filepath);
   if (!real_filepath.has_value()) {
     MS_LOG(EXCEPTION) << "The export ir path: " << filepath << " is not illegal.";
@@ -199,7 +198,7 @@ void Draw(const std::string &filename, const FuncGraphPtr &func_graph) {
 
 void DrawUserFuncGraph(const std::string &filename, const FuncGraphPtr &func_graph) {
   const std::string dot_suffix = ".dot";
-  const std::string filepath = pipeline::GetSaveGraphsPathName(Common::AddId(filename, dot_suffix));
+  const std::string filepath = GetSaveGraphsPathName(Common::AddId(filename, dot_suffix));
   auto real_filepath = Common::GetRealPath(filepath);
   if (!real_filepath.has_value()) {
     MS_LOG(EXCEPTION) << "The export ir path: " << filepath << " is not illegal.";
diff --git a/mindspore/ccsrc/debug/draw.h b/mindspore/ccsrc/debug/draw.h
index 85a6af5d1e1..350c7af3c87 100644
--- a/mindspore/ccsrc/debug/draw.h
+++ b/mindspore/ccsrc/debug/draw.h
@@ -22,6 +22,7 @@
 #include <vector>
 #include "ir/anf.h"
 #include "utils/any.h"
+#include "debug/common.h"
 
 namespace mindspore {
 namespace draw {
diff --git a/mindspore/ccsrc/debug/dump_proto.cc b/mindspore/ccsrc/debug/dump_proto.cc
index ec6a67ed872..f6a9bbb3a48 100644
--- a/mindspore/ccsrc/debug/dump_proto.cc
+++ b/mindspore/ccsrc/debug/dump_proto.cc
@@ -23,7 +23,6 @@
 #include <vector>
 
 #include "debug/anf_ir_utils.h"
-#include "debug/common.h"
 #include "proto/anf_ir.pb.h"
 #include "ir/graph_utils.h"
 #include "utils/ms_context.h"
@@ -544,7 +543,7 @@ void DumpIRProto(const FuncGraphPtr &func_graph, const std::string &suffix) {
     MS_LOG(ERROR) << "Func graph is nullptr";
     return;
   }
-  std::string file_path = pipeline::GetSaveGraphsPathName("ms_output_" + suffix + ".pb");
+  std::string file_path = GetSaveGraphsPathName("ms_output_" + suffix + ".pb");
   auto realpath = Common::GetRealPath(file_path);
   if (!realpath.has_value()) {
     MS_LOG(ERROR) << "Get real path failed, path=" << file_path;
diff --git a/mindspore/ccsrc/debug/dump_proto.h b/mindspore/ccsrc/debug/dump_proto.h
index ce826db13db..2e035c21376 100644
--- a/mindspore/ccsrc/debug/dump_proto.h
+++ b/mindspore/ccsrc/debug/dump_proto.h
@@ -20,6 +20,7 @@
 
 #include "ir/func_graph.h"
 #include "proto/mind_ir.pb.h"
+#include "debug/common.h"
 
 namespace mindspore {
 std::string GetFuncGraphProtoString(const FuncGraphPtr &func_graph);
diff --git a/mindspore/ccsrc/debug/env_config_parser.cc b/mindspore/ccsrc/debug/env_config_parser.cc
index 3a43fcc1238..8a3ff8e3f9a 100644
--- a/mindspore/ccsrc/debug/env_config_parser.cc
+++ b/mindspore/ccsrc/debug/env_config_parser.cc
@@ -23,16 +23,19 @@
 #include "utils/convert_utils_base.h"
 
 namespace {
+#ifdef ENABLE_DUMP_IR
 constexpr auto ENV_RDR_ENABLE = "MS_RDR_ENABLE";
 constexpr auto ENV_RDR_PATH = "MS_RDR_PATH";
 constexpr auto KEY_RDR_SETTINGS = "rdr";
 constexpr auto KEY_PATH = "path";
 constexpr auto KEY_ENABLE = "enable";
+#endif
 constexpr auto KEY_MEM_REUSE_SETTINGS = "sys";
 constexpr auto KEY_MEM_REUSE = "mem_reuse";
 }  // namespace
 
 namespace mindspore {
+#ifdef ENABLE_DUMP_IR
 std::optional<bool> GetRdrEnableFromEnv() {
   // get environment variable to configure RDR
   std::string env_enable_str = common::GetEnv(ENV_RDR_ENABLE);
@@ -62,6 +65,7 @@ std::optional<std::string> GetRdrPathFromEnv() {
   }
   return std::nullopt;
 }
+#endif
 
 bool EnvConfigParser::CheckJsonStringType(const nlohmann::json &content, const std::string &setting_key,
                                           const std::string &key) const {
@@ -91,6 +95,7 @@ std::string EnvConfigParser::GetIfstreamString(const std::ifstream &ifstream) co
 }
 
 void EnvConfigParser::ParseFromEnv() {
+#ifdef ENABLE_DUMP_IR
   // Get RDR seetings from environment variables
   auto rdr_enable_env = GetRdrEnableFromEnv();
   if (rdr_enable_env.has_value()) {
@@ -108,6 +113,7 @@ void EnvConfigParser::ParseFromEnv() {
       rdr_path_ = path;
     }
   }
+#endif
 }
 
 void EnvConfigParser::ParseFromFile() {
@@ -142,8 +148,9 @@ void EnvConfigParser::ParseFromFile() {
   std::string cfg = ss.str();
   MS_LOG(INFO) << "Env config json:" << cfg;
 
-  // Parse rdr seetings from file
+#ifdef ENABLE_DUMP_IR
   ParseRdrSetting(j);
+#endif
   ParseMemReuseSetting(j);
 
   ConfigToString();
@@ -181,6 +188,7 @@ void EnvConfigParser::ParseSysMemReuse(const nlohmann::json &content) {
   sys_memreuse_ = content;
 }
 
+#ifdef ENABLE_DUMP_IR
 void EnvConfigParser::ParseRdrSetting(const nlohmann::json &content) {
   auto rdr_setting = content.find(KEY_RDR_SETTINGS);
   if (rdr_setting == content.end()) {
@@ -231,14 +239,17 @@ void EnvConfigParser::ParseRdrEnable(const nlohmann::json &content) {
   }
   rdr_enabled_ = content;
 }
+#endif
 
 void EnvConfigParser::ConfigToString() {
   std::string cur_config;
+#ifdef ENABLE_DUMP_IR
   cur_config.append("After parsed, rdr path: ");
   cur_config.append(rdr_path_);
   cur_config.append(", rdr_enable: ");
   std::string rdr_enable_flag = rdr_enabled_ ? "1" : "0";
   (void)cur_config.append(rdr_enable_flag);
+#endif
   MS_LOG(INFO) << cur_config;
 }
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/debug/env_config_parser.h b/mindspore/ccsrc/debug/env_config_parser.h
index e38c2b67359..59def33f2b1 100644
--- a/mindspore/ccsrc/debug/env_config_parser.h
+++ b/mindspore/ccsrc/debug/env_config_parser.h
@@ -34,9 +34,11 @@ class EnvConfigParser {
   void Parse();
   std::string ConfigPath() const { return config_file_; }
 
+#ifdef ENABLE_DUMP_IR
   bool HasRdrSetting() const { return has_rdr_setting_; }
   bool RdrEnabled() const { return rdr_enabled_; }
   std::string RdrPath() const { return rdr_path_; }
+#endif
   bool GetSysMemreuse() { return sys_memreuse_; }
   void SetSysMemreuse(bool set_memreuse) { sys_memreuse_ = set_memreuse; }
 
@@ -48,10 +50,12 @@ class EnvConfigParser {
   std::string config_file_{""};
   bool already_parsed_{false};
 
+#ifdef ENABLE_DUMP_IR
   // rdr
   bool rdr_enabled_{false};
   bool has_rdr_setting_{false};
   std::string rdr_path_{"./rdr/"};
+#endif
 
   // memreuse
   bool sys_memreuse_{true};
@@ -63,9 +67,11 @@ class EnvConfigParser {
   std::optional<nlohmann::detail::iter_impl<const nlohmann::json>> CheckJsonKeyExist(const nlohmann::json &content,
                                                                                      const std::string &setting_key,
                                                                                      const std::string &key) const;
+#ifdef ENABLE_DUMP_IR
   void ParseRdrSetting(const nlohmann::json &content);
   void ParseRdrPath(const nlohmann::json &content);
   void ParseRdrEnable(const nlohmann::json &content);
+#endif
   void ParseMemReuseSetting(const nlohmann::json &content);
   void ParseSysMemReuse(const nlohmann::json &content);
 
diff --git a/mindspore/ccsrc/debug/tensor_data.h b/mindspore/ccsrc/debug/tensor_data.h
index 0a85b10ede6..91b9100f2b3 100644
--- a/mindspore/ccsrc/debug/tensor_data.h
+++ b/mindspore/ccsrc/debug/tensor_data.h
@@ -157,143 +157,143 @@ typedef enum DbgDataType : unsigned int {
 
 class TensorData {
  public:
-  TensorData() : slot(0), execution_order(-1) {}
+  TensorData() : slot_(0), execution_order_(-1) {}
 
   TensorData(const TensorData &obj) {
     MS_LOG(INFO) << "Copy Constructor";
-    this->name = obj.name;
-    this->execution_order = obj.execution_order;
-    this->slot = obj.slot;
-    this->data_ptr = obj.data_ptr;
-    this->size = obj.size;
-    this->data_type = obj.data_type;
-    this->data_type_size = obj.data_type_size;
-    this->shape = obj.shape;
-    this->iteration = obj.iteration;
-    this->device_id = obj.device_id;
-    this->data_ptr = obj.data_ptr;
-    this->root_graph_id = obj.root_graph_id;
-    this->is_output = obj.is_output;
+    this->name_ = obj.name_;
+    this->execution_order_ = obj.execution_order_;
+    this->slot_ = obj.slot_;
+    this->data_ptr_ = obj.data_ptr_;
+    this->size_ = obj.size_;
+    this->data_type_ = obj.data_type_;
+    this->data_type_size_ = obj.data_type_size_;
+    this->shape_ = obj.shape_;
+    this->iteration_ = obj.iteration_;
+    this->device_id_ = obj.device_id_;
+    this->data_ptr_ = obj.data_ptr_;
+    this->root_graph_id_ = obj.root_graph_id_;
+    this->is_output_ = obj.is_output_;
 #ifdef ONLINE_DBG_MODE
-    this->tensor_ptr = obj.tensor_ptr;
+    this->tensor_ptr_ = obj.tensor_ptr_;
 #endif
   }
 
   ~TensorData() {}
 
-  std::string GetName() const { return this->name; }
+  std::string GetName() const { return this->name_; }
 
-  size_t GetSlot() const { return this->slot; }
+  size_t GetSlot() const { return this->slot_; }
 
-  int GetExecutionOrder() const { return this->execution_order; }
+  int GetExecutionOrder() const { return this->execution_order_; }
 
-  void SetExecutionOrder(int execution_order) { this->execution_order = execution_order; }
+  void SetExecutionOrder(int execution_order) { this->execution_order_ = execution_order; }
 
-  void SetName(const std::string &name) { this->name = name; }
+  void SetName(const std::string &name) { this->name_ = name; }
 
 #ifdef ONLINE_DBG_MODE
-  void SetTensor(mindspore::tensor::TensorPtr out_tensor) { this->tensor_ptr = out_tensor; }
+  void SetTensor(mindspore::tensor::TensorPtr out_tensor) { this->tensor_ptr_ = out_tensor; }
 #endif
 
-  void SetSlot(size_t slot) { this->slot = slot; }
+  void SetSlot(size_t slot) { this->slot_ = slot; }
 
-  char *GetDataPtr() const { return this->data_ptr; }
+  char *GetDataPtr() const { return this->data_ptr_; }
 
-  void SetDataPtr(char *data_ptr) { this->data_ptr = data_ptr; }
+  void SetDataPtr(char *data_ptr) { this->data_ptr_ = data_ptr; }
 
-  uint32_t GetNumElements() { return size / data_type_size; }
+  uint32_t GetNumElements() { return size_ / data_type_size_; }
 
-  uint64_t GetByteSize() const { return this->size; }
+  uint64_t GetByteSize() const { return this->size_; }
 
-  void SetByteSize(uint64_t size) { this->size = size; }
+  void SetByteSize(uint64_t size) { this->size_ = size; }
 
-  std::vector<int64_t> GetShape() const { return this->shape; }
+  std::vector<int64_t> GetShape() const { return this->shape_; }
 
-  void SetShape(std::vector<int64_t> shape) { this->shape = shape; }
+  void SetShape(std::vector<int64_t> shape) { this->shape_ = shape; }
 
-  unsigned int GetIteration() const { return this->iteration; }
+  unsigned int GetIteration() const { return this->iteration_; }
 
-  void SetIteration(unsigned int iteration) { this->iteration = iteration; }
+  void SetIteration(unsigned int iteration) { this->iteration_ = iteration; }
 
-  unsigned int GetDeviceId() const { return this->device_id; }
+  unsigned int GetDeviceId() const { return this->device_id_; }
 
-  void SetDeviceId(unsigned int device_id) { this->device_id = device_id; }
+  void SetDeviceId(unsigned int device_id) { this->device_id_ = device_id; }
 
-  unsigned int GetRootGraphId() const { return this->root_graph_id; }
+  unsigned int GetRootGraphId() const { return this->root_graph_id_; }
 
-  void SetRootGraphId(unsigned int root_graph_id) { this->root_graph_id = root_graph_id; }
+  void SetRootGraphId(unsigned int root_graph_id) { this->root_graph_id_ = root_graph_id; }
 
-  DbgDataType GetType() const { return this->data_type; }
+  DbgDataType GetType() const { return this->data_type_; }
 
   void SetType(unsigned int type) { ConvertMsToDbgType(type); }
 
   void SetType(std::string type_name) { ConvertStringToDbgType(type_name); }
 
-  bool GetIsOutput() const { return this->is_output; }
+  bool GetIsOutput() const { return this->is_output_; }
 
-  void SetIsOutput(bool is_output) { this->is_output = is_output; }
+  void SetIsOutput(bool is_output) { this->is_output_ = is_output; }
 
   void ConvertMsToDbgType(uint32_t type) {
     switch (type) {
       case MsTypeId::kNumberTypeBool:
-        this->data_type = DbgDataType::DT_BOOL;
-        this->data_type_size = 1;
+        this->data_type_ = DbgDataType::DT_BOOL;
+        this->data_type_size_ = 1;
         break;
       case MsTypeId::kNumberTypeInt8:
-        this->data_type = DbgDataType::DT_INT8;
-        this->data_type_size = 1;
+        this->data_type_ = DbgDataType::DT_INT8;
+        this->data_type_size_ = 1;
         break;
       case MsTypeId::kNumberTypeInt16:
-        this->data_type = DbgDataType::DT_INT16;
-        this->data_type_size = 2;
+        this->data_type_ = DbgDataType::DT_INT16;
+        this->data_type_size_ = 2;
         break;
       case MsTypeId::kNumberTypeInt32:
-        this->data_type = DbgDataType::DT_INT32;
-        this->data_type_size = 4;
+        this->data_type_ = DbgDataType::DT_INT32;
+        this->data_type_size_ = 4;
         break;
       case MsTypeId::kNumberTypeInt64:
-        this->data_type = DbgDataType::DT_INT64;
-        this->data_type_size = 8;
+        this->data_type_ = DbgDataType::DT_INT64;
+        this->data_type_size_ = 8;
         break;
       case MsTypeId::kNumberTypeUInt8:
-        this->data_type = DbgDataType::DT_UINT8;
-        this->data_type_size = 1;
+        this->data_type_ = DbgDataType::DT_UINT8;
+        this->data_type_size_ = 1;
         break;
       case MsTypeId::kNumberTypeUInt16:
-        this->data_type = DbgDataType::DT_UINT16;
-        this->data_type_size = 2;
+        this->data_type_ = DbgDataType::DT_UINT16;
+        this->data_type_size_ = 2;
         break;
       case MsTypeId::kNumberTypeUInt32:
-        this->data_type = DbgDataType::DT_UINT32;
-        this->data_type_size = 4;
+        this->data_type_ = DbgDataType::DT_UINT32;
+        this->data_type_size_ = 4;
         break;
       case MsTypeId::kNumberTypeUInt64:
-        this->data_type = DbgDataType::DT_UINT64;
-        this->data_type_size = 8;
+        this->data_type_ = DbgDataType::DT_UINT64;
+        this->data_type_size_ = 8;
         break;
       case MsTypeId::kNumberTypeFloat16:
-        this->data_type = DbgDataType::DT_FLOAT16;
-        this->data_type_size = 2;
+        this->data_type_ = DbgDataType::DT_FLOAT16;
+        this->data_type_size_ = 2;
         break;
       case MsTypeId::kNumberTypeFloat32:
-        this->data_type = DbgDataType::DT_FLOAT32;
-        this->data_type_size = 4;
+        this->data_type_ = DbgDataType::DT_FLOAT32;
+        this->data_type_size_ = 4;
         break;
       case MsTypeId::kNumberTypeFloat64:
-        this->data_type = DbgDataType::DT_FLOAT64;
-        this->data_type_size = 8;
+        this->data_type_ = DbgDataType::DT_FLOAT64;
+        this->data_type_size_ = 8;
         break;
       case MsTypeId::kNumberTypeInt:
-        this->data_type = DbgDataType::DT_BASE_INT;
-        this->data_type_size = 4;
+        this->data_type_ = DbgDataType::DT_BASE_INT;
+        this->data_type_size_ = 4;
         break;
       case MsTypeId::kNumberTypeUInt:
-        this->data_type = DbgDataType::DT_BASE_UINT;
-        this->data_type_size = 4;
+        this->data_type_ = DbgDataType::DT_BASE_UINT;
+        this->data_type_size_ = 4;
         break;
       case MsTypeId::kNumberTypeFloat:
-        this->data_type = DbgDataType::DT_BASE_FLOAT;
-        this->data_type_size = 4;
+        this->data_type_ = DbgDataType::DT_BASE_FLOAT;
+        this->data_type_size_ = 4;
         break;
       default:
         MS_LOG(EXCEPTION) << "Unexpected type id: " << type;
@@ -302,52 +302,52 @@ class TensorData {
 
   bool ConvertNpyStringToDbgType(const std::string &type_name) {
     if (type_name == "b1") {
-      this->data_type = DbgDataType::DT_BOOL;
-      this->data_type_size = 1;
+      this->data_type_ = DbgDataType::DT_BOOL;
+      this->data_type_size_ = 1;
       return true;
     } else if (type_name == "i1") {
-      this->data_type = DbgDataType::DT_INT8;
-      this->data_type_size = 1;
+      this->data_type_ = DbgDataType::DT_INT8;
+      this->data_type_size_ = 1;
       return true;
     } else if (type_name == "i2") {
-      this->data_type = DbgDataType::DT_INT16;
-      this->data_type_size = 2;
+      this->data_type_ = DbgDataType::DT_INT16;
+      this->data_type_size_ = 2;
       return true;
     } else if (type_name == "i4") {
-      this->data_type = DbgDataType::DT_INT32;
-      this->data_type_size = 4;
+      this->data_type_ = DbgDataType::DT_INT32;
+      this->data_type_size_ = 4;
       return true;
     } else if (type_name == "i8") {
-      this->data_type = DbgDataType::DT_INT64;
-      this->data_type_size = 8;
+      this->data_type_ = DbgDataType::DT_INT64;
+      this->data_type_size_ = 8;
       return true;
     } else if (type_name == "u1") {
-      this->data_type = DbgDataType::DT_UINT8;
-      this->data_type_size = 1;
+      this->data_type_ = DbgDataType::DT_UINT8;
+      this->data_type_size_ = 1;
       return true;
     } else if (type_name == "u2") {
-      this->data_type = DbgDataType::DT_UINT16;
-      this->data_type_size = 2;
+      this->data_type_ = DbgDataType::DT_UINT16;
+      this->data_type_size_ = 2;
       return true;
     } else if (type_name == "u4") {
-      this->data_type = DbgDataType::DT_UINT32;
-      this->data_type_size = 4;
+      this->data_type_ = DbgDataType::DT_UINT32;
+      this->data_type_size_ = 4;
       return true;
     } else if (type_name == "u8") {
-      this->data_type = DbgDataType::DT_UINT64;
-      this->data_type_size = 8;
+      this->data_type_ = DbgDataType::DT_UINT64;
+      this->data_type_size_ = 8;
       return true;
     } else if (type_name == "f2") {
-      this->data_type = DbgDataType::DT_FLOAT16;
-      this->data_type_size = 2;
+      this->data_type_ = DbgDataType::DT_FLOAT16;
+      this->data_type_size_ = 2;
       return true;
     } else if (type_name == "f4") {
-      this->data_type = DbgDataType::DT_FLOAT32;
-      this->data_type_size = 4;
+      this->data_type_ = DbgDataType::DT_FLOAT32;
+      this->data_type_size_ = 4;
       return true;
     } else if (type_name == "f8") {
-      this->data_type = DbgDataType::DT_FLOAT64;
-      this->data_type_size = 8;
+      this->data_type_ = DbgDataType::DT_FLOAT64;
+      this->data_type_size_ = 8;
       return true;
     } else {
       return false;
@@ -362,44 +362,44 @@ class TensorData {
     }
     (void)std::transform(type_name_lower.begin(), type_name_lower.end(), type_name_lower.begin(), ::tolower);
     if (type_name_lower == "bool") {
-      this->data_type = DbgDataType::DT_BOOL;
-      this->data_type_size = 1;
+      this->data_type_ = DbgDataType::DT_BOOL;
+      this->data_type_size_ = 1;
     } else if (type_name_lower == "int8") {
-      this->data_type = DbgDataType::DT_INT8;
-      this->data_type_size = 1;
+      this->data_type_ = DbgDataType::DT_INT8;
+      this->data_type_size_ = 1;
     } else if (type_name_lower == "int16") {
-      this->data_type = DbgDataType::DT_INT16;
-      this->data_type_size = 2;
+      this->data_type_ = DbgDataType::DT_INT16;
+      this->data_type_size_ = 2;
     } else if (type_name_lower == "int32") {
-      this->data_type = DbgDataType::DT_INT32;
-      this->data_type_size = 4;
+      this->data_type_ = DbgDataType::DT_INT32;
+      this->data_type_size_ = 4;
     } else if (type_name_lower == "int64") {
-      this->data_type = DbgDataType::DT_INT64;
-      this->data_type_size = 8;
+      this->data_type_ = DbgDataType::DT_INT64;
+      this->data_type_size_ = 8;
     } else if (type_name_lower == "uint8") {
-      this->data_type = DbgDataType::DT_UINT8;
-      this->data_type_size = 1;
+      this->data_type_ = DbgDataType::DT_UINT8;
+      this->data_type_size_ = 1;
     } else if (type_name_lower == "uint16") {
-      this->data_type = DbgDataType::DT_UINT16;
-      this->data_type_size = 2;
+      this->data_type_ = DbgDataType::DT_UINT16;
+      this->data_type_size_ = 2;
     } else if (type_name_lower == "uint32") {
-      this->data_type = DbgDataType::DT_UINT32;
-      this->data_type_size = 4;
+      this->data_type_ = DbgDataType::DT_UINT32;
+      this->data_type_size_ = 4;
     } else if (type_name_lower == "uint64") {
-      this->data_type = DbgDataType::DT_UINT64;
-      this->data_type_size = 8;
+      this->data_type_ = DbgDataType::DT_UINT64;
+      this->data_type_size_ = 8;
     } else if (type_name_lower == "float16") {
-      this->data_type = DbgDataType::DT_FLOAT16;
-      this->data_type_size = 2;
+      this->data_type_ = DbgDataType::DT_FLOAT16;
+      this->data_type_size_ = 2;
     } else if (type_name_lower == "float32") {
-      this->data_type = DbgDataType::DT_FLOAT32;
-      this->data_type_size = 4;
+      this->data_type_ = DbgDataType::DT_FLOAT32;
+      this->data_type_size_ = 4;
     } else if (type_name_lower == "float64") {
-      this->data_type = DbgDataType::DT_FLOAT64;
-      this->data_type_size = 8;
+      this->data_type_ = DbgDataType::DT_FLOAT64;
+      this->data_type_size_ = 8;
     } else if (type_name_lower == "") {
-      this->data_type = DbgDataType::DT_UNDEFINED;
-      this->data_type_size = 0;
+      this->data_type_ = DbgDataType::DT_UNDEFINED;
+      this->data_type_size_ = 0;
     } else {
       if (!ConvertNpyStringToDbgType(type_name_lower)) {
         MS_LOG(EXCEPTION) << "Unexpected type name: " << type_name;
@@ -408,20 +408,20 @@ class TensorData {
   }
 
  private:
-  char *data_ptr;         // pointer to the pre-allocated memory
-  uint64_t size;          // size in bytes
-  DbgDataType data_type;  // internal debugger type
-  unsigned int data_type_size;
-  std::vector<int64_t> shape;
-  std::string name;
-  uint64_t slot;
-  unsigned int iteration;
-  unsigned int device_id;
-  unsigned int root_graph_id;
-  bool is_output;
-  int execution_order;
+  char *data_ptr_;         // pointer to the pre-allocated memory
+  uint64_t size_;          // size_ in bytes
+  DbgDataType data_type_;  // internal debugger type
+  unsigned int data_type_size_;
+  std::vector<int64_t> shape_;
+  std::string name_;
+  uint64_t slot_;
+  unsigned int iteration_;
+  unsigned int device_id_;
+  unsigned int root_graph_id_;
+  bool is_output_;
+  int execution_order_;
 #ifdef ONLINE_DBG_MODE
-  mindspore::tensor::TensorPtr tensor_ptr;
+  mindspore::tensor::TensorPtr tensor_ptr_;
 #endif
 };
 #ifdef ONLINE_DBG_MODE
diff --git a/mindspore/ccsrc/debug/tensor_load.h b/mindspore/ccsrc/debug/tensor_load.h
index aeb402ea24b..12e1d8f7539 100644
--- a/mindspore/ccsrc/debug/tensor_load.h
+++ b/mindspore/ccsrc/debug/tensor_load.h
@@ -33,22 +33,22 @@ namespace mindspore {
 #endif
 class TensorLoader {
  public:
-  TensorLoader() : iter_num(-1) {}
+  TensorLoader() : iter_num_(-1) {}
 
   ~TensorLoader() { EmptyTensor(); }
 
   void MoveTensorCurrentToPrev(std::string tensor_name) {
-    auto handle = tensor_list_map.extract(tensor_name);
+    auto handle = tensor_list_map_.extract(tensor_name);
     if (!handle.empty()) {
       MS_LOG(INFO) << "Moving " << tensor_name << " from current map to previous map";
-      prev_tensor_list_map.insert(std::move(handle));
+      prev_tensor_list_map_.insert(std::move(handle));
     }
   }
 
-  void SwapCurrentPrev() { tensor_list_map.swap(prev_tensor_list_map); }
+  void SwapCurrentPrev() { tensor_list_map_.swap(prev_tensor_list_map_); }
 
   bool TensorExistsInCurrent(std::string tensor_name) const {
-    return tensor_list_map.find(tensor_name) != tensor_list_map.end();
+    return tensor_list_map_.find(tensor_name) != tensor_list_map_.end();
   }
 
   // only parameters will return true
@@ -56,8 +56,8 @@ class TensorLoader {
 
   void MoveParametersCurrentToPrev() {
     MS_LOG(INFO) << "Moving parameters from current map to previous map";
-    auto iter = tensor_list_map.begin();
-    while (iter != tensor_list_map.end()) {
+    auto iter = tensor_list_map_.begin();
+    while (iter != tensor_list_map_.end()) {
       auto key = iter->first;
       if (PrevTensorExistsInCurrent(key)) {
         // :prev tensor only exists for parameter. Move it to prev
@@ -79,47 +79,47 @@ class TensorLoader {
     std::lock_guard<std::mutex> lg(lock_);
     if (keep_prev) {
       // add prev step tensor into current step map with ":prev" suffix
-      auto handle = prev_tensor_list_map.extract(tensor->GetName());
+      auto handle = prev_tensor_list_map_.extract(tensor->GetName());
       if (!handle.empty()) {
         handle.key() = tensor->GetName() + ":prev";
-        tensor_list_map.insert(std::move(handle));
+        tensor_list_map_.insert(std::move(handle));
       }
     }
-    tensor_list_map[tensor->GetName()] = tensor;  // use [] instead of insert to ensure latest value
+    tensor_list_map_[tensor->GetName()] = tensor;  // use [] instead of insert to ensure latest value
     auto node_name = tensor->GetName();
     node_name = node_name.substr(0, node_name.find_first_of(":"));
-    node_tensor_map.insert({node_name, tensor});
+    node_tensor_map_.insert({node_name, tensor});
     return true;
   }
 
   std::vector<std::shared_ptr<TensorData>> GetTensor() {
     std::vector<std::shared_ptr<TensorData>> tensor_list;
-    for (auto &it : tensor_list_map) {
+    for (auto &it : tensor_list_map_) {
       if (!IsPrevTensor(it.first)) tensor_list.push_back(it.second);
     }
     return tensor_list;
   }
 
   std::shared_ptr<TensorData> GetTensor(const std::string &tensor_name) const {
-    auto iter = tensor_list_map.find(tensor_name);
-    if (iter != tensor_list_map.end()) return iter->second;
+    auto iter = tensor_list_map_.find(tensor_name);
+    if (iter != tensor_list_map_.end()) return iter->second;
     return nullptr;
   }
 
-  uint32_t GetIterNum() const { return iter_num; }
+  uint32_t GetIterNum() const { return iter_num_; }
 
-  std::map<std::string, std::shared_ptr<TensorData>> GetTensorMap() { return tensor_list_map; }
+  std::map<std::string, std::shared_ptr<TensorData>> GetTensorMap() { return tensor_list_map_; }
 
   std::shared_ptr<TensorData> GetPrevTensor(const std::string &tensor_name) {
-    if (tensor_list_map.find(tensor_name + ":prev") != tensor_list_map.end()) {
-      return tensor_list_map[tensor_name + ":prev"];
+    if (tensor_list_map_.find(tensor_name + ":prev") != tensor_list_map_.end()) {
+      return tensor_list_map_[tensor_name + ":prev"];
     }
     return nullptr;
   }
 
   std::vector<std::shared_ptr<TensorData>> GetNodeTensorMap(std::string node_name) {
     std::vector<std::shared_ptr<TensorData>> tensors;
-    for (auto itr = node_tensor_map.begin(); itr != node_tensor_map.end(); itr++) {
+    for (auto itr = node_tensor_map_.begin(); itr != node_tensor_map_.end(); itr++) {
       if (itr->first == node_name) {
         tensors.push_back(itr->second);
       }
@@ -131,8 +131,8 @@ class TensorLoader {
                      std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list) {
     for (auto i : search_list) {
       std::map<std::string, std::shared_ptr<TensorData>>::iterator iter;
-      iter = tensor_list_map.find(i);
-      if (iter != tensor_list_map.end()) {
+      iter = tensor_list_map_.find(i);
+      if (iter != tensor_list_map_.end()) {
         result_list->push_back(std::make_tuple(i, iter->second));
       } else {
         result_list->push_back(std::make_tuple(i, nullptr));
@@ -142,19 +142,19 @@ class TensorLoader {
 
   void EmptyTensor() {
     std::lock_guard<std::mutex> lg(lock_);
-    prev_tensor_list_map.clear();
-    node_tensor_map.clear();
-    tensor_list_map.swap(prev_tensor_list_map);
+    prev_tensor_list_map_.clear();
+    node_tensor_map_.clear();
+    tensor_list_map_.swap(prev_tensor_list_map_);
   }
 
-  void EmptyPrevTensor() { prev_tensor_list_map.clear(); }
+  void EmptyPrevTensor() { prev_tensor_list_map_.clear(); }
 
   void EmptyCurrentTensor() {
-    tensor_list_map.clear();
-    node_tensor_map.clear();
+    tensor_list_map_.clear();
+    node_tensor_map_.clear();
   }
 
-  void set_iter_num(uint32_t iter_num) { this->iter_num = iter_num; }
+  void set_iter_num(uint32_t iter_num) { this->iter_num_ = iter_num; }
 
 #ifdef ONLINE_DBG_MODE
   bool DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
@@ -174,24 +174,24 @@ class TensorLoader {
     MS_LOG(INFO) << "Dump path is " << path;
 
     std::string tensor_loader_name = tensor_name + ":" + std::to_string(slot);
-    auto iter = tensor_list_map.find(tensor_loader_name);
-    if (iter != tensor_list_map.end()) {
+    auto iter = tensor_list_map_.find(tensor_loader_name);
+    if (iter != tensor_list_map_.end()) {
       std::shared_ptr<TensorData> node = iter->second;
       size_t host_size = node->GetByteSize();
 
       return DumpJsonParser::DumpToFile(path, node->GetDataPtr(), host_size, host_shape, host_type);
     }
-    MS_LOG(INFO) << "Tensor name:" << tensor_name << " not found in tensor_list_map";
+    MS_LOG(INFO) << "Tensor name:" << tensor_name << " not found in tensor_list_map_";
     return true;
   }
 #endif
 
  private:
   // the pair is (device_id, iteration)
-  std::map<std::string, std::shared_ptr<TensorData>> tensor_list_map;
-  std::multimap<std::string, std::shared_ptr<TensorData>> node_tensor_map;
-  std::map<std::string, std::shared_ptr<TensorData>> prev_tensor_list_map;
-  uint32_t iter_num;
+  std::map<std::string, std::shared_ptr<TensorData>> tensor_list_map_;
+  std::multimap<std::string, std::shared_ptr<TensorData>> node_tensor_map_;
+  std::map<std::string, std::shared_ptr<TensorData>> prev_tensor_list_map_;
+  uint32_t iter_num_;
   std::mutex lock_;
 };
 #ifdef ONLINE_DBG_MODE
diff --git a/mindspore/ccsrc/debug/trace.cc b/mindspore/ccsrc/debug/trace.cc
index 72d2de8f796..51737df9290 100644
--- a/mindspore/ccsrc/debug/trace.cc
+++ b/mindspore/ccsrc/debug/trace.cc
@@ -36,7 +36,9 @@
 #include "debug/common.h"
 #include "pipeline/jit/static_analysis/evaluator.h"
 #include "pipeline/jit/static_analysis/async_eval_result.h"
+#include "pipeline/jit/base.h"
 #include "utils/log_adapter.h"
+#include "utils/comm_manager.h"
 #include "abstract/abstract_value.h"
 
 namespace mindspore {
@@ -133,9 +135,11 @@ class AnalyzeFailExporter : public AnfExporter {
 
   bool ExportFuncGraph(const std::string &filename, const TraceCNodeEvalStack &node_config_stack);
 
- private:
+ protected:
   void OutputCNode(std::ofstream &ofs, const CNodePtr &cnode, const FuncGraphPtr &func_graph, int *idx,
                    std::map<AnfNodePtr, int> *const apply_map) override;
+
+ private:
   std::string GetNodeType(const AnfNodePtr &nd) override;
   AbstractBasePtr GetNodeAbstract(const AnfNodePtr &nd);
   AnfNodeConfigPtr GetForwardConfig(const AnfNodeConfigPtr &cfg);
@@ -400,6 +404,22 @@ bool AnalyzeFailExporter::ExportFuncGraph(const std::string &filename, const Tra
   return true;
 }
 
+std::string GetEvalFailDatPath() {
+  std::string path;
+  auto ms_om_path = common::GetEnv("MS_OM_PATH");
+  if (!ms_om_path.empty()) {
+    path = ms_om_path;
+  } else {
+    path = ".";
+  }
+  path += "/rank_" + std::to_string(GetRank()) + "/om/analyze_fail.dat";
+  auto realpath = Common::GetRealPath(path);
+  if (!realpath.has_value()) {
+    MS_EXCEPTION(ValueError) << "Get real path failed. path=" << path;
+  }
+  return realpath.value();
+}
+
 void GetEvalStackInfo(std::ostringstream &oss) {
   MS_LOG(INFO) << "Get graph analysis information begin";
   auto stack = GetCNodeDebugStack();
@@ -407,17 +427,7 @@ void GetEvalStackInfo(std::ostringstream &oss) {
     MS_LOG(INFO) << "Length of analysis information stack is empty.";
     return;
   }
-  string file_name = "analyze_fail.dat";
-  auto ms_om_path = common::GetEnv("MS_OM_PATH");
-  if (!ms_om_path.empty()) {
-    auto path = ms_om_path + "/" + file_name;
-    auto realpath = Common::GetRealPath(path);
-    if (!realpath.has_value()) {
-      MS_EXCEPTION(ValueError) << "Get real path failed. path=" << path;
-    }
-    file_name = realpath.value();
-  }
-
+  std::string file_name = GetEvalFailDatPath();
   auto ret = OutputAnalyzedGraphWithType(file_name);
   oss << "\nThe function call stack";
   if (ret) {
diff --git a/mindspore/ccsrc/fl/CMakeLists.txt b/mindspore/ccsrc/fl/CMakeLists.txt
index bab81a91bb4..ef0760372f5 100644
--- a/mindspore/ccsrc/fl/CMakeLists.txt
+++ b/mindspore/ccsrc/fl/CMakeLists.txt
@@ -20,6 +20,7 @@ if(NOT ENABLE_CPU OR WIN32)
     list(REMOVE_ITEM _FL_SRC_FILES "server/kernel/round/get_secrets_kernel.cc")
     list(REMOVE_ITEM _FL_SRC_FILES "server/kernel/round/reconstruct_secrets_kernel.cc")
     list(REMOVE_ITEM _FL_SRC_FILES "server/kernel/round/share_secrets_kernel.cc")
+    list(REMOVE_ITEM _FL_SRC_FILES "server/kernel/round/push_metrics_kernel.cc")
     list(REMOVE_ITEM _FL_SRC_FILES "server/kernel/params_info.cc")
     list(REMOVE_ITEM _FL_SRC_FILES "server/consistent_hash_ring.cc")
     list(REMOVE_ITEM _FL_SRC_FILES "server/iteration_timer.cc")
@@ -34,6 +35,7 @@ if(NOT ENABLE_CPU OR WIN32)
     list(REMOVE_ITEM _FL_SRC_FILES "server/model_store.cc")
     list(REMOVE_ITEM _FL_SRC_FILES "server/round.cc")
     list(REMOVE_ITEM _FL_SRC_FILES "server/server.cc")
+    list(REMOVE_ITEM _FL_SRC_FILES "server/iteration_metrics.cc")
     list(REMOVE_ITEM _FL_SRC_FILES "worker/fl_worker.cc")
     list(REMOVE_ITEM _FL_SRC_FILES "armour/secure_protocol/encrypt.cc")
     list(REMOVE_ITEM _FL_SRC_FILES "armour/secure_protocol/key_agreement.cc")
diff --git a/mindspore/ccsrc/fl/server/common.h b/mindspore/ccsrc/fl/server/common.h
index e462f09a907..a35ecb00244 100644
--- a/mindspore/ccsrc/fl/server/common.h
+++ b/mindspore/ccsrc/fl/server/common.h
@@ -65,6 +65,20 @@ struct CipherConfig {
   size_t reconstruct_secrets_threshold = 0;
 };
 
+// Every instance is one training loop that runs fl_iteration_num iterations of federated learning.
+// During every instance, server's training process could be controlled by scheduler, which will change the state of
+// this instance.
+enum class InstanceState {
+  // If this instance is in kRunning state, server could communicate with client/worker and the traning process moves
+  // on.
+  kRunning = 0,
+  // The server is not available for client/worker if in kDisable state.
+  kDisable,
+  // The server is not available for client/worker if in kDisable state. And this state means one instance has finished.
+  // In other words, fl_iteration_num iterations are completed.
+  kFinish
+};
+
 using mindspore::kernel::Address;
 using mindspore::kernel::AddressPtr;
 using mindspore::kernel::CPUKernel;
@@ -173,6 +187,7 @@ constexpr size_t kCipherMgrMaxTaskNum = 64;
 constexpr size_t kExecutorThreadPoolSize = 32;
 constexpr size_t kExecutorMaxTaskNum = 32;
 constexpr int kHttpSuccess = 200;
+constexpr uint32_t kThreadSleepTime = 50;
 constexpr auto kPBProtocol = "PB";
 constexpr auto kFBSProtocol = "FBS";
 constexpr auto kSuccess = "Success";
diff --git a/mindspore/ccsrc/fl/server/distributed_count_service.cc b/mindspore/ccsrc/fl/server/distributed_count_service.cc
index c28c76b856c..f143d2427c5 100644
--- a/mindspore/ccsrc/fl/server/distributed_count_service.cc
+++ b/mindspore/ccsrc/fl/server/distributed_count_service.cc
@@ -66,6 +66,20 @@ void DistributedCountService::RegisterCounter(const std::string &name, size_t gl
   return;
 }
 
+bool DistributedCountService::ReInitCounter(const std::string &name, size_t global_threshold_count) {
+  MS_LOG(INFO) << "Rank " << local_rank_ << " reinitialize counter for " << name << " count:" << global_threshold_count;
+  if (local_rank_ == counting_server_rank_) {
+    std::unique_lock<std::mutex> lock(mutex_[name]);
+    if (global_threshold_count_.count(name) == 0) {
+      MS_LOG(INFO) << "Counter for " << name << " is not set.";
+      return false;
+    }
+    global_current_count_[name] = {};
+    global_threshold_count_[name] = global_threshold_count;
+  }
+  return true;
+}
+
 bool DistributedCountService::Count(const std::string &name, const std::string &id, std::string *reason) {
   MS_LOG(INFO) << "Rank " << local_rank_ << " reports count for " << name << " of " << id;
   if (local_rank_ == counting_server_rank_) {
diff --git a/mindspore/ccsrc/fl/server/distributed_count_service.h b/mindspore/ccsrc/fl/server/distributed_count_service.h
index cdb137c4958..d98f2e9f195 100644
--- a/mindspore/ccsrc/fl/server/distributed_count_service.h
+++ b/mindspore/ccsrc/fl/server/distributed_count_service.h
@@ -63,6 +63,9 @@ class DistributedCountService {
   // first/last count event callbacks.
   void RegisterCounter(const std::string &name, size_t global_threshold_count, const CounterHandlers &counter_handlers);
 
+  // Reinitialize counter due to the change of threshold count.
+  bool ReInitCounter(const std::string &name, size_t global_threshold_count);
+
   // Report a count to the counting server. Parameter 'id' is in case of repeated counting. Parameter 'reason' is the
   // reason why counting failed.
   bool Count(const std::string &name, const std::string &id, std::string *reason = nullptr);
diff --git a/mindspore/ccsrc/fl/server/distributed_metadata_store.cc b/mindspore/ccsrc/fl/server/distributed_metadata_store.cc
index 1b12143e9e2..ec61e960781 100644
--- a/mindspore/ccsrc/fl/server/distributed_metadata_store.cc
+++ b/mindspore/ccsrc/fl/server/distributed_metadata_store.cc
@@ -50,7 +50,7 @@ void DistributedMetadataStore::RegisterMetadata(const std::string &name, const P
   uint32_t stored_rank = router_->Find(name);
   if (local_rank_ == stored_rank) {
     if (metadata_.count(name) != 0) {
-      MS_LOG(ERROR) << "The metadata for " << name << " is already registered.";
+      MS_LOG(WARNING) << "The metadata for " << name << " is already registered.";
       return;
     }
 
diff --git a/mindspore/ccsrc/fl/server/executor.cc b/mindspore/ccsrc/fl/server/executor.cc
index cf87a3513eb..f121f5aa3b5 100644
--- a/mindspore/ccsrc/fl/server/executor.cc
+++ b/mindspore/ccsrc/fl/server/executor.cc
@@ -51,6 +51,18 @@ bool Executor::ReInitForScaling() {
   return true;
 }
 
+bool Executor::ReInitForUpdatingHyperParams(size_t aggr_threshold) {
+  aggregation_count_ = aggr_threshold;
+  auto result = std::find_if(param_aggrs_.begin(), param_aggrs_.end(), [this](auto param_aggr) {
+    return !param_aggr.second->ReInitForUpdatingHyperParams(aggregation_count_);
+  });
+  if (result != param_aggrs_.end()) {
+    MS_LOG(ERROR) << "Reinitializing aggregator of " << result->first << " for scaling failed.";
+    return false;
+  }
+  return true;
+}
+
 bool Executor::initialized() const { return initialized_; }
 
 bool Executor::HandlePush(const std::string &param_name, const UploadData &upload_data) {
@@ -328,10 +340,10 @@ bool Executor::InitParamAggregator(const FuncGraphPtr &func_graph) {
     param_aggrs_[param_name] = param_aggr;
     parameter_mutex_[param_name];
     if (!param_aggr->Init(cnode, aggregation_count_)) {
-      MS_LOG(EXCEPTION) << "Initializing parameter aggregator failed for " << param_name;
+      MS_LOG(EXCEPTION) << "Initializing parameter aggregator for " << param_name << " failed.";
       return false;
     }
-    MS_LOG(DEBUG) << "Initializing control flow for param_name " << param_name << " success.";
+    MS_LOG(DEBUG) << "Initializing parameter aggregator for param_name " << param_name << " success.";
   }
   return true;
 }
diff --git a/mindspore/ccsrc/fl/server/executor.h b/mindspore/ccsrc/fl/server/executor.h
index bc0963cb519..3bc90288d5f 100644
--- a/mindspore/ccsrc/fl/server/executor.h
+++ b/mindspore/ccsrc/fl/server/executor.h
@@ -33,8 +33,6 @@
 namespace mindspore {
 namespace fl {
 namespace server {
-constexpr int kThreadSleepTime = 5;
-
 // Executor is the entrance for server to handle aggregation, optimizing, model querying, etc. It handles
 // logics relevant to kernel launching.
 class Executor {
@@ -53,6 +51,9 @@ class Executor {
   // Reinitialize parameter aggregators after scaling operations are done.
   bool ReInitForScaling();
 
+  // After hyper-parameters are updated, some parameter aggregators should be reinitialized.
+  bool ReInitForUpdatingHyperParams(size_t aggr_threshold);
+
   // Called in parameter server training mode to do Push operation.
   // For the same trainable parameter, HandlePush method must be called aggregation_count_ times before it's considered
   // as completed.
diff --git a/mindspore/ccsrc/fl/server/iteration.cc b/mindspore/ccsrc/fl/server/iteration.cc
index 21e4ad014d6..0ac831f3d22 100644
--- a/mindspore/ccsrc/fl/server/iteration.cc
+++ b/mindspore/ccsrc/fl/server/iteration.cc
@@ -26,6 +26,15 @@ namespace mindspore {
 namespace fl {
 namespace server {
 class Server;
+
+Iteration::~Iteration() {
+  move_to_next_thread_running_ = false;
+  next_iteration_cv_.notify_all();
+  if (move_to_next_thread_.joinable()) {
+    move_to_next_thread_.join();
+  }
+}
+
 void Iteration::RegisterMessageCallback(const std::shared_ptr<ps::core::TcpCommunicator> &communicator) {
   MS_EXCEPTION_IF_NULL(communicator);
   communicator_ = communicator;
@@ -79,9 +88,30 @@ void Iteration::InitRounds(const std::vector<std::shared_ptr<ps::core::Communica
                                                  });
   LocalMetaStore::GetInstance().put_value(kCtxTotalTimeoutDuration, iteration_time_window);
   MS_LOG(INFO) << "Time window for one iteration is " << iteration_time_window;
+
+  // Initialize the thread which will handle the signal from round kernels.
+  move_to_next_thread_ = std::thread([this]() {
+    while (move_to_next_thread_running_.load()) {
+      std::unique_lock<std::mutex> lock(next_iteration_mutex_);
+      next_iteration_cv_.wait(lock);
+      if (!move_to_next_thread_running_.load()) {
+        break;
+      }
+      MoveToNextIteration(is_last_iteration_valid_, move_to_next_reason_);
+    }
+  });
   return;
 }
 
+void Iteration::ClearRounds() { rounds_.clear(); }
+
+void Iteration::NotifyNext(bool is_last_iter_valid, const std::string &reason) {
+  std::unique_lock<std::mutex> lock(next_iteration_mutex_);
+  is_last_iteration_valid_ = is_last_iter_valid;
+  move_to_next_reason_ = reason;
+  next_iteration_cv_.notify_one();
+}
+
 void Iteration::MoveToNextIteration(bool is_last_iter_valid, const std::string &reason) {
   MS_LOG(INFO) << "Notify cluster starts to proceed to next iteration. Iteration is " << iteration_num_
                << " validation is " << is_last_iter_valid << ". Reason: " << reason;
@@ -119,7 +149,10 @@ void Iteration::SetIterationRunning() {
     // This event helps worker/server to be consistent in iteration state.
     server_node_->BroadcastEvent(static_cast<uint32_t>(ps::CustomEvent::kIterationRunning));
   }
+
+  std::unique_lock<std::mutex> lock(iteration_state_mtx_);
   iteration_state_ = IterationState::kRunning;
+  start_timestamp_ = LongToUlong(CURRENT_TIME_MILLI.count());
 }
 
 void Iteration::SetIterationCompleted() {
@@ -129,13 +162,17 @@ void Iteration::SetIterationCompleted() {
     // This event helps worker/server to be consistent in iteration state.
     server_node_->BroadcastEvent(static_cast<uint32_t>(ps::CustomEvent::kIterationCompleted));
   }
+
+  std::unique_lock<std::mutex> lock(iteration_state_mtx_);
   iteration_state_ = IterationState::kCompleted;
+  complete_timestamp_ = LongToUlong(CURRENT_TIME_MILLI.count());
 }
 
 void Iteration::ScalingBarrier() {
   MS_LOG(INFO) << "Starting Iteration scaling barrier.";
-  while (iteration_state_.load() != IterationState::kCompleted) {
-    std::this_thread::yield();
+  std::unique_lock<std::mutex> lock(iteration_state_mtx_);
+  if (iteration_state_.load() != IterationState::kCompleted) {
+    iteration_state_cv_.wait(lock);
   }
   MS_LOG(INFO) << "Ending Iteration scaling barrier.";
 }
@@ -156,10 +193,148 @@ bool Iteration::ReInitForScaling(uint32_t server_num, uint32_t server_rank) {
   return true;
 }
 
+bool Iteration::ReInitForUpdatingHyperParams(const std::vector<RoundConfig> &updated_rounds_config) {
+  for (const auto &updated_round : updated_rounds_config) {
+    for (const auto &round : rounds_) {
+      if (updated_round.name == round->name()) {
+        MS_LOG(INFO) << "Reinitialize for round " << round->name();
+        if (!round->ReInitForUpdatingHyperParams(updated_round.threshold_count, updated_round.time_window)) {
+          MS_LOG(ERROR) << "Reinitializing for round " << round->name() << " failed.";
+          return false;
+        }
+      }
+    }
+  }
+  return true;
+}
+
 const std::vector<std::shared_ptr<Round>> &Iteration::rounds() const { return rounds_; }
 
 bool Iteration::is_last_iteration_valid() const { return is_last_iteration_valid_; }
 
+void Iteration::set_metrics(const std::shared_ptr<IterationMetrics> &metrics) { metrics_ = metrics; }
+
+void Iteration::set_loss(float loss) { loss_ = loss; }
+
+void Iteration::set_accuracy(float accuracy) { accuracy_ = accuracy; }
+
+InstanceState Iteration::instance_state() const { return instance_state_.load(); }
+
+bool Iteration::EnableServerInstance(std::string *result) {
+  MS_ERROR_IF_NULL_W_RET_VAL(result, false);
+  // Before enabling server instance, we should judge whether this request should be handled.
+  std::unique_lock<std::mutex> lock(instance_mtx_);
+  if (is_instance_being_updated_) {
+    *result = "The instance is being updated. Please retry enabling server later.";
+    MS_LOG(WARNING) << *result;
+    return false;
+  }
+  if (instance_state_.load() == InstanceState::kFinish) {
+    *result = "The instance is completed. Please do not enabling server now.";
+    MS_LOG(WARNING) << *result;
+    return false;
+  }
+
+  // Start enabling server instance.
+  is_instance_being_updated_ = true;
+
+  instance_state_ = InstanceState::kRunning;
+  *result = "Enabling FL-Server succeeded.";
+  MS_LOG(INFO) << *result;
+
+  // End enabling server instance.
+  is_instance_being_updated_ = false;
+  return true;
+}
+
+bool Iteration::DisableServerInstance(std::string *result) {
+  MS_ERROR_IF_NULL_W_RET_VAL(result, false);
+  // Before disabling server instance, we should judge whether this request should be handled.
+  std::unique_lock<std::mutex> lock(instance_mtx_);
+  if (is_instance_being_updated_) {
+    *result = "The instance is being updated. Please retry disabling server later.";
+    MS_LOG(WARNING) << *result;
+    return false;
+  }
+  if (instance_state_.load() == InstanceState::kFinish) {
+    *result = "The instance is completed. Please do not disabling server now.";
+    MS_LOG(WARNING) << *result;
+    return false;
+  }
+  if (instance_state_.load() == InstanceState::kDisable) {
+    *result = "Disabling FL-Server succeeded.";
+    MS_LOG(INFO) << *result;
+    return true;
+  }
+
+  // Start disabling server instance.
+  is_instance_being_updated_ = true;
+
+  // If instance is running, we should drop current iteration and move to the next.
+  instance_state_ = InstanceState::kDisable;
+  if (!ForciblyMoveToNextIteration()) {
+    *result = "Disabling instance failed. Can't drop current iteration and move to the next.";
+    MS_LOG(ERROR) << result;
+    return false;
+  }
+  *result = "Disabling FL-Server succeeded.";
+  MS_LOG(INFO) << *result;
+
+  // End disabling server instance.
+  is_instance_being_updated_ = false;
+  return true;
+}
+
+bool Iteration::NewInstance(const nlohmann::json &new_instance_json, std::string *result) {
+  MS_ERROR_IF_NULL_W_RET_VAL(result, false);
+  // Before new instance, we should judge whether this request should be handled.
+  std::unique_lock<std::mutex> lock(instance_mtx_);
+  if (is_instance_being_updated_) {
+    *result = "The instance is being updated. Please retry new instance later.";
+    MS_LOG(WARNING) << *result;
+    return false;
+  }
+
+  // Start new server instance.
+  is_instance_being_updated_ = true;
+
+  // Reset current instance.
+  instance_state_ = InstanceState::kFinish;
+  Server::GetInstance().WaitExitSafeMode();
+  WaitAllRoundsFinish();
+  MS_LOG(INFO) << "Proceed to a new instance.";
+  for (auto &round : rounds_) {
+    MS_ERROR_IF_NULL_W_RET_VAL(round, false);
+    round->Reset();
+  }
+  iteration_num_ = 1;
+  LocalMetaStore::GetInstance().set_curr_iter_num(iteration_num_);
+  ModelStore::GetInstance().Reset();
+
+  // Update the hyper-parameters on server and reinitialize rounds.
+  if (!UpdateHyperParams(new_instance_json)) {
+    *result = "Updating hyper-parameters failed.";
+    return false;
+  }
+  if (!ReInitRounds()) {
+    *result = "Reinitializing rounds failed.";
+    return false;
+  }
+
+  instance_state_ = InstanceState::kRunning;
+  *result = "New FL-Server instance succeeded.";
+
+  // End new server instance.
+  is_instance_being_updated_ = false;
+  return true;
+}
+
+void Iteration::WaitAllRoundsFinish() {
+  while (running_round_num_.load() != 0) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(kThreadSleepTime));
+  }
+}
+
 bool Iteration::SyncIteration(uint32_t rank) {
   MS_ERROR_IF_NULL_W_RET_VAL(communicator_, false);
   SyncIterationRequest sync_iter_req;
@@ -316,6 +491,7 @@ void Iteration::HandlePrepareForNextIterRequest(const std::shared_ptr<ps::core::
 void Iteration::PrepareForNextIter() {
   MS_LOG(INFO) << "Prepare for next iteration. Switch the server to safemode.";
   Server::GetInstance().SwitchToSafeMode();
+  WaitAllRoundsFinish();
 }
 
 bool Iteration::BroadcastMoveToNextIterRequest(bool is_last_iter_valid, const std::string &reason) {
@@ -432,24 +608,133 @@ void Iteration::HandleEndLastIterRequest(const std::shared_ptr<ps::core::Message
 
 void Iteration::EndLastIter() {
   MS_LOG(INFO) << "End the last iteration " << iteration_num_;
-  iteration_num_++;
-  // After the job is done, reset the iteration to the initial number and reset ModelStore.
-  if (iteration_num_ > ps::PSContext::instance()->fl_iteration_num()) {
+  if (iteration_num_ == ps::PSContext::instance()->fl_iteration_num()) {
     MS_LOG(INFO) << "Iteration loop " << iteration_loop_count_
                  << " is completed. Iteration number: " << ps::PSContext::instance()->fl_iteration_num();
-    iteration_num_ = 1;
     iteration_loop_count_++;
-    ModelStore::GetInstance().Reset();
+    instance_state_ = InstanceState::kFinish;
   }
 
   std::unique_lock<std::mutex> lock(pinned_mtx_);
   pinned_iter_num_ = 0;
   lock.unlock();
+
+  SetIterationCompleted();
+  SummarizeIteration();
+  iteration_num_++;
   LocalMetaStore::GetInstance().set_curr_iter_num(iteration_num_);
   Server::GetInstance().CancelSafeMode();
-  SetIterationCompleted();
+  iteration_state_cv_.notify_all();
   MS_LOG(INFO) << "Move to next iteration:" << iteration_num_ << "\n";
 }
+
+bool Iteration::ForciblyMoveToNextIteration() {
+  NotifyNext(false, "Forcibly move to next iteration.");
+  return true;
+}
+
+bool Iteration::SummarizeIteration() {
+  // If the metrics_ is not initialized or the server is not the leader server, do not summarize.
+  if (server_node_->rank_id() != kLeaderServerRank || metrics_ == nullptr) {
+    MS_LOG(INFO) << "This server will not summarize for iteration.";
+    return true;
+  }
+
+  metrics_->set_fl_name(ps::PSContext::instance()->fl_name());
+  metrics_->set_fl_iteration_num(ps::PSContext::instance()->fl_iteration_num());
+  metrics_->set_cur_iteration_num(iteration_num_ - 1);
+  metrics_->set_instance_state(instance_state_.load());
+  metrics_->set_loss(loss_);
+  metrics_->set_accuracy(accuracy_);
+  // The joined client number is equal to the threshold of updateModel.
+  size_t update_model_threshold = static_cast<size_t>(
+    std::ceil(ps::PSContext::instance()->start_fl_job_threshold() * ps::PSContext::instance()->update_model_ratio()));
+  metrics_->set_joined_client_num(update_model_threshold);
+  // The rejected client number is equal to threshold of startFLJob minus threshold of updateModel.
+  metrics_->set_rejected_client_num(ps::PSContext::instance()->start_fl_job_threshold() - update_model_threshold);
+
+  if (complete_timestamp_ < start_timestamp_) {
+    MS_LOG(ERROR) << "The complete_timestamp_: " << complete_timestamp_ << ", start_timestamp_: " << start_timestamp_
+                  << ". One of them is invalid.";
+    metrics_->set_iteration_time_cost(UINT64_MAX);
+  } else {
+    metrics_->set_iteration_time_cost(complete_timestamp_ - start_timestamp_);
+  }
+
+  metrics_->Summarize();
+  return true;
+}
+
+bool Iteration::UpdateHyperParams(const nlohmann::json &json) {
+  for (const auto &item : json.items()) {
+    std::string key = item.key();
+    if (key == "start_fl_job_threshold") {
+      ps::PSContext::instance()->set_start_fl_job_threshold(item.value().get<uint64_t>());
+      continue;
+    }
+    if (key == "start_fl_job_time_window") {
+      ps::PSContext::instance()->set_start_fl_job_time_window(item.value().get<uint64_t>());
+      continue;
+    }
+    if (key == "update_model_ratio") {
+      ps::PSContext::instance()->set_update_model_ratio(item.value().get<float>());
+      continue;
+    }
+    if (key == "update_model_time_window") {
+      ps::PSContext::instance()->set_update_model_time_window(item.value().get<uint64_t>());
+      continue;
+    }
+    if (key == "fl_iteration_num") {
+      ps::PSContext::instance()->set_fl_iteration_num(item.value().get<uint64_t>());
+      continue;
+    }
+    if (key == "client_epoch_num") {
+      ps::PSContext::instance()->set_client_epoch_num(item.value().get<uint64_t>());
+      continue;
+    }
+    if (key == "client_batch_size") {
+      ps::PSContext::instance()->set_client_batch_size(item.value().get<uint64_t>());
+      continue;
+    }
+    if (key == "client_learning_rate") {
+      ps::PSContext::instance()->set_client_learning_rate(item.value().get<float>());
+      continue;
+    }
+  }
+  return true;
+}
+
+bool Iteration::ReInitRounds() {
+  size_t start_fl_job_threshold = ps::PSContext::instance()->start_fl_job_threshold();
+  float update_model_ratio = ps::PSContext::instance()->update_model_ratio();
+  size_t update_model_threshold = static_cast<size_t>(std::ceil(start_fl_job_threshold * update_model_ratio));
+  uint64_t start_fl_job_time_window = ps::PSContext::instance()->start_fl_job_time_window();
+  uint64_t update_model_time_window = ps::PSContext::instance()->update_model_time_window();
+  std::vector<RoundConfig> new_round_config = {
+    {"startFLJob", true, start_fl_job_time_window, true, start_fl_job_threshold},
+    {"updateModel", true, update_model_time_window, true, update_model_threshold}};
+  if (!ReInitForUpdatingHyperParams(new_round_config)) {
+    MS_LOG(ERROR) << "Reinitializing for updating hyper-parameters failed.";
+    return false;
+  }
+
+  size_t executor_threshold = 0;
+  const std::string &server_mode = ps::PSContext::instance()->server_mode();
+  uint32_t worker_num = ps::PSContext::instance()->initial_worker_num();
+  if (server_mode == ps::kServerModeFL || server_mode == ps::kServerModeHybrid) {
+    executor_threshold = update_model_threshold;
+  } else if (server_mode == ps::kServerModePS) {
+    executor_threshold = worker_num;
+  } else {
+    MS_LOG(ERROR) << "Server mode " << server_mode << " is not supported.";
+    return false;
+  }
+  if (!Executor::GetInstance().ReInitForUpdatingHyperParams(executor_threshold)) {
+    MS_LOG(ERROR) << "Reinitializing executor failed.";
+    return false;
+  }
+  return true;
+}
 }  // namespace server
 }  // namespace fl
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/fl/server/iteration.h b/mindspore/ccsrc/fl/server/iteration.h
index 28f0da69bf5..7caef124241 100644
--- a/mindspore/ccsrc/fl/server/iteration.h
+++ b/mindspore/ccsrc/fl/server/iteration.h
@@ -24,6 +24,7 @@
 #include "fl/server/common.h"
 #include "fl/server/round.h"
 #include "fl/server/local_meta_store.h"
+#include "fl/server/iteration_metrics.h"
 
 namespace mindspore {
 namespace fl {
@@ -38,6 +39,7 @@ enum class IterationState {
 // The time duration between retrying when sending prepare for next iteration request failed.
 constexpr uint32_t kRetryDurationForPrepareForNextIter = 500;
 
+class IterationMetrics;
 // In server's logic, Iteration is the minimum execution unit. For each execution, it consists of multiple kinds of
 // Rounds, only after all the rounds are finished, this iteration is considered as completed.
 class Iteration {
@@ -60,6 +62,12 @@ class Iteration {
   void InitRounds(const std::vector<std::shared_ptr<ps::core::CommunicatorBase>> &communicators,
                   const TimeOutCb &timeout_cb, const FinishIterCb &finish_iteration_cb);
 
+  // Release all the round objects in Iteration instance. Used for reinitializing round and round kernels.
+  void ClearRounds();
+
+  // Notify move_to_next_thread_ to move to next iteration.
+  void NotifyNext(bool is_last_iter_valid, const std::string &reason);
+
   // This method will control servers to proceed to next iteration.
   // There's communication between leader and follower servers in this method.
   // The server moves to next iteration only after the last round finishes or the time expires.
@@ -79,22 +87,65 @@ class Iteration {
   // The server number after scaling is required in some rounds.
   bool ReInitForScaling(uint32_t server_num, uint32_t server_rank);
 
+  // After hyper-parameters are updated, some rounds and kernels should be reinitialized.
+  bool ReInitForUpdatingHyperParams(const std::vector<RoundConfig> &updated_rounds_config);
+
   const std::vector<std::shared_ptr<Round>> &rounds() const;
 
   bool is_last_iteration_valid() const;
 
+  // Set the instance metrics which will be called for each iteration.
+  void set_metrics(const std::shared_ptr<IterationMetrics> &metrics);
+  void set_loss(float loss);
+  void set_accuracy(float accuracy);
+
+  // Return state of current training job instance.
+  InstanceState instance_state() const;
+
+  // Return whether current instance is being updated.
+  bool IsInstanceBeingUpdated() const;
+
+  // EnableFLS/disableFLS the current training instance.
+  bool EnableServerInstance(std::string *result);
+  bool DisableServerInstance(std::string *result);
+
+  // Finish current instance and start a new one. FLPlan could be changed in this method.
+  bool NewInstance(const nlohmann::json &new_instance_json, std::string *result);
+
+  // Query information of current instance.
+  bool QueryInstance(std::string *result);
+
+  // Need to wait all the rounds to finish before proceed to next iteration.
+  void WaitAllRoundsFinish();
+
+  // The round kernels whose Launch method has not returned yet.
+  std::atomic_uint32_t running_round_num_;
+
  private:
   Iteration()
-      : server_node_(nullptr),
+      : running_round_num_(0),
+        server_node_(nullptr),
         communicator_(nullptr),
         iteration_state_(IterationState::kCompleted),
+        start_timestamp_(0),
+        complete_timestamp_(0),
         iteration_loop_count_(0),
         iteration_num_(1),
         is_last_iteration_valid_(true),
-        pinned_iter_num_(0) {
+        move_to_next_reason_(""),
+        move_to_next_thread_running_(true),
+        pinned_iter_num_(0),
+        metrics_(nullptr),
+        instance_state_(InstanceState::kRunning),
+        is_instance_being_updated_(false),
+        loss_(0.0),
+        accuracy_(0.0),
+        joined_client_num_(0),
+        rejected_client_num_(0),
+        time_cost_(0) {
     LocalMetaStore::GetInstance().set_curr_iter_num(iteration_num_);
   }
-  ~Iteration() = default;
+  ~Iteration();
   Iteration(const Iteration &) = delete;
   Iteration &operator=(const Iteration &) = delete;
 
@@ -132,6 +183,18 @@ class Iteration {
   // The server end the last iteration. This method will increase the iteration number and cancel the safemode.
   void EndLastIter();
 
+  // Drop current iteration and move to the next immediately.
+  bool ForciblyMoveToNextIteration();
+
+  // Summarize metrics for the completed iteration, including iteration time cost, accuracy, loss, etc.
+  bool SummarizeIteration();
+
+  // Update server's hyper-parameters according to the given serialized json(hyper_params_data).
+  bool UpdateHyperParams(const nlohmann::json &json);
+
+  // Reinitialize rounds and round kernels.
+  bool ReInitRounds();
+
   std::shared_ptr<ps::core::ServerNode> server_node_;
   std::shared_ptr<ps::core::TcpCommunicator> communicator_;
 
@@ -139,7 +202,11 @@ class Iteration {
   std::vector<std::shared_ptr<Round>> rounds_;
 
   // The iteration is either running or completed at any time.
+  std::mutex iteration_state_mtx_;
+  std::condition_variable iteration_state_cv_;
   std::atomic<IterationState> iteration_state_;
+  uint64_t start_timestamp_;
+  uint64_t complete_timestamp_;
 
   // The count of iteration loops which are completed.
   size_t iteration_loop_count_;
@@ -147,12 +214,44 @@ class Iteration {
   // Server's current iteration number.
   size_t iteration_num_;
 
-  // Last iteration is successfully finished.
+  // Whether last iteration is successfully finished and the reason.
   bool is_last_iteration_valid_;
+  std::string move_to_next_reason_;
+
+  // It will be notified by rounds that the instance moves to the next iteration.
+  std::thread move_to_next_thread_;
+  std::atomic_bool move_to_next_thread_running_;
+  std::mutex next_iteration_mutex_;
+  std::condition_variable next_iteration_cv_;
 
   // To avoid Next method is called multiple times in one iteration, we should mark the iteration number.
   uint64_t pinned_iter_num_;
   std::mutex pinned_mtx_;
+
+  std::shared_ptr<IterationMetrics> metrics_;
+
+  // The state for current instance.
+  std::atomic<InstanceState> instance_state_;
+
+  // Every instance is not reentrant.
+  // This flag represents whether the instance is being updated.
+  std::mutex instance_mtx_;
+  bool is_instance_being_updated_;
+
+  // The training loss after this federated learning iteration, passed by worker.
+  float loss_;
+
+  // The evaluation result after this federated learning iteration, passed by worker.
+  float accuracy_;
+
+  // The number of clients which join the federated aggregation.
+  size_t joined_client_num_;
+
+  // The number of clients which are not involved in federated aggregation.
+  size_t rejected_client_num_;
+
+  // The time cost in millisecond for this completed iteration.
+  uint64_t time_cost_;
 };
 }  // namespace server
 }  // namespace fl
diff --git a/mindspore/ccsrc/fl/server/kernel/aggregation_kernel.h b/mindspore/ccsrc/fl/server/kernel/aggregation_kernel.h
index a0c41771163..aae59210a1c 100644
--- a/mindspore/ccsrc/fl/server/kernel/aggregation_kernel.h
+++ b/mindspore/ccsrc/fl/server/kernel/aggregation_kernel.h
@@ -67,6 +67,8 @@ class AggregationKernel : public CPUKernel {
   // Reinitialize aggregation kernel after scaling operations are done.
   virtual bool ReInitForScaling() { return true; }
 
+  virtual bool ReInitForUpdatingHyperParams(size_t) { return true; }
+
   // Setter and getter of kernels parameters information.
   void set_params_info(const ParamsInfo &params_info) { params_info_ = params_info; }
   const std::vector<std::string> &input_names() { return params_info_.inputs_names(); }
diff --git a/mindspore/ccsrc/fl/server/kernel/fed_avg_kernel.h b/mindspore/ccsrc/fl/server/kernel/fed_avg_kernel.h
index b201fa83d92..5a5a4ab2f11 100644
--- a/mindspore/ccsrc/fl/server/kernel/fed_avg_kernel.h
+++ b/mindspore/ccsrc/fl/server/kernel/fed_avg_kernel.h
@@ -178,6 +178,15 @@ class FedAvgKernel : public AggregationKernel {
     return true;
   }
 
+  bool ReInitForUpdatingHyperParams(size_t aggr_threshold) override {
+    done_count_ = aggr_threshold;
+    if (!DistributedCountService::GetInstance().ReInitCounter(name_, done_count_)) {
+      MS_LOG(ERROR) << "Reinitializing counter for " << name_ << " failed.";
+      return false;
+    }
+    return true;
+  }
+
  private:
   void GenerateReuseKernelNodeInfo() override {
     MS_LOG(INFO) << "FedAvg reuse 'weight' of the kernel node.";
diff --git a/mindspore/ccsrc/fl/server/kernel/round/get_model_kernel.cc b/mindspore/ccsrc/fl/server/kernel/round/get_model_kernel.cc
index cc5302ae2ca..40656475db0 100644
--- a/mindspore/ccsrc/fl/server/kernel/round/get_model_kernel.cc
+++ b/mindspore/ccsrc/fl/server/kernel/round/get_model_kernel.cc
@@ -99,7 +99,7 @@ void GetModelKernel::GetModel(const schema::RequestGetModel *get_model_req, cons
   const auto &iter_to_model = ModelStore::GetInstance().iteration_to_model();
   size_t latest_iter_num = iter_to_model.rbegin()->first;
   // If this iteration is not finished yet, return ResponseCode_SucNotReady so that clients could get model later.
-  if ((current_iter == get_model_iter && latest_iter_num != current_iter) || current_iter == get_model_iter - 1) {
+  if ((current_iter == get_model_iter && latest_iter_num != current_iter)) {
     std::string reason = "The model is not ready yet for iteration " + std::to_string(get_model_iter) +
                          ". Maybe this is because\n" + "1.Client doesn't send enough update model requests.\n" +
                          "2. Worker has not push all the weights to servers.";
diff --git a/mindspore/ccsrc/fl/server/model_store.cc b/mindspore/ccsrc/fl/server/model_store.cc
index 8444798a614..4d2f66c1d40 100644
--- a/mindspore/ccsrc/fl/server/model_store.cc
+++ b/mindspore/ccsrc/fl/server/model_store.cc
@@ -102,7 +102,6 @@ void ModelStore::Reset() {
   initial_model_ = iteration_to_model_.rbegin()->second;
   iteration_to_model_.clear();
   iteration_to_model_[kInitIterationNum] = initial_model_;
-  iteration_to_model_[kResetInitIterNum] = initial_model_;
 }
 
 const std::map<size_t, std::shared_ptr<MemoryRegister>> &ModelStore::iteration_to_model() {
diff --git a/mindspore/ccsrc/fl/server/parameter_aggregator.cc b/mindspore/ccsrc/fl/server/parameter_aggregator.cc
index 9a5cf531821..0ef6f5569ad 100644
--- a/mindspore/ccsrc/fl/server/parameter_aggregator.cc
+++ b/mindspore/ccsrc/fl/server/parameter_aggregator.cc
@@ -60,6 +60,21 @@ bool ParameterAggregator::ReInitForScaling() {
   return true;
 }
 
+bool ParameterAggregator::ReInitForUpdatingHyperParams(size_t aggr_threshold) {
+  required_push_count_ = aggr_threshold;
+  required_pull_count_ = aggr_threshold;
+  auto result = std::find_if(aggregation_kernel_parameters_.begin(), aggregation_kernel_parameters_.end(),
+                             [aggr_threshold](auto aggregation_kernel) {
+                               MS_ERROR_IF_NULL_W_RET_VAL(aggregation_kernel.first, true);
+                               return !aggregation_kernel.first->ReInitForUpdatingHyperParams(aggr_threshold);
+                             });
+  if (result != aggregation_kernel_parameters_.end()) {
+    MS_LOG(ERROR) << "Reinitializing aggregation kernel after scaling failed";
+    return false;
+  }
+  return true;
+}
+
 bool ParameterAggregator::UpdateData(const std::map<std::string, Address> &new_data) {
   std::map<std::string, AddressPtr> &name_to_addr = memory_register_->addresses();
   for (const auto &data : new_data) {
diff --git a/mindspore/ccsrc/fl/server/parameter_aggregator.h b/mindspore/ccsrc/fl/server/parameter_aggregator.h
index 4fc3fe60f0c..8bf68143b6d 100644
--- a/mindspore/ccsrc/fl/server/parameter_aggregator.h
+++ b/mindspore/ccsrc/fl/server/parameter_aggregator.h
@@ -68,6 +68,9 @@ class ParameterAggregator {
   // Reinitialize the parameter aggregator after scaling operations are done.
   bool ReInitForScaling();
 
+  // After hyper-parameters are updated, some parameter aggregators should be reinitialized.
+  bool ReInitForUpdatingHyperParams(size_t aggr_threshold);
+
   // Update old data stored in ParameterAggregator with new data.
   // The data could have many meanings: weights, gradients, learning_rate, momentum, etc.
   bool UpdateData(const std::map<std::string, Address> &new_data);
diff --git a/mindspore/ccsrc/fl/server/round.cc b/mindspore/ccsrc/fl/server/round.cc
index 0b578814b29..28a9a41ed08 100644
--- a/mindspore/ccsrc/fl/server/round.cc
+++ b/mindspore/ccsrc/fl/server/round.cc
@@ -102,6 +102,21 @@ bool Round::ReInitForScaling(uint32_t server_num) {
   return true;
 }
 
+bool Round::ReInitForUpdatingHyperParams(size_t updated_threshold_count, size_t updated_time_window) {
+  time_window_ = updated_time_window;
+  threshold_count_ = updated_threshold_count;
+  if (check_count_) {
+    if (!DistributedCountService::GetInstance().ReInitCounter(name_, threshold_count_)) {
+      MS_LOG(ERROR) << "Reinitializing count for " << name_ << " failed.";
+      return false;
+    }
+  }
+
+  MS_ERROR_IF_NULL_W_RET_VAL(kernel_, false);
+  kernel_->InitKernel(threshold_count_);
+  return true;
+}
+
 void Round::BindRoundKernel(const std::shared_ptr<kernel::RoundKernel> &kernel) {
   MS_EXCEPTION_IF_NULL(kernel);
   kernel_ = kernel;
@@ -114,10 +129,9 @@ void Round::LaunchRoundKernel(const std::shared_ptr<ps::core::MessageHandler> &m
   MS_ERROR_IF_NULL_WO_RET_VAL(message);
   MS_ERROR_IF_NULL_WO_RET_VAL(kernel_);
   MS_ERROR_IF_NULL_WO_RET_VAL(communicator_);
-  // If the server is still in the process of scaling, refuse the request.
-  if (Server::GetInstance().IsSafeMode()) {
-    MS_LOG(WARNING) << "The cluster is still in process of scaling, please retry " << name_ << " later.";
-    std::string reason = "The cluster is in safemode.";
+
+  std::string reason = "";
+  if (!IsServerAvailable(&reason)) {
     if (!communicator_->SendResponse(reason.c_str(), reason.size(), message)) {
       MS_LOG(ERROR) << "Sending response failed.";
       return;
@@ -125,6 +139,7 @@ void Round::LaunchRoundKernel(const std::shared_ptr<ps::core::MessageHandler> &m
     return;
   }
 
+  Iteration::GetInstance().running_round_num_++;
   AddressPtr input = std::make_shared<Address>();
   AddressPtr output = std::make_shared<Address>();
   MS_ERROR_IF_NULL_WO_RET_VAL(input);
@@ -133,7 +148,7 @@ void Round::LaunchRoundKernel(const std::shared_ptr<ps::core::MessageHandler> &m
   input->size = message->len();
   bool ret = kernel_->Launch({input}, {}, {output});
   if (output->size == 0) {
-    std::string reason = "The output of the round " + name_ + " is empty.";
+    reason = "The output of the round " + name_ + " is empty.";
     MS_LOG(WARNING) << reason;
     if (!communicator_->SendResponse(reason.c_str(), reason.size(), message)) {
       MS_LOG(ERROR) << "Sending response failed.";
@@ -149,9 +164,10 @@ void Round::LaunchRoundKernel(const std::shared_ptr<ps::core::MessageHandler> &m
 
   // Must send response back no matter what value Launch method returns.
   if (!ret) {
-    std::string reason = "Launching round kernel of round " + name_ + " failed.";
-    Iteration::GetInstance().MoveToNextIteration(false, reason);
+    reason = "Launching round kernel of round " + name_ + " failed.";
+    Iteration::GetInstance().NotifyNext(false, reason);
   }
+  Iteration::GetInstance().running_round_num_--;
   return;
 }
 
@@ -195,6 +211,30 @@ void Round::OnLastCountEvent(const std::shared_ptr<ps::core::MessageHandler> &me
   kernel_->OnLastCountEvent(message);
   return;
 }
+
+bool Round::IsServerAvailable(std::string *reason) {
+  MS_ERROR_IF_NULL_W_RET_VAL(reason, false);
+  // After one instance is completed, the model should be accessed by clients.
+  if (Iteration::GetInstance().instance_state() == InstanceState::kFinish && name_ == "getModel") {
+    return true;
+  }
+
+  // If the server state is Disable or Finish, refuse the request.
+  if (Iteration::GetInstance().instance_state() == InstanceState::kDisable ||
+      Iteration::GetInstance().instance_state() == InstanceState::kFinish) {
+    MS_LOG(WARNING) << "The server's training job is disabled or finished, please retry " + name_ + " later.";
+    *reason = ps::kJobNotAvailable;
+    return false;
+  }
+
+  // If the server is still in the process of scaling, reject the request.
+  if (Server::GetInstance().IsSafeMode()) {
+    MS_LOG(WARNING) << "The cluster is still in process of scaling, please retry " << name_ << " later.";
+    *reason = ps::kClusterSafeMode;
+    return false;
+  }
+  return true;
+}
 }  // namespace server
 }  // namespace fl
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/fl/server/round.h b/mindspore/ccsrc/fl/server/round.h
index 1aae7b560d7..cbd868b1f43 100644
--- a/mindspore/ccsrc/fl/server/round.h
+++ b/mindspore/ccsrc/fl/server/round.h
@@ -43,6 +43,9 @@ class Round {
   // Reinitialize count service and round kernel of this round after scaling operations are done.
   bool ReInitForScaling(uint32_t server_num);
 
+  // After hyper-parameters are updated, some rounds and kernels should be reinitialized.
+  bool ReInitForUpdatingHyperParams(size_t updated_threshold_count, size_t updated_time_window);
+
   // Bind a round kernel to this Round. This method should be called after Initialize.
   void BindRoundKernel(const std::shared_ptr<kernel::RoundKernel> &kernel);
 
@@ -63,6 +66,9 @@ class Round {
   void OnFirstCountEvent(const std::shared_ptr<ps::core::MessageHandler> &message);
   void OnLastCountEvent(const std::shared_ptr<ps::core::MessageHandler> &message);
 
+  // Judge whether the training service is available.
+  bool IsServerAvailable(std::string *reason);
+
   std::string name_;
 
   // Whether this round needs to use timer. Most rounds in federated learning with mobile devices scenario need to set
diff --git a/mindspore/ccsrc/fl/server/server.cc b/mindspore/ccsrc/fl/server/server.cc
index 69ad3fe52f1..17885e359d6 100644
--- a/mindspore/ccsrc/fl/server/server.cc
+++ b/mindspore/ccsrc/fl/server/server.cc
@@ -32,6 +32,22 @@
 namespace mindspore {
 namespace fl {
 namespace server {
+// The handler to capture the signal of SIGTERM. Normally this signal is triggered by cloud cluster managers like K8S.
+std::shared_ptr<ps::core::CommunicatorBase> g_communicator_with_server = nullptr;
+std::vector<std::shared_ptr<ps::core::CommunicatorBase>> g_communicators_with_worker = {};
+void SignalHandler(int signal) {
+  MS_LOG(WARNING) << "SIGTERM captured: " << signal;
+  (void)std::for_each(g_communicators_with_worker.begin(), g_communicators_with_worker.end(),
+                      [](const std::shared_ptr<ps::core::CommunicatorBase> &communicator) {
+                        MS_ERROR_IF_NULL_WO_RET_VAL(communicator);
+                        (void)communicator->Stop();
+                      });
+
+  MS_ERROR_IF_NULL_WO_RET_VAL(g_communicator_with_server);
+  (void)g_communicator_with_server->Stop();
+  return;
+}
+
 void Server::Initialize(bool use_tcp, bool use_http, uint16_t http_port, const std::vector<RoundConfig> &rounds_config,
                         const CipherConfig &cipher_config, const FuncGraphPtr &func_graph, size_t executor_threshold) {
   MS_EXCEPTION_IF_NULL(func_graph);
@@ -48,6 +64,7 @@ void Server::Initialize(bool use_tcp, bool use_http, uint16_t http_port, const s
   use_http_ = use_http;
   http_port_ = http_port;
   executor_threshold_ = executor_threshold;
+  signal(SIGTERM, SignalHandler);
   return;
 }
 
@@ -80,6 +97,7 @@ void Server::Run() {
     MS_LOG(INFO) << "Parameters for secure aggregation have been initiated.";
   }
   RegisterRoundKernel();
+  InitMetrics();
   MS_LOG(INFO) << "Server started successfully.";
   safemode_ = false;
   lock.unlock();
@@ -108,6 +126,12 @@ void Server::CancelSafeMode() {
 
 bool Server::IsSafeMode() const { return safemode_.load(); }
 
+void Server::WaitExitSafeMode() const {
+  while (safemode_.load()) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(kThreadSleepTime));
+  }
+}
+
 void Server::InitServerContext() {
   ps::PSContext::instance()->GenerateResetterRound();
   scheduler_ip_ = ps::PSContext::instance()->scheduler_host();
@@ -144,6 +168,7 @@ bool Server::InitCommunicatorWithServer() {
   communicator_with_server_ =
     server_node_->GetOrCreateTcpComm(scheduler_ip_, scheduler_port_, worker_num_, server_num_, task_executor_);
   MS_EXCEPTION_IF_NULL(communicator_with_server_);
+  g_communicator_with_server = communicator_with_server_;
   return true;
 }
 
@@ -165,6 +190,7 @@ bool Server::InitCommunicatorWithWorker() {
     MS_EXCEPTION_IF_NULL(http_comm);
     communicators_with_worker_.push_back(http_comm);
   }
+  g_communicators_with_worker = communicators_with_worker_;
   return true;
 }
 
@@ -238,10 +264,9 @@ void Server::InitIteration() {
 #endif
 
   // 2.Initialize all the rounds.
-  TimeOutCb time_out_cb =
-    std::bind(&Iteration::MoveToNextIteration, iteration_, std::placeholders::_1, std::placeholders::_2);
+  TimeOutCb time_out_cb = std::bind(&Iteration::NotifyNext, iteration_, std::placeholders::_1, std::placeholders::_2);
   FinishIterCb finish_iter_cb =
-    std::bind(&Iteration::MoveToNextIteration, iteration_, std::placeholders::_1, std::placeholders::_2);
+    std::bind(&Iteration::NotifyNext, iteration_, std::placeholders::_1, std::placeholders::_2);
   iteration_->InitRounds(communicators_with_worker_, time_out_cb, finish_iter_cb);
   return;
 }
@@ -306,6 +331,8 @@ void Server::RegisterCommCallbacks() {
 
   // Set exception event callbacks for server.
   RegisterExceptionEventCallback(tcp_comm);
+  // Set message callbacks for server.
+  RegisterMessageCallback(tcp_comm);
 
   if (!server_node_->InitFollowerScaler()) {
     MS_LOG(EXCEPTION) << "Initializing follower elastic scaler failed.";
@@ -354,6 +381,19 @@ void Server::RegisterExceptionEventCallback(const std::shared_ptr<ps::core::TcpC
   });
 }
 
+void Server::RegisterMessageCallback(const std::shared_ptr<ps::core::TcpCommunicator> &communicator) {
+  MS_EXCEPTION_IF_NULL(communicator);
+  // Register handler for restful requests receviced by scheduler.
+  communicator->RegisterMsgCallBack("enableFLS",
+                                    std::bind(&Server::HandleEnableServerRequest, this, std::placeholders::_1));
+  communicator->RegisterMsgCallBack("disableFLS",
+                                    std::bind(&Server::HandleDisableServerRequest, this, std::placeholders::_1));
+  communicator->RegisterMsgCallBack("newInstance",
+                                    std::bind(&Server::HandleNewInstanceRequest, this, std::placeholders::_1));
+  communicator->RegisterMsgCallBack("queryInstance",
+                                    std::bind(&Server::HandleQueryInstanceRequest, this, std::placeholders::_1));
+}
+
 void Server::InitExecutor() {
   MS_EXCEPTION_IF_NULL(func_graph_);
   if (executor_threshold_ == 0) {
@@ -392,6 +432,19 @@ void Server::RegisterRoundKernel() {
   return;
 }
 
+void Server::InitMetrics() {
+  if (server_node_->rank_id() == kLeaderServerRank) {
+    MS_EXCEPTION_IF_NULL(iteration_);
+    std::shared_ptr<IterationMetrics> iteration_metrics =
+      std::make_shared<IterationMetrics>(ps::PSContext::instance()->config_file_path());
+    if (!iteration_metrics->Initialize()) {
+      MS_LOG(WARNING) << "Initializing metrics failed.";
+      return;
+    }
+    iteration_->set_metrics(iteration_metrics);
+  }
+}
+
 void Server::StartCommunicator() {
   if (communicators_with_worker_.empty()) {
     MS_LOG(EXCEPTION) << "Communicators for communication with worker is empty.";
@@ -458,15 +511,7 @@ void Server::ProcessAfterScalingIn() {
   std::unique_lock<std::mutex> lock(scaling_mtx_);
   MS_ERROR_IF_NULL_WO_RET_VAL(server_node_);
   if (server_node_->rank_id() == UINT32_MAX) {
-    MS_LOG(WARNING) << "This server the one to be scaled in. Server exiting.";
-    (void)std::for_each(communicators_with_worker_.begin(), communicators_with_worker_.end(),
-                        [](const std::shared_ptr<ps::core::CommunicatorBase> &communicator) {
-                          MS_ERROR_IF_NULL_WO_RET_VAL(communicator);
-                          (void)communicator->Stop();
-                        });
-
-    MS_ERROR_IF_NULL_WO_RET_VAL(communicator_with_server_);
-    (void)communicator_with_server_->Stop();
+    MS_LOG(WARNING) << "This server the one to be scaled in. Server need to wait SIGTERM to exit.";
     return;
   }
 
@@ -489,6 +534,92 @@ void Server::ProcessAfterScalingIn() {
   std::this_thread::sleep_for(std::chrono::milliseconds(kServerSleepTimeForNetworking));
   safemode_ = false;
 }
+
+void Server::HandleEnableServerRequest(const std::shared_ptr<ps::core::MessageHandler> &message) {
+  MS_ERROR_IF_NULL_WO_RET_VAL(message);
+  MS_ERROR_IF_NULL_WO_RET_VAL(iteration_);
+  MS_ERROR_IF_NULL_WO_RET_VAL(communicator_with_server_);
+  auto tcp_comm = std::dynamic_pointer_cast<ps::core::TcpCommunicator>(communicator_with_server_);
+  MS_ERROR_IF_NULL_WO_RET_VAL(tcp_comm);
+
+  std::string result_message = "";
+  bool result = iteration_->EnableServerInstance(&result_message);
+  nlohmann::json response;
+  response["result"] = result;
+  response["message"] = result_message;
+  if (!tcp_comm->SendResponse(response.dump().c_str(), response.dump().size(), message)) {
+    MS_LOG(ERROR) << "Sending response failed.";
+    return;
+  }
+}
+
+void Server::HandleDisableServerRequest(const std::shared_ptr<ps::core::MessageHandler> &message) {
+  MS_ERROR_IF_NULL_WO_RET_VAL(message);
+  MS_ERROR_IF_NULL_WO_RET_VAL(iteration_);
+  MS_ERROR_IF_NULL_WO_RET_VAL(communicator_with_server_);
+  auto tcp_comm = std::dynamic_pointer_cast<ps::core::TcpCommunicator>(communicator_with_server_);
+  MS_ERROR_IF_NULL_WO_RET_VAL(tcp_comm);
+
+  std::string result_message = "";
+  bool result = iteration_->DisableServerInstance(&result_message);
+  nlohmann::json response;
+  response["result"] = result;
+  response["message"] = result_message;
+  if (!tcp_comm->SendResponse(response.dump().c_str(), response.dump().size(), message)) {
+    MS_LOG(ERROR) << "Sending response failed.";
+    return;
+  }
+}
+
+void Server::HandleNewInstanceRequest(const std::shared_ptr<ps::core::MessageHandler> &message) {
+  MS_ERROR_IF_NULL_WO_RET_VAL(message);
+  MS_ERROR_IF_NULL_WO_RET_VAL(iteration_);
+  MS_ERROR_IF_NULL_WO_RET_VAL(communicator_with_server_);
+  auto tcp_comm = std::dynamic_pointer_cast<ps::core::TcpCommunicator>(communicator_with_server_);
+  MS_ERROR_IF_NULL_WO_RET_VAL(tcp_comm);
+
+  std::string hyper_params_str(static_cast<const char *>(message->data()), message->len());
+  nlohmann::json new_instance_json;
+  nlohmann::json response;
+  try {
+    new_instance_json = nlohmann::json::parse(hyper_params_str);
+  } catch (const std::exception &e) {
+    response["result"] = false;
+    response["message"] = "The hyper-parameter data is not in json format.";
+    if (!tcp_comm->SendResponse(response.dump().c_str(), response.dump().size(), message)) {
+      MS_LOG(ERROR) << "Sending response failed.";
+      return;
+    }
+  }
+
+  std::string result_message = "";
+  bool result = iteration_->NewInstance(new_instance_json, &result_message);
+  response["result"] = result;
+  response["message"] = result_message;
+  if (!tcp_comm->SendResponse(response.dump().c_str(), response.dump().size(), message)) {
+    MS_LOG(ERROR) << "Sending response failed.";
+    return;
+  }
+}
+
+void Server::HandleQueryInstanceRequest(const std::shared_ptr<ps::core::MessageHandler> &message) {
+  MS_ERROR_IF_NULL_WO_RET_VAL(message);
+  nlohmann::basic_json<std::map, std::vector, std::string, bool, int64_t, uint64_t, float> response;
+  response["start_fl_job_threshold"] = ps::PSContext::instance()->start_fl_job_threshold();
+  response["start_fl_job_time_window"] = ps::PSContext::instance()->start_fl_job_time_window();
+  response["update_model_ratio"] = ps::PSContext::instance()->update_model_ratio();
+  response["update_model_time_window"] = ps::PSContext::instance()->update_model_time_window();
+  response["fl_iteration_num"] = ps::PSContext::instance()->fl_iteration_num();
+  response["client_epoch_num"] = ps::PSContext::instance()->client_epoch_num();
+  response["client_batch_size"] = ps::PSContext::instance()->client_batch_size();
+  response["client_learning_rate"] = ps::PSContext::instance()->client_learning_rate();
+  auto tcp_comm = std::dynamic_pointer_cast<ps::core::TcpCommunicator>(communicator_with_server_);
+  MS_ERROR_IF_NULL_WO_RET_VAL(tcp_comm);
+  if (!tcp_comm->SendResponse(response.dump().c_str(), response.dump().size(), message)) {
+    MS_LOG(ERROR) << "Sending response failed.";
+    return;
+  }
+}
 }  // namespace server
 }  // namespace fl
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/fl/server/server.h b/mindspore/ccsrc/fl/server/server.h
index bd0a3c6aa68..64ffa0e3dae 100644
--- a/mindspore/ccsrc/fl/server/server.h
+++ b/mindspore/ccsrc/fl/server/server.h
@@ -23,6 +23,7 @@
 #include "ps/core/communicator/communicator_base.h"
 #include "ps/core/communicator/tcp_communicator.h"
 #include "ps/core/communicator/task_executor.h"
+#include "ps/core/file_configuration.h"
 #include "fl/server/common.h"
 #include "fl/server/executor.h"
 #include "fl/server/iteration.h"
@@ -55,6 +56,10 @@ class Server {
   void SwitchToSafeMode();
   void CancelSafeMode();
   bool IsSafeMode() const;
+  void WaitExitSafeMode() const;
+
+  // Whether the training job of the server is enabled.
+  InstanceState instance_state() const;
 
  private:
   Server()
@@ -88,6 +93,9 @@ class Server {
   // Load variables which is set by ps_context.
   void InitServerContext();
 
+  // Try to recover server config from persistent storage.
+  void Recovery();
+
   // Initialize the server cluster, server node and communicators.
   void InitCluster();
   bool InitCommunicatorWithServer();
@@ -103,6 +111,9 @@ class Server {
   // Register cluster exception callbacks. This method is called in RegisterCommCallbacks.
   void RegisterExceptionEventCallback(const std::shared_ptr<ps::core::TcpCommunicator> &communicator);
 
+  // Register message callbacks. These messages are mainly from scheduler.
+  void RegisterMessageCallback(const std::shared_ptr<ps::core::TcpCommunicator> &communicator);
+
   // Initialize executor according to the server mode.
   void InitExecutor();
 
@@ -112,6 +123,8 @@ class Server {
   // Create round kernels and bind these kernels with corresponding Round.
   void RegisterRoundKernel();
 
+  void InitMetrics();
+
   // The communicators should be started after all initializations are completed.
   void StartCommunicator();
 
@@ -123,6 +136,16 @@ class Server {
   void ProcessAfterScalingOut();
   void ProcessAfterScalingIn();
 
+  // Handlers for enableFLS/disableFLS requests from the scheduler.
+  void HandleEnableServerRequest(const std::shared_ptr<ps::core::MessageHandler> &message);
+  void HandleDisableServerRequest(const std::shared_ptr<ps::core::MessageHandler> &message);
+
+  // Finish current instance and start a new one. FLPlan could be changed in this method.
+  void HandleNewInstanceRequest(const std::shared_ptr<ps::core::MessageHandler> &message);
+
+  // Query current instance information.
+  void HandleQueryInstanceRequest(const std::shared_ptr<ps::core::MessageHandler> &message);
+
   // The server node is initialized in Server.
   std::shared_ptr<ps::core::ServerNode> server_node_;
 
diff --git a/mindspore/ccsrc/fl/worker/fl_worker.cc b/mindspore/ccsrc/fl/worker/fl_worker.cc
index a004ba74042..8acdf15b455 100644
--- a/mindspore/ccsrc/fl/worker/fl_worker.cc
+++ b/mindspore/ccsrc/fl/worker/fl_worker.cc
@@ -25,7 +25,7 @@ namespace mindspore {
 namespace fl {
 namespace worker {
 void FLWorker::Run() {
-  if (running_) {
+  if (running_.load()) {
     return;
   }
   running_ = true;
@@ -48,6 +48,7 @@ void FLWorker::Run() {
 
   worker_node_->RegisterEventCallback(ps::core::ClusterEvent::SCHEDULER_TIMEOUT, [this]() {
     Finalize();
+    running_ = false;
     try {
       MS_LOG(EXCEPTION)
         << "Event SCHEDULER_TIMEOUT is captured. This is because scheduler node is finalized or crashed.";
@@ -57,6 +58,7 @@ void FLWorker::Run() {
   });
   worker_node_->RegisterEventCallback(ps::core::ClusterEvent::NODE_TIMEOUT, [this]() {
     Finalize();
+    running_ = false;
     try {
       MS_LOG(EXCEPTION)
         << "Event NODE_TIMEOUT is captured. This is because some server nodes are finalized or crashed after the "
@@ -123,8 +125,9 @@ bool FLWorker::SendToServer(uint32_t server_rank, const void *data, size_t size,
         return false;
       }
 
-      if (std::string(reinterpret_cast<char *>((*output)->data()), (*output)->size()) == ps::kClusterSafeMode) {
-        MS_LOG(INFO) << "The server " << server_rank << " is in safemode.";
+      std::string response_str = std::string(reinterpret_cast<char *>((*output)->data()), (*output)->size());
+      if (response_str == ps::kClusterSafeMode || response_str == ps::kJobNotAvailable) {
+        MS_LOG(INFO) << "The server " << server_rank << " is in safemode or finished.";
         std::this_thread::sleep_for(std::chrono::milliseconds(kWorkerRetryDurationForSafeMode));
       } else {
         break;
@@ -147,6 +150,8 @@ uint32_t FLWorker::rank_id() const { return rank_id_; }
 
 uint64_t FLWorker::worker_step_num_per_iteration() const { return worker_step_num_per_iteration_; }
 
+bool FLWorker::running() const { return running_.load(); }
+
 void FLWorker::SetIterationRunning() {
   MS_LOG(INFO) << "Worker iteration starts.";
   worker_iteration_state_ = IterationState::kRunning;
diff --git a/mindspore/ccsrc/fl/worker/fl_worker.h b/mindspore/ccsrc/fl/worker/fl_worker.h
index f8f08dbb62f..4b0fc9e2fde 100644
--- a/mindspore/ccsrc/fl/worker/fl_worker.h
+++ b/mindspore/ccsrc/fl/worker/fl_worker.h
@@ -35,6 +35,7 @@ using FBBuilder = flatbuffers::FlatBufferBuilder;
 // The step number for worker to judge whether to communicate with server.
 constexpr uint32_t kTrainBeginStepNum = 1;
 constexpr uint32_t kTrainEndStepNum = 0;
+constexpr uint32_t kOneStepPerIteration = 1;
 
 // The sleeping time of the worker thread before the networking is completed.
 constexpr uint32_t kWorkerSleepTimeForNetworking = 1000;
@@ -42,6 +43,9 @@ constexpr uint32_t kWorkerSleepTimeForNetworking = 1000;
 // The time duration between retrying when server is in safemode.
 constexpr uint32_t kWorkerRetryDurationForSafeMode = 500;
 
+// The rank of the leader server.
+constexpr uint32_t kLeaderServerRank = 0;
+
 enum class IterationState {
   // This iteration is still in process.
   kRunning,
@@ -68,6 +72,9 @@ class FLWorker {
   uint32_t rank_id() const;
   uint64_t worker_step_num_per_iteration() const;
 
+  // Check whether worker has exited.
+  bool running() const;
+
   // These methods set the worker's iteration state.
   void SetIterationRunning();
   void SetIterationCompleted();
@@ -112,7 +119,7 @@ class FLWorker {
   void ProcessAfterScalingOut();
   void ProcessAfterScalingIn();
 
-  bool running_;
+  std::atomic_bool running_;
   uint32_t server_num_;
   uint32_t worker_num_;
   std::string scheduler_ip_;
diff --git a/mindspore/ccsrc/frontend/operator/composite/composite.cc b/mindspore/ccsrc/frontend/operator/composite/composite.cc
index ff1096dfb95..5a0bef61bfb 100644
--- a/mindspore/ccsrc/frontend/operator/composite/composite.cc
+++ b/mindspore/ccsrc/frontend/operator/composite/composite.cc
@@ -167,11 +167,11 @@ AnfNodePtr HyperMap::FullMake(const std::shared_ptr<List> &type, const FuncGraph
       num++;
       auto lhs = std::static_pointer_cast<List>(item.second);
       if (lhs == nullptr) {
-        MS_LOG(EXCEPTION) << "The elements[" << num - 1 << "] has wrong type, expected a List, but got "
+        MS_LOG(EXCEPTION) << "The elements[" << (num - 1) << "] has wrong type, expected a List, but got "
                           << item.second->ToString();
       }
       if (lhs->elements().size() != size) {
-        MS_LOG(ERROR) << "The elements[" << num - 1 << "] has different length, expected " << size << ", but got "
+        MS_LOG(ERROR) << "The elements[" << (num - 1) << "] has different length, expected " << size << ", but got "
                       << lhs->elements().size();
         return true;
       }
@@ -225,11 +225,11 @@ AnfNodePtr HyperMap::FullMake(const std::shared_ptr<Tuple> &type, const FuncGrap
       num++;
       auto lhs = std::static_pointer_cast<Tuple>(item.second);
       if (lhs == nullptr) {
-        MS_LOG(EXCEPTION) << "The elements[" << num - 1 << "] has wrong type, expected a Tuple, but got "
+        MS_LOG(EXCEPTION) << "The elements[" << (num - 1) << "] has wrong type, expected a Tuple, but got "
                           << item.second->ToString();
       }
       if (lhs->elements().size() != size) {
-        MS_LOG(ERROR) << "The elements[" << num - 1 << "] has different length, expected " << size << ", but got "
+        MS_LOG(ERROR) << "The elements[" << (num - 1) << "] has different length, expected " << size << ", but got "
                       << lhs->elements().size();
         return true;
       }
diff --git a/mindspore/ccsrc/frontend/operator/composite/map.cc b/mindspore/ccsrc/frontend/operator/composite/map.cc
index c550b270ad9..7826bfa66da 100644
--- a/mindspore/ccsrc/frontend/operator/composite/map.cc
+++ b/mindspore/ccsrc/frontend/operator/composite/map.cc
@@ -77,11 +77,11 @@ AnfNodePtr Map::FullMakeList(const std::shared_ptr<List> &type, const FuncGraphP
       num++;
       auto lhs = std::dynamic_pointer_cast<List>(item.second);
       if (lhs == nullptr) {
-        MS_LOG(EXCEPTION) << "The elements[" << num - 1 << "] has wrong type, expected a List, but got "
+        MS_LOG(EXCEPTION) << "The elements[" << (num - 1) << "] has wrong type, expected a List, but got "
                           << item.second->ToString();
       }
       if (lhs->elements().size() != size) {
-        MS_LOG(ERROR) << "The elements[" << num - 1 << "] has different length, expected " << size << ", but got "
+        MS_LOG(ERROR) << "The elements[" << (num - 1) << "] has different length, expected " << size << ", but got "
                       << lhs->elements().size();
         return true;
       }
@@ -136,11 +136,11 @@ AnfNodePtr Map::FullMakeTuple(const std::shared_ptr<Tuple> &type, const FuncGrap
       num++;
       auto lhs = std::dynamic_pointer_cast<Tuple>(item.second);
       if (lhs == nullptr) {
-        MS_LOG(EXCEPTION) << "The elements[" << num - 1 << "] has wrong type, expected a Tuple, but got "
+        MS_LOG(EXCEPTION) << "The elements[" << (num - 1) << "] has wrong type, expected a Tuple, but got "
                           << item.second->ToString();
       }
       if (lhs->elements().size() != size) {
-        MS_LOG(ERROR) << "The elements[" << num - 1 << "] has different length, expected " << size << ", but got "
+        MS_LOG(ERROR) << "The elements[" << (num - 1) << "] has different length, expected " << size << ", but got "
                       << lhs->elements().size();
         return true;
       }
@@ -216,7 +216,8 @@ AnfNodePtr Map::FullMakeClass(const std::shared_ptr<Class> &type, const FuncGrap
 
     auto call_node = func_graph->NewCNodeInOrder(inputs2);
     if (reverse_) {
-      (void)inputs.insert(inputs.begin() + 2, call_node);
+      constexpr size_t kCallNodePosition = 2;
+      (void)inputs.insert(inputs.begin() + kCallNodePosition, call_node);
     } else {
       inputs.emplace_back(call_node);
     }
diff --git a/mindspore/ccsrc/frontend/optimizer/ad/kpynative.cc b/mindspore/ccsrc/frontend/optimizer/ad/kpynative.cc
index ab34e7986ff..7cc7d3ceeef 100644
--- a/mindspore/ccsrc/frontend/optimizer/ad/kpynative.cc
+++ b/mindspore/ccsrc/frontend/optimizer/ad/kpynative.cc
@@ -25,7 +25,7 @@
 #include <vector>
 #include <algorithm>
 #include "ir/anf.h"
-#include "pipeline/jit/prim_bprop_optimizer.h"
+#include "frontend/optimizer/ad/prim_bprop_optimizer.h"
 #include "frontend/optimizer/ad/adjoint.h"
 #include "frontend/optimizer/ad/dfunctor.h"
 #include "frontend/optimizer/ad/kpynative.h"
@@ -90,8 +90,11 @@ FuncGraphPtr GetZerosLike(const abstract::AbstractBasePtrList &args_spec) {
   MS_EXCEPTION_IF_NULL(specialized_zeros_like_fg);
   auto opted_zeros_like_fg = ZerosLikePrimOptPass(resource);
   MS_EXCEPTION_IF_NULL(opted_zeros_like_fg);
-  zeros_like_funcgraph_cache[args_spec] = opted_zeros_like_fg;
-  return BasicClone(opted_zeros_like_fg);
+  auto enable_grad_cache = MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE);
+  if (enable_grad_cache) {
+    zeros_like_funcgraph_cache[args_spec] = BasicClone(opted_zeros_like_fg);
+  }
+  return opted_zeros_like_fg;
 }
 
 FuncGraphPtr GetHyperAdd(const abstract::AbstractBasePtrList &args_spec) {
@@ -146,8 +149,11 @@ FuncGraphPtr GetOnesLike(const abstract::AbstractBasePtrList &args_spec) {
   pipeline::ResourcePtr resource = std::make_shared<pipeline::Resource>();
   auto specialized_ones_like_fg = pipeline::Renormalize(resource, ones_like_fg, args_spec);
   MS_EXCEPTION_IF_NULL(specialized_ones_like_fg);
-  ones_like_funcgraph_cache[args_spec] = specialized_ones_like_fg;
-  return BasicClone(specialized_ones_like_fg);
+  auto enable_grad_cache = MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE);
+  if (enable_grad_cache) {
+    ones_like_funcgraph_cache[args_spec] = BasicClone(specialized_ones_like_fg);
+  }
+  return specialized_ones_like_fg;
 }
 
 AnfNodePtr BuildOnesLikeValue(const FuncGraphPtr &tape, const ValuePtr &out) {
@@ -359,8 +365,8 @@ FuncGraphPtr KPynativeCellImpl::Finish(const AnfNodePtrList &weights, bool grad_
   SetOutput(weights, grad_inputs, grad_weights);
   // Replace Parameter of primal funcgraph  with parameter of tape_;
   ReplacePrimalParameter(weights, has_sens_arg);
-
-  if (MsContext::GetInstance()->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG)) {
+  auto save_graphs_flg = MsContext::GetInstance()->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
+  if (save_graphs_flg) {
     DumpIR("before_final_opt.ir", tape_);
   }
   return tape_;
@@ -645,7 +651,7 @@ bool KPynativeCellImpl::BuildAdjoint(const CNodePtr &cnode, const ValuePtrList &
 FuncGraphPtr OptimizeBPropFuncGraph(const FuncGraphPtr &bprop_fg, const CNodePtr &cnode, const ValuePtrList &op_args,
                                     const ValuePtr &out) {
   auto optimized_bprop_fg =
-    pipeline::PrimBpropOptimizer::GetPrimBpropOptimizerInst().OptimizeBPropFuncGraph(bprop_fg, cnode, op_args, out);
+    PrimBpropOptimizer::GetPrimBpropOptimizerInst().OptimizeBPropFuncGraph(bprop_fg, cnode, op_args, out);
   return optimized_bprop_fg;
 }
 
diff --git a/mindspore/ccsrc/frontend/optimizer/auto_monad_eliminate.cc b/mindspore/ccsrc/frontend/optimizer/auto_monad_eliminate.cc
index 114ac9e6a73..1bf1ea7f8e7 100644
--- a/mindspore/ccsrc/frontend/optimizer/auto_monad_eliminate.cc
+++ b/mindspore/ccsrc/frontend/optimizer/auto_monad_eliminate.cc
@@ -236,6 +236,27 @@ AnfNodePtr GetFirstMonad(const FuncGraphPtr &fg) {
   return monad;
 }
 
+bool MayModifyParameter(const AnfNodePtr &update_state, const AnfNodePtr &load) {
+  MS_EXCEPTION_IF_NULL(update_state);
+  MS_EXCEPTION_IF_NULL(load);
+  auto update_state_cnode = update_state->cast<CNodePtr>();
+  auto load_cnode = load->cast<CNodePtr>();
+  constexpr size_t attach_index = 2;
+  auto attach = update_state_cnode->input(attach_index);
+  if (!attach->isa<CNode>()) {
+    return false;
+  }
+  if (IsValueNode<FuncGraph>(attach->cast<CNodePtr>()->input(0))) {
+    return true;
+  }
+  auto inputs = attach->cast<CNodePtr>()->inputs();
+  bool exist_param_or_load = std::any_of(inputs.begin(), inputs.end(), [&load_cnode](const AnfNodePtr &input) {
+    auto parameter = load_cnode->input(1);
+    return input == load_cnode || input == parameter;
+  });
+  return exist_param_or_load;
+}
+
 // Replace UpdateStates with U for first load.
 // Covert:
 // u1 = UpdateState(u, c)
@@ -258,6 +279,9 @@ bool ReplaceUpdateStateForLoad(const FuncGraphPtr &fg, const std::vector<AnfNode
     if (!IsPrimitiveCNode(update_state, prim::kPrimUpdateState)) {
       continue;
     }
+    if (MayModifyParameter(update_state, load_node)) {
+      continue;
+    }
     auto mgr = fg->manager();
     MS_EXCEPTION_IF_NULL(mgr);
     mgr->SetEdge(load_node, second_input_index, monad);
diff --git a/mindspore/ccsrc/frontend/optimizer/irpass.cc b/mindspore/ccsrc/frontend/optimizer/irpass.cc
index 478afa46d86..1e58b6a7152 100644
--- a/mindspore/ccsrc/frontend/optimizer/irpass.cc
+++ b/mindspore/ccsrc/frontend/optimizer/irpass.cc
@@ -146,7 +146,7 @@ OptimizeIRPassLib::OptimizeIRPassLib() {
                                              {prim::kPrimGetRefKey, prim::kPrimGetRefValue});
 
   replace_refkey_by_param_ = MakeSubstitution(std::make_shared<ReplaceRefkeyByParam>(), "replace_refkey_by_param",
-                                              IsValueNode<RefKey>, opt::FORCE_RENORM);
+                                              IsValueNode<RefKey>, false, opt::FORCE_RENORM);
   replace_old_param_ = MakeSubstitution(std::make_shared<ReplaceOldParam>(), "replace_old_param", IsParam);
   minmaximum_grad_ = MakeSubstitution(std::make_shared<MinMaximumGrad>(), "minmaximum_grad", prim::kPrimTupleGetItem);
 
@@ -192,13 +192,14 @@ OptimizeIRPassLib::OptimizeIRPassLib() {
   updatestate_pure_node_eliminater_ = MakeSubstitution(std::make_shared<UpdatestatePureNodeEliminater>(),
                                                        "updatestate_pure_node_eliminater", prim::kPrimUpdateState);
   updatestate_depend_eliminater_ = MakeSubstitution(std::make_shared<UpdatestateDependEliminater>(),
-                                                    "updatestate_depend_eliminater", prim::kPrimUpdateState);
+                                                    "updatestate_depend_eliminater", prim::kPrimUpdateState, true);
   updatestate_assign_eliminater_ = MakeSubstitution(std::make_shared<UpdatestateAssignEliminater>(),
-                                                    "updatestate_assign_eliminater", prim::kPrimUpdateState);
-  updatestate_maketuple_eliminater_ = MakeSubstitution(std::make_shared<UpdatestateMakeTupleEliminater>(),
-                                                       "updatestate_maketuple_eliminater", prim::kPrimUpdateState);
+                                                    "updatestate_assign_eliminater", prim::kPrimUpdateState, true);
+  updatestate_maketuple_eliminater_ =
+    MakeSubstitution(std::make_shared<UpdatestateMakeTupleEliminater>(), "updatestate_maketuple_eliminater",
+                     prim::kPrimUpdateState, true);
   updatestate_loads_eliminater_ = MakeSubstitution(std::make_shared<UpdatestateLoadsEliminater>(),
-                                                   "updatestate_loads_eliminater", prim::kPrimUpdateState);
+                                                   "updatestate_loads_eliminater", prim::kPrimUpdateState, true);
   switch_call_monad_eliminater_ = MakeSubstitution(std::make_shared<SwitchCallMonadParameterEliminater>(),
                                                    "switch_call_monad_eliminater", IsCNodeDup);
 
@@ -273,8 +274,9 @@ OptimizeIRPassLib::OptimizeIRPassLib() {
 
 ResolveIRPassLib::ResolveIRPassLib() {
   // In resolver_getattr_resolve_, some patterns have priority over others.
-  resolver_getattr_resolve_ = MakeSubstitution(std::make_shared<ResolverGetAttrResolve>(), "getattr_resolve",
-                                               {prim::kPrimGetAttr, prim::kPrimResolve}, opt::CHECK_RENORM, true);
+  resolver_getattr_resolve_ =
+    MakeSubstitution(std::make_shared<ResolverGetAttrResolve>(), "getattr_resolve",
+                     {prim::kPrimGetAttr, prim::kPrimResolve}, false, opt::CHECK_RENORM, true);
 }
 
 InferenceOptPrepareLib::InferenceOptPrepareLib() {
diff --git a/mindspore/ccsrc/frontend/optimizer/irpass/incorporate_getitem.h b/mindspore/ccsrc/frontend/optimizer/irpass/incorporate_getitem.h
index 8fe6757f3e9..d4b37616626 100644
--- a/mindspore/ccsrc/frontend/optimizer/irpass/incorporate_getitem.h
+++ b/mindspore/ccsrc/frontend/optimizer/irpass/incorporate_getitem.h
@@ -19,6 +19,7 @@
 
 #include <algorithm>
 #include <memory>
+#include <set>
 #include <unordered_map>
 #include <vector>
 #include <utility>
@@ -128,6 +129,195 @@ class GetItemTransformACrossGraph {
  private:
   std::unordered_map<FuncGraphPtr, std::unordered_map<int64_t, FuncGraphPtr>> cache_;
 };
+
+bool HasMoreJ(const OptimizerPtr &optimizer) {
+  bool more_j = false;
+  auto res = optimizer->resource();
+  auto resource_ptr = std::dynamic_pointer_cast<pipeline::Resource>(res);
+  if (resource_ptr != nullptr) {
+    const auto &manager = optimizer->manager();
+    MS_EXCEPTION_IF_NULL(manager);
+    more_j = manager->func_graph_j_total(resource_ptr->func_graph());
+  }
+  return more_j;
+}
+
+bool IsOutputShrinkable(const AnfNodePtr &output) {
+  if (IsPrimitiveCNode(output, prim::kPrimMakeTuple)) {
+    return true;
+  }
+  if (GetValueNode<ValueTuplePtr>(output)) {
+    return true;
+  }
+  return false;
+}
+
+size_t GetOutputSize(const AnfNodePtr &output) {
+  if (IsPrimitiveCNode(output, prim::kPrimMakeTuple)) {
+    const auto &output_cnode = output->cast<CNodePtr>();
+    MS_EXCEPTION_IF_NULL(output_cnode);
+    return output_cnode->size() - 1;
+  }
+  const auto &value_tuple = GetValueNode<ValueTuplePtr>(output);
+  if (value_tuple == nullptr) {
+    MS_LOG(EXCEPTION) << "fg output is not MakeTuple or ValueTuple, but: " << output->DebugString();
+  }
+  return value_tuple->size();
+}
+
+struct TpCNodeAndIndex {
+  // CNode {TupleGetItem, call, index}
+  CNodePtr tp_cnode;
+  int64_t index;
+};
+
+int64_t UpdateUserNodeIndex(const CNodePtr &fg_call_cnode, const int64_t current_index,
+                            const std::vector<TpCNodeAndIndex> &tp_cnodes_and_index) {
+  const auto &manager = fg_call_cnode->func_graph()->manager();
+  MS_EXCEPTION_IF_NULL(manager);
+  int64_t new_index = current_index;
+  auto txn = manager->Transact();
+  for (int64_t i = 0; i < SizeToLong(tp_cnodes_and_index.size()); ++i) {
+    const auto &cnode_and_index = tp_cnodes_and_index[i];
+    if (cnode_and_index.index != i) {
+      constexpr auto kInputIndex = 2;
+      txn.SetEdge(cnode_and_index.tp_cnode, kInputIndex, NewValueNode(i));
+    }
+    if (cnode_and_index.index == current_index) {
+      new_index = i;
+    }
+  }
+  txn.Commit();
+  return new_index;
+}
+
+AbstractBasePtr ShrinkAbstract(const AbstractBasePtr &original_abstract,
+                               const std::vector<TpCNodeAndIndex> &tp_cnodes_and_index) {
+  if (original_abstract != nullptr && original_abstract->isa<abstract::AbstractTuple>()) {
+    const auto &abs_tuple = original_abstract->cast<abstract::AbstractTuplePtr>();
+    MS_EXCEPTION_IF_NULL(abs_tuple);
+    const auto &abs_tuple_elements = abs_tuple->elements();
+    const int64_t before_shrink_tuple_size = SizeToLong(abs_tuple_elements.size());
+    AbstractBasePtrList shrunk_abstract_elements;
+    std::transform(tp_cnodes_and_index.cbegin(), tp_cnodes_and_index.cend(),
+                   std::back_inserter(shrunk_abstract_elements),
+                   [abs_tuple_elements, before_shrink_tuple_size](const auto &node_and_index) {
+                     if (node_and_index.index >= before_shrink_tuple_size) {
+                       MS_LOG(EXCEPTION) << "index should less than inputs size, index: " << node_and_index.index
+                                         << ", abstract tuple size: " << before_shrink_tuple_size;
+                     }
+                     return abs_tuple_elements[node_and_index.index];
+                   });
+    return std::make_shared<abstract::AbstractTuple>(shrunk_abstract_elements);
+  }
+  return nullptr;
+}
+
+FuncGraphPtr ShrinkUnsedOutput(const FuncGraphPtr &fg, const std::vector<TpCNodeAndIndex> &tp_cnodes_and_index) {
+  const auto &manager = fg->manager();
+  MS_EXCEPTION_IF_NULL(manager);
+
+  auto new_fg = TransformableClone(fg, std::make_shared<TraceTransform>("tp_use"));
+  auto new_fg_output = new_fg->output();
+  AnfNodePtr shrunk_output = nullptr;
+  int64_t before_shrink_inputs_size = 0;
+  if (IsPrimitiveCNode(new_fg_output, prim::kPrimMakeTuple)) {
+    // Shrink output;
+    auto new_fg_output_cnode = new_fg_output->cast<CNodePtr>();
+    const auto &new_fg_output_inputs = new_fg_output_cnode->inputs();
+    constexpr auto kMinimalSize = 2;
+    if (new_fg_output_inputs.size() <= kMinimalSize) {
+      MS_LOG(EXCEPTION) << "New fg output should at least 2 elements, but: " << new_fg_output->DebugString();
+    }
+    before_shrink_inputs_size = SizeToLong(new_fg_output_inputs.size() - 1);
+    AnfNodePtrList shrunk_inputs{NewValueNode({prim::kPrimMakeTuple})};
+    // Bypass maketuple primitive in new_fg_output_inputs;
+    std::transform(tp_cnodes_and_index.cbegin(), tp_cnodes_and_index.cend(), std::back_inserter(shrunk_inputs),
+                   [new_fg_output, new_fg_output_inputs, before_shrink_inputs_size](const auto &node_and_index) {
+                     if (node_and_index.index >= before_shrink_inputs_size) {
+                       MS_LOG(EXCEPTION) << "index should less than inputs size, index: " << node_and_index.index
+                                         << ", output: " << new_fg_output->DebugString();
+                     }
+                     return new_fg_output_inputs[node_and_index.index + 1];
+                   });
+    shrunk_output = new_fg->NewCNode(shrunk_inputs);
+  } else {
+    auto value_tuple = GetValueNode<ValueTuplePtr>(new_fg_output);
+    if (value_tuple == nullptr) {
+      MS_LOG(EXCEPTION) << "New fg output is not MakeTuple or ValueTuple, but " << new_fg_output->DebugString();
+    }
+    ValuePtrList shrunk_inputs;
+    before_shrink_inputs_size = value_tuple->size();
+    std::transform(tp_cnodes_and_index.cbegin(), tp_cnodes_and_index.cend(), std::back_inserter(shrunk_inputs),
+                   [new_fg_output, value_tuple, before_shrink_inputs_size](const auto &node_and_index) {
+                     if (node_and_index.index >= before_shrink_inputs_size) {
+                       MS_LOG(EXCEPTION) << "index should less than inputs size, index: " << node_and_index.index
+                                         << ", output: " << new_fg_output->DebugString();
+                     }
+                     return (*value_tuple)[node_and_index.index];
+                   });
+    shrunk_output = NewValueNode(std::make_shared<ValueTuple>(shrunk_inputs));
+  }
+  auto shrunk_abstract = ShrinkAbstract(new_fg_output->abstract(), tp_cnodes_and_index);
+  MS_EXCEPTION_IF_NULL(shrunk_abstract);
+  shrunk_output->set_abstract(shrunk_abstract);
+  new_fg->set_output(shrunk_output);
+  MS_LOG(DEBUG) << "Partly item used; original size: " << before_shrink_inputs_size
+                << ", new size: " << tp_cnodes_and_index.size() << ", fg: " << fg->ToString() << ", new graph"
+                << new_fg->ToString();
+  return new_fg;
+}
+
+struct FuncGraphIntVectorPairHasher {
+  std::size_t Int64VectorHash(const std::vector<int64_t> &int_vector) const {
+    std::size_t hash_value = 0;
+    constexpr auto kMaxElementsNum = 4;
+    for (size_t i = 0; (i < int_vector.size()) && (i < kMaxElementsNum); ++i) {
+      hash_value = hash_combine(hash_value, std::hash<int64_t>{}(int_vector[i]));
+    }
+    return hash_value;
+  }
+
+  std::size_t operator()(const std::pair<FuncGraphPtr, std::vector<int64_t>> &p) const {
+    auto h1 = std::hash<FuncGraphPtr>{}(p.first);
+    auto h2 = Int64VectorHash(p.second);
+    return hash_combine(h1, h2);
+  }
+};
+
+bool ShouldTransform(const AnfNodePtr &node, const std::vector<TpCNodeAndIndex> &tp_cnodes_and_index) {
+  if (node->abstract() && node->abstract()->isa<abstract::AbstractTuple>()) {
+    const auto &abs_tuple = *(node->abstract()->cast<abstract::AbstractTuplePtr>());
+    if (tp_cnodes_and_index[0].index == 0 && abs_tuple.size() > 0) {
+      if (abs_tuple[0]->isa<abstract::AbstractScalar>() && abs_tuple[0]->GetTypeTrack()->isa<EnvType>()) {
+        return true;
+      }
+    }
+    // fprop_fg will return MakeTuple(xx, bprop_fg).
+    if (tp_cnodes_and_index.size() > 1 && tp_cnodes_and_index[1].index == 1 && abs_tuple.size() > 1 &&
+        abs_tuple[1]->isa<abstract::AbstractFunction>()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Incorporate getitem if the indexed node is a ZerosLike node, so another opt pass AddN(MakeTuple(Xs, ZerosLike))
+// can work.
+bool AlwaysTransformThisIndex(const AnfNodePtr &output, const int64_t index) {
+  if (IsPrimitiveCNode(output, prim::kPrimMakeTuple)) {
+    const auto &output_cnode = output->cast<CNodePtr>();
+    MS_EXCEPTION_IF_NULL(output_cnode);
+    if (index >= SizeToLong(output_cnode->size() - 1)) {
+      MS_LOG(EXCEPTION) << "Index of GetItem: " << index
+                        << " exceeds size of MakeTuple: " << output_cnode->DebugString();
+    }
+    if (IsPrimitiveCNode(output_cnode->input(index + 1), prim::kPrimZerosLike)) {
+      return true;
+    }
+  }
+  return false;
+}
 }  // namespace internal
 
 // {prim::kPrimTupleGetItem, {G, Xs}, C}
@@ -136,7 +326,7 @@ class IncorporateGetitem : public AnfVisitor {
   IncorporateGetitem() : getitem_transform_() {}
   ~IncorporateGetitem() override = default;
 
-  AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override {
+  AnfNodePtr operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) override {
     Reset();
     AnfVisitor::Match(prim::kPrimTupleGetItem, {IsCNode, IsValueNode<Int64Imm>})(node);
     if (node->func_graph() == nullptr || idx_ == -1 || fg_ == nullptr || fg_->has_flag(FUNC_GRAPH_FLAG_DEFER_INLINE) ||
@@ -144,15 +334,138 @@ class IncorporateGetitem : public AnfVisitor {
       return nullptr;
     }
 
-    auto new_fg = getitem_transform_(node, fg_, idx_);
-    (void)args_.insert(args_.begin(), NewValueNode(new_fg));
-    auto new_node = node->func_graph()->NewCNode(args_);
-    // Check if the another only usage of {G, Xs} is UpdateState{s, {G, Xs}}, if yes, replace
-    // UpdateState{s, {G, Xs}} with UpdateState{s, new_node};
     const auto &manager = fg_->manager();
     MS_EXCEPTION_IF_NULL(manager);
+    if (internal::AlwaysTransformThisIndex(fg_->output(), idx_)) {
+      return TransformFuncGraph(manager, node);
+    }
+    // This node had been substituted.
+    if (processed_nodes_.find(fg_call_cnode_) != processed_nodes_.end()) {
+      MS_LOG(DEBUG) << "fg call with same cnode is already replaced, node: " << node->DebugString()
+                    << ", fg_call: " << fg_call_cnode_->DebugString();
+      return nullptr;
+    }
+    bool output_is_shrinkable = internal::IsOutputShrinkable(fg_->output());
+    std::vector<internal::TpCNodeAndIndex> tp_cnodes_and_index;
+    auto fg_call_cnode_users_counter = MultipleUse(fg_call_cnode_, fg_, &tp_cnodes_and_index);
+    bool multiple_use = (tp_cnodes_and_index.size() > 1);
+    if (output_is_shrinkable && multiple_use && (tp_cnodes_and_index.size() == fg_call_cnode_users_counter)) {
+      if (!internal::ShouldTransform(fg_call_cnode_, tp_cnodes_and_index) && !internal::HasMoreJ(optimizer)) {
+        MS_LOG(DEBUG) << "No more j and multiple use, will shrink, node: " << node->DebugString()
+                      << ", fg_call: " << fg_call_cnode_->DebugString();
+        const auto output_size = internal::GetOutputSize(fg_->output());
+        if (fg_call_cnode_users_counter == output_size) {
+          processed_nodes_.emplace(fg_call_cnode_);
+          MS_LOG(DEBUG) << "All elements in output is used, no need to transform, node: " << node->DebugString()
+                        << ", fg_call: " << fg_call_cnode_->DebugString();
+          return nullptr;
+        }
+        auto new_node = ShrinkFuncGraphOutput(node, tp_cnodes_and_index);
+        if (new_node != nullptr) {
+          return new_node;
+        }
+      }
+    }
+    MS_LOG(DEBUG) << "Cannot shrink, transform_getitem, node: " << node->DebugString()
+                  << ", fg_call: " << fg_call_cnode_->DebugString();
+    return TransformFuncGraph(manager, node);
+  }
+
+  size_t MultipleUse(const CNodePtr &fg_call, const FuncGraphPtr &fg,
+                     std::vector<internal::TpCNodeAndIndex> *cnodes_and_index) const {
+    const auto &manager = fg->manager();
+    MS_EXCEPTION_IF_NULL(manager);
+    auto &cnode_and_index_vector = *cnodes_and_index;
+    std::set<int64_t> index_set;
+    std::size_t total_usage = 0;
+    const auto &node_users_map = manager->node_users();
+    const auto &it = node_users_map.find(fg_call);
+    if (it == node_users_map.end()) {
+      return 0;
+    }
+    const auto &node_users = it->second;
+    for (const auto &user : node_users) {
+      if (IsPrimitiveCNode(user.first, prim::kPrimTupleGetItem)) {
+        const auto &cnode = user.first->cast<CNodePtr>();
+        if (cnode->input(2)->isa<ValueNode>()) {
+          auto idx = GetValue<int64_t>(cnode->input(2)->cast<ValueNodePtr>()->value());
+          cnode_and_index_vector.push_back({cnode, idx});
+          index_set.insert(idx);
+          total_usage++;
+        } else {
+          MS_LOG(EXCEPTION) << "tuple_getitem index is not valuenode, but: " << user.first->DebugString();
+        }
+      } else {
+        MS_LOG(DEBUG) << "fg_call usre is not tuple_getitem, user: " << user.first->DebugString();
+      }
+    }
+    if (index_set.size() != total_usage) {
+      MS_LOG(DEBUG) << "some index usage is duplicated, total_usage: " << total_usage;
+      MS_LOG(DEBUG) << "index_set:";
+      for (auto idx : index_set) {
+        MS_LOG(DEBUG) << " " << idx;
+      }
+    }
+    // sort by index;
+    std::sort(cnode_and_index_vector.begin(), cnode_and_index_vector.end(),
+              [](const auto &tp1, const auto &tp2) { return tp1.index < tp2.index; });
+    return node_users.size();
+  }
+
+  AnfNodePtr ShrinkFuncGraphOutput(const AnfNodePtr &node,
+                                   const std::vector<internal::TpCNodeAndIndex> &tp_cnodes_and_index) {
+    const auto &manager = fg_->manager();
+    MS_EXCEPTION_IF_NULL(manager);
+    std::vector<int64_t> index_vector;
+    (void)std::transform(tp_cnodes_and_index.begin(), tp_cnodes_and_index.end(), std::back_inserter(index_vector),
+                         [](const auto &cnode_and_index) { return cnode_and_index.index; });
+    auto iter = processed_fgs_.find(std::make_pair(fg_, index_vector));
+    if (iter != processed_fgs_.end()) {
+      MS_LOG(DEBUG) << "fg is already processed, just update caller index, node: " << node->DebugString()
+                    << ", fg_call: " << fg_call_cnode_->DebugString();
+      MS_LOG(DEBUG) << "original fg: " << fg_->ToString() << ", processed_fg: " << iter->second->ToString();
+      processed_nodes_.emplace(fg_call_cnode_);
+      manager->SetEdge(fg_call_cnode_, 0, NewValueNode(iter->second));
+      auto shrunk_abstract = internal::ShrinkAbstract(fg_call_cnode_->abstract(), tp_cnodes_and_index);
+      if (shrunk_abstract != nullptr) {
+        fg_call_cnode_->set_abstract(shrunk_abstract);
+      }
+      auto new_idx = internal::UpdateUserNodeIndex(fg_call_cnode_, idx_, tp_cnodes_and_index);
+      auto new_node =
+        node->func_graph()->NewCNode({NewValueNode(prim::kPrimTupleGetItem), fg_call_cnode_, NewValueNode(new_idx)});
+      new_node->set_abstract(node->abstract());
+      return new_node;
+    }
+    const auto new_fg = internal::ShrinkUnsedOutput(fg_, tp_cnodes_and_index);
+    if (new_fg != nullptr) {
+      MS_LOG(DEBUG) << "fg output is shrunk, original fg: " << fg_->ToString() << ", new fg: " << new_fg->ToString();
+      processed_nodes_.emplace(fg_call_cnode_);
+      processed_fgs_.emplace(std::make_pair(fg_, index_vector), new_fg);
+      manager->SetEdge(fg_call_cnode_, 0, NewValueNode(new_fg));
+      auto shrunk_abstract = internal::ShrinkAbstract(fg_call_cnode_->abstract(), tp_cnodes_and_index);
+      if (shrunk_abstract != nullptr) {
+        fg_call_cnode_->set_abstract(shrunk_abstract);
+      }
+      auto new_idx = internal::UpdateUserNodeIndex(fg_call_cnode_, idx_, tp_cnodes_and_index);
+      auto new_node =
+        node->func_graph()->NewCNode({NewValueNode(prim::kPrimTupleGetItem), fg_call_cnode_, NewValueNode(new_idx)});
+      new_node->set_abstract(node->abstract());
+      return new_node;
+    }
+    MS_LOG(DEBUG) << "Shrink failed. node: " << node->DebugString()
+                  << ", switch_call: " << fg_call_cnode_->DebugString();
+    return nullptr;
+  }
+
+  AnfNodePtr TransformFuncGraph(const FuncGraphManagerPtr &manager, const AnfNodePtr &origin_node) {
+    auto new_fg = getitem_transform_(origin_node, fg_, idx_);
+    MS_LOG(DEBUG) << "Original fg: " << fg_->ToString() << ", new fg: " << new_fg->ToString();
+    (void)args_.insert(args_.begin(), NewValueNode(new_fg));
+    auto new_node = origin_node->func_graph()->NewCNode(args_);
+    // Check if the another only usage of {G, Xs} is UpdateState{s, {G, Xs}}, if yes, replace
+    // UpdateState{s, {G, Xs}} with UpdateState{s, new_node};
     auto &node_users_map = manager->node_users();
-    auto it = node_users_map.find(fg_cnode_);
+    auto it = node_users_map.find(fg_call_cnode_);
     if (it != node_users_map.end()) {
       AnfNodePtr update_state_node = nullptr;
       auto &node_users = it->second;
@@ -166,14 +479,14 @@ class IncorporateGetitem : public AnfVisitor {
       if (update_state_node != nullptr) {
         auto update_state_cnode = update_state_node->cast<CNodePtr>();
         // double check;
-        if (update_state_cnode->input(2) == fg_cnode_) {
+        if (update_state_cnode->input(2) == fg_call_cnode_) {
           MS_LOG(DEBUG) << "Replace UpdateState node: " << update_state_cnode->DebugString(2)
                         << ", input 2 with: " << new_node->DebugString();
           manager->SetEdge(update_state_cnode, 2, new_node);
         }
       }
     }
-    new_node->set_abstract(node->abstract());
+    new_node->set_abstract(origin_node->abstract());
     return new_node;
   }
 
@@ -182,7 +495,7 @@ class IncorporateGetitem : public AnfVisitor {
       return;
     }
 
-    fg_cnode_ = cnode;
+    fg_call_cnode_ = cnode;
     auto &inputs = cnode->inputs();
     fg_ = GetValueNode<FuncGraphPtr>(inputs[0]);
     (void)std::copy(inputs.begin() + 1, inputs.end(), std::back_inserter(args_));
@@ -193,15 +506,19 @@ class IncorporateGetitem : public AnfVisitor {
   void Reset() {
     idx_ = -1;
     fg_ = nullptr;
-    fg_cnode_ = nullptr;
+    fg_call_cnode_ = nullptr;
     args_.clear();
   }
 
  private:
   int64_t idx_{-1};
   FuncGraphPtr fg_{nullptr};
-  AnfNodePtr fg_cnode_{nullptr};
+  CNodePtr fg_call_cnode_{nullptr};
   std::vector<AnfNodePtr> args_{};
+  std::set<AnfNodePtr> processed_nodes_;
+  std::unordered_map<std::pair<FuncGraphPtr, std::vector<int64_t>>, FuncGraphPtr,
+                     internal::FuncGraphIntVectorPairHasher>
+    processed_fgs_;
   internal::GetitemTransform getitem_transform_;
 };
 
@@ -298,7 +615,7 @@ class IncorporateGetitemSwitch : public AnfVisitor {
   IncorporateGetitemSwitch() : getitem_transform_() {}
   ~IncorporateGetitemSwitch() override = default;
 
-  AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override {
+  AnfNodePtr operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) override {
     Reset();
     is_in_get_ = true;
     AnfVisitor::Match(prim::kPrimTupleGetItem, {IsCNode, IsValueNode<Int64Imm>})(node);
@@ -316,33 +633,57 @@ class IncorporateGetitemSwitch : public AnfVisitor {
     if (g2_ == nullptr) {
       return nullptr;
     }
-    auto tuple_getitem = node->cast<CNodePtr>();
-    MS_EXCEPTION_IF_NULL(tuple_getitem);
-    bool has_env_type = false;
-    if (tuple_getitem->input(1)->abstract() && tuple_getitem->input(1)->abstract()->isa<abstract::AbstractTuple>()) {
-      const auto &abs_tuple = *(tuple_getitem->input(1)->abstract()->cast<abstract::AbstractTuplePtr>());
-      // eliminate (envinstance, value1, value2, ...) built by bprop func_graph()
-      if (abs_tuple.size() >= 1) {
-        // Value maybe kAnyValue, so check the type track;
-        if (abs_tuple[0]->isa<abstract::AbstractScalar>() && abs_tuple[0]->GetTypeTrack()->isa<EnvType>()) {
-          has_env_type = true;
-        }
-      }
-      // eliminate (value, bprop_func) built by fprop func_graph
-      if (abs_tuple.size() >= 2) {
-        if (abs_tuple[1]->isa<abstract::AbstractFunction>()) {
-          has_env_type = true;
-        }
-      }
-    }
-    // If exist env_getitem/env_setitem in this funcgraph or
-    // if g1_/g2_ is fprop func_graph and the corresponding bprop funcgraph has any env_getitem or env_setitem;
-    if (MultipleUseOfSwitch(tuple_getitem->input(1), fg) && !ExistEnvNode(fg) && !ExistEnvNodeInTupleItem(g1_) &&
-        !ExistEnvNodeInTupleItem(g2_) && !has_env_type) {
+    if (processed_nodes_.find(switch_) != processed_nodes_.end()) {
+      MS_LOG(DEBUG) << "fg in switch node has been replaced. node: " << node->DebugString()
+                    << ", switch: " << switch_->DebugString();
       return nullptr;
     }
+
+    bool g1_output_is_shrinkable = internal::IsOutputShrinkable(g1_->output());
+    bool g2_output_is_shrinkable = internal::IsOutputShrinkable(g2_->output());
+
+    auto tuple_getitem = node->cast<CNodePtr>();
+    MS_EXCEPTION_IF_NULL(tuple_getitem);
+    const auto &switch_call = tuple_getitem->input(1);
+    MS_EXCEPTION_IF_NULL(switch_call);
+    const auto &switch_call_cnode = switch_call->cast<CNodePtr>();
+    MS_EXCEPTION_IF_NULL(switch_call_cnode);
+    // If exist env_getitem/env_setitem in this funcgraph or
+    // if g1_/g2_ is fprop func_graph and the corresponding bprop funcgraph has any env_getitem or env_setitem;
+    std::vector<internal::TpCNodeAndIndex> tp_cnodes_and_index;
+    auto switch_call_users_counter = MultipleUseOfSwitch(switch_call, fg, &tp_cnodes_and_index);
+    bool multiple_use = (tp_cnodes_and_index.size() > 1);
+    if (g1_output_is_shrinkable && g2_output_is_shrinkable && multiple_use &&
+        (tp_cnodes_and_index.size() == switch_call_users_counter)) {
+      if (!internal::HasMoreJ(optimizer) && !ExistEnvNode(fg) && !ExistEnvNodeInTupleItem(g1_) &&
+          !ExistEnvNodeInTupleItem(g2_) && !internal::ShouldTransform(switch_call, tp_cnodes_and_index)) {
+        MS_LOG(DEBUG) << "No more j, will shrink. Node: " << node->DebugString()
+                      << ", switch: " << switch_->DebugString();
+        const auto g1_output_size = internal::GetOutputSize(g1_->output());
+        const auto g2_output_size = internal::GetOutputSize(g2_->output());
+        if (g1_output_size != g2_output_size) {
+          MS_LOG(EXCEPTION) << "output of g1 and g2 should have same tuple size, but g1 output: "
+                            << g1_->output()->DebugString() << ", g2 output: " << g2_->output()->DebugString();
+        }
+        if (switch_call_users_counter == g1_output_size) {
+          processed_nodes_.emplace(switch_call);
+          MS_LOG(DEBUG) << "All elements in output is used, no need to transform, node: " << node->DebugString()
+                        << ", switch: " << switch_->DebugString();
+          return nullptr;
+        }
+
+        auto new_node = ShrinkFuncGraphOutput(node, switch_call_cnode, tp_cnodes_and_index);
+        if (new_node != nullptr) {
+          return new_node;
+        }
+      }
+    }
+    MS_LOG(DEBUG) << "Cannot shrink output, transform_getitem_switch, node: " << node->DebugString()
+                  << ", switch: " << switch_->DebugString();
     auto new_g1 = getitem_transform_(node, g1_, idx_);
     auto new_g2 = getitem_transform_(node, g2_, idx_);
+    MS_LOG(DEBUG) << "Original fg1: " << g1_->ToString() << ", new_fg1: " << new_g1->ToString();
+    MS_LOG(DEBUG) << "Original fg2: " << g2_->ToString() << ", new_fg2: " << new_g2->ToString();
     auto sw_node = fg->NewCNode({NewValueNode(prim::kPrimSwitch), x_, NewValueNode(new_g1), NewValueNode(new_g2)});
     (void)args_.insert(args_.begin(), sw_node);
 
@@ -350,7 +691,60 @@ class IncorporateGetitemSwitch : public AnfVisitor {
     new_node->set_abstract(node->abstract());
     return new_node;
   }
-
+  AnfNodePtr ShrinkFuncGraphOutput(const AnfNodePtr &node, const CNodePtr &switch_call_cnode,
+                                   const std::vector<internal::TpCNodeAndIndex> &tp_cnodes_and_index) {
+    const auto &manager = node->func_graph()->manager();
+    MS_EXCEPTION_IF_NULL(manager);
+    auto switch_cnode = switch_->cast<CNodePtr>();
+    MS_EXCEPTION_IF_NULL(switch_cnode);
+    std::vector<int64_t> index_vector;
+    (void)std::transform(tp_cnodes_and_index.begin(), tp_cnodes_and_index.end(), std::back_inserter(index_vector),
+                         [](const auto &cnode_and_index) { return cnode_and_index.index; });
+    const auto &iter1 = processed_fgs_.find(std::make_pair(g1_, index_vector));
+    const auto &iter2 = processed_fgs_.find(std::make_pair(g2_, index_vector));
+    if (iter1 != processed_fgs_.end() && iter2 != processed_fgs_.end()) {
+      MS_LOG(DEBUG) << "fg output had been processed, no need to transform, node: " << node->DebugString()
+                    << ", switch: " << switch_->DebugString();
+      MS_LOG(DEBUG) << "Original fg1: " << g1_->ToString() << ", new_fg1: " << iter1->second->ToString();
+      MS_LOG(DEBUG) << "Original fg2: " << g2_->ToString() << ", new_fg2: " << iter2->second->ToString();
+      processed_nodes_.emplace(switch_);
+      manager->SetEdge(switch_cnode, 2, NewValueNode(iter1->second));
+      manager->SetEdge(switch_cnode, 3, NewValueNode(iter2->second));
+      auto shrunk_abstract = internal::ShrinkAbstract(switch_call_cnode->abstract(), tp_cnodes_and_index);
+      if (shrunk_abstract != nullptr) {
+        switch_call_cnode->set_abstract(shrunk_abstract);
+      }
+      auto new_idx = internal::UpdateUserNodeIndex(switch_call_cnode, idx_, tp_cnodes_and_index);
+      auto new_node =
+        node->func_graph()->NewCNode({NewValueNode(prim::kPrimTupleGetItem), switch_call_cnode, NewValueNode(new_idx)});
+      new_node->set_abstract(node->abstract());
+      return new_node;
+    }
+    const auto &new_g1 = internal::ShrinkUnsedOutput(g1_, tp_cnodes_and_index);
+    const auto &new_g2 = internal::ShrinkUnsedOutput(g2_, tp_cnodes_and_index);
+    if (new_g1 != nullptr && new_g2 != nullptr) {
+      MS_LOG(DEBUG) << "Shrink output. node: " << node->DebugString() << ", switch: " << switch_->DebugString();
+      MS_LOG(DEBUG) << "Original fg1: " << g1_->ToString() << ", new_fg1: " << new_g1->ToString();
+      MS_LOG(DEBUG) << "Original fg2: " << g2_->ToString() << ", new_fg2: " << new_g2->ToString();
+      processed_nodes_.emplace(switch_);
+      processed_fgs_.emplace(std::make_pair(g1_, index_vector), new_g1);
+      processed_fgs_.emplace(std::make_pair(g2_, index_vector), new_g2);
+      manager->SetEdge(switch_cnode, 2, NewValueNode(new_g1));
+      manager->SetEdge(switch_cnode, 3, NewValueNode(new_g2));
+      auto shrunk_abstract = internal::ShrinkAbstract(switch_call_cnode->abstract(), tp_cnodes_and_index);
+      if (shrunk_abstract != nullptr) {
+        switch_call_cnode->set_abstract(shrunk_abstract);
+      }
+      auto new_idx = internal::UpdateUserNodeIndex(switch_call_cnode, idx_, tp_cnodes_and_index);
+      auto new_node =
+        node->func_graph()->NewCNode({NewValueNode(prim::kPrimTupleGetItem), switch_call_cnode, NewValueNode(new_idx)});
+      new_node->set_abstract(node->abstract());
+      return new_node;
+    }
+    MS_LOG(DEBUG) << "Shrink failed. node: " << node->DebugString()
+                  << ", switch_call: " << switch_call_cnode->DebugString();
+    return nullptr;
+  }
   void Visit(const AnfNodePtr &node) override {
     if (is_in_switch_ && x_ == nullptr) {
       x_ = node;
@@ -393,22 +787,51 @@ class IncorporateGetitemSwitch : public AnfVisitor {
   }
 
  private:
-  bool MultipleUseOfSwitch(const AnfNodePtr &switch_call, const FuncGraphPtr &fg) const {
+  size_t MultipleUseOfSwitch(const AnfNodePtr &switch_call, const FuncGraphPtr &fg,
+                             std::vector<internal::TpCNodeAndIndex> *cnodes_and_index) const {
     auto switch_call_cnode = switch_call->cast<CNodePtr>();
     MS_EXCEPTION_IF_NULL(switch_call_cnode);
     auto manager = fg->manager();
     MS_EXCEPTION_IF_NULL(manager);
+    auto &cnode_and_index_vector = *cnodes_and_index;
+    std::set<int64_t> index_set;
+    std::size_t total_usage = 0;
     auto &node_users_map = manager->node_users();
     auto it = node_users_map.find(switch_call);
     if (it == node_users_map.end()) {
-      return false;
+      return 0;
     }
     auto &node_users = it->second;
-    // If switch was used by more than 1 tuple_getitem nodes, this pass shouldn't be execute.s
-    auto tuple_getitem_num = std::count_if(node_users.begin(), node_users.end(), [](std::pair<AnfNodePtr, int> &user) {
-      return IsPrimitiveCNode(user.first, prim::kPrimTupleGetItem);
-    });
-    return tuple_getitem_num > 1;
+    // If switch was used by more than 1 tuple_getitem nodes, this pass shouldn't be execute.
+    for (auto user : node_users) {
+      if (IsPrimitiveCNode(user.first, prim::kPrimTupleGetItem)) {
+        auto cnode = user.first->cast<CNodePtr>();
+        constexpr auto kInputIndex = 2;
+        if (cnode->input(kInputIndex)->isa<ValueNode>()) {
+          const auto &idx_node = cnode->input(kInputIndex)->cast<ValueNodePtr>();
+          MS_EXCEPTION_IF_NULL(idx_node);
+          auto idx = GetValue<int64_t>(idx_node->value());
+          cnode_and_index_vector.push_back({cnode, idx});
+          index_set.insert(idx);
+          total_usage++;
+        } else {
+          MS_LOG(EXCEPTION) << "Tuple_getitem index is not valuenode, but: " << user.first->DebugString(2);
+        }
+      } else {
+        MS_LOG(DEBUG) << "switch_call user is not tuple_getitem, user: " << user.first->DebugString(2);
+      }
+    }
+    if (index_set.size() != total_usage) {
+      MS_LOG(DEBUG) << "some index is duplicated, total_usage: " << total_usage;
+      MS_LOG(DEBUG) << "index_set: ";
+      for (auto idx : index_set) {
+        MS_LOG(DEBUG) << " " << idx;
+      }
+    }
+    // sort by index;
+    std::sort(cnode_and_index_vector.begin(), cnode_and_index_vector.end(),
+              [](const auto &tp1, const auto &tp2) { return tp1.index < tp2.index; });
+    return node_users.size();
   }
 
   static bool inline ExistEnvNode(const FuncGraphPtr &fg) {
@@ -441,6 +864,10 @@ class IncorporateGetitemSwitch : public AnfVisitor {
   FuncGraphPtr g1_{nullptr}, g2_{nullptr};
   bool is_in_get_{false}, is_in_switch_{false};
   std::vector<AnfNodePtr> args_{};
+  std::set<AnfNodePtr> processed_nodes_;
+  std::unordered_map<std::pair<FuncGraphPtr, std::vector<int64_t>>, FuncGraphPtr,
+                     internal::FuncGraphIntVectorPairHasher>
+    processed_fgs_;
   internal::GetitemTransform getitem_transform_;
 };
 
diff --git a/mindspore/ccsrc/frontend/optimizer/irpass/less_batch_normalization.cc b/mindspore/ccsrc/frontend/optimizer/irpass/less_batch_normalization.cc
index b1a2901ffe7..e4b6a51d2a1 100644
--- a/mindspore/ccsrc/frontend/optimizer/irpass/less_batch_normalization.cc
+++ b/mindspore/ccsrc/frontend/optimizer/irpass/less_batch_normalization.cc
@@ -356,10 +356,7 @@ bool LessBatchNormalization::MatchStructureNode(const CNodePtr &cnode, const int
   }
   const auto &use_pattern = std::get<1>(patternTuple);
   int32_t use_index = index % static_cast<int32_t>(use_pattern.size());
-  if (!IsPrimitiveCNode(cnode, use_pattern[use_index])) {
-    return false;
-  }
-  return true;
+  return IsPrimitiveCNode(cnode, use_pattern[IntToSize(use_index)]);
 }
 
 bool LessBatchNormalization::MatchGraphStructure(const CNodePtr &cnode,
@@ -410,7 +407,7 @@ AnfNodePtr LessBatchNormalization::operator()(const OptimizerPtr &optimizer, con
     Reset();
     const auto &current_pattern = kNeedMatchPattern.at(match_pattern_);
     size_t sum_match_node = 0;
-    std::for_each(current_pattern.begin(), current_pattern.end(), [&](const kStructureTuple &t) {
+    (void)std::for_each(current_pattern.begin(), current_pattern.end(), [&](const kStructureTuple &t) {
       sum_match_node += std::get<0>(t);
       (void)total_match_node_.emplace_back(sum_match_node);
     });
diff --git a/mindspore/ccsrc/frontend/optimizer/irpass/updatestate_eliminate.cc b/mindspore/ccsrc/frontend/optimizer/irpass/updatestate_eliminate.cc
index 02eede35af8..54a1576104c 100644
--- a/mindspore/ccsrc/frontend/optimizer/irpass/updatestate_eliminate.cc
+++ b/mindspore/ccsrc/frontend/optimizer/irpass/updatestate_eliminate.cc
@@ -85,25 +85,47 @@ bool OnlyUsedByTwoNode(const AnfNodePtr &be_used_node, const AnfNodePtr &first_n
          (first_user == second_node && second_user == first_node);
 }
 
+// Determine whether there is a monad in the inputs of the node.
+bool CheckHasMonadInput(const CNodePtr &cnode) {
+  // If the last input is a monad, means the attach node has side-effect and
+  // we should keep UpdateState; otherwise, we will remove the UpdateState.
+  if (cnode->size() > 1 && HasAbstractMonad(cnode->inputs().back())) {
+    return true;
+  }
+
+  // Check the inputs of Call/Switch/SwitchLayer.
+  auto first_input_node = cnode->input(kFirstInputIndex);
+  if (IsPrimitiveCNode(first_input_node, prim::kPrimCall) || IsPrimitiveCNode(first_input_node, prim::kPrimSwitch) ||
+      IsPrimitiveCNode(first_input_node, prim::kPrimSwitchLayer)) {
+    for (auto &input : first_input_node->cast<CNodePtr>()->inputs()) {
+      if (HasAbstractMonad(input)) {
+        return true;
+      }
+      auto input_cnode = dyn_cast<CNode>(input);
+      if (input_cnode != nullptr && input_cnode->size() > 1 && HasAbstractMonad(input_cnode->inputs().back())) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 AnfNodePtr EliminateUpdateStateForPureNode(const CNodePtr &update_state, const AnfNodePtr &attach) {
   auto cnode = dyn_cast<CNode>(attach);
   if (cnode == nullptr) {
     // Skip value node or parameter.
     return nullptr;
   }
-  if (cnode->size() > 1) {
-    // If the last input is a monad, means the attach node has side-effect and
-    // we should keep UpdateState; otherwise, we will remove the UpdateState.
-    if (HasAbstractMonad(cnode->inputs().back())) {
-      return nullptr;
-    }
+  if (CheckHasMonadInput(cnode)) {
+    return nullptr;
   }
 
   // Remove UpdateState by replace it with its input monad.
   return update_state->input(kInputIndex);
 }
 
-AnfNodePtr EliminateUpdateStateWithDepend(const CNodePtr &update_state, const CNodePtr &depend) {
+AnfNodePtr EliminateUpdateStateWithDepend(const OptimizerPtr &optimizer, const CNodePtr &update_state,
+                                          const CNodePtr &depend) {
   auto input_monad = depend->inputs().back();
   if (!HasAbstractMonad(input_monad)) {
     // Skip if Depend attach input is not a monad.
@@ -131,7 +153,7 @@ AnfNodePtr EliminateUpdateStateWithDepend(const CNodePtr &update_state, const CN
   // Replace Depend with its input.
   if (depend->size() == kMinDependSize) {
     auto depend_input = depend->input(kInputIndex);
-    mgr->Replace(depend, depend_input);
+    optimizer->SubstitutionReplace(mgr, depend, depend_input);
   } else {
     auto inputs = depend->inputs();
     inputs.pop_back();
@@ -139,7 +161,7 @@ AnfNodePtr EliminateUpdateStateWithDepend(const CNodePtr &update_state, const CN
     MS_EXCEPTION_IF_NULL(fg);
     auto new_depend = fg->NewCNode(inputs);
     new_depend->set_abstract(depend->abstract());
-    mgr->Replace(depend, new_depend);
+    optimizer->SubstitutionReplace(mgr, depend, new_depend);
   }
   // Replace UpdateState node with the input monad of Depend.
   return input_monad;
@@ -297,7 +319,7 @@ AnfNodePtr MakeTupleForSameNodes(const FuncGraphPtr &fg, const CNodePtr &old_upd
 }
 
 // Remove all nodes related to UpdateStates, if they're redundant.
-void EliminateUselessNodesForUpdateStates(const std::vector<CNodePtr> &update_states) {
+void EliminateUselessNodesForUpdateStates(const OptimizerPtr &optimizer, const std::vector<CNodePtr> &update_states) {
   if (update_states.empty()) {
     return;
   }
@@ -309,7 +331,7 @@ void EliminateUselessNodesForUpdateStates(const std::vector<CNodePtr> &update_st
   // 1. Remove the use of UpdateState nodes, except the last one.
   for (auto i = update_states.size() - 1; i > 0; i--) {
     auto &us = update_states[i];
-    mgr->Replace(us, us->input(kInputIndex));
+    optimizer->SubstitutionReplace(mgr, us, us->input(kInputIndex));
   }
 
   // 2. Remove the Depend users of last UpdateState node.
@@ -343,7 +365,7 @@ void EliminateUselessNodesForUpdateStates(const std::vector<CNodePtr> &update_st
   for (ssize_t i = depend_nodes.size() - 1; i >= end; i--) {
     const auto &depend_node = depend_nodes[i];
     const auto &depend_cnode = depend_node->cast<CNodePtr>();
-    mgr->Replace(depend_cnode, depend_cnode->input(kInputIndex));
+    optimizer->SubstitutionReplace(mgr, depend_cnode, depend_cnode->input(kInputIndex));
   }
 }
 
@@ -363,7 +385,8 @@ void EliminateUselessNodesForUpdateStates(const std::vector<CNodePtr> &update_st
 //    xN = Load(xN, u)
 //    t = make_tuple(x1, x2, ... , xN)
 //    u1 = UpdateState(u, t)
-AnfNodePtr EliminateUpdateStateForLoads(const CNodePtr &old_update_state, const std::vector<CNodePtr> &update_states,
+AnfNodePtr EliminateUpdateStateForLoads(const OptimizerPtr &optimizer, const CNodePtr &old_update_state,
+                                        const std::vector<CNodePtr> &update_states,
                                         const std::vector<CNodePtr> &loads) {
   auto fg = old_update_state->func_graph();
   if (fg == nullptr) {
@@ -393,7 +416,7 @@ AnfNodePtr EliminateUpdateStateForLoads(const CNodePtr &old_update_state, const
     }
   }
 
-  EliminateUselessNodesForUpdateStates(update_states);
+  EliminateUselessNodesForUpdateStates(optimizer, update_states);
 
   if (make_tuple_inputs.size() == 1) {
     // This should not happen.
@@ -420,7 +443,8 @@ AnfNodePtr EliminateUpdateStateForLoads(const CNodePtr &old_update_state, const
 // a2 = Assign(para2, value2, u1)
 // t = MakeTuple(a1, a2)
 // u3 = UpdateState(u1, t)
-AnfNodePtr EliminateUpdateStateBetweenAssigns(const CNodePtr &update_state, const AnfNodePtr &assign) {
+AnfNodePtr EliminateUpdateStateBetweenAssigns(const OptimizerPtr &optimizer, const CNodePtr &update_state,
+                                              const AnfNodePtr &assign) {
   auto a2_cnode = assign->cast<CNodePtr>();
   if (a2_cnode->size() != kAssignSize) {
     return nullptr;
@@ -444,7 +468,7 @@ AnfNodePtr EliminateUpdateStateBetweenAssigns(const CNodePtr &update_state, cons
         MS_EXCEPTION_IF_NULL(fg);
         auto mgr = fg->manager();
         MS_EXCEPTION_IF_NULL(mgr);
-        mgr->Replace(u2, u1);
+        optimizer->SubstitutionReplace(mgr, u2, u1);
         AnfNodePtrList make_tuple_inputs{NewValueNode(prim::kPrimMakeTuple), a1, assign};
         auto make_tuple = MakeTupleForSameNodes(fg, update_state, make_tuple_inputs);
         auto new_update_state = fg->NewCNode({NewValueNode(prim::kPrimUpdateState), u1, make_tuple});
@@ -472,7 +496,8 @@ AnfNodePtr EliminateUpdateStateBetweenAssigns(const CNodePtr &update_state, cons
 // a3 = Assign(para3, value3, u1)
 // t = MakeTuple(a1, a2, a3)
 // u4 = UpdateState(u1, t)
-AnfNodePtr EliminateUpdateStateBetweenMakeTupleAssign(const CNodePtr &update_state, const AnfNodePtr &assign) {
+AnfNodePtr EliminateUpdateStateBetweenMakeTupleAssign(const OptimizerPtr &optimizer, const CNodePtr &update_state,
+                                                      const AnfNodePtr &assign) {
   auto a3_cnode = assign->cast<CNodePtr>();
   if (a3_cnode->size() != kAssignSize) {
     return nullptr;
@@ -509,11 +534,11 @@ AnfNodePtr EliminateUpdateStateBetweenMakeTupleAssign(const CNodePtr &update_sta
           MS_EXCEPTION_IF_NULL(fg);
           auto mgr = fg->manager();
           MS_EXCEPTION_IF_NULL(mgr);
-          mgr->Replace(u3, u1);
+          optimizer->SubstitutionReplace(mgr, u3, u1);
           AnfNodePtrList new_make_tuple_inputs{NewValueNode(prim::kPrimMakeTuple), make_tuple_cnode->input(kInputIndex),
                                                make_tuple_cnode->input(kAttachIndex), assign};
           auto new_make_tuple = MakeTupleForSameNodes(fg, update_state, new_make_tuple_inputs);
-          mgr->Replace(make_tuple, new_make_tuple);
+          optimizer->SubstitutionReplace(mgr, make_tuple, new_make_tuple);
           auto new_update_state = fg->NewCNode({NewValueNode(prim::kPrimUpdateState), u1, new_make_tuple});
           new_update_state->set_abstract(update_state->abstract());
           new_update_state->set_scope(update_state->scope());
@@ -540,7 +565,8 @@ AnfNodePtr EliminateUpdateStateBetweenMakeTupleAssign(const CNodePtr &update_sta
 // a3 = Assign(para3, value3, u1)
 // t = MakeTuple(a1, a2, a3)
 // u4 = UpdateState(u1, t)
-AnfNodePtr EliminateUpdateStateBetweenAssignMakeTuple(const CNodePtr &update_state, const AnfNodePtr &make_tuple) {
+AnfNodePtr EliminateUpdateStateBetweenAssignMakeTuple(const OptimizerPtr &optimizer, const CNodePtr &update_state,
+                                                      const AnfNodePtr &make_tuple) {
   auto make_tuple_cnode = make_tuple->cast<CNodePtr>();
   if (make_tuple_cnode->size() != kMakeTupleSize || !OnlyUsedByOneNode(make_tuple, update_state)) {
     return nullptr;
@@ -583,12 +609,12 @@ AnfNodePtr EliminateUpdateStateBetweenAssignMakeTuple(const CNodePtr &update_sta
           MS_EXCEPTION_IF_NULL(fg);
           auto mgr = fg->manager();
           MS_EXCEPTION_IF_NULL(mgr);
-          mgr->Replace(u2, u1);
+          optimizer->SubstitutionReplace(mgr, u2, u1);
           AnfNodePtrList new_make_tuple_inputs{NewValueNode(prim::kPrimMakeTuple), a1,
                                                make_tuple_cnode->input(kInputIndex),
                                                make_tuple_cnode->input(kAttachIndex)};
           auto new_make_tuple = MakeTupleForSameNodes(fg, update_state, new_make_tuple_inputs);
-          mgr->Replace(make_tuple, new_make_tuple);
+          optimizer->SubstitutionReplace(mgr, make_tuple, new_make_tuple);
           auto new_update_state = fg->NewCNode({NewValueNode(prim::kPrimUpdateState), u1, new_make_tuple});
           new_update_state->set_abstract(update_state->abstract());
           new_update_state->set_scope(update_state->scope());
@@ -657,7 +683,7 @@ AnfNodePtr UpdatestatePureNodeEliminater::operator()(const OptimizerPtr &, const
 // To:
 //    out = x_user(x)
 //    u2 = u_user(u)
-AnfNodePtr UpdatestateDependEliminater::operator()(const OptimizerPtr &, const AnfNodePtr &node) {
+AnfNodePtr UpdatestateDependEliminater::operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) {
   auto update_state_node = dyn_cast<CNode>(node);
   if (update_state_node == nullptr || update_state_node->inputs().empty()) {
     MS_LOG(WARNING) << "UpdatestateEliminater encounter invalid node: " << node->DebugString();
@@ -665,14 +691,14 @@ AnfNodePtr UpdatestateDependEliminater::operator()(const OptimizerPtr &, const A
   }
   auto &attach = update_state_node->input(kAttachIndex);
   if (IsPrimitiveCNode(attach, prim::kPrimDepend)) {
-    return EliminateUpdateStateWithDepend(update_state_node, attach->cast<CNodePtr>());
+    return EliminateUpdateStateWithDepend(optimizer, update_state_node, attach->cast<CNodePtr>());
   }
   return nullptr;
 }
 
 // Eliminate UpdateStates between Assign nodes.
 // Eliminate UpdateStates between Assign and MakeTuple.
-AnfNodePtr UpdatestateAssignEliminater::operator()(const OptimizerPtr &, const AnfNodePtr &node) {
+AnfNodePtr UpdatestateAssignEliminater::operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) {
   auto update_state_node = dyn_cast<CNode>(node);
   if (update_state_node == nullptr || update_state_node->inputs().empty()) {
     MS_LOG(WARNING) << "UpdatestateEliminater encounter invalid node: " << node->DebugString();
@@ -680,19 +706,19 @@ AnfNodePtr UpdatestateAssignEliminater::operator()(const OptimizerPtr &, const A
   }
   auto &attach = update_state_node->input(kAttachIndex);
   if (IsPrimitiveCNode(attach, prim::kPrimAssign)) {
-    auto new_node = EliminateUpdateStateBetweenAssigns(update_state_node, attach);
+    auto new_node = EliminateUpdateStateBetweenAssigns(optimizer, update_state_node, attach);
     if (new_node != nullptr) {
       return new_node;
     }
-    return EliminateUpdateStateBetweenMakeTupleAssign(update_state_node, attach);
+    return EliminateUpdateStateBetweenMakeTupleAssign(optimizer, update_state_node, attach);
   }
   return nullptr;
 }
 
 // Eliminate UpdateStates which the second input is MakeTuple.
-AnfNodePtr UpdatestateMakeTupleEliminater::operator()(const OptimizerPtr &, const AnfNodePtr &node) {
+AnfNodePtr UpdatestateMakeTupleEliminater::operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) {
   PatternNode<AnfNodePtr> u, attach;
-  auto MakeTupleLambda = [&node, &u, &attach]() -> AnfNodePtr {
+  auto MakeTupleLambda = [&optimizer, &node, &u, &attach]() -> AnfNodePtr {
     auto update_state_node = node->cast<CNodePtr>();
     auto make_tuple = attach.GetNode(node)->cast<CNodePtr>();
     auto new_node = EliminateMakeTupleWithDeadNode(update_state_node, make_tuple);
@@ -703,7 +729,7 @@ AnfNodePtr UpdatestateMakeTupleEliminater::operator()(const OptimizerPtr &, cons
     if (new_node != nullptr) {
       return new_node;
     }
-    return EliminateUpdateStateBetweenAssignMakeTuple(update_state_node, make_tuple);
+    return EliminateUpdateStateBetweenAssignMakeTuple(optimizer, update_state_node, make_tuple);
   };
 
   MATCH_REPLACE_LAMBDA_IF(node, PPrimitive(prim::kPrimUpdateState, u, attach), MakeTupleLambda,
@@ -712,7 +738,7 @@ AnfNodePtr UpdatestateMakeTupleEliminater::operator()(const OptimizerPtr &, cons
 }
 
 // Eliminate UpdateStates for consecutive Loads.
-AnfNodePtr UpdatestateLoadsEliminater::operator()(const OptimizerPtr &, const AnfNodePtr &node) {
+AnfNodePtr UpdatestateLoadsEliminater::operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) {
   auto update_state_node = dyn_cast<CNode>(node);
   if (update_state_node == nullptr || update_state_node->inputs().empty()) {
     MS_LOG(WARNING) << "UpdatestateEliminater encounter invalid node: " << node->DebugString();
@@ -724,7 +750,7 @@ AnfNodePtr UpdatestateLoadsEliminater::operator()(const OptimizerPtr &, const An
     std::vector<CNodePtr> loads;
     GetLoadsFromUpdateState(update_state_node, &update_states, &loads);
     if (update_states.size() > 1 && loads.size() > 1) {
-      return EliminateUpdateStateForLoads(update_state_node, update_states, loads);
+      return EliminateUpdateStateForLoads(optimizer, update_state_node, update_states, loads);
     }
   }
   return nullptr;
diff --git a/mindspore/ccsrc/frontend/optimizer/irpass/updatestate_eliminate.h b/mindspore/ccsrc/frontend/optimizer/irpass/updatestate_eliminate.h
index 60fe63e0d9d..d672358ce65 100644
--- a/mindspore/ccsrc/frontend/optimizer/irpass/updatestate_eliminate.h
+++ b/mindspore/ccsrc/frontend/optimizer/irpass/updatestate_eliminate.h
@@ -36,26 +36,26 @@ class UpdatestatePureNodeEliminater : public AnfVisitor {
 // Eliminate redundant UpdateState/Depend pair nodes caused by inline.
 class UpdatestateDependEliminater : public AnfVisitor {
  public:
-  AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override;
+  AnfNodePtr operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) override;
 };
 
 // Eliminate UpdateStates between Assign nodes.
 // Eliminate UpdateStates between Assign and MakeTuple.
 class UpdatestateAssignEliminater : public AnfVisitor {
  public:
-  AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override;
+  AnfNodePtr operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) override;
 };
 
 // Eliminate UpdateStates which the second input is MakeTuple.
 class UpdatestateMakeTupleEliminater : public AnfVisitor {
  public:
-  AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override;
+  AnfNodePtr operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) override;
 };
 
 // Eliminate UpdateStates for consecutive Loads.
 class UpdatestateLoadsEliminater : public AnfVisitor {
  public:
-  AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override;
+  AnfNodePtr operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) override;
 };
 
 // SwitchCallMonadParameterEliminater eliminates Monad parameter in switch call.
diff --git a/mindspore/ccsrc/frontend/optimizer/opt.cc b/mindspore/ccsrc/frontend/optimizer/opt.cc
index 30ec46304b3..db42fcb8d15 100644
--- a/mindspore/ccsrc/frontend/optimizer/opt.cc
+++ b/mindspore/ccsrc/frontend/optimizer/opt.cc
@@ -30,14 +30,15 @@ namespace mindspore {
 /* namespace to support opt */
 namespace opt {
 SubstitutionPtr MakeSubstitution(const OptimizerCallerPtr &transform, const std::string &name, const PrimitivePtr &prim,
-                                 const RenormAction &renorm_action, bool has_priority_pattern) {
+                                 bool has_node_replacement, const RenormAction &renorm_action,
+                                 bool has_priority_pattern) {
   auto fn = [prim](const AnfNodePtr &node) -> bool { return IsPrimitiveCNode(node, prim); };
-  return std::make_shared<Substitution>(transform, name, fn, renorm_action, has_priority_pattern);
+  return std::make_shared<Substitution>(transform, name, fn, has_node_replacement, renorm_action, has_priority_pattern);
 }
 
 SubstitutionPtr MakeSubstitution(const OptimizerCallerPtr &transform, const std::string &name,
-                                 const std::vector<PrimitivePtr> &prims, const RenormAction &renorm_action,
-                                 bool has_priority_pattern) {
+                                 const std::vector<PrimitivePtr> &prims, bool has_node_replacement,
+                                 const RenormAction &renorm_action, bool has_priority_pattern) {
   auto fn = [prims](const AnfNodePtr &node) -> bool {
     if (!node->isa<CNode>()) {
       return false;
@@ -60,13 +61,14 @@ SubstitutionPtr MakeSubstitution(const OptimizerCallerPtr &transform, const std:
     return false;
   };
 
-  return std::make_shared<Substitution>(transform, name, fn, renorm_action, has_priority_pattern);
+  return std::make_shared<Substitution>(transform, name, fn, has_node_replacement, renorm_action, has_priority_pattern);
 }
 
 SubstitutionPtr MakeSubstitution(const OptimizerCallerPtr &transform, const std::string &name,
-                                 const PredicateFuncType &predicate, const RenormAction &renorm_action,
-                                 bool has_priority_pattern) {
-  return std::make_shared<Substitution>(transform, name, predicate, renorm_action, has_priority_pattern);
+                                 const PredicateFuncType &predicate, bool has_node_replacement,
+                                 const RenormAction &renorm_action, bool has_priority_pattern) {
+  return std::make_shared<Substitution>(transform, name, predicate, has_node_replacement, renorm_action,
+                                        has_priority_pattern);
 }
 
 AnfNodePtr Substitution::operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) {
@@ -212,6 +214,14 @@ bool SubstitutionList::ApplyIRToSubstitutions(const OptimizerPtr &optimizer, con
         change = true;
         changes = true;
         node = res;
+        // If there is a node replacement in the substitution, add replaced nodes to todo list.
+        if (substitution->has_node_replacement_) {
+          for (auto &replaced_node : optimizer->substitution_replaced_nodes()) {
+            UpdateTransformingListForSubstitutions(replaced_node, &todo, change);
+            UpdateTransformingListWithUserNodes(optimizer, replaced_node, &todo, change, seen);
+          }
+          optimizer->clear_substitution_replaced_nodes();
+        }
         break;
       }
     }
@@ -251,6 +261,14 @@ bool SubstitutionList::ApplySubstitutionToIR(const OptimizerPtr &optimizer, cons
       change = true;
       changes = true;
       node = res;
+      // If there is a node replacement in the substitution, add replaced nodes to todo list.
+      if (substitution->has_node_replacement_) {
+        for (auto &replaced_node : optimizer->substitution_replaced_nodes()) {
+          UpdateTransformingListForIR(replaced_node, &todo, change, substitution);
+          UpdateTransformingListWithUserNodes(optimizer, replaced_node, &todo, change, seen);
+        }
+        optimizer->clear_substitution_replaced_nodes();
+      }
     }
     UpdateTransformingListForIR(node, &todo, change, substitution);
     UpdateTransformingListWithUserNodes(optimizer, node, &todo, change, seen);
diff --git a/mindspore/ccsrc/frontend/optimizer/opt.h b/mindspore/ccsrc/frontend/optimizer/opt.h
index 3370f1cebcd..feee7283a53 100644
--- a/mindspore/ccsrc/frontend/optimizer/opt.h
+++ b/mindspore/ccsrc/frontend/optimizer/opt.h
@@ -42,16 +42,19 @@ class Substitution {
   OptimizerCallerPtr transform_;
   std::string name_;
   PredicateFuncType predicate_{nullptr};
-  // An enum to mark this Substitution relation to renormalize pass
+  // Determine whether there is a node replacement in the substitution, such as manager->Replace(old_node, new_node).
+  bool has_node_replacement_{false};
+  // An enum to mark this Substitution relation to renormalize pass.
   RenormAction renorm_action_;
   // Determine whether it is a priority substitution, that is, some patterns need to be matched prior to others.
   bool has_priority_pattern_{false};
 
   Substitution(const OptimizerCallerPtr &transform, const std::string &name, const PredicateFuncType &predicate,
-               const RenormAction &renorm_action, bool has_priority_pattern)
+               bool has_node_replacement, const RenormAction &renorm_action, bool has_priority_pattern)
       : transform_(transform),
         name_(name),
         predicate_(predicate),
+        has_node_replacement_(has_node_replacement),
         renorm_action_(renorm_action),
         has_priority_pattern_(has_priority_pattern) {}
   ~Substitution() = default;
@@ -61,13 +64,14 @@ class Substitution {
 using SubstitutionPtr = std::shared_ptr<Substitution>;
 
 SubstitutionPtr MakeSubstitution(const OptimizerCallerPtr &transform, const std::string &name, const PrimitivePtr &prim,
-                                 const RenormAction &action_renorm = CHECK_RENORM, bool has_priority_pattern = false);
-SubstitutionPtr MakeSubstitution(const OptimizerCallerPtr &transform, const std::string &name,
-                                 const std::vector<PrimitivePtr> &prims,
-                                 const RenormAction &action_renorm = CHECK_RENORM, bool has_priority_pattern = false);
-SubstitutionPtr MakeSubstitution(const OptimizerCallerPtr &transform, const std::string &name,
-                                 const PredicateFuncType &predicate, const RenormAction &action_renorm = CHECK_RENORM,
+                                 bool has_node_replacement = false, const RenormAction &action_renorm = CHECK_RENORM,
                                  bool has_priority_pattern = false);
+SubstitutionPtr MakeSubstitution(const OptimizerCallerPtr &transform, const std::string &name,
+                                 const std::vector<PrimitivePtr> &prims, bool has_node_replacement = false,
+                                 const RenormAction &action_renorm = CHECK_RENORM, bool has_priority_pattern = false);
+SubstitutionPtr MakeSubstitution(const OptimizerCallerPtr &transform, const std::string &name,
+                                 const PredicateFuncType &predicate, bool has_node_replacement = false,
+                                 const RenormAction &action_renorm = CHECK_RENORM, bool has_priority_pattern = false);
 
 enum OptTraverseSubstitutionsMode { kOptTraverseFromIRToSubstitutions = 0, kOptTraverseFromSubstitutionsToIR };
 
diff --git a/mindspore/ccsrc/frontend/optimizer/optimizer.h b/mindspore/ccsrc/frontend/optimizer/optimizer.h
index 9bc63257aff..9a92a243c39 100644
--- a/mindspore/ccsrc/frontend/optimizer/optimizer.h
+++ b/mindspore/ccsrc/frontend/optimizer/optimizer.h
@@ -226,6 +226,15 @@ class Optimizer : public std::enable_shared_from_this<Optimizer> {
     MS_LOG(EXCEPTION) << "No ResourceBase exists.";
   }
 
+  // Only for the case that manager->replace() has to be called in substitution. This interface can only be used in
+  // substitution. Note that it is not recommended to replace nodes other than the input node in substitution.
+  void SubstitutionReplace(const FuncGraphManagerPtr &manager, const AnfNodePtr &old_node, const AnfNodePtr &new_node) {
+    manager->Replace(old_node, new_node);
+    substitution_replaced_nodes_.emplace_back(new_node);
+  }
+  std::vector<AnfNodePtr> substitution_replaced_nodes() const { return substitution_replaced_nodes_; }
+  void clear_substitution_replaced_nodes() { substitution_replaced_nodes_.clear(); }
+
   const std::string name() const { return name_; }
 
   void set_is_untyped_generated() { is_untyped_generated_ = true; }
@@ -250,6 +259,7 @@ class Optimizer : public std::enable_shared_from_this<Optimizer> {
   pipeline::ResourceBasePtr resource_;
   std::vector<OptPass> passes_;
   std::vector<std::string> pass_names_;
+  std::vector<AnfNodePtr> substitution_replaced_nodes_;
   bool run_only_once_;
   bool is_watch_renormalize_;
   bool is_enable_;
diff --git a/mindspore/ccsrc/frontend/optimizer/recompute.cc b/mindspore/ccsrc/frontend/optimizer/recompute.cc
index 13b408c5ab6..ce5896f179a 100644
--- a/mindspore/ccsrc/frontend/optimizer/recompute.cc
+++ b/mindspore/ccsrc/frontend/optimizer/recompute.cc
@@ -33,8 +33,8 @@ namespace {
 constexpr auto kGradientsFlag = "Gradients";
 
 bool CanNotRecomputed(const CNodePtr &node) {
-  static std::unordered_set<PrimitivePtr> not_recomputed_op_list{prim::kPrimAllGather, prim::kPrimDropoutGenMask,
-                                                                 prim::kPrimLoad, prim::kPrimTupleGetItem};
+  static std::unordered_set<PrimitivePtr> not_recomputed_op_list{prim::kPrimDropoutGenMask, prim::kPrimLoad,
+                                                                 prim::kPrimTupleGetItem};
 
   return std::any_of(not_recomputed_op_list.begin(), not_recomputed_op_list.end(),
                      [&node](const PrimitivePtr &prim) { return IsPrimitiveCNode(node, prim); });
diff --git a/mindspore/ccsrc/frontend/parallel/allreduce_fusion/allreduce_fusion.h b/mindspore/ccsrc/frontend/parallel/allreduce_fusion/allreduce_fusion.h
index 25342eef82a..ee992053b99 100644
--- a/mindspore/ccsrc/frontend/parallel/allreduce_fusion/allreduce_fusion.h
+++ b/mindspore/ccsrc/frontend/parallel/allreduce_fusion/allreduce_fusion.h
@@ -23,6 +23,7 @@
 #include "frontend/parallel/allreduce_fusion/allreduce_graph.h"
 #include "frontend/parallel/status.h"
 #include "frontend/parallel/ops_info/ops_utils.h"
+#include "frontend/parallel/step_parallel_utils.h"
 
 namespace mindspore {
 namespace parallel {
diff --git a/mindspore/ccsrc/frontend/parallel/graph_util/node_info.cc b/mindspore/ccsrc/frontend/parallel/graph_util/node_info.cc
index d80da8dfdfd..0faf5bca970 100644
--- a/mindspore/ccsrc/frontend/parallel/graph_util/node_info.cc
+++ b/mindspore/ccsrc/frontend/parallel/graph_util/node_info.cc
@@ -24,6 +24,7 @@
 #include "pipeline/jit/parse/python_adapter.h"
 #include "frontend/parallel/ops_info/ops_utils.h"
 #include "frontend/parallel/step_parallel.h"
+#include "frontend/parallel/step_parallel_utils.h"
 
 namespace mindspore {
 namespace parallel {
@@ -319,7 +320,7 @@ bool FindReshapePreNodeStraCosts(const AnfNodePtr &node, OperatorInfoPtr *pre_op
     return false;
   }
   auto node_op_info = cnode->user_data<OperatorInfo>();
-  if (IsParallelCareNode(cnode) && (node_op_info != nullptr)) {
+  if (IsParallelCareNode(cnode) && (node_op_info != nullptr) && !IsPrimitiveCNode(cnode, prim::kPrimReshape)) {
     *pre_operator_info = node_op_info;
     *out_index = 0;
     return true;
@@ -358,7 +359,7 @@ bool FindReshapePreNodeStraCosts(const AnfNodePtr &node, OperatorInfoPtr *pre_op
 // Find next node of Reshape, then obtain its strategy_cost_ vector to get its layout vector.
 // if reshape's output connect to several primitive, return the first layout found
 bool FindReshapeNextNodeStraCosts(const CNodePtr &cnode, OperatorInfoPtr *next_operator_info, int64_t *in_index,
-                                  size_t curr_depth) {
+                                  bool *is_next_reshape, size_t curr_depth) {
   if (curr_depth > MAX_RECURSIVE_DEPTH) {
     MS_LOG(WARNING) << "When finding Reshape's next node, exceeded the max recursive depth: " << MAX_RECURSIVE_DEPTH;
     return false;
@@ -373,6 +374,10 @@ bool FindReshapeNextNodeStraCosts(const CNodePtr &cnode, OperatorInfoPtr *next_o
     if (use_apply == nullptr || !IsValueNode<Primitive>(use_apply->input(0))) {
       continue;
     }
+    if (IsPrimitiveCNode(use_apply, prim::kPrimReshape)) {
+      *is_next_reshape = true;
+      continue;
+    }
     ValueNodePtr prim_anf_node = use_apply->input(0)->cast<ValueNodePtr>();
     MS_EXCEPTION_IF_NULL(prim_anf_node);
     PrimitivePtr node_prim = prim_anf_node->value()->cast<PrimitivePtr>();
@@ -384,6 +389,7 @@ bool FindReshapeNextNodeStraCosts(const CNodePtr &cnode, OperatorInfoPtr *next_o
     auto op_info = use_apply->user_data<OperatorInfo>();
     if (IsParallelCareNode(use_apply) && (op_info != nullptr)) {
       MS_LOG(INFO) << "FindReshapeNextNodeStraCosts success prim " << node_prim->name();
+      *is_next_reshape = false;
       *next_operator_info = op_info;
       *in_index = node_pair.second - 1;
       return true;
@@ -391,7 +397,7 @@ bool FindReshapeNextNodeStraCosts(const CNodePtr &cnode, OperatorInfoPtr *next_o
     MS_LOG(DEBUG) << "FindReshapeNextNodeStraCosts failed prim " << node_prim->name() << "  "
                   << IsParallelCareNode(use_apply) << "   " << (op_info != nullptr);
 
-    if (FindReshapeNextNodeStraCosts(use_apply, next_operator_info, in_index, ++curr_depth)) {
+    if (FindReshapeNextNodeStraCosts(use_apply, next_operator_info, in_index, is_next_reshape, ++curr_depth)) {
       return true;
     }
   }
diff --git a/mindspore/ccsrc/frontend/parallel/graph_util/node_info.h b/mindspore/ccsrc/frontend/parallel/graph_util/node_info.h
index 88f9ff64684..28f514db3f9 100644
--- a/mindspore/ccsrc/frontend/parallel/graph_util/node_info.h
+++ b/mindspore/ccsrc/frontend/parallel/graph_util/node_info.h
@@ -51,7 +51,7 @@ bool FindReshapePreNodeStraCosts(const AnfNodePtr &node, OperatorInfoPtr *pre_op
                                  size_t curr_depth);
 
 bool FindReshapeNextNodeStraCosts(const CNodePtr &cnode, OperatorInfoPtr *next_operator_info, int64_t *in_index,
-                                  size_t curr_depth);
+                                  bool *is_next_reshape, size_t curr_depth);
 void SetUserAttrs(const std::unordered_map<std::string, ValuePtr> &origin_prim_attrs, PrimitivePtr self_prim);
 }  // namespace parallel
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/frontend/parallel/ops_info/conv2d_info.cc b/mindspore/ccsrc/frontend/parallel/ops_info/conv2d_info.cc
index 39d998aa2aa..092e63f15ae 100644
--- a/mindspore/ccsrc/frontend/parallel/ops_info/conv2d_info.cc
+++ b/mindspore/ccsrc/frontend/parallel/ops_info/conv2d_info.cc
@@ -143,54 +143,97 @@ Status Conv2DInfo::CheckHWStrategyBase(int64_t h_strategy, int64_t w_strategy) {
   return SUCCESS;
 }
 
+Status Conv2DInfo::CheckHWStrategySameMode(int64_t h_strategy, int64_t w_strategy) {
+  int64_t h_slice_shape = inputs_shape_[0][2] / h_strategy;
+  int64_t w_slice_shape = inputs_shape_[0][3] / w_strategy;
+
+  // H dimension
+  if (kernel_size_[0] > stride_[2] && h_strategy > 1) {
+    MS_LOG(ERROR) << name_ << ": The 'same' mode do not support to split H when kernel_size > stride";
+    return FAILED;
+  }
+
+  if (h_strategy > 1 && (kernel_size_[0] <= stride_[2] && h_slice_shape % stride_[2] != 0)) {
+    MS_LOG(ERROR) << name_
+                  << ": The 'same' mode do not support to split H when kernel_size <= stride but slice shape "
+                     "is not divisible by stride ";
+    return FAILED;
+  }
+
+  // W dimension
+  if (w_strategy > 1 && (kernel_size_[1] <= stride_[3] && w_slice_shape % stride_[3] != 0)) {
+    MS_LOG(ERROR) << name_
+                  << ": The 'same' mode do not support to split W when kernel_size <= stride but slice shape "
+                     "is not divisible by stride ";
+    return FAILED;
+  }
+
+  if (w_strategy > 1 && (kernel_size_[1] > stride_[3])) {
+    if (inputs_shape_[0][3] % stride_[3] != 0) {
+      MS_LOG(ERROR) << name_
+                    << ": The 'same' mode do not support to split W when kernel_size > stride but w shape is not "
+                       "divisible by stride";
+      return FAILED;
+    }
+
+    if (w_slice_shape < ((kernel_size_[1] - stride_[3] + 1) / 2)) {
+      MS_LOG(ERROR) << name_
+                    << ": The 'same' mode do not support to split W when kernel_size > stride but w slice shape is "
+                       "smaller than (k - s + 1) / 2";
+      return FAILED;
+    }
+
+    if (kernel_size_[1] - stride_[3] == 1) {
+      MS_LOG(ERROR) << name_ << ": The 'same' mode do not support to split W when kernel_size > stride but k - s == 1";
+      return FAILED;
+    }
+  }
+
+  return SUCCESS;
+}
+
+Status Conv2DInfo::CheckHWStrategyValidMode(int64_t h_strategy, int64_t w_strategy) {
+  int64_t h_slice_shape = inputs_shape_[0][2] / h_strategy;
+  int64_t w_slice_shape = inputs_shape_[0][3] / w_strategy;
+
+  if ((kernel_size_[0] > stride_[2] && h_strategy > 1) || (kernel_size_[1] > stride_[3] && w_strategy > 1)) {
+    MS_LOG(ERROR) << name_ << ": The 'valid' mode do not support to split H or W when kernel_size > stride";
+    return FAILED;
+  }
+
+  if (kernel_size_[0] <= stride_[2] && h_slice_shape % stride_[2] != 0) {
+    MS_LOG(ERROR) << name_
+                  << ": The 'valid' mode do not support to split H when kernel_size <= stride but slice shape is "
+                     "not divisible by stride ";
+    return FAILED;
+  }
+
+  if (kernel_size_[1] <= stride_[3] && w_slice_shape % stride_[3] != 0) {
+    MS_LOG(ERROR) << name_
+                  << ": The 'valid' mode do not support to split W when kernel_size <= stride but slice shape is "
+                     "not divisible by stride ";
+    return FAILED;
+  }
+
+  return SUCCESS;
+}
+
 Status Conv2DInfo::CheckHWStrategy(int64_t h_strategy, int64_t w_strategy) {
   if (CheckHWStrategyBase(h_strategy, w_strategy) != SUCCESS) {
     return FAILED;
   }
 
-  int64_t h_slice_shape = inputs_shape_[0][2] / h_strategy;
-  int64_t w_slice_shape = inputs_shape_[0][3] / w_strategy;
-
   if (pad_mode_ == 0) {  // 'pad' mode
     MS_LOG(ERROR) << name_ << ": The 'pad' mode do not support to split H or W";
     return FAILED;
   }
 
   if (pad_mode_ == 1) {  // 'same' mode
-    if ((kernel_size_[0] > stride_[2] || kernel_size_[1] > stride_[3]) && h_strategy > 1) {
-      MS_LOG(ERROR) << name_ << ": The 'same' mode do not support to split H when kernel_size > stride";
-      return FAILED;
-    }
-
-    if (kernel_size_[0] <= stride_[2] || kernel_size_[1] <= stride_[3]) {
-      if (h_slice_shape % stride_[2] != 0 || w_slice_shape % stride_[3] != 0) {
-        MS_LOG(ERROR) << name_
-                      << ": The 'same' mode do not support to split H or W when kernel_size <= stride but slice shape "
-                         "is not divisible by stride ";
-        return FAILED;
-      }
-    }
+    return CheckHWStrategySameMode(h_strategy, w_strategy);
   }
 
   if (pad_mode_ == 2) {  // 'valid' mode
-    if ((kernel_size_[0] > stride_[2] && h_strategy > 1) || (kernel_size_[1] > stride_[3] && w_strategy > 1)) {
-      MS_LOG(ERROR) << name_ << ": The 'valid' mode do not support to split H or W when kernel_size > stride";
-      return FAILED;
-    }
-
-    if (kernel_size_[0] <= stride_[2] && h_slice_shape % stride_[2] != 0) {
-      MS_LOG(ERROR) << name_
-                    << ": The 'valid' mode do not support to split H when kernel_size <= stride but slice shape is "
-                       "not divisible by stride ";
-      return FAILED;
-    }
-
-    if (kernel_size_[1] <= stride_[3] && w_slice_shape % stride_[3] != 0) {
-      MS_LOG(ERROR) << name_
-                    << ": The 'valid' mode do not support to split W when kernel_size <= stride but slice shape is "
-                       "not divisible by stride ";
-      return FAILED;
-    }
+    return CheckHWStrategyValidMode(h_strategy, w_strategy);
   }
 
   return SUCCESS;
@@ -493,10 +536,18 @@ void Conv2DInfo::InferSendRecvFlag() {
                << right_need_recv_;
 
   if (left_need_send_) {
+    if (left_rank_overlap_right_size_ > input_slice_shape_[3]) {
+      MS_LOG(EXCEPTION) << name_ << ": Do not support left overlap size(" << left_rank_overlap_right_size_
+                        << ") larger than slice shape in w dimension(" << input_slice_shape_[3] << ")";
+    }
     send_rank_ids_.push_back(left_rank_id_);
   }
 
   if (right_need_send_) {
+    if (right_rank_overlap_left_size_ > input_slice_shape_[3]) {
+      MS_LOG(EXCEPTION) << name_ << ": Do not support left overlap size(" << right_rank_overlap_left_size_
+                        << ") larger than slice shape in w dimension(" << input_slice_shape_[3] << ")";
+    }
     send_rank_ids_.push_back(right_rank_id_);
   }
 
@@ -869,15 +920,8 @@ Status Conv2DBackpropInputInfo::CheckHWStrategy(int64_t h_strategy, int64_t w_st
   }
 
   if (h_strategy > 1) {
-    if (inputs_shape_[0][2] * stride_[2] != outputs_shape_[0][2]) {
-      MS_LOG(ERROR) << name_ << ": Do not support to split h dimension when in_shape * stride != out_shape";
-      return FAILED;
-    }
-
-    if (kernel_size_[0] > stride_[2]) {
-      MS_LOG(ERROR) << name_ << ": Do not support to split h dimension when kernel size larger than stride";
-      return FAILED;
-    }
+    MS_LOG(ERROR) << name_ << ": Do not support to split h dimension";
+    return FAILED;
   }
 
   if (w_strategy > 1 && inputs_shape_[0][3] * stride_[3] != outputs_shape_[0][3]) {
diff --git a/mindspore/ccsrc/frontend/parallel/ops_info/conv2d_info.h b/mindspore/ccsrc/frontend/parallel/ops_info/conv2d_info.h
index 3786dc5f826..539105a2f7d 100644
--- a/mindspore/ccsrc/frontend/parallel/ops_info/conv2d_info.h
+++ b/mindspore/ccsrc/frontend/parallel/ops_info/conv2d_info.h
@@ -115,6 +115,10 @@ class Conv2DInfo : public OperatorInfo {
   virtual void InferNewPadList();
   virtual int64_t ComputeOverlapLeftSizeByRankBias(int64_t rank_bias);
   virtual int64_t ComputeOverlapRightSizeByRankBias(int64_t rank_bias);
+
+ private:
+  Status CheckHWStrategySameMode(int64_t h_strategy, int64_t w_strategy);
+  Status CheckHWStrategyValidMode(int64_t h_strategy, int64_t w_strategy);
 };
 
 class Conv2DBackpropInputInfo : public Conv2DInfo {
diff --git a/mindspore/ccsrc/frontend/parallel/ops_info/maxpool_info.cc b/mindspore/ccsrc/frontend/parallel/ops_info/maxpool_info.cc
index ed6b2592f59..2974e190873 100644
--- a/mindspore/ccsrc/frontend/parallel/ops_info/maxpool_info.cc
+++ b/mindspore/ccsrc/frontend/parallel/ops_info/maxpool_info.cc
@@ -76,6 +76,20 @@ Status MaxPoolInfo::GetAttrs() {
 }
 
 Status MaxPoolInfo::CheckHWStrategy(int64_t h_strategy, int64_t w_strategy) {
+  if (outputs_shape_[0][2] % h_strategy != 0) {
+    MS_LOG(ERROR) << name_
+                  << ": Do not support to split h dimension when out_shape of h dimension is not divisible by strategy "
+                     "of h dimension";
+    return FAILED;
+  }
+
+  if (outputs_shape_[0][3] % w_strategy != 0) {
+    MS_LOG(ERROR) << name_
+                  << ": Do not support to split w dimension when out_shape of w dimension is not divisible by strategy "
+                     "of w dimension";
+    return FAILED;
+  }
+
   if (h_strategy > 1) {
     if (kernel_size_[2] > stride_[2]) {
       MS_LOG(ERROR) << name_ << ": It does not support to split H dimension when kernel_size > stride";
diff --git a/mindspore/ccsrc/frontend/parallel/ops_info/ops_utils.h b/mindspore/ccsrc/frontend/parallel/ops_info/ops_utils.h
index c0c89beb245..78b5ddd034e 100644
--- a/mindspore/ccsrc/frontend/parallel/ops_info/ops_utils.h
+++ b/mindspore/ccsrc/frontend/parallel/ops_info/ops_utils.h
@@ -134,6 +134,7 @@ constexpr char FUSION[] = "fusion";
 constexpr char DO_MIRROR[] = "do_mirror";
 constexpr char RECOMPUTE[] = "recompute";
 constexpr char RECOMPUTE_COMM_OP[] = "recompute_comm_op";
+constexpr char NOT_RECOMPUTE[] = "not_recompute";
 constexpr char NUM_SAMPLED[] = "num_sampled";
 constexpr char NUM_TRUE[] = "num_true";
 constexpr char SEED[] = "seed";
@@ -193,7 +194,7 @@ constexpr char FORWARD_REDUCE_SCATTER[] = "forward_reduce_scatter";
 constexpr char FIELD_SIZE[] = "field_size";
 constexpr char OPTIMIZER_SUB_STRING[] = "optimizer";
 constexpr char DEVICE[] = "Device";
-constexpr char PARALLEL_OPTIMIZER_ALLGATHER[] = "parallel_optimizer_allgather";
+constexpr char PARALLEL_OPTIMIZER_ALLGATHER[] = "parallel_optimizer_allgather_not_recompute";
 constexpr char CELLLIST_KEYWORD_PATTERN[] = "-CellList/(\\d+)-";
 
 constexpr char OUT_CHANNEL[] = "out_channel";
diff --git a/mindspore/ccsrc/frontend/parallel/ops_info/reshape_info.cc b/mindspore/ccsrc/frontend/parallel/ops_info/reshape_info.cc
index 57f8755473e..64ce583b730 100644
--- a/mindspore/ccsrc/frontend/parallel/ops_info/reshape_info.cc
+++ b/mindspore/ccsrc/frontend/parallel/ops_info/reshape_info.cc
@@ -443,7 +443,8 @@ std::vector<StrategyPtr> ReshapeInfo::GenerateOpStrategies(int64_t) {
 
 Status ReshapeInfo::GenetateStrategyCosts(const std::vector<std::shared_ptr<StrategyWithCost>> &pre_stra_costs,
                                           const std::vector<std::shared_ptr<StrategyWithCost>> &next_stra_costs,
-                                          int64_t out_index, int64_t in_index, bool is_prev_param) {
+                                          int64_t out_index, int64_t in_index, bool is_prev_param,
+                                          bool is_next_reshape) {
   is_generating_costs_ = true;
   for (auto pre_stra_cost : pre_stra_costs) {
     std::vector<TensorInfo> pre_out_tensor_infos;
@@ -466,7 +467,12 @@ Status ReshapeInfo::GenetateStrategyCosts(const std::vector<std::shared_ptr<Stra
     }
     Strategys stra_inputs = {stra};
     StrategyPtr reshape_stra = std::make_shared<Strategy>(pre_stra_cost->strategy_ptr->GetInputStage(), stra_inputs);
-    if (next_stra_costs.empty()) {
+    if (is_next_reshape) {
+      SetOutputLayout(pre_out_tensor_info.tensor_layout());
+      ResetQueueMember();
+      InferTensorInfoByLayout();
+      SetCostForReshape(reshape_stra);
+    } else if (next_stra_costs.empty()) {
       if (Init(nullptr) == FAILED) {
         MS_LOG(ERROR) << "Failure:operator reshape init failed";
         return FAILED;
@@ -481,6 +487,7 @@ Status ReshapeInfo::GenetateStrategyCosts(const std::vector<std::shared_ptr<Stra
         return FAILED;
       }
       TensorInfo next_in_tensor_info = next_in_tensor_infos[LongToSize(in_index)];
+
       SetOutputLayout(next_in_tensor_info.tensor_layout());
       ResetQueueMember();
       InferTensorInfoByLayout();
diff --git a/mindspore/ccsrc/frontend/parallel/ops_info/reshape_info.h b/mindspore/ccsrc/frontend/parallel/ops_info/reshape_info.h
index ec2939237ae..41136711263 100644
--- a/mindspore/ccsrc/frontend/parallel/ops_info/reshape_info.h
+++ b/mindspore/ccsrc/frontend/parallel/ops_info/reshape_info.h
@@ -60,7 +60,7 @@ class ReshapeInfo : public OperatorInfo {
   void set_next_operator_index(int64_t next_index) { next_operator_index_ = next_index; }
   Status GenetateStrategyCosts(const std::vector<std::shared_ptr<StrategyWithCost>> &pre_stra_costs,
                                const std::vector<std::shared_ptr<StrategyWithCost>> &next_stra_costs, int64_t out_index,
-                               int64_t in_index, bool is_prev_param);
+                               int64_t in_index, bool is_prev_param, bool is_next_reshape);
   Status InitForCostModel(const StrategyPtr &strategy) override;
   Status GenerateStrategies(int64_t stage_id) override;
   std::vector<StrategyPtr> GenerateOpStrategies(int64_t stage_id) override;
diff --git a/mindspore/ccsrc/frontend/parallel/pipeline_transformer/pipeline_transformer.cc b/mindspore/ccsrc/frontend/parallel/pipeline_transformer/pipeline_transformer.cc
index 40bbb936bb6..1d82aa182ea 100644
--- a/mindspore/ccsrc/frontend/parallel/pipeline_transformer/pipeline_transformer.cc
+++ b/mindspore/ccsrc/frontend/parallel/pipeline_transformer/pipeline_transformer.cc
@@ -30,6 +30,7 @@
 #include "frontend/parallel/node_check.h"
 #include "frontend/parallel/graph_util/node_info.h"
 #include "frontend/parallel/graph_util/pipeline_split_utils.h"
+#include "frontend/parallel/step_parallel_utils.h"
 #include "ir/anf.h"
 #include "ir/graph_utils.h"
 #include "base/core_ops.h"
diff --git a/mindspore/ccsrc/frontend/parallel/step_auto_parallel.cc b/mindspore/ccsrc/frontend/parallel/step_auto_parallel.cc
index 4d676c4a714..72fcbd3888f 100644
--- a/mindspore/ccsrc/frontend/parallel/step_auto_parallel.cc
+++ b/mindspore/ccsrc/frontend/parallel/step_auto_parallel.cc
@@ -43,6 +43,7 @@
 #include "frontend/parallel/ops_info/reshape_info.h"
 #include "frontend/parallel/ops_info/tmp_identity_info.h"
 #include "frontend/parallel/step_parallel.h"
+#include "frontend/parallel/parameter_manager.h"
 #include "frontend/parallel/strategy_checkpoint/parallel_strategy_checkpoint.h"
 #include "ir/anf.h"
 #include "ir/param_info.h"
@@ -874,8 +875,9 @@ void ReshapeCostCompute(const std::vector<AnfNodePtr> &all_nodes) {
     // get next node's strategy_cost_
     int64_t in_index = 0;
     OperatorInfoPtr next_operator_info;
+    bool is_next_reshape = false;
     std::vector<std::shared_ptr<StrategyWithCost>> next_stra_costs;
-    bool find_next_node = FindReshapeNextNodeStraCosts(cnode, &next_operator_info, &in_index, 0);
+    bool find_next_node = FindReshapeNextNodeStraCosts(cnode, &next_operator_info, &in_index, &is_next_reshape, 0);
     if (!find_next_node) {
       MS_LOG(INFO) << "FindReshapeNextNodeStraCosts for reshape failed";
     }
@@ -890,8 +892,8 @@ void ReshapeCostCompute(const std::vector<AnfNodePtr> &all_nodes) {
       reshape_info->set_next_operator_index(in_index);
     }
     bool is_prev_param = pre_node->isa<Parameter>();
-    if (reshape_info->GenetateStrategyCosts(pre_stra_costs, next_stra_costs, out_index, in_index, is_prev_param) !=
-        SUCCESS) {
+    if (reshape_info->GenetateStrategyCosts(pre_stra_costs, next_stra_costs, out_index, in_index, is_prev_param,
+                                            is_next_reshape) != SUCCESS) {
       MS_LOG(EXCEPTION) << "reshape generate strategy_costs failed!";
     }
   }
diff --git a/mindspore/ccsrc/frontend/parallel/step_parallel.cc b/mindspore/ccsrc/frontend/parallel/step_parallel.cc
index 043f8dd9833..a20615d4384 100644
--- a/mindspore/ccsrc/frontend/parallel/step_parallel.cc
+++ b/mindspore/ccsrc/frontend/parallel/step_parallel.cc
@@ -39,6 +39,7 @@
 #include "frontend/parallel/graph_util/node_info.h"
 #include "frontend/parallel/graph_util/pipeline_split_utils.h"
 #include "frontend/parallel/node_check.h"
+#include "frontend/parallel/parameter_manager.h"
 #include "frontend/parallel/ops_info/matmul_info.h"
 #include "ir/param_info.h"
 #include "ir/tensor.h"
@@ -141,28 +142,6 @@ std::vector<AnfNodePtr> CreateInput(const Operator &op, const AnfNodePtr &node,
   return new_node_input;
 }
 
-bool ParameterIsCloned(const AnfNodePtr &parameter_node) {
-  MS_EXCEPTION_IF_NULL(parameter_node);
-  auto cloned_parameter = parameter_node->cast<ParameterPtr>();
-  MS_EXCEPTION_IF_NULL(cloned_parameter);
-
-  // find the clone parameter
-  if (!cloned_parameter->has_default()) {
-    return false;
-  }
-  auto param_value = cloned_parameter->param_info();
-  if (param_value == nullptr) {
-    return false;
-  }
-  bool cloned = param_value->cloned();
-  if (!cloned) {
-    return false;
-  }
-
-  MS_LOG(INFO) << "The parameter: " << cloned_parameter->name() << " is cloned";
-  return true;
-}
-
 std::vector<AnfNodePtr> CreateMirrorInput(const FuncGraphPtr &root, const Operator &op, const AnfNodePtr &node,
                                           const std::string &instance_name, const std::string &weight_name) {
   MS_EXCEPTION_IF_NULL(root);
@@ -261,6 +240,9 @@ void InsertNode(const Operator &op, const CNodePtr &node, size_t index, const An
   PrimitivePtr new_node_prim = new_node_value->value()->cast<PrimitivePtr>();
   new_node_prim->set_instance_name(instance_name);
   new_node_prim->set_attr("keep_value_node_input", MakeValue(true));
+  if (instance_name.find(NOT_RECOMPUTE) != std::string::npos) {
+    new_node_prim->set_attr("recompute", MakeValue(false));
+  }
   new_node->set_scope(scope);
   node_input[0]->set_scope(scope);
   manager->SetEdge(node, SizeToLong(index), new_node);
@@ -290,6 +272,9 @@ static CNodePtr ReplaceNode(const Operator &op, const AnfNodePtr &pre_node, cons
   auto new_node_prim = GetValueNode<PrimitivePtr>(node_input[0]);
   new_node_prim->set_instance_name(instance_name);
   new_node_prim->set_attr("keep_value_node_input", MakeValue(true));
+  if (instance_name.find(NOT_RECOMPUTE) != std::string::npos) {
+    new_node_prim->set_attr("recompute", MakeValue(false));
+  }
   new_node->set_scope(scope);
   node_input[0]->set_scope(scope);
   manager->Replace(pre_node, new_node);
@@ -394,6 +379,18 @@ void InsertRedistribution(const RedistributionOpListPtr &redistribution_oplist_p
     std::string op_name = (redistribution_oplist_ptr->first)[index].first;
     std::string instance_name_base = REDISTRIBUTION_OP;
     std::string instance_name = instance_name_base + "_" + CreateInstanceName(pre_node, index) + op_name;
+    auto prim_out = GetCNodePrimitive(node);
+    auto prim_in = GetCNodePrimitive(pre_node);
+    if (prim_out != nullptr && prim_in != nullptr) {
+      auto prim_out_attr = prim_out->attrs();
+      auto prim_in_attr = prim_in->attrs();
+      if (prim_out_attr.find(RECOMPUTE_COMM_OP) != prim_out_attr.end() &&
+          prim_in_attr.find(RECOMPUTE_COMM_OP) != prim_in_attr.end() &&
+          COMMUNICATION_OPS.find(op_name) != COMMUNICATION_OPS.end()) {
+        MS_LOG(INFO) << "The redistribution node would not be recomputed.";
+        instance_name = instance_name + "_" + NOT_RECOMPUTE;
+      }
+    }
     InsertNode(op, node, LongToSize(pos), target_node, func_graph, instance_name);
     if ((redistribution_oplist_ptr->second)[index].first) {
       target_node = node->input(LongToSize(pos));
@@ -443,12 +440,7 @@ TensorLayout GetTensorInLayout(const CNodePtr &middle_node, const PrimitivePtr &
 }
 
 std::string GetPrimName(const CNodePtr &node) {
-  MS_EXCEPTION_IF_NULL(node);
-  if (!IsValueNode<Primitive>(node->input(0))) {
-    MS_LOG(EXCEPTION) << "The node is not a primitive";
-  }
-  auto value_node = node->input(0)->cast<ValueNodePtr>();
-  auto prim = GetValueNode<PrimitivePtr>(value_node);
+  auto prim = GetCNodePrimitive(node);
   MS_EXCEPTION_IF_NULL(prim);
   return prim->name();
 }
@@ -579,31 +571,6 @@ bool FindCommunicationOp(const std::vector<AnfNodePtr> &all_nodes) {
   return false;
 }
 
-bool IsParallelCareNode(const CNodePtr &cnode) {
-  MS_EXCEPTION_IF_NULL(cnode);
-  ValueNodePtr prim_node = cnode->input(0)->cast<ValueNodePtr>();
-  if (prim_node == nullptr) {
-    return false;
-  }
-  PrimitivePtr prim = prim_node->value()->cast<PrimitivePtr>();
-  if (prim == nullptr) {
-    return false;
-  }
-  if (IsInParallelBlackList(prim)) {
-    MS_LOG(DEBUG) << "Parallel don't care node: " << prim->name();
-    return false;
-  }
-  // get_next is not in the forward graph, we need mark the get_next as the forward node
-  if (prim->name() == GET_NEXT || prim->name() == VIRTUAL_OUTPUT) {
-    return true;
-  }
-  if ((prim->name() == CAST) && !cnode->has_user_data<OperatorInfo>()) {
-    return false;
-  }
-
-  return cnode->in_forward_flag();
-}
-
 void StepRedistribution(const CNodePtr &node, const OperatorInfoPtr &distribute_operator, const CNodePtr &insert_node,
                         const TensorRedistribution &tensor_redistribution, const CNodePtr &pre_node) {
   MS_EXCEPTION_IF_NULL(node->func_graph());
@@ -881,6 +848,11 @@ void StepReplaceOp(OperatorVector replace_op, const CNodePtr &node) {
     PrimitivePtr prim = GetValueNode<PrimitivePtr>(replace_node->input(0));
     PrimitivePtr origin_prim = GetValueNode<PrimitivePtr>(node->input(0));
     SetUserAttrs(origin_prim->attrs(), prim);
+    if (origin_prim->attrs().find(RECOMPUTE_COMM_OP) != origin_prim->attrs().end() &&
+        COMMUNICATION_OPS.find(prim->name()) != COMMUNICATION_OPS.end()) {
+      MS_LOG(INFO) << "The redistribution node in reshape would not be recomputed.";
+      prim->set_attr("recompute", MakeValue(false));
+    }
     if (index == replace_op.size() - 1) {
       replace_node->set_user_data<OperatorInfo>(node->user_data<OperatorInfo>());
       replace_node->set_primal_attrs(node->primal_attrs());
@@ -898,16 +870,6 @@ void StepReplaceOp(OperatorVector replace_op, const CNodePtr &node) {
   MS_LOG(INFO) << "Insert ReplaceOp success for " << distribute_operator->name();
 }
 
-bool IsSomePrimitive(const CNodePtr &cnode, const std::string &name) {
-  if (!cnode) {
-    return false;
-  }
-  ValueNodePtr anf_node = cnode->input(0)->cast<ValueNodePtr>();
-  MS_EXCEPTION_IF_NULL(anf_node);
-  PrimitivePtr prim = anf_node->value()->cast<PrimitivePtr>();
-  return (prim->name() == name);
-}
-
 void StepReplaceGraph(const ReplaceGraphPtr &replace_graph, const CNodePtr &node) {
   MS_EXCEPTION_IF_NULL(replace_graph);
   MS_EXCEPTION_IF_NULL(node);
@@ -1468,72 +1430,6 @@ StrategyPtr ExtractStrategy(const ValuePtr &stra) {
   return strategyPtr;
 }
 
-Shapes GetValueListShape(const AnfNodePtr &node) {
-  Shapes shapes;
-  std::vector<ValuePtr> inputs_seq;
-  if (IsValueNode<ValueList>(node)) {
-    inputs_seq = node->cast<ValueNodePtr>()->value()->cast<ValueListPtr>()->value();
-  } else if (IsValueNode<ValueTuple>(node)) {
-    inputs_seq = node->cast<ValueNodePtr>()->value()->cast<ValueTuplePtr>()->value();
-  } else {
-    MS_LOG(EXCEPTION) << "node is eigther ValueList or ValueTuple";
-  }
-  for (auto &ele : inputs_seq) {
-    auto tensor = ele->cast<tensor::TensorPtr>();
-    MS_EXCEPTION_IF_NULL(tensor);
-    auto one_shape = tensor->shape();
-    shapes.push_back(one_shape);
-  }
-  return shapes;
-}
-
-Shapes GetNodeShape(const AnfNodePtr &node) {
-  MS_EXCEPTION_IF_NULL(node);
-  Shapes shapes;
-  if (IsValueNode<ValueList>(node) || IsValueNode<ValueTuple>(node)) {
-    return GetValueListShape(node);
-  }
-  BaseShapePtr base_shape_ptr = node->Shape();
-  if (node->isa<CNode>()) {
-    auto cnode = node->cast<CNodePtr>();
-    if (IsValueNode<Primitive>(cnode->input(0))) {
-      PrimitivePtr prim = GetValueNode<PrimitivePtr>(cnode->input(0));
-      MS_EXCEPTION_IF_NULL(prim);
-      if (prim->name() == MAKEREF) {
-        AnfNodePtr ref_node = cnode->input(1);
-        auto func_graph = cnode->func_graph();
-        MS_EXCEPTION_IF_NULL(ref_node);
-        MS_EXCEPTION_IF_NULL(func_graph);
-        return GetRefKeyNodeShape(ref_node, func_graph);
-      }
-    }
-    if (cnode->input(0)->isa<CNode>()) {
-      if (cnode->inputs().size() < 2) {
-        MS_LOG(EXCEPTION) << "GetNodeShape: " << node->ToString() << " size is smaller than 2";
-      }
-      base_shape_ptr = cnode->input(1)->Shape();
-    }
-  }
-  if (base_shape_ptr == nullptr) {
-    MS_LOG(EXCEPTION) << "GetNodeShape: " << node->ToString() << " shape_ptr is nullptr, full name is "
-                      << node->fullname_with_scope();
-  }
-  auto tuple_shape_ptr = dyn_cast<abstract::SequeueShape>(base_shape_ptr);
-  if (tuple_shape_ptr != nullptr) {
-    auto tuple_shape = tuple_shape_ptr->shape();
-    for (auto &shape : tuple_shape) {
-      auto each_shape = dyn_cast<abstract::Shape>(shape);
-      MS_EXCEPTION_IF_NULL(each_shape);
-      shapes.push_back(each_shape->shape());
-    }
-  } else {
-    auto shape_ptr = dyn_cast<abstract::Shape>(base_shape_ptr);
-    MS_EXCEPTION_IF_NULL(shape_ptr);
-    shapes.push_back(shape_ptr->shape());
-  }
-  return shapes;
-}
-
 Shapes GetRefKeyNodeShape(const AnfNodePtr &node, const FuncGraphPtr &func_graph) {
   MS_EXCEPTION_IF_NULL(node);
   MS_EXCEPTION_IF_NULL(func_graph);
@@ -1918,91 +1814,6 @@ void CoverSliceShape(const FuncGraphPtr &root) {
   g_RefMap.clear();
 }
 
-void SetClonedTensorShapeForOptimizer(const FuncGraphPtr &root) {
-  MS_EXCEPTION_IF_NULL(root);
-  for (auto &cloned_parameter_node : root->parameters()) {
-    MS_EXCEPTION_IF_NULL(cloned_parameter_node);
-    auto cloned_parameter = cloned_parameter_node->cast<ParameterPtr>();
-    MS_EXCEPTION_IF_NULL(cloned_parameter);
-
-    if (!ParameterIsCloned(cloned_parameter_node)) {
-      continue;
-    }
-    auto param_value = cloned_parameter->param_info();
-    if (param_value == nullptr) {
-      continue;
-    }
-    // get the cloned index
-    int64_t cloned_index = param_value->cloned_index();
-
-    // find the be cloned parameter
-    bool found_be_cloned_parameter = false;
-    ParameterPtr cloned_from_parameter = nullptr;
-    AnfNodePtr cloned_from_node = nullptr;
-    for (auto &be_cloned_parameter_node : root->parameters()) {
-      MS_EXCEPTION_IF_NULL(be_cloned_parameter_node);
-      auto be_cloned_parameter = be_cloned_parameter_node->cast<ParameterPtr>();
-      MS_EXCEPTION_IF_NULL(be_cloned_parameter);
-      if (!be_cloned_parameter->has_default()) {
-        continue;
-      }
-
-      auto param_value_in = be_cloned_parameter->param_info();
-      if (param_value_in == nullptr) {
-        continue;
-      }
-      if (!param_value_in->be_cloned()) {
-        continue;
-      }
-
-      // get the be cloned index
-      auto &be_cloned_index = param_value_in->be_cloned_index();
-      if (std::find(be_cloned_index.begin(), be_cloned_index.end(), cloned_index) != be_cloned_index.end()) {
-        found_be_cloned_parameter = true;
-        cloned_from_parameter = be_cloned_parameter;
-        cloned_from_node = be_cloned_parameter_node;
-      }
-    }
-
-    if (found_be_cloned_parameter) {
-      // set the shape and tensor layout for cloned parameter
-      std::string param_name = cloned_parameter_node->cast<ParameterPtr>()->name();
-      if (cloned_from_parameter->user_data<TensorLayout>() == nullptr) {
-        MS_LOG(WARNING) << "The parameter " << param_name << " has not tensor layout, skip it";
-        continue;
-      }
-      auto tensor_layout = cloned_from_parameter->user_data<TensorLayout>();
-      MS_EXCEPTION_IF_NULL(cloned_parameter_node->abstract());
-      MS_EXCEPTION_IF_NULL(cloned_from_node->abstract());
-      auto cloned_abstract = cloned_parameter_node->abstract()->Clone();
-      MS_EXCEPTION_IF_NULL(cloned_abstract);
-      // from pipeline or grad accumulation
-      if (param_name.find(ACCU_GRADS) != std::string::npos) {
-        auto slice_shape = cloned_from_parameter->user_data<TensorLayout>()->slice_shape().array();
-        std::shared_ptr<abstract::BaseShape> parallel_shape = std::make_shared<abstract::Shape>(slice_shape);
-        MS_EXCEPTION_IF_NULL(parallel_shape);
-        cloned_abstract->set_shape(parallel_shape);
-        // in opt shard, accu_grad's shape is different from the original param's shape
-        if (ParallelContext::GetInstance()->enable_parallel_optimizer()) {
-          TensorLayout new_layout = *tensor_layout;
-          new_layout.set_opt_shard_group("");
-          tensor_layout = std::make_shared<TensorLayout>(new_layout);
-        }
-      } else {
-        cloned_abstract->set_shape(cloned_from_node->abstract()->GetShapeTrack());
-      }
-      cloned_parameter->set_user_data<TensorLayout>(tensor_layout);
-      cloned_parameter_node->set_abstract(cloned_abstract);
-      MS_LOG(INFO) << "The parameter: " << cloned_parameter->name()
-                   << " is cloned, the be cloned parameter is: " << cloned_from_parameter->name()
-                   << ", clone index is:  " << cloned_index;
-    } else {
-      MS_LOG(EXCEPTION) << "The parameter: " << cloned_parameter->name() << " is cloned, cloned index is  "
-                        << cloned_index << ", but not found the be cloned parameter";
-    }
-  }
-}
-
 void SetVirtualDatasetStrategy(const CNodePtr &node) {
   MS_EXCEPTION_IF_NULL(node);
   MS_EXCEPTION_IF_NULL(ParallelContext::GetInstance());
@@ -2233,7 +2044,7 @@ TensorLayout GetInputLayoutFromCNode(const std::pair<AnfNodePtr, int64_t> &node_
 }
 
 // if reshape's output connect to several primitive, return the first layout found
-std::shared_ptr<TensorLayout> FindNextLayout(const CNodePtr &cnode) {
+std::shared_ptr<TensorLayout> FindNextLayout(const CNodePtr &cnode, bool *next_is_reshape) {
   MS_EXCEPTION_IF_NULL(cnode);
   MS_EXCEPTION_IF_NULL(cnode->func_graph());
   FuncGraphManagerPtr manager = cnode->func_graph()->manager();
@@ -2244,6 +2055,10 @@ std::shared_ptr<TensorLayout> FindNextLayout(const CNodePtr &cnode) {
     if (use_apply == nullptr || !IsValueNode<Primitive>(use_apply->input(0))) {
       continue;
     }
+    if (IsPrimitiveCNode(use_apply, prim::kPrimReshape)) {
+      *next_is_reshape = true;
+      continue;
+    }
     ValueNodePtr prim_anf_node = use_apply->input(0)->cast<ValueNodePtr>();
     MS_EXCEPTION_IF_NULL(prim_anf_node);
     PrimitivePtr node_prim = prim_anf_node->value()->cast<PrimitivePtr>();
@@ -2254,13 +2069,14 @@ std::shared_ptr<TensorLayout> FindNextLayout(const CNodePtr &cnode) {
     }
     if (IsParallelCareNode(use_apply) && use_apply->has_user_data<OperatorInfo>()) {
       MS_LOG(INFO) << "FindNextLayout success prim " << node_prim->name();
+      *next_is_reshape = false;
       auto layout = GetInputLayoutFromCNode(node_pair);
       return std::make_shared<TensorLayout>(layout);
     }
     MS_LOG(DEBUG) << "FindNextLayout failed prim " << node_prim->name() << "  " << IsParallelCareNode(use_apply)
                   << "   " << use_apply->has_user_data<OperatorInfo>();
 
-    auto layout_ptr = FindNextLayout(use_apply);
+    auto layout_ptr = FindNextLayout(use_apply, next_is_reshape);
     if (layout_ptr) {
       return layout_ptr;
     }
@@ -2475,10 +2291,14 @@ void ReshapeInit(const std::vector<AnfNodePtr> &all_nodes) {
       auto reshape_info_ptr = std::dynamic_pointer_cast<ReshapeInfo>(operator_info);
       reshape_info_ptr->SetInputLayout(*prev_layout_ptr);
     }
-    auto next_layout_ptr = FindNextLayout(cnode);
+    bool is_next_reshape = false;
+    auto next_layout_ptr = FindNextLayout(cnode, &is_next_reshape);
     if (next_layout_ptr) {
       auto reshape_info_ptr = std::dynamic_pointer_cast<ReshapeInfo>(operator_info);
       reshape_info_ptr->SetOutputLayout(*next_layout_ptr);
+    } else if (is_next_reshape && prev_layout_ptr != nullptr) {
+      auto reshape_info_ptr = std::dynamic_pointer_cast<ReshapeInfo>(operator_info);
+      reshape_info_ptr->SetOutputLayout(*prev_layout_ptr);
     }
     if (operator_info->Init(nullptr) == FAILED) {
       MS_LOG(EXCEPTION) << "Failure:operator " << prim->ToString() << " init failed";
@@ -2922,41 +2742,6 @@ void ParallelCommunication(const FuncGraphPtr &root, const std::vector<AnfNodePt
   }
 }
 
-namespace {
-void RevertSymbolicKeyInstance(const FuncGraphPtr &root, const AnfNodePtr &node) {
-  MS_EXCEPTION_IF_NULL(root);
-  MS_EXCEPTION_IF_NULL(node);
-  auto symbolic_key = GetValueNode<SymbolicKeyInstancePtr>(node);
-  MS_EXCEPTION_IF_NULL(symbolic_key);
-  auto all_upstream_node = root->manager()->node_users()[node];
-  for (auto &upstream_node : all_upstream_node) {
-    FuncGraphPtr fg = upstream_node.first->func_graph();
-    if (symbolic_key->node()->isa<Parameter>()) {
-      for (auto &param : root->parameters()) {
-        if (*param == *symbolic_key->node()) {
-          AnfNodePtr reverted_node = root->NewCNode({NewValueNode(prim::kPrimEmbed), param});
-          MS_EXCEPTION_IF_NULL(reverted_node);
-          MS_LOG(DEBUG) << "before replace " << node->ToString() << " to node " << reverted_node->DebugString();
-          (void)fg->manager()->Replace(node, reverted_node);
-          MS_LOG(DEBUG) << "revert node " << node->ToString() << " to node " << reverted_node->DebugString();
-        }
-      }
-    }
-  }
-}
-}  // namespace
-
-void HandleSymbolicKeyInstance(const FuncGraphPtr &root, const std::vector<AnfNodePtr> &all_nodes) {
-  MS_EXCEPTION_IF_NULL(root);
-  for (auto &node : all_nodes) {
-    // revert back SymbolicKeyInstance to embed() primitive
-    if (IsValueNode<SymbolicKeyInstance>(node)) {
-      RevertSymbolicKeyInstance(root, node);
-      continue;
-    }
-  }
-}
-
 bool IsCohesiveNode(const CNodePtr &cnode) {
   return IsPrimitiveCNode(cnode, prim::kPrimCast) || IsPrimitiveCNode(cnode, prim::kPrimLoad) ||
          IsPrimitiveCNode(cnode, prim::kPrimAllGather) || IsPrimitiveCNode(cnode, prim::kPrimMiniStepAllGather) ||
@@ -3356,200 +3141,6 @@ void HandleForwardMakeTupleAndMakeList(const std::vector<AnfNodePtr> &all_nodes)
   }
 }
 
-RefKeyPair CNodeWithRefKeys(const AnfNodePtr &cnode) {
-  MS_EXCEPTION_IF_NULL(cnode);
-  std::vector<AnfNodePtr> refkeys;
-  if (cnode->isa<CNode>()) {
-    auto cnode_ptr = cnode->cast<CNodePtr>();
-    auto inputs = cnode_ptr->inputs();
-    for (auto &one_input : inputs) {
-      if (IsValueNode<RefKey>(one_input)) {
-        refkeys.push_back(one_input);
-      }
-    }
-    if (refkeys.size() >= 1) {
-      return std::make_pair(cnode, refkeys);
-    }
-  }
-  return {nullptr, refkeys};
-}
-
-ParameterUsersInfo FindParameterNodeUsers(const AnfNodePtr &node, bool (*IsCareNode)(const CNodePtr &)) {
-  // In this case, node is a Parameter
-  ParameterUsersInfo parameter_user_info;
-  MS_EXCEPTION_IF_NULL(node->func_graph());
-  MS_EXCEPTION_IF_NULL(node->func_graph()->manager());
-  auto candidate_set = node->func_graph()->manager()->node_users()[node];
-  for (auto &candidate : candidate_set) {
-    auto candidate_node = candidate.first;
-    if (IsPrimitiveCNode(candidate_node, prim::kPrimLoad)) {
-      if (candidate.second != 1) {
-        continue;
-      }
-      auto load_node_users = node->func_graph()->manager()->node_users()[candidate_node];
-      for (auto &node_user : load_node_users) {
-        auto cnode = node_user.first->cast<CNodePtr>();
-        if (cnode == nullptr || !cnode->has_user_data<OperatorInfo>() || IsSomePrimitive(cnode, RECEIVE)) {
-          continue;
-        }
-        (void)parameter_user_info.second.second.insert(node_user);
-      }
-    } else {
-      auto c = candidate_node->cast<CNodePtr>();
-      if (c == nullptr || !c->has_user_data<OperatorInfo>() || IsSomePrimitive(c, RECEIVE)) {
-        continue;
-      }
-      (void)parameter_user_info.second.second.insert(candidate);
-    }
-  }
-  parameter_user_info.first = node->cast<ParameterPtr>()->name();
-  parameter_user_info.second.first = node;
-  return parameter_user_info;
-}
-
-ParameterUsersInfo FindRefKeyNodeUsers(const RefKeyPair &ref_key_pair, bool (*IsCareNode)(const CNodePtr &)) {
-  // Dealing with the RefKey case
-  ParameterUsersInfo parameter_user_info;
-  auto refkeys = ref_key_pair.second;
-  auto cnode = ref_key_pair.first;
-
-  auto cnode_ptr = cnode->cast<CNodePtr>();
-  if ((cnode_ptr == nullptr) || !IsValueNode<Primitive>(cnode_ptr->input(0)) || !IsCareNode(cnode_ptr)) {
-    return parameter_user_info;
-  }
-
-  if (refkeys.size() > 1) {
-    MS_LOG(EXCEPTION) << "CNode: " << cnode->fullname_with_scope() << "'s inputs have more than 1 RefKeys";
-  }
-  MS_EXCEPTION_IF_NULL(cnode->func_graph());
-  auto cnode_func_graph = cnode->func_graph();
-  MS_EXCEPTION_IF_NULL(cnode->func_graph()->manager());
-
-  // Find the RefKey being used
-  auto candidate_set_by_refkey = cnode_func_graph->manager()->node_users()[refkeys[0]];
-  for (auto &candidate : candidate_set_by_refkey) {
-    auto candidate_node = candidate.first;
-    auto c = candidate_node->cast<CNodePtr>();
-    if ((c == nullptr) || !IsValueNode<Primitive>(c->input(0)) || !IsCareNode(c)) {
-      continue;
-    }
-    parameter_user_info.second.second.add(candidate);
-  }
-
-  // Find the corresponding Parameter being used
-  std::vector<AnfNodePtr> parameters = FindParameterByRefKeyNode(refkeys[0], cnode_func_graph);
-  if (parameters.size() != 1) {
-    MS_LOG(EXCEPTION) << "Find parameter by ref key node failed";
-  }
-  parameter_user_info.first = parameters[0]->cast<ParameterPtr>()->name();
-  parameter_user_info.second.first = parameters[0];
-  auto candidate_set_by_para = cnode_func_graph->manager()->node_users()[parameters[0]];
-  for (auto &candidate : candidate_set_by_para) {
-    auto candidate_node = candidate.first;
-    auto c = candidate_node->cast<CNodePtr>();
-    if ((c == nullptr) || !IsValueNode<Primitive>(c->input(0)) || !IsCareNode(c)) {
-      continue;
-    }
-    (void)parameter_user_info.second.second.insert(candidate);
-  }
-  return parameter_user_info;
-}
-
-ParameterUsersInfo FindParameterUsers(const AnfNodePtr &node, bool (*IsCareNode)(const CNodePtr &)) {
-  ParameterUsersInfo parameter_users_info;
-
-  auto cnode_with_refkeys = CNodeWithRefKeys(node);
-  if (cnode_with_refkeys.first != nullptr) {
-    // the node is a ref key node
-    return FindRefKeyNodeUsers(cnode_with_refkeys, IsCareNode);
-  } else if (node->isa<Parameter>()) {
-    // the node is a parameter node
-    return FindParameterNodeUsers(node, IsCareNode);
-  }
-
-  return parameter_users_info;
-}
-
-RankList GetGroupByTensorInfo(const TensorInfo &tensor_info) {
-  CheckGlobalDeviceManager();
-  int64_t rank = g_device_manager->global_rank();
-  RankList stage_device_list = g_device_manager->GetDeviceListInThisStage();
-  Shape dev_matrix_shape = tensor_info.tensor_layout().device_arrangement().array();
-  Shape tensor_map = tensor_info.tensor_layout().tensor_map().array();
-
-  DeviceMatrix dev_matrix(rank, stage_device_list, dev_matrix_shape);
-  RankList group_devices;
-  if (dev_matrix.GetDevicesByTensorMap(tensor_map, &group_devices) != SUCCESS) {
-    MS_LOG(EXCEPTION) << "Get devices by tensor map failed";
-  }
-
-  std::sort(group_devices.begin(), group_devices.end());
-  return group_devices;
-}
-
-ParameterSliceInfo GetParameterSliceInfo(const std::pair<AnfNodePtr, int64_t> &param_info) {
-  auto user_cnode = param_info.first->cast<CNodePtr>();
-  MS_EXCEPTION_IF_NULL(user_cnode);
-  auto user_input_index = param_info.second;
-  OperatorInfoPtr op_info = user_cnode->user_data<OperatorInfo>();
-  MS_EXCEPTION_IF_NULL(op_info);
-
-  TensorInfo tensor_info;
-  if (IsPrimitiveCNode(user_cnode, prim::kPrimSend)) {
-    auto param_index = IntToSize(GetValue<int>(user_cnode->GetPrimalAttr(PARAM_INDEX)));
-    tensor_info = op_info->inputs_tensor_info()[param_index];
-  } else {
-    size_t input_tensor_info_size = op_info->inputs_tensor_info().size();
-    if (SizeToLong(input_tensor_info_size) <= user_input_index - 1) {
-      MS_LOG(EXCEPTION) << op_info->name() << ": the size of inputs tensor info is " << input_tensor_info_size
-                        << ", but the index is " << user_input_index - 1;
-    }
-    tensor_info = op_info->inputs_tensor_info()[user_input_index - 1];
-  }
-
-  ParameterSliceInfo parameter_slice_info;
-  parameter_slice_info.slice_shape = tensor_info.slice_shape();
-  parameter_slice_info.group_ranks = GetGroupByTensorInfo(tensor_info);
-  MS_LOG(DEBUG) << "The op name is " << op_info->name() << ", the parameter index is " << user_input_index - 1
-                << ", the slice shape is " << tensor_info.slice_shape() << ", the origin shape is "
-                << tensor_info.shape() << ", the group rank list is " << parameter_slice_info.group_ranks;
-  return parameter_slice_info;
-}
-
-void CheckParameterSplit(const std::vector<AnfNodePtr> &all_nodes) {
-  for (auto &node : all_nodes) {
-    ParameterUsersInfo parameter_users_info = FindParameterUsers(node, IsParallelCareNode);
-    auto users_set = parameter_users_info.second.second;
-    if (users_set.size() <= 1) {
-      continue;
-    }
-
-    auto parameter_name = parameter_users_info.first;
-    MS_LOG(INFO) << "The parameter: " << parameter_name << " has " << users_set.size() << " users";
-    auto first_user = users_set.pop();
-    ParameterSliceInfo parameter_slice_info = GetParameterSliceInfo(first_user);
-    Shape first_user_slice_shape = parameter_slice_info.slice_shape;
-    RankList first_user_group_list = parameter_slice_info.group_ranks;
-
-    for (auto &user : users_set) {
-      ParameterSliceInfo user_slice_info = GetParameterSliceInfo(user);
-      Shape user_slice_shape = user_slice_info.slice_shape;
-      RankList user_group_list = user_slice_info.group_ranks;
-      if (first_user_slice_shape != user_slice_shape) {
-        MS_LOG(EXCEPTION) << "The parameter: " << parameter_name
-                          << " has multiple users, but the slice shapes are different";
-      }
-
-      if (ParallelContext::GetInstance()->pipeline_stage_split_num() == 1 && first_user_group_list != user_group_list) {
-        MS_LOG(EXCEPTION) << "The parameter: " << parameter_name
-                          << " has multiple users, but the group rank list are different, "
-                          << "the group rank list for first user is " << first_user_group_list
-                          << ", and the group rank list for this user is " << user_group_list;
-      }
-    }
-  }
-}
-
 bool CreateGroupsByCkptFile(const std::string &file) {
   GroupInfoMap group_info_map;
   if (StrategyCheckpoint::GetInstance().LoadGroupInfo(file, &group_info_map) != SUCCESS) {
@@ -3563,154 +3154,6 @@ bool CreateGroupsByCkptFile(const std::string &file) {
   return true;
 }
 
-bool IsUsedParameter(const FuncGraphPtr &graph, const AnfNodePtr &parameter) {
-  MS_EXCEPTION_IF_NULL(graph);
-  MS_EXCEPTION_IF_NULL(parameter);
-  auto manager = graph->manager();
-  auto node_users = manager->node_users()[parameter];
-  if (node_users.empty()) {
-    return false;
-  }
-  for (auto node_user : node_users) {
-    auto use_node = node_user.first->cast<CNodePtr>();
-    if (IsValueNode<FuncGraph>(use_node->input(0))) {
-      auto graph_sub = GetValueNode<FuncGraphPtr>(use_node->input(0));
-      auto parameters = graph_sub->parameters();
-      auto parameter_sub = parameters[node_user.second - 1];
-      return IsUsedParameter(graph_sub, parameter_sub);
-    }
-    if (use_node->input(0)->isa<CNode>()) {
-      auto cnode = use_node->input(0)->cast<CNodePtr>();
-      if (!IsSomePrimitive(cnode, J) || !IsValueNode<FuncGraph>(cnode->input(1))) {
-        return true;
-      }
-      auto graph_sub = GetValueNode<FuncGraphPtr>(cnode->input(1));
-      auto parameters = graph_sub->parameters();
-      auto parameter_sub = parameters[node_user.second - 1];
-      return IsUsedParameter(graph_sub, parameter_sub);
-    }
-    return true;
-  }
-  return true;
-}
-
-static void HandleNoUsedParameter(const FuncGraphPtr &root) {
-  MS_EXCEPTION_IF_NULL(root);
-  bool full_batch = ParallelContext::GetInstance()->full_batch();
-  if (full_batch) {
-    return;
-  }
-
-  // in grad accumulation mode, if use dynamic lr, it has some parameters in optimizer which no used for first graph,
-  // but used for second graph(such as global_step), so can not change their shapes
-  int64_t grad_accumulation_step = ParallelContext::GetInstance()->grad_accumulation_step();
-  if (grad_accumulation_step > 1) {
-    MS_LOG(INFO) << "In grad accumulation mode, do not handle no used parameters";
-    return;
-  }
-
-  auto dev_num = g_device_manager->stage_device_num();
-  auto parameters = root->parameters();
-  for (auto &parameter : parameters) {
-    if (IsUsedParameter(root, parameter)) {
-      continue;
-    }
-    auto parameter_shape = GetNodeShape(parameter);
-    if (parameter_shape.empty()) {
-      continue;
-    }
-    Shape slice_shape = parameter_shape[0];
-    if (slice_shape.empty()) {
-      continue;
-    }
-    slice_shape[0] = slice_shape[0] / dev_num;
-    auto slice_shape_ptr = std::make_shared<abstract::Shape>(slice_shape);
-    auto abstract = parameter->abstract();
-    MS_EXCEPTION_IF_NULL(abstract);
-    auto abstract_cloned = abstract->Clone();
-    MS_EXCEPTION_IF_NULL(abstract_cloned);
-    abstract_cloned->set_shape(slice_shape_ptr);
-    parameter->set_abstract(abstract_cloned);
-  }
-}
-
-static bool IsFullySplitParameter(const ParameterPtr &param_ptr) {
-  auto tensor_layout = param_ptr->user_data<parallel::TensorLayout>();
-  if (tensor_layout == nullptr) {
-    return false;
-  }
-
-  auto dev_mat_shape = tensor_layout->device_arrangement().array();
-  auto tensor_map = tensor_layout->tensor_map().array();
-  int64_t rank = g_device_manager->global_rank();
-  RankList rank_list = g_device_manager->GetDeviceListInThisStage();
-  DeviceMatrix dev_matrix(rank, rank_list, dev_mat_shape);
-  RankList group_devices;
-  if (dev_matrix.GetDevicesByTensorMap(tensor_map, &group_devices) != SUCCESS) {
-    MS_LOG(WARNING) << "Get devices by tensor map failed, invalid tensor layout";
-    return false;
-  }
-
-  if (group_devices.size() == 1) {
-    MS_LOG(INFO) << "The parameter: " << param_ptr->name() << " is fully split";
-    return true;
-  }
-  return false;
-}
-
-static void InsertFullySplitParamGradAccu(const std::pair<AnfNodePtr, int> &node_user,
-                                          const FuncGraphManagerPtr &manager, const AnfNodePtr &accu_parameter) {
-  auto cnode = node_user.first->cast<CNodePtr>();
-  auto prim = GetCNodePrimitive(cnode);
-  if (prim == nullptr) {
-    MS_LOG(WARNING) << cnode->DebugString() << " can not insert fully split param grad accumulation node";
-    return;
-  }
-  OperatorAttrs attrs;
-  auto py_instance = CreatOpInstance(attrs, "_VirtualAdd", "grad_accu");
-  auto value_node = NewValueNode(py_instance);
-  std::vector<AnfNodePtr> virtual_node_input = {value_node, cnode->input(node_user.second), accu_parameter};
-  auto graph = cnode->func_graph();
-  auto virtual_node = graph->NewCNode(virtual_node_input);
-  manager->SetEdge(cnode, node_user.second, virtual_node);
-}
-
-static void HandleFullySplitParameters(const FuncGraphPtr &root) {
-  int64_t grad_accumulation_step = ParallelContext::GetInstance()->grad_accumulation_step();
-  if ((grad_accumulation_step <= 1) || root->has_flag(ACCUMULATION)) {
-    return;
-  }
-
-  auto parameters = root->parameters();
-  auto node_users_map = root->manager()->node_users();
-  for (auto &parameter : parameters) {
-    auto param_ptr = parameter->cast<ParameterPtr>();
-    MS_EXCEPTION_IF_NULL(param_ptr);
-
-    if (!IsFullySplitParameter(param_ptr)) {
-      continue;
-    }
-
-    auto accu_parameter = FindGradAccuParameter(parameters, param_ptr->name());
-    if (!accu_parameter) {
-      continue;  // some parameters no need to handle, such as itself or lr
-    }
-
-    auto node_users = node_users_map[parameter];
-    for (auto &user : node_users) {
-      auto node = user.first;
-      auto cnode = node->cast<CNodePtr>();
-      MS_EXCEPTION_IF_NULL(cnode);
-      if (!cnode->in_forward_flag()) {
-        continue;
-      }
-      InsertFullySplitParamGradAccu(user, root->manager(), accu_parameter);
-      MS_LOG(INFO) << "Insert full split assign add node for " << param_ptr->name();
-      break;  // only need to insert once, if the parameter has many users
-    }
-  }
-}
-
 void ReorderForPipelineSplit(const FuncGraphPtr &root, const FuncGraphManagerPtr &manager, int64_t pipeline_stages) {
   if (!root->has_flag(BACKWARD) && pipeline_stages > 1) {
     root->set_flag(BACKWARD, true);
@@ -3824,6 +3267,8 @@ bool StepParallel(const FuncGraphPtr &root, const opt::OptimizerPtr &optimizer)
   // set the shape for optimizer's clone tensor
   SetClonedTensorShapeForOptimizer(root);
 
+  HandleAdaFactorOpt(root);
+
   // save strategy as checkpoint for multi-train
   if (StrategyCheckpoint::GetInstance().SaveCheckPointOn()) {
     CheckpointStrategy(all_nodes, root);
diff --git a/mindspore/ccsrc/frontend/parallel/step_parallel.h b/mindspore/ccsrc/frontend/parallel/step_parallel.h
index 996cc11ba33..8fc4ec5e0c8 100644
--- a/mindspore/ccsrc/frontend/parallel/step_parallel.h
+++ b/mindspore/ccsrc/frontend/parallel/step_parallel.h
@@ -54,11 +54,6 @@ struct CommInfo {
   std::string communication_backend;
 };
 
-struct ParameterSliceInfo {
-  Shape slice_shape;
-  RankList group_ranks;
-};
-
 std::vector<AnfNodePtr> CreateInput(const Operator &op, const AnfNodePtr &node, const std::string &instance_name);
 std::string CreateInstanceName(const CNodePtr &node, size_t index);
 void ForwardCommunication(OperatorVector forward_op, const CNodePtr &node);
@@ -77,8 +72,6 @@ void Redistribution(const std::pair<AnfNodePtr, int64_t> &node_pair, const Opera
 
 bool StrategyFound(std::unordered_map<std::string, ValuePtr> attrs);
 
-bool IsParallelCareNode(const CNodePtr &cnode);
-
 void MarkForwardCNode(const FuncGraphPtr &root);
 
 bool FindCommunicationOp(const std::vector<AnfNodePtr> &all_nodes);
@@ -108,8 +101,6 @@ OperatorInfoPtr NewOperatorInstance(const PrimitivePtr &prim, const PrimitiveAtt
 // Extract strategy from attr
 StrategyPtr ExtractStrategy(const ValuePtr &strategy);
 
-Shapes GetNodeShape(const AnfNodePtr &node);
-
 // Extract shape from anfnode
 std::vector<Shapes> ExtractShape(const CNodePtr &node);
 
@@ -160,15 +151,8 @@ std::set<FuncGraphPtr> ForwardGraph(const FuncGraphPtr &root);
 
 std::vector<std::string> ExtractInputsTensorName(const CNodePtr &node);
 
-using RefKeyPair = std::pair<AnfNodePtr, std::vector<AnfNodePtr>>;
-using ParameterUsersInfo = std::pair<std::string, std::pair<AnfNodePtr, AnfNodeIndexSet>>;
-
-RefKeyPair CNodeWithRefKeys(const AnfNodePtr &cnode);
-
 std::shared_ptr<TensorLayout> FindParameterNextLayout(const AnfNodePtr &node);
 
-ParameterUsersInfo FindParameterUsers(const AnfNodePtr &node, bool (*IsCareNode)(const CNodePtr &));
-
 bool IsUsedParameter(const FuncGraphPtr &graph, const AnfNodePtr &parameter);
 
 void ApplyParallelOptOnParam(TensorLayout *tensor_layout, const OperatorInfoPtr &distribute_operator,
@@ -187,6 +171,8 @@ std::string MirrorOpName();
 
 CommInfo GetCommInfo();
 
+std::string GetPrimName(const CNodePtr &node);
+
 void ReorderForPipelineSplit(const FuncGraphPtr &root, const FuncGraphManagerPtr &manager, int64_t pipeline_stages);
 }  // namespace parallel
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/CMakeLists.txt b/mindspore/ccsrc/minddata/dataset/CMakeLists.txt
index 454d33ebb6d..d6ad9a57254 100644
--- a/mindspore/ccsrc/minddata/dataset/CMakeLists.txt
+++ b/mindspore/ccsrc/minddata/dataset/CMakeLists.txt
@@ -267,6 +267,7 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
     else()
         target_link_libraries(_c_dataengine PRIVATE ${SECUREC_LIBRARY})
     endif()
+    target_link_options(_c_dataengine PRIVATE -Wl,--allow-multiple-definition)
 else()
     set(ICU_LIB mindspore::icuuc mindspore::icudata mindspore::icui18n)
     if(ENABLE_PYTHON)
diff --git a/mindspore/ccsrc/minddata/dataset/api/audio.cc b/mindspore/ccsrc/minddata/dataset/api/audio.cc
index eb4f8c20c1d..aa0f33d0fdc 100644
--- a/mindspore/ccsrc/minddata/dataset/api/audio.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/audio.cc
@@ -23,6 +23,9 @@
 #include "minddata/dataset/audio/ir/kernels/bandpass_biquad_ir.h"
 #include "minddata/dataset/audio/ir/kernels/bandreject_biquad_ir.h"
 #include "minddata/dataset/audio/ir/kernels/bass_biquad_ir.h"
+#include "minddata/dataset/audio/ir/kernels/complex_norm_ir.h"
+#include "minddata/dataset/audio/ir/kernels/frequency_masking_ir.h"
+#include "minddata/dataset/audio/ir/kernels/time_masking_ir.h"
 #include "minddata/dataset/audio/ir/kernels/time_stretch_ir.h"
 
 namespace mindspore {
@@ -45,7 +48,7 @@ std::shared_ptr<TensorOperation> AllpassBiquad::Parse() {
   return std::make_shared<AllpassBiquadOperation>(data_->sample_rate_, data_->central_freq_, data_->Q_);
 }
 
-// AmplitudeToDB Operation.
+// AmplitudeToDB Transform Operation.
 struct AmplitudeToDB::Data {
   Data(ScaleType stype, float ref_value, float amin, float top_db)
       : stype_(stype), ref_value_(ref_value), amin_(amin), top_db_(top_db) {}
@@ -134,7 +137,56 @@ std::shared_ptr<TensorOperation> BassBiquad::Parse() {
   return std::make_shared<BassBiquadOperation>(data_->sample_rate_, data_->gain_, data_->central_freq_, data_->Q_);
 }
 
-// TimeStretch Operation.
+// ComplexNorm Transform Operation.
+struct ComplexNorm::Data {
+  explicit Data(float power) : power_(power) {}
+  float power_;
+};
+
+ComplexNorm::ComplexNorm(float power) : data_(std::make_shared<Data>(power)) {}
+
+std::shared_ptr<TensorOperation> ComplexNorm::Parse() { return std::make_shared<ComplexNormOperation>(data_->power_); }
+
+// FrequencyMasking Transform Operation.
+struct FrequencyMasking::Data {
+  Data(bool iid_masks, int32_t frequency_mask_param, int32_t mask_start, double mask_value)
+      : iid_masks_(iid_masks),
+        frequency_mask_param_(frequency_mask_param),
+        mask_start_(mask_start),
+        mask_value_(mask_value) {}
+  int32_t frequency_mask_param_;
+  int32_t mask_start_;
+  bool iid_masks_;
+  double mask_value_;
+};
+
+FrequencyMasking::FrequencyMasking(bool iid_masks, int32_t frequency_mask_param, int32_t mask_start, double mask_value)
+    : data_(std::make_shared<Data>(iid_masks, frequency_mask_param, mask_start, mask_value)) {}
+
+std::shared_ptr<TensorOperation> FrequencyMasking::Parse() {
+  return std::make_shared<FrequencyMaskingOperation>(data_->iid_masks_, data_->frequency_mask_param_,
+                                                     data_->mask_start_, data_->mask_value_);
+}
+
+// TimeMasking Transform Operation.
+struct TimeMasking::Data {
+  Data(bool iid_masks, int64_t time_mask_param, int64_t mask_start, double mask_value)
+      : iid_masks_(iid_masks), time_mask_param_(time_mask_param), mask_start_(mask_start), mask_value_(mask_value) {}
+  int64_t time_mask_param_;
+  int64_t mask_start_;
+  bool iid_masks_;
+  double mask_value_;
+};
+
+TimeMasking::TimeMasking(bool iid_masks, int64_t time_mask_param, int64_t mask_start, double mask_value)
+    : data_(std::make_shared<Data>(iid_masks, time_mask_param, mask_start, mask_value)) {}
+
+std::shared_ptr<TensorOperation> TimeMasking::Parse() {
+  return std::make_shared<TimeMaskingOperation>(data_->iid_masks_, data_->time_mask_param_, data_->mask_start_,
+                                                data_->mask_value_);
+}
+
+// TimeStretch Transform Operation.
 struct TimeStretch::Data {
   explicit Data(float hop_length, int n_freq, float fixed_rate)
       : hop_length_(hop_length), n_freq_(n_freq), fixed_rate_(fixed_rate) {}
diff --git a/mindspore/ccsrc/minddata/dataset/api/datasets.cc b/mindspore/ccsrc/minddata/dataset/api/datasets.cc
index cf4898f9766..1f6a6a35f79 100644
--- a/mindspore/ccsrc/minddata/dataset/api/datasets.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/datasets.cc
@@ -85,7 +85,6 @@
 // IR leaf nodes
 #include "minddata/dataset/engine/ir/datasetops/source/album_node.h"
 #include "minddata/dataset/engine/ir/datasetops/source/mnist_node.h"
-#include "minddata/dataset/engine/ir/datasetops/source/cmu_arctic_node.h"
 
 // IR leaf nodes disabled for android
 #ifndef ENABLE_ANDROID
@@ -594,14 +593,16 @@ SchemaObj::SchemaObj(const std::vector<char> &schema_file) : data_(std::make_sha
 
 // SchemaObj Init function
 Status SchemaObj::Init() {
-  if (!data_->schema_file_.empty()) {
-    Path schema_file(data_->schema_file_);
+  if (data_ != nullptr && !data_->schema_file_.empty()) {
+    std::string real_path;
+    RETURN_IF_NOT_OK(Path::RealPath(data_->schema_file_, real_path));
+    Path schema_file(real_path);
     CHECK_FAIL_RETURN_UNEXPECTED(schema_file.Exists(),
                                  "The file " + data_->schema_file_ + " does not exist or permission denied!");
 
     nlohmann::json js;
     try {
-      std::ifstream in(data_->schema_file_);
+      std::ifstream in(real_path);
       in >> js;
       CHECK_FAIL_RETURN_UNEXPECTED(js.find("columns") != js.end(),
                                    "\"columns\" node is required in the schema json file.");
@@ -1137,27 +1138,6 @@ MnistDataset::MnistDataset(const std::vector<char> &dataset_dir, const std::vect
   ir_node_ = std::static_pointer_cast<DatasetNode>(ds);
 }
 
-CmuArcticDataset::CmuArcticDataset(const std::vector<char> &dataset_dir, const std::vector<char> &usage,
-                           const std::shared_ptr<Sampler> &sampler, const std::shared_ptr<DatasetCache> &cache) {
-  auto sampler_obj = sampler ? sampler->Parse() : nullptr;
-  auto ds = std::make_shared<CmuArcticNode>(CharToString(dataset_dir), CharToString(usage), sampler_obj, cache);
-  ir_node_ = std::static_pointer_cast<DatasetNode>(ds);
-}
-
-CmuArcticDataset::CmuArcticDataset(const std::vector<char> &dataset_dir, const std::vector<char> &usage, const Sampler *sampler,
-                           const std::shared_ptr<DatasetCache> &cache) {
-  auto sampler_obj = sampler ? sampler->Parse() : nullptr;
-  auto ds = std::make_shared<CmuArcticNode>(CharToString(dataset_dir), CharToString(usage), sampler_obj, cache);
-  ir_node_ = std::static_pointer_cast<DatasetNode>(ds);
-}
-
-CmuArcticDataset::CmuArcticDataset(const std::vector<char> &dataset_dir, const std::vector<char> &usage,
-                           const std::reference_wrapper<Sampler> sampler, const std::shared_ptr<DatasetCache> &cache) {
-  auto sampler_obj = sampler.get().Parse();
-  auto ds = std::make_shared<CmuArcticNode>(CharToString(dataset_dir), CharToString(usage), sampler_obj, cache);
-  ir_node_ = std::static_pointer_cast<DatasetNode>(ds);
-}
-
 #ifndef ENABLE_ANDROID
 TextFileDataset::TextFileDataset(const std::vector<std::vector<char>> &dataset_files, int64_t num_samples,
                                  ShuffleMode shuffle, int32_t num_shards, int32_t shard_id,
diff --git a/mindspore/ccsrc/minddata/dataset/api/iterator.cc b/mindspore/ccsrc/minddata/dataset/api/iterator.cc
index cb23e9395fe..3de7f6bd8fe 100644
--- a/mindspore/ccsrc/minddata/dataset/api/iterator.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/iterator.cc
@@ -27,7 +27,8 @@ Iterator::~Iterator() { Stop(); }
 
 // Get the next row from the data pipeline.
 Status Iterator::GetNextRowCharIF(MSTensorMapChar *row) {
-  // Clean data row
+  RETURN_UNEXPECTED_IF_NULL(row);
+  // Clean data buffer
   row->clear();
   std::unordered_map<std::string, std::shared_ptr<dataset::Tensor>> md_map;
   Status rc = consumer_->GetNextAsMap(&md_map);
@@ -47,6 +48,7 @@ Status Iterator::GetNextRowCharIF(MSTensorMapChar *row) {
 // Get the next row from the data pipeline.
 Status Iterator::GetNextRow(MSTensorVec *row) {
   // Clean data row
+  RETURN_UNEXPECTED_IF_NULL(row);
   row->clear();
   // create a dataset tensor row and fetch. Then we convert the output to MSTensor
   std::vector<std::shared_ptr<dataset::Tensor>> md_row;
@@ -84,6 +86,7 @@ Status Iterator::BuildAndLaunchTree(std::shared_ptr<Dataset> ds, int32_t num_epo
 PullIterator::PullIterator() : pull_consumer_(nullptr) {}
 // Get the next row from the data pipeline.
 Status PullIterator::GetRows(int32_t num_rows, std::vector<MSTensorVec> *const row) {
+  RETURN_UNEXPECTED_IF_NULL(row);
   for (int i = 0; i < num_rows; i++) {
     std::vector<std::shared_ptr<dataset::Tensor>> md_row;
     Status rc = pull_consumer_->GetNextAsVector(&md_row);
@@ -105,6 +108,7 @@ Status PullIterator::GetRows(int32_t num_rows, std::vector<MSTensorVec> *const r
 }
 
 Status PullIterator::GetNextRow(MSTensorVec *const row) {
+  RETURN_UNEXPECTED_IF_NULL(row);
   CHECK_FAIL_RETURN_UNEXPECTED(pull_consumer_ != nullptr, "Consumer is nullptr.");
   std::vector<std::shared_ptr<dataset::Tensor>> md_row;
   Status rc = pull_consumer_->GetNextAsVector(&md_row);
diff --git a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/audio/kernels/ir/bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/audio/kernels/ir/bindings.cc
index 8f4c63469cb..4564426ab74 100644
--- a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/audio/kernels/ir/bindings.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/audio/kernels/ir/bindings.cc
@@ -17,6 +17,8 @@
 
 #include "minddata/dataset/api/python/pybind_conversion.h"
 #include "minddata/dataset/api/python/pybind_register.h"
+#include "minddata/dataset/include/dataset/transforms.h"
+
 #include "minddata/dataset/audio/ir/kernels/allpass_biquad_ir.h"
 #include "minddata/dataset/audio/ir/kernels/amplitude_to_db_ir.h"
 #include "minddata/dataset/audio/ir/kernels/angle_ir.h"
@@ -24,8 +26,10 @@
 #include "minddata/dataset/audio/ir/kernels/bandpass_biquad_ir.h"
 #include "minddata/dataset/audio/ir/kernels/bandreject_biquad_ir.h"
 #include "minddata/dataset/audio/ir/kernels/bass_biquad_ir.h"
+#include "minddata/dataset/audio/ir/kernels/complex_norm_ir.h"
+#include "minddata/dataset/audio/ir/kernels/frequency_masking_ir.h"
+#include "minddata/dataset/audio/ir/kernels/time_masking_ir.h"
 #include "minddata/dataset/audio/ir/kernels/time_stretch_ir.h"
-#include "minddata/dataset/include/dataset/transforms.h"
 
 namespace mindspore {
 namespace dataset {
@@ -114,6 +118,42 @@ PYBIND_REGISTER(
       }));
   }));
 
+PYBIND_REGISTER(
+  ComplexNormOperation, 1, ([](const py::module *m) {
+    (void)py::class_<audio::ComplexNormOperation, TensorOperation, std::shared_ptr<audio::ComplexNormOperation>>(
+      *m, "ComplexNormOperation")
+      .def(py::init([](float power) {
+        auto complex_norm = std::make_shared<audio::ComplexNormOperation>(power);
+        THROW_IF_ERROR(complex_norm->ValidateParams());
+        return complex_norm;
+      }));
+  }));
+
+PYBIND_REGISTER(
+  FrequencyMaskingOperation, 1, ([](const py::module *m) {
+    (void)
+      py::class_<audio::FrequencyMaskingOperation, TensorOperation, std::shared_ptr<audio::FrequencyMaskingOperation>>(
+        *m, "FrequencyMaskingOperation")
+        .def(py::init([](bool iid_masks, int32_t frequency_mask_param, int32_t mask_start, double mask_value) {
+          auto frequency_masking =
+            std::make_shared<audio::FrequencyMaskingOperation>(iid_masks, frequency_mask_param, mask_start, mask_value);
+          THROW_IF_ERROR(frequency_masking->ValidateParams());
+          return frequency_masking;
+        }));
+  }));
+
+PYBIND_REGISTER(
+  TimeMaskingOperation, 1, ([](const py::module *m) {
+    (void)py::class_<audio::TimeMaskingOperation, TensorOperation, std::shared_ptr<audio::TimeMaskingOperation>>(
+      *m, "TimeMaskingOperation")
+      .def(py::init([](bool iid_masks, int64_t time_mask_param, int64_t mask_start, double mask_value) {
+        auto time_masking =
+          std::make_shared<audio::TimeMaskingOperation>(iid_masks, time_mask_param, mask_start, mask_value);
+        THROW_IF_ERROR(time_masking->ValidateParams());
+        return time_masking;
+      }));
+  }));
+
 PYBIND_REGISTER(
   TimeStretchOperation, 1, ([](const py::module *m) {
     (void)py::class_<audio::TimeStretchOperation, TensorOperation, std::shared_ptr<audio::TimeStretchOperation>>(
diff --git a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/engine/ir/datasetops/source/bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/engine/ir/datasetops/source/bindings.cc
index a6265bcf592..f9573deac3a 100644
--- a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/engine/ir/datasetops/source/bindings.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/engine/ir/datasetops/source/bindings.cc
@@ -1,291 +1,289 @@
-/**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "pybind11/pybind11.h"
-
-#include "minddata/dataset/api/python/pybind_conversion.h"
-#include "minddata/dataset/api/python/pybind_register.h"
-#include "minddata/dataset/include/dataset/constants.h"
-#include "minddata/dataset/include/dataset/datasets.h"
-
-#include "minddata/dataset/core/config_manager.h"
-#include "minddata/dataset/core/data_type.h"
-#include "minddata/dataset/util/path.h"
-
-// IR leaf nodes
-#include "minddata/dataset/engine/ir/datasetops/source/celeba_node.h"
-#include "minddata/dataset/engine/ir/datasetops/source/cifar100_node.h"
-#include "minddata/dataset/engine/ir/datasetops/source/cifar10_node.h"
-#include "minddata/dataset/engine/ir/datasetops/source/clue_node.h"
-#include "minddata/dataset/engine/ir/datasetops/source/coco_node.h"
-#include "minddata/dataset/engine/ir/datasetops/source/csv_node.h"
-#include "minddata/dataset/engine/ir/datasetops/source/generator_node.h"
-#include "minddata/dataset/engine/ir/datasetops/source/image_folder_node.h"
-#include "minddata/dataset/engine/ir/datasetops/source/mnist_node.h"
-#include "minddata/dataset/engine/ir/datasetops/source/cmu_arctic_node.h"
-#include "minddata/dataset/engine/ir/datasetops/source/random_node.h"
-#include "minddata/dataset/engine/ir/datasetops/source/text_file_node.h"
-
-// IR leaf nodes disabled for android
-#ifndef ENABLE_ANDROID
-#include "minddata/dataset/engine/ir/datasetops/source/manifest_node.h"
-#include "minddata/dataset/engine/ir/datasetops/source/minddata_node.h"
-#include "minddata/dataset/engine/ir/datasetops/source/tf_record_node.h"
-#include "minddata/dataset/engine/ir/datasetops/source/voc_node.h"
-#endif
-
-namespace mindspore {
-namespace dataset {
-
-// PYBIND FOR LEAF NODES
-// (In alphabetical order)
-
-PYBIND_REGISTER(CelebANode, 2, ([](const py::module *m) {
-                  (void)py::class_<CelebANode, DatasetNode, std::shared_ptr<CelebANode>>(*m, "CelebANode",
-                                                                                         "to create a CelebANode")
-                    .def(py::init([](std::string dataset_dir, std::string usage, py::handle sampler, bool decode,
-                                     py::list extensions) {
-                      auto celebA = std::make_shared<CelebANode>(dataset_dir, usage, toSamplerObj(sampler), decode,
-                                                                 toStringSet(extensions), nullptr);
-                      THROW_IF_ERROR(celebA->ValidateParams());
-                      return celebA;
-                    }));
-                }));
-
-PYBIND_REGISTER(Cifar10Node, 2, ([](const py::module *m) {
-                  (void)py::class_<Cifar10Node, DatasetNode, std::shared_ptr<Cifar10Node>>(*m, "Cifar10Node",
-                                                                                           "to create a Cifar10Node")
-                    .def(py::init([](std::string dataset_dir, std::string usage, py::handle sampler) {
-                      auto cifar10 = std::make_shared<Cifar10Node>(dataset_dir, usage, toSamplerObj(sampler), nullptr);
-                      THROW_IF_ERROR(cifar10->ValidateParams());
-                      return cifar10;
-                    }));
-                }));
-
-PYBIND_REGISTER(Cifar100Node, 2, ([](const py::module *m) {
-                  (void)py::class_<Cifar100Node, DatasetNode, std::shared_ptr<Cifar100Node>>(*m, "Cifar100Node",
-                                                                                             "to create a Cifar100Node")
-                    .def(py::init([](std::string dataset_dir, std::string usage, py::handle sampler) {
-                      auto cifar100 =
-                        std::make_shared<Cifar100Node>(dataset_dir, usage, toSamplerObj(sampler), nullptr);
-                      THROW_IF_ERROR(cifar100->ValidateParams());
-                      return cifar100;
-                    }));
-                }));
-
-PYBIND_REGISTER(CLUENode, 2, ([](const py::module *m) {
-                  (void)py::class_<CLUENode, DatasetNode, std::shared_ptr<CLUENode>>(*m, "CLUENode",
-                                                                                     "to create a CLUENode")
-                    .def(py::init([](py::list files, std::string task, std::string usage, int64_t num_samples,
-                                     int32_t shuffle, int32_t num_shards, int32_t shard_id) {
-                      std::shared_ptr<CLUENode> clue_node =
-                        std::make_shared<dataset::CLUENode>(toStringVector(files), task, usage, num_samples,
-                                                            toShuffleMode(shuffle), num_shards, shard_id, nullptr);
-                      THROW_IF_ERROR(clue_node->ValidateParams());
-                      return clue_node;
-                    }));
-                }));
-
-PYBIND_REGISTER(CocoNode, 2, ([](const py::module *m) {
-                  (void)py::class_<CocoNode, DatasetNode, std::shared_ptr<CocoNode>>(*m, "CocoNode",
-                                                                                     "to create a CocoNode")
-                    .def(py::init([](std::string dataset_dir, std::string annotation_file, std::string task,
-                                     bool decode, py::handle sampler, bool extra_metadata) {
-                      std::shared_ptr<CocoNode> coco = std::make_shared<CocoNode>(
-                        dataset_dir, annotation_file, task, decode, toSamplerObj(sampler), nullptr, extra_metadata);
-                      THROW_IF_ERROR(coco->ValidateParams());
-                      return coco;
-                    }));
-                }));
-
-PYBIND_REGISTER(CSVNode, 2, ([](const py::module *m) {
-                  (void)py::class_<CSVNode, DatasetNode, std::shared_ptr<CSVNode>>(*m, "CSVNode", "to create a CSVNode")
-                    .def(py::init([](std::vector<std::string> csv_files, char field_delim, py::list column_defaults,
-                                     std::vector<std::string> column_names, int64_t num_samples, int32_t shuffle,
-                                     int32_t num_shards, int32_t shard_id) {
-                      auto csv =
-                        std::make_shared<CSVNode>(csv_files, field_delim, toCSVBase(column_defaults), column_names,
-                                                  num_samples, toShuffleMode(shuffle), num_shards, shard_id, nullptr);
-                      THROW_IF_ERROR(csv->ValidateParams());
-                      return csv;
-                    }));
-                }));
-
-PYBIND_REGISTER(GeneratorNode, 2, ([](const py::module *m) {
-                  (void)py::class_<GeneratorNode, DatasetNode, std::shared_ptr<GeneratorNode>>(
-                    *m, "GeneratorNode", "to create a GeneratorNode")
-                    .def(
-                      py::init([](py::function generator_function, const std::vector<std::string> &column_names,
-                                  const std::vector<DataType> &column_types, int64_t dataset_len, py::handle sampler) {
-                        auto gen = std::make_shared<GeneratorNode>(generator_function, column_names, column_types,
-                                                                   dataset_len, toSamplerObj(sampler));
-                        THROW_IF_ERROR(gen->ValidateParams());
-                        return gen;
-                      }))
-                    .def(py::init([](py::function generator_function, const std::shared_ptr<SchemaObj> schema,
-                                     int64_t dataset_len, py::handle sampler) {
-                      auto gen =
-                        std::make_shared<GeneratorNode>(generator_function, schema, dataset_len, toSamplerObj(sampler));
-                      THROW_IF_ERROR(gen->ValidateParams());
-                      return gen;
-                    }));
-                }));
-
-PYBIND_REGISTER(ImageFolderNode, 2, ([](const py::module *m) {
-                  (void)py::class_<ImageFolderNode, DatasetNode, std::shared_ptr<ImageFolderNode>>(
-                    *m, "ImageFolderNode", "to create an ImageFolderNode")
-                    .def(py::init([](std::string dataset_dir, bool decode, py::handle sampler, py::list extensions,
-                                     py::dict class_indexing) {
-                      // Don't update recursive to true
-                      bool recursive = false;  // Will be removed in future PR
-                      auto imagefolder = std::make_shared<ImageFolderNode>(dataset_dir, decode, toSamplerObj(sampler),
-                                                                           recursive, toStringSet(extensions),
-                                                                           toStringMap(class_indexing), nullptr);
-                      THROW_IF_ERROR(imagefolder->ValidateParams());
-                      return imagefolder;
-                    }));
-                }));
-
-PYBIND_REGISTER(ManifestNode, 2, ([](const py::module *m) {
-                  (void)py::class_<ManifestNode, DatasetNode, std::shared_ptr<ManifestNode>>(*m, "ManifestNode",
-                                                                                             "to create a ManifestNode")
-                    .def(py::init([](std::string dataset_file, std::string usage, py::handle sampler,
-                                     py::dict class_indexing, bool decode) {
-                      auto manifest = std::make_shared<ManifestNode>(dataset_file, usage, toSamplerObj(sampler),
-                                                                     toStringMap(class_indexing), decode, nullptr);
-                      THROW_IF_ERROR(manifest->ValidateParams());
-                      return manifest;
-                    }));
-                }));
-
-PYBIND_REGISTER(MindDataNode, 2, ([](const py::module *m) {
-                  (void)py::class_<MindDataNode, DatasetNode, std::shared_ptr<MindDataNode>>(*m, "MindDataNode",
-                                                                                             "to create a MindDataNode")
-                    .def(py::init([](std::string dataset_file, py::list columns_list, py::handle sampler,
-                                     py::dict padded_sample, int64_t num_padded, ShuffleMode shuffle_mode) {
-                      nlohmann::json padded_sample_json;
-                      std::map<std::string, std::string> sample_bytes;
-                      THROW_IF_ERROR(ToJson(padded_sample, &padded_sample_json, &sample_bytes));
-                      auto minddata = std::make_shared<MindDataNode>(dataset_file, toStringVector(columns_list),
-                                                                     toSamplerObj(sampler, true), padded_sample_json,
-                                                                     num_padded, shuffle_mode, nullptr);
-                      minddata->SetSampleBytes(&sample_bytes);
-                      THROW_IF_ERROR(minddata->ValidateParams());
-                      return minddata;
-                    }))
-                    .def(py::init([](py::list dataset_file, py::list columns_list, py::handle sampler,
-                                     py::dict padded_sample, int64_t num_padded, ShuffleMode shuffle_mode) {
-                      nlohmann::json padded_sample_json;
-                      std::map<std::string, std::string> sample_bytes;
-                      THROW_IF_ERROR(ToJson(padded_sample, &padded_sample_json, &sample_bytes));
-                      auto minddata = std::make_shared<MindDataNode>(
-                        toStringVector(dataset_file), toStringVector(columns_list), toSamplerObj(sampler, true),
-                        padded_sample_json, num_padded, shuffle_mode, nullptr);
-                      minddata->SetSampleBytes(&sample_bytes);
-                      THROW_IF_ERROR(minddata->ValidateParams());
-                      return minddata;
-                    }));
-                }));
-
-PYBIND_REGISTER(MnistNode, 2, ([](const py::module *m) {
-                  (void)py::class_<MnistNode, DatasetNode, std::shared_ptr<MnistNode>>(*m, "MnistNode",
-                                                                                       "to create an MnistNode")
-                    .def(py::init([](std::string dataset_dir, std::string usage, py::handle sampler) {
-                      auto mnist = std::make_shared<MnistNode>(dataset_dir, usage, toSamplerObj(sampler), nullptr);
-                      THROW_IF_ERROR(mnist->ValidateParams());
-                      return mnist;
-                    }));
-                }));
-
-
-
-PYBIND_REGISTER(CmuArcticNode, 2, ([](const py::module *m) {
-                  (void)py::class_<CmuArcticNode, DatasetNode, std::shared_ptr<CmuArcticNode>>(*m, "CmuArcticNode",
-                                                                                       "to create an CmuArcticNode")
-                    .def(py::init([](std::string dataset_dir, std::string usage, py::handle sampler) {
-                      auto cmuarctic = std::make_shared<CmuArcticNode>(dataset_dir, usage, toSamplerObj(sampler), nullptr);
-                      THROW_IF_ERROR(cmuarctic->ValidateParams());
-                      return cmuarctic;
-                    }));
-                }));
-
-
-PYBIND_REGISTER(RandomNode, 2, ([](const py::module *m) {
-                  (void)py::class_<RandomNode, DatasetNode, std::shared_ptr<RandomNode>>(*m, "RandomNode",
-                                                                                         "to create a RandomNode")
-                    .def(py::init([](int32_t total_rows, std::shared_ptr<SchemaObj> schema, py::list columns_list) {
-                      auto random_node =
-                        std::make_shared<RandomNode>(total_rows, schema, toStringVector(columns_list), nullptr);
-                      THROW_IF_ERROR(random_node->ValidateParams());
-                      return random_node;
-                    }))
-                    .def(py::init([](int32_t total_rows, std::string schema, py::list columns_list) {
-                      auto random_node =
-                        std::make_shared<RandomNode>(total_rows, schema, toStringVector(columns_list), nullptr);
-                      THROW_IF_ERROR(random_node->ValidateParams());
-                      return random_node;
-                    }));
-                }));
-
-PYBIND_REGISTER(TextFileNode, 2, ([](const py::module *m) {
-                  (void)py::class_<TextFileNode, DatasetNode, std::shared_ptr<TextFileNode>>(*m, "TextFileNode",
-                                                                                             "to create a TextFileNode")
-                    .def(py::init([](py::list dataset_files, int32_t num_samples, int32_t shuffle, int32_t num_shards,
-                                     int32_t shard_id) {
-                      std::shared_ptr<TextFileNode> textfile_node =
-                        std::make_shared<TextFileNode>(toStringVector(dataset_files), num_samples,
-                                                       toShuffleMode(shuffle), num_shards, shard_id, nullptr);
-                      THROW_IF_ERROR(textfile_node->ValidateParams());
-                      return textfile_node;
-                    }));
-                }));
-
-PYBIND_REGISTER(TFRecordNode, 2, ([](const py::module *m) {
-                  (void)py::class_<TFRecordNode, DatasetNode, std::shared_ptr<TFRecordNode>>(*m, "TFRecordNode",
-                                                                                             "to create a TFRecordNode")
-                    .def(py::init([](const py::list dataset_files, std::shared_ptr<SchemaObj> schema,
-                                     const py::list columns_list, int64_t num_samples, int32_t shuffle,
-                                     int32_t num_shards, int32_t shard_id, bool shard_equal_rows) {
-                      std::shared_ptr<TFRecordNode> tfrecord = std::make_shared<TFRecordNode>(
-                        toStringVector(dataset_files), schema, toStringVector(columns_list), num_samples,
-                        toShuffleMode(shuffle), num_shards, shard_id, shard_equal_rows, nullptr);
-                      THROW_IF_ERROR(tfrecord->ValidateParams());
-                      return tfrecord;
-                    }))
-                    .def(py::init([](const py::list dataset_files, std::string schema, py::list columns_list,
-                                     int64_t num_samples, int32_t shuffle, int32_t num_shards, int32_t shard_id,
-                                     bool shard_equal_rows) {
-                      std::shared_ptr<TFRecordNode> tfrecord = std::make_shared<TFRecordNode>(
-                        toStringVector(dataset_files), schema, toStringVector(columns_list), num_samples,
-                        toShuffleMode(shuffle), num_shards, shard_id, shard_equal_rows, nullptr);
-                      THROW_IF_ERROR(tfrecord->ValidateParams());
-                      return tfrecord;
-                    }));
-                }));
-
-PYBIND_REGISTER(VOCNode, 2, ([](const py::module *m) {
-                  (void)py::class_<VOCNode, DatasetNode, std::shared_ptr<VOCNode>>(*m, "VOCNode", "to create a VOCNode")
-                    .def(py::init([](std::string dataset_dir, std::string task, std::string usage,
-                                     py::dict class_indexing, bool decode, py::handle sampler, bool extra_metadata) {
-                      std::shared_ptr<VOCNode> voc =
-                        std::make_shared<VOCNode>(dataset_dir, task, usage, toStringMap(class_indexing), decode,
-                                                  toSamplerObj(sampler), nullptr, extra_metadata);
-                      THROW_IF_ERROR(voc->ValidateParams());
-                      return voc;
-                    }));
-                }));
-
-}  // namespace dataset
-}  // namespace mindspore
+/**
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "pybind11/pybind11.h"
+
+#include "minddata/dataset/api/python/pybind_conversion.h"
+#include "minddata/dataset/api/python/pybind_register.h"
+#include "minddata/dataset/include/dataset/constants.h"
+#include "minddata/dataset/include/dataset/datasets.h"
+
+#include "minddata/dataset/core/config_manager.h"
+#include "minddata/dataset/core/data_type.h"
+#include "minddata/dataset/util/path.h"
+
+// IR leaf nodes
+#include "minddata/dataset/engine/ir/datasetops/source/celeba_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/cifar100_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/cifar10_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/clue_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/coco_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/csv_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/flickr_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/generator_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/image_folder_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/mnist_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/random_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/text_file_node.h"
+
+// IR leaf nodes disabled for android
+#ifndef ENABLE_ANDROID
+#include "minddata/dataset/engine/ir/datasetops/source/manifest_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/minddata_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/tf_record_node.h"
+#include "minddata/dataset/engine/ir/datasetops/source/voc_node.h"
+#endif
+
+namespace mindspore {
+namespace dataset {
+
+// PYBIND FOR LEAF NODES
+// (In alphabetical order)
+
+PYBIND_REGISTER(CelebANode, 2, ([](const py::module *m) {
+                  (void)py::class_<CelebANode, DatasetNode, std::shared_ptr<CelebANode>>(*m, "CelebANode",
+                                                                                         "to create a CelebANode")
+                    .def(py::init([](std::string dataset_dir, std::string usage, py::handle sampler, bool decode,
+                                     py::list extensions) {
+                      auto celebA = std::make_shared<CelebANode>(dataset_dir, usage, toSamplerObj(sampler), decode,
+                                                                 toStringSet(extensions), nullptr);
+                      THROW_IF_ERROR(celebA->ValidateParams());
+                      return celebA;
+                    }));
+                }));
+
+PYBIND_REGISTER(Cifar10Node, 2, ([](const py::module *m) {
+                  (void)py::class_<Cifar10Node, DatasetNode, std::shared_ptr<Cifar10Node>>(*m, "Cifar10Node",
+                                                                                           "to create a Cifar10Node")
+                    .def(py::init([](std::string dataset_dir, std::string usage, py::handle sampler) {
+                      auto cifar10 = std::make_shared<Cifar10Node>(dataset_dir, usage, toSamplerObj(sampler), nullptr);
+                      THROW_IF_ERROR(cifar10->ValidateParams());
+                      return cifar10;
+                    }));
+                }));
+
+PYBIND_REGISTER(Cifar100Node, 2, ([](const py::module *m) {
+                  (void)py::class_<Cifar100Node, DatasetNode, std::shared_ptr<Cifar100Node>>(*m, "Cifar100Node",
+                                                                                             "to create a Cifar100Node")
+                    .def(py::init([](std::string dataset_dir, std::string usage, py::handle sampler) {
+                      auto cifar100 =
+                        std::make_shared<Cifar100Node>(dataset_dir, usage, toSamplerObj(sampler), nullptr);
+                      THROW_IF_ERROR(cifar100->ValidateParams());
+                      return cifar100;
+                    }));
+                }));
+
+PYBIND_REGISTER(CLUENode, 2, ([](const py::module *m) {
+                  (void)py::class_<CLUENode, DatasetNode, std::shared_ptr<CLUENode>>(*m, "CLUENode",
+                                                                                     "to create a CLUENode")
+                    .def(py::init([](py::list files, std::string task, std::string usage, int64_t num_samples,
+                                     int32_t shuffle, int32_t num_shards, int32_t shard_id) {
+                      std::shared_ptr<CLUENode> clue_node =
+                        std::make_shared<dataset::CLUENode>(toStringVector(files), task, usage, num_samples,
+                                                            toShuffleMode(shuffle), num_shards, shard_id, nullptr);
+                      THROW_IF_ERROR(clue_node->ValidateParams());
+                      return clue_node;
+                    }));
+                }));
+
+PYBIND_REGISTER(CocoNode, 2, ([](const py::module *m) {
+                  (void)py::class_<CocoNode, DatasetNode, std::shared_ptr<CocoNode>>(*m, "CocoNode",
+                                                                                     "to create a CocoNode")
+                    .def(py::init([](std::string dataset_dir, std::string annotation_file, std::string task,
+                                     bool decode, py::handle sampler, bool extra_metadata) {
+                      std::shared_ptr<CocoNode> coco = std::make_shared<CocoNode>(
+                        dataset_dir, annotation_file, task, decode, toSamplerObj(sampler), nullptr, extra_metadata);
+                      THROW_IF_ERROR(coco->ValidateParams());
+                      return coco;
+                    }));
+                }));
+
+PYBIND_REGISTER(CSVNode, 2, ([](const py::module *m) {
+                  (void)py::class_<CSVNode, DatasetNode, std::shared_ptr<CSVNode>>(*m, "CSVNode", "to create a CSVNode")
+                    .def(py::init([](std::vector<std::string> csv_files, char field_delim, py::list column_defaults,
+                                     std::vector<std::string> column_names, int64_t num_samples, int32_t shuffle,
+                                     int32_t num_shards, int32_t shard_id) {
+                      auto csv =
+                        std::make_shared<CSVNode>(csv_files, field_delim, toCSVBase(column_defaults), column_names,
+                                                  num_samples, toShuffleMode(shuffle), num_shards, shard_id, nullptr);
+                      THROW_IF_ERROR(csv->ValidateParams());
+                      return csv;
+                    }));
+                }));
+
+PYBIND_REGISTER(
+  FlickrNode, 2, ([](const py::module *m) {
+    (void)py::class_<FlickrNode, DatasetNode, std::shared_ptr<FlickrNode>>(*m, "FlickrNode", "to create a FlickrNode")
+      .def(py::init([](std::string dataset_dir, std::string annotation_file, bool decode, py::handle sampler) {
+        auto flickr =
+          std::make_shared<FlickrNode>(dataset_dir, annotation_file, decode, toSamplerObj(sampler), nullptr);
+        THROW_IF_ERROR(flickr->ValidateParams());
+        return flickr;
+      }));
+  }));
+
+PYBIND_REGISTER(GeneratorNode, 2, ([](const py::module *m) {
+                  (void)py::class_<GeneratorNode, DatasetNode, std::shared_ptr<GeneratorNode>>(
+                    *m, "GeneratorNode", "to create a GeneratorNode")
+                    .def(
+                      py::init([](py::function generator_function, const std::vector<std::string> &column_names,
+                                  const std::vector<DataType> &column_types, int64_t dataset_len, py::handle sampler) {
+                        auto gen = std::make_shared<GeneratorNode>(generator_function, column_names, column_types,
+                                                                   dataset_len, toSamplerObj(sampler));
+                        THROW_IF_ERROR(gen->ValidateParams());
+                        return gen;
+                      }))
+                    .def(py::init([](py::function generator_function, const std::shared_ptr<SchemaObj> schema,
+                                     int64_t dataset_len, py::handle sampler) {
+                      auto gen =
+                        std::make_shared<GeneratorNode>(generator_function, schema, dataset_len, toSamplerObj(sampler));
+                      THROW_IF_ERROR(gen->ValidateParams());
+                      return gen;
+                    }));
+                }));
+
+PYBIND_REGISTER(ImageFolderNode, 2, ([](const py::module *m) {
+                  (void)py::class_<ImageFolderNode, DatasetNode, std::shared_ptr<ImageFolderNode>>(
+                    *m, "ImageFolderNode", "to create an ImageFolderNode")
+                    .def(py::init([](std::string dataset_dir, bool decode, py::handle sampler, py::list extensions,
+                                     py::dict class_indexing) {
+                      // Don't update recursive to true
+                      bool recursive = false;  // Will be removed in future PR
+                      auto imagefolder = std::make_shared<ImageFolderNode>(dataset_dir, decode, toSamplerObj(sampler),
+                                                                           recursive, toStringSet(extensions),
+                                                                           toStringMap(class_indexing), nullptr);
+                      THROW_IF_ERROR(imagefolder->ValidateParams());
+                      return imagefolder;
+                    }));
+                }));
+
+PYBIND_REGISTER(ManifestNode, 2, ([](const py::module *m) {
+                  (void)py::class_<ManifestNode, DatasetNode, std::shared_ptr<ManifestNode>>(*m, "ManifestNode",
+                                                                                             "to create a ManifestNode")
+                    .def(py::init([](std::string dataset_file, std::string usage, py::handle sampler,
+                                     py::dict class_indexing, bool decode) {
+                      auto manifest = std::make_shared<ManifestNode>(dataset_file, usage, toSamplerObj(sampler),
+                                                                     toStringMap(class_indexing), decode, nullptr);
+                      THROW_IF_ERROR(manifest->ValidateParams());
+                      return manifest;
+                    }));
+                }));
+
+PYBIND_REGISTER(MindDataNode, 2, ([](const py::module *m) {
+                  (void)py::class_<MindDataNode, DatasetNode, std::shared_ptr<MindDataNode>>(*m, "MindDataNode",
+                                                                                             "to create a MindDataNode")
+                    .def(py::init([](std::string dataset_file, py::list columns_list, py::handle sampler,
+                                     py::dict padded_sample, int64_t num_padded, ShuffleMode shuffle_mode) {
+                      nlohmann::json padded_sample_json;
+                      std::map<std::string, std::string> sample_bytes;
+                      THROW_IF_ERROR(ToJson(padded_sample, &padded_sample_json, &sample_bytes));
+                      auto minddata = std::make_shared<MindDataNode>(dataset_file, toStringVector(columns_list),
+                                                                     toSamplerObj(sampler, true), padded_sample_json,
+                                                                     num_padded, shuffle_mode, nullptr);
+                      minddata->SetSampleBytes(&sample_bytes);
+                      THROW_IF_ERROR(minddata->ValidateParams());
+                      return minddata;
+                    }))
+                    .def(py::init([](py::list dataset_file, py::list columns_list, py::handle sampler,
+                                     py::dict padded_sample, int64_t num_padded, ShuffleMode shuffle_mode) {
+                      nlohmann::json padded_sample_json;
+                      std::map<std::string, std::string> sample_bytes;
+                      THROW_IF_ERROR(ToJson(padded_sample, &padded_sample_json, &sample_bytes));
+                      auto minddata = std::make_shared<MindDataNode>(
+                        toStringVector(dataset_file), toStringVector(columns_list), toSamplerObj(sampler, true),
+                        padded_sample_json, num_padded, shuffle_mode, nullptr);
+                      minddata->SetSampleBytes(&sample_bytes);
+                      THROW_IF_ERROR(minddata->ValidateParams());
+                      return minddata;
+                    }));
+                }));
+
+PYBIND_REGISTER(MnistNode, 2, ([](const py::module *m) {
+                  (void)py::class_<MnistNode, DatasetNode, std::shared_ptr<MnistNode>>(*m, "MnistNode",
+                                                                                       "to create an MnistNode")
+                    .def(py::init([](std::string dataset_dir, std::string usage, py::handle sampler) {
+                      auto mnist = std::make_shared<MnistNode>(dataset_dir, usage, toSamplerObj(sampler), nullptr);
+                      THROW_IF_ERROR(mnist->ValidateParams());
+                      return mnist;
+                    }));
+                }));
+
+PYBIND_REGISTER(RandomNode, 2, ([](const py::module *m) {
+                  (void)py::class_<RandomNode, DatasetNode, std::shared_ptr<RandomNode>>(*m, "RandomNode",
+                                                                                         "to create a RandomNode")
+                    .def(py::init([](int32_t total_rows, std::shared_ptr<SchemaObj> schema, py::list columns_list) {
+                      auto random_node =
+                        std::make_shared<RandomNode>(total_rows, schema, toStringVector(columns_list), nullptr);
+                      THROW_IF_ERROR(random_node->ValidateParams());
+                      return random_node;
+                    }))
+                    .def(py::init([](int32_t total_rows, std::string schema, py::list columns_list) {
+                      auto random_node =
+                        std::make_shared<RandomNode>(total_rows, schema, toStringVector(columns_list), nullptr);
+                      THROW_IF_ERROR(random_node->ValidateParams());
+                      return random_node;
+                    }));
+                }));
+
+PYBIND_REGISTER(TextFileNode, 2, ([](const py::module *m) {
+                  (void)py::class_<TextFileNode, DatasetNode, std::shared_ptr<TextFileNode>>(*m, "TextFileNode",
+                                                                                             "to create a TextFileNode")
+                    .def(py::init([](py::list dataset_files, int32_t num_samples, int32_t shuffle, int32_t num_shards,
+                                     int32_t shard_id) {
+                      std::shared_ptr<TextFileNode> textfile_node =
+                        std::make_shared<TextFileNode>(toStringVector(dataset_files), num_samples,
+                                                       toShuffleMode(shuffle), num_shards, shard_id, nullptr);
+                      THROW_IF_ERROR(textfile_node->ValidateParams());
+                      return textfile_node;
+                    }));
+                }));
+
+PYBIND_REGISTER(TFRecordNode, 2, ([](const py::module *m) {
+                  (void)py::class_<TFRecordNode, DatasetNode, std::shared_ptr<TFRecordNode>>(*m, "TFRecordNode",
+                                                                                             "to create a TFRecordNode")
+                    .def(py::init([](const py::list dataset_files, std::shared_ptr<SchemaObj> schema,
+                                     const py::list columns_list, int64_t num_samples, int32_t shuffle,
+                                     int32_t num_shards, int32_t shard_id, bool shard_equal_rows) {
+                      std::shared_ptr<TFRecordNode> tfrecord = std::make_shared<TFRecordNode>(
+                        toStringVector(dataset_files), schema, toStringVector(columns_list), num_samples,
+                        toShuffleMode(shuffle), num_shards, shard_id, shard_equal_rows, nullptr);
+                      THROW_IF_ERROR(tfrecord->ValidateParams());
+                      return tfrecord;
+                    }))
+                    .def(py::init([](const py::list dataset_files, std::string schema, const py::list columns_list,
+                                     int64_t num_samples, int32_t shuffle, int32_t num_shards, int32_t shard_id,
+                                     bool shard_equal_rows) {
+                      std::shared_ptr<TFRecordNode> tfrecord = std::make_shared<TFRecordNode>(
+                        toStringVector(dataset_files), schema, toStringVector(columns_list), num_samples,
+                        toShuffleMode(shuffle), num_shards, shard_id, shard_equal_rows, nullptr);
+                      THROW_IF_ERROR(tfrecord->ValidateParams());
+                      return tfrecord;
+                    }));
+                }));
+
+PYBIND_REGISTER(VOCNode, 2, ([](const py::module *m) {
+                  (void)py::class_<VOCNode, DatasetNode, std::shared_ptr<VOCNode>>(*m, "VOCNode", "to create a VOCNode")
+                    .def(py::init([](std::string dataset_dir, std::string task, std::string usage,
+                                     py::dict class_indexing, bool decode, py::handle sampler, bool extra_metadata) {
+                      std::shared_ptr<VOCNode> voc =
+                        std::make_shared<VOCNode>(dataset_dir, task, usage, toStringMap(class_indexing), decode,
+                                                  toSamplerObj(sampler), nullptr, extra_metadata);
+                      THROW_IF_ERROR(voc->ValidateParams());
+                      return voc;
+                    }));
+                }));
+
+}  // namespace dataset
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/CMakeLists.txt b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/CMakeLists.txt
index 0547fd3850b..e2abe7f99a3 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/CMakeLists.txt
+++ b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/CMakeLists.txt
@@ -9,5 +9,8 @@ add_library(audio-ir-kernels OBJECT
         bandpass_biquad_ir.cc
         bandreject_biquad_ir.cc
         bass_biquad_ir.cc
+        complex_norm_ir.cc
+        frequency_masking_ir.cc
+        time_masking_ir.cc
         time_stretch_ir.cc
         )
diff --git a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/allpass_biquad_ir.cc b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/allpass_biquad_ir.cc
index 35cf10b83c6..b760aae4844 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/allpass_biquad_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/allpass_biquad_ir.cc
@@ -16,20 +16,20 @@
 
 #include "minddata/dataset/audio/ir/kernels/allpass_biquad_ir.h"
 
-#include "minddata/dataset/audio/kernels/allpass_biquad_op.h"
-
 #include "minddata/dataset/audio/ir/validators.h"
+#include "minddata/dataset/audio/kernels/allpass_biquad_op.h"
 
 namespace mindspore {
 namespace dataset {
 namespace audio {
+
 // AllpassBiquadOperation
 AllpassBiquadOperation::AllpassBiquadOperation(int32_t sample_rate, float central_freq, float Q)
     : sample_rate_(sample_rate), central_freq_(central_freq), Q_(Q) {}
 
 Status AllpassBiquadOperation::ValidateParams() {
-  RETURN_IF_NOT_OK(CheckScalarNotZero("AllpassBiquad", "sample_rate", sample_rate_));
-  RETURN_IF_NOT_OK(CheckScalarNotZero("AllpassBiquad", "central_freq", central_freq_));
+  RETURN_IF_NOT_OK(ValidateScalarNotZero("AllpassBiquad", "sample_rate", sample_rate_));
+  RETURN_IF_NOT_OK(ValidateScalarNotZero("AllpassBiquad", "central_freq", central_freq_));
   RETURN_IF_NOT_OK(ValidateScalar("AllpassBiquad", "Q", Q_, {0, 1.0}, true, false));
   return Status::OK();
 }
@@ -38,6 +38,7 @@ std::shared_ptr<TensorOp> AllpassBiquadOperation::Build() {
   std::shared_ptr<AllpassBiquadOp> tensor_op = std::make_shared<AllpassBiquadOp>(sample_rate_, central_freq_, Q_);
   return tensor_op;
 }
+
 Status AllpassBiquadOperation::to_json(nlohmann::json *out_json) {
   nlohmann::json args;
   args["sample_rate"] = sample_rate_;
diff --git a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/allpass_biquad_ir.h b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/allpass_biquad_ir.h
index c8d2be832bf..398287db244 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/allpass_biquad_ir.h
+++ b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/allpass_biquad_ir.h
@@ -20,6 +20,7 @@
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "include/api/status.h"
 #include "minddata/dataset/include/dataset/constants.h"
 #include "minddata/dataset/include/dataset/transforms.h"
@@ -27,9 +28,8 @@
 
 namespace mindspore {
 namespace dataset {
-
 namespace audio {
-// Char arrays storing name of corresponding classes (in alphabetical order)
+
 constexpr char kAllpassBiquadOperation[] = "AllpassBiquad";
 
 class AllpassBiquadOperation : public TensorOperation {
@@ -52,7 +52,6 @@ class AllpassBiquadOperation : public TensorOperation {
   float Q_;
 };
 }  // namespace audio
-
 }  // namespace dataset
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_IR_KERNELS_ALLPASS_BIQUAD_IR_H_
diff --git a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/amplitude_to_db_ir.cc b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/amplitude_to_db_ir.cc
index 80412b1c437..61313e7fac1 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/amplitude_to_db_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/amplitude_to_db_ir.cc
@@ -15,15 +15,15 @@
  */
 
 #include "minddata/dataset/audio/ir/kernels/amplitude_to_db_ir.h"
-#include "minddata/dataset/audio/kernels/amplitude_to_db_op.h"
 
 #include "minddata/dataset/audio/ir/validators.h"
+#include "minddata/dataset/audio/kernels/amplitude_to_db_op.h"
 
 namespace mindspore {
 namespace dataset {
 namespace audio {
 
-// AmplitudeToDB
+// AmplitudeToDBOperation
 AmplitudeToDBOperation::AmplitudeToDBOperation(ScaleType stype, float ref_value, float amin, float top_db)
     : stype_(stype), ref_value_(ref_value), amin_(amin), top_db_(top_db) {}
 
@@ -32,9 +32,9 @@ AmplitudeToDBOperation::~AmplitudeToDBOperation() = default;
 std::string AmplitudeToDBOperation::Name() const { return kAmplitudeToDBOperation; }
 
 Status AmplitudeToDBOperation::ValidateParams() {
-  RETURN_IF_NOT_OK(CheckFloatScalarNonNegative("AmplitudeToDB", "top_db", top_db_));
-  RETURN_IF_NOT_OK(CheckFloatScalarPositive("AmplitudeToDB", "amin", amin_));
-  RETURN_IF_NOT_OK(CheckFloatScalarPositive("AmplitudeToDB", "ref_value", ref_value_));
+  RETURN_IF_NOT_OK(ValidateFloatScalarNonNegative("AmplitudeToDB", "top_db", top_db_));
+  RETURN_IF_NOT_OK(ValidateFloatScalarPositive("AmplitudeToDB", "amin", amin_));
+  RETURN_IF_NOT_OK(ValidateFloatScalarPositive("AmplitudeToDB", "ref_value", ref_value_));
 
   return Status::OK();
 }
diff --git a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/angle_ir.cc b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/angle_ir.cc
index 53b1850e976..131a440e279 100755
--- a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/angle_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/angle_ir.cc
@@ -16,13 +16,12 @@
 
 #include "minddata/dataset/audio/ir/kernels/angle_ir.h"
 
-// Kernel Audio headers
 #include "minddata/dataset/audio/kernels/angle_op.h"
 
 namespace mindspore {
 namespace dataset {
-
 namespace audio {
+
 // AngleOperation
 AngleOperation::AngleOperation() {}
 
diff --git a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/angle_ir.h b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/angle_ir.h
index e0f1ce2ff80..0c35ba075b0 100755
--- a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/angle_ir.h
+++ b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/angle_ir.h
@@ -29,9 +29,8 @@
 
 namespace mindspore {
 namespace dataset {
-
 namespace audio {
-// Char arrays storing name of corresponding classes
+
 constexpr char kAngleOperation[] = "Angle";
 
 class AngleOperation : public TensorOperation {
diff --git a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/band_biquad_ir.cc b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/band_biquad_ir.cc
index a231ef2b943..2757a7bda1c 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/band_biquad_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/band_biquad_ir.cc
@@ -16,9 +16,8 @@
 
 #include "minddata/dataset/audio/ir/kernels/band_biquad_ir.h"
 
-#include "minddata/dataset/audio/kernels/band_biquad_op.h"
-
 #include "minddata/dataset/audio/ir/validators.h"
+#include "minddata/dataset/audio/kernels/band_biquad_op.h"
 
 namespace mindspore {
 namespace dataset {
@@ -30,7 +29,7 @@ BandBiquadOperation::BandBiquadOperation(int32_t sample_rate, float central_freq
 
 Status BandBiquadOperation::ValidateParams() {
   RETURN_IF_NOT_OK(ValidateScalar("BandBiquad", "Q", Q_, {0, 1.0}, true, false));
-  RETURN_IF_NOT_OK(CheckScalarNotZero("BandBIquad", "sample_rate", sample_rate_));
+  RETURN_IF_NOT_OK(ValidateScalarNotZero("BandBIquad", "sample_rate", sample_rate_));
   return Status::OK();
 }
 
diff --git a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/band_biquad_ir.h b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/band_biquad_ir.h
index 7f29f7e0aba..0d9e687302a 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/band_biquad_ir.h
+++ b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/band_biquad_ir.h
@@ -21,6 +21,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "include/api/status.h"
 #include "minddata/dataset/include/dataset/constants.h"
 #include "minddata/dataset/include/dataset/transforms.h"
@@ -30,7 +31,6 @@ namespace mindspore {
 namespace dataset {
 namespace audio {
 
-// Char arrays storing name of corresponding classes (in alphabetical order)
 constexpr char kBandBiquadOperation[] = "BandBiquad";
 
 class BandBiquadOperation : public TensorOperation {
diff --git a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/bandpass_biquad_ir.cc b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/bandpass_biquad_ir.cc
index a335f6500fd..062cfd2a43a 100755
--- a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/bandpass_biquad_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/bandpass_biquad_ir.cc
@@ -16,13 +16,13 @@
 
 #include "minddata/dataset/audio/ir/kernels/bandpass_biquad_ir.h"
 
-#include "minddata/dataset/audio/kernels/bandpass_biquad_op.h"
-
 #include "minddata/dataset/audio/ir/validators.h"
+#include "minddata/dataset/audio/kernels/bandpass_biquad_op.h"
 
 namespace mindspore {
 namespace dataset {
 namespace audio {
+
 // BandpassBiquadOperation
 BandpassBiquadOperation::BandpassBiquadOperation(int32_t sample_rate, float central_freq, float Q,
                                                  bool const_skirt_gain)
@@ -30,9 +30,10 @@ BandpassBiquadOperation::BandpassBiquadOperation(int32_t sample_rate, float cent
 
 Status BandpassBiquadOperation::ValidateParams() {
   RETURN_IF_NOT_OK(ValidateScalar("BandpassBiquad", "Q", Q_, {0, 1.0}, true, false));
-  RETURN_IF_NOT_OK(CheckScalarNotZero("BandpassBiquad", "sample_rate", sample_rate_));
+  RETURN_IF_NOT_OK(ValidateScalarNotZero("BandpassBiquad", "sample_rate", sample_rate_));
   return Status::OK();
 }
+
 std::shared_ptr<TensorOp> BandpassBiquadOperation::Build() {
   std::shared_ptr<BandpassBiquadOp> tensor_op =
     std::make_shared<BandpassBiquadOp>(sample_rate_, central_freq_, Q_, const_skirt_gain_);
diff --git a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/bandpass_biquad_ir.h b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/bandpass_biquad_ir.h
index 23cb220e9f1..309d0453833 100755
--- a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/bandpass_biquad_ir.h
+++ b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/bandpass_biquad_ir.h
@@ -21,6 +21,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "include/api/status.h"
 #include "minddata/dataset/include/dataset/constants.h"
 #include "minddata/dataset/include/dataset/transforms.h"
@@ -28,9 +29,8 @@
 
 namespace mindspore {
 namespace dataset {
-
 namespace audio {
-// Char arrays storing name of corresponding classes (in alphabetical order)
+
 constexpr char kBandpassBiquadOperation[] = "BandpassBiquad";
 
 class BandpassBiquadOperation : public TensorOperation {
diff --git a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/bandreject_biquad_ir.cc b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/bandreject_biquad_ir.cc
index 0688cb6b4d6..f66c65030da 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/bandreject_biquad_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/bandreject_biquad_ir.cc
@@ -15,19 +15,21 @@
  */
 
 #include "minddata/dataset/audio/ir/kernels/bandreject_biquad_ir.h"
-#include "minddata/dataset/audio/kernels/bandreject_biquad_op.h"
+
 #include "minddata/dataset/audio/ir/validators.h"
+#include "minddata/dataset/audio/kernels/bandreject_biquad_op.h"
 
 namespace mindspore {
 namespace dataset {
 namespace audio {
+
 // BandrejectBiquadOperation
 BandrejectBiquadOperation::BandrejectBiquadOperation(int32_t sample_rate, float central_freq, float Q)
     : sample_rate_(sample_rate), central_freq_(central_freq), Q_(Q) {}
 
 Status BandrejectBiquadOperation::ValidateParams() {
   RETURN_IF_NOT_OK(ValidateScalar("BandrejectBiquad", "Q", Q_, {0, 1.0}, true, false));
-  RETURN_IF_NOT_OK(CheckScalarNotZero("BandrejectBiquad", "sample_rate", sample_rate_));
+  RETURN_IF_NOT_OK(ValidateScalarNotZero("BandrejectBiquad", "sample_rate", sample_rate_));
   return Status::OK();
 }
 
diff --git a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/bandreject_biquad_ir.h b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/bandreject_biquad_ir.h
index 9a38185c4b8..28b75c60739 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/bandreject_biquad_ir.h
+++ b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/bandreject_biquad_ir.h
@@ -16,10 +16,12 @@
 
 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_IR_KERNELS_BANDREJECT_BIQUAD_IR_H_
 #define MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_IR_KERNELS_BANDREJECT_BIQUAD_IR_H_
+
 #include <memory>
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "include/api/status.h"
 #include "minddata/dataset/include/dataset/constants.h"
 #include "minddata/dataset/include/dataset/transforms.h"
@@ -27,10 +29,8 @@
 
 namespace mindspore {
 namespace dataset {
-
 namespace audio {
 
-// Char arrays storing name of corresponding classes (in alphabetical order)
 constexpr char kBandrejectBiquadOperation[] = "BandrejectBiquad";
 
 class BandrejectBiquadOperation : public TensorOperation {
diff --git a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/bass_biquad_ir.cc b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/bass_biquad_ir.cc
index f2f22aff0be..83766e50a6a 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/bass_biquad_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/bass_biquad_ir.cc
@@ -16,9 +16,8 @@
 
 #include "minddata/dataset/audio/ir/kernels/bass_biquad_ir.h"
 
-#include "minddata/dataset/audio/kernels/bass_biquad_op.h"
-
 #include "minddata/dataset/audio/ir/validators.h"
+#include "minddata/dataset/audio/kernels/bass_biquad_op.h"
 
 namespace mindspore {
 namespace dataset {
@@ -30,7 +29,7 @@ BassBiquadOperation::BassBiquadOperation(int32_t sample_rate, float gain, float
 
 Status BassBiquadOperation::ValidateParams() {
   RETURN_IF_NOT_OK(ValidateScalar("BassBiquad", "Q", Q_, {0, 1.0}, true, false));
-  RETURN_IF_NOT_OK(CheckScalarNotZero("BassBiquad", "sample_rate", sample_rate_));
+  RETURN_IF_NOT_OK(ValidateScalarNotZero("BassBiquad", "sample_rate", sample_rate_));
   return Status::OK();
 }
 
@@ -38,6 +37,7 @@ std::shared_ptr<TensorOp> BassBiquadOperation::Build() {
   std::shared_ptr<BassBiquadOp> tensor_op = std::make_shared<BassBiquadOp>(sample_rate_, gain_, central_freq_, Q_);
   return tensor_op;
 }
+
 Status BassBiquadOperation::to_json(nlohmann::json *out_json) {
   nlohmann::json args;
   args["sample_rate"] = sample_rate_;
diff --git a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/bass_biquad_ir.h b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/bass_biquad_ir.h
index 1fdd38b8a90..725000591a7 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/bass_biquad_ir.h
+++ b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/bass_biquad_ir.h
@@ -31,7 +31,6 @@ namespace mindspore {
 namespace dataset {
 namespace audio {
 
-// Char arrays storing name of corresponding classes (in alphabetical order)
 constexpr char kBassBiquadOperation[] = "BassBiquad";
 
 class BassBiquadOperation : public TensorOperation {
diff --git a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/time_stretch_ir.cc b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/time_stretch_ir.cc
index a78c4523705..4a94c4c6693 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/time_stretch_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/time_stretch_ir.cc
@@ -14,14 +14,15 @@
  * limitations under the License.
  */
 #include "minddata/dataset/audio/ir/kernels/time_stretch_ir.h"
-#include "minddata/dataset/audio/kernels/time_stretch_op.h"
 
 #include "minddata/dataset/audio/ir/validators.h"
+#include "minddata/dataset/audio/kernels/time_stretch_op.h"
+
 namespace mindspore {
 namespace dataset {
 namespace audio {
 
-// TimeStretch
+// TimeStretchOperation
 TimeStretchOperation::TimeStretchOperation(float hop_length, int n_freq, float fixed_rate)
     : hop_length_(hop_length), n_freq_(n_freq), fixed_rate_(fixed_rate) {}
 
@@ -31,10 +32,10 @@ std::string TimeStretchOperation::Name() const { return kTimeStretchOperation; }
 
 Status TimeStretchOperation::ValidateParams() {
   //  param check
-  RETURN_IF_NOT_OK(CheckFloatScalarPositive("TimeStretch", "hop_length", hop_length_));
-  RETURN_IF_NOT_OK(CheckIntScalarPositive("TimeStretch", "n_freq", n_freq_));
-  RETURN_IF_NOT_OK(CheckFloatScalarNotNan("TimeStretch", "fixed_rate", fixed_rate_));
-  RETURN_IF_NOT_OK(CheckFloatScalarPositive("TimeStretch", "fixed_rate", fixed_rate_));
+  RETURN_IF_NOT_OK(ValidateFloatScalarPositive("TimeStretch", "hop_length", hop_length_));
+  RETURN_IF_NOT_OK(ValidateIntScalarPositive("TimeStretch", "n_freq", n_freq_));
+  RETURN_IF_NOT_OK(ValidateFloatScalarNotNan("TimeStretch", "fixed_rate", fixed_rate_));
+  RETURN_IF_NOT_OK(ValidateFloatScalarPositive("TimeStretch", "fixed_rate", fixed_rate_));
   return Status::OK();
 }
 
diff --git a/mindspore/ccsrc/minddata/dataset/audio/ir/validators.cc b/mindspore/ccsrc/minddata/dataset/audio/ir/validators.cc
index 7700298c1a7..e3f8c127b54 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/ir/validators.cc
+++ b/mindspore/ccsrc/minddata/dataset/audio/ir/validators.cc
@@ -17,82 +17,20 @@
 
 namespace mindspore {
 namespace dataset {
-/* ####################################### Validator Functions ############################################ */
-Status CheckFloatScalarPositive(const std::string &op_name, const std::string &scalar_name, float scalar) {
-  RETURN_IF_NOT_OK(CheckScalar(op_name, scalar_name, scalar, {0}, true));
+
+Status ValidateIntScalarNonNegative(const std::string &op_name, const std::string &scalar_name, int32_t scalar) {
+  RETURN_IF_NOT_OK(ValidateScalar(op_name, scalar_name, scalar, {0}, false));
   return Status::OK();
 }
 
-Status CheckFloatScalarNotNan(const std::string &op_name, const std::string &scalar_name, float scalar) {
+Status ValidateFloatScalarNotNan(const std::string &op_name, const std::string &scalar_name, float scalar) {
   if (std::isnan(scalar)) {
-    std::string err_msg = op_name + ":" + scalar_name + " should be specified, got: Nan.";
+    std::string err_msg = op_name + ": " + scalar_name + " should be specified, got: Nan";
     MS_LOG(ERROR) << err_msg;
     return Status(StatusCode::kMDSyntaxError, __LINE__, __FILE__, err_msg);
   }
   return Status::OK();
 }
 
-Status CheckFloatScalarNonNegative(const std::string &op_name, const std::string &scalar_name, float scalar) {
-  RETURN_IF_NOT_OK(CheckScalar(op_name, scalar_name, scalar, {0}, false));
-  return Status::OK();
-}
-
-Status CheckIntScalarPositive(const std::string &op_name, const std::string &scalar_name, int32_t scalar) {
-  RETURN_IF_NOT_OK(CheckScalar(op_name, scalar_name, scalar, {0}, true));
-  return Status::OK();
-}
-
-Status CheckStringScalarInList(const std::string &op_name, const std::string &scalar_name, const std::string &scalar,
-                               const std::vector<std::string> &str_vec) {
-  auto ret = std::find(str_vec.begin(), str_vec.end(), scalar);
-  if (ret == str_vec.end()) {
-    std::string interval_description = "[";
-    for (int m = 0; m < str_vec.size(); m++) {
-      std::string word = str_vec[m];
-      interval_description = interval_description + word;
-      if (m != str_vec.size() - 1) interval_description = interval_description + ", ";
-    }
-    interval_description = interval_description + "]";
-
-    std::string err_msg = op_name + ": " + scalar_name + " must be one of " + interval_description + ", got: " + scalar;
-    MS_LOG(ERROR) << err_msg;
-    return Status(StatusCode::kMDSyntaxError, __LINE__, __FILE__, err_msg);
-  }
-  return Status::OK();
-}
-
-template <typename T>
-Status CheckScalar(const std::string &op_name, const std::string &scalar_name, const T scalar,
-                   const std::vector<T> &range, bool left_open_interval, bool right_open_interval) {
-  if (range.empty() || range.size() > 2) {
-    std::string err_msg = "Range check expecting size 1 or 2, but got: " + std::to_string(range.size());
-    MS_LOG(ERROR) << err_msg;
-    return Status(StatusCode::kMDSyntaxError, __LINE__, __FILE__, err_msg);
-  }
-  if ((left_open_interval && scalar <= range[0]) || (!left_open_interval && scalar < range[0])) {
-    std::string interval_description = left_open_interval ? " greater than " : " greater than or equal to ";
-    std::string err_msg = op_name + ":" + scalar_name + " must be" + interval_description + std::to_string(range[0]) +
-                          ", got: " + std::to_string(scalar);
-    MS_LOG(ERROR) << err_msg;
-    return Status(StatusCode::kMDSyntaxError, __LINE__, __FILE__, err_msg);
-  }
-  if (range.size() == 2) {
-    if ((right_open_interval && scalar >= range[1]) || (!right_open_interval && scalar > range[1])) {
-      std::string left_bracket = left_open_interval ? "(" : "[";
-      std::string right_bracket = right_open_interval ? ")" : "]";
-      std::string err_msg = op_name + ":" + scalar_name + " is out of range " + left_bracket +
-                            std::to_string(range[0]) + ", " + std::to_string(range[1]) + right_bracket +
-                            ", got: " + std::to_string(scalar);
-      MS_LOG(ERROR) << err_msg;
-      return Status(StatusCode::kMDSyntaxError, __LINE__, __FILE__, err_msg);
-    }
-  }
-  return Status::OK();
-}
-template Status CheckScalar(const std::string &op_name, const std::string &scalar_name, const float scalar,
-                            const std::vector<float> &range, bool left_open_interval, bool right_open_interval);
-
-template Status CheckScalar(const std::string &op_name, const std::string &scalar_name, const int32_t scalar,
-                            const std::vector<int32_t> &range, bool left_open_interval, bool right_open_interval);
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/audio/ir/validators.h b/mindspore/ccsrc/minddata/dataset/audio/ir/validators.h
index 7cfa0bfa0be..2fa2978c310 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/ir/validators.h
+++ b/mindspore/ccsrc/minddata/dataset/audio/ir/validators.h
@@ -19,49 +19,32 @@
 
 #include <string>
 #include <vector>
-#include "minddata/dataset/kernels/ir/validators.h"
 
 #include "minddata/dataset/core/tensor.h"
 #include "minddata/dataset/kernels/ir/tensor_operation.h"
+#include "minddata/dataset/kernels/ir/validators.h"
 #include "minddata/dataset/util/status.h"
 
 namespace mindspore {
 namespace dataset {
 
-// Helper function to non-nan float scalar
-Status CheckFloatScalarNotNan(const std::string &op_name, const std::string &scalar_name, float scalar);
-
-// Helper function to positive float scalar
-Status CheckFloatScalarPositive(const std::string &op_name, const std::string &scalar_name, float scalar);
-
 // Helper function to positive int scalar
-Status CheckIntScalarPositive(const std::string &op_name, const std::string &scalar_name, int32_t scalar);
+Status ValidateIntScalarNonNegative(const std::string &op_name, const std::string &scalar_name, int32_t scalar);
+
+// Helper function to non-nan float scalar
+Status ValidateFloatScalarNotNan(const std::string &op_name, const std::string &scalar_name, float scalar);
 
 template <typename T>
 // Helper function to check scalar is not equal to zero
-Status CheckScalarNotZero(const std::string &op_name, const std::string &scalar_name, const T scalar) {
+Status ValidateScalarNotZero(const std::string &op_name, const std::string &scalar_name, const T scalar) {
   if (scalar == 0) {
-    std::string err_msg = op_name + ":" + scalar_name + " can't be 0" + ", got: " + std::to_string(scalar);
+    std::string err_msg = op_name + ": " + scalar_name + " can't be zero, got: " + std::to_string(scalar);
     MS_LOG(ERROR) << err_msg;
     return Status(StatusCode::kMDSyntaxError, __LINE__, __FILE__, err_msg);
   }
   return Status::OK();
 }
 
-// Helper function to positive float scalar
-Status CheckFloatScalarPositive(const std::string &op_name, const std::string &scalar_name, float scalar);
-
-// Helper function to non-negative float scalar
-Status CheckFloatScalarNonNegative(const std::string &op_name, const std::string &scalar_name, float scalar);
-
-// Helper function to check string scalar
-Status CheckStringScalarInList(const std::string &op_name, const std::string &scalar_name, const std::string &scalar,
-                               const std::vector<std::string> &str_vec);
-
-// Helper function to validate scalar
-template <typename T>
-Status CheckScalar(const std::string &op_name, const std::string &scalar_name, const T scalar,
-                   const std::vector<T> &range, bool left_open_interval = false, bool right_open_interval = false);
 }  // namespace dataset
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_ADUIO_IR_VALIDATORS_H_
diff --git a/mindspore/ccsrc/minddata/dataset/audio/kernels/CMakeLists.txt b/mindspore/ccsrc/minddata/dataset/audio/kernels/CMakeLists.txt
index c6517814031..08a92eee232 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/kernels/CMakeLists.txt
+++ b/mindspore/ccsrc/minddata/dataset/audio/kernels/CMakeLists.txt
@@ -10,5 +10,8 @@ add_library(audio-kernels OBJECT
         bandpass_biquad_op.cc
         bandreject_biquad_op.cc
         bass_biquad_op.cc
+        complex_norm_op.cc
+        frequency_masking_op.cc
+        time_masking_op.cc
         time_stretch_op.cc
         )
diff --git a/mindspore/ccsrc/minddata/dataset/audio/kernels/allpass_biquad_op.cc b/mindspore/ccsrc/minddata/dataset/audio/kernels/allpass_biquad_op.cc
index da2f88964af..b1b4625e066 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/kernels/allpass_biquad_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/audio/kernels/allpass_biquad_op.cc
@@ -20,14 +20,15 @@
 
 namespace mindspore {
 namespace dataset {
+
 Status AllpassBiquadOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
   IO_CHECK(input, output);
   TensorShape input_shape = input->shape();
-  CHECK_FAIL_RETURN_UNEXPECTED(input_shape.Size() > 0, "AllpassBiquad: input dimension should be greater than 0.");
-  CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType(DataType::DE_FLOAT32) ||
-                                 input->type() == DataType(DataType::DE_FLOAT16) ||
-                                 input->type() == DataType(DataType::DE_FLOAT64),
-                               "AllpassBiquad: input type should be float, but got " + input->type().ToString());
+  CHECK_FAIL_RETURN_UNEXPECTED(input_shape.Size() > 0, "AllpassBiquad: input tensor is not in shape of <..., time>.");
+  CHECK_FAIL_RETURN_UNEXPECTED(
+    input->type() == DataType(DataType::DE_FLOAT32) || input->type() == DataType(DataType::DE_FLOAT16) ||
+      input->type() == DataType(DataType::DE_FLOAT64),
+    "AllpassBiquad: input tensor type should be float, but got: " + input->type().ToString());
   double w0 = 2 * PI * central_freq_ / sample_rate_;
   double alpha = sin(w0) / 2 / Q_;
   double b0 = 1 - alpha;
@@ -36,15 +37,16 @@ Status AllpassBiquadOp::Compute(const std::shared_ptr<Tensor> &input, std::share
   double a0 = b2;
   double a1 = -2 * cos(w0);
   double a2 = 1 - alpha;
-  if (input->type() == DataType(DataType::DE_FLOAT32))
+  if (input->type() == DataType(DataType::DE_FLOAT32)) {
     return Biquad(input, output, static_cast<float>(b0), static_cast<float>(b1), static_cast<float>(b2),
                   static_cast<float>(a0), static_cast<float>(a1), static_cast<float>(a2));
-  else if (input->type() == DataType(DataType::DE_FLOAT64))
+  } else if (input->type() == DataType(DataType::DE_FLOAT64)) {
     return Biquad(input, output, static_cast<double>(b0), static_cast<double>(b1), static_cast<double>(b2),
                   static_cast<double>(a0), static_cast<double>(a1), static_cast<double>(a2));
-  else
+  } else {
     return Biquad(input, output, static_cast<float16>(b0), static_cast<float16>(b1), static_cast<float16>(b2),
                   static_cast<float16>(a0), static_cast<float16>(a1), static_cast<float16>(a2));
+  }
 }
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/audio/kernels/allpass_biquad_op.h b/mindspore/ccsrc/minddata/dataset/audio/kernels/allpass_biquad_op.h
index d4e7e17b95a..26c7b729f0a 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/kernels/allpass_biquad_op.h
+++ b/mindspore/ccsrc/minddata/dataset/audio/kernels/allpass_biquad_op.h
@@ -26,6 +26,7 @@
 
 namespace mindspore {
 namespace dataset {
+
 class AllpassBiquadOp : public TensorOp {
  public:
   AllpassBiquadOp(int32_t sample_rate, float central_freq, float Q)
diff --git a/mindspore/ccsrc/minddata/dataset/audio/kernels/amplitude_to_db_op.cc b/mindspore/ccsrc/minddata/dataset/audio/kernels/amplitude_to_db_op.cc
index dbebec42d39..8a202f497c4 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/kernels/amplitude_to_db_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/audio/kernels/amplitude_to_db_op.cc
@@ -13,9 +13,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <limits>
-
 #include "minddata/dataset/audio/kernels/amplitude_to_db_op.h"
+
 #include "minddata/dataset/audio/kernels/audio_utils.h"
 #include "minddata/dataset/kernels/data/data_utils.h"
 #include "minddata/dataset/util/status.h"
@@ -26,7 +25,7 @@ namespace dataset {
 Status AmplitudeToDBOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
   IO_CHECK(input, output);
   if (input->shape().Rank() < 2) {
-    std::string err_msg = "AmplitudeToDB: input tensor shape should be <..., freq, time>";
+    std::string err_msg = "AmplitudeToDB: input tensor is not in shape of <..., freq, time>.";
     MS_LOG(ERROR) << err_msg;
     RETURN_STATUS_SYNTAX_ERROR(err_msg);
   }
@@ -40,12 +39,12 @@ Status AmplitudeToDBOp::Compute(const std::shared_ptr<Tensor> &input, std::share
 
   // typecast
   CHECK_FAIL_RETURN_UNEXPECTED(input->type() != DataType::DE_STRING,
-                               "AmplitudeToDB: input type should be float, but got string.");
+                               "AmplitudeToDB: input tensor type should be float, but got: string.");
   if (input->type() != DataType::DE_FLOAT64) {
-    CHECK_FAIL_RETURN_UNEXPECTED(TypeCast(input, &input_tensor, DataType(DataType::DE_FLOAT32)),
-                                 "AmplitudeToDB: input type should be float, but got " + input->type().ToString());
+    CHECK_FAIL_RETURN_UNEXPECTED(
+      TypeCast(input, &input_tensor, DataType(DataType::DE_FLOAT32)),
+      "AmplitudeToDB: input tensor type should be float, but got: " + input->type().ToString());
     return AmplitudeToDB<float>(input_tensor, output, multiplier, amin, db_multiplier, top_db);
-
   } else {
     input_tensor = input;
     return AmplitudeToDB<double>(input_tensor, output, multiplier, amin, db_multiplier, top_db);
diff --git a/mindspore/ccsrc/minddata/dataset/audio/kernels/amplitude_to_db_op.h b/mindspore/ccsrc/minddata/dataset/audio/kernels/amplitude_to_db_op.h
index bd84e888f9e..9aa2672878e 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/kernels/amplitude_to_db_op.h
+++ b/mindspore/ccsrc/minddata/dataset/audio/kernels/amplitude_to_db_op.h
@@ -29,6 +29,7 @@
 
 namespace mindspore {
 namespace dataset {
+
 class AmplitudeToDBOp : public TensorOp {
  public:
   AmplitudeToDBOp(ScaleType stype, float ref_value, float amin, float top_db)
diff --git a/mindspore/ccsrc/minddata/dataset/audio/kernels/angle_op.cc b/mindspore/ccsrc/minddata/dataset/audio/kernels/angle_op.cc
index 54827c934ee..9dc313f606a 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/kernels/angle_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/audio/kernels/angle_op.cc
@@ -25,8 +25,10 @@ namespace dataset {
 Status AngleOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
   IO_CHECK(input, output);
   // if If the last dimension is not 2, then it's not a complex number
-  CHECK_FAIL_RETURN_UNEXPECTED(input->shape()[-1] == 2, "Angle: The input is not several legal complex numbers");
-  CHECK_FAIL_RETURN_UNEXPECTED(input->type().IsNumeric(), "Angle: The input type should be numbers");
+  CHECK_FAIL_RETURN_UNEXPECTED(input->shape()[-1] == 2, "Angle: input tensor is not in shape of <..., complex=2>.");
+  CHECK_FAIL_RETURN_UNEXPECTED(
+    input->type().IsNumeric(),
+    "Angle: input tensor type should be int, float or double, but got: " + input->type().ToString());
   if (input->type() == DataType(DataType::DE_FLOAT64)) {
     return Angle<double>(input, output);
   } else {
diff --git a/mindspore/ccsrc/minddata/dataset/audio/kernels/angle_op.h b/mindspore/ccsrc/minddata/dataset/audio/kernels/angle_op.h
index aff0ab44a4d..501981b2138 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/kernels/angle_op.h
+++ b/mindspore/ccsrc/minddata/dataset/audio/kernels/angle_op.h
@@ -26,6 +26,7 @@
 
 namespace mindspore {
 namespace dataset {
+
 class AngleOp : public TensorOp {
  public:
   // Convert complex numbers to angles
diff --git a/mindspore/ccsrc/minddata/dataset/audio/kernels/audio_utils.cc b/mindspore/ccsrc/minddata/dataset/audio/kernels/audio_utils.cc
index 701a4ca6dde..d225eabd48b 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/kernels/audio_utils.cc
+++ b/mindspore/ccsrc/minddata/dataset/audio/kernels/audio_utils.cc
@@ -16,62 +16,27 @@
 
 #include "minddata/dataset/audio/kernels/audio_utils.h"
 
+#include <complex>
+
+#include "mindspore/core/base/float16.h"
+#include "minddata/dataset/core/type_id.h"
+#include "minddata/dataset/kernels/data/data_utils.h"
+#include "minddata/dataset/util/random.h"
+#include "minddata/dataset/util/status.h"
+
 namespace mindspore {
 namespace dataset {
 
-template <typename T>
-Status AmplitudeToDB(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, T multiplier, T amin,
-                     T db_multiplier, T top_db) {
-  TensorShape input_shape = input->shape();
-  TensorShape to_shape = input_shape.Rank() == 2
-                           ? TensorShape({1, 1, input_shape[-2], input_shape[-1]})
-                           : TensorShape({input->Size() / (input_shape[-3] * input_shape[-2] * input_shape[-1]),
-                                          input_shape[-3], input_shape[-2], input_shape[-1]});
-  RETURN_IF_NOT_OK(input->Reshape(to_shape));
-
-  std::vector<T> max_val;
-  int step = to_shape[-3] * input_shape[-2] * input_shape[-1];
-  int cnt = 0;
-  T temp_max = std::numeric_limits<T>::lowest();
-  for (auto itr = input->begin<T>(); itr != input->end<T>(); itr++) {
-    // do clamp
-    *itr = *itr < amin ? log10(amin) * multiplier : log10(*itr) * multiplier;
-    *itr -= multiplier * db_multiplier;
-    // calculate max by axis
-    cnt++;
-    if ((*itr) > temp_max) temp_max = *itr;
-    if (cnt % step == 0) {
-      max_val.push_back(temp_max);
-      temp_max = std::numeric_limits<T>::lowest();
-    }
-  }
-
-  if (!std::isnan(top_db)) {
-    int ind = 0;
-    for (auto itr = input->begin<T>(); itr != input->end<T>(); itr++, ind++) {
-      float lower_bound = max_val[ind / step] - top_db;
-      *itr = std::max((*itr), static_cast<T>(lower_bound));
-    }
-  }
-  RETURN_IF_NOT_OK(input->Reshape(input_shape));
-  *output = input;
-  return Status::OK();
-}
-template Status AmplitudeToDB<float>(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output,
-                                     float multiplier, float amin, float db_multiplier, float top_db);
-template Status AmplitudeToDB<double>(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output,
-                                      double multiplier, double amin, double db_multiplier, double top_db);
-
-/// \brief Generate linearly spaced vector
+/// \brief Generate linearly spaced vector.
 /// \param[in] start - Value of the startpoint.
 /// \param[in] end - Value of the endpoint.
 /// \param[in] n - N points in the output tensor.
 /// \param[out] output - Tensor has n points with linearly space. The spacing between the points is (end-start)/(n-1).
-/// \return Status return code
+/// \return Status return code.
 template <typename T>
-Status Linespace(std::shared_ptr<Tensor> *output, T start, T end, int n) {
+Status Linspace(std::shared_ptr<Tensor> *output, T start, T end, int n) {
   if (start > end) {
-    std::string err = "Linespace: input param end must be greater than start.";
+    std::string err = "Linspace: input param end must be greater than start.";
     RETURN_STATUS_UNEXPECTED(err);
   }
   n = std::isnan(n) ? 100 : n;
@@ -89,10 +54,10 @@ Status Linespace(std::shared_ptr<Tensor> *output, T start, T end, int n) {
   return Status::OK();
 }
 
-/// \brief Calculate complex tensor angle
+/// \brief Calculate complex tensor angle.
 /// \param[in] input - Input tensor, must be complex, <channel, freq, time, complex=2>.
 /// \param[out] output - Complex tensor angle.
-/// \return Status return code
+/// \return Status return code.
 template <typename T>
 Status ComplexAngle(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
   // check complex
@@ -121,10 +86,10 @@ Status ComplexAngle(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor
   return Status::OK();
 }
 
-/// \brief Calculate complex tensor abs
+/// \brief Calculate complex tensor abs.
 /// \param[in] input - Input tensor, must be complex, <channel, freq, time, complex=2>.
 /// \param[out] output - Complex tensor abs.
-/// \return Status return code
+/// \return Status return code.
 template <typename T>
 Status ComplexAbs(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
   // check complex
@@ -150,17 +115,17 @@ Status ComplexAbs(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor>
   return Status::OK();
 }
 
-/// \brief Reconstruct complex tensor from norm and angle
+/// \brief Reconstruct complex tensor from norm and angle.
 /// \param[in] abs - The absolute value of the complex tensor.
 /// \param[in] angle - The angle of the complex tensor.
 /// \param[out] output - Complex tensor, <channel, freq, time, complex=2>.
-/// \return Status return code
+/// \return Status return code.
 template <typename T>
 Status Polar(const std::shared_ptr<Tensor> &abs, const std::shared_ptr<Tensor> &angle,
              std::shared_ptr<Tensor> *output) {
   // check shape
   if (abs->shape() != angle->shape()) {
-    std::string err_msg = "Polar: input shape of abs and angle must be same.";
+    std::string err_msg = "Polar: input tensor shape of abs and angle must be the same.";
     MS_LOG(ERROR) << err_msg;
     RETURN_STATUS_SYNTAX_ERROR(err_msg);
   }
@@ -183,12 +148,12 @@ Status Polar(const std::shared_ptr<Tensor> &abs, const std::shared_ptr<Tensor> &
   return Status::OK();
 }
 
-/// \brief Pad complex tensor
+/// \brief Pad complex tensor.
 /// \param[in] input - The complex tensor.
 /// \param[in] length - The length of padding.
 /// \param[in] dim - The dim index for padding.
 /// \param[out] output - Complex tensor, <channel, freq, time, complex=2>.
-/// \return Status return code
+/// \return Status return code.
 template <typename T>
 Status PadComplexTensor(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int length, int dim) {
   TensorShape input_shape = input->shape();
@@ -216,13 +181,13 @@ Status PadComplexTensor(const std::shared_ptr<Tensor> &input, std::shared_ptr<Te
   return Status::OK();
 }
 
-/// \brief Calculate phase
+/// \brief Calculate phase.
 /// \param[in] angle_0 - The angle.
 /// \param[in] angle_1 - The angle.
 /// \param[in] phase_advance - The phase advance.
 /// \param[in] phase_time0 - The phase at time 0.
 /// \param[out] output - Phase tensor.
-/// \return Status return code
+/// \return Status return code.
 template <typename T>
 Status Phase(const std::shared_ptr<Tensor> &angle_0, const std::shared_ptr<Tensor> &angle_1,
              const std::shared_ptr<Tensor> &phase_advance, const std::shared_ptr<Tensor> &phase_time0,
@@ -267,12 +232,12 @@ Status Phase(const std::shared_ptr<Tensor> &angle_0, const std::shared_ptr<Tenso
   return Status::OK();
 }
 
-/// \brief Calculate magnitude
+/// \brief Calculate magnitude.
 /// \param[in] alphas - The alphas.
 /// \param[in] abs_0 - The norm.
 /// \param[in] abs_1 - The norm.
 /// \param[out] output - Magnitude tensor.
-/// \return Status return code
+/// \return Status return code.
 template <typename T>
 Status Mag(const std::shared_ptr<Tensor> &abs_0, const std::shared_ptr<Tensor> &abs_1, std::shared_ptr<Tensor> *output,
            const std::vector<T> &alphas) {
@@ -367,19 +332,178 @@ Status TimeStretch(std::shared_ptr<Tensor> input, std::shared_ptr<Tensor> *outpu
   std::shared_ptr<Tensor> phase_advance;
   switch (input->type().value()) {
     case DataType::DE_FLOAT32:
-      RETURN_IF_NOT_OK(Linespace<float>(&phase_advance, 0, PI * hop_length, n_freq));
+      RETURN_IF_NOT_OK(Linspace<float>(&phase_advance, 0, PI * hop_length, n_freq));
       RETURN_IF_NOT_OK(TimeStretch<float>(input, output, rate, phase_advance));
       break;
     case DataType::DE_FLOAT64:
-      RETURN_IF_NOT_OK(Linespace<double>(&phase_advance, 0, PI * hop_length, n_freq));
+      RETURN_IF_NOT_OK(Linspace<double>(&phase_advance, 0, PI * hop_length, n_freq));
       RETURN_IF_NOT_OK(TimeStretch<double>(input, output, rate, phase_advance));
       break;
     default:
-      RETURN_STATUS_UNEXPECTED(
-        "TimeStretch: unsupported type, currently supported types include "
-        "[float, double].");
+      RETURN_STATUS_UNEXPECTED("TimeStretch: input tensor type should be float or double, but got: " +
+                               input->type().ToString());
   }
   return Status::OK();
 }
+
+Status RandomMaskAlongAxis(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int64_t mask_param,
+                           double mask_value, int axis, std::mt19937 rnd) {
+  std::uniform_int_distribution<int64_t> mask_width_value(0, mask_param);
+  TensorShape input_shape = input->shape();
+  int64_t mask_dim_size = axis == 1 ? input_shape[-2] : input_shape[-1];
+  int64_t mask_width = mask_width_value(rnd);
+  std::uniform_int_distribution<int64_t> min_freq_value(0, mask_dim_size - mask_width);
+  int64_t mask_start = min_freq_value(rnd);
+
+  return MaskAlongAxis(input, output, mask_width, mask_start, mask_value, axis);
+}
+
+Status MaskAlongAxis(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int64_t mask_width,
+                     int64_t mask_start, double mask_value, int axis) {
+  if (axis != 2 && axis != 1) {
+    RETURN_STATUS_UNEXPECTED("MaskAlongAxis: only support Time and Frequency masking, axis should be 1 or 2.");
+  }
+  TensorShape input_shape = input->shape();
+  // squeeze input
+  TensorShape squeeze_shape = TensorShape({-1, input_shape[-2], input_shape[-1]});
+  input->Reshape(squeeze_shape);
+
+  int check_dim_ind = (axis == 1) ? -2 : -1;
+  CHECK_FAIL_RETURN_UNEXPECTED(0 <= mask_start && mask_start <= input_shape[check_dim_ind],
+                               "MaskAlongAxis: mask_start should be less than the length of chosen dimension.");
+  CHECK_FAIL_RETURN_UNEXPECTED(mask_start + mask_width <= input_shape[check_dim_ind],
+                               "MaskAlongAxis: the sum of mask_start and mask_width is out of bounds.");
+
+  int64_t cell_size = input->type().SizeInBytes();
+
+  if (axis == 1) {
+    // freq
+    for (int ind = 0; ind < input->Size() / input_shape[-2] * mask_width; ind++) {
+      int block_num = ind / (mask_width * input_shape[-1]);
+      auto start_pos = ind % (mask_width * input_shape[-1]) + mask_start * input_shape[-1] +
+                       input_shape[-1] * input_shape[-2] * block_num;
+      auto start_mem_pos = const_cast<uchar *>(input->GetBuffer() + start_pos * cell_size);
+      if (input->type() != DataType::DE_FLOAT64) {
+        // tensor float 32
+        auto mask_val = static_cast<float>(mask_value);
+        CHECK_FAIL_RETURN_UNEXPECTED(memcpy_s(start_mem_pos, cell_size, &mask_val, cell_size) == 0,
+                                     "MaskAlongAxis: mask failed, memory copy error.");
+      } else {
+        // tensor float 64
+        CHECK_FAIL_RETURN_UNEXPECTED(memcpy_s(start_mem_pos, cell_size, &mask_value, cell_size) == 0,
+                                     "MaskAlongAxis: mask failed, memory copy error.");
+      }
+    }
+  } else {
+    // time
+    for (int ind = 0; ind < input->Size() / input_shape[-1] * mask_width; ind++) {
+      int row_num = ind / mask_width;
+      auto start_pos = ind % mask_width + mask_start + input_shape[-1] * row_num;
+      auto start_mem_pos = const_cast<uchar *>(input->GetBuffer() + start_pos * cell_size);
+      if (input->type() != DataType::DE_FLOAT64) {
+        // tensor float 32
+        auto mask_val = static_cast<float>(mask_value);
+        CHECK_FAIL_RETURN_UNEXPECTED(memcpy_s(start_mem_pos, cell_size, &mask_val, cell_size) == 0,
+                                     "MaskAlongAxis: mask failed, memory copy error.");
+      } else {
+        // tensor float 64
+        CHECK_FAIL_RETURN_UNEXPECTED(memcpy_s(start_mem_pos, cell_size, &mask_value, cell_size) == 0,
+                                     "MaskAlongAxis: mask failed, memory copy error.");
+      }
+    }
+  }
+  // unsqueeze input
+  input->Reshape(input_shape);
+  *output = input;
+  return Status::OK();
+}
+
+template <typename T>
+Status Norm(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float power) {
+  // calcutate total complex num
+  int32_t dim = input->shape().Size();
+  int32_t total_num = 1;
+  for (int32_t i = 0; i < (dim - 1); i++) {
+    total_num *= (input->shape()[i]);
+  }
+
+  // calculate the output dimension
+  auto input_size = input->shape().AsVector();
+  int32_t dim_back = input_size.back();
+  CHECK_FAIL_RETURN_UNEXPECTED(
+    dim_back == 2, "ComplexNorm: expect complex input of shape <..., 2>, but got: " + std::to_string(dim_back));
+  input_size.pop_back();
+  int32_t complex_num = input_size.back();
+  int32_t iter_num = total_num / complex_num;
+  // TensorShape out_put_shape{}
+  input_size.pop_back();
+  input_size.emplace_back(2);
+  TensorShape out_shape = TensorShape(input_size);
+  RETURN_IF_NOT_OK(Tensor::CreateEmpty(out_shape, input->type(), output));
+
+  // slice input into real tensor and imaginary tensor
+  std::shared_ptr<Tensor> re_tensor;
+  std::shared_ptr<Tensor> im_tensor;
+  RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({total_num, 1}), input->type(), &re_tensor));
+  RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({total_num, 1}), input->type(), &im_tensor));
+  std::vector<SliceOption> slice_re = {};
+  std::vector<SliceOption> slice_im = {};
+  for (int32_t i = 0; i < (dim - 1); i++) {
+    slice_re.emplace_back(SliceOption(true));
+    slice_im.emplace_back(SliceOption(true));
+  }
+  slice_re.emplace_back(SliceOption(std::vector<dsize_t>{0}));
+  slice_im.emplace_back(SliceOption(std::vector<dsize_t>{1}));
+  RETURN_IF_NOT_OK(input->Slice(&re_tensor, slice_re));
+  RETURN_IF_NOT_OK(input->Slice(&im_tensor, slice_im));
+
+  // calculate norm, using: .pow(2.).sum(-1).pow(0.5 * power)
+  auto itr_out = (*output)->begin<T>();
+  auto itr_re = re_tensor->begin<T>();
+  auto itr_im = im_tensor->begin<T>();
+  for (int32_t i = 0; i < iter_num; i++) {
+    double re = 0.0;
+    double im = 0.0;
+    for (int32_t j = complex_num * i; j < complex_num * (i + 1); j++) {
+      double a = static_cast<double>(*itr_re);
+      double b = static_cast<double>(*itr_im);
+      re = re + (pow(a, 2) - pow(b, 2));
+      im = im + (2 * a * b);
+      ++itr_re;
+      ++itr_im;
+    }
+    std::complex<double> comp(re, im);
+    comp = std::pow(comp, (0.5 * power));
+    *itr_out = static_cast<T>(comp.real());
+    ++itr_out;
+    *itr_out = static_cast<T>(comp.imag());
+    ++itr_out;
+  }
+  RETURN_IF_NOT_OK((*output)->Reshape(out_shape));
+  return Status::OK();
+}
+
+Status ComplexNorm(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float power) {
+  try {
+    if (input->type().value() >= DataType::DE_INT8 && input->type().value() <= DataType::DE_FLOAT16) {
+      // convert the data type to float
+      std::shared_ptr<Tensor> input_tensor;
+      RETURN_IF_NOT_OK(Tensor::CreateEmpty(input->shape(), DataType(DataType::DE_FLOAT32), &input_tensor));
+      RETURN_IF_NOT_OK(TypeCast(input, &input_tensor, DataType(DataType::DE_FLOAT32)));
+
+      Norm<float>(input_tensor, output, power);
+    } else if (input->type().value() == DataType::DE_FLOAT32) {
+      Norm<float>(input, output, power);
+    } else if (input->type().value() == DataType::DE_FLOAT64) {
+      Norm<double>(input, output, power);
+    } else {
+      RETURN_STATUS_UNEXPECTED("ComplexNorm: input tensor type should be int, float or double, but got: " +
+                               input->type().ToString());
+    }
+    return Status::OK();
+  } catch (std::runtime_error &e) {
+    RETURN_STATUS_UNEXPECTED("ComplexNorm: " + std::string(e.what()));
+  }
+}
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/audio/kernels/audio_utils.h b/mindspore/ccsrc/minddata/dataset/audio/kernels/audio_utils.h
index d66340fbf76..932e7e03dc2 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/kernels/audio_utils.h
+++ b/mindspore/ccsrc/minddata/dataset/audio/kernels/audio_utils.h
@@ -13,7 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_AUDIO_UTILS_H_
 #define MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_AUDIO_UTILS_H_
 
@@ -21,6 +20,7 @@
 #include <cmath>
 #include <limits>
 #include <memory>
+#include <random>
 #include <string>
 #include <vector>
 
@@ -29,21 +29,58 @@
 #include "minddata/dataset/util/status.h"
 
 constexpr double PI = 3.141592653589793;
+
 namespace mindspore {
 namespace dataset {
+
 /// \brief Turn a tensor from the power/amplitude scale to the decibel scale.
-/// \param input/output: Tensor of shape <...,freq,time>
-/// \param multiplier: power - 10, amplitude - 20
-/// \param amin: lower bound
-/// \param db_multiplier: multiplier for decibels
-/// \param top_db: the lower bound for decibels cut-off
-/// \return Status code
+/// \param input/output: Tensor of shape <..., freq, time>.
+/// \param multiplier: power - 10, amplitude - 20.
+/// \param amin: lower bound.
+/// \param db_multiplier: multiplier for decibels.
+/// \param top_db: the lower bound for decibels cut-off.
+/// \return Status code.
 template <typename T>
 Status AmplitudeToDB(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, T multiplier, T amin,
-                     T db_multiplier, T top_db);
+                     T db_multiplier, T top_db) {
+  TensorShape input_shape = input->shape();
+  TensorShape to_shape = input_shape.Rank() == 2
+                           ? TensorShape({1, 1, input_shape[-2], input_shape[-1]})
+                           : TensorShape({input->Size() / (input_shape[-3] * input_shape[-2] * input_shape[-1]),
+                                          input_shape[-3], input_shape[-2], input_shape[-1]});
+  RETURN_IF_NOT_OK(input->Reshape(to_shape));
 
-/// \brief Calculate the angles of the complex numbers
-/// \param input/output: Tensor of shape <...,time>
+  std::vector<T> max_val;
+  int step = to_shape[-3] * input_shape[-2] * input_shape[-1];
+  int cnt = 0;
+  T temp_max = std::numeric_limits<T>::lowest();
+  for (auto itr = input->begin<T>(); itr != input->end<T>(); itr++) {
+    // do clamp
+    *itr = *itr < amin ? log10(amin) * multiplier : log10(*itr) * multiplier;
+    *itr -= multiplier * db_multiplier;
+    // calculate max by axis
+    cnt++;
+    if ((*itr) > temp_max) temp_max = *itr;
+    if (cnt % step == 0) {
+      max_val.push_back(temp_max);
+      temp_max = std::numeric_limits<T>::lowest();
+    }
+  }
+
+  if (!std::isnan(top_db)) {
+    int ind = 0;
+    for (auto itr = input->begin<T>(); itr != input->end<T>(); itr++, ind++) {
+      float lower_bound = max_val[ind / step] - top_db;
+      *itr = std::max((*itr), static_cast<T>(lower_bound));
+    }
+  }
+  RETURN_IF_NOT_OK(input->Reshape(input_shape));
+  *output = input;
+  return Status::OK();
+}
+
+/// \brief Calculate the angles of the complex numbers.
+/// \param input/output: Tensor of shape <..., time>.
 template <typename T>
 Status Angle(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
   TensorShape shape = input->shape();
@@ -68,14 +105,14 @@ Status Angle(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *outp
 }
 
 /// \brief Perform a biquad filter of input tensor.
-/// \param input/output: Tensor of shape <...,time>
-/// \param a0: denominator coefficient of current output y[n], typically 1
-/// \param a1: denominator coefficient of current output y[n-1]
-/// \param a2: denominator coefficient of current output y[n-2]
-/// \param b0: numerator coefficient of current input, x[n]
-/// \param b1: numerator coefficient of input one time step ago x[n-1]
-/// \param b2: numerator coefficient of input two time steps ago x[n-2]
-/// \return Status code
+/// \param input/output: Tensor of shape <..., time>.
+/// \param a0: denominator coefficient of current output y[n], typically 1.
+/// \param a1: denominator coefficient of current output y[n-1].
+/// \param a2: denominator coefficient of current output y[n-2].
+/// \param b0: numerator coefficient of current input, x[n].
+/// \param b1: numerator coefficient of input one time step ago x[n-1].
+/// \param b2: numerator coefficient of input two time steps ago x[n-2].
+/// \return Status code.
 template <typename T>
 Status Biquad(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, T b0, T b1, T b2, T a0, T a1,
               T a2) {
@@ -91,10 +128,10 @@ Status Biquad(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *out
 }
 
 /// \brief Perform an IIR filter by evaluating difference equation.
-/// \param input/output: Tensor of shape <...,time>
+/// \param input/output: Tensor of shape <..., time>
 /// \param a_coeffs: denominator coefficients of difference equation of dimension of (n_order + 1).
 /// \param b_coeffs: numerator coefficients of difference equation of dimension of (n_order + 1).
-/// \param clamp: If True, clamp the output signal to be in the range [-1, 1] (Default: True)
+/// \param clamp: If True, clamp the output signal to be in the range [-1, 1] (Default: True).
 /// \return Status code
 template <typename T>
 Status LFilter(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, std::vector<T> a_coeffs,
@@ -112,7 +149,7 @@ Status LFilter(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *ou
   size_t channel_idx = 1;
   size_t m_num_order = b_coeffs.size() - 1;
   size_t m_den_order = a_coeffs.size() - 1;
-  // init  A_coeffs and B_coeffs by div(a0)
+  // init A_coeffs and B_coeffs by div(a0)
   for (size_t i = 1; i < a_coeffs.size(); i++) {
     a_coeffs[i] /= a_coeffs[0];
   }
@@ -172,20 +209,50 @@ Status LFilter(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *ou
   // unpack batch
   Tensor::CreateFromVector(out_vect, input_shape, &out);
   *output = out;
-  delete m_px;
-  delete m_py;
+  delete[] m_px;
+  delete[] m_py;
   return Status::OK();
 }
 
 /// \brief Stretch STFT in time at a given rate, without changing the pitch.
-/// \param[in] input - Tensor of shape <...,freq,time>.
+/// \param[in] input - Tensor of shape <..., freq, time>.
 /// \param[in] rate - Stretch factor.
 /// \param[in] phase_advance - Expected phase advance in each bin.
 /// \param[out] output - Tensor after stretch in time domain.
-/// \return Status return code
+/// \return Status return code.
 Status TimeStretch(std::shared_ptr<Tensor> input, std::shared_ptr<Tensor> *output, float rate, float hop_length,
                    float n_freq);
 
+/// \brief Apply a mask along axis.
+/// \param input: Tensor of shape <..., freq, time>.
+/// \param output: Tensor of shape <..., freq, time>.
+/// \param mask_param: Number of columns to be masked will be uniformly sampled from [0, mask_param].
+/// \param mask_value: Value to assign to the masked columns.
+/// \param axis: Axis to apply masking on (1 -> frequency, 2 -> time).
+/// \param rnd: Number generator.
+/// \return Status code.
+Status RandomMaskAlongAxis(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int64_t mask_param,
+                           double mask_value, int axis, std::mt19937 rnd);
+
+/// \brief Apply a mask along axis. All examples will have the same mask interval.
+/// \param input: Tensor of shape <..., freq, time>.
+/// \param output: Tensor of shape <..., freq, time>.
+/// \param mask_width: The width of the mask.
+/// \param mask_start: Starting position of the mask.
+///     Mask will be applied from indices [mask_start, mask_start + mask_width).
+/// \param mask_value: Value to assign to the masked columns.
+/// \param axis: Axis to apply masking on (1 -> frequency, 2 -> time).
+/// \return Status code.
+Status MaskAlongAxis(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int64_t mask_width,
+                     int64_t mask_start, double mask_value, int axis);
+
+/// \brief Compute the norm of complex tensor input.
+/// \param power Power of the norm description (optional).
+/// \param input Tensor shape of <..., complex=2>.
+/// \param output Tensor shape of <..., complex=2>.
+/// \return Status code.
+Status ComplexNorm(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float power);
+
 }  // namespace dataset
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_AUDIO_UTILS_H_
diff --git a/mindspore/ccsrc/minddata/dataset/audio/kernels/band_biquad_op.cc b/mindspore/ccsrc/minddata/dataset/audio/kernels/band_biquad_op.cc
index a6ea14f550f..9ea9a62e3d3 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/kernels/band_biquad_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/audio/kernels/band_biquad_op.cc
@@ -25,12 +25,12 @@ Status BandBiquadOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_p
   IO_CHECK(input, output);
   TensorShape input_shape = input->shape();
   // check input tensor dimension, it should be greater than 0.
-  CHECK_FAIL_RETURN_UNEXPECTED(input_shape.Size() > 0, "BandBiquad: input dimension should be greater than 0.");
+  CHECK_FAIL_RETURN_UNEXPECTED(input_shape.Size() > 0, "BandBiquad: input tensor is not in shape of <..., time>.");
   // check input type, it should be DE_FLOAT32 or DE_FLOAT16 or DE_FLOAT64
   CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType(DataType::DE_FLOAT32) ||
                                  input->type() == DataType(DataType::DE_FLOAT16) ||
                                  input->type() == DataType(DataType::DE_FLOAT64),
-                               "BandBiquad: input type should be float, but got " + input->type().ToString());
+                               "BandBiquad: input tensor type should be float, but got: " + input->type().ToString());
   double w0 = 2 * PI * central_freq_ / sample_rate_;
   double bw_Hz = central_freq_ / Q_;
   double a0 = 1.;
@@ -45,15 +45,16 @@ Status BandBiquadOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_p
   }
   double b1 = 0.;
   double b2 = 0.;
-  if (input->type() == DataType(DataType::DE_FLOAT32))
+  if (input->type() == DataType(DataType::DE_FLOAT32)) {
     return Biquad(input, output, static_cast<float>(b0), static_cast<float>(b1), static_cast<float>(b2),
                   static_cast<float>(a0), static_cast<float>(a1), static_cast<float>(a2));
-  else if (input->type() == DataType(DataType::DE_FLOAT64))
+  } else if (input->type() == DataType(DataType::DE_FLOAT64)) {
     return Biquad(input, output, static_cast<double>(b0), static_cast<double>(b1), static_cast<double>(b2),
                   static_cast<double>(a0), static_cast<double>(a1), static_cast<double>(a2));
-  else
+  } else {
     return Biquad(input, output, static_cast<float16>(b0), static_cast<float16>(b1), static_cast<float16>(b2),
                   static_cast<float16>(a0), static_cast<float16>(a1), static_cast<float16>(a2));
+  }
 }
 
 }  // namespace dataset
diff --git a/mindspore/ccsrc/minddata/dataset/audio/kernels/band_biquad_op.h b/mindspore/ccsrc/minddata/dataset/audio/kernels/band_biquad_op.h
index 7ddb3ca6832..c92bda5fdd6 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/kernels/band_biquad_op.h
+++ b/mindspore/ccsrc/minddata/dataset/audio/kernels/band_biquad_op.h
@@ -17,8 +17,8 @@
 #define MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_BAND_BIQUAD_OP_H_
 
 #include <memory>
-#include <vector>
 #include <string>
+#include <vector>
 
 #include "minddata/dataset/core/tensor.h"
 #include "minddata/dataset/kernels/tensor_op.h"
@@ -26,6 +26,7 @@
 
 namespace mindspore {
 namespace dataset {
+
 class BandBiquadOp : public TensorOp {
  public:
   BandBiquadOp(int32_t sample_rate, float central_freq, float Q, bool noise)
diff --git a/mindspore/ccsrc/minddata/dataset/audio/kernels/bandpass_biquad_op.cc b/mindspore/ccsrc/minddata/dataset/audio/kernels/bandpass_biquad_op.cc
index 475485f0e1a..ab0fa546f3a 100755
--- a/mindspore/ccsrc/minddata/dataset/audio/kernels/bandpass_biquad_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/audio/kernels/bandpass_biquad_op.cc
@@ -24,12 +24,12 @@ namespace dataset {
 Status BandpassBiquadOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
   IO_CHECK(input, output);
   TensorShape input_shape = input->shape();
-  CHECK_FAIL_RETURN_UNEXPECTED(input_shape.Size() > 0, "BandpassBiquad: inpute dimension should be greater than 0.");
+  CHECK_FAIL_RETURN_UNEXPECTED(input_shape.Size() > 0, "BandpassBiquad: input tensor is not in shape of <..., time>.");
   // check input type, it should be DE_FLOAT32 or DE_FLOAT16 or DE_FLOAT64
-  CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType(DataType::DE_FLOAT32) ||
-                                 input->type() == DataType(DataType::DE_FLOAT16) ||
-                                 input->type() == DataType(DataType::DE_FLOAT64),
-                               "BandpassBiquad: input type should be float, but got " + input->type().ToString());
+  CHECK_FAIL_RETURN_UNEXPECTED(
+    input->type() == DataType(DataType::DE_FLOAT32) || input->type() == DataType(DataType::DE_FLOAT16) ||
+      input->type() == DataType(DataType::DE_FLOAT64),
+    "BandpassBiquad: input tensor type should be float, but got: " + input->type().ToString());
   float w0 = 2 * PI * central_freq_ / sample_rate_;
   float alpha = sin(w0) / 2 / Q_;
   float temp;
@@ -46,15 +46,16 @@ Status BandpassBiquadOp::Compute(const std::shared_ptr<Tensor> &input, std::shar
   float a1 = (-2) * cos(w0);
   float a2 = 1 - alpha;
 
-  if (input->type() == DataType(DataType::DE_FLOAT32))
+  if (input->type() == DataType(DataType::DE_FLOAT32)) {
     return Biquad(input, output, static_cast<float>(b0), static_cast<float>(b1), static_cast<float>(b2),
                   static_cast<float>(a0), static_cast<float>(a1), static_cast<float>(a2));
-  else if (input->type() == DataType(DataType::DE_FLOAT64))
+  } else if (input->type() == DataType(DataType::DE_FLOAT64)) {
     return Biquad(input, output, static_cast<double>(b0), static_cast<double>(b1), static_cast<double>(b2),
                   static_cast<double>(a0), static_cast<double>(a1), static_cast<double>(a2));
-  else
+  } else {
     return Biquad(input, output, static_cast<float16>(b0), static_cast<float16>(b1), static_cast<float16>(b2),
                   static_cast<float16>(a0), static_cast<float16>(a1), static_cast<float16>(a2));
+  }
 }
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/audio/kernels/bandpass_biquad_op.h b/mindspore/ccsrc/minddata/dataset/audio/kernels/bandpass_biquad_op.h
index 0fb21441425..dead035fbc4 100755
--- a/mindspore/ccsrc/minddata/dataset/audio/kernels/bandpass_biquad_op.h
+++ b/mindspore/ccsrc/minddata/dataset/audio/kernels/bandpass_biquad_op.h
@@ -17,8 +17,8 @@
 #define MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_BANDPASS_BIQUAD_OP_H_
 
 #include <memory>
-#include <vector>
 #include <string>
+#include <vector>
 
 #include "minddata/dataset/core/tensor.h"
 #include "minddata/dataset/kernels/tensor_op.h"
@@ -26,6 +26,7 @@
 
 namespace mindspore {
 namespace dataset {
+
 class BandpassBiquadOp : public TensorOp {
  public:
   BandpassBiquadOp(int32_t sample_rate, float central_freq, float Q, bool const_skirt_gain)
diff --git a/mindspore/ccsrc/minddata/dataset/audio/kernels/bandreject_biquad_op.cc b/mindspore/ccsrc/minddata/dataset/audio/kernels/bandreject_biquad_op.cc
index d321cbf6d52..0e9244af2b1 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/kernels/bandreject_biquad_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/audio/kernels/bandreject_biquad_op.cc
@@ -20,15 +20,17 @@
 
 namespace mindspore {
 namespace dataset {
+
 Status BandrejectBiquadOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
   IO_CHECK(input, output);
   // check input type and input shape
   TensorShape input_shape = input->shape();
-  CHECK_FAIL_RETURN_UNEXPECTED(input_shape.Size() > 0, "BandrejectBiquad: input dimension should be greater than 0.");
-  CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType(DataType::DE_FLOAT32) ||
-                                 input->type() == DataType(DataType::DE_FLOAT16) ||
-                                 input->type() == DataType(DataType::DE_FLOAT64),
-                               "BandrejectBiquad: input type should be float, but got " + input->type().ToString());
+  CHECK_FAIL_RETURN_UNEXPECTED(input_shape.Size() > 0,
+                               "BandrejectBiquad: input tensor is not in shape of <..., time>.");
+  CHECK_FAIL_RETURN_UNEXPECTED(
+    input->type() == DataType(DataType::DE_FLOAT32) || input->type() == DataType(DataType::DE_FLOAT16) ||
+      input->type() == DataType(DataType::DE_FLOAT64),
+    "BandrejectBiquad: input tensor type should be float, but got: " + input->type().ToString());
   double w0 = 2 * PI * central_freq_ / sample_rate_;
   double alpha = sin(w0) / 2 / Q_;
   double b0 = 1;
@@ -37,15 +39,16 @@ Status BandrejectBiquadOp::Compute(const std::shared_ptr<Tensor> &input, std::sh
   double a0 = 1 + alpha;
   double a1 = b1;
   double a2 = 1 - alpha;
-  if (input->type() == DataType(DataType::DE_FLOAT32))
+  if (input->type() == DataType(DataType::DE_FLOAT32)) {
     return Biquad(input, output, static_cast<float>(b0), static_cast<float>(b1), static_cast<float>(b2),
                   static_cast<float>(a0), static_cast<float>(a1), static_cast<float>(a2));
-  else if (input->type() == DataType(DataType::DE_FLOAT64))
+  } else if (input->type() == DataType(DataType::DE_FLOAT64)) {
     return Biquad(input, output, static_cast<double>(b0), static_cast<double>(b1), static_cast<double>(b2),
                   static_cast<double>(a0), static_cast<double>(a1), static_cast<double>(a2));
-  else
+  } else {
     return Biquad(input, output, static_cast<float16>(b0), static_cast<float16>(b1), static_cast<float16>(b2),
                   static_cast<float16>(a0), static_cast<float16>(a1), static_cast<float16>(a2));
+  }
 }
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/audio/kernels/bandreject_biquad_op.h b/mindspore/ccsrc/minddata/dataset/audio/kernels/bandreject_biquad_op.h
index 3b42a6ccb82..e59d0cf3220 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/kernels/bandreject_biquad_op.h
+++ b/mindspore/ccsrc/minddata/dataset/audio/kernels/bandreject_biquad_op.h
@@ -26,6 +26,7 @@
 
 namespace mindspore {
 namespace dataset {
+
 class BandrejectBiquadOp : public TensorOp {
  public:
   BandrejectBiquadOp(int32_t sample_rate, float central_freq, float Q)
diff --git a/mindspore/ccsrc/minddata/dataset/audio/kernels/bass_biquad_op.cc b/mindspore/ccsrc/minddata/dataset/audio/kernels/bass_biquad_op.cc
index 71799b17852..d05a7ff2471 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/kernels/bass_biquad_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/audio/kernels/bass_biquad_op.cc
@@ -24,12 +24,12 @@ namespace dataset {
 Status BassBiquadOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
   IO_CHECK(input, output);
   TensorShape input_shape = input->shape();
-  CHECK_FAIL_RETURN_UNEXPECTED(input_shape.Size() > 0, "BassBiquad: input dimension should be greater than 0.");
+  CHECK_FAIL_RETURN_UNEXPECTED(input_shape.Size() > 0, "BassBiquad: input tensor is not in shape of <..., time>.");
   // check input type, it should be DE_FLOAT32 or DE_FLOAT16 or DE_FLOAT64
   CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType(DataType::DE_FLOAT32) ||
                                  input->type() == DataType(DataType::DE_FLOAT16) ||
                                  input->type() == DataType(DataType::DE_FLOAT64),
-                               "BassBiquad: input type should be float, but got " + input->type().ToString());
+                               "BassBiquad: input tensor type should be float, but got: " + input->type().ToString());
 
   double w0 = 2 * PI * central_freq_ / sample_rate_;
   double alpha = sin(w0) / 2 / Q_;
@@ -45,17 +45,18 @@ Status BassBiquadOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_p
   double a0 = (A + 1) + temp2 + temp1;
   double a1 = -2 * ((A - 1) + temp3);
   double a2 = (A + 1) + temp2 - temp1;
-  if (input->type() == DataType(DataType::DE_FLOAT32))
+  if (input->type() == DataType(DataType::DE_FLOAT32)) {
     return Biquad(input, output, static_cast<float>(b0 / a0), static_cast<float>(b1 / a0), static_cast<float>(b2 / a0),
                   static_cast<float>(1.0), static_cast<float>(a1 / a0), static_cast<float>(a2 / a0));
-  else if (input->type() == DataType(DataType::DE_FLOAT64))
+  } else if (input->type() == DataType(DataType::DE_FLOAT64)) {
     return Biquad(input, output, static_cast<double>(b0 / a0), static_cast<double>(b1 / a0),
                   static_cast<double>(b2 / a0), static_cast<double>(1.0), static_cast<double>(a1 / a0),
                   static_cast<double>(a2 / a0));
-  else
+  } else {
     return Biquad(input, output, static_cast<float16>(b0 / a0), static_cast<float16>(b1 / a0),
                   static_cast<float16>(b2 / a0), static_cast<float16>(1.0), static_cast<float16>(a1 / a0),
                   static_cast<float16>(a2 / a0));
+  }
 }
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/audio/kernels/bass_biquad_op.h b/mindspore/ccsrc/minddata/dataset/audio/kernels/bass_biquad_op.h
index 2aa31f2428c..68552c1bb80 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/kernels/bass_biquad_op.h
+++ b/mindspore/ccsrc/minddata/dataset/audio/kernels/bass_biquad_op.h
@@ -27,6 +27,7 @@
 
 namespace mindspore {
 namespace dataset {
+
 class BassBiquadOp : public TensorOp {
  public:
   BassBiquadOp(int32_t sample_rate, float gain, float central_freq, float Q)
@@ -35,7 +36,7 @@ class BassBiquadOp : public TensorOp {
   ~BassBiquadOp() override = default;
 
   void Print(std::ostream &out) const override {
-    out << Name() << ": sample_rate: " << sample_rate_ << ", gain:" << gain_ << ", central_freq: " << central_freq_
+    out << Name() << ": sample_rate: " << sample_rate_ << ", gain: " << gain_ << ", central_freq: " << central_freq_
         << ", Q: " << Q_ << std::endl;
   }
 
diff --git a/mindspore/ccsrc/minddata/dataset/audio/kernels/time_stretch_op.cc b/mindspore/ccsrc/minddata/dataset/audio/kernels/time_stretch_op.cc
index 0f990348ff7..05a14891b00 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/kernels/time_stretch_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/audio/kernels/time_stretch_op.cc
@@ -33,15 +33,8 @@ Status TimeStretchOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_
   IO_CHECK(input, output);
 
   // check shape
-  if (input->shape().Rank() < 3) {
-    std::string err_msg = "TimeStretch: input tensor shape is not <..., freq, num_frame, complex=2>.";
-    MS_LOG(ERROR) << err_msg;
-    RETURN_STATUS_SYNTAX_ERROR(err_msg);
-  }
-
-  // check complex
-  if (!input->IsComplex()) {
-    std::string err_msg = "TimeStretch: input tensor is not in shape of <..., 2>.";
+  if (input->shape().Rank() < 3 || !input->IsComplex()) {
+    std::string err_msg = "TimeStretch: input tensor is not in shape of <..., freq, num_frame, complex=2>.";
     MS_LOG(ERROR) << err_msg;
     RETURN_STATUS_SYNTAX_ERROR(err_msg);
   }
@@ -51,7 +44,7 @@ Status TimeStretchOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_
   float hop_length = std::isnan(hop_length_) ? (n_freq_ - 1) : hop_length_;
   // typecast
   CHECK_FAIL_RETURN_UNEXPECTED(input->type() != DataType::DE_STRING,
-                               "TimeStretch: input tensor type should be [int, float, double], but got string.");
+                               "TimeStretch: input tensor type should be int, float or double, but got: string.");
   if (input->type() != DataType::DE_FLOAT64) {
     RETURN_IF_NOT_OK(TypeCast(input, &input_tensor, DataType(DataType::DE_FLOAT32)));
   } else {
diff --git a/mindspore/ccsrc/minddata/dataset/audio/kernels/time_stretch_op.h b/mindspore/ccsrc/minddata/dataset/audio/kernels/time_stretch_op.h
index d1a119bf8c0..5a7b1dd9d59 100644
--- a/mindspore/ccsrc/minddata/dataset/audio/kernels/time_stretch_op.h
+++ b/mindspore/ccsrc/minddata/dataset/audio/kernels/time_stretch_op.h
@@ -27,6 +27,7 @@
 
 namespace mindspore {
 namespace dataset {
+
 class TimeStretchOp : public TensorOp {
  public:
   /// Default value
@@ -43,9 +44,6 @@ class TimeStretchOp : public TensorOp {
 
   std::string Name() const override { return kTimeStretchOp; }
 
-  /// \param[in] inputs
-  /// \param[out] outputs
-  /// \return  Status code
   Status OutputShape(const std::vector<TensorShape> &inputs, std::vector<TensorShape> &outputs) override;
 
  private:
diff --git a/mindspore/ccsrc/minddata/dataset/core/config_manager.cc b/mindspore/ccsrc/minddata/dataset/core/config_manager.cc
index 5154812253b..71d285bdaca 100644
--- a/mindspore/ccsrc/minddata/dataset/core/config_manager.cc
+++ b/mindspore/ccsrc/minddata/dataset/core/config_manager.cc
@@ -107,6 +107,7 @@ Status ConfigManager::LoadFile(const std::string &settingsFile) {
     nlohmann::json js;
     in >> js;
     rc = FromJson(js);
+    in.close();
   } catch (const nlohmann::json::type_error &e) {
     std::ostringstream ss;
     ss << "Client file failed to load:\n" << e.what();
diff --git a/mindspore/ccsrc/minddata/dataset/core/cv_tensor.cc b/mindspore/ccsrc/minddata/dataset/core/cv_tensor.cc
index 052a585eb1d..48980fb929a 100644
--- a/mindspore/ccsrc/minddata/dataset/core/cv_tensor.cc
+++ b/mindspore/ccsrc/minddata/dataset/core/cv_tensor.cc
@@ -29,8 +29,10 @@ CVTensor::CVTensor(std::shared_ptr<Tensor> tensor) : Tensor(std::move(*tensor))
 }
 
 Status CVTensor::CreateEmpty(const TensorShape &shape, DataType type, CVTensorPtr *out) {
+  RETURN_UNEXPECTED_IF_NULL(out);
   const CVTensorAlloc *alloc = GlobalContext::Instance()->cv_tensor_allocator();
   *out = std::allocate_shared<CVTensor>(*alloc, shape, type);
+  RETURN_UNEXPECTED_IF_NULL(out);
   int64_t byte_size = (*out)->SizeInBytes();
   // Don't allocate if we have a tensor with no elements.
   if (byte_size != 0) {
@@ -41,6 +43,7 @@ Status CVTensor::CreateEmpty(const TensorShape &shape, DataType type, CVTensorPt
 }
 
 Status CVTensor::CreateFromMat(const cv::Mat &mat, const dsize_t rank, CVTensorPtr *out) {
+  RETURN_UNEXPECTED_IF_NULL(out);
   TensorPtr out_tensor;
   cv::Mat mat_local = mat;
   // if the input Mat's memory is not continuous, copy it to one block of memory
@@ -78,6 +81,9 @@ std::pair<std::array<int, 2>, int> CVTensor::IsValidImage(const TensorShape &sha
 }
 
 std::shared_ptr<CVTensor> CVTensor::AsCVTensor(std::shared_ptr<Tensor> t) {
+  if (t == nullptr) {
+    return nullptr;
+  }
   std::shared_ptr<CVTensor> cv_t = std::dynamic_pointer_cast<CVTensor>(t);
   if (cv_t != nullptr) {
     return cv_t;
@@ -88,13 +94,13 @@ std::shared_ptr<CVTensor> CVTensor::AsCVTensor(std::shared_ptr<Tensor> t) {
 }
 
 Status CVTensor::MatInit(uchar *data, const TensorShape &shape, const DataType &type, cv::Mat *mat) {
-  std::pair<std::array<int, 2>, int> cv_shape_type = IsValidImage(shape, type);
+  RETURN_UNEXPECTED_IF_NULL(data);
+  RETURN_UNEXPECTED_IF_NULL(mat);
+  const int kShapeAsDefault = 2;
+  std::pair<std::array<int, kShapeAsDefault>, int> cv_shape_type = IsValidImage(shape, type);
   if (cv_shape_type.second == -1) {
     std::vector<dsize_t> sizes = shape.AsVector();
     std::vector<int> sizes32(sizes.begin(), sizes.end());  // convert long to int for usage with OpenCV
-    if (static_cast<int>(shape.Rank()) != shape.Rank()) {
-      RETURN_STATUS_UNEXPECTED("Error in creating CV mat. Wrong shape.");
-    }
 
     uint8_t cv_type = type.AsCVType();
     if (cv_type == kCVInvalidType) {
@@ -102,7 +108,7 @@ Status CVTensor::MatInit(uchar *data, const TensorShape &shape, const DataType &
     }
     *mat = cv::Mat(static_cast<int>(shape.Rank()), &sizes32[0], cv_type, data);
   } else {
-    *mat = cv::Mat(2, &(cv_shape_type.first[0]), cv_shape_type.second, data);
+    *mat = cv::Mat(kShapeAsDefault, &(cv_shape_type.first[0]), cv_shape_type.second, data);
   }
   return Status::OK();
 }
@@ -121,10 +127,14 @@ Status CVTensor::ExpandDim(const dsize_t &axis) {
 
 void CVTensor::Squeeze() {
   Tensor::Squeeze();
-  (void)this->MatInit(GetMutableBuffer(), shape_, type_, &mat_);
+  Status rc = this->MatInit(GetMutableBuffer(), shape_, type_, &mat_);
+  if (rc.IsError()) {
+    MS_LOG(ERROR) << "Squeeze failed, error details is " << rc;
+  }
 }
 
 Status CVTensor::MatAtIndex(const std::vector<dsize_t> &index, cv::Mat *mat) {
+  RETURN_UNEXPECTED_IF_NULL(mat);
   uchar *start = nullptr;
   TensorShape remaining({-1});
   RETURN_IF_NOT_OK(this->StartAddrOfIndex(index, &start, &remaining));
diff --git a/mindspore/ccsrc/minddata/dataset/core/data_type.h b/mindspore/ccsrc/minddata/dataset/core/data_type.h
index c5621df60dd..1ac5443d15d 100644
--- a/mindspore/ccsrc/minddata/dataset/core/data_type.h
+++ b/mindspore/ccsrc/minddata/dataset/core/data_type.h
@@ -143,15 +143,15 @@ class DataType {
   constexpr bool operator!=(const Type a) const { return type_ != a; }
 
   // Disable this usage `if(d)` where d is of type DataType
-  /// \return
+  /// \return return nothing since we deiable this function.
   operator bool() = delete;
 
   // To be used in Switch/case
-  /// \return
+  /// \return data type internal.
   operator Type() const { return type_; }
 
   // The number of bytes needed to store one value of this type
-  /// \return
+  /// \return the number of bytes of the type.
   uint8_t SizeInBytes() const;
 
 #ifndef ENABLE_ANDROID
diff --git a/mindspore/ccsrc/minddata/dataset/core/de_tensor.cc b/mindspore/ccsrc/minddata/dataset/core/de_tensor.cc
index 957e4c763cd..73fba5c6cb7 100644
--- a/mindspore/ccsrc/minddata/dataset/core/de_tensor.cc
+++ b/mindspore/ccsrc/minddata/dataset/core/de_tensor.cc
@@ -41,15 +41,17 @@ DETensor::DETensor(std::shared_ptr<dataset::DeviceTensor> device_tensor_impl, bo
     : device_tensor_impl_(device_tensor_impl), name_("MindDataDeviceTensor"), is_device_(is_device) {
   // The sequence of shape_ is (width, widthStride, height, heightStride) in Dvpp module
   // We need to add [1]widthStride and [3]heightStride, which are actual YUV image shape, into shape_ attribute
-  uint8_t flag = 0;
-  for (auto &i : device_tensor_impl->GetYuvStrideShape()) {
-    if (flag % 2 == 1) {
-      int64_t j = static_cast<int64_t>(i);
-      shape_.emplace_back(j);
+  if (device_tensor_impl && device_tensor_impl->GetYuvStrideShape().size() > 0) {
+    uint8_t flag = 0;
+    for (auto &i : device_tensor_impl->GetYuvStrideShape()) {
+      if (flag % 2 == 1) {
+        int64_t j = static_cast<int64_t>(i);
+        shape_.emplace_back(j);
+      }
+      ++flag;
     }
-    ++flag;
+    std::reverse(shape_.begin(), shape_.end());
   }
-  std::reverse(shape_.begin(), shape_.end());
   MS_LOG(INFO) << "This is a YUV420 format image, one pixel takes 1.5 bytes. Therefore, the shape of"
                << " image is in (H, W) format. You can search for more information about YUV420 format";
 }
diff --git a/mindspore/ccsrc/minddata/dataset/core/device_tensor.cc b/mindspore/ccsrc/minddata/dataset/core/device_tensor.cc
index b12177b8694..80cfc532a24 100644
--- a/mindspore/ccsrc/minddata/dataset/core/device_tensor.cc
+++ b/mindspore/ccsrc/minddata/dataset/core/device_tensor.cc
@@ -23,7 +23,10 @@
 
 namespace mindspore {
 namespace dataset {
-DeviceTensor::DeviceTensor(const TensorShape &shape, const DataType &type) : Tensor(shape, type) {
+const int kYuvDefaultChannels = 4;
+
+DeviceTensor::DeviceTensor(const TensorShape &shape, const DataType &type)
+    : Tensor(shape, type), device_data_(nullptr), size_(0) {
   // grab the mem pool from global context and create the allocator for char data area
   std::shared_ptr<MemoryPool> global_pool = GlobalContext::Instance()->mem_pool();
   data_allocator_ = std::make_unique<Allocator<unsigned char>>(global_pool);
@@ -34,6 +37,7 @@ DeviceTensor::DeviceTensor(const TensorShape &shape, const DataType &type) : Ten
 Status DeviceTensor::CreateEmpty(const TensorShape &shape, const DataType &type, std::shared_ptr<DeviceTensor> *out) {
   CHECK_FAIL_RETURN_UNEXPECTED(shape.known(), "Invalid shape.");
   CHECK_FAIL_RETURN_UNEXPECTED(type != DataType::DE_UNKNOWN, "Invalid data type.");
+  CHECK_FAIL_RETURN_UNEXPECTED(out != nullptr, "Invalid nullptr pointer.");
   const DeviceTensorAlloc *alloc = GlobalContext::Instance()->device_tensor_allocator();
   *out = std::allocate_shared<DeviceTensor>(*alloc, shape, type);
   // if it's a string tensor and it has no elements, Just initialize the shape and type.
@@ -42,6 +46,7 @@ Status DeviceTensor::CreateEmpty(const TensorShape &shape, const DataType &type,
   }
 
   CHECK_FAIL_RETURN_UNEXPECTED(type.IsNumeric(), "Number of elements is not 0. The type should be numeric.");
+  CHECK_FAIL_RETURN_UNEXPECTED(out != nullptr, "Allocate memory faiiled.");
 
   int64_t bytes = (*out)->SizeInBytes();
   // Don't allocate if we have a tensor with no elements.
@@ -58,9 +63,11 @@ Status DeviceTensor::CreateFromDeviceMemory(const TensorShape &shape, const Data
   CHECK_FAIL_RETURN_UNEXPECTED(type != DataType::DE_UNKNOWN, "Invalid data type.");
   CHECK_FAIL_RETURN_UNEXPECTED(data_ptr != nullptr, "Data pointer is NULL");
   CHECK_FAIL_RETURN_UNEXPECTED(dataSize > 0, "Invalid data size");
+  CHECK_FAIL_RETURN_UNEXPECTED(out != nullptr, "Out pointer is NULL");
 
   const DeviceTensorAlloc *alloc = GlobalContext::Instance()->device_tensor_allocator();
   *out = std::allocate_shared<DeviceTensor>(*alloc, shape, type);
+  CHECK_FAIL_RETURN_UNEXPECTED(out != nullptr, "Allocate memory failed.");
 
   // if it's a string tensor and it has no elements, Just initialize the shape and type.
   if (!type.IsNumeric() && shape.NumOfElements() == 0) {
@@ -76,6 +83,8 @@ Status DeviceTensor::CreateFromDeviceMemory(const TensorShape &shape, const Data
     RETURN_IF_NOT_OK((*out)->AllocateBuffer(byte_size));
   }
 
+  CHECK_FAIL_RETURN_UNEXPECTED(attributes.size() >= kYuvDefaultChannels,
+                               "Invalid attributes size, should be greater than 4.");
   CHECK_FAIL_RETURN_UNEXPECTED(
     (*out)->SetAttributes(data_ptr, dataSize, attributes[0], attributes[1], attributes[2], attributes[3]),
     "Fail to set attributes for DeviceTensor");
@@ -129,6 +138,7 @@ Status DeviceTensor::SetSize_(const uint32_t &new_size) {
 
 #ifdef ENABLE_ACL
 Status DeviceTensor::DataPop_(std::shared_ptr<Tensor> *host_tensor) {
+  CHECK_FAIL_RETURN_UNEXPECTED(host_tensor != nullptr, "host tensor pointer is NULL.");
   void *resHostBuf = nullptr;
   APP_ERROR ret = aclrtMallocHost(&resHostBuf, this->DeviceDataSize());
   if (ret != APP_ERR_OK) {
@@ -151,13 +161,18 @@ Status DeviceTensor::DataPop_(std::shared_ptr<Tensor> *host_tensor) {
 
   mindspore::dataset::dsize_t dvppDataSize = this->DeviceDataSize();
   const mindspore::dataset::TensorShape dvpp_shape({dvppDataSize, 1, 1});
+
+  CHECK_FAIL_RETURN_UNEXPECTED(this->GetYuvStrideShape().size() >= kYuvDefaultChannels,
+                               "Invalid YuvShape, should greater than 4");
+
   uint32_t _output_width_ = this->GetYuvStrideShape()[0];
   uint32_t _output_widthStride_ = this->GetYuvStrideShape()[1];
   uint32_t _output_height_ = this->GetYuvStrideShape()[2];
   uint32_t _output_heightStride_ = this->GetYuvStrideShape()[3];
   const mindspore::dataset::DataType dvpp_data_type(mindspore::dataset::DataType::DE_UINT8);
 
-  mindspore::dataset::Tensor::CreateFromMemory(dvpp_shape, dvpp_data_type, ret_ptr, host_tensor);
+  RETURN_IF_NOT_OK(mindspore::dataset::Tensor::CreateFromMemory(dvpp_shape, dvpp_data_type, ret_ptr, host_tensor));
+  CHECK_FAIL_RETURN_UNEXPECTED(host_tensor != nullptr, "Allocate memory failed.");
 
   (*host_tensor)->SetYuvShape(_output_width_, _output_widthStride_, _output_height_, _output_heightStride_);
   if (!(*host_tensor)->HasData()) {
diff --git a/mindspore/ccsrc/minddata/dataset/core/pybind_support.h b/mindspore/ccsrc/minddata/dataset/core/pybind_support.h
index 7a553b9fef8..85e1c6d5f41 100644
--- a/mindspore/ccsrc/minddata/dataset/core/pybind_support.h
+++ b/mindspore/ccsrc/minddata/dataset/core/pybind_support.h
@@ -39,7 +39,9 @@ struct npy_scalar_caster {
   bool load(handle src, bool convert) {
     // Taken from Eigen casters. Permits either scalar dtype or scalar array.
     handle type = dtype::of<T>().attr("type");  // Could make more efficient.
-    if (!convert && !isinstance<Array>(src) && !isinstance(src, type)) return false;
+    if (!convert && !isinstance<Array>(src) && !isinstance(src, type)) {
+      return false;
+    }
 
     Array tmp = Array::ensure(src);
     if (tmp && tmp.size() == 1 && tmp.ndim() == 0) {
diff --git a/mindspore/ccsrc/minddata/dataset/core/tensor.cc b/mindspore/ccsrc/minddata/dataset/core/tensor.cc
index 315ce87ed84..95c96864d46 100644
--- a/mindspore/ccsrc/minddata/dataset/core/tensor.cc
+++ b/mindspore/ccsrc/minddata/dataset/core/tensor.cc
@@ -91,8 +91,10 @@ Tensor &Tensor::operator=(Tensor &&other) noexcept {
 Status Tensor::CreateEmpty(const TensorShape &shape, const DataType &type, TensorPtr *out) {
   CHECK_FAIL_RETURN_UNEXPECTED(shape.known(), "Invalid shape.");
   CHECK_FAIL_RETURN_UNEXPECTED(type != DataType::DE_UNKNOWN, "Invalid data type.");
+  RETURN_UNEXPECTED_IF_NULL(out);
   const TensorAlloc *alloc = GlobalContext::Instance()->tensor_allocator();
   *out = std::allocate_shared<Tensor>(*alloc, shape, type);
+  CHECK_FAIL_RETURN_UNEXPECTED(out != nullptr, "Allocate memory failed.");
   // if it's a string tensor and it has no elements, Just initialize the shape and type.
   if (!type.IsNumeric() && shape.NumOfElements() == 0) {
     return Status::OK();
@@ -110,7 +112,7 @@ Status Tensor::CreateEmpty(const TensorShape &shape, const DataType &type, Tenso
 }
 Status Tensor::CreateFromMemory(const TensorShape &shape, const DataType &type, const uchar *src, TensorPtr *out) {
   RETURN_IF_NOT_OK(CreateEmpty(shape, type, out));
-  if (src != nullptr) {
+  if (src != nullptr && out != nullptr) {
     // Given the shape/type of this tensor, compute the data size and copy in the input bytes.
     int64_t byte_size = (*out)->SizeInBytes();
     if (byte_size == 0) {
@@ -129,9 +131,11 @@ Status Tensor::CreateFromMemory(const TensorShape &shape, const DataType &type,
 
 Status Tensor::CreateFromMemory(const TensorShape &shape, const DataType &type, const unsigned char *src,
                                 const dsize_t &length, TensorPtr *out) {
-  CHECK_FAIL_RETURN_UNEXPECTED(src != nullptr, "Pointer to source data is null.");
+  RETURN_UNEXPECTED_IF_NULL(src);
+  RETURN_UNEXPECTED_IF_NULL(out);
   const TensorAlloc *alloc = GlobalContext::Instance()->tensor_allocator();
   *out = std::allocate_shared<Tensor>(*alloc, shape, type);
+  CHECK_FAIL_RETURN_UNEXPECTED(out != nullptr, "Allocate memory failed.");
   if (type.IsNumeric()) {
     dsize_t calculated_length = (*out)->SizeInBytes();
     CHECK_FAIL_RETURN_UNEXPECTED(calculated_length == length, "Length of source data does not match the shape.");
@@ -159,6 +163,7 @@ Status Tensor::CreateFromMemory(const TensorShape &shape, const DataType &type,
 
 #ifdef ENABLE_PYTHON
 Status Tensor::CreateFromNpString(py::array arr, std::shared_ptr<Tensor> *out) {
+  RETURN_UNEXPECTED_IF_NULL(out);
   std::vector<dsize_t> shape;
   for (dsize_t i = 0; i < arr.ndim(); i++) {
     shape.push_back(static_cast<dsize_t>(arr.shape()[i]));
@@ -167,9 +172,11 @@ Status Tensor::CreateFromNpString(py::array arr, std::shared_ptr<Tensor> *out) {
   std::vector<std::string> strings;
 
   if (arr.dtype().kind() == 'U') {
-    std::for_each(arr.begin(), arr.end(), [&strings](const auto &s) { strings.emplace_back(py::cast<py::str>(s)); });
+    (void)std::for_each(arr.begin(), arr.end(),
+                        [&strings](const auto &s) { strings.emplace_back(py::cast<py::str>(s)); });
   } else {
-    std::for_each(arr.begin(), arr.end(), [&strings](const auto &s) { strings.emplace_back(py::cast<py::bytes>(s)); });
+    (void)std::for_each(arr.begin(), arr.end(),
+                        [&strings](const auto &s) { strings.emplace_back(py::cast<py::bytes>(s)); });
   }
 
   arr.resize(shape);  // resize arr back to the original shape
@@ -178,6 +185,7 @@ Status Tensor::CreateFromNpString(py::array arr, std::shared_ptr<Tensor> *out) {
 }
 
 Status Tensor::CreateFromNpArray(const py::array &arr, std::shared_ptr<Tensor> *out) {
+  RETURN_UNEXPECTED_IF_NULL(out);
   if (DataType::FromNpArray(arr) == DataType::DE_STRING) {
     return CreateFromNpString(arr, out);
   }
@@ -191,7 +199,7 @@ Status Tensor::CreateFromNpArray(const py::array &arr, std::shared_ptr<Tensor> *
     shape.push_back(static_cast<dsize_t>(arr.shape()[i]));
     strides.push_back(static_cast<dsize_t>(arr.strides()[i]));
     // in case of empty array num_items=0
-    if (count != 0) {
+    if (count != 0 && shape.size() > i && shape[i] != 0) {
       count /= shape[i];
       if (strides[i] != arr.itemsize() * count) {
         is_strided = true;
@@ -213,9 +221,11 @@ Status Tensor::CreateFromNpArray(const py::array &arr, std::shared_ptr<Tensor> *
 
 #ifndef ENABLE_ANDROID
 Status Tensor::CreateFromByteList(const dataengine::BytesList &bytes_list, const TensorShape &shape, TensorPtr *out) {
+  RETURN_UNEXPECTED_IF_NULL(out);
   const TensorAlloc *alloc = GlobalContext::Instance()->tensor_allocator();
   *out = std::allocate_shared<Tensor>(*alloc, TensorShape({static_cast<dsize_t>(bytes_list.value_size())}),
                                       DataType(DataType::DE_STRING));
+  CHECK_FAIL_RETURN_UNEXPECTED(out != nullptr, "Allocate memory failed.");
   // total bytes needed = offset array + strings
   // offset array needs to store one offset var per element + 1 extra to get the length of the last string.
   // strings will be null-terminated --> need 1 extra byte per element
@@ -236,9 +246,7 @@ Status Tensor::CreateFromByteList(const dataengine::BytesList &bytes_list, const
     num_bytes -= kOffsetSize;
     // insert actual string
     int ret_code = memcpy_s((*out)->data_ + offset, num_bytes, common::SafeCStr(str), str.length() + 1);
-    if (ret_code != 0) {
-      MS_LOG(ERROR) << "Cannot copy string into Tensor";
-    }
+    CHECK_FAIL_RETURN_UNEXPECTED(ret_code == 0, "Cannot copy string into Tensor");
     //  next string will be stored right after the current one.
     offset = offset + str.length() + 1;
     // total bytes are reduced by the length of the string
@@ -257,6 +265,7 @@ Status Tensor::CreateFromByteList(const dataengine::BytesList &bytes_list, const
 #endif
 
 Status Tensor::CreateFromFile(const std::string &path, std::shared_ptr<Tensor> *out) {
+  RETURN_UNEXPECTED_IF_NULL(out);
   Path file(path);
   if (file.IsDirectory()) {
     RETURN_STATUS_UNEXPECTED("Invalid file found: " + path + ", should be file, but got directory.");
@@ -269,8 +278,10 @@ Status Tensor::CreateFromFile(const std::string &path, std::shared_ptr<Tensor> *
   CHECK_FAIL_RETURN_UNEXPECTED(fs.seekg(0, std::ios::beg).good(), "Failed to find size of file, check path: " + path);
   RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape{num_bytes}, DataType(DataType::DE_UINT8), out));
   int64_t written_bytes = fs.read(reinterpret_cast<char *>((*out)->GetMutableBuffer()), num_bytes).gcount();
-  CHECK_FAIL_RETURN_UNEXPECTED(written_bytes == num_bytes && fs.good(),
-                               "Error in writing to tensor, check path: " + path);
+  if (!(written_bytes == num_bytes && fs.good())) {
+    fs.close();
+    RETURN_STATUS_UNEXPECTED("Error in writing to tensor, check path: " + path);
+  }
   fs.close();
   return Status::OK();
 }
@@ -278,8 +289,10 @@ Status Tensor::CreateFromFile(const std::string &path, std::shared_ptr<Tensor> *
 #ifndef ENABLE_ANDROID
 Status Tensor::CreateFromByteList(const dataengine::BytesList &bytes_list, const TensorShape &shape,
                                   const DataType &type, dsize_t pad_size, TensorPtr *out) {
+  RETURN_UNEXPECTED_IF_NULL(out);
   RETURN_IF_NOT_OK(Tensor::CreateEmpty(shape, type, out));
 
+  RETURN_UNEXPECTED_IF_NULL(out);
   unsigned char *current_tensor_addr = (*out)->GetMutableBuffer();
   int64_t tensor_bytes_remaining = bytes_list.value_size() * pad_size;
 
@@ -313,18 +326,23 @@ Status Tensor::CreateFromByteList(const dataengine::BytesList &bytes_list, const
 // Here we convert array C to array A, by memcpy index by index (Note that not all elements in C is copied)
 Status Tensor::CopyStridedArray(unsigned char *dst, unsigned char *src, std::vector<dsize_t> shape,
                                 std::vector<dsize_t> strides, uint8_t type_size) {
+  RETURN_UNEXPECTED_IF_NULL(dst);
+  RETURN_UNEXPECTED_IF_NULL(src);
   dsize_t size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<>());
   for (dsize_t i = 0; i < size; ++i) {
     dsize_t offset = 0;
     dsize_t count = i;
     for (size_t j = 0; j < shape.size(); ++j) {
       // convert 1d array's index to 3d array's index (A -> B)
+      CHECK_FAIL_RETURN_UNEXPECTED(shape[shape.size() - 1 - j] != 0, "Invalid data, shape can't be zero.");
       dsize_t idx = count % shape[shape.size() - 1 - j];
       count /= shape[shape.size() - 1 - j];
       // calculate the raw data offset based on strides (B -> C)
       offset += idx * strides[shape.size() - 1 - j];
       // once count = 0, the following idxes are all zero, skip them
-      if (count == 0) break;
+      if (count == 0) {
+        break;
+      }
     }
     // strides already consider byte size of the data type, but dst doesn't.
     // dst[i] = dst + i * type_size = src + offset
@@ -482,6 +500,7 @@ void Tensor::Invalidate() {
 
 template <typename T>
 Status Tensor::GetItemPtr(T **ptr, const std::vector<dsize_t> &index) const {
+  RETURN_UNEXPECTED_IF_NULL(ptr);
   if (type_.IsCompatible<T>()) {
     if (data_ == nullptr) {
       std::string err = "Data is not allocated yet";
@@ -490,6 +509,7 @@ Status Tensor::GetItemPtr(T **ptr, const std::vector<dsize_t> &index) const {
     dsize_t flat_idx;
     RETURN_IF_NOT_OK(shape_.ToFlatIndex(index, &flat_idx));
     *ptr = reinterpret_cast<T *>(data_ + flat_idx * type_.SizeInBytes());
+    RETURN_UNEXPECTED_IF_NULL(ptr);
 
     return Status::OK();
   } else {
@@ -499,6 +519,8 @@ Status Tensor::GetItemPtr(T **ptr, const std::vector<dsize_t> &index) const {
 }
 
 Status Tensor::GetItemPtr(uchar **ptr, const std::vector<dsize_t> &index, offset_t *length) const {
+  RETURN_UNEXPECTED_IF_NULL(ptr);
+  RETURN_UNEXPECTED_IF_NULL(length);
   if (type_ == DataType::DE_STRING) {
     if (data_ == nullptr) {
       std::string err = "Data is not allocated yet";
@@ -519,6 +541,8 @@ Status Tensor::GetItemPtr(uchar **ptr, const std::vector<dsize_t> &index, offset
 }
 
 Status Tensor::StartAddrOfIndex(std::vector<dsize_t> ind, uchar **start_addr_of_index, TensorShape *remaining) {
+  RETURN_UNEXPECTED_IF_NULL(start_addr_of_index);
+  RETURN_UNEXPECTED_IF_NULL(remaining);
   if (type() == DataType::DE_STRING) {
     RETURN_STATUS_UNEXPECTED("StartAddrOfIndex does not support string tensors yet.");
   }
@@ -541,6 +565,7 @@ Status Tensor::StartAddrOfIndex(std::vector<dsize_t> ind, uchar **start_addr_of_
 
 Status Tensor::InsertTensor(const std::vector<dsize_t> &ind, const std::shared_ptr<Tensor> &tensor,
                             const bool partial_insert) {
+  RETURN_UNEXPECTED_IF_NULL(tensor);
   std::string err_msg;
   if (partial_insert) {
     err_msg += (ind.size() != 1)
@@ -603,13 +628,14 @@ Status Tensor::ExpandDim(const dsize_t &axis) {
 std::vector<dsize_t> Tensor::Strides() const {
   std::vector<dsize_t> strides = shape_.Strides();
   uint8_t size = type_.SizeInBytes();
-  std::transform(strides.begin(), strides.end(), strides.begin(), [&size](const auto &c) { return c * size; });
+  (void)std::transform(strides.begin(), strides.end(), strides.begin(), [&size](const auto &c) { return c * size; });
   return strides;
 }
 
 #ifdef ENABLE_PYTHON
 Status Tensor::GetBufferInfo(Tensor *t, py::buffer_info *out) {
   RETURN_UNEXPECTED_IF_NULL(t);
+  RETURN_UNEXPECTED_IF_NULL(out);
   CHECK_FAIL_RETURN_UNEXPECTED(t->type().IsNumeric(), "Cannot use GetBufferInfo on tensor of strings.");
 
   std::string format_desc = t->type().GetPybindFormat();
@@ -622,6 +648,7 @@ Status Tensor::GetBufferInfo(Tensor *t, py::buffer_info *out) {
                          t->Rank(),               /* Number of dimensions */
                          t->shape().AsVector(),   /* Buffer dimensions */
                          t->Strides());
+  RETURN_UNEXPECTED_IF_NULL(out);
   return Status::OK();
 }
 #endif
@@ -721,6 +748,7 @@ Status Tensor::from_json_convert(nlohmann::json json_data, TensorShape shape, st
 
 template <typename T>
 Status Tensor::GetItemAt(T *o, const std::vector<dsize_t> &index) const {
+  RETURN_UNEXPECTED_IF_NULL(o);
   if (data_ == nullptr) {
     RETURN_STATUS_UNEXPECTED("Data is not allocated yet");
   }
@@ -794,6 +822,7 @@ Status Tensor::GetDataAsNumpy(py::array *data) {
   return Status::OK();
 }
 Status Tensor::GetDataAsNumpyStrings(py::array *data) {
+  RETURN_UNEXPECTED_IF_NULL(data);
   auto itr = begin<std::string_view>();
   uint64_t max_value = 0;
   for (; itr != end<std::string_view>(); ++itr) {
@@ -807,7 +836,9 @@ Status Tensor::GetDataAsNumpyStrings(py::array *data) {
   max_value = (max_value == 0 ? 1 : max_value);
   uint64_t total_size = shape_.NumOfElements() * max_value;
   char *tmp_data = reinterpret_cast<char *>(data_allocator_->allocate(total_size));
-  if (tmp_data == nullptr) RETURN_STATUS_UNEXPECTED("Cannot create temp array.");
+  if (tmp_data == nullptr) {
+    RETURN_STATUS_UNEXPECTED("Cannot create temp array.");
+  }
   int ret_code = memset_s(tmp_data, total_size, 0, total_size);
   CHECK_FAIL_RETURN_UNEXPECTED(ret_code == 0, "Failed to initialize temp memory");
 
@@ -820,9 +851,10 @@ Status Tensor::GetDataAsNumpyStrings(py::array *data) {
     }
   }
   auto strides = shape_.Strides();
-  std::transform(strides.begin(), strides.end(), strides.begin(),
-                 [&max_value](const auto &s) { return s * max_value; });
+  (void)std::transform(strides.begin(), strides.end(), strides.begin(),
+                       [&max_value](const auto &s) { return s * max_value; });
   *data = py::array(py::dtype("S" + std::to_string(max_value)), shape_.AsVector(), strides, tmp_data);
+  RETURN_UNEXPECTED_IF_NULL(data);
   data_allocator_->deallocate(reinterpret_cast<uchar *>(tmp_data));
   return Status::OK();
 }
@@ -832,6 +864,7 @@ void Tensor::Squeeze() { shape_ = shape_.Squeeze(); }
 
 template <typename T>
 Status Tensor::GetUnsignedIntAt(T *o, const std::vector<dsize_t> &index) const {
+  RETURN_UNEXPECTED_IF_NULL(o);
   if (data_ == nullptr) {
     RETURN_STATUS_UNEXPECTED("Data is not allocated yet");
   }
@@ -873,6 +906,7 @@ Status Tensor::GetUnsignedIntAt(T *o, const std::vector<dsize_t> &index) const {
 
 template <typename T>
 Status Tensor::GetSignedIntAt(T *o, const std::vector<dsize_t> &index) const {
+  RETURN_UNEXPECTED_IF_NULL(o);
   if (data_ == nullptr) {
     RETURN_STATUS_UNEXPECTED("Data is not allocated yet");
   }
@@ -914,6 +948,7 @@ Status Tensor::GetSignedIntAt(T *o, const std::vector<dsize_t> &index) const {
 
 template <typename T>
 Status Tensor::GetFloatAt(T *o, const std::vector<dsize_t> &index) const {
+  RETURN_UNEXPECTED_IF_NULL(o);
   if (data_ == nullptr) {
     RETURN_STATUS_UNEXPECTED("Data is not allocated yet");
   }
@@ -958,6 +993,7 @@ Status Tensor::GetStringAt(dsize_t index, uchar **string_start, offset_t *length
   return Status::OK();
 }
 Status Tensor::CopyLastDimAt(const std::shared_ptr<Tensor> &src, const std::vector<dsize_t> &index) {
+  RETURN_UNEXPECTED_IF_NULL(src);
   CHECK_FAIL_RETURN_UNEXPECTED(src->type() == type_, "Source Tensor has a different type");
   CHECK_FAIL_RETURN_UNEXPECTED(index.back() == 0, "Last dim in index should be 0");
 
@@ -975,6 +1011,7 @@ Status Tensor::CopyLastDimAt(const std::shared_ptr<Tensor> &src, const std::vect
 
 Status Tensor::GetSliceOption(const SliceOption &slice_option, const int32_t &slice_index,
                               SliceOption *slice_option_ptr) {
+  RETURN_UNEXPECTED_IF_NULL(slice_option_ptr);
   if (slice_option.indices_.empty() && !slice_option.slice_.valid()) {
     RETURN_STATUS_UNEXPECTED("Both indices and slices can not be empty.");
   }
@@ -983,6 +1020,7 @@ Status Tensor::GetSliceOption(const SliceOption &slice_option, const int32_t &sl
     RETURN_STATUS_UNEXPECTED("Both indices and slices can not be given.");
   }
 
+  CHECK_FAIL_RETURN_UNEXPECTED(shape_.Size() > slice_index, "Invalid shape, should greater than slices index.");
   // if slice object was provided, indices should be empty. Generate indices from the slice object.
   if (slice_option.indices_.empty()) {
     // check if slice is valid
@@ -1010,6 +1048,7 @@ Status Tensor::GetSliceOption(const SliceOption &slice_option, const int32_t &sl
 }
 
 Status Tensor::Slice(std::shared_ptr<Tensor> *out, const std::vector<SliceOption> slice_options_) {
+  RETURN_UNEXPECTED_IF_NULL(out);
   std::vector<SliceOption> converted_slice_objects;
 
   CHECK_FAIL_RETURN_UNEXPECTED(slice_options_.size() <= static_cast<size_t>(std::numeric_limits<dsize_t>::max()),
@@ -1046,7 +1085,7 @@ Status Tensor::Slice(std::shared_ptr<Tensor> *out, const std::vector<SliceOption
   for (int i = 0; i < shape_.Rank(); i++) {
     if (i < slice_len) {
       // if it's a slice
-      if (converted_slice_objects[i].indices_.size() == 0) {
+      if (converted_slice_objects[i].indices_.size() == 0 && converted_slice_objects[i].slice_.step_ != 0) {
         slice_len_ind = (converted_slice_objects[i].slice_.stop_ - converted_slice_objects[i].slice_.start_) /
                         converted_slice_objects[i].slice_.step_;
         if ((converted_slice_objects[i].slice_.stop_ - converted_slice_objects[i].slice_.start_) %
@@ -1085,8 +1124,10 @@ Status Tensor::Slice(std::shared_ptr<Tensor> *out, const std::vector<SliceOption
 
 Status Tensor::SliceNumeric(std::shared_ptr<Tensor> *out, const std::vector<std::vector<dsize_t>> &indices,
                             const TensorShape &shape) {
+  RETURN_UNEXPECTED_IF_NULL(out);
   RETURN_IF_NOT_OK(CreateEmpty(shape, type_, out));
 
+  RETURN_UNEXPECTED_IF_NULL(out);
   (*out)->GetMutableBuffer();
   dsize_t out_index = 0;
   std::vector<dsize_t> dim_length = shape_.AsVector();
@@ -1131,6 +1172,7 @@ Status Tensor::SliceNumeric(std::shared_ptr<Tensor> *out, const std::vector<std:
 }
 Status Tensor::SliceString(std::shared_ptr<Tensor> *out, const std::vector<std::vector<dsize_t>> &indices,
                            const TensorShape &shape) {
+  RETURN_UNEXPECTED_IF_NULL(out);
   std::vector<dsize_t> dim_length = shape_.AsVector();
   std::vector<std::string> strings;
 
diff --git a/mindspore/ccsrc/minddata/dataset/core/tensor.h b/mindspore/ccsrc/minddata/dataset/core/tensor.h
index cc011232fde..3c6833049a8 100644
--- a/mindspore/ccsrc/minddata/dataset/core/tensor.h
+++ b/mindspore/ccsrc/minddata/dataset/core/tensor.h
@@ -414,6 +414,10 @@ class Tensor {
   /// \param[in] index_vector vector of indices
   /// \return std::vector<dsize_t> modified vector of indices
   static inline std::vector<dsize_t> HandleNegIndices(std::vector<dsize_t> index_vector, std::vector<dsize_t> length) {
+    if (length.size() < index_vector.size()) {
+      MS_LOG(ERROR) << "The size of length should be greater than the shape of index_vector";
+      return {};
+    }
     std::vector<dsize_t> indices(index_vector.size(), 0);
     for (int i = 0; i < index_vector.size(); i++) {
       indices[i] = HandleNeg(index_vector[i], length[i]);
@@ -780,12 +784,14 @@ inline Tensor::TensorIterator<std::string_view> Tensor::end<std::string_view>()
 template <>
 inline Status Tensor::CreateFromVector<std::string>(const std::vector<std::string> &items, const TensorShape &shape,
                                                     TensorPtr *out) {
+  RETURN_UNEXPECTED_IF_NULL(out);
   CHECK_FAIL_RETURN_UNEXPECTED(
     items.size() == shape.NumOfElements(),
     "Number of elements in the vector does not match the number of elements of the shape required");
   const TensorAlloc *alloc = GlobalContext::Instance()->tensor_allocator();
   *out = std::allocate_shared<Tensor>(*alloc, TensorShape({static_cast<dsize_t>(items.size())}),
                                       DataType(DataType::DE_STRING));
+  CHECK_FAIL_RETURN_UNEXPECTED(out != nullptr, "Allocate memory failed.");
   if (items.size() == 0) {
     if (shape.known()) {
       return (*out)->Reshape(shape);
@@ -835,6 +841,7 @@ inline Status Tensor::CreateFromVector<std::string>(const std::vector<std::strin
 /// \return Status code
 template <>
 inline Status Tensor::CreateScalar<std::string>(const std::string &item, TensorPtr *out) {
+  RETURN_UNEXPECTED_IF_NULL(out);
   return CreateFromVector<std::string>({item}, TensorShape::CreateScalar(), out);
 }
 }  // namespace dataset
diff --git a/mindspore/ccsrc/minddata/dataset/core/tensor_helpers.cc b/mindspore/ccsrc/minddata/dataset/core/tensor_helpers.cc
index 5965ff6fdf5..c358e24dd1d 100644
--- a/mindspore/ccsrc/minddata/dataset/core/tensor_helpers.cc
+++ b/mindspore/ccsrc/minddata/dataset/core/tensor_helpers.cc
@@ -16,6 +16,8 @@
 #include <string>
 #include <vector>
 #include "minddata/dataset/core/tensor_helpers.h"
+#include "minddata/dataset/util/log_adapter.h"
+#include "minddata/dataset/util/status.h"
 
 namespace mindspore {
 namespace dataset {
@@ -23,6 +25,10 @@ namespace dataset {
 void IndexGeneratorHelper(int8_t depth, std::vector<dsize_t> *numbers,
                           const std::vector<mindspore::dataset::SliceOption> &slice_list,
                           std::vector<std::vector<dsize_t>> *matrix) {
+  if (numbers == nullptr || matrix == nullptr) {
+    MS_LOG(ERROR) << "Invalid input pointer, can't be NULL";
+    return;
+  }
   // for loop changes if its an index instead of a slice object
   if (depth > 0) {
     int8_t new_depth = depth - 1;
diff --git a/mindspore/ccsrc/minddata/dataset/core/tensor_row.h b/mindspore/ccsrc/minddata/dataset/core/tensor_row.h
index 2b8291135e5..b2e88df125a 100644
--- a/mindspore/ccsrc/minddata/dataset/core/tensor_row.h
+++ b/mindspore/ccsrc/minddata/dataset/core/tensor_row.h
@@ -87,6 +87,7 @@ class TensorRow {
   /// \param[out] output TensorRow
   template <typename T>
   static Status ConvertToTensorRow(const std::vector<T> &o, TensorRow *output) {
+    RETURN_UNEXPECTED_IF_NULL(output);
     DataType data_type = DataType::FromCType<T>();
     if (data_type == DataType::DE_UNKNOWN) {
       RETURN_STATUS_UNEXPECTED("ConvertToTensorRow: Data type was not recognized.");
@@ -106,6 +107,7 @@ class TensorRow {
   /// \param[out] output TensorRow
   template <typename T>
   static Status ConvertToTensorRow(const T &o, TensorRow *output) {
+    RETURN_UNEXPECTED_IF_NULL(output);
     DataType data_type = DataType::FromCType<T>();
     if (data_type == DataType::DE_UNKNOWN) {
       RETURN_STATUS_UNEXPECTED("ConvertToTensorRow: Data type was not recognized.");
@@ -125,6 +127,7 @@ class TensorRow {
   /// \param[out] o the primitive variable
   template <typename T>
   static Status ConvertFromTensorRow(const TensorRow &input, T *o) {
+    RETURN_UNEXPECTED_IF_NULL(o);
     DataType data_type = DataType::FromCType<T>();
     RETURN_IF_NOT_OK(ValidateTensorRow(input, data_type));
     if (input.at(0)->type() != data_type) {
@@ -142,6 +145,7 @@ class TensorRow {
   /// \param[out] o vector of primitive variable
   template <typename T>
   static Status ConvertFromTensorRow(const TensorRow &input, std::vector<T> *o) {
+    RETURN_UNEXPECTED_IF_NULL(o);
     DataType data_type = DataType::FromCType<T>();
     RETURN_IF_NOT_OK(ValidateTensorRow(input, data_type));
     if (input.at(0)->Rank() != 1)
diff --git a/mindspore/ccsrc/minddata/dataset/core/tensor_shape.cc b/mindspore/ccsrc/minddata/dataset/core/tensor_shape.cc
index 30fdd7dedff..37c9016df04 100644
--- a/mindspore/ccsrc/minddata/dataset/core/tensor_shape.cc
+++ b/mindspore/ccsrc/minddata/dataset/core/tensor_shape.cc
@@ -40,7 +40,7 @@ bool multi_ok(dsize_t x, dsize_t y) {
 }
 
 dsize_t TensorShape::NumOfElements() const {
-  if (!known()) {
+  if (!known() && strides_.size() < 1) {
     return 0;
   }
   return strides_[0];
@@ -216,12 +216,9 @@ py::list TensorShape::AsPyList() {
 #endif
 
 TensorShape TensorShape::Squeeze() const {
-  std::vector<dsize_t> new_shape;
-  for (auto s : AsVector()) {
-    if (s != 1) {
-      new_shape.push_back(s);
-    }
-  }
+  std::vector<dsize_t> new_shape(raw_shape_.size());
+  auto it = std::copy_if(raw_shape_.begin(), raw_shape_.end(), new_shape.begin(), [](auto s) { return s != 1; });
+  new_shape.resize(std::distance(new_shape.begin(), it));
   return TensorShape(new_shape);
 }
 
@@ -230,6 +227,7 @@ std::vector<dsize_t> TensorShape::Strides() const { return std::vector<dsize_t>{
 // Name: ToFlatIndex()
 // Description: convert a vector style index to number, used to access memory internal use only
 Status TensorShape::ToFlatIndex(const std::vector<dsize_t> &index, dsize_t *flat_index) const {
+  RETURN_UNEXPECTED_IF_NULL(flat_index);
   if (index.size() != raw_shape_.size()) {
     std::stringstream ss;
     ss << "Index size (" << index.size() << ") does not match the shape size (" << raw_shape_.size() << ").";
diff --git a/mindspore/ccsrc/minddata/dataset/engine/cache/cache_hw.cc b/mindspore/ccsrc/minddata/dataset/engine/cache/cache_hw.cc
index b5fc586267c..11e28b55f96 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/cache/cache_hw.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/cache/cache_hw.cc
@@ -101,8 +101,8 @@ Status CacheServerHW::GetNumaNodeInfo() {
   };
   // Look for name starts with 'node' and followed by digits.
   const char kNodeName[] = "node";
-  while (it->hasNext()) {
-    auto p = it->next();
+  while (it->HasNext()) {
+    auto p = it->Next();
     const std::string entry = p.Basename();
     const char *name = entry.data();
     if (strncmp(name, kNodeName, strlen(kNodeName)) == 0 && isdigit_string(name + strlen(kNodeName))) {
diff --git a/mindspore/ccsrc/minddata/dataset/engine/cache/cache_pool.cc b/mindspore/ccsrc/minddata/dataset/engine/cache/cache_pool.cc
index 8e2b591407b..26a704a04a1 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/cache/cache_pool.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/cache/cache_pool.cc
@@ -63,8 +63,8 @@ Status CachePool::DoServiceStop() {
   if (!root_.toString().empty()) {
     Path spill = GetSpillPath();
     auto it = Path::DirIterator::OpenDirectory(&spill);
-    while (it->hasNext()) {
-      rc = it->next().Remove();
+    while (it->HasNext()) {
+      rc = it->Next().Remove();
       if (rc.IsError() && rc2.IsOk()) {
         rc2 = rc;
       }
diff --git a/mindspore/ccsrc/minddata/dataset/engine/consumers/pull_based_tree_consumer.cc b/mindspore/ccsrc/minddata/dataset/engine/consumers/pull_based_tree_consumer.cc
index ac5fa282c67..2275badfda5 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/consumers/pull_based_tree_consumer.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/consumers/pull_based_tree_consumer.cc
@@ -24,6 +24,7 @@ namespace mindspore::dataset {
 PullBasedIteratorConsumer::PullBasedIteratorConsumer() { tree_adapter_lite_ = std::make_unique<TreeAdapterLite>(); }
 
 Status PullBasedIteratorConsumer::Init(std::shared_ptr<DatasetNode> root) {
+  RETURN_UNEXPECTED_IF_NULL(root);
   return tree_adapter_lite_->BuildTree(std::move(root));
 }
 
diff --git a/mindspore/ccsrc/minddata/dataset/engine/consumers/python_tree_consumer.cc b/mindspore/ccsrc/minddata/dataset/engine/consumers/python_tree_consumer.cc
index b59bcbf9467..a3bcc94acff 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/consumers/python_tree_consumer.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/consumers/python_tree_consumer.cc
@@ -20,6 +20,7 @@
 
 namespace mindspore::dataset {
 Status PythonIteratorConsumer::GetNextAsList(py::list *out) {
+  RETURN_UNEXPECTED_IF_NULL(out);
   std::vector<TensorPtr> row;
   {
     py::gil_scoped_release gil_release;
@@ -32,6 +33,7 @@ Status PythonIteratorConsumer::GetNextAsList(py::list *out) {
 }
 
 Status PythonIteratorConsumer::GetNextAsDict(py::dict *out) {
+  RETURN_UNEXPECTED_IF_NULL(out);
   std::vector<std::pair<std::string, std::shared_ptr<Tensor>>> vec;
   Status s;
   {
@@ -64,6 +66,8 @@ Status PythonTreeGetters::GetRow(TensorRow *const r) {
   return TreeGetters::GetRow(r);
 }
 Status PythonDatasetSizeGetter::GetRow(const std::shared_ptr<TreeAdapter> &tree_adapter, TensorRow *r) {
+  RETURN_UNEXPECTED_IF_NULL(tree_adapter);
+  RETURN_UNEXPECTED_IF_NULL(r);
   py::gil_scoped_release gil_release;
   return DatasetSizeGetter::GetRow(tree_adapter, r);
 }
diff --git a/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.cc b/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.cc
index c99ffdaf733..01ae379c2ab 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/consumers/tree_consumer.cc
@@ -13,7 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #include <algorithm>
 #include <memory>
 #include <string>
@@ -179,6 +178,8 @@ Status ToDevice::Stop() {
 }
 
 Status ToDevice::GetDataInfo(std::vector<DataType> *const types, std::vector<TensorShape> *const shapes) {
+  RETURN_UNEXPECTED_IF_NULL(types);
+  RETURN_UNEXPECTED_IF_NULL(shapes);
   // tree_.root() must be DeviceQueueOp
   std::shared_ptr<DatasetOp> root = std::shared_ptr<DatasetOp>(tree_adapter_->GetRoot());
   CHECK_FAIL_RETURN_UNEXPECTED(root != nullptr, "Root is a nullptr.");
@@ -218,8 +219,13 @@ Status SaveToDisk::ValidateParams() {
     MS_LOG(ERROR) << err;
     RETURN_STATUS_SYNTAX_ERROR(err);
   }
-  auto parent_path = dir.ParentPath();
-  if (!parent_path.empty() && access(common::SafeCStr(parent_path), R_OK) == -1) {
+  std::string real_path;
+  if (Path::RealPath(dir.ParentPath(), real_path).IsError()) {
+    std::string err_msg = "CreateSaver failed, can not get real dataset path: " + dir.ParentPath();
+    MS_LOG(ERROR) << err_msg;
+    RETURN_STATUS_SYNTAX_ERROR(err_msg);
+  }
+  if (access(dir.ParentPath().c_str(), R_OK) == -1) {
     std::string err_msg = "CreateSaver failed, no access to specified dataset path: " + dataset_path_;
     MS_LOG(ERROR) << err_msg;
     RETURN_STATUS_SYNTAX_ERROR(err_msg);
@@ -250,15 +256,15 @@ Status SaveToDisk::Save() {
   auto mr_header = std::make_shared<mindrecord::ShardHeader>();
   auto mr_writer = std::make_unique<mindrecord::ShardWriter>();
   std::vector<std::string> blob_fields;
-  if (mindrecord::SUCCESS != mindrecord::ShardWriter::initialize(&mr_writer, file_names)) {
+  if (mindrecord::SUCCESS != mindrecord::ShardWriter::Initialize(&mr_writer, file_names)) {
     RETURN_STATUS_UNEXPECTED("Error: failed to initialize ShardWriter, please check above `ERROR` level message.");
   }
 
   std::unordered_map<std::string, int32_t> column_name_id_map;
   for (auto el : tree_adapter_->GetColumnNameMap()) {
     std::string column_name = el.first;
-    std::transform(column_name.begin(), column_name.end(), column_name.begin(),
-                   [](unsigned char c) { return ispunct(c) ? '_' : c; });
+    (void)std::transform(column_name.begin(), column_name.end(), column_name.begin(),
+                         [](unsigned char c) { return ispunct(c) ? '_' : c; });
     column_name_id_map[column_name] = el.second;
   }
 
@@ -281,17 +287,21 @@ Status SaveToDisk::Save() {
       RETURN_IF_NOT_OK(FetchMetaFromTensorRow(column_name_id_map, row, &mr_json, &index_fields));
       MS_LOG(INFO) << "Schema of saved mindrecord: " << mr_json.dump();
       if (mindrecord::SUCCESS !=
-          mindrecord::ShardHeader::initialize(&mr_header, mr_json, index_fields, blob_fields, mr_schema_id)) {
+          mindrecord::ShardHeader::Initialize(&mr_header, mr_json, index_fields, blob_fields, mr_schema_id)) {
         RETURN_STATUS_UNEXPECTED("Error: failed to initialize ShardHeader.");
       }
-      mr_writer->SetShardHeader(mr_header);
+      if (mindrecord::SUCCESS != mr_writer->SetShardHeader(mr_header)) {
+        RETURN_STATUS_UNEXPECTED("Error: failed to set header of ShardWriter.");
+      }
       first_loop = false;
     }
     // construct data
     if (!row.empty()) {  // write data
       RETURN_IF_NOT_OK(FetchDataFromTensorRow(row, column_name_id_map, &row_raw_data, &row_bin_data));
       std::shared_ptr<std::vector<uint8_t>> output_bin_data;
-      mr_writer->MergeBlobData(blob_fields, row_bin_data, &output_bin_data);
+      if (mindrecord::SUCCESS != mr_writer->MergeBlobData(blob_fields, row_bin_data, &output_bin_data)) {
+        RETURN_STATUS_UNEXPECTED("Error: failed to merge blob data of ShardWriter.");
+      }
       std::map<std::uint64_t, std::vector<nlohmann::json>> raw_data;
       raw_data.insert(
         std::pair<uint64_t, std::vector<nlohmann::json>>(mr_schema_id, std::vector<nlohmann::json>{row_raw_data}));
@@ -299,12 +309,16 @@ Status SaveToDisk::Save() {
       if (output_bin_data != nullptr) {
         bin_data.emplace_back(*output_bin_data);
       }
-      mr_writer->WriteRawData(raw_data, bin_data);
+      if (mindrecord::SUCCESS != mr_writer->WriteRawData(raw_data, bin_data)) {
+        RETURN_STATUS_UNEXPECTED("Error: failed to write raw data to ShardWriter.");
+      }
     }
   } while (!row.empty());
 
-  mr_writer->Commit();
-  if (mindrecord::SUCCESS != mindrecord::ShardIndexGenerator::finalize(file_names)) {
+  if (mindrecord::SUCCESS != mr_writer->Commit()) {
+    RETURN_STATUS_UNEXPECTED("Error: failed to commit ShardWriter.");
+  }
+  if (mindrecord::SUCCESS != mindrecord::ShardIndexGenerator::Finalize(file_names)) {
     RETURN_STATUS_UNEXPECTED("Error: failed to finalize ShardIndexGenerator.");
   }
   return Status::OK();
@@ -407,7 +421,7 @@ Status SaveToDisk::FetchMetaFromTensorRow(const std::unordered_map<std::string,
   return Status::OK();
 }
 
-static Status ValidateInputParams(nlohmann::json *row_raw_data,
+inline Status ValidateInputParams(nlohmann::json *row_raw_data,
                                   std::map<std::string, std::unique_ptr<std::vector<uint8_t>>> *row_bin_data,
                                   const std::unordered_map<std::string, int32_t> &column_name_id_map) {
   if (row_raw_data == nullptr) {
@@ -424,6 +438,8 @@ static Status ValidateInputParams(nlohmann::json *row_raw_data,
 
 Status SaveToDisk::FetchFloatData(std::shared_ptr<Tensor> tensor, std::string column_name, nlohmann::json *row_raw_data,
                                   std::unique_ptr<std::vector<uint8_t>> *data_ptr) {
+  RETURN_UNEXPECTED_IF_NULL(row_raw_data);
+  RETURN_UNEXPECTED_IF_NULL(data_ptr);
   auto column_type = tensor->type();
   Status s;
   if (column_type == DataType::DE_FLOAT32) {
@@ -442,6 +458,9 @@ Status SaveToDisk::FetchFloatData(std::shared_ptr<Tensor> tensor, std::string co
 
 Status SaveToDisk::FetchItemData(std::shared_ptr<Tensor> tensor, std::string column_name, nlohmann::json *row_raw_data,
                                  std::map<std::string, std::unique_ptr<std::vector<uint8_t>>> *row_bin_data) {
+  RETURN_UNEXPECTED_IF_NULL(tensor);
+  RETURN_UNEXPECTED_IF_NULL(row_raw_data);
+  RETURN_UNEXPECTED_IF_NULL(row_bin_data);
   auto column_type = tensor->type();
   Status s;
   std::unique_ptr<std::vector<uint8_t>> data_ptr;
@@ -492,7 +511,6 @@ Status SaveToDisk::FetchItemData(std::shared_ptr<Tensor> tensor, std::string col
     RETURN_IF_NOT_OK(tensor->GetItemAt(&sv, {}));  // assume scalar string tensor
     std::string ss(sv);
     (*row_raw_data)[column_name] = std::move(ss);
-    return Status::OK();
   } else {
     RETURN_STATUS_UNEXPECTED("Got unexpected type when casting data.");
   }
@@ -506,6 +524,8 @@ Status SaveToDisk::FetchDataFromTensorRow(const TensorRow &row,
                                           const std::unordered_map<std::string, int32_t> &column_name_id_map,
                                           nlohmann::json *row_raw_data,
                                           std::map<std::string, std::unique_ptr<std::vector<uint8_t>>> *row_bin_data) {
+  RETURN_UNEXPECTED_IF_NULL(row_raw_data);
+  RETURN_UNEXPECTED_IF_NULL(row_bin_data);
   Status s;
   s = ValidateInputParams(row_raw_data, row_bin_data, column_name_id_map);
   if (s.IsError()) {
@@ -525,9 +545,11 @@ template <typename T, typename S>
 Status SaveToDisk::TransformTensor(const unsigned char *src, const TensorShape &shape, const int64_t num_of_elements,
                                    std::unique_ptr<T> *data, std::unique_ptr<std::vector<uint8_t>> *data_ptr,
                                    std::unique_ptr<S> *s, bool need_convert) {
-  if (nullptr == src) {
-    RETURN_STATUS_UNEXPECTED("Error: buffer of Tensor is NULL.");
-  }
+  RETURN_UNEXPECTED_IF_NULL(src);
+  RETURN_UNEXPECTED_IF_NULL(data);
+  RETURN_UNEXPECTED_IF_NULL(data_ptr);
+  RETURN_UNEXPECTED_IF_NULL(s);
+
   *data_ptr = std::make_unique<std::vector<uint8_t>>(num_of_elements * sizeof(T));
   if (need_convert) {
     auto tmp_ptr = std::make_unique<std::vector<uint8_t>>(num_of_elements * sizeof(S));
@@ -560,25 +582,32 @@ TreeGetters::TreeGetters() : dataset_size_(-1), init_flag_(false), first_row_obt
 }
 
 Status TreeGetters::Init(std::shared_ptr<DatasetNode> d) {
+  RETURN_UNEXPECTED_IF_NULL(d);
   root_ = std::move(d);
   return Status::OK();
 }
 
-Status TreeGetters::GetRow(TensorRow *row) { return tree_adapter_->GetNext(row); }
+Status TreeGetters::GetRow(TensorRow *row) {
+  RETURN_UNEXPECTED_IF_NULL(row);
+  return tree_adapter_->GetNext(row);
+}
 
 Status TreeGetters::GetOutputTypes(std::vector<DataType> *types) {
+  RETURN_UNEXPECTED_IF_NULL(types);
   RETURN_IF_NOT_OK(GetFirstRowShapeAndType());
   *types = first_row_type_;
   return Status::OK();
 }
 
 Status TreeGetters::GetOutputShapes(std::vector<TensorShape> *shapes) {
+  RETURN_UNEXPECTED_IF_NULL(shapes);
   RETURN_IF_NOT_OK(GetFirstRowShapeAndType());
   *shapes = first_row_shape_;
   return Status::OK();
 }
 
 Status TreeGetters::GetBatchSize(int64_t *batch_size) {
+  RETURN_UNEXPECTED_IF_NULL(batch_size);
   RETURN_IF_NOT_OK(InternalInit());
   std::shared_ptr<DatasetOp> root = std::shared_ptr<DatasetOp>(tree_adapter_->GetRoot());
   RETURN_UNEXPECTED_IF_NULL(root);
@@ -588,6 +617,7 @@ Status TreeGetters::GetBatchSize(int64_t *batch_size) {
 }
 
 Status TreeGetters::GetRepeatCount(int64_t *repeat_count) {
+  RETURN_UNEXPECTED_IF_NULL(repeat_count);
   RETURN_IF_NOT_OK(InternalInit());
   std::shared_ptr<DatasetOp> root = std::shared_ptr<DatasetOp>(tree_adapter_->GetRoot());
   RETURN_UNEXPECTED_IF_NULL(root);
@@ -596,6 +626,7 @@ Status TreeGetters::GetRepeatCount(int64_t *repeat_count) {
 }
 
 Status TreeGetters::GetNumClasses(int64_t *num_classes) {
+  RETURN_UNEXPECTED_IF_NULL(num_classes);
   RETURN_IF_NOT_OK(InternalInit());
   std::shared_ptr<DatasetOp> root = std::shared_ptr<DatasetOp>(tree_adapter_->GetRoot());
   RETURN_UNEXPECTED_IF_NULL(root);
@@ -604,6 +635,7 @@ Status TreeGetters::GetNumClasses(int64_t *num_classes) {
 }
 
 Status TreeGetters::GetColumnNames(std::vector<std::string> *output) {
+  RETURN_UNEXPECTED_IF_NULL(output);
   RETURN_IF_NOT_OK(InternalInit());
   std::shared_ptr<DatasetOp> root = std::shared_ptr<DatasetOp>(tree_adapter_->GetRoot());
   RETURN_UNEXPECTED_IF_NULL(root);
@@ -620,6 +652,7 @@ Status TreeGetters::GetColumnNames(std::vector<std::string> *output) {
 }
 
 Status TreeGetters::GetClassIndexing(std::vector<std::pair<std::string, std::vector<int32_t>>> *output_class_indexing) {
+  RETURN_UNEXPECTED_IF_NULL(output_class_indexing);
   RETURN_IF_NOT_OK(InternalInit());
   std::shared_ptr<DatasetOp> root = std::shared_ptr<DatasetOp>(tree_adapter_->GetRoot());
   RETURN_UNEXPECTED_IF_NULL(root);
@@ -671,6 +704,7 @@ Status DatasetSizeGetter::Init(std::shared_ptr<DatasetNode> d) {
   return Status::OK();
 }
 Status DatasetSizeGetter::DryRun(std::shared_ptr<DatasetNode> ir_node, int64_t *dataset_size) {
+  RETURN_UNEXPECTED_IF_NULL(dataset_size);
   std::shared_ptr<TreeAdapter> tree_adapter = std::make_shared<TreeAdapter>(TreeAdapter::UsageFlag::kDeGetter);
   tree_adapters_.push_back(tree_adapter);
   RETURN_IF_NOT_OK(tree_adapter->Compile(ir_node, 1));
@@ -685,6 +719,7 @@ Status DatasetSizeGetter::DryRun(std::shared_ptr<DatasetNode> ir_node, int64_t *
   return Status::OK();
 }
 Status DatasetSizeGetter::GetRow(const std::shared_ptr<TreeAdapter> &tree_adapter, TensorRow *row) {
+  RETURN_UNEXPECTED_IF_NULL(row);
   return tree_adapter->GetNext(row);
 }
 Status DatasetSizeGetter::Terminate() {
diff --git a/mindspore/ccsrc/minddata/dataset/engine/data_schema.cc b/mindspore/ccsrc/minddata/dataset/engine/data_schema.cc
index 849b903cdb8..2990a2f5fe0 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/data_schema.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/data_schema.cc
@@ -73,7 +73,7 @@ ColDescriptor::ColDescriptor(const std::string &col_name, DataType col_type, Ten
 ColDescriptor::ColDescriptor(const ColDescriptor &in_cd)
     : type_(in_cd.type_), rank_(in_cd.rank_), tensor_impl_(in_cd.tensor_impl_), col_name_(in_cd.col_name_) {
   // If it has a tensor shape, make a copy of it with our own unique_ptr.
-  tensor_shape_ = in_cd.hasShape() ? std::make_unique<TensorShape>(in_cd.shape()) : nullptr;
+  tensor_shape_ = in_cd.HasShape() ? std::make_unique<TensorShape>(in_cd.Shape()) : nullptr;
 }
 
 // Assignment overload
@@ -84,7 +84,7 @@ ColDescriptor &ColDescriptor::operator=(const ColDescriptor &in_cd) {
     tensor_impl_ = in_cd.tensor_impl_;
     col_name_ = in_cd.col_name_;
     // If it has a tensor shape, make a copy of it with our own unique_ptr.
-    tensor_shape_ = in_cd.hasShape() ? std::make_unique<TensorShape>(in_cd.shape()) : nullptr;
+    tensor_shape_ = in_cd.HasShape() ? std::make_unique<TensorShape>(in_cd.Shape()) : nullptr;
   }
   return *this;
 }
@@ -113,7 +113,7 @@ Status ColDescriptor::MaterializeTensorShape(int32_t num_elements, TensorShape *
 
   // If the shape is not given in this column, then we assume the shape will be: {numElements}
   if (tensor_shape_ == nullptr) {
-    if (this->rank() == 0 && num_elements == 1) {
+    if (this->Rank() == 0 && num_elements == 1) {
       *out_shape = TensorShape::CreateScalar();
       return Status::OK();
     }
@@ -155,7 +155,9 @@ Status ColDescriptor::MaterializeTensorShape(int32_t num_elements, TensorShape *
 
   // Sanity check the the computed element counts divide evenly into the input element count
   if (num_elements < num_elements_of_shape || num_elements_of_shape == 0 || num_elements % num_elements_of_shape != 0) {
-    RETURN_STATUS_UNEXPECTED("Requested shape has an invalid element count!");
+    std::string err = "Requested shape has an invalid element count! Number elements: " + std::to_string(num_elements) +
+                      ", number elements of shape: " + std::to_string(num_elements_of_shape);
+    RETURN_STATUS_UNEXPECTED(err);
   }
 
   // If there was any unknown dimensions, then update the requested shape to fill in the unknown
@@ -171,7 +173,7 @@ Status ColDescriptor::MaterializeTensorShape(int32_t num_elements, TensorShape *
 }
 
 // getter function for the shape
-TensorShape ColDescriptor::shape() const {
+TensorShape ColDescriptor::Shape() const {
   if (tensor_shape_ != nullptr) {
     return *tensor_shape_;  // copy construct a shape to return
   } else {
@@ -255,7 +257,7 @@ Status DataSchema::ColumnOrderLoad(nlohmann::json column_tree, const std::vector
 }
 
 // Internal helper function for parsing shape info and building a vector for the shape construction.
-static Status buildShape(const nlohmann::json &shapeVal, std::vector<dsize_t> *outShape) {
+static Status BuildShape(const nlohmann::json &shapeVal, std::vector<dsize_t> *outShape) {
   if (outShape == nullptr) {
     RETURN_STATUS_UNEXPECTED("null output shape");
   }
@@ -272,7 +274,8 @@ static Status buildShape(const nlohmann::json &shapeVal, std::vector<dsize_t> *o
 Status DataSchema::ColumnLoad(nlohmann::json column_child_tree, const std::string &col_name) {
   int32_t rank_value = -1;
   TensorImpl t_impl_value = TensorImpl::kFlexible;
-  std::string name, type_str;
+  std::string name = "";
+  std::string type_str = "";
   std::vector<dsize_t> tmp_shape = {};
   bool shape_field_exists = false;
   // Iterate over this column's attributes.
@@ -289,7 +292,7 @@ Status DataSchema::ColumnLoad(nlohmann::json column_child_tree, const std::strin
       STR_TO_TENSORIMPL(it_child.value(), t_impl_value);
     } else if (it_child.key() == "shape") {
       shape_field_exists = true;
-      RETURN_IF_NOT_OK(buildShape(it_child.value(), &tmp_shape));
+      RETURN_IF_NOT_OK(BuildShape(it_child.value(), &tmp_shape));
     } else {
       std::string err_msg = "Unexpected column attribute " + it_child.key() + " for column " + col_name;
       RETURN_STATUS_UNEXPECTED(err_msg);
@@ -322,10 +325,10 @@ Status DataSchema::ColumnLoad(nlohmann::json column_child_tree, const std::strin
   // Create the column descriptor for this column from the data we pulled from the json file
   TensorShape col_shape = TensorShape(tmp_shape);
   if (shape_field_exists)
-    (void)this->AddColumn(ColDescriptor(name, DataType(type_str), t_impl_value, rank_value, &col_shape));
+    RETURN_IF_NOT_OK(this->AddColumn(ColDescriptor(name, DataType(type_str), t_impl_value, rank_value, &col_shape)));
   else
     // Create a column descriptor that doesn't have a shape
-    (void)this->AddColumn(ColDescriptor(name, DataType(type_str), t_impl_value, rank_value));
+    RETURN_IF_NOT_OK(this->AddColumn(ColDescriptor(name, DataType(type_str), t_impl_value, rank_value)));
   return Status::OK();
 }
 
@@ -343,19 +346,30 @@ Status DataSchema::LoadSchemaFile(const std::string &schema_file_path,
     } catch (nlohmann::json::out_of_range &e) {
       num_rows_ = 0;
     } catch (nlohmann::json::exception &e) {
+      in.close();
       RETURN_STATUS_UNEXPECTED("Unable to parse \"numRows\" from schema");
     }
     nlohmann::json column_tree = js.at("columns");
     if (column_tree.empty()) {
+      in.close();
       RETURN_STATUS_UNEXPECTED("columns is null");
     }
     if (columns_to_load.empty()) {
       // Parse the json tree and load the schema's columns in whatever order that the json
       // layout decides
-      RETURN_IF_NOT_OK(this->AnyOrderLoad(column_tree));
+      Status rc = this->AnyOrderLoad(column_tree);
+      if (rc.IsError()) {
+        in.close();
+        return rc;
+      }
     } else {
-      RETURN_IF_NOT_OK(this->ColumnOrderLoad(column_tree, columns_to_load));
+      Status rc = this->ColumnOrderLoad(column_tree, columns_to_load);
+      if (rc.IsError()) {
+        in.close();
+        return rc;
+      }
     }
+    in.close();
   } catch (const std::exception &err) {
     // Catch any exception and convert to Status return code
     RETURN_STATUS_UNEXPECTED("Schema file failed to load with JSON tools. File is: " + schema_file_path);
@@ -392,7 +406,7 @@ Status DataSchema::LoadSchemaString(const std::string &schema_json_string,
 DataSchema::~DataSchema() = default;
 
 // Getter for the ColDescriptor by index
-const ColDescriptor &DataSchema::column(int32_t idx) const {
+const ColDescriptor &DataSchema::Column(int32_t idx) const {
   MS_ASSERT(idx < static_cast<int>(col_descs_.size()));
   return col_descs_[idx];
 }
@@ -409,9 +423,9 @@ void DataSchema::Print(std::ostream &out) const {
 Status DataSchema::AddColumn(const ColDescriptor &cd) {
   // Sanity check there's not a duplicate name before adding the column
   for (auto i = 0; i < col_descs_.size(); ++i) {
-    if (col_descs_[i].name() == cd.name()) {
+    if (col_descs_[i].Name() == cd.Name()) {
       std::ostringstream ss;
-      ss << "column name '" << cd.name() << "' already exists in schema.";
+      ss << "column name '" << cd.Name() << "' already exists in schema.";
       std::string err_msg = ss.str();
       RETURN_STATUS_UNEXPECTED(err_msg);
     }
@@ -437,11 +451,11 @@ Status DataSchema::GetColumnNameMap(std::unordered_map<std::string, int32_t> *ou
   }
 
   for (size_t i = 0; i < col_descs_.size(); ++i) {
-    if (col_descs_[i].name().empty()) {
+    if (col_descs_[i].Name().empty()) {
       return Status(StatusCode::kMDUnexpectedError, __LINE__, __FILE__,
                     "Constructing column name map from schema, but found empty column name.");
     }
-    (*out_column_name_map)[col_descs_[i].name()] = i;
+    (*out_column_name_map)[col_descs_[i].Name()] = i;
   }
 
   return Status::OK();
diff --git a/mindspore/ccsrc/minddata/dataset/engine/data_schema.h b/mindspore/ccsrc/minddata/dataset/engine/data_schema.h
index d9f556c22ac..a92f64a3855 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/data_schema.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/data_schema.h
@@ -81,27 +81,27 @@ class ColDescriptor {
 
   /// \brief getter function
   /// \return The column's DataType
-  DataType type() const { return type_; }
+  DataType Type() const { return type_; }
 
   /// \brief getter function
   /// \return The column's rank
-  int32_t rank() const { return rank_; }
+  int32_t Rank() const { return rank_; }
 
   /// \brief getter function
   /// \return The column's name
-  std::string name() const { return col_name_; }
+  std::string Name() const { return col_name_; }
 
   /// \brief getter function
   /// \return The column's shape
-  TensorShape shape() const;
+  TensorShape Shape() const;
 
   /// \brief getter function
   /// \return TF if the column has an assigned fixed shape.
-  bool hasShape() const { return tensor_shape_ != nullptr; }
+  bool HasShape() const { return tensor_shape_ != nullptr; }
 
   /// \brief getter function
   /// \return The column's tensor implementation type
-  TensorImpl tensorImpl() const { return tensor_impl_; }
+  TensorImpl GetTensorImpl() const { return tensor_impl_; }
 
  private:
   DataType type_;                              // The columns type
@@ -153,7 +153,7 @@ class DataSchema {
 
   /// \brief getter
   /// \return The reference to a ColDescriptor to get (const version)
-  const ColDescriptor &column(int32_t idx) const;
+  const ColDescriptor &Column(int32_t idx) const;
 
   /// \brief getter
   /// \return The number of columns in the schema
@@ -163,7 +163,7 @@ class DataSchema {
 
   /// \brief getter
   /// \return The number of rows read from schema
-  int64_t num_rows() const { return num_rows_; }
+  int64_t NumRows() const { return num_rows_; }
 
   static const char DEFAULT_DATA_SCHEMA_FILENAME[];
 
diff --git a/mindspore/ccsrc/minddata/dataset/engine/dataset_iterator.cc b/mindspore/ccsrc/minddata/dataset/engine/dataset_iterator.cc
index 2b722a0d0e3..86024e94698 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/dataset_iterator.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/dataset_iterator.cc
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 #include "minddata/dataset/engine/dataset_iterator.h"
+#include <algorithm>
 #include <unordered_map>
 #include <utility>
 #include "minddata/dataset/core/data_type.h"
diff --git a/mindspore/ccsrc/minddata/dataset/engine/dataset_iterator.h b/mindspore/ccsrc/minddata/dataset/engine/dataset_iterator.h
index 54b0768b198..e2d75efd1c0 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/dataset_iterator.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/dataset_iterator.h
@@ -49,7 +49,7 @@ class DatasetIterator {
   // @return The string to column id mapping.
   std::unordered_map<std::string, int32_t> GetColumnNameMap() const;
 
-  bool eof_handled() const { return eof_handled_; }
+  bool EofHandled() const { return eof_handled_; }
 
   // Fetches one row of data from the iterator.
   // the base class version simply performs error handling and returns empty row. Actual
@@ -108,11 +108,11 @@ class ChildIterator {
   std::unordered_map<std::string, int32_t> GetColumnNameMap() const;
 
   // Return T/F if end of epoch
-  bool end_of_epoch() { return end_epoch_; }
+  bool EndOfEpoch() { return end_epoch_; }
 
   // Getter
   // @return T/F if this iterator is completely done after getting an eof
-  bool eof_handled() const { return eof_handled_; }
+  bool EofHandled() const { return eof_handled_; }
 
  private:
   DatasetOp *current_op_;  // The parent operator. We consume from it's children.
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/barrier_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/barrier_op.cc
index df47d471350..ee7c1185b73 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/barrier_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/barrier_op.cc
@@ -113,6 +113,7 @@ Status BarrierOp::blockCond() {
 
 // fetches next Barrier row
 Status BarrierOp::getNextTensorRow(TensorRow *new_row) {
+  RETURN_UNEXPECTED_IF_NULL(new_row);
   // iterate over all iterators and generate a row
   RETURN_IF_NOT_OK((child_iterator_)->FetchNextTensorRow(new_row));
   // add each new row to iterator, check if row is empty, if row from iterator is empty return empty row
@@ -122,7 +123,7 @@ Status BarrierOp::getNextTensorRow(TensorRow *new_row) {
     MS_LOG(INFO) << "Barrier operator child iterator produced empty row.";
     clean_up_ = true;
     // If we picked up an eof here, then we are completely done.
-    if ((child_iterator_)->eof_handled()) {
+    if ((child_iterator_)->EofHandled()) {
       MS_LOG(INFO) << "Barrier operator iterator got EOF.";
       eof_ = true;
     }
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/batch_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/batch_op.cc
index f6614995b88..5b045c0ecfc 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/batch_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/batch_op.cc
@@ -36,6 +36,7 @@ BatchOp::Builder::Builder(int32_t batch_size) : builder_drop_(false), builder_pa
 }
 
 Status BatchOp::Builder::Build(std::shared_ptr<BatchOp> *ptr) {
+  RETURN_UNEXPECTED_IF_NULL(ptr);
 #ifdef ENABLE_PYTHON
   *ptr = std::make_shared<BatchOp>(builder_batch_size_, builder_drop_, builder_pad_, builder_op_connector_size_,
                                    builder_num_workers_, builder_in_names_, builder_out_names_,
@@ -106,7 +107,7 @@ Status BatchOp::operator()() {
   RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&new_row));
   int32_t cur_batch_size = 0;
   RETURN_IF_NOT_OK(GetBatchSize(&cur_batch_size, CBatchInfo(0, 0, 0)));
-  while (child_iterator_->eof_handled() == false) {
+  while (child_iterator_->EofHandled() == false) {
     while (new_row.empty() == false) {
       table->emplace_back(new_row);
       // if # of rows is enough to make 1 batch, send it to worker_queue
@@ -142,7 +143,7 @@ Status BatchOp::operator()() {
                       << "reduce memory usage.";
     }
 #endif
-  }  // end of eof_handled() == false
+  }  // end of EofHandled() == false
   RETURN_IF_NOT_OK(
     worker_queues_[cnt++ % num_workers_]->EmplaceBack(std::make_pair(nullptr, CBatchInfo(batchCtrl::kEOF))));
   // EOF received, send quit signal to all workers
@@ -168,6 +169,8 @@ void BatchOp::Print(std::ostream &out, bool show_all) const {
 }
 
 Status BatchOp::BatchRows(const std::unique_ptr<TensorQTable> *src, TensorRow *dest, dsize_t batch_size) {
+  RETURN_UNEXPECTED_IF_NULL(src);
+  RETURN_UNEXPECTED_IF_NULL(dest);
   if ((*src)->size() != batch_size) {
     RETURN_STATUS_UNEXPECTED("[Internal ERROR] Source table size does not match the batch_size.");
   }
@@ -274,6 +277,8 @@ Status BatchOp::EoeReceived(int32_t) {
 
 #ifdef ENABLE_PYTHON
 Status BatchOp::MapColumns(std::pair<std::unique_ptr<TensorQTable>, CBatchInfo> *table_pair) {
+  RETURN_UNEXPECTED_IF_NULL(table_pair);
+  RETURN_UNEXPECTED_IF_NULL(table_pair->first);
   std::unique_ptr<TensorQTable> in_q_table = std::move(table_pair->first);
   size_t num_rows = in_q_table->size();
   auto out_q_table = std::make_unique<TensorQTable>(num_rows, TensorRow(column_name_id_map_.size(), nullptr));
@@ -316,6 +321,7 @@ Status BatchOp::MapColumns(std::pair<std::unique_ptr<TensorQTable>, CBatchInfo>
 #endif
 
 Status BatchOp::GetBatchSize(int32_t *batch_size, CBatchInfo info) {
+  RETURN_UNEXPECTED_IF_NULL(batch_size);
 #ifdef ENABLE_PYTHON
   if (batch_size_func_) {
     RETURN_IF_NOT_OK(InvokeBatchSizeFunc(batch_size, info));
@@ -330,6 +336,7 @@ Status BatchOp::GetBatchSize(int32_t *batch_size, CBatchInfo info) {
 
 #ifdef ENABLE_PYTHON
 Status BatchOp::InvokeBatchSizeFunc(int32_t *batch_size, CBatchInfo info) {
+  RETURN_UNEXPECTED_IF_NULL(batch_size);
   {
     // Acquire Python GIL
     py::gil_scoped_acquire gil_acquire;
@@ -355,6 +362,8 @@ Status BatchOp::InvokeBatchSizeFunc(int32_t *batch_size, CBatchInfo info) {
 }
 
 Status BatchOp::InvokeBatchMapFunc(TensorTable *input, TensorTable *output, CBatchInfo info) {
+  RETURN_UNEXPECTED_IF_NULL(input);
+  RETURN_UNEXPECTED_IF_NULL(output);
   {
     // Acquire Python GIL
     py::gil_scoped_acquire gil_acquire;
@@ -471,6 +480,9 @@ Status BatchOp::UnpackPadInfo(const PadInfo &pad_info,
                               const std::unordered_map<std::string, int32_t> &column_name_id_map,
                               std::set<int32_t> *pad_cols, std::vector<std::shared_ptr<Tensor>> *pad_vals,
                               std::vector<std::vector<dsize_t>> *pad_shapes) {
+  RETURN_UNEXPECTED_IF_NULL(pad_cols);
+  RETURN_UNEXPECTED_IF_NULL(pad_vals);
+  RETURN_UNEXPECTED_IF_NULL(pad_shapes);
   if (pad_info.empty()) {  // if pad_info empty, pad every columns automatically
     for (size_t col_id = 0; col_id < column_name_id_map.size(); col_id++) {
       pad_cols->insert(col_id);
@@ -561,6 +573,7 @@ int64_t BatchOp::GetTreeBatchSize() {
 }
 
 Status BatchOp::GetNextRowPullMode(TensorRow *const row) {
+  RETURN_UNEXPECTED_IF_NULL(row);
   std::unique_ptr<TensorQTable> table = std::make_unique<TensorQTable>();
   child_iterator_ = std::make_unique<ChildIterator>(this, 0, 0);
   int32_t cur_batch_size = 0;
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/bucket_batch_by_length_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/bucket_batch_by_length_op.cc
index e9b61aeb8ec..1f8ef1b4b5a 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/bucket_batch_by_length_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/bucket_batch_by_length_op.cc
@@ -60,7 +60,7 @@ Status BucketBatchByLengthOp::operator()() {
   TensorRow current_row;
   child_iterator_ = std::make_unique<ChildIterator>(this, 0, 0);
   RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&current_row));
-  while (!child_iterator_->eof_handled()) {
+  while (!child_iterator_->EofHandled()) {
     while (!current_row.empty()) {
       int32_t element_length;
       RETURN_IF_NOT_OK(ObtainElementLength(&element_length, current_row));
@@ -99,6 +99,7 @@ Status BucketBatchByLengthOp::operator()() {
 }
 
 Status BucketBatchByLengthOp::ObtainElementLength(int32_t *out_element_length, TensorRow element) {
+  RETURN_UNEXPECTED_IF_NULL(out_element_length);
   // call pyfunc here if given pyfunc, otherwise return 0th dimension of shape of
   // the single column specified in length_dependent_columns_
   if (element_length_function_) {
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.cc
index 67c280b3eaa..cf6fe16bae3 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.cc
@@ -52,7 +52,7 @@ Status BuildSentencePieceVocabOp::operator()() {
   RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&new_row));
 
   bool eoe_warning = false;  // give out warning if receive more than 1 eoe
-  while (child_iterator_->eof_handled() == false) {
+  while (child_iterator_->EofHandled() == false) {
     while (new_row.empty() == false) {
       RETURN_IF_NOT_OK(sentence_queue_->EmplaceBack(new_row));
       RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&new_row));
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_vocab_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_vocab_op.cc
index be363ade17a..66bdc5eb079 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_vocab_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/build_vocab_op.cc
@@ -107,7 +107,7 @@ Status BuildVocabOp::operator()() {
     }
   }
   bool eoe_warning = false;  // give out warning if receive more than 1 eoe
-  while (child_iterator_->eof_handled() == false) {
+  while (child_iterator_->EofHandled() == false) {
     while (new_row.empty() == false) {
       RETURN_IF_NOT_OK(distributor_queue_->EmplaceBack(new_row));
       RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&new_row));
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc
index 7d02443ac6e..6c5349cd12e 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc
@@ -593,7 +593,7 @@ Status DeviceQueueOp::SendDataToCPU() {
   MS_LOG(INFO) << "Device queue, sending data to CPU.";
   int64_t total_batch = 0;
 
-  while (!(child_iterator_->eof_handled())) {
+  while (!(child_iterator_->EofHandled())) {
     TensorRow curr_row;
     RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&curr_row));
 
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/filter_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/filter_op.cc
index 2a1983a4ef0..d0d5baac2ad 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/filter_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/filter_op.cc
@@ -62,7 +62,7 @@ Status FilterOp::operator()() {
   TensorRow new_row;
   RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&new_row));
   int64_t cnt = 0;
-  while (child_iterator_->eof_handled() == false) {
+  while (child_iterator_->EofHandled() == false) {
     while (new_row.empty() == false) {
       RETURN_IF_NOT_OK(worker_queues_[cnt % num_workers_]->EmplaceBack(new_row));
       cnt++;
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/shuffle_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/shuffle_op.cc
index 2d4643eb95e..37eacfad944 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/shuffle_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/shuffle_op.cc
@@ -124,7 +124,7 @@ Status ShuffleOp::operator()() {
     RETURN_IF_NOT_OK(InitShuffleBuffer());
 
     // This is our main loop exit condition, when the iterator has no more data completely.
-    if (child_iterator_->eof_handled()) {
+    if (child_iterator_->EofHandled()) {
       RETURN_IF_NOT_OK(out_connector_->SendEOF());
       break;
     }
@@ -214,7 +214,7 @@ Status ShuffleOp::InitShuffleBuffer() {
   TensorRow new_row;
   RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&new_row));
 
-  if (child_iterator_->eof_handled()) {
+  if (child_iterator_->EofHandled()) {
     MS_LOG(DEBUG) << "Shuffle operator init picked up EOF. No more epochs.";
     RETURN_IF_NOT_OK(out_connector_->SendEOF());
     return Status::OK();
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/album_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/album_op.cc
index 3c8af4dd067..42f17df4a78 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/album_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/album_op.cc
@@ -43,7 +43,7 @@ AlbumOp::AlbumOp(int32_t num_wkrs, std::string file_dir, int32_t queue_size, boo
       curr_row_(0) {
   // Set the column name map (base class field)
   for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
-    column_name_id_map_[data_schema_->column(i).name()] = i;
+    column_name_id_map_[data_schema_->Column(i).Name()] = i;
   }
   io_block_queues_.Init(num_workers_, queue_size);
 }
@@ -70,8 +70,8 @@ Status AlbumOp::PrescanEntry() {
   }
   MS_LOG(INFO) << "Album folder Path found: " << folder_path_ << ".";
 
-  while (dirItr->hasNext()) {
-    Path file = dirItr->next();
+  while (dirItr->HasNext()) {
+    Path file = dirItr->Next();
     if (extensions_.empty() || extensions_.find(file.Extension()) != extensions_.end()) {
       (void)image_rows_.push_back(file.toString().substr(dirname_offset_));
     } else {
@@ -192,7 +192,7 @@ Status AlbumOp::LoadStringTensor(const nlohmann::json &json_obj, int32_t col_num
 Status AlbumOp::LoadIntArrayTensor(const nlohmann::json &json_obj, int32_t col_num, TensorRow *row) {
   TensorPtr label;
   // consider templating this function to handle all ints
-  if (data_schema_->column(col_num).type() == DataType::DE_INT64) {
+  if (data_schema_->Column(col_num).Type() == DataType::DE_INT64) {
     std::vector<int64_t> data;
 
     // Iterate over the integer list and add those values to the output shape tensor
@@ -201,7 +201,7 @@ Status AlbumOp::LoadIntArrayTensor(const nlohmann::json &json_obj, int32_t col_n
     (void)std::transform(items.begin(), items.end(), std::back_inserter(data), [](it_type j) { return j.value(); });
 
     RETURN_IF_NOT_OK(Tensor::CreateFromVector(data, &label));
-  } else if (data_schema_->column(col_num).type() == DataType::DE_INT32) {
+  } else if (data_schema_->Column(col_num).Type() == DataType::DE_INT32) {
     std::vector<int32_t> data;
 
     // Iterate over the integer list and add those values to the output shape tensor
@@ -212,7 +212,7 @@ Status AlbumOp::LoadIntArrayTensor(const nlohmann::json &json_obj, int32_t col_n
     RETURN_IF_NOT_OK(Tensor::CreateFromVector(data, &label));
   } else {
     RETURN_STATUS_UNEXPECTED("Invalid data, column type in data_schema is neither int32 nor int64, it is " +
-                             data_schema_->column(col_num).type().ToString());
+                             data_schema_->Column(col_num).Type().ToString());
   }
   row->push_back(std::move(label));
   return Status::OK();
@@ -221,7 +221,7 @@ Status AlbumOp::LoadIntArrayTensor(const nlohmann::json &json_obj, int32_t col_n
 Status AlbumOp::LoadFloatArrayTensor(const nlohmann::json &json_obj, int32_t col_num, TensorRow *row) {
   TensorPtr float_array;
   // consider templating this function to handle all ints
-  if (data_schema_->column(col_num).type() == DataType::DE_FLOAT64) {
+  if (data_schema_->Column(col_num).Type() == DataType::DE_FLOAT64) {
     std::vector<double> data;
 
     // Iterate over the integer list and add those values to the output shape tensor
@@ -230,7 +230,7 @@ Status AlbumOp::LoadFloatArrayTensor(const nlohmann::json &json_obj, int32_t col
     (void)std::transform(items.begin(), items.end(), std::back_inserter(data), [](it_type j) { return j.value(); });
 
     RETURN_IF_NOT_OK(Tensor::CreateFromVector(data, &float_array));
-  } else if (data_schema_->column(col_num).type() == DataType::DE_FLOAT32) {
+  } else if (data_schema_->Column(col_num).Type() == DataType::DE_FLOAT32) {
     std::vector<float> data;
 
     // Iterate over the integer list and add those values to the output shape tensor
@@ -241,14 +241,15 @@ Status AlbumOp::LoadFloatArrayTensor(const nlohmann::json &json_obj, int32_t col
     RETURN_IF_NOT_OK(Tensor::CreateFromVector(data, &float_array));
   } else {
     RETURN_STATUS_UNEXPECTED("Invalid data, column type in data_schema is neither float32 nor float64, it is " +
-                             data_schema_->column(col_num).type().ToString());
+                             data_schema_->Column(col_num).Type().ToString());
   }
   row->push_back(std::move(float_array));
   return Status::OK();
 }
 
 Status AlbumOp::LoadIDTensor(const std::string &file, int32_t col_num, TensorRow *row) {
-  if (data_schema_->column(col_num).type() == DataType::DE_STRING) {
+  RETURN_UNEXPECTED_IF_NULL(row);
+  if (data_schema_->Column(col_num).Type() == DataType::DE_STRING) {
     TensorPtr id;
     RETURN_IF_NOT_OK(Tensor::CreateScalar<std::string>(file, &id));
     row->push_back(std::move(id));
@@ -266,7 +267,7 @@ Status AlbumOp::LoadIDTensor(const std::string &file, int32_t col_num, TensorRow
 Status AlbumOp::LoadEmptyTensor(int32_t col_num, TensorRow *row) {
   // hack to get the file name without extension, the 1 is to get rid of the backslash character
   TensorPtr empty_tensor;
-  RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({0}), data_schema_->column(col_num).type(), &empty_tensor));
+  RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({0}), data_schema_->Column(col_num).Type(), &empty_tensor));
   row->push_back(std::move(empty_tensor));
   return Status::OK();
 }
@@ -277,11 +278,11 @@ Status AlbumOp::LoadEmptyTensor(int32_t col_num, TensorRow *row) {
 // only be float32, seems like a weird limitation to impose
 Status AlbumOp::LoadFloatTensor(const nlohmann::json &json_obj, int32_t col_num, TensorRow *row) {
   TensorPtr float_tensor;
-  if (data_schema_->column(col_num).type() == DataType::DE_FLOAT64) {
+  if (data_schema_->Column(col_num).Type() == DataType::DE_FLOAT64) {
     double data = json_obj;
     MS_LOG(INFO) << "double found: " << json_obj << ".";
     RETURN_IF_NOT_OK(Tensor::CreateScalar<double>(data, &float_tensor));
-  } else if (data_schema_->column(col_num).type() == DataType::DE_FLOAT32) {
+  } else if (data_schema_->Column(col_num).Type() == DataType::DE_FLOAT32) {
     float data1 = json_obj;
     RETURN_IF_NOT_OK(Tensor::CreateScalar<float>(data1, &float_tensor));
     MS_LOG(INFO) << "float found: " << json_obj << ".";
@@ -293,11 +294,11 @@ Status AlbumOp::LoadFloatTensor(const nlohmann::json &json_obj, int32_t col_num,
 // Loads a tensor with int value, we have to cast the value to type specified in the schema.
 Status AlbumOp::LoadIntTensor(const nlohmann::json &json_obj, int32_t col_num, TensorRow *row) {
   TensorPtr int_tensor;
-  if (data_schema_->column(col_num).type() == DataType::DE_INT64) {
+  if (data_schema_->Column(col_num).Type() == DataType::DE_INT64) {
     int64_t data = json_obj;
     MS_LOG(INFO) << "int64 found: " << json_obj << ".";
     RETURN_IF_NOT_OK(Tensor::CreateScalar<int64_t>(data, &int_tensor));
-  } else if (data_schema_->column(col_num).type() == DataType::DE_INT32) {
+  } else if (data_schema_->Column(col_num).Type() == DataType::DE_INT32) {
     int32_t data = json_obj;
     RETURN_IF_NOT_OK(Tensor::CreateScalar<int32_t>(data, &int_tensor));
     MS_LOG(INFO) << "int32 found: " << json_obj << ".";
@@ -349,35 +350,35 @@ Status AlbumOp::LoadTensorRow(row_id_type row_id, TensorRow *row) {
 Status AlbumOp::loadColumnData(const std::string &file, int32_t index, nlohmann::json js, TensorRow *row) {
   int32_t i = index;
   // special case to handle
-  if (data_schema_->column(i).name() == "id") {
+  if (data_schema_->Column(i).Name() == "id") {
     // id is internal, special case to load from file
     return LoadIDTensor(file, i, row);
   }
   // find if key does not exist, insert placeholder nullptr if not found
-  if (js.find(data_schema_->column(i).name()) == js.end()) {
+  if (js.find(data_schema_->Column(i).Name()) == js.end()) {
     // iterator not found, push nullptr as placeholder
-    MS_LOG(INFO) << "Pushing empty tensor for column: " << data_schema_->column(i).name() << ".";
+    MS_LOG(INFO) << "Pushing empty tensor for column: " << data_schema_->Column(i).Name() << ".";
     return LoadEmptyTensor(i, row);
   }
-  nlohmann::json column_value = js.at(data_schema_->column(i).name());
-  MS_LOG(INFO) << "This column is: " << data_schema_->column(i).name() << ".";
+  nlohmann::json column_value = js.at(data_schema_->Column(i).Name());
+  MS_LOG(INFO) << "This column is: " << data_schema_->Column(i).Name() << ".";
   bool is_array = column_value.is_array();
   // load single string
-  if (column_value.is_string() && data_schema_->column(i).type() == DataType::DE_STRING) {
+  if (column_value.is_string() && data_schema_->Column(i).Type() == DataType::DE_STRING) {
     return LoadStringTensor(column_value, i, row);
   }
   // load string array
-  if (is_array && data_schema_->column(i).type() == DataType::DE_STRING) {
+  if (is_array && data_schema_->Column(i).Type() == DataType::DE_STRING) {
     return LoadStringArrayTensor(column_value, i, row);
   }
   // load image file
-  if (column_value.is_string() && data_schema_->column(i).type() != DataType::DE_STRING) {
+  if (column_value.is_string() && data_schema_->Column(i).Type() != DataType::DE_STRING) {
     std::string image_file_path = column_value;
     return LoadImageTensor(image_file_path, i, row);
   }
   // load float value
-  bool judge_float = (data_schema_->column(i).type() == DataType::DE_FLOAT32) ||
-                     (data_schema_->column(i).type() == DataType::DE_FLOAT64);
+  bool judge_float = (data_schema_->Column(i).Type() == DataType::DE_FLOAT32) ||
+                     (data_schema_->Column(i).Type() == DataType::DE_FLOAT64);
   if (!is_array && judge_float) {
     return LoadFloatTensor(column_value, i, row);
   }
@@ -387,15 +388,15 @@ Status AlbumOp::loadColumnData(const std::string &file, int32_t index, nlohmann:
   }
   // int value
   if (!is_array &&
-      (data_schema_->column(i).type() == DataType::DE_INT64 || data_schema_->column(i).type() == DataType::DE_INT32)) {
+      (data_schema_->Column(i).Type() == DataType::DE_INT64 || data_schema_->Column(i).Type() == DataType::DE_INT32)) {
     return LoadIntTensor(column_value, i, row);
   }
   // int array
   if (is_array &&
-      (data_schema_->column(i).type() == DataType::DE_INT64 || data_schema_->column(i).type() == DataType::DE_INT32)) {
+      (data_schema_->Column(i).Type() == DataType::DE_INT64 || data_schema_->Column(i).Type() == DataType::DE_INT32)) {
     return LoadIntArrayTensor(column_value, i, row);
   } else {
-    MS_LOG(WARNING) << "Value type for column: " << data_schema_->column(i).name() << " is not supported.";
+    MS_LOG(WARNING) << "Value type for column: " << data_schema_->Column(i).Name() << " is not supported.";
     return Status::OK();
   }
 }
@@ -438,7 +439,7 @@ Status AlbumOp::ComputeColMap() {
   // Set the column name map (base class field)
   if (column_name_id_map_.empty()) {
     for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
-      column_name_id_map_[data_schema_->column(i).name()] = i;
+      column_name_id_map_[data_schema_->Column(i).Name()] = i;
     }
   } else {
     MS_LOG(WARNING) << "Column name map is already set!";
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/celeba_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/celeba_op.cc
index f9c8be4423c..1e6d79a23e7 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/celeba_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/celeba_op.cc
@@ -258,7 +258,7 @@ Status CelebAOp::LoadTensorRow(row_id_type row_id, TensorRow *row) {
   }
 
   RETURN_IF_NOT_OK(
-    Tensor::CreateEmpty(TensorShape({1, (uint32_t)image_label.second.size()}), data_schema_->column(1).type(), &label));
+    Tensor::CreateEmpty(TensorShape({1, (uint32_t)image_label.second.size()}), data_schema_->Column(1).Type(), &label));
   RETURN_IF_NOT_OK(label->Zero());
   for (uint32_t index = 0; index < image_label.second.size(); index++) {
     if (image_label.second[index] == 1) {
@@ -294,7 +294,7 @@ Status CelebAOp::ComputeColMap() {
   // Set the column name map (base class field)
   if (column_name_id_map_.empty()) {
     for (int32_t index = 0; index < data_schema_->NumColumns(); index++) {
-      column_name_id_map_[data_schema_->column(index).name()] = index;
+      column_name_id_map_[data_schema_->Column(index).Name()] = index;
     }
   } else {
     MS_LOG(WARNING) << "Column name map is already set!";
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/cifar_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/cifar_op.cc
index 6b865917ed4..4f752201dcd 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/cifar_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/cifar_op.cc
@@ -205,8 +205,8 @@ Status CifarOp::GetCifarFiles() {
   Path dir_path(folder_path_);
   auto dirIt = Path::DirIterator::OpenDirectory(&dir_path);
   if (dirIt) {
-    while (dirIt->hasNext()) {
-      Path file = dirIt->next();
+    while (dirIt->HasNext()) {
+      Path file = dirIt->Next();
       if (file.Extension() == kExtension) {
         cifar_files_.push_back(file.toString());
       }
@@ -236,7 +236,7 @@ Status CifarOp::ParseCifarData() {
 
       std::shared_ptr<Tensor> image_tensor;
       RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({kCifarImageHeight, kCifarImageWidth, kCifarImageChannel}),
-                                           data_schema_->column(0).type(), &image_tensor));
+                                           data_schema_->Column(0).Type(), &image_tensor));
       auto itr = image_tensor->begin<uint8_t>();
       uint32_t total_pix = kCifarImageHeight * kCifarImageWidth;
       for (uint32_t pix = 0; pix < total_pix; ++pix) {
@@ -369,7 +369,7 @@ Status CifarOp::ComputeColMap() {
   // set the column name map (base class field)
   if (column_name_id_map_.empty()) {
     for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
-      column_name_id_map_[data_schema_->column(i).name()] = i;
+      column_name_id_map_[data_schema_->Column(i).Name()] = i;
     }
   } else {
     MS_LOG(WARNING) << "Column name map is already set!";
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/cmu_arctic_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/cmu_arctic_op.cc
deleted file mode 100644
index ee7f872b590..00000000000
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/cmu_arctic_op.cc
+++ /dev/null
@@ -1,254 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "minddata/dataset/engine/datasetops/source/cmu_arctic_op.h"
-
-#include <fstream>
-#include <iomanip>
-#include <set>
-#include "utils/ms_utils.h"
-#include "minddata/dataset/core/config_manager.h"
-#include "minddata/dataset/core/tensor_shape.h"
-#include "minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h"
-#include "minddata/dataset/engine/db_connector.h"
-#include "minddata/dataset/engine/execution_tree.h"
-
-namespace mindspore {
-namespace dataset {
-
-const size_t kWavHandSize=44;
-const size_t kReadbufferSize=20480;
-const std::string dataDirectory = "wav";
-const std::string labelDirectory = "etc";
-const std::string labelFileName = "txt.done.data";
-
-const std::string pre="cmu_us_";
-const std::string suf="_arctic";
-
-CmuArcticOp::CmuArcticOp(const std::string &usage, int32_t num_workers, std::string folder_path, int32_t queue_size,
-             std::unique_ptr<DataSchema> data_schema, std::shared_ptr<SamplerRT> sampler)
-    : MappableLeafOp(num_workers, queue_size, std::move(sampler)),
-      usage_(usage),
-      folder_path_(folder_path),
-      data_schema_(std::move(data_schema)) {
-  io_block_queues_.Init(num_workers, queue_size);
-}
-
-Status CmuArcticOp::LoadTensorRow(row_id_type row_id, TensorRow *trow) {
-  CmuArcticLabelTuple audio_tuple = audio_label_tuple_[row_id];
-  std::shared_ptr <Tensor> waveform, rate, utterance, utterance_id;
-  RETURN_IF_NOT_OK(Tensor::CreateFromTensor(audio_tuple.waveform, &waveform));
-  RETURN_IF_NOT_OK(Tensor::CreateScalar(audio_tuple.sample_rate, &rate));
-  RETURN_IF_NOT_OK(Tensor::CreateScalar(audio_tuple.utterance, &utterance));
-  RETURN_IF_NOT_OK(Tensor::CreateScalar(audio_tuple.utterance_id, &utterance_id));
-  (*trow) = TensorRow(row_id, {std::move(waveform), std::move(rate), std::move(utterance), std::move(utterance_id)});
-  trow->setPath({audio_names_[row_id].first});
-  return Status::OK();
-}
-
-void CmuArcticOp::Print(std::ostream &out, bool show_all) const {
-  if (!show_all) {
-    // Call the super class for displaying any common 1-liner info
-    ParallelOp::Print(out, show_all);
-    // Then show any custom derived-internal 1-liner info for this op
-    out << "\n";
-  }
-  else {
-    // Call the super class for displaying any common detailed info
-    ParallelOp::Print(out, show_all);
-    // Then show any custom derived-internal stuff
-    out << "\nNumber of rows:" << num_rows_ << "\nCmuArctic Directory: " << folder_path_ << "\n\n";
-  }
-}
-
-// Derived from RandomAccessOp
-Status CmuArcticOp::GetClassIds(std::map<std::string, std::vector<int64_t>> *cls_ids) const {
-  if (cls_ids == nullptr || !cls_ids->empty() || audio_label_tuple_.empty()) {
-    if (audio_label_tuple_.empty()) {
-      RETURN_STATUS_UNEXPECTED("No audio found in dataset, please check if Op read audios successfully or not.");
-    }
-    else {
-      RETURN_STATUS_UNEXPECTED(
-          "Map for storaging audio-index pair is nullptr or has been set in other place,"
-          "it must be empty before using GetClassIds.");
-    }
-  }
-  for (size_t i = 0; i < audio_label_tuple_.size(); ++i) {
-    (*cls_ids)[audio_label_tuple_[i].utterance_id].push_back(i);//
-  }
-  for (auto &pair : (*cls_ids)) {
-    pair.second.shrink_to_fit();
-  }
-  return Status::OK();
-}
-
-
-Status CmuArcticOp::CountTotalRows(const std::string &dir, const std::string &usage, int64_t *count) {
-  *count = 0;
-  const int64_t num_samples = 0;
-  const int64_t start_index = 0;
-  auto sampler = std::make_shared<SequentialSamplerRT>(start_index, num_samples);
-  auto schema = std::make_unique<DataSchema>();
-
-  RETURN_IF_NOT_OK(schema->AddColumn(ColDescriptor("waveform", DataType(DataType::DE_FLOAT64), TensorImpl::kCv, 1)));
-  TensorShape scalar_rate = TensorShape::CreateScalar();
-  RETURN_IF_NOT_OK(
-      schema->AddColumn(ColDescriptor("sample_rate", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0,
-                      &scalar_rate)));
-  TensorShape scalar_utterance = TensorShape::CreateScalar();
-  RETURN_IF_NOT_OK(
-      schema->AddColumn(ColDescriptor("utterance", DataType(DataType::DE_STRING), TensorImpl::kFlexible, 0,
-                      &scalar_utterance)));
-  TensorShape scalar_utterance_id = TensorShape::CreateScalar();
-  RETURN_IF_NOT_OK(
-      schema->AddColumn(ColDescriptor("utterance_id", DataType(DataType::DE_STRING), TensorImpl::kFlexible, 0,
-                      &scalar_utterance_id)));
-  std::shared_ptr <ConfigManager> cfg = GlobalContext::config_manager();
-
-  int32_t num_workers = cfg->num_parallel_workers();
-  int32_t op_connect_size = cfg->op_connector_size();
-  auto op = std::make_shared<CmuArcticOp>(usage, num_workers, dir, op_connect_size, std::move(schema),
-                      std::move(sampler));
-  RETURN_IF_NOT_OK(op->WalkAllFiles());
-  *count = op->audio_names_.size();
-  return Status::OK();
-}
-
-Status CmuArcticOp::ComputeColMap() {
-  // set the column name map (base class field)
-  if (column_name_id_map_.empty()) {
-    for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
-      column_name_id_map_[data_schema_->column(i).name()] = i;
-    }
-  }
-  else {
-    MS_LOG(WARNING) << "Column name map is already set!";
-  }
-  return Status::OK();
-}
-
-Status CmuArcticOp::ReadLabel() {
-  char buffer[1024];
-  for (std::string u:label_files_) {
-    std::ifstream in(u);
-    while (!in.eof()) {
-      in.getline(buffer, 1024);
-      if (buffer[0] != '(')
-        break;
-      int32_t blank[3] = {0};
-      int32_t cur = 0;
-      for (int32_t i = 0; cur < 2 && i < 1024; i++) {
-        if (buffer[i] == '"')
-          blank[cur++] = i;
-      }
-      if (cur != 2)
-        RETURN_STATUS_UNEXPECTED("Label file error!");
-      buffer[blank[0] - 1] = 0;
-      buffer[blank[1]] = 0;
-      label_pairs_.push_back({std::string(buffer + 2), std::string(buffer + blank[0] + 1)});
-    }
-  }
-  if (audio_names_.size() != label_pairs_.size())
-    RETURN_STATUS_UNEXPECTED("The number of files is different from the number of labels!");
-  std::sort(audio_names_.begin(), audio_names_.end());
-  std::sort(label_pairs_.begin(), label_pairs_.end());
-  return Status::OK();
-}
-
-Status CmuArcticOp::ReadAudio() {
-  char header[kWavHandSize];
-  short buff[kReadbufferSize];
-  const double mx = 32768.0;
-  std::vector<double> tempArr;
-  for (uint32_t i = 0; i < audio_names_.size(); i++) {
-    if (audio_names_[i].first != label_pairs_[i].first + ".wav") {
-      RETURN_STATUS_UNEXPECTED("An error occurred between the label and the file content!");
-    }
-    tempArr.clear();
-    auto item = audio_names_[i];
-    const char *dir = item.second.data();
-    FILE *fp = fopen(dir, "rb");
-    if (fp == NULL) {
-      MS_LOG(WARNING) << "File missing . dir:" << dir;
-      continue;
-    }
-    uint32_t s = fread(header, 1, kWavHandSize, fp);
-    if (s != kWavHandSize)
-      RETURN_STATUS_UNEXPECTED("Audio header error!");
-    uint32_t rate = *(uint32_t * )(header + 0x18);
-    uint32_t frame = *(uint32_t * )(header + 0x28) / 2;
-    uint32_t surplus = frame;
-    while (surplus) {
-      uint32_t len = fread(buff, 2, kReadbufferSize, fp);
-      for (uint32_t i = 0; i < len; i++) {
-        tempArr.push_back(buff[i] / mx);
-      }
-      surplus -= len;
-    }
-    fclose(fp);
-    std::shared_ptr <Tensor> audio;
-    RETURN_IF_NOT_OK(Tensor::CreateFromVector(tempArr, &audio));
-    audio_label_tuple_.push_back({audio, rate, label_pairs_[i].second, label_pairs_[i].first});
-  }
-  num_rows_ = audio_names_.size();
-  return Status::OK();
-}
-
-Status CmuArcticOp::WalkAllFiles() {
-  Path dir(folder_path_);
-  Path fullDir = (dir + pre + usage_ + suf) / dataDirectory;
-  Path label = (dir + pre + usage_ + suf) / labelDirectory / labelFileName;
-  label_files_.push_back(label.toString());
-  auto dirIt = Path::DirIterator::OpenDirectory(&fullDir);
-  if (dirIt != nullptr) {
-    while (dirIt->hasNext()) {
-      Path file = dirIt->next();
-      std::string fileName = file.toString();
-      auto pos = fileName.find_last_of('.');
-      std::string ext = fileName.substr(pos);
-      if (ext == ".wav") {
-        audio_names_.push_back({file.Basename(), file.toString()});
-      }
-      else {
-        MS_LOG(WARNING) << "File name format error :" << file.toString() << ".";
-      }
-    }
-  }
-  else {
-    MS_LOG(WARNING) << "Unable to open directory " << fullDir.toString() << ".";
-  }
-  return Status::OK();
-}
-
-Status CmuArcticOp::LaunchThreadsAndInitOp() {
-  if (tree_ == nullptr) {
-    RETURN_STATUS_UNEXPECTED("Pipeline init failed, Execution tree not set.");
-  }
-  RETURN_IF_NOT_OK(io_block_queues_.Register(tree_->AllTasks()));
-  RETURN_IF_NOT_OK(wait_for_workers_post_.Register(tree_->AllTasks()));
-  RETURN_IF_NOT_OK(
-      tree_->LaunchWorkers(num_workers_, std::bind(&CmuArcticOp::WorkerEntry, this, std::placeholders::_1), "",
-                 id()));
-  TaskManager::FindMe()->Post();
-  RETURN_IF_NOT_OK(this->WalkAllFiles());
-  RETURN_IF_NOT_OK(this->ReadLabel());
-  RETURN_IF_NOT_OK(this->ReadAudio());
-  RETURN_IF_NOT_OK(this->InitSampler());  // handle shake with sampler
-  return Status::OK();
-}
-
-}  // namespace dataset
-}  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/cmu_arctic_op.h b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/cmu_arctic_op.h
deleted file mode 100644
index bb7ceff5a5a..00000000000
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/cmu_arctic_op.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_CMUARCTIC_OP_H_
-#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_CMUARCTIC_OP_H_
-
-#include <memory>
-#include <string>
-#include <algorithm>
-#include <map>
-#include <vector>
-#include <utility>
-
-#include "minddata/dataset/core/tensor.h"
-
-#include "minddata/dataset/engine/data_schema.h"
-#include "minddata/dataset/engine/datasetops/parallel_op.h"
-#include "minddata/dataset/engine/datasetops/source/mappable_leaf_op.h"
-#include "minddata/dataset/engine/datasetops/source/sampler/sampler.h"
-#include "minddata/dataset/util/path.h"
-#include "minddata/dataset/util/queue.h"
-#include "minddata/dataset/util/status.h"
-#include "minddata/dataset/util/wait_post.h"
-
-namespace mindspore {
-namespace dataset {
-
-
-
-
-
-struct CmuArcticLabelTuple{
-  std::shared_ptr<Tensor> waveform;
-  uint32_t sample_rate;
-  std::string utterance;
-  std::string utterance_id;
-};
-
-
-class CmuArcticOp : public MappableLeafOp {
- public:
-  // Constructor
-  // @param const std::string &usage - Usage of this dataset, can be 'train', 'test' ,'valid'or 'all'
-  // @param int32_t num_workers - number of workers reading audios in parallel
-  // @param std::string folder_path - dir directory of mnist
-  // @param int32_t queue_size - connector queue size
-  // @param std::unique_ptr<DataSchema> data_schema - the schema of the mnist dataset
-  // @param td::unique_ptr<Sampler> sampler - sampler tells CmuArcticOp what to read
-  CmuArcticOp(const std::string &usage, int32_t num_workers, std::string folder_path, int32_t queue_size,
-          std::unique_ptr<DataSchema> data_schema, std::shared_ptr<SamplerRT> sampler);
-
-  // Destructor.
-  ~CmuArcticOp() = default;
-
-  // Method derived from RandomAccess Op, enable Sampler to get all ids for each class
-  // @param (std::map<uint64_t, std::vector<uint64_t >> * map - key label, val all ids for this class
-  // @return Status The status code returned
-  Status GetClassIds(std::map<std::string, std::vector<int64_t>> *cls_ids) const ;
-
-  // A print method typically used for debugging
-  // @param out
-  // @param show_all
-  void Print(std::ostream &out, bool show_all) const override;
-
-  // Function to count the number of samples in the MNIST dataset
-  // @param dir path to the MNIST directory
-  // @param count output arg that will hold the minimum of the actual dataset size and numSamples
-  // @return
-
- static Status CountTotalRows(const std::string &dir, const std::string &usage, int64_t *count);
-
-  // Op name getter
-  // @return Name of the current Op
- std::string Name() const override { return "CmuArcticOp"; }
-
- private:
-  // Load a tensor row according to a pair
-  // @param row_id_type row_id - id for this tensor row
-  // @param ImageLabelPair pair - <audiofile,label>
-  // @param TensorRow row - audio & label read into this tensor row
-  // @return Status The status code returned
-  Status LoadTensorRow(row_id_type row_id, TensorRow *row) override;
-
-  Status ReadAudio();
-
-  Status ReadLabel();
-
-  // Read all files in the directory
-  // @return Status The status code returned
-  Status WalkAllFiles();
-
-  // Called first when function is called
-  // @return Status The status code returned
-  Status LaunchThreadsAndInitOp() override;
-
-  // Private function for computing the assignment of the column name map.
-  // @return - Status
-  Status ComputeColMap() override;
-
-
-  std::string folder_path_;  // directory of audio folder
-  const std::string usage_;  
-  std::unique_ptr<DataSchema> data_schema_;
-  std::vector<CmuArcticLabelTuple> audio_label_tuple_;
-  std::vector<std::pair<std::string,std::string>> audio_names_;
-  std::vector<std::pair<std::string,std::string>> label_pairs_;
-  std::vector<std::string> label_files_;
-};
-
-
-
-}  // namespace dataset
-}  // namespace mindspore
-#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_CMUARCTIC_OP_H_
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/coco_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/coco_op.cc
index 824980c296c..ac946c0edb5 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/coco_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/coco_op.cc
@@ -86,7 +86,7 @@ Status CocoOp::LoadTensorRow(row_id_type row_id, TensorRow *trow) {
   }
 
   std::string kImageFile = image_folder_path_ + std::string("/") + image_id;
-  RETURN_IF_NOT_OK(ReadImageToTensor(kImageFile, data_schema_->column(0), &image));
+  RETURN_IF_NOT_OK(ReadImageToTensor(kImageFile, data_schema_->Column(0), &image));
 
   auto bboxRow = itr->second;
   std::vector<float> bbox_row;
@@ -505,7 +505,7 @@ Status CocoOp::ComputeColMap() {
   // Set the column name map (base class field)
   if (column_name_id_map_.empty()) {
     for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
-      column_name_id_map_[data_schema_->column(i).name()] = i;
+      column_name_id_map_[data_schema_->Column(i).Name()] = i;
     }
   } else {
     MS_LOG(WARNING) << "Column name map is already set!";
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/flickr_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/flickr_op.cc
index 65735a488f9..6a3c17f39c2 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/flickr_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/flickr_op.cc
@@ -17,10 +17,10 @@
 
 #include <algorithm>
 #include <fstream>
-#include <iomanip>
 #include <set>
 #include <utility>
 
+#include "debug/common.h"
 #include "minddata/dataset/core/config_manager.h"
 #include "minddata/dataset/core/tensor_shape.h"
 #include "minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h"
@@ -94,7 +94,13 @@ void FlickrOp::Print(std::ostream &out, bool show_all) const {
 }
 
 Status FlickrOp::ParseFlickrData() {
-  std::ifstream file_handle(file_path_);
+  auto real_file_path = Common::GetRealPath(file_path_);
+  if (!real_file_path.has_value()) {
+    MS_LOG(ERROR) << "Get real path failed, path=" << file_path_;
+    RETURN_STATUS_UNEXPECTED("Get real path failed, path=" + file_path_);
+  }
+
+  std::ifstream file_handle(real_file_path.value());
   if (!file_handle.is_open()) {
     RETURN_STATUS_UNEXPECTED("Invalid file, failed to open Flickr annotation file: " + file_path_);
   }
@@ -129,7 +135,11 @@ Status FlickrOp::ParseFlickrData() {
       }
 
       bool valid = false;
-      RETURN_IF_NOT_OK(CheckImageType(image_file_path, &valid));
+      Status type_check = CheckImageType(image_file_path, &valid);
+      if (type_check.IsError()) {
+        file_handle.close();
+        RETURN_IF_NOT_OK(type_check);
+      }
       if (!valid) {
         continue;
       }
@@ -153,10 +163,16 @@ Status FlickrOp::ParseFlickrData() {
 // Optimization: Could take in a tensor
 // This function does not return status because we want to just skip bad input, not crash
 Status FlickrOp::CheckImageType(const std::string &file_name, bool *valid) {
+  auto real_file_name = Common::GetRealPath(file_name);
+  if (!real_file_name.has_value()) {
+    MS_LOG(ERROR) << "Get real path failed, path=" << file_name;
+    RETURN_STATUS_UNEXPECTED("Get real path failed, path=" + file_name);
+  }
+
   std::ifstream file_handle;
   constexpr int read_num = 3;
   *valid = false;
-  file_handle.open(file_name, std::ios::binary | std::ios::in);
+  file_handle.open(real_file_name.value(), std::ios::binary | std::ios::in);
   if (!file_handle.is_open()) {
     RETURN_STATUS_UNEXPECTED("Invalid file, failed to open image file: " + file_name);
   }
@@ -224,7 +240,7 @@ Status FlickrOp::ComputeColMap() {
   // Set the column name map (base class field)
   if (column_name_id_map_.empty()) {
     for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
-      column_name_id_map_[data_schema_->column(i).name()] = i;
+      column_name_id_map_[data_schema_->Column(i).Name()] = i;
     }
   } else {
     MS_LOG(WARNING) << "Column name map is already set!";
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/image_folder_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/image_folder_op.cc
index 00b7ae4251f..782a2d87f61 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/image_folder_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/image_folder_op.cc
@@ -156,8 +156,8 @@ Status ImageFolderOp::PrescanWorkerEntry(int32_t worker_id) {
       RETURN_STATUS_UNEXPECTED("Invalid file, failed to open folder: " + folder_name);
     }
     std::set<std::string> imgs;  // use this for ordering
-    while (dirItr->hasNext()) {
-      Path file = dirItr->next();
+    while (dirItr->HasNext()) {
+      Path file = dirItr->Next();
       if (extensions_.empty() || extensions_.find(file.Extension()) != extensions_.end()) {
         (void)imgs.insert(file.toString().substr(dirname_offset_));
       } else {
@@ -182,8 +182,8 @@ Status ImageFolderOp::PrescanWorkerEntry(int32_t worker_id) {
 Status ImageFolderOp::RecursiveWalkFolder(Path *dir) {
   std::shared_ptr<Path::DirIterator> dir_itr = Path::DirIterator::OpenDirectory(dir);
   RETURN_UNEXPECTED_IF_NULL(dir_itr);
-  while (dir_itr->hasNext()) {
-    Path subdir = dir_itr->next();
+  while (dir_itr->HasNext()) {
+    Path subdir = dir_itr->Next();
     if (subdir.IsDirectory()) {
       if (class_index_.empty() ||
           class_index_.find(subdir.toString().substr(dirname_offset_ + 1)) != class_index_.end()) {
@@ -256,8 +256,8 @@ Status ImageFolderOp::CountRowsAndClasses(const std::string &path, const std::se
   std::queue<std::string> folder_paths;
   std::shared_ptr<Path::DirIterator> dir_itr = Path::DirIterator::OpenDirectory(&dir);
   std::unordered_set<std::string> folder_names;
-  while (dir_itr->hasNext()) {
-    Path subdir = dir_itr->next();
+  while (dir_itr->HasNext()) {
+    Path subdir = dir_itr->Next();
     if (subdir.IsDirectory()) {
       folder_paths.push(subdir.toString());
       if (!class_index.empty()) folder_names.insert(subdir.Basename());
@@ -283,7 +283,7 @@ Status ImageFolderOp::CountRowsAndClasses(const std::string &path, const std::se
     if (subdir.Exists() == false || dir_itr == nullptr) {
       RETURN_STATUS_UNEXPECTED("Invalid file, failed to open folder: " + subdir.toString());
     }
-    while (dir_itr->hasNext()) {
+    while (dir_itr->HasNext()) {
       if (exts.empty() || exts.find(subdir.Extension()) != exts.end()) {
         ++row_cnt;
       }
@@ -298,7 +298,7 @@ Status ImageFolderOp::ComputeColMap() {
   // Set the column name map (base class field)
   if (column_name_id_map_.empty()) {
     for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
-      column_name_id_map_[data_schema_->column(i).name()] = i;
+      column_name_id_map_[data_schema_->Column(i).Name()] = i;
     }
   } else {
     MS_LOG(WARNING) << "Column name map is already set!";
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/libri_speech_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/libri_speech_op.cc
deleted file mode 100644
index 936ad337c33..00000000000
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/libri_speech_op.cc
+++ /dev/null
@@ -1,385 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "minddata/dataset/engine/datasetops/source/libri_speech_op.h"
-
-
-#include <fstream>
-#include <iomanip>
-#include <set>
-#include "utils/ms_utils.h"
-#include "minddata/dataset/core/config_manager.h"
-#include "minddata/dataset/core/tensor_shape.h"
-#include "minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h"
-#include "minddata/dataset/engine/db_connector.h"
-#include "minddata/dataset/engine/execution_tree.h"
-
-namespace mindspore {
-namespace dataset {
-
-const uint32_t kAudioBufferSize = 20480;
-const uint32_t kAudioRefillThresh = 4096;
-
-LibriSpeechOp::LibriSpeechOp(const std::string &usage, int32_t num_workers, std::string folder_path, int32_t queue_size,
-               std::unique_ptr<DataSchema> data_schema, std::shared_ptr<SamplerRT> sampler)
-    : MappableLeafOp(num_workers, queue_size, std::move(sampler)),
-      usage_(usage),
-      folder_path_(folder_path),
-      data_schema_(std::move(data_schema)) {
-  io_block_queues_.Init(num_workers, queue_size);
-}
-
-Status LibriSpeechOp::LoadTensorRow(row_id_type row_id, TensorRow *trow) {
-  LibriSpeechLabelTuple audio_tuple = audio_label_tuple_[row_id];
-  std::shared_ptr <Tensor> waveform, sample_rate, utterance, speaker_id, chapter_id, utterance_id;
-
-  RETURN_IF_NOT_OK(Tensor::CreateFromTensor(audio_tuple.waveform, &waveform));
-  RETURN_IF_NOT_OK(Tensor::CreateScalar(audio_tuple.sample_rate, &sample_rate));
-  RETURN_IF_NOT_OK(Tensor::CreateScalar(audio_tuple.utterance, &utterance));
-  RETURN_IF_NOT_OK(Tensor::CreateScalar(audio_tuple.speaker_id, &speaker_id));
-  RETURN_IF_NOT_OK(Tensor::CreateScalar(audio_tuple.chapter_id, &chapter_id));
-  RETURN_IF_NOT_OK(Tensor::CreateScalar(audio_tuple.utterance_id, &utterance_id));
-
-  (*trow) = TensorRow(row_id,
-            {std::move(waveform), std::move(sample_rate), std::move(utterance), std::move(speaker_id),
-             std::move(chapter_id), std::move(utterance_id)});
-  trow->setPath({flac_nodes_[row_id].file_link});
-  return Status::OK();
-}
-
-void LibriSpeechOp::Print(std::ostream &out, bool show_all) const {
-  if (!show_all) {
-    // Call the super class for displaying any common 1-liner info
-    ParallelOp::Print(out, show_all);
-    // Then show any custom derived-internal 1-liner info for this op
-    out << "\n";
-  }
-  else {
-    // Call the super class for displaying any common detailed info
-    ParallelOp::Print(out, show_all);
-    // Then show any custom derived-internal stuff
-    out << "\nNumber of rows:" << num_rows_ << "\nLibriSpeech Directory: " << folder_path_ << "\n\n";
-  }
-}
-
-// Derived from RandomAccessOp
-Status LibriSpeechOp::GetClassIds(std::map<uint32_t, std::vector<int64_t>> *cls_ids) const {
-  if (cls_ids == nullptr || !cls_ids->empty() || audio_label_tuple_.empty()) {
-    if (audio_label_tuple_.empty()) {
-      RETURN_STATUS_UNEXPECTED("No audio found in dataset, please check if Op read images successfully or not.");
-    }
-    else {
-      RETURN_STATUS_UNEXPECTED(
-          "Map for storaging image-index pair is nullptr or has been set in other place,"
-          "it must be empty before using GetClassIds.");
-    }
-  }
-  for (size_t i = 0; i < audio_label_tuple_.size(); ++i) {
-    (*cls_ids)[audio_label_tuple_[i].utterance_id].push_back(i);//
-  }
-  for (auto &pair : (*cls_ids)) {
-    pair.second.shrink_to_fit();
-  }
-  return Status::OK();
-}
-
-
-Status LibriSpeechOp::CountTotalRows(const std::string &dir, const std::string &usage, int64_t *count) {
-  // the logic of counting the number of samples is copied from ParseMnistData() and uses CheckReader()
-  *count = 0;
-  const int64_t num_samples = 0;
-  const int64_t start_index = 0;
-  auto sampler = std::make_shared<SequentialSamplerRT>(start_index, num_samples);
-  auto schema = std::make_unique<DataSchema>();
-
-  RETURN_IF_NOT_OK(schema->AddColumn(ColDescriptor("waveform", DataType(DataType::DE_FLOAT64), TensorImpl::kCv, 1)));
-  TensorShape scalar_rate = TensorShape::CreateScalar();
-  RETURN_IF_NOT_OK(
-      schema->AddColumn(ColDescriptor("sample_rate", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0,
-                      &scalar_rate)));
-  TensorShape scalar_utterance = TensorShape::CreateScalar();
-  RETURN_IF_NOT_OK(
-      schema->AddColumn(ColDescriptor("utterance", DataType(DataType::DE_STRING), TensorImpl::kFlexible, 0,
-                      &scalar_utterance)));
-  TensorShape scalar_speaker_id = TensorShape::CreateScalar();
-  RETURN_IF_NOT_OK(
-      schema->AddColumn(ColDescriptor("speaker_id", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0,
-                      &scalar_speaker_id)));
-  TensorShape scalar_chapter_id = TensorShape::CreateScalar();
-  RETURN_IF_NOT_OK(
-      schema->AddColumn(ColDescriptor("chapter_id", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0,
-                      &scalar_chapter_id)));
-  TensorShape scalar_utterance_id = TensorShape::CreateScalar();
-  RETURN_IF_NOT_OK(
-      schema->AddColumn(ColDescriptor("utterance_id", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0,
-                      &scalar_utterance_id)));
-
-  std::shared_ptr <ConfigManager> cfg = GlobalContext::config_manager();
-  int32_t num_workers = cfg->num_parallel_workers();
-  int32_t op_connect_size = cfg->op_connector_size();
-  auto op = std::make_shared<LibriSpeechOp>(usage, num_workers, dir, op_connect_size, std::move(schema),
-                        std::move(sampler));
-  RETURN_IF_NOT_OK(op->WalkAllFiles());
-  *count = op->flac_files_.size();
-  return Status::OK();
-}
-
-
-
-Status LibriSpeechOp::DecodeFlac(AVCodecContext *dec_ctx, AVPacket *pkt, AVFrame *frame,std::vector<double> &arr) {
-  int32_t i, ch;
-  int32_t ret, data_size;
-
-  ret = avcodec_send_packet(dec_ctx, pkt);
-  if (ret < 0) {
-    RETURN_STATUS_UNEXPECTED("Error submitting the packet to the decoder!");
-  }
-
-  while (ret >= 0) {
-    ret = avcodec_receive_frame(dec_ctx, frame);
-    if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
-      return Status::OK();
-    }
-    else if (ret < 0) {
-      RETURN_STATUS_UNEXPECTED("Error during decoding!");
-    }
-    data_size = av_get_bytes_per_sample(dec_ctx->sample_fmt);
-    if (data_size < 0) {
-      RETURN_STATUS_UNEXPECTED("Failed to calculate data size!");
-    }
-    for (i = 0; i < frame->nb_samples; i++)
-      for (ch = 0; ch < dec_ctx->channels; ch++)
-        arr.push_back((*(short *) (frame->data[ch] + data_size * i)) / 32768.0);
-  }
-  return Status::OK();
-}
-
-
-Status LibriSpeechOp::ComputeColMap() {
-  // set the column name map (base class field)
-  if (column_name_id_map_.empty()) {
-    for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
-      column_name_id_map_[data_schema_->column(i).name()] = i;
-    }
-  }
-  else {
-    MS_LOG(WARNING) << "Column name map is already set!";
-  }
-  return Status::OK();
-}
-
-Status LibriSpeechOp::ReadLabel() {
-  char buff[2048];
-  for (auto u:label_files_) {
-    std::ifstream in(u);
-    while (!in.eof()) {
-      in.getline(buff, 2048);
-      if (buff[0] < '0' || buff[0] > '9')
-        break;
-
-      uint32_t blank[3] = {0};
-      uint32_t cur = 0;
-      uint32_t start = 0;
-      for (uint32_t i = 0; i < 2048; i++) {
-        if (buff[i] == '-')
-          blank[cur++] = i;
-        if (buff[i] == ' ') {
-          start = i + 1;
-          break;
-        }
-      }
-      if (cur != 2)
-        RETURN_STATUS_UNEXPECTED("Label file error!");
-      uint32_t speaker_id = 0;
-      uint32_t chapter_id = 0;
-      uint32_t utterance_id = 0;
-      for (uint32_t i = 0; i < blank[0]; i++)
-        speaker_id = speaker_id * 10 + buff[i] - '0';
-      for (uint32_t i = blank[0] + 1; i < blank[1]; i++)
-        chapter_id = chapter_id * 10 + buff[i] - '0';
-      for (uint32_t i = blank[1] + 1; i < start - 1; i++)
-        utterance_id = utterance_id * 10 + buff[i] - '0';
-      buff[start - 1] = 0;
-      flac_nodes_.push_back({std::string(buff), std::string(buff + start), speaker_id, chapter_id, utterance_id});
-    }
-  }
-
-  std::sort(flac_files_.begin(), flac_files_.end());
-  std::sort(flac_nodes_.begin(), flac_nodes_.end(),
-        [&](flac_node a, flac_node b) { return a.file_link < b.file_link; });
-  for (uint32_t i = 0; i < flac_files_.size(); i++) {
-    if (flac_nodes_[i].file_link != flac_files_[i].first) {
-      RETURN_STATUS_UNEXPECTED("An error occurred between the label and the file content!");
-    }
-    flac_nodes_[i].file_link = flac_files_[i].second;
-  }
-  return Status::OK();
-}
-
-Status LibriSpeechOp::ReadAudio() {
-
-  for (flac_node u:flac_nodes_) {
-    std::vector<double> arr;
-    char *filename = u.file_link.data();
-    const AVCodec *codec;
-
-    AVCodecContext *c = NULL;
-    AVCodecParserContext *parser = NULL;
-    AVPacket *pkt;
-    AVFrame *decoded_frame = NULL;
-    FILE *f;
-
-    int32_t len, ret;
-    uint8_t inbuf[kAudioBufferSize + AV_INPUT_BUFFER_PADDING_SIZE];
-    uint8_t *data;
-    size_t data_size;
-
-    pkt = av_packet_alloc();
-    codec = avcodec_find_decoder(AV_CODEC_ID_FLAC);
-    if (!codec) {
-      RETURN_STATUS_UNEXPECTED("Codec not found!");
-    }
-    parser = av_parser_init(codec->id);
-    if (!parser) {
-      RETURN_STATUS_UNEXPECTED("Parser not found!");
-    }
-    c = avcodec_alloc_context3(codec);
-    if (!c) {
-      RETURN_STATUS_UNEXPECTED("Could not allocate audio codec context!");
-    }
-    if (avcodec_open2(c, codec, NULL) < 0) {
-      RETURN_STATUS_UNEXPECTED("Could not open codec!");
-    }
-
-    f = fopen(filename, "rb");
-    if (!f) {
-      RETURN_STATUS_UNEXPECTED(std::string("Could not open ") + filename);
-    }
-
-    data = inbuf;
-    data_size = fread(inbuf, 1, kAudioBufferSize, f);
-
-    decoded_frame = av_frame_alloc();
-    while (true) {
-      pkt->size = 0;
-      pkt->data = nullptr;
-      ret = av_parser_parse2(parser, c, &pkt->data, &pkt->size,
-                   data, data_size,
-                   AV_NOPTS_VALUE, AV_NOPTS_VALUE, 0);
-
-      if (pkt->size == 0 && data_size == 0)
-        break;
-      if (ret < 0) {
-        RETURN_STATUS_UNEXPECTED("Error while parsing");
-      }
-      data += ret;
-      data_size -= ret;
-      if (pkt->size) {
-        RETURN_IF_NOT_OK(DecodeFlac(c, pkt, decoded_frame, arr));
-      }
-
-      if (data_size < kAudioRefillThresh) {
-        memmove(inbuf, data, data_size);
-        data = inbuf;
-        len = fread(data + data_size, 1,
-              kAudioBufferSize - data_size, f);
-        if (len > 0)
-          data_size += len;
-      }
-    }
-
-    pkt->size = 0;
-    pkt->data = nullptr;
-    RETURN_IF_NOT_OK(DecodeFlac(c, pkt, decoded_frame, arr));
-    uint32_t rate = c->sample_rate;
-    fclose(f);
-    avcodec_free_context(&c);
-    av_parser_close(parser);
-    av_frame_free(&decoded_frame);
-    av_packet_free(&pkt);
-    std::shared_ptr <Tensor> audio;
-    RETURN_IF_NOT_OK(Tensor::CreateFromVector(arr, &audio));
-    audio_label_tuple_.push_back({audio, rate, u.utterance, u.speaker_id, u.speaker_id, u.utterance_id});
-  }
-  num_rows_ = audio_label_tuple_.size();
-  return Status::OK();
-}
-
-Status LibriSpeechOp::WalkAllFiles() {
-  Path dir(folder_path_);
-  Path fullDir = dir + usage_;
-  auto dirIt = Path::DirIterator::OpenDirectory(&fullDir);
-  if (dirIt != nullptr) {
-    while (dirIt->hasNext()) {
-      Path file = dirIt->next();
-
-      auto subDirIt = Path::DirIterator::OpenDirectory(&file);
-      if (subDirIt != nullptr) {
-        while (subDirIt->hasNext()) {
-          Path subFile = subDirIt->next();
-
-          auto leafDirIt = Path::DirIterator::OpenDirectory(&subFile);
-          if (leafDirIt != nullptr) {
-            while (leafDirIt->hasNext()) {
-              Path actFile = leafDirIt->next();
-              std::string p = actFile.toString();
-              size_t pos = p.size() - 3;
-              size_t len = actFile.Basename().size() - 5;
-              if (pos < 0 || len < 0)
-                RETURN_STATUS_UNEXPECTED("File name parsing error!");
-              std::string t = p.substr(pos);
-              if (t == "lac") {
-                flac_files_.push_back({actFile.Basename().substr(0, len), p});
-              }
-              else if (t == "txt") {
-                label_files_.push_back(p);
-              }
-              else {
-                MS_LOG(WARNING) << "File name format error :" << actFile.toString() << ".";
-              }
-            }
-          }//leafDirIt
-
-        }
-      }//subDirIt
-
-    }
-  }//DirIt
-  else {
-    MS_LOG(WARNING) << "Unable to open directory " << fullDir.toString() << ".";
-  }
-  return Status::OK();
-}
-
-Status LibriSpeechOp::LaunchThreadsAndInitOp() {
-  if (tree_ == nullptr) {
-    RETURN_STATUS_UNEXPECTED("Pipeline init failed, Execution tree not set.");
-  }
-  RETURN_IF_NOT_OK(io_block_queues_.Register(tree_->AllTasks()));
-  RETURN_IF_NOT_OK(wait_for_workers_post_.Register(tree_->AllTasks()));
-  RETURN_IF_NOT_OK(
-      tree_->LaunchWorkers(num_workers_, std::bind(&LibriSpeechOp::WorkerEntry, this, std::placeholders::_1), "",
-                 id()));
-  TaskManager::FindMe()->Post();
-  RETURN_IF_NOT_OK(this->WalkAllFiles());
-  RETURN_IF_NOT_OK(this->ReadLabel());
-  RETURN_IF_NOT_OK(this->ReadAudio());
-  RETURN_IF_NOT_OK(this->InitSampler());  // handle shake with sampler
-  return Status::OK();
-}
-
-}  // namespace dataset
-}  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/libri_speech_op.h b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/libri_speech_op.h
deleted file mode 100644
index d91fb488412..00000000000
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/libri_speech_op.h
+++ /dev/null
@@ -1,144 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_LIBRISPEECH_OP_H_
-#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_LIBRISPEECH_OP_H_
-
-extern "C"{
-  #include <libavutil/frame.h>
-  #include <libavutil/mem.h>
-  #include <libavutil/file.h>
-  #include <libavcodec/avcodec.h>
-  #include <libavformat/avformat.h>
-  #include <libavformat/avio.h>
-}
-
-#include <memory>
-#include <string>
-#include <algorithm>
-#include <map>
-#include <vector>
-#include <utility>
-
-#include "minddata/dataset/core/tensor.h"
-
-#include "minddata/dataset/engine/data_schema.h"
-#include "minddata/dataset/engine/datasetops/parallel_op.h"
-#include "minddata/dataset/engine/datasetops/source/mappable_leaf_op.h"
-#include "minddata/dataset/engine/datasetops/source/sampler/sampler.h"
-#include "minddata/dataset/util/path.h"
-#include "minddata/dataset/util/queue.h"
-#include "minddata/dataset/util/status.h"
-#include "minddata/dataset/util/wait_post.h"
-
-namespace mindspore {
-namespace dataset {
-
-struct LibriSpeechLabelTuple{
-  std::shared_ptr<Tensor> waveform;
-  uint32_t sample_rate;
-  std::string utterance;
-  uint32_t speaker_id;
-  uint32_t chapter_id;
-  uint32_t utterance_id;
-};
-
-struct flac_node {
-  std::string file_link;
-  std::string utterance;
-  uint32_t speaker_id;
-  uint32_t chapter_id;
-  uint32_t utterance_id;
-};
-
-class LibriSpeechOp : public MappableLeafOp {
- public:
-  // Constructor
-  // @param const std::string &usage - Usage of this dataset, can be 'train', 'test' ,'valid'or 'all'
-  // @param int32_t num_workers - number of workers reading audios in parallel
-  // @param std::string folder_path - dir directory of LibriSppech
-  // @param int32_t queue_size - connector queue size
-  // @param std::unique_ptr<DataSchema> data_schema - the schema of the LibriSppech dataset
-  // @param td::unique_ptr<Sampler> sampler - sampler tells LibriSpeechOp what to read
-  LibriSpeechOp(const std::string &usage, int32_t num_workers, std::string folder_path, int32_t queue_size,
-          std::unique_ptr<DataSchema> data_schema, std::shared_ptr<SamplerRT> sampler);
-
-  // Destructor.
-  ~LibriSpeechOp() = default;
-
-  // Method derived from RandomAccess Op, enable Sampler to get all ids for each class
-  // @param (std::map<uint64_t, std::vector<uint64_t >> * map - key label, val all ids for this class
-  // @return Status The status code returned
-  Status GetClassIds(std::map<uint32_t, std::vector<int64_t>> *cls_ids) const ;
-
-  // A print method typically used for debugging
-  // @param out
-  // @param show_all
-  void Print(std::ostream &out, bool show_all) const override;
-
-  // Function to count the number of samples in the LibriSppech dataset
-  // @param dir path to the LibriSppech directory
-  // @param count output arg that will hold the minimum of the actual dataset size and numSamples
-  // @return
-
- static Status CountTotalRows(const std::string &dir, const std::string &usage, int64_t *count);
-
-  // Op name getter
-  // @return Name of the current Op
- std::string Name() const override { return "LibriSpeechOp"; }
-
- private:
-  Status DecodeFlac(AVCodecContext *dec_ctx, AVPacket *pkt, AVFrame *frame,std::vector<double> &arr);
-  
-  // Load a tensor row according to a pair
-  // @param row_id_type row_id - id for this tensor row
-  // @param ImageLabelPair pair - <audiofile,label>
-  // @param TensorRow row - audio & label read into this tensor row
-  // @return Status The status code returned
-  Status LoadTensorRow(row_id_type row_id, TensorRow *row) override;
-
-  Status ReadAudio();
-
-  Status ReadLabel();
-
-  // Read all files in the directory
-  // @return Status The status code returned
-  Status WalkAllFiles();
-
-  // Called first when function is called
-  // @return Status The status code returned
-  Status LaunchThreadsAndInitOp() override;
-
-  // Private function for computing the assignment of the column name map.
-  // @return - Status
-  Status ComputeColMap() override;
-
-
-  std::string folder_path_;  // directory of audio folder
-  const std::string usage_;  // can only be either "train" or "test"
-  
-  std::unique_ptr<DataSchema> data_schema_;
-  std::vector<LibriSpeechLabelTuple> audio_label_tuple_;
-
-  std::vector<std::string>label_files_;
-  std::vector<std::pair<std::string, std::string>>flac_files_;
-  std::vector<flac_node>flac_nodes_;
-};
-
-
-
-}  // namespace dataset
-}  // namespace mindspore
-#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_LIBRISPEECH_OP_H_
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/manifest_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/manifest_op.cc
index 7fbba5daaaf..62134cedec4 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/manifest_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/manifest_op.cc
@@ -274,7 +274,7 @@ Status ManifestOp::ComputeColMap() {
   // Set the column name map (base class field)
   if (column_name_id_map_.empty()) {
     for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
-      column_name_id_map_[data_schema_->column(i).name()] = i;
+      column_name_id_map_[data_schema_->Column(i).Name()] = i;
     }
   } else {
     MS_LOG(WARNING) << "Column name map is already set!";
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/mindrecord_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/mindrecord_op.cc
index 91d7c14566c..beb23ec80e6 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/mindrecord_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/mindrecord_op.cc
@@ -113,7 +113,7 @@ Status MindRecordOp::Init() {
       CHECK_FAIL_RETURN_UNEXPECTED(
         colname_to_ind.find(colname) != colname_to_ind.end(),
         "Invalid data, specified loading column name: " + colname + " does not exist in data file.");
-      RETURN_IF_NOT_OK(tmp_schema->AddColumn(data_schema_->column(colname_to_ind[colname])));
+      RETURN_IF_NOT_OK(tmp_schema->AddColumn(data_schema_->Column(colname_to_ind[colname])));
     }
     data_schema_ = std::move(tmp_schema);
   }
@@ -271,8 +271,8 @@ Status MindRecordOp::LoadTensorRow(TensorRow *tensor_row, const std::vector<uint
     }
 
     std::shared_ptr<Tensor> tensor;
-    const ColDescriptor &column = data_schema_->column(i_col);
-    DataType type = column.type();
+    const ColDescriptor &column = data_schema_->Column(i_col);
+    DataType type = column.Type();
 
     // Set shape
     CHECK_FAIL_RETURN_UNEXPECTED(column_data_type_size != 0, "Found memory size of column data type is 0.");
@@ -280,9 +280,14 @@ Status MindRecordOp::LoadTensorRow(TensorRow *tensor_row, const std::vector<uint
     if (type == DataType::DE_STRING) {
       std::string s{data, data + n_bytes};
       RETURN_IF_NOT_OK(Tensor::CreateScalar(s, &tensor));
-    } else if (column.hasShape()) {
-      auto new_shape = TensorShape(column.shape());
-      RETURN_IF_NOT_OK(column.MaterializeTensorShape(static_cast<int32_t>(num_elements), &new_shape));
+    } else if (column.HasShape()) {
+      auto new_shape = TensorShape(column.Shape());
+      // if the numpy is null, create empty tensor shape
+      if (num_elements == 0) {
+        new_shape = TensorShape({});
+      } else {
+        RETURN_IF_NOT_OK(column.MaterializeTensorShape(static_cast<int32_t>(num_elements), &new_shape));
+      }
       RETURN_IF_NOT_OK(Tensor::CreateFromMemory(new_shape, type, data, &tensor));
     } else {
       std::vector<dsize_t> shapeDetails = {static_cast<dsize_t>(num_elements)};
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/mnist_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/mnist_op.cc
index d8f0c4c45ff..7e8728607b7 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/mnist_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/mnist_op.cc
@@ -180,7 +180,7 @@ Status MnistOp::ReadImageAndLabel(std::ifstream *image_reader, std::ifstream *la
       pixels[m] = (pixels[m] == 0) ? 0 : 255;
     }
     std::shared_ptr<Tensor> image;
-    RETURN_IF_NOT_OK(Tensor::CreateFromMemory(img_tensor_shape, data_schema_->column(0).type(),
+    RETURN_IF_NOT_OK(Tensor::CreateFromMemory(img_tensor_shape, data_schema_->Column(0).Type(),
                                               reinterpret_cast<unsigned char *>(pixels), &image));
     image_label_pairs_.emplace_back(std::make_pair(image, labels_buf[j]));
     image_path_.push_back(image_names_[index]);
@@ -225,8 +225,8 @@ Status MnistOp::WalkAllFiles() {
   std::string prefix;  // empty string, used to match usage = "" (default) or usage == "all"
   if (usage_ == "train" || usage_ == "test") prefix = (usage_ == "test" ? test_prefix : train_prefix);
   if (dir_it != nullptr) {
-    while (dir_it->hasNext()) {
-      Path file = dir_it->next();
+    while (dir_it->HasNext()) {
+      Path file = dir_it->Next();
       std::string fname = file.Basename();  // name of the mnist file
       if ((fname.find(prefix + "-images") != std::string::npos) && (fname.find(img_ext) != std::string::npos)) {
         image_names_.push_back(file.toString());
@@ -307,7 +307,7 @@ Status MnistOp::ComputeColMap() {
   // set the column name map (base class field)
   if (column_name_id_map_.empty()) {
     for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
-      column_name_id_map_[data_schema_->column(i).name()] = i;
+      column_name_id_map_[data_schema_->Column(i).Name()] = i;
     }
   } else {
     MS_LOG(WARNING) << "Column name map is already set!";
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/random_data_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/random_data_op.cc
index 64cdb151a7d..b5a81ec2a08 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/random_data_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/random_data_op.cc
@@ -267,8 +267,8 @@ Status RandomDataOp::CreateRandomRow(int32_t worker_id, TensorRow *new_row) {
 
   // Create a tensor for each column, then add the tensor to the row
   for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
-    const ColDescriptor current_col = data_schema_->column(i);
-    std::vector<dsize_t> current_shape = current_col.shape().AsVector();
+    const ColDescriptor current_col = data_schema_->Column(i);
+    std::vector<dsize_t> current_shape = current_col.Shape().AsVector();
     std::unique_ptr<TensorShape> new_shape = nullptr;
     std::unique_ptr<unsigned char[]> buf = nullptr;
     std::shared_ptr<Tensor> new_tensor = nullptr;
@@ -282,7 +282,7 @@ Status RandomDataOp::CreateRandomRow(int32_t worker_id, TensorRow *new_row) {
     }
 
     new_shape = std::make_unique<TensorShape>(current_shape);
-    int64_t size_in_bytes = new_shape->NumOfElements() * current_col.type().SizeInBytes();
+    int64_t size_in_bytes = new_shape->NumOfElements() * current_col.Type().SizeInBytes();
 
     // Generate a random byte of data.  This may cause some funny data for things like doubles,floats, bools
     // however the random data op is not too concerned about the physical data itself.
@@ -296,7 +296,7 @@ Status RandomDataOp::CreateRandomRow(int32_t worker_id, TensorRow *new_row) {
       return Status(StatusCode::kMDUnexpectedError, __LINE__, __FILE__, "Failed to set random bytes for a tensor.");
     }
 
-    RETURN_IF_NOT_OK(Tensor::CreateFromMemory(*new_shape, current_col.type(), buf.get(), &new_tensor));
+    RETURN_IF_NOT_OK(Tensor::CreateFromMemory(*new_shape, current_col.Type(), buf.get(), &new_tensor));
 
     // Add this tensor to the tensor row for output
     (*new_row).push_back(std::move(new_tensor));
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/sampler.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/sampler.cc
index 715bf993ab9..1441dc9f41b 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/sampler.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/sampler/sampler.cc
@@ -75,7 +75,7 @@ Status SamplerRT::CreateSamplerTensor(std::shared_ptr<Tensor> *sample_ids, int64
     col_desc_ = std::make_unique<ColDescriptor>("sampleIds", DataType(DataType::DE_INT64), TensorImpl::kFlexible, 1);
   }
   TensorShape shape(std::vector<dsize_t>(1, num_elements));
-  RETURN_IF_NOT_OK(Tensor::CreateEmpty(shape, col_desc_->type(), sample_ids));
+  RETURN_IF_NOT_OK(Tensor::CreateEmpty(shape, col_desc_->Type(), sample_ids));
   return Status::OK();
 }
 
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/text_file_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/text_file_op.cc
index db6d1b4dd43..a9dfd672e02 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/text_file_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/text_file_op.cc
@@ -225,7 +225,7 @@ Status TextFileOp::ComputeColMap() {
   // Set the column name mapping (base class field)
   if (column_name_id_map_.empty()) {
     for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
-      column_name_id_map_[data_schema_->column(i).name()] = i;
+      column_name_id_map_[data_schema_->Column(i).Name()] = i;
     }
   } else {
     MS_LOG(WARNING) << "Column name map is already set!";
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/tf_reader_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/tf_reader_op.cc
index fda009a0d75..763673de558 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/tf_reader_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/tf_reader_op.cc
@@ -123,7 +123,7 @@ Status TFReaderOp::Init() {
   }
 
   if (total_rows_ == 0) {
-    total_rows_ = data_schema_->num_rows();
+    total_rows_ = data_schema_->NumRows();
   }
   if (total_rows_ < 0) {
     RETURN_STATUS_UNEXPECTED(
@@ -332,12 +332,12 @@ Status TFReaderOp::LoadFile(const std::string &filename, int64_t start_offset, i
 Status TFReaderOp::LoadExample(const dataengine::Example *tf_file, TensorRow *out_row) {
   int32_t num_columns = data_schema_->NumColumns();
   for (int32_t col = 0; col < num_columns; ++col) {
-    const ColDescriptor current_col = data_schema_->column(col);
+    const ColDescriptor current_col = data_schema_->Column(col);
     const dataengine::Features &example_features = tf_file->features();
     const google::protobuf::Map<std::string, dataengine::Feature> &feature_map = example_features.feature();
-    auto iter_column = feature_map.find(current_col.name());
+    auto iter_column = feature_map.find(current_col.Name());
     if (iter_column == feature_map.end()) {
-      RETURN_STATUS_UNEXPECTED("Invalid parameter, column name: " + current_col.name() + " does not exist.");
+      RETURN_STATUS_UNEXPECTED("Invalid parameter, column name: " + current_col.Name() + " does not exist.");
     }
     const dataengine::Feature &column_values_list = iter_column->second;
     RETURN_IF_NOT_OK(LoadFeature(out_row, column_values_list, current_col, col));
@@ -379,7 +379,7 @@ Status TFReaderOp::LoadFeature(TensorRow *tensor_row, const dataengine::Feature
       // into the tensor
       TensorShape current_shape = TensorShape::CreateUnknownRankShape();
       RETURN_IF_NOT_OK(current_col.MaterializeTensorShape(num_elements, &current_shape));
-      RETURN_IF_NOT_OK(Tensor::CreateFromMemory(current_shape, current_col.type(), data_ptr, &ts));
+      RETURN_IF_NOT_OK(Tensor::CreateFromMemory(current_shape, current_col.Type(), data_ptr, &ts));
       break;
     }
     case dataengine::Feature::KindCase::kInt64List: {
@@ -406,10 +406,10 @@ Status TFReaderOp::LoadBytesList(const ColDescriptor &current_col, const dataeng
   // kBytesList can map to the following DE types ONLY!
   // DE_UINT8, DE_INT8
   // Must be single byte type for each element!
-  if (current_col.type() != DataType::DE_UINT8 && current_col.type() != DataType::DE_INT8 &&
-      current_col.type() != DataType::DE_STRING) {
-    std::string err_msg = "Invalid data, invalid data type for Tensor at column: " + current_col.name() +
-                          ", data type should be int8, uint8 or string, but got " + current_col.type().ToString();
+  if (current_col.Type() != DataType::DE_UINT8 && current_col.Type() != DataType::DE_INT8 &&
+      current_col.Type() != DataType::DE_STRING) {
+    std::string err_msg = "Invalid data, invalid data type for Tensor at column: " + current_col.Name() +
+                          ", data type should be int8, uint8 or string, but got " + current_col.Type().ToString();
     RETURN_STATUS_UNEXPECTED(err_msg);
   }
 
@@ -417,7 +417,7 @@ Status TFReaderOp::LoadBytesList(const ColDescriptor &current_col, const dataeng
 
   *num_elements = bytes_list.value_size();
 
-  if (current_col.type() == DataType::DE_STRING) {
+  if (current_col.Type() == DataType::DE_STRING) {
     TensorShape shape = TensorShape::CreateScalar();
     RETURN_IF_NOT_OK(current_col.MaterializeTensorShape(*num_elements, &shape));
     RETURN_IF_NOT_OK(Tensor::CreateFromByteList(bytes_list, shape, tensor));
@@ -436,14 +436,14 @@ Status TFReaderOp::LoadBytesList(const ColDescriptor &current_col, const dataeng
   int64_t pad_size = max_size;
 
   // if user provides a shape in the form of [-1, d1, 2d, ... , dn], we need to pad to d1 * d2 * ... * dn
-  if (current_col.hasShape()) {
-    TensorShape cur_shape = current_col.shape();
+  if (current_col.HasShape()) {
+    TensorShape cur_shape = current_col.Shape();
     if (cur_shape.Size() >= 2 && cur_shape[0] == TensorShape::kDimUnknown) {
       int64_t new_pad_size = 1;
       for (int i = 1; i < cur_shape.Size(); ++i) {
         if (cur_shape[i] == TensorShape::kDimUnknown) {
           std::string err_msg =
-            "Invalid data, more than one unknown dimension in the shape of column: " + current_col.name();
+            "Invalid data, more than one unknown dimension in the shape of column: " + current_col.Name();
           RETURN_STATUS_UNEXPECTED(err_msg);
         }
         new_pad_size *= cur_shape[i];
@@ -451,7 +451,7 @@ Status TFReaderOp::LoadBytesList(const ColDescriptor &current_col, const dataeng
       pad_size = new_pad_size;
     } else {
       if (cur_shape.known() && cur_shape.NumOfElements() != max_size) {
-        std::string err_msg = "Shape in schema's column '" + current_col.name() + "' is incorrect." +
+        std::string err_msg = "Shape in schema's column '" + current_col.Name() + "' is incorrect." +
                               "\nshape received: " + cur_shape.ToString() +
                               "\ntotal elements in shape received: " + std::to_string(cur_shape.NumOfElements()) +
                               "\nexpected total elements in shape: " + std::to_string(max_size);
@@ -463,7 +463,7 @@ Status TFReaderOp::LoadBytesList(const ColDescriptor &current_col, const dataeng
   // know how many elements there are and the total bytes, create tensor here:
   TensorShape current_shape = TensorShape::CreateScalar();
   RETURN_IF_NOT_OK(current_col.MaterializeTensorShape((*num_elements) * pad_size, &current_shape));
-  RETURN_IF_NOT_OK(Tensor::CreateFromByteList(bytes_list, current_shape, current_col.type(), pad_size, tensor));
+  RETURN_IF_NOT_OK(Tensor::CreateFromByteList(bytes_list, current_shape, current_col.Type(), pad_size, tensor));
 
   return Status::OK();
 }
@@ -472,9 +472,9 @@ Status TFReaderOp::LoadFloatList(const ColDescriptor &current_col, const dataeng
                                  int32_t *num_elements, std::unique_ptr<float[]> *float_array) {
   // KFloatList can only map to DE types:
   // DE_FLOAT32
-  if (current_col.type() != DataType::DE_FLOAT32) {
-    std::string err_msg = "Invalid data, invalid data type for Tensor at column: " + current_col.name() +
-                          ", data type should be string, but got " + current_col.type().ToString();
+  if (current_col.Type() != DataType::DE_FLOAT32) {
+    std::string err_msg = "Invalid data, invalid data type for Tensor at column: " + current_col.Name() +
+                          ", data type should be string, but got " + current_col.Type().ToString();
     RETURN_STATUS_UNEXPECTED(err_msg);
   }
 
@@ -494,26 +494,26 @@ Status TFReaderOp::LoadFloatList(const ColDescriptor &current_col, const dataeng
 // Determines which template type to use and calls LoadIntList
 Status TFReaderOp::LoadIntListSwitch(const ColDescriptor &current_col, const dataengine::Feature &column_values_list,
                                      int32_t *num_elements, std::shared_ptr<Tensor> *tensor) {
-  if (current_col.type() == DataType::DE_UINT64) {
+  if (current_col.Type() == DataType::DE_UINT64) {
     RETURN_IF_NOT_OK(LoadIntList<uint64_t>(current_col, column_values_list, num_elements, tensor));
-  } else if (current_col.type() == DataType::DE_INT64) {
+  } else if (current_col.Type() == DataType::DE_INT64) {
     RETURN_IF_NOT_OK(LoadIntList<int64_t>(current_col, column_values_list, num_elements, tensor));
-  } else if (current_col.type() == DataType::DE_UINT32) {
+  } else if (current_col.Type() == DataType::DE_UINT32) {
     RETURN_IF_NOT_OK(LoadIntList<uint32_t>(current_col, column_values_list, num_elements, tensor));
-  } else if (current_col.type() == DataType::DE_INT32) {
+  } else if (current_col.Type() == DataType::DE_INT32) {
     RETURN_IF_NOT_OK(LoadIntList<int32_t>(current_col, column_values_list, num_elements, tensor));
-  } else if (current_col.type() == DataType::DE_UINT16) {
+  } else if (current_col.Type() == DataType::DE_UINT16) {
     RETURN_IF_NOT_OK(LoadIntList<uint16_t>(current_col, column_values_list, num_elements, tensor));
-  } else if (current_col.type() == DataType::DE_INT16) {
+  } else if (current_col.Type() == DataType::DE_INT16) {
     RETURN_IF_NOT_OK(LoadIntList<int16_t>(current_col, column_values_list, num_elements, tensor));
-  } else if (current_col.type() == DataType::DE_UINT8) {
+  } else if (current_col.Type() == DataType::DE_UINT8) {
     RETURN_IF_NOT_OK(LoadIntList<uint8_t>(current_col, column_values_list, num_elements, tensor));
-  } else if (current_col.type() == DataType::DE_INT8) {
+  } else if (current_col.Type() == DataType::DE_INT8) {
     RETURN_IF_NOT_OK(LoadIntList<int8_t>(current_col, column_values_list, num_elements, tensor));
   } else {
-    std::string err_msg = "Invalid data, invalid datatype for Tensor at column: " + current_col.name() +
+    std::string err_msg = "Invalid data, invalid datatype for Tensor at column: " + current_col.Name() +
                           ", data type should be uint64, int64, uint32, int32, uint16, int16, uint8 or int8" +
-                          ", but got " + current_col.type().ToString();
+                          ", but got " + current_col.Type().ToString();
     RETURN_STATUS_UNEXPECTED(err_msg);
   }
 
@@ -525,9 +525,9 @@ Status TFReaderOp::LoadIntListSwitch(const ColDescriptor &current_col, const dat
 template <typename T>
 Status TFReaderOp::LoadIntList(const ColDescriptor &current_col, const dataengine::Feature &column_values_list,
                                int32_t *num_elements, std::shared_ptr<Tensor> *tensor) {
-  if (!(current_col.type().IsInt())) {
-    std::string err_msg = "Invalid data, invalid data type for Tensor at column: " + current_col.name() +
-                          ", data type should be int, but got " + current_col.type().ToString();
+  if (!(current_col.Type().IsInt())) {
+    std::string err_msg = "Invalid data, invalid data type for Tensor at column: " + current_col.Name() +
+                          ", data type should be int, but got " + current_col.Type().ToString();
     RETURN_STATUS_UNEXPECTED(err_msg);
   }
 
@@ -540,7 +540,7 @@ Status TFReaderOp::LoadIntList(const ColDescriptor &current_col, const dataengin
   // know how many elements there are, create tensor here:
   TensorShape current_shape = TensorShape::CreateUnknownRankShape();
   RETURN_IF_NOT_OK(current_col.MaterializeTensorShape(*num_elements, &current_shape));
-  RETURN_IF_NOT_OK(Tensor::CreateEmpty(current_shape, current_col.type(), tensor));
+  RETURN_IF_NOT_OK(Tensor::CreateEmpty(current_shape, current_col.Type(), tensor));
 
   int64_t i = 0;
   auto it = (*tensor)->begin<T>();
@@ -719,7 +719,7 @@ Status TFReaderOp::ComputeColMap() {
   // Construct the column name map for this operator (base class field)
   if (column_name_id_map_.empty()) {
     for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
-      column_name_id_map_[data_schema_->column(i).name()] = i;
+      column_name_id_map_[data_schema_->Column(i).Name()] = i;
     }
   } else {
     MS_LOG(WARNING) << "Column name map is already set!";
diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/voc_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/voc_op.cc
index fa94aef0d23..42c69d912e9 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/voc_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/voc_op.cc
@@ -83,8 +83,8 @@ Status VOCOp::LoadTensorRow(row_id_type row_id, TensorRow *trow) {
     std::shared_ptr<Tensor> image, target;
     const std::string kTargetFile =
       folder_path_ + std::string(kSegmentationClassFolder) + image_id + std::string(kSegmentationExtension);
-    RETURN_IF_NOT_OK(ReadImageToTensor(kImageFile, data_schema_->column(0), &image));
-    RETURN_IF_NOT_OK(ReadImageToTensor(kTargetFile, data_schema_->column(1), &target));
+    RETURN_IF_NOT_OK(ReadImageToTensor(kImageFile, data_schema_->Column(0), &image));
+    RETURN_IF_NOT_OK(ReadImageToTensor(kTargetFile, data_schema_->Column(1), &target));
     (*trow) = TensorRow(row_id, {std::move(image), std::move(target)});
     path_list = {kImageFile, kTargetFile};
   } else if (task_type_ == TaskType::Detection) {
@@ -92,7 +92,7 @@ Status VOCOp::LoadTensorRow(row_id_type row_id, TensorRow *trow) {
     TensorRow annotation;
     const std::string kAnnotationFile =
       folder_path_ + std::string(kAnnotationsFolder) + image_id + std::string(kAnnotationExtension);
-    RETURN_IF_NOT_OK(ReadImageToTensor(kImageFile, data_schema_->column(0), &image));
+    RETURN_IF_NOT_OK(ReadImageToTensor(kImageFile, data_schema_->Column(0), &image));
     RETURN_IF_NOT_OK(ReadAnnotationToTensor(kAnnotationFile, &annotation));
     trow->setId(row_id);
     trow->push_back(std::move(image));
@@ -326,7 +326,7 @@ Status VOCOp::ComputeColMap() {
   // Set the column name map (base class field)
   if (column_name_id_map_.empty()) {
     for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
-      column_name_id_map_[data_schema_->column(i).name()] = i;
+      column_name_id_map_[data_schema_->Column(i).Name()] = i;
     }
   } else {
     MS_LOG(WARNING) << "Column name map is already set!";
diff --git a/mindspore/ccsrc/minddata/dataset/engine/execution_tree.cc b/mindspore/ccsrc/minddata/dataset/engine/execution_tree.cc
index 6365622c8b4..b7240006c8d 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/execution_tree.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/execution_tree.cc
@@ -62,6 +62,7 @@ ExecutionTree::~ExecutionTree() {
 // provides it with a link to the tree. A node cannot form any relationships (parent/child) with
 // other nodes unless they are associated with the same tree.
 Status ExecutionTree::AssociateNode(const std::shared_ptr<DatasetOp> &op) {
+  RETURN_UNEXPECTED_IF_NULL(op);
   // If we are already a part of the tree, no-op
   if (op->tree_ == this) {
     return Status::OK();
@@ -88,6 +89,7 @@ Status ExecutionTree::AssociateNode(const std::shared_ptr<DatasetOp> &op) {
 
 // Sets the root node of the tree
 Status ExecutionTree::AssignRoot(const std::shared_ptr<DatasetOp> &op) {
+  RETURN_UNEXPECTED_IF_NULL(op);
   // Tree must be in building state before we can assign root to it
   if (tree_state_ != kDeTStateBuilding) {
     std::string err_msg =
@@ -121,6 +123,9 @@ void ExecutionTree::Print(std::ostream &out, const std::shared_ptr<DatasetOp> &o
 // A helper functions for doing the recursive printing
 void ExecutionTree::PrintNode(std::ostream &out, const std::shared_ptr<DatasetOp> &dataset_op, std::string indent,
                               bool last, bool detailed) const {
+  if (dataset_op == nullptr) {
+    return;
+  }
   // Decide which printer to use based on detailed arg.
   if (!detailed) {
     out << indent << "+- " << *dataset_op;
diff --git a/mindspore/ccsrc/minddata/dataset/engine/gnn/graph_data_impl.cc b/mindspore/ccsrc/minddata/dataset/engine/gnn/graph_data_impl.cc
index 100cdb0c605..56d9fa7fd7a 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/gnn/graph_data_impl.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/gnn/graph_data_impl.cc
@@ -41,6 +41,7 @@ GraphDataImpl::GraphDataImpl(std::string dataset_file, int32_t num_workers, bool
 GraphDataImpl::~GraphDataImpl() {}
 
 Status GraphDataImpl::GetAllNodes(NodeType node_type, std::shared_ptr<Tensor> *out) {
+  RETURN_UNEXPECTED_IF_NULL(out);
   auto itr = node_type_map_.find(node_type);
   if (itr == node_type_map_.end()) {
     std::string err_msg = "Invalid node type:" + std::to_string(node_type);
@@ -54,6 +55,7 @@ Status GraphDataImpl::GetAllNodes(NodeType node_type, std::shared_ptr<Tensor> *o
 template <typename T>
 Status GraphDataImpl::CreateTensorByVector(const std::vector<std::vector<T>> &data, DataType type,
                                            std::shared_ptr<Tensor> *out) {
+  RETURN_UNEXPECTED_IF_NULL(out);
   if (!type.IsCompatible<T>()) {
     RETURN_STATUS_UNEXPECTED("Data type not compatible");
   }
@@ -96,6 +98,7 @@ Status GraphDataImpl::ComplementVector(std::vector<std::vector<T>> *data, size_t
 }
 
 Status GraphDataImpl::GetAllEdges(EdgeType edge_type, std::shared_ptr<Tensor> *out) {
+  RETURN_UNEXPECTED_IF_NULL(out);
   auto itr = edge_type_map_.find(edge_type);
   if (itr == edge_type_map_.end()) {
     std::string err_msg = "Invalid edge type:" + std::to_string(edge_type);
@@ -110,6 +113,7 @@ Status GraphDataImpl::GetNodesFromEdges(const std::vector<EdgeIdType> &edge_list
   if (edge_list.empty()) {
     RETURN_STATUS_UNEXPECTED("Input edge_list is empty");
   }
+  RETURN_UNEXPECTED_IF_NULL(out);
 
   std::vector<std::vector<NodeIdType>> node_list;
   node_list.reserve(edge_list.size());
@@ -156,6 +160,7 @@ Status GraphDataImpl::GetAllNeighbors(const std::vector<NodeIdType> &node_list,
                                       const OutputFormat &format, std::shared_ptr<Tensor> *out) {
   CHECK_FAIL_RETURN_UNEXPECTED(!node_list.empty(), "Input node_list is empty.");
   RETURN_IF_NOT_OK(CheckNeighborType(neighbor_type));
+  RETURN_UNEXPECTED_IF_NULL(out);
 
   std::vector<std::vector<NodeIdType>> neighbors;
 
@@ -251,6 +256,7 @@ Status GraphDataImpl::GetSampledNeighbors(const std::vector<NodeIdType> &node_li
   for (const auto &type : neighbor_types) {
     RETURN_IF_NOT_OK(CheckNeighborType(type));
   }
+  RETURN_UNEXPECTED_IF_NULL(out);
   std::vector<std::vector<NodeIdType>> neighbors_vec(node_list.size());
   for (size_t node_idx = 0; node_idx < node_list.size(); ++node_idx) {
     std::shared_ptr<Node> input_node;
@@ -285,6 +291,7 @@ Status GraphDataImpl::NegativeSample(const std::vector<NodeIdType> &data, const
                                      size_t *start_index, const std::unordered_set<NodeIdType> &exclude_data,
                                      int32_t samples_num, std::vector<NodeIdType> *out_samples) {
   CHECK_FAIL_RETURN_UNEXPECTED(!data.empty(), "Input data is empty.");
+  RETURN_UNEXPECTED_IF_NULL(start_index);
   size_t index = *start_index;
   for (size_t i = index; i < shuffled_ids.size(); ++i) {
     ++index;
@@ -305,6 +312,7 @@ Status GraphDataImpl::GetNegSampledNeighbors(const std::vector<NodeIdType> &node
   CHECK_FAIL_RETURN_UNEXPECTED(!node_list.empty(), "Input node_list is empty.");
   RETURN_IF_NOT_OK(CheckSamplesNum(samples_num));
   RETURN_IF_NOT_OK(CheckNeighborType(neg_neighbor_type));
+  RETURN_UNEXPECTED_IF_NULL(out);
 
   const std::vector<NodeIdType> &all_nodes = node_type_map_[neg_neighbor_type];
   std::vector<NodeIdType> shuffled_id(all_nodes.size());
@@ -321,9 +329,9 @@ Status GraphDataImpl::GetNegSampledNeighbors(const std::vector<NodeIdType> &node
     std::vector<NodeIdType> neighbors;
     RETURN_IF_NOT_OK(node->GetAllNeighbors(neg_neighbor_type, &neighbors));
     std::unordered_set<NodeIdType> exclude_nodes;
-    std::transform(neighbors.begin(), neighbors.end(),
-                   std::insert_iterator<std::unordered_set<NodeIdType>>(exclude_nodes, exclude_nodes.begin()),
-                   [](const NodeIdType node) { return node; });
+    (void)std::transform(neighbors.begin(), neighbors.end(),
+                         std::insert_iterator<std::unordered_set<NodeIdType>>(exclude_nodes, exclude_nodes.begin()),
+                         [](const NodeIdType node) { return node; });
     neg_neighbors_vec[node_idx].emplace_back(node->id());
     if (all_nodes.size() > exclude_nodes.size()) {
       while (neg_neighbors_vec[node_idx].size() < samples_num + 1) {
@@ -355,6 +363,7 @@ Status GraphDataImpl::GetNegSampledNeighbors(const std::vector<NodeIdType> &node
 Status GraphDataImpl::RandomWalk(const std::vector<NodeIdType> &node_list, const std::vector<NodeType> &meta_path,
                                  float step_home_param, float step_away_param, NodeIdType default_node,
                                  std::shared_ptr<Tensor> *out) {
+  RETURN_UNEXPECTED_IF_NULL(out);
   RETURN_IF_NOT_OK(random_walk_.Build(node_list, meta_path, step_home_param, step_away_param, default_node));
   std::vector<std::vector<NodeIdType>> walks;
   RETURN_IF_NOT_OK(random_walk_.SimulateWalk(&walks));
@@ -363,6 +372,7 @@ Status GraphDataImpl::RandomWalk(const std::vector<NodeIdType> &node_list, const
 }
 
 Status GraphDataImpl::GetNodeDefaultFeature(FeatureType feature_type, std::shared_ptr<Feature> *out_feature) {
+  RETURN_UNEXPECTED_IF_NULL(out_feature);
   auto itr = default_node_feature_map_.find(feature_type);
   if (itr == default_node_feature_map_.end()) {
     std::string err_msg = "Invalid feature type:" + std::to_string(feature_type);
@@ -374,6 +384,7 @@ Status GraphDataImpl::GetNodeDefaultFeature(FeatureType feature_type, std::share
 }
 
 Status GraphDataImpl::GetEdgeDefaultFeature(FeatureType feature_type, std::shared_ptr<Feature> *out_feature) {
+  RETURN_UNEXPECTED_IF_NULL(out_feature);
   auto itr = default_edge_feature_map_.find(feature_type);
   if (itr == default_edge_feature_map_.end()) {
     std::string err_msg = "Invalid feature type:" + std::to_string(feature_type);
@@ -390,6 +401,7 @@ Status GraphDataImpl::GetNodeFeature(const std::shared_ptr<Tensor> &nodes,
     RETURN_STATUS_UNEXPECTED("Input nodes is empty");
   }
   CHECK_FAIL_RETURN_UNEXPECTED(!feature_types.empty(), "Input feature_types is empty");
+  RETURN_UNEXPECTED_IF_NULL(out);
   TensorRow tensors;
   for (const auto &f_type : feature_types) {
     std::shared_ptr<Feature> default_feature;
@@ -436,6 +448,7 @@ Status GraphDataImpl::GetNodeFeatureSharedMemory(const std::shared_ptr<Tensor> &
   if (!nodes || nodes->Size() == 0) {
     RETURN_STATUS_UNEXPECTED("Input nodes is empty");
   }
+  RETURN_UNEXPECTED_IF_NULL(out);
   TensorShape shape = nodes->shape().AppendDim(2);
   std::shared_ptr<Tensor> fea_tensor;
   RETURN_IF_NOT_OK(Tensor::CreateEmpty(shape, DataType(DataType::DE_INT64), &fea_tensor));
@@ -478,6 +491,7 @@ Status GraphDataImpl::GetEdgeFeature(const std::shared_ptr<Tensor> &edges,
     RETURN_STATUS_UNEXPECTED("Input edges is empty");
   }
   CHECK_FAIL_RETURN_UNEXPECTED(!feature_types.empty(), "Input feature_types is empty");
+  RETURN_UNEXPECTED_IF_NULL(out);
   TensorRow tensors;
   for (const auto &f_type : feature_types) {
     std::shared_ptr<Feature> default_feature;
@@ -520,6 +534,7 @@ Status GraphDataImpl::GetEdgeFeatureSharedMemory(const std::shared_ptr<Tensor> &
   if (!edges || edges->Size() == 0) {
     RETURN_STATUS_UNEXPECTED("Input edges is empty");
   }
+  RETURN_UNEXPECTED_IF_NULL(out);
   TensorShape shape = edges->shape().AppendDim(2);
   std::shared_ptr<Tensor> fea_tensor;
   RETURN_IF_NOT_OK(Tensor::CreateEmpty(shape, DataType(DataType::DE_INT64), &fea_tensor));
@@ -554,14 +569,15 @@ Status GraphDataImpl::Init() {
 }
 
 Status GraphDataImpl::GetMetaInfo(MetaInfo *meta_info) {
+  RETURN_UNEXPECTED_IF_NULL(meta_info);
   meta_info->node_type.resize(node_type_map_.size());
-  std::transform(node_type_map_.begin(), node_type_map_.end(), meta_info->node_type.begin(),
-                 [](auto itr) { return itr.first; });
+  (void)std::transform(node_type_map_.begin(), node_type_map_.end(), meta_info->node_type.begin(),
+                       [](auto itr) { return itr.first; });
   std::sort(meta_info->node_type.begin(), meta_info->node_type.end());
 
   meta_info->edge_type.resize(edge_type_map_.size());
-  std::transform(edge_type_map_.begin(), edge_type_map_.end(), meta_info->edge_type.begin(),
-                 [](auto itr) { return itr.first; });
+  (void)std::transform(edge_type_map_.begin(), edge_type_map_.end(), meta_info->edge_type.begin(),
+                       [](auto itr) { return itr.first; });
   std::sort(meta_info->edge_type.begin(), meta_info->edge_type.end());
 
   for (const auto &node : node_type_map_) {
@@ -594,6 +610,7 @@ Status GraphDataImpl::GetMetaInfo(MetaInfo *meta_info) {
 
 #ifdef ENABLE_PYTHON
 Status GraphDataImpl::GraphInfo(py::dict *out) {
+  RETURN_UNEXPECTED_IF_NULL(out);
   MetaInfo meta_info;
   RETURN_IF_NOT_OK(GetMetaInfo(&meta_info));
   (*out)["node_type"] = py::cast(meta_info.node_type);
@@ -616,6 +633,7 @@ Status GraphDataImpl::LoadNodeAndEdge() {
 }
 
 Status GraphDataImpl::GetNodeByNodeId(NodeIdType id, std::shared_ptr<Node> *node) {
+  RETURN_UNEXPECTED_IF_NULL(node);
   auto itr = node_id_map_.find(id);
   if (itr == node_id_map_.end()) {
     std::string err_msg = "Invalid node id:" + std::to_string(id);
@@ -627,6 +645,7 @@ Status GraphDataImpl::GetNodeByNodeId(NodeIdType id, std::shared_ptr<Node> *node
 }
 
 Status GraphDataImpl::GetEdgeByEdgeId(EdgeIdType id, std::shared_ptr<Edge> *edge) {
+  RETURN_UNEXPECTED_IF_NULL(edge);
   auto itr = edge_id_map_.find(id);
   if (itr == edge_id_map_.end()) {
     std::string err_msg = "Invalid edge id:" + std::to_string(id);
@@ -682,6 +701,7 @@ Status GraphDataImpl::RandomWalkBase::Build(const std::vector<NodeIdType> &node_
 }
 
 Status GraphDataImpl::RandomWalkBase::Node2vecWalk(const NodeIdType &start_node, std::vector<NodeIdType> *walk_path) {
+  RETURN_UNEXPECTED_IF_NULL(walk_path);
   // Simulate a random walk starting from start node.
   auto walk = std::vector<NodeIdType>(1, start_node);  // walk is an vector
   // walk simulate
@@ -722,6 +742,7 @@ Status GraphDataImpl::RandomWalkBase::Node2vecWalk(const NodeIdType &start_node,
 }
 
 Status GraphDataImpl::RandomWalkBase::SimulateWalk(std::vector<std::vector<NodeIdType>> *walks) {
+  RETURN_UNEXPECTED_IF_NULL(walks);
   for (int32_t i = 0; i < num_walks_; ++i) {
     for (const auto &node : node_list_) {
       std::vector<NodeIdType> walk;
@@ -734,6 +755,7 @@ Status GraphDataImpl::RandomWalkBase::SimulateWalk(std::vector<std::vector<NodeI
 
 Status GraphDataImpl::RandomWalkBase::GetNodeProbability(const NodeIdType &node_id, const NodeType &node_type,
                                                          std::shared_ptr<StochasticIndex> *node_probability) {
+  RETURN_UNEXPECTED_IF_NULL(node_probability);
   // Generate alias nodes
   std::shared_ptr<Node> node;
   RETURN_IF_NOT_OK(graph_->GetNodeByNodeId(node_id, &node));
@@ -749,6 +771,7 @@ Status GraphDataImpl::RandomWalkBase::GetNodeProbability(const NodeIdType &node_
 Status GraphDataImpl::RandomWalkBase::GetEdgeProbability(const NodeIdType &src, const NodeIdType &dst,
                                                          uint32_t meta_path_index,
                                                          std::shared_ptr<StochasticIndex> *edge_probability) {
+  RETURN_UNEXPECTED_IF_NULL(edge_probability);
   // Get the alias edge setup lists for a given edge.
   std::shared_ptr<Node> src_node;
   RETURN_IF_NOT_OK(graph_->GetNodeByNodeId(src, &src_node));
@@ -760,6 +783,8 @@ Status GraphDataImpl::RandomWalkBase::GetEdgeProbability(const NodeIdType &src,
   std::vector<NodeIdType> dst_neighbors;
   RETURN_IF_NOT_OK(dst_node->GetAllNeighbors(meta_path_[meta_path_index + 1], &dst_neighbors, true));
 
+  CHECK_FAIL_RETURN_UNEXPECTED(step_home_param_ != 0, "Invalid data, step home parameter can't be zero.");
+  CHECK_FAIL_RETURN_UNEXPECTED(step_away_param_ != 0, "Invalid data, step away parameter can't be zero.");
   std::sort(dst_neighbors.begin(), dst_neighbors.end());
   std::vector<float> non_normalized_probability;
   for (const auto &dst_nbr : dst_neighbors) {
diff --git a/mindspore/ccsrc/minddata/dataset/engine/gnn/graph_shared_memory.cc b/mindspore/ccsrc/minddata/dataset/engine/gnn/graph_shared_memory.cc
index 0bf4575517c..e77525b7770 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/gnn/graph_shared_memory.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/gnn/graph_shared_memory.cc
@@ -17,6 +17,8 @@
 #include "minddata/dataset/engine/gnn/graph_shared_memory.h"
 
 #include <string>
+#include "debug/common.h"
+#include "utils/ms_utils.h"
 #include "minddata/dataset/util/log_adapter.h"
 
 namespace mindspore {
@@ -51,7 +53,9 @@ GraphSharedMemory::~GraphSharedMemory() {
 Status GraphSharedMemory::CreateSharedMemory() {
   if (memory_key_ == -1) {
     // ftok to generate unique key
-    memory_key_ = ftok(mr_file_.data(), kGnnSharedMemoryId);
+    auto realpath = Common::GetRealPath(mr_file_);
+    CHECK_FAIL_RETURN_UNEXPECTED(realpath.has_value(), "Get real path failed, path=" + mr_file_);
+    memory_key_ = ftok(common::SafeCStr(realpath.value()), kGnnSharedMemoryId);
     CHECK_FAIL_RETURN_UNEXPECTED(memory_key_ != -1, "Failed to get key of shared memory. file_name:" + mr_file_);
     std::stringstream stream;
     stream << std::hex << memory_key_;
@@ -89,6 +93,7 @@ Status GraphSharedMemory::DeleteSharedMemory() {
 
 Status GraphSharedMemory::SharedMemoryImpl(const int &shmflg) {
   // shmget returns an identifier in shmid
+  CHECK_FAIL_RETURN_UNEXPECTED(memory_size_ >= 0, "Invalid memory size, should be greater than zero.");
   int shmid = shmget(memory_key_, memory_size_, shmflg);
   CHECK_FAIL_RETURN_UNEXPECTED(shmid != -1, "Failed to get shared memory. key=0x" + memory_key_str_);
 
@@ -103,6 +108,7 @@ Status GraphSharedMemory::SharedMemoryImpl(const int &shmflg) {
 Status GraphSharedMemory::InsertData(const uint8_t *data, int64_t len, int64_t *offset) {
   CHECK_FAIL_RETURN_UNEXPECTED(data, "Input data is nullptr.");
   CHECK_FAIL_RETURN_UNEXPECTED(len > 0, "Input len is invalid.");
+  CHECK_FAIL_RETURN_UNEXPECTED(offset, "Input offset is nullptr.");
 
   std::lock_guard<std::mutex> lck(mutex_);
   CHECK_FAIL_RETURN_UNEXPECTED((memory_size_ - memory_offset_ >= len),
diff --git a/mindspore/ccsrc/minddata/dataset/engine/gpu_item_connector.h b/mindspore/ccsrc/minddata/dataset/engine/gpu_item_connector.h
index 680fdc27561..716fd23a909 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/gpu_item_connector.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/gpu_item_connector.h
@@ -46,6 +46,7 @@ class GpuItemConnector : public Connector<std::vector<device::DataItemGpu>> {
   }
 
   Status Pop(int32_t worker_id, std::vector<device::DataItemGpu> *result) noexcept override {
+    RETURN_UNEXPECTED_IF_NULL(result);
     {
       MS_ASSERT(worker_id < num_consumers_);
       std::unique_lock<std::mutex> lock(m_);
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/cache/CMakeLists.txt b/mindspore/ccsrc/minddata/dataset/engine/ir/cache/CMakeLists.txt
index bfb4ebcb3b9..696a2207d5b 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/cache/CMakeLists.txt
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/cache/CMakeLists.txt
@@ -2,4 +2,5 @@ file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc"
 set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
 add_library(engine-ir-cache OBJECT
         pre_built_dataset_cache.cc
-        dataset_cache_impl.cc)
+        dataset_cache_impl.cc
+        dataset_cache.cc)
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/cache/dataset_cache.h b/mindspore/ccsrc/minddata/dataset/engine/ir/cache/dataset_cache.h
index 5c1c9240726..0e49eb80687 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/cache/dataset_cache.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/cache/dataset_cache.h
@@ -35,6 +35,10 @@ class DatasetCache {
   virtual Status CreateCacheMergeOp(int32_t num_workers, int32_t connector_queue_size,
                                     std::shared_ptr<DatasetOp> *ds) = 0;
   virtual Status to_json(nlohmann::json *out_json) { return Status::OK(); }
+
+#ifndef ENABLE_ANDROID
+  static Status from_json(nlohmann::json json_obj, std::shared_ptr<DatasetCache> *cache);
+#endif
 };
 }  // namespace mindspore::dataset
 
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.cc
index 1d191a2800b..ac9a0002989 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.cc
@@ -169,5 +169,19 @@ Status BatchNode::to_json(nlohmann::json *out_json) {
   *out_json = args;
   return Status::OK();
 }
+
+Status BatchNode::from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> ds,
+                            std::shared_ptr<DatasetNode> *result) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_parallel_workers") != json_obj.end(),
+                               "Failed to find num_parallel_workers");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("batch_size") != json_obj.end(), "Failed to find batch_size");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("drop_remainder") != json_obj.end(), "Failed to find drop_remainder");
+  int32_t batch_size = json_obj["batch_size"];
+  bool drop_remainder = json_obj["drop_remainder"];
+  *result = std::make_shared<BatchNode>(ds, batch_size, drop_remainder);
+  (*result)->SetNumWorkers(json_obj["num_parallel_workers"]);
+  return Status::OK();
+}
+
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.h
index 6f0c767a95a..89d2cda3680 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/batch_node.h
@@ -105,6 +105,14 @@ class BatchNode : public DatasetNode {
   /// \return Status of the function
   Status to_json(nlohmann::json *out_json) override;
 
+  /// \brief Function for read dataset operation from json
+  /// \param[in] json_obj The JSON object to be deserialized
+  /// \param[in] ds dataset node constructed
+  /// \param[out] result Deserialized dataset after the operation
+  /// \return Status The status code returned
+  static Status from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> ds,
+                          std::shared_ptr<DatasetNode> *result);
+
  private:
   int32_t batch_size_;
   bool drop_remainder_;
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.cc
index 6a062658ecd..a08adbcdabb 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.cc
@@ -30,6 +30,7 @@ namespace dataset {
 // Helper function to compute a default shuffle size
 Status ComputeShuffleSize(int64_t num_files, int64_t num_devices, int64_t num_rows, int64_t total_rows,
                           int64_t *shuffle_size) {
+  RETURN_UNEXPECTED_IF_NULL(shuffle_size);
   const int64_t average_files_multiplier = 4;
   const int64_t shuffle_max = 10000;
   int64_t avg_rows_per_file = 0;
@@ -59,6 +60,7 @@ Status ComputeShuffleSize(int64_t num_files, int64_t num_devices, int64_t num_ro
 // Helper function to inject a shuffle operator over top of current operator being built
 Status AddShuffleOp(int64_t num_files, int64_t num_devices, int64_t num_rows, int64_t total_rows,
                     int32_t connector_que_size, std::shared_ptr<DatasetOp> *shuffle_op) {
+  RETURN_UNEXPECTED_IF_NULL(shuffle_op);
   int64_t shuffle_size = 0;
   RETURN_IF_NOT_OK(ComputeShuffleSize(num_files, num_devices, num_rows, total_rows, &shuffle_size));
   MS_LOG(INFO) << "Dataset::AddShuffleOp - num_rows: " << num_rows << ", shuffle_size: " << shuffle_size;
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.h
index a591484cc4b..6a28776204f 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/dataset_node.h
@@ -79,7 +79,6 @@ constexpr char kCelebANode[] = "CelebADataset";
 constexpr char kCifar100Node[] = "Cifar100Dataset";
 constexpr char kCifar10Node[] = "Cifar10Dataset";
 constexpr char kCLUENode[] = "CLUEDataset";
-constexpr char kCmuArcticNode[] = "CmuArcticDataset";
 constexpr char kCocoNode[] = "CocoDataset";
 constexpr char kCSVNode[] = "CSVDataset";
 constexpr char kFlickrNode[] = "FlickrDataset";
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.cc
index 0443d7f387b..bf622d6aa71 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.cc
@@ -22,6 +22,9 @@
 #include <utility>
 #include <vector>
 
+#ifndef ENABLE_ANDROID
+#include "minddata/dataset/engine/serdes.h"
+#endif
 #include "minddata/dataset/engine/datasetops/map_op/map_op.h"
 #include "minddata/dataset/engine/opt/pass.h"
 #include "minddata/dataset/kernels/ir/tensor_operation.h"
@@ -56,6 +59,7 @@ void MapNode::Print(std::ostream &out) const {
 }
 
 Status MapNode::Build(std::vector<std::shared_ptr<DatasetOp>> *const node_ops) {
+  RETURN_UNEXPECTED_IF_NULL(node_ops);
   std::vector<std::shared_ptr<TensorOp>> tensor_ops;
 
   // Build tensorOp from tensorOperation vector
@@ -128,12 +132,16 @@ Status MapNode::ValidateParams() {
 
 // Visitor accepting method for IRNodePass
 Status MapNode::Accept(IRNodePass *const p, bool *const modified) {
+  RETURN_UNEXPECTED_IF_NULL(p);
+  RETURN_UNEXPECTED_IF_NULL(modified);
   // Downcast shared pointer then call visitor
   return p->Visit(shared_from_base<MapNode>(), modified);
 }
 
 // Visitor accepting method for IRNodePass
 Status MapNode::AcceptAfter(IRNodePass *const p, bool *const modified) {
+  RETURN_UNEXPECTED_IF_NULL(p);
+  RETURN_UNEXPECTED_IF_NULL(modified);
   // Downcast shared pointer then call visitor
   return p->VisitAfter(shared_from_base<MapNode>(), modified);
 }
@@ -144,6 +152,7 @@ void MapNode::setOperations(const std::vector<std::shared_ptr<TensorOperation>>
 std::vector<std::shared_ptr<TensorOperation>> MapNode::operations() { return operations_; }
 
 Status MapNode::to_json(nlohmann::json *out_json) {
+  RETURN_UNEXPECTED_IF_NULL(out_json);
   nlohmann::json args;
   args["num_parallel_workers"] = num_workers_;
   args["input_columns"] = input_columns_;
@@ -154,10 +163,10 @@ Status MapNode::to_json(nlohmann::json *out_json) {
     RETURN_IF_NOT_OK(cache_->to_json(&cache_args));
     args["cache"] = cache_args;
   }
-
   std::vector<nlohmann::json> ops;
   std::vector<int32_t> cbs;
   for (auto op : operations_) {
+    RETURN_UNEXPECTED_IF_NULL(op);
     nlohmann::json op_args;
     RETURN_IF_NOT_OK(op->to_json(&op_args));
     if (op->Name() == "PyFuncOp") {
@@ -170,13 +179,33 @@ Status MapNode::to_json(nlohmann::json *out_json) {
     }
   }
   args["operations"] = ops;
-  std::transform(callbacks_.begin(), callbacks_.end(), std::back_inserter(cbs),
-                 [](std::shared_ptr<DSCallback> cb) -> int32_t { return cb->step_size(); });
+  (void)std::transform(callbacks_.begin(), callbacks_.end(), std::back_inserter(cbs),
+                       [](std::shared_ptr<DSCallback> cb) -> int32_t { return cb != nullptr ? cb->step_size() : 0; });
   args["callback"] = cbs;
   *out_json = args;
   return Status::OK();
 }
 
+#ifndef ENABLE_ANDROID
+Status MapNode::from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> ds,
+                          std::shared_ptr<DatasetNode> *result) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_parallel_workers") != json_obj.end(),
+                               "Failed to find num_parallel_workers");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("input_columns") != json_obj.end(), "Failed to find input_columns");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("output_columns") != json_obj.end(), "Failed to find output_columns");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("project_columns") != json_obj.end(), "Failed to find project_columns");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("operations") != json_obj.end(), "Failed to find operations");
+  std::vector<std::string> input_columns = json_obj["input_columns"];
+  std::vector<std::string> output_columns = json_obj["output_columns"];
+  std::vector<std::string> project_columns = json_obj["project_columns"];
+  std::vector<std::shared_ptr<TensorOperation>> operations;
+  RETURN_IF_NOT_OK(Serdes::ConstructTensorOps(json_obj["operations"], &operations));
+  *result = std::make_shared<MapNode>(ds, operations, input_columns, output_columns, project_columns);
+  (*result)->SetNumWorkers(json_obj["num_parallel_workers"]);
+  return Status::OK();
+}
+#endif
+
 // Gets the dataset size
 Status MapNode::GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_getter, bool estimate,
                                int64_t *dataset_size) {
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.h
index d379d080adb..511f7e0e2cf 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/map_node.h
@@ -93,6 +93,16 @@ class MapNode : public DatasetNode {
   /// \return Status of the function
   Status to_json(nlohmann::json *out_json) override;
 
+#ifndef ENABLE_ANDROID
+  /// \brief Function for read dataset operation from json
+  /// \param[in] json_obj The JSON object to be deserialized
+  /// \param[in] ds dataset node constructed
+  /// \param[out] result Deserialized dataset after the operation
+  /// \return Status The status code returned
+  static Status from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> ds,
+                          std::shared_ptr<DatasetNode> *result);
+#endif
+
   /// \brief Base-class override for GetDatasetSize
   /// \param[in] size_getter Shared pointer to DatasetSizeGetter
   /// \param[in] estimate This is only supported by some of the ops and it's used to speed up the process of getting
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/project_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/project_node.cc
index d5987d1c8a5..bb297ebc026 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/project_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/project_node.cc
@@ -66,5 +66,13 @@ Status ProjectNode::to_json(nlohmann::json *out_json) {
   *out_json = args;
   return Status::OK();
 }
+
+Status ProjectNode::from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> ds,
+                              std::shared_ptr<DatasetNode> *result) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("columns") != json_obj.end(), "Failed to find columns");
+  std::vector<std::string> columns = json_obj["columns"];
+  *result = std::make_shared<ProjectNode>(ds, columns);
+  return Status::OK();
+}
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/project_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/project_node.h
index 791bf8f865c..b439580f433 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/project_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/project_node.h
@@ -63,6 +63,14 @@ class ProjectNode : public DatasetNode {
   /// \return Status of the function
   Status to_json(nlohmann::json *out_json) override;
 
+  /// \brief Function for read dataset operation from json
+  /// \param[in] json_obj The JSON object to be deserialized
+  /// \param[in] ds dataset node constructed
+  /// \param[out] result Deserialized dataset after the operation
+  /// \return Status The status code returned
+  static Status from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> ds,
+                          std::shared_ptr<DatasetNode> *result);
+
  private:
   std::vector<std::string> columns_;
 };
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.cc
index b0ff4f19f5f..4f9b0f759ec 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.cc
@@ -72,5 +72,16 @@ Status RenameNode::to_json(nlohmann::json *out_json) {
   *out_json = args;
   return Status::OK();
 }
+
+Status RenameNode::from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> ds,
+                             std::shared_ptr<DatasetNode> *result) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("input_columns") != json_obj.end(), "Failed to find input_columns");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("output_columns") != json_obj.end(), "Failed to find output_columns");
+  std::vector<std::string> input_columns = json_obj["input_columns"];
+  std::vector<std::string> output_columns = json_obj["output_columns"];
+  *result = std::make_shared<RenameNode>(ds, input_columns, output_columns);
+  return Status::OK();
+}
+
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.h
index 23ec767cd09..5b1a0e46bbf 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/rename_node.h
@@ -65,6 +65,14 @@ class RenameNode : public DatasetNode {
   /// \return Status of the function
   Status to_json(nlohmann::json *out_json) override;
 
+  /// \brief Function for read dataset operation from json
+  /// \param[in] json_obj The JSON object to be deserialized
+  /// \param[in] ds dataset node constructed
+  /// \param[out] result Deserialized dataset after the operation
+  /// \return Status The status code returned
+  static Status from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> ds,
+                          std::shared_ptr<DatasetNode> *result);
+
  private:
   std::vector<std::string> input_columns_;
   std::vector<std::string> output_columns_;
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.cc
index aa66f5a0505..4d2357ec4ca 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.cc
@@ -104,5 +104,14 @@ Status RepeatNode::to_json(nlohmann::json *out_json) {
   *out_json = args;
   return Status::OK();
 }
+
+Status RepeatNode::from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> ds,
+                             std::shared_ptr<DatasetNode> *result) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("count") != json_obj.end(), "Failed to find count");
+  int32_t count = json_obj["count"];
+  *result = std::make_shared<RepeatNode>(ds, count);
+  return Status::OK();
+}
+
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.h
index 9ee902b7e96..c8f26b5b036 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/repeat_node.h
@@ -123,6 +123,14 @@ class RepeatNode : public DatasetNode {
   /// \return Status of the function
   Status to_json(nlohmann::json *out_json) override;
 
+  /// \brief Function for read dataset operation from json
+  /// \param[in] json_obj The JSON object to be deserialized
+  /// \param[in] ds dataset node constructed
+  /// \param[out] result Deserialized dataset after the operation
+  /// \return Status The status code returned
+  static Status from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> ds,
+                          std::shared_ptr<DatasetNode> *result);
+
  protected:
   std::shared_ptr<RepeatOp> op_;                // keep its corresponding run-time op of EpochCtrlNode and RepeatNode
   std::shared_ptr<RepeatNode> reset_ancestor_;  // updated its immediate Repeat/EpochCtrl ancestor in GeneratorNodePass
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.cc
index 39015fd9c87..ef222d89804 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.cc
@@ -66,9 +66,19 @@ Status ShuffleNode::ValidateParams() {
 Status ShuffleNode::to_json(nlohmann::json *out_json) {
   nlohmann::json args;
   args["buffer_size"] = shuffle_size_;
-  args["reshuffle_each_epoch"] = reset_every_epoch_;
+  args["reset_each_epoch"] = reset_every_epoch_;
   *out_json = args;
   return Status::OK();
 }
+
+Status ShuffleNode::from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> ds,
+                              std::shared_ptr<DatasetNode> *result) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("buffer_size") != json_obj.end(), "Failed to find buffer_size");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("reset_each_epoch") != json_obj.end(), "Failed to find reset_each_epoch");
+  int32_t buffer_size = json_obj["buffer_size"];
+  bool reset_every_epoch = json_obj["reset_each_epoch"];
+  *result = std::make_shared<ShuffleNode>(ds, buffer_size, reset_every_epoch);
+  return Status::OK();
+}
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.h
index 5482e7f1a15..98f44721f32 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/shuffle_node.h
@@ -63,6 +63,14 @@ class ShuffleNode : public DatasetNode {
   /// \return Status of the function
   Status to_json(nlohmann::json *out_json) override;
 
+  /// \brief Function for read dataset operation from json
+  /// \param[in] json_obj The JSON object to be deserialized
+  /// \param[in] ds dataset node constructed
+  /// \param[out] result Deserialized dataset after the operation
+  /// \return Status The status code returned
+  static Status from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> ds,
+                          std::shared_ptr<DatasetNode> *result);
+
  private:
   int32_t shuffle_size_;
   uint32_t shuffle_seed_;
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/skip_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/skip_node.cc
index 7ea4e1a24a8..99489078ee8 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/skip_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/skip_node.cc
@@ -93,5 +93,13 @@ Status SkipNode::to_json(nlohmann::json *out_json) {
   *out_json = args;
   return Status::OK();
 }
+
+Status SkipNode::from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> ds,
+                           std::shared_ptr<DatasetNode> *result) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("count") != json_obj.end(), "Failed to find count");
+  int32_t count = json_obj["count"];
+  *result = std::make_shared<SkipNode>(ds, count);
+  return Status::OK();
+}
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/skip_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/skip_node.h
index e98e49036cb..e52a26fdb18 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/skip_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/skip_node.h
@@ -88,6 +88,14 @@ class SkipNode : public DatasetNode {
   /// \return Status of the function
   Status to_json(nlohmann::json *out_json) override;
 
+  /// \brief Function for read dataset operation from json
+  /// \param[in] json_obj The JSON object to be deserialized
+  /// \param[in] ds dataset node constructed
+  /// \param[out] result Deserialized dataset after the operation
+  /// \return Status The status code returned
+  static Status from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> ds,
+                          std::shared_ptr<DatasetNode> *result);
+
  private:
   int32_t skip_count_;
 };
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/album_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/album_node.cc
index 54d191be18a..543ab401990 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/album_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/album_node.cc
@@ -106,8 +106,8 @@ Status AlbumNode::GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_
   }
   std::set<std::string> extensions = {".json", ".JSON"};
 
-  while (dirItr->hasNext()) {
-    Path file = dirItr->next();
+  while (dirItr->HasNext()) {
+    Path file = dirItr->Next();
     if (extensions.empty() || extensions.find(file.Extension()) != extensions.end()) {
       num_rows += 1;
     }
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/celeba_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/celeba_node.cc
index 0df0670db24..0771a8dfde4 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/celeba_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/celeba_node.cc
@@ -25,6 +25,9 @@
 
 #include "debug/common.h"
 #include "minddata/dataset/engine/datasetops/source/celeba_op.h"
+#ifndef ENABLE_ANDROID
+#include "minddata/dataset/engine/serdes.h"
+#endif
 #include "minddata/dataset/util/status.h"
 namespace mindspore {
 namespace dataset {
@@ -182,5 +185,28 @@ Status CelebANode::to_json(nlohmann::json *out_json) {
   *out_json = args;
   return Status::OK();
 }
+
+#ifndef ENABLE_ANDROID
+Status CelebANode::from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_parallel_workers") != json_obj.end(),
+                               "Failed to find num_parallel_workers");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("dataset_dir") != json_obj.end(), "Failed to find dataset_dir");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("usage") != json_obj.end(), "Failed to find usage");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("sampler") != json_obj.end(), "Failed to find sampler");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("decode") != json_obj.end(), "Failed to find decode");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("extensions") != json_obj.end(), "Failed to find extension");
+  std::string dataset_dir = json_obj["dataset_dir"];
+  std::string usage = json_obj["usage"];
+  std::shared_ptr<SamplerObj> sampler;
+  RETURN_IF_NOT_OK(Serdes::ConstructSampler(json_obj["sampler"], &sampler));
+  bool decode = json_obj["decode"];
+  std::set<std::string> extension = json_obj["extensions"];
+  std::shared_ptr<DatasetCache> cache = nullptr;
+  RETURN_IF_NOT_OK(DatasetCache::from_json(json_obj, &cache));
+  *ds = std::make_shared<CelebANode>(dataset_dir, usage, sampler, decode, extension, cache);
+  (*ds)->SetNumWorkers(json_obj["num_parallel_workers"]);
+  return Status::OK();
+}
+#endif
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/celeba_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/celeba_node.h
index ef9c3b06734..75f139982aa 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/celeba_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/celeba_node.h
@@ -82,6 +82,14 @@ class CelebANode : public MappableSourceNode {
   /// \return Status of the function
   Status to_json(nlohmann::json *out_json) override;
 
+#ifndef ENABLE_ANDROID
+  /// \brief Function to read dataset in json
+  /// \param[in] json_obj The JSON object to be deserialized
+  /// \param[out] ds Deserialized dataset
+  /// \return Status The status code returned
+  static Status from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
+#endif
+
   /// \brief Sampler getter
   /// \return SamplerObj of the current node
   std::shared_ptr<SamplerObj> Sampler() override { return sampler_; }
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar100_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar100_node.cc
index c703836b5d5..fdd5c948c1e 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar100_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar100_node.cc
@@ -22,6 +22,9 @@
 #include <vector>
 
 #include "minddata/dataset/engine/datasetops/source/cifar_op.h"
+#ifndef ENABLE_ANDROID
+#include "minddata/dataset/engine/serdes.h"
+#endif
 
 #include "minddata/dataset/util/status.h"
 namespace mindspore {
@@ -117,5 +120,24 @@ Status Cifar100Node::to_json(nlohmann::json *out_json) {
   *out_json = args;
   return Status::OK();
 }
+
+#ifndef ENABLE_ANDROID
+Status Cifar100Node::from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_parallel_workers") != json_obj.end(),
+                               "Failed to find num_parallel_workers");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("dataset_dir") != json_obj.end(), "Failed to find dataset_dir");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("usage") != json_obj.end(), "Failed to find usage");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("sampler") != json_obj.end(), "Failed to find sampler");
+  std::string dataset_dir = json_obj["dataset_dir"];
+  std::string usage = json_obj["usage"];
+  std::shared_ptr<SamplerObj> sampler;
+  RETURN_IF_NOT_OK(Serdes::ConstructSampler(json_obj["sampler"], &sampler));
+  std::shared_ptr<DatasetCache> cache = nullptr;
+  RETURN_IF_NOT_OK(DatasetCache::from_json(json_obj, &cache));
+  *ds = std::make_shared<Cifar100Node>(dataset_dir, usage, sampler, cache);
+  (*ds)->SetNumWorkers(json_obj["num_parallel_workers"]);
+  return Status::OK();
+}
+#endif
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar100_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar100_node.h
index 17bdfb39e9c..a0a8be8fa7e 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar100_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar100_node.h
@@ -78,6 +78,14 @@ class Cifar100Node : public MappableSourceNode {
   /// \return Status of the function
   Status to_json(nlohmann::json *out_json) override;
 
+#ifndef ENABLE_ANDROID
+  /// \brief Function to read dataset in json
+  /// \param[in] json_obj The JSON object to be deserialized
+  /// \param[out] ds Deserialized dataset
+  /// \return Status The status code returned
+  static Status from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
+#endif
+
   /// \brief Sampler getter
   /// \return SamplerObj of the current node
   std::shared_ptr<SamplerObj> Sampler() override { return sampler_; }
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar10_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar10_node.cc
index faa0e1b8b61..e74728b7d35 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar10_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar10_node.cc
@@ -22,6 +22,9 @@
 #include <vector>
 
 #include "minddata/dataset/engine/datasetops/source/cifar_op.h"
+#ifndef ENABLE_ANDROID
+#include "minddata/dataset/engine/serdes.h"
+#endif
 
 #include "minddata/dataset/util/status.h"
 namespace mindspore {
@@ -118,5 +121,24 @@ Status Cifar10Node::to_json(nlohmann::json *out_json) {
   *out_json = args;
   return Status::OK();
 }
+
+#ifndef ENABLE_ANDROID
+Status Cifar10Node::from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_parallel_workers") != json_obj.end(),
+                               "Failed to find num_parallel_workers");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("dataset_dir") != json_obj.end(), "Failed to find dataset_dir");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("usage") != json_obj.end(), "Failed to find usage");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("sampler") != json_obj.end(), "Failed to find sampler");
+  std::string dataset_dir = json_obj["dataset_dir"];
+  std::string usage = json_obj["usage"];
+  std::shared_ptr<SamplerObj> sampler;
+  RETURN_IF_NOT_OK(Serdes::ConstructSampler(json_obj["sampler"], &sampler));
+  std::shared_ptr<DatasetCache> cache = nullptr;
+  RETURN_IF_NOT_OK(DatasetCache::from_json(json_obj, &cache));
+  *ds = std::make_shared<Cifar10Node>(dataset_dir, usage, sampler, cache);
+  (*ds)->SetNumWorkers(json_obj["num_parallel_workers"]);
+  return Status::OK();
+}
+#endif
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar10_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar10_node.h
index a77eac9b4d7..b14bba17c3f 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar10_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cifar10_node.h
@@ -78,6 +78,14 @@ class Cifar10Node : public MappableSourceNode {
   /// \return Status of the function
   Status to_json(nlohmann::json *out_json) override;
 
+#ifndef ENABLE_ANDROID
+  /// \brief Function to read dataset in json
+  /// \param[in] json_obj The JSON object to be deserialized
+  /// \param[out] ds Deserialized dataset
+  /// \return Status The status code returned
+  static Status from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
+#endif
+
   /// \brief Sampler getter
   /// \return SamplerObj of the current node
   std::shared_ptr<SamplerObj> Sampler() override { return sampler_; }
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/clue_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/clue_node.cc
index 111d9d6018a..4426455e319 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/clue_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/clue_node.cc
@@ -249,6 +249,29 @@ Status CLUENode::to_json(nlohmann::json *out_json) {
   return Status::OK();
 }
 
+Status CLUENode::from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_parallel_workers") != json_obj.end(),
+                               "Failed to find num_parallel_workers");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("dataset_dir") != json_obj.end(), "Failed to find dataset_dir");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("task") != json_obj.end(), "Failed to find task");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("usage") != json_obj.end(), "Failed to find usage");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_samples") != json_obj.end(), "Failed to find num_samples");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("shuffle") != json_obj.end(), "Failed to find shuffle");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_shards") != json_obj.end(), "Failed to find num_shards");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("shard_id") != json_obj.end(), "Failed to find shard_id");
+  std::vector<std::string> dataset_files = json_obj["dataset_dir"];
+  std::string task = json_obj["task"];
+  std::string usage = json_obj["usage"];
+  int64_t num_samples = json_obj["num_samples"];
+  ShuffleMode shuffle = static_cast<ShuffleMode>(json_obj["shuffle"]);
+  int32_t num_shards = json_obj["num_shards"];
+  int32_t shard_id = json_obj["shard_id"];
+  std::shared_ptr<DatasetCache> cache = nullptr;
+  RETURN_IF_NOT_OK(DatasetCache::from_json(json_obj, &cache));
+  *ds = std::make_shared<CLUENode>(dataset_files, task, usage, num_samples, shuffle, num_shards, shard_id, cache);
+  (*ds)->SetNumWorkers(json_obj["num_parallel_workers"]);
+  return Status::OK();
+}
 // Note: The following two functions are common among NonMappableSourceNode and should be promoted to its parent
 // class. CLUE by itself is a non-mappable dataset that does not support sampling. However, if a cache operator is
 // injected at some other place higher in the tree, that cache can inherit this sampler from the leaf, providing
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/clue_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/clue_node.h
index b255462b449..ca83de77cc5 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/clue_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/clue_node.h
@@ -86,6 +86,12 @@ class CLUENode : public NonMappableSourceNode {
   /// \return Status of the function
   Status to_json(nlohmann::json *out_json) override;
 
+  /// \brief Function to read dataset in json
+  /// \param[in] json_obj The JSON object to be deserialized
+  /// \param[out] ds Deserialized dataset
+  /// \return Status The status code returned
+  static Status from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
+
   /// \brief CLUE by itself is a non-mappable dataset that does not support sampling.
   ///     However, if a cache operator is injected at some other place higher in the tree, that cache can
   ///     inherit this sampler from the leaf, providing sampling support from the caching layer.
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cmu_arctic_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cmu_arctic_node.cc
deleted file mode 100644
index f86485a0168..00000000000
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cmu_arctic_node.cc
+++ /dev/null
@@ -1,107 +0,0 @@
-#include "minddata/dataset/engine/ir/datasetops/source/cmu_arctic_node.h"
-
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "minddata/dataset/engine/datasetops/source/cmu_arctic_op.h"
-
-#include "minddata/dataset/util/status.h"
-namespace mindspore {
-namespace dataset {
-    
-CmuArcticNode::CmuArcticNode(std::string dataset_dir, std::string usage, std::shared_ptr<SamplerObj> sampler,
-                     std::shared_ptr<DatasetCache> cache)  
-    : MappableSourceNode(std::move(cache)), dataset_dir_(dataset_dir), usage_(usage), sampler_(sampler) {}
-    
-void CmuArcticNode::Print(std::ostream &out) const { out << Name(); }
-    
-std::shared_ptr<DatasetNode> CmuArcticNode::Copy() {
-  std::shared_ptr<SamplerObj> sampler = (sampler_ == nullptr) ? nullptr : sampler_->SamplerCopy();
-  auto node = std::make_shared<CmuArcticNode>(dataset_dir_, usage_, sampler, cache_);
-  return node;
-}
-    
-Status CmuArcticNode::ValidateParams() {
-  RETURN_IF_NOT_OK(DatasetNode::ValidateParams());
-  RETURN_IF_NOT_OK(ValidateDatasetDirParam("CmuArcticNode", dataset_dir_));
-  RETURN_IF_NOT_OK(ValidateDatasetSampler("CmuArcticNode", sampler_));
-  RETURN_IF_NOT_OK(ValidateStringValue("CmuArcticNode", usage_, {"aew", "ahw", "aup", "awb", "axb", "bdl", "clb", "eey", "fem", "gka", "jmk", "ksp", "ljm", "lnh", "rms", "rxr", "slp" , "slt"}));
-  return Status::OK();
-}
-    
-Status CmuArcticNode::Build(std::vector<std::shared_ptr<DatasetOp>> *const node_ops) {
-  // Do internal Schema generation.
-  auto schema = std::make_unique<DataSchema>();
-  
-
-  RETURN_IF_NOT_OK(schema->AddColumn(ColDescriptor("waveform", DataType(DataType::DE_FLOAT64), TensorImpl::kCv, 1)));  
-  TensorShape scalar_rate = TensorShape::CreateScalar();
-  RETURN_IF_NOT_OK(
-    schema->AddColumn(ColDescriptor("sample_rate", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0, &scalar_rate)));
-  TensorShape scalar_utterance = TensorShape::CreateScalar();
-  RETURN_IF_NOT_OK(
-    schema->AddColumn(ColDescriptor("utterance", DataType(DataType::DE_STRING), TensorImpl::kFlexible, 0, &scalar_utterance)));
-  TensorShape scalar_utterance_id = TensorShape::CreateScalar();
-  RETURN_IF_NOT_OK(
-    schema->AddColumn(ColDescriptor("utterance_id", DataType(DataType::DE_STRING), TensorImpl::kFlexible, 0, &scalar_utterance_id)));
-
-
-
-  std::shared_ptr<SamplerRT> sampler_rt = nullptr;
-  RETURN_IF_NOT_OK(sampler_->SamplerBuild(&sampler_rt));
-
-  auto op = std::make_shared<CmuArcticOp>(usage_, num_workers_, dataset_dir_, connector_que_size_, std::move(schema),std::move(sampler_rt));
-  op->set_total_repeats(GetTotalRepeats());
-  op->set_num_repeats_per_epoch(GetNumRepeatsPerEpoch());
-  node_ops->push_back(op);
-
-  return Status::OK();
-}
-    
-// Get the shard id of node
-Status CmuArcticNode::GetShardId(int32_t *shard_id) {
-  *shard_id = sampler_->ShardId();
-  return Status::OK();
-}
-
-    
-// Get Dataset size
-Status CmuArcticNode::GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_getter, bool estimate,int64_t *dataset_size) {
-  if (dataset_size_ > 0) {
-    *dataset_size = dataset_size_;
-    return Status::OK();
-  }
-  int64_t num_rows, sample_size;
-  RETURN_IF_NOT_OK(CmuArcticOp::CountTotalRows(dataset_dir_, usage_, &num_rows));
-  std::shared_ptr<SamplerRT> sampler_rt = nullptr;
-  RETURN_IF_NOT_OK(sampler_->SamplerBuild(&sampler_rt));
-  sample_size = sampler_rt->CalculateNumSamples(num_rows);
-  if (sample_size == -1) {
-    RETURN_IF_NOT_OK(size_getter->DryRun(shared_from_this(), &sample_size));
-  }
-  *dataset_size = sample_size;
-  dataset_size_ = *dataset_size;
-  return Status::OK();
-}
-
-    
-Status CmuArcticNode::to_json(nlohmann::json *out_json) {
-  nlohmann::json args, sampler_args;
-  RETURN_IF_NOT_OK(sampler_->to_json(&sampler_args));
-  args["sampler"] = sampler_args;
-  args["num_parallel_workers"] = num_workers_;
-  args["dataset_dir"] = dataset_dir_;
-  args["usage"] = usage_;
-  if (cache_ != nullptr) {
-    nlohmann::json cache_args;
-    RETURN_IF_NOT_OK(cache_->to_json(&cache_args));
-    args["cache"] = cache_args;
-  }
-  *out_json = args;
-  return Status::OK();
-}
-   
-} // namespace dataset
-} // namespace mindspor
\ No newline at end of file
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cmu_arctic_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cmu_arctic_node.h
deleted file mode 100644
index 6b79dd07a31..00000000000
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/cmu_arctic_node.h
+++ /dev/null
@@ -1,76 +0,0 @@
-#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_CUMARCTIC_NODE_H_
-#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_CUMARCTIC_NODE_H_
-
-#include <memory>
-#include <string>
-#include <vector>
-    
-#include "minddata/dataset/engine/ir/datasetops/dataset_node.h"
-namespace mindspore {
-namespace dataset {
-class CmuArcticNode:public MappableSourceNode {
-public:
-
-CmuArcticNode(std::string dataset_dir, std::string usage, std::shared_ptr<SamplerObj> sampler, std::shared_ptr<DatasetCache> cache);   
-
-~ CmuArcticNode() = default;
-    
-/// \brief Node name getter
-/// \return Name of the current node
-std::string Name() const override { return "kCmuArcticNode"; } 
-
-/// \brief Print the description
-/// \param out - The output stream to write output to
-void Print(std::ostream &out) const override;
-    
-/// \brief Copy the node to a new object
-/// \return A shared pointer to the new copy
-std::shared_ptr<DatasetNode> Copy() override;
-    
-/// \brief a base class override function to create the required runtime dataset op objects for this class
-/// \param node_ops - A vector containing shared pointer to the Dataset Ops that this object will create
-/// \return Status Status::OK() if build successfully
-Status Build(std::vector<std::shared_ptr<DatasetOp>> *const node_ops) override;
-    
-/// \brief Parameters validation
-/// \return Status Status::OK() if all the parameters are valid
-Status ValidateParams() override;
-    
-/// \brief Get the shard id of node    什么是shard id？？
-/// \return Status Status::OK() if get shard id successfully
-Status GetShardId(int32_t *shard_id) override;
-    
-/// \brief Base-class override for GetDatasetSize
-/// \param[in] size_getter Shared pointer to DatasetSizeGetter
-/// \param[in] estimate This is only supported by some of the ops and it's used to speed up the process of getting
-///     dataset size at the expense of accuracy.
-/// \param[out] dataset_size the size of the dataset
-/// \return Status of the function
-Status GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_getter, bool estimate,
-                        int64_t *dataset_size) override;
-
-/// \brief Getter functions
-const std::string &DatasetDir() const { return dataset_dir_; }
-const std::string &Usage() const { return usage_; }
-        
-/// \brief Get the arguments of node
-/// \param[out] out_json JSON string of all attributes
-/// \return Status of the function
-Status to_json(nlohmann::json *out_json) override;
-    
-/// \brief Sampler getter
-/// \return SamplerObj of the current node
-std::shared_ptr<SamplerObj> Sampler() override { return sampler_; }
-
-
-void SetSampler(std::shared_ptr<SamplerObj> sampler) override { sampler_ = sampler; }
-
-private:
-std::string dataset_dir_;
-std::string usage_;
-std::shared_ptr<SamplerObj> sampler_;
-};
-
-} // namespace dataset
-} // namespace mindspore
-#endif ///home/user06/zjm/act/mindspore/mindspore/ccsrc/minddata/dataset/api
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/coco_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/coco_node.cc
index 3f9f7619cf8..8da109391f0 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/coco_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/coco_node.cc
@@ -22,6 +22,9 @@
 #include <vector>
 
 #include "minddata/dataset/engine/datasetops/source/coco_op.h"
+#ifndef ENABLE_ANDROID
+#include "minddata/dataset/engine/serdes.h"
+#endif
 
 #include "minddata/dataset/util/status.h"
 namespace mindspore {
@@ -181,6 +184,7 @@ Status CocoNode::to_json(nlohmann::json *out_json) {
   args["annotation_file"] = annotation_file_;
   args["task"] = task_;
   args["decode"] = decode_;
+  args["extra_metadata"] = extra_metadata_;
   if (cache_ != nullptr) {
     nlohmann::json cache_args;
     RETURN_IF_NOT_OK(cache_->to_json(&cache_args));
@@ -189,5 +193,30 @@ Status CocoNode::to_json(nlohmann::json *out_json) {
   *out_json = args;
   return Status::OK();
 }
+
+#ifndef ENABLE_ANDROID
+Status CocoNode::from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_parallel_workers") != json_obj.end(),
+                               "Failed to find num_parallel_workers");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("dataset_dir") != json_obj.end(), "Failed to find dataset_dir");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("annotation_file") != json_obj.end(), "Failed to find annotation_file");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("task") != json_obj.end(), "Failed to find task");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("decode") != json_obj.end(), "Failed to find decode");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("sampler") != json_obj.end(), "Failed to find sampler");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("extra_metadata") != json_obj.end(), "Failed to find extra_metadata");
+  std::string dataset_dir = json_obj["dataset_dir"];
+  std::string annotation_file = json_obj["annotation_file"];
+  std::string task = json_obj["task"];
+  bool decode = json_obj["decode"];
+  std::shared_ptr<SamplerObj> sampler;
+  RETURN_IF_NOT_OK(Serdes::ConstructSampler(json_obj["sampler"], &sampler));
+  std::shared_ptr<DatasetCache> cache = nullptr;
+  RETURN_IF_NOT_OK(DatasetCache::from_json(json_obj, &cache));
+  bool extra_metadata = json_obj["extra_metadata"];
+  *ds = std::make_shared<CocoNode>(dataset_dir, annotation_file, task, decode, sampler, cache, extra_metadata);
+  (*ds)->SetNumWorkers(json_obj["num_parallel_workers"]);
+  return Status::OK();
+}
+#endif
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/coco_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/coco_node.h
index de70972d8ce..4bc29360af7 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/coco_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/coco_node.h
@@ -81,6 +81,14 @@ class CocoNode : public MappableSourceNode {
   /// \return Status of the function
   Status to_json(nlohmann::json *out_json) override;
 
+#ifndef ENABLE_ANDROID
+  /// \brief Function to read dataset in json
+  /// \param[in] json_obj The JSON object to be deserialized
+  /// \param[out] ds Deserialized dataset
+  /// \return Status The status code returned
+  static Status from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
+#endif
+
   /// \brief Sampler getter
   /// \return SamplerObj of the current node
   std::shared_ptr<SamplerObj> Sampler() override { return sampler_; }
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/csv_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/csv_node.cc
index 29445d08865..83693e14d3b 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/csv_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/csv_node.cc
@@ -187,6 +187,32 @@ Status CSVNode::to_json(nlohmann::json *out_json) {
   return Status::OK();
 }
 
+Status CSVNode::from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_parallel_workers") != json_obj.end(),
+                               "Failed to find num_parallel_workers");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("dataset_files") != json_obj.end(), "Failed to find dataset_files");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("field_delim") != json_obj.end(), "Failed to find field_delim");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("column_names") != json_obj.end(), "Failed to find column_names");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_samples") != json_obj.end(), "Failed to find num_samples");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("shuffle") != json_obj.end(), "Failed to find shuffle");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_shards") != json_obj.end(), "Failed to find num_shards");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("shard_id") != json_obj.end(), "Failed to find shard_id");
+  std::vector<std::string> dataset_files = json_obj["dataset_files"];
+  std::string field_delim = json_obj["field_delim"];
+  std::vector<std::shared_ptr<CsvBase>> column_defaults = {};
+  std::vector<std::string> column_names = json_obj["column_names"];
+  int64_t num_samples = json_obj["num_samples"];
+  ShuffleMode shuffle = static_cast<ShuffleMode>(json_obj["shuffle"]);
+  int32_t num_shards = json_obj["num_shards"];
+  int32_t shard_id = json_obj["shard_id"];
+  std::shared_ptr<DatasetCache> cache = nullptr;
+  RETURN_IF_NOT_OK(DatasetCache::from_json(json_obj, &cache));
+  *ds = std::make_shared<CSVNode>(dataset_files, field_delim.c_str()[0], column_defaults, column_names, num_samples,
+                                  shuffle, num_shards, shard_id, cache);
+  (*ds)->SetNumWorkers(json_obj["num_parallel_workers"]);
+  return Status::OK();
+}
+
 // Note: The following two functions are common among NonMappableSourceNode and should be promoted to its parent class.
 // CSV by itself is a non-mappable dataset that does not support sampling.
 // However, if a cache operator is injected at some other place higher in the tree, that cache can
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/csv_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/csv_node.h
index 2c774991631..6602f83daf6 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/csv_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/csv_node.h
@@ -107,6 +107,12 @@ class CSVNode : public NonMappableSourceNode {
   /// \return Status of the function
   Status to_json(nlohmann::json *out_json) override;
 
+  /// \brief Function to read dataset in json
+  /// \param[in] json_obj The JSON object to be deserialized
+  /// \param[out] ds Deserialized dataset
+  /// \return Status The status code returned
+  static Status from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
+
   /// \brief CSV by itself is a non-mappable dataset that does not support sampling.
   ///     However, if a cache operator is injected at some other place higher in the tree, that cache can
   ///     inherit this sampler from the leaf, providing sampling support from the caching layer.
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/generator_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/generator_node.cc
index 5b7a676eb62..b13ce660f5a 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/generator_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/generator_node.cc
@@ -73,9 +73,9 @@ Status GeneratorNode::Build(std::vector<std::shared_ptr<DatasetOp>> *const node_
     RETURN_IF_NOT_OK(data_schema->LoadSchemaString(schema_json_string, {}));
 
     for (int32_t i = 0; i < data_schema->NumColumns(); i++) {
-      ColDescriptor col = data_schema->column(i);
-      column_names_.push_back(col.name());
-      column_types_.push_back((col.type()));
+      ColDescriptor col = data_schema->Column(i);
+      column_names_.push_back(col.Name());
+      column_types_.push_back((col.Type()));
     }
   }
   std::shared_ptr<SamplerRT> sampler_rt = nullptr;
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/image_folder_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/image_folder_node.cc
index ebf268d1c04..65b5b5693f5 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/image_folder_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/image_folder_node.cc
@@ -24,6 +24,9 @@
 #include <vector>
 
 #include "minddata/dataset/engine/datasetops/source/image_folder_op.h"
+#ifndef ENABLE_ANDROID
+#include "minddata/dataset/engine/serdes.h"
+#endif
 
 #include "minddata/dataset/util/status.h"
 namespace mindspore {
@@ -113,6 +116,7 @@ Status ImageFolderNode::to_json(nlohmann::json *out_json) {
   args["sampler"] = sampler_args;
   args["num_parallel_workers"] = num_workers_;
   args["dataset_dir"] = dataset_dir_;
+  args["recursive"] = recursive_;
   args["decode"] = decode_;
   args["extensions"] = exts_;
   args["class_indexing"] = class_indexing_;
@@ -124,5 +128,36 @@ Status ImageFolderNode::to_json(nlohmann::json *out_json) {
   *out_json = args;
   return Status::OK();
 }
+
+#ifndef ENABLE_ANDROID
+Status ImageFolderNode::from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_parallel_workers") != json_obj.end(),
+                               "Failed to find num_parallel_workers");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("dataset_dir") != json_obj.end(), "Failed to find dataset_dir");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("decode") != json_obj.end(), "Failed to find decode");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("sampler") != json_obj.end(), "Failed to find sampler");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("recursive") != json_obj.end(), "Failed to find recursive");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("extensions") != json_obj.end(), "Failed to find extension");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("class_indexing") != json_obj.end(), "Failed to find class_indexing");
+  std::string dataset_dir = json_obj["dataset_dir"];
+  bool decode = json_obj["decode"];
+  std::shared_ptr<SamplerObj> sampler;
+  RETURN_IF_NOT_OK(Serdes::ConstructSampler(json_obj["sampler"], &sampler));
+  bool recursive = json_obj["recursive"];
+  std::set<std::string> extension = json_obj["extensions"];
+  std::map<std::string, int32_t> class_indexing;
+  nlohmann::json class_map = json_obj["class_indexing"];
+  for (const auto &class_map_child : class_map) {
+    std::string class_ = class_map_child[0];
+    int32_t indexing = class_map_child[1];
+    class_indexing.insert({class_, indexing});
+  }
+  std::shared_ptr<DatasetCache> cache = nullptr;
+  RETURN_IF_NOT_OK(DatasetCache::from_json(json_obj, &cache));
+  *ds = std::make_shared<ImageFolderNode>(dataset_dir, decode, sampler, recursive, extension, class_indexing, cache);
+  (*ds)->SetNumWorkers(json_obj["num_parallel_workers"]);
+  return Status::OK();
+}
+#endif
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/image_folder_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/image_folder_node.h
index 47688ae43ed..24e2067516f 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/image_folder_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/image_folder_node.h
@@ -87,6 +87,14 @@ class ImageFolderNode : public MappableSourceNode {
   /// \return Status of the function
   Status to_json(nlohmann::json *out_json) override;
 
+#ifndef ENABLE_ANDROID
+  /// \brief Function to read dataset in json
+  /// \param[in] json_obj The JSON object to be deserialized
+  /// \param[out] ds Deserialized dataset
+  /// \return Status The status code returned
+  static Status from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
+#endif
+
   /// \brief Sampler getter
   /// \return SamplerObj of the current node
   std::shared_ptr<SamplerObj> Sampler() override { return sampler_; }
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/libri_speech_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/libri_speech_node.cc
deleted file mode 100644
index ce3e3213a81..00000000000
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/libri_speech_node.cc
+++ /dev/null
@@ -1,113 +0,0 @@
-#include "minddata/dataset/engine/ir/datasetops/source/libri_speech_node.h"
-
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "minddata/dataset/engine/datasetops/source/libri_speech_op.h"
-
-#include "minddata/dataset/util/status.h"
-namespace mindspore {
-namespace dataset {
-    
-LibriSpeechNode::LibriSpeechNode(std::string dataset_dir, std::string usage, std::shared_ptr<SamplerObj> sampler,
-                     std::shared_ptr<DatasetCache> cache)  
-    : MappableSourceNode(std::move(cache)), dataset_dir_(dataset_dir), usage_(usage), sampler_(sampler) {}
-    
-void LibriSpeechNode::Print(std::ostream &out) const { out << Name(); }
-    
-std::shared_ptr<DatasetNode> LibriSpeechNode::Copy() {
-  std::shared_ptr<SamplerObj> sampler = (sampler_ == nullptr) ? nullptr : sampler_->SamplerCopy();
-  auto node = std::make_shared<LibriSpeechNode>(dataset_dir_, usage_, sampler, cache_);
-  return node;
-}
-    
-Status LibriSpeechNode::ValidateParams() {
-  RETURN_IF_NOT_OK(DatasetNode::ValidateParams());
-  RETURN_IF_NOT_OK(ValidateDatasetDirParam("LibriSpeechNode", dataset_dir_));
-  RETURN_IF_NOT_OK(ValidateDatasetSampler("LibriSpeechNode", sampler_));
-  RETURN_IF_NOT_OK(ValidateStringValue("LibriSpeechNode", usage_, {"dev-clean", "dev-other", "test-clean","test-other","train-clean-100","train-clean-360","train-other-500"}));
-  return Status::OK();
-}
-    
-Status LibriSpeechNode::Build(std::vector<std::shared_ptr<DatasetOp>> *const node_ops) {
-  // Do internal Schema generation.
-  auto schema = std::make_unique<DataSchema>();
-  
-  RETURN_IF_NOT_OK(schema->AddColumn(ColDescriptor("waveform", DataType(DataType::DE_FLOAT64), TensorImpl::kCv, 1)));  
-  TensorShape scalar_rate = TensorShape::CreateScalar();
-  RETURN_IF_NOT_OK(
-    schema->AddColumn(ColDescriptor("sample_rate", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0, &scalar_rate)));
-  TensorShape scalar_utterance = TensorShape::CreateScalar();
-  RETURN_IF_NOT_OK(
-    schema->AddColumn(ColDescriptor("utterance", DataType(DataType::DE_STRING), TensorImpl::kFlexible, 0, &scalar_utterance)));
-  TensorShape scalar_speaker_id = TensorShape::CreateScalar();
-  RETURN_IF_NOT_OK(
-    schema->AddColumn(ColDescriptor("speaker_id", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0, &scalar_speaker_id)));
-  TensorShape scalar_chapter_id = TensorShape::CreateScalar();
-  RETURN_IF_NOT_OK(
-    schema->AddColumn(ColDescriptor("chapter_id", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0, &scalar_chapter_id)));
-  TensorShape scalar_utterance_id = TensorShape::CreateScalar();
-  RETURN_IF_NOT_OK(
-    schema->AddColumn(ColDescriptor("utterance_id", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, 0, &scalar_utterance_id)));
-  
-
-
-
-  std::shared_ptr<SamplerRT> sampler_rt = nullptr;
-  RETURN_IF_NOT_OK(sampler_->SamplerBuild(&sampler_rt));
-
-  auto op = std::make_shared<LibriSpeechOp>(usage_, num_workers_, dataset_dir_, connector_que_size_, std::move(schema),std::move(sampler_rt));
-  op->set_total_repeats(GetTotalRepeats());
-  op->set_num_repeats_per_epoch(GetNumRepeatsPerEpoch());
-  node_ops->push_back(op);
-
-  return Status::OK();
-}
-    
-// Get the shard id of node
-Status LibriSpeechNode::GetShardId(int32_t *shard_id) {
-  *shard_id = sampler_->ShardId();
-  return Status::OK();
-}
-
-    
-// Get Dataset size
-Status LibriSpeechNode::GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_getter, bool estimate,int64_t *dataset_size) {
-  if (dataset_size_ > 0) {
-    *dataset_size = dataset_size_;
-    return Status::OK();
-  }
-  int64_t num_rows, sample_size;
-  RETURN_IF_NOT_OK(LibriSpeechOp::CountTotalRows(dataset_dir_, usage_, &num_rows));
-  std::shared_ptr<SamplerRT> sampler_rt = nullptr;
-  RETURN_IF_NOT_OK(sampler_->SamplerBuild(&sampler_rt));
-  sample_size = sampler_rt->CalculateNumSamples(num_rows);
-  if (sample_size == -1) {
-    RETURN_IF_NOT_OK(size_getter->DryRun(shared_from_this(), &sample_size));
-  }
-  *dataset_size = sample_size;
-  dataset_size_ = *dataset_size;
-  return Status::OK();
-}
-
-    
-Status LibriSpeechNode::to_json(nlohmann::json *out_json) {
-  nlohmann::json args, sampler_args;
-  RETURN_IF_NOT_OK(sampler_->to_json(&sampler_args));
-  args["sampler"] = sampler_args;
-  args["num_parallel_workers"] = num_workers_;
-  args["dataset_dir"] = dataset_dir_;
-  args["usage"] = usage_;
-  if (cache_ != nullptr) {
-    nlohmann::json cache_args;
-    RETURN_IF_NOT_OK(cache_->to_json(&cache_args));
-    args["cache"] = cache_args;
-  }
-  *out_json = args;
-  return Status::OK();
-}
-   
-} // namespace dataset
-} // namespace mindspor
\ No newline at end of file
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/libri_speech_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/libri_speech_node.h
deleted file mode 100644
index 20240d24c5c..00000000000
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/libri_speech_node.h
+++ /dev/null
@@ -1,76 +0,0 @@
-#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_LIBRISPPECH_NODE_H_
-#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_LIBRISPPECH_NODE_H_
-
-#include <memory>
-#include <string>
-#include <vector>
-    
-#include "minddata/dataset/engine/ir/datasetops/dataset_node.h"
-namespace mindspore {
-namespace dataset {
-class LibriSpeechNode:public MappableSourceNode {
-public:
-
-LibriSpeechNode(std::string dataset_dir, std::string usage, std::shared_ptr<SamplerObj> sampler, std::shared_ptr<DatasetCache> cache);   
-
-~ LibriSpeechNode() = default;
-    
-/// \brief Node name getter
-/// \return Name of the current node
-std::string Name() const override { return "kLibriSpeechNode"; } 
-
-/// \brief Print the description
-/// \param out - The output stream to write output to
-void Print(std::ostream &out) const override;
-    
-/// \brief Copy the node to a new object
-/// \return A shared pointer to the new copy
-std::shared_ptr<DatasetNode> Copy() override;
-    
-/// \brief a base class override function to create the required runtime dataset op objects for this class
-/// \param node_ops - A vector containing shared pointer to the Dataset Ops that this object will create
-/// \return Status Status::OK() if build successfully
-Status Build(std::vector<std::shared_ptr<DatasetOp>> *const node_ops) override;
-    
-/// \brief Parameters validation
-/// \return Status Status::OK() if all the parameters are valid
-Status ValidateParams() override;
-    
-/// \brief Get the shard id of node    什么是shard id？？
-/// \return Status Status::OK() if get shard id successfully
-Status GetShardId(int32_t *shard_id) override;
-    
-/// \brief Base-class override for GetDatasetSize
-/// \param[in] size_getter Shared pointer to DatasetSizeGetter
-/// \param[in] estimate This is only supported by some of the ops and it's used to speed up the process of getting
-///     dataset size at the expense of accuracy.
-/// \param[out] dataset_size the size of the dataset
-/// \return Status of the function
-Status GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_getter, bool estimate,
-                        int64_t *dataset_size) override;
-
-/// \brief Getter functions
-const std::string &DatasetDir() const { return dataset_dir_; }
-const std::string &Usage() const { return usage_; }
-        
-/// \brief Get the arguments of node
-/// \param[out] out_json JSON string of all attributes
-/// \return Status of the function
-Status to_json(nlohmann::json *out_json) override;
-    
-/// \brief Sampler getter
-/// \return SamplerObj of the current node
-std::shared_ptr<SamplerObj> Sampler() override { return sampler_; }
-
-
-void SetSampler(std::shared_ptr<SamplerObj> sampler) override { sampler_ = sampler; }
-
-private:
-std::string dataset_dir_;
-std::string usage_;
-std::shared_ptr<SamplerObj> sampler_;
-};
-
-} // namespace dataset
-} // namespace mindspore
-#endif ///home/user06/zjm/act/mindspore/mindspore/ccsrc/minddata/dataset/api
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/manifest_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/manifest_node.cc
index 21329db8c70..1a03024f585 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/manifest_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/manifest_node.cc
@@ -23,6 +23,9 @@
 #include <vector>
 
 #include "minddata/dataset/engine/datasetops/source/manifest_op.h"
+#ifndef ENABLE_ANDROID
+#include "minddata/dataset/engine/serdes.h"
+#endif
 
 #include "minddata/dataset/util/status.h"
 namespace mindspore {
@@ -152,5 +155,34 @@ Status ManifestNode::to_json(nlohmann::json *out_json) {
   *out_json = args;
   return Status::OK();
 }
+
+#ifndef ENABLE_ANDROID
+Status ManifestNode::from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_parallel_workers") != json_obj.end(),
+                               "Failed to find num_parallel_workers");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("dataset_file") != json_obj.end(), "Failed to find dataset_file");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("usage") != json_obj.end(), "Failed to find usage");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("sampler") != json_obj.end(), "Failed to find sampler");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("class_indexing") != json_obj.end(), "Failed to find class_indexing");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("decode") != json_obj.end(), "Failed to find decode");
+  std::string dataset_file = json_obj["dataset_file"];
+  std::string usage = json_obj["usage"];
+  std::shared_ptr<SamplerObj> sampler;
+  RETURN_IF_NOT_OK(Serdes::ConstructSampler(json_obj["sampler"], &sampler));
+  std::map<std::string, int32_t> class_indexing;
+  nlohmann::json class_map = json_obj["class_indexing"];
+  for (const auto &class_map_child : class_map) {
+    std::string class_ = class_map_child[0];
+    int32_t indexing = class_map_child[1];
+    class_indexing.insert({class_, indexing});
+  }
+  bool decode = json_obj["decode"];
+  std::shared_ptr<DatasetCache> cache = nullptr;
+  RETURN_IF_NOT_OK(DatasetCache::from_json(json_obj, &cache));
+  *ds = std::make_shared<ManifestNode>(dataset_file, usage, sampler, class_indexing, decode, cache);
+  (*ds)->SetNumWorkers(json_obj["num_parallel_workers"]);
+  return Status::OK();
+}
+#endif
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/manifest_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/manifest_node.h
index ee7012eded2..e4f23deb411 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/manifest_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/manifest_node.h
@@ -78,9 +78,18 @@ class ManifestNode : public MappableSourceNode {
 
   /// \brief Get the arguments of node
   /// \param[out] out_json JSON string of all attributes
+  /// \param[in] cache Dataset cache for constructor input
   /// \return Status of the function
   Status to_json(nlohmann::json *out_json) override;
 
+#ifndef ENABLE_ANDROID
+  /// \brief Function to read dataset in json
+  /// \param[in] json_obj The JSON object to be deserialized
+  /// \param[out] ds Deserialized dataset
+  /// \return Status The status code returned
+  static Status from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
+#endif
+
   /// \brief Sampler getter
   /// \return SamplerObj of the current node
   std::shared_ptr<SamplerObj> Sampler() override { return sampler_; }
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.cc
index b14a803ae12..7ca90b4e493 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.cc
@@ -22,6 +22,9 @@
 #include <vector>
 
 #include "minddata/dataset/engine/datasetops/source/mnist_op.h"
+#ifndef ENABLE_ANDROID
+#include "minddata/dataset/engine/serdes.h"
+#endif
 
 #include "minddata/dataset/util/status.h"
 namespace mindspore {
@@ -111,5 +114,24 @@ Status MnistNode::to_json(nlohmann::json *out_json) {
   *out_json = args;
   return Status::OK();
 }
+
+#ifndef ENABLE_ANDROID
+Status MnistNode::from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_parallel_workers") != json_obj.end(),
+                               "Failed to find num_parallel_workers");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("dataset_dir") != json_obj.end(), "Failed to find dataset_dir");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("usage") != json_obj.end(), "Failed to find usage");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("sampler") != json_obj.end(), "Failed to find sampler");
+  std::string dataset_dir = json_obj["dataset_dir"];
+  std::string usage = json_obj["usage"];
+  std::shared_ptr<SamplerObj> sampler;
+  RETURN_IF_NOT_OK(Serdes::ConstructSampler(json_obj["sampler"], &sampler));
+  std::shared_ptr<DatasetCache> cache = nullptr;
+  RETURN_IF_NOT_OK(DatasetCache::from_json(json_obj, &cache));
+  *ds = std::make_shared<MnistNode>(dataset_dir, usage, sampler, cache);
+  (*ds)->SetNumWorkers(json_obj["num_parallel_workers"]);
+  return Status::OK();
+}
+#endif
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.h
index 183ef75cea5..0e896f03b3f 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/mnist_node.h
@@ -78,6 +78,14 @@ class MnistNode : public MappableSourceNode {
   /// \return Status of the function
   Status to_json(nlohmann::json *out_json) override;
 
+#ifndef ENABLE_ANDROID
+  /// \brief Function to read dataset in json
+  /// \param[in] json_obj The JSON object to be deserialized
+  /// \param[out] ds Deserialized dataset
+  /// \return Status The status code returned
+  static Status from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
+#endif
+
   /// \brief Sampler getter
   /// \return SamplerObj of the current node
   std::shared_ptr<SamplerObj> Sampler() override { return sampler_; }
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/random_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/random_node.cc
index e1183c49389..fee51c2489b 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/random_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/random_node.cc
@@ -131,7 +131,7 @@ Status RandomNode::GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size
     *dataset_size = dataset_size_;
     return Status::OK();
   }
-  int64_t num_rows = total_rows_ != 0 ? total_rows_ : data_schema_->num_rows();
+  int64_t num_rows = total_rows_ != 0 ? total_rows_ : data_schema_->NumRows();
   *dataset_size = num_rows;
   dataset_size_ = *dataset_size;
   return Status::OK();
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/distributed_sampler_ir.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/distributed_sampler_ir.cc
index 5a4f2f7a2ad..41bb5b63284 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/distributed_sampler_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/distributed_sampler_ir.cc
@@ -106,6 +106,30 @@ Status DistributedSamplerObj::to_json(nlohmann::json *const out_json) {
   *out_json = args;
   return Status::OK();
 }
+
+#ifndef ENABLE_ANDROID
+Status DistributedSamplerObj::from_json(nlohmann::json json_obj, int64_t num_samples,
+                                        std::shared_ptr<SamplerObj> *sampler) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_shards") != json_obj.end(), "Failed to find num_shards");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("shard_id") != json_obj.end(), "Failed to find shard_id");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("shuffle") != json_obj.end(), "Failed to find shuffle");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("seed") != json_obj.end(), "Failed to find seed");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("offset") != json_obj.end(), "Failed to find offset");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("even_dist") != json_obj.end(), "Failed to find even_dist");
+  int64_t num_shards = json_obj["num_shards"];
+  int64_t shard_id = json_obj["shard_id"];
+  bool shuffle = json_obj["shuffle"];
+  uint32_t seed = json_obj["seed"];
+  int64_t offset = json_obj["offset"];
+  bool even_dist = json_obj["even_dist"];
+  *sampler =
+    std::make_shared<DistributedSamplerObj>(num_shards, shard_id, shuffle, num_samples, seed, offset, even_dist);
+  // Run common code in super class to add children samplers
+  RETURN_IF_NOT_OK(SamplerObj::from_json(json_obj, sampler));
+  return Status::OK();
+}
+#endif
+
 std::shared_ptr<SamplerObj> DistributedSamplerObj::SamplerCopy() {
   auto sampler =
     std::make_shared<DistributedSamplerObj>(num_shards_, shard_id_, shuffle_, num_samples_, seed_, offset_, even_dist_);
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/distributed_sampler_ir.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/distributed_sampler_ir.h
index 6a957e83128..fe3565719ad 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/distributed_sampler_ir.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/distributed_sampler_ir.h
@@ -56,6 +56,15 @@ class DistributedSamplerObj : public SamplerObj {
   /// \return Status of the function
   Status to_json(nlohmann::json *const out_json) override;
 
+#ifndef ENABLE_ANDROID
+  /// \brief Function for read sampler from JSON object
+  /// \param[in] json_obj JSON object to be read
+  /// \param[in] num_samples number of sample in the sampler
+  /// \param[out] sampler Sampler constructed from parameters in JSON object
+  /// \return Status of the function
+  static Status from_json(nlohmann::json json_obj, int64_t num_samples, std::shared_ptr<SamplerObj> *sampler);
+#endif
+
   Status ValidateParams() override;
 
   /// \brief Function to get the shard id of sampler
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/pk_sampler_ir.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/pk_sampler_ir.cc
index b1f8c3275b6..a14ebd6b41a 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/pk_sampler_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/pk_sampler_ir.cc
@@ -60,6 +60,19 @@ Status PKSamplerObj::to_json(nlohmann::json *const out_json) {
   return Status::OK();
 }
 
+#ifndef ENABLE_ANDROID
+Status PKSamplerObj::from_json(nlohmann::json json_obj, int64_t num_samples, std::shared_ptr<SamplerObj> *sampler) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_val") != json_obj.end(), "Failed to find num_val");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("shuffle") != json_obj.end(), "Failed to find shuffle");
+  int64_t num_val = json_obj["num_val"];
+  bool shuffle = json_obj["shuffle"];
+  *sampler = std::make_shared<PKSamplerObj>(num_val, shuffle, num_samples);
+  // Run common code in super class to add children samplers
+  RETURN_IF_NOT_OK(SamplerObj::from_json(json_obj, sampler));
+  return Status::OK();
+}
+#endif
+
 Status PKSamplerObj::SamplerBuild(std::shared_ptr<SamplerRT> *sampler) {
   // runtime sampler object
   *sampler = std::make_shared<dataset::PKSamplerRT>(num_val_, shuffle_, num_samples_);
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/pk_sampler_ir.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/pk_sampler_ir.h
index eb8f6222bdc..e2a805d37ba 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/pk_sampler_ir.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/pk_sampler_ir.h
@@ -55,6 +55,15 @@ class PKSamplerObj : public SamplerObj {
   /// \return Status of the function
   Status to_json(nlohmann::json *const out_json) override;
 
+#ifndef ENABLE_ANDROID
+  /// \brief Function for read sampler from JSON object
+  /// \param[in] json_obj JSON object to be read
+  /// \param[in] num_samples number of sample in the sampler
+  /// \param[out] sampler Sampler constructed from parameters in JSON object
+  /// \return Status of the function
+  static Status from_json(nlohmann::json json_obj, int64_t num_samples, std::shared_ptr<SamplerObj> *sampler);
+#endif
+
   Status ValidateParams() override;
 
  private:
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/random_sampler_ir.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/random_sampler_ir.cc
index 86828d900e0..48b004b9b1c 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/random_sampler_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/random_sampler_ir.cc
@@ -56,6 +56,20 @@ Status RandomSamplerObj::to_json(nlohmann::json *const out_json) {
   return Status::OK();
 }
 
+#ifndef ENABLE_ANDROID
+Status RandomSamplerObj::from_json(nlohmann::json json_obj, int64_t num_samples, std::shared_ptr<SamplerObj> *sampler) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("replacement") != json_obj.end(), "Failed to find replacement");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("reshuffle_each_epoch") != json_obj.end(),
+                               "Failed to find reshuffle_each_epoch");
+  bool replacement = json_obj["replacement"];
+  bool reshuffle_each_epoch = json_obj["reshuffle_each_epoch"];
+  *sampler = std::make_shared<RandomSamplerObj>(replacement, num_samples, reshuffle_each_epoch);
+  // Run common code in super class to add children samplers
+  RETURN_IF_NOT_OK(SamplerObj::from_json(json_obj, sampler));
+  return Status::OK();
+}
+#endif
+
 Status RandomSamplerObj::SamplerBuild(std::shared_ptr<SamplerRT> *sampler) {
   // runtime sampler object
   *sampler = std::make_shared<dataset::RandomSamplerRT>(replacement_, num_samples_, reshuffle_each_epoch_);
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/random_sampler_ir.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/random_sampler_ir.h
index e43089353fe..1af197a6f2e 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/random_sampler_ir.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/random_sampler_ir.h
@@ -55,6 +55,15 @@ class RandomSamplerObj : public SamplerObj {
   /// \return Status of the function
   Status to_json(nlohmann::json *const out_json) override;
 
+#ifndef ENABLE_ANDROID
+  /// \brief Function for read sampler from JSON object
+  /// \param[in] json_obj JSON object to be read
+  /// \param[in] num_samples number of sample in the sampler
+  /// \param[out] sampler Sampler constructed from parameters in JSON object
+  /// \return Status of the function
+  static Status from_json(nlohmann::json json_obj, int64_t num_samples, std::shared_ptr<SamplerObj> *sampler);
+#endif
+
   Status ValidateParams() override;
 
  private:
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/samplers_ir.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/samplers_ir.cc
index 9f7e6bf3ebf..f03f8eeb09c 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/samplers_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/samplers_ir.cc
@@ -16,6 +16,9 @@
 
 #include "minddata/dataset/engine/ir/datasetops/source/samplers/samplers_ir.h"
 #include "minddata/dataset/engine/datasetops/source/sampler/sampler.h"
+#ifndef ENABLE_ANDROID
+#include "minddata/dataset/engine/serdes.h"
+#endif
 
 #include "minddata/dataset/core/config_manager.h"
 
@@ -73,5 +76,15 @@ Status SamplerObj::to_json(nlohmann::json *const out_json) {
   return Status::OK();
 }
 
+#ifndef ENABLE_ANDROID
+Status SamplerObj::from_json(nlohmann::json json_obj, std::shared_ptr<SamplerObj> *parent_sampler) {
+  for (nlohmann::json child : json_obj["child_sampler"]) {
+    std::shared_ptr<SamplerObj> child_sampler;
+    RETURN_IF_NOT_OK(Serdes::ConstructSampler(child, &child_sampler));
+    (*parent_sampler)->AddChildSampler(child_sampler);
+  }
+  return Status::OK();
+}
+#endif
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/samplers_ir.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/samplers_ir.h
index a3a1e666629..df2c80c08f3 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/samplers_ir.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/samplers_ir.h
@@ -67,6 +67,14 @@ class SamplerObj {
 
   virtual Status to_json(nlohmann::json *const out_json);
 
+#ifndef ENABLE_ANDROID
+  /// \brief Function to construct children samplers
+  /// \param[in] json_obj The JSON object to be deserialized
+  /// \param[out] parent_sampler given parent sampler, output constructed parent sampler with children samplers added
+  /// \return Status The status code returned
+  static Status from_json(nlohmann::json json_obj, std::shared_ptr<SamplerObj> *parent_sampler);
+#endif
+
   std::vector<std::shared_ptr<SamplerObj>> GetChild() { return children_; }
 
 #ifndef ENABLE_ANDROID
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/sequential_sampler_ir.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/sequential_sampler_ir.cc
index df4ddab65c4..3fe80140d48 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/sequential_sampler_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/sequential_sampler_ir.cc
@@ -61,6 +61,18 @@ Status SequentialSamplerObj::to_json(nlohmann::json *const out_json) {
   return Status::OK();
 }
 
+#ifndef ENABLE_ANDROID
+Status SequentialSamplerObj::from_json(nlohmann::json json_obj, int64_t num_samples,
+                                       std::shared_ptr<SamplerObj> *sampler) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("start_index") != json_obj.end(), "Failed to find start_index");
+  int64_t start_index = json_obj["start_index"];
+  *sampler = std::make_shared<SequentialSamplerObj>(start_index, num_samples);
+  // Run common code in super class to add children samplers
+  RETURN_IF_NOT_OK(SamplerObj::from_json(json_obj, sampler));
+  return Status::OK();
+}
+#endif
+
 Status SequentialSamplerObj::SamplerBuild(std::shared_ptr<SamplerRT> *sampler) {
   // runtime sampler object
   *sampler = std::make_shared<dataset::SequentialSamplerRT>(start_index_, num_samples_);
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/sequential_sampler_ir.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/sequential_sampler_ir.h
index 0ad0cd1f4b8..b33957f36f0 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/sequential_sampler_ir.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/sequential_sampler_ir.h
@@ -55,6 +55,15 @@ class SequentialSamplerObj : public SamplerObj {
   /// \return Status of the function
   Status to_json(nlohmann::json *const out_json) override;
 
+#ifndef ENABLE_ANDROID
+  /// \brief Function for read sampler from JSON object
+  /// \param[in] json_obj JSON object to be read
+  /// \param[in] num_samples number of sample in the sampler
+  /// \param[out] sampler Sampler constructed from parameters in JSON object
+  /// \return Status of the function
+  static Status from_json(nlohmann::json json_obj, int64_t num_samples, std::shared_ptr<SamplerObj> *sampler);
+#endif
+
   Status ValidateParams() override;
 
  private:
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/subset_random_sampler_ir.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/subset_random_sampler_ir.cc
index 504a4862e12..cebe26ed615 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/subset_random_sampler_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/subset_random_sampler_ir.cc
@@ -63,6 +63,19 @@ Status SubsetRandomSamplerObj::to_json(nlohmann::json *const out_json) {
   *out_json = args;
   return Status::OK();
 }
+
+#ifndef ENABLE_ANDROID
+Status SubsetRandomSamplerObj::from_json(nlohmann::json json_obj, int64_t num_samples,
+                                         std::shared_ptr<SamplerObj> *sampler) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("indices") != json_obj.end(), "Failed to find indices");
+  std::vector<int64_t> indices = json_obj["indices"];
+  *sampler = std::make_shared<SubsetRandomSamplerObj>(indices, num_samples);
+  // Run common code in super class to add children samplers
+  RETURN_IF_NOT_OK(SamplerObj::from_json(json_obj, sampler));
+  return Status::OK();
+}
+#endif
+
 std::shared_ptr<SamplerObj> SubsetRandomSamplerObj::SamplerCopy() {
   auto sampler = std::make_shared<SubsetRandomSamplerObj>(indices_, num_samples_);
   for (const auto &child : children_) {
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/subset_random_sampler_ir.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/subset_random_sampler_ir.h
index 8360d7575cb..d11e0f04e61 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/subset_random_sampler_ir.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/subset_random_sampler_ir.h
@@ -45,6 +45,10 @@ class SubsetRandomSamplerObj : public SubsetSamplerObj {
 
   Status to_json(nlohmann::json *const out_json) override;
 
+#ifndef ENABLE_ANDROID
+  static Status from_json(nlohmann::json json_obj, int64_t num_samples, std::shared_ptr<SamplerObj> *sampler);
+#endif
+
   Status SamplerBuild(std::shared_ptr<SamplerRT> *sampler) override;
 
   std::shared_ptr<SamplerObj> SamplerCopy() override;
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/subset_sampler_ir.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/subset_sampler_ir.cc
index 9cde95a3d50..420babf365b 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/subset_sampler_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/subset_sampler_ir.cc
@@ -72,6 +72,17 @@ Status SubsetSamplerObj::to_json(nlohmann::json *const out_json) {
   return Status::OK();
 }
 
+#ifndef ENABLE_ANDROID
+Status SubsetSamplerObj::from_json(nlohmann::json json_obj, int64_t num_samples, std::shared_ptr<SamplerObj> *sampler) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("indices") != json_obj.end(), "Failed to find indices");
+  std::vector<int64_t> indices = json_obj["indices"];
+  *sampler = std::make_shared<SubsetSamplerObj>(indices, num_samples);
+  // Run common code in super class to add children samplers
+  RETURN_IF_NOT_OK(SamplerObj::from_json(json_obj, sampler));
+  return Status::OK();
+}
+#endif
+
 std::shared_ptr<SamplerObj> SubsetSamplerObj::SamplerCopy() {
   auto sampler = std::make_shared<SubsetSamplerObj>(indices_, num_samples_);
   for (const auto &child : children_) {
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/subset_sampler_ir.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/subset_sampler_ir.h
index e72e344c67d..db55644825f 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/subset_sampler_ir.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/subset_sampler_ir.h
@@ -55,6 +55,15 @@ class SubsetSamplerObj : public SamplerObj {
   /// \return Status of the function
   Status to_json(nlohmann::json *const out_json) override;
 
+#ifndef ENABLE_ANDROID
+  /// \brief Function for read sampler from JSON object
+  /// \param[in] json_obj JSON object to be read
+  /// \param[in] num_samples number of sample in the sampler
+  /// \param[out] sampler Sampler constructed from parameters in JSON object
+  /// \return Status of the function
+  static Status from_json(nlohmann::json json_obj, int64_t num_samples, std::shared_ptr<SamplerObj> *sampler);
+#endif
+
   Status ValidateParams() override;
 
  protected:
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/weighted_random_sampler_ir.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/weighted_random_sampler_ir.cc
index 58aa745f570..c78dbb14e76 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/weighted_random_sampler_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/weighted_random_sampler_ir.cc
@@ -63,6 +63,20 @@ Status WeightedRandomSamplerObj::to_json(nlohmann::json *const out_json) {
   return Status::OK();
 }
 
+#ifndef ENABLE_ANDROID
+Status WeightedRandomSamplerObj::from_json(nlohmann::json json_obj, int64_t num_samples,
+                                           std::shared_ptr<SamplerObj> *sampler) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("weights") != json_obj.end(), "Failed to find weights");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("replacement") != json_obj.end(), "Failed to find replacement");
+  std::vector<double> weights = json_obj["weights"];
+  bool replacement = json_obj["replacement"];
+  *sampler = std::make_shared<WeightedRandomSamplerObj>(weights, num_samples, replacement);
+  // Run common code in super class to add children samplers
+  RETURN_IF_NOT_OK(SamplerObj::from_json(json_obj, sampler));
+  return Status::OK();
+}
+#endif
+
 Status WeightedRandomSamplerObj::SamplerBuild(std::shared_ptr<SamplerRT> *sampler) {
   *sampler = std::make_shared<dataset::WeightedRandomSamplerRT>(weights_, num_samples_, replacement_);
   Status s = BuildChildren(sampler);
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/weighted_random_sampler_ir.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/weighted_random_sampler_ir.h
index 9661c32199c..4c966a92ff5 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/weighted_random_sampler_ir.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/samplers/weighted_random_sampler_ir.h
@@ -51,6 +51,15 @@ class WeightedRandomSamplerObj : public SamplerObj {
   /// \return Status of the function
   Status to_json(nlohmann::json *const out_json) override;
 
+#ifndef ENABLE_ANDROID
+  /// \brief Function for read sampler from JSON object
+  /// \param[in] json_obj JSON object to be read
+  /// \param[in] num_samples number of sample in the sampler
+  /// \param[out] sampler Sampler constructed from parameters in JSON object
+  /// \return Status of the function
+  static Status from_json(nlohmann::json json_obj, int64_t num_samples, std::shared_ptr<SamplerObj> *sampler);
+#endif
+
   Status ValidateParams() override;
 
  private:
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/text_file_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/text_file_node.cc
index 84b069cc3a4..0e049f61ea3 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/text_file_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/text_file_node.cc
@@ -153,6 +153,26 @@ Status TextFileNode::to_json(nlohmann::json *out_json) {
   return Status::OK();
 }
 
+Status TextFileNode::from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_parallel_workers") != json_obj.end(),
+                               "Failed to find num_parallel_workers");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("dataset_files") != json_obj.end(), "Failed to find dataset_files");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_samples") != json_obj.end(), "Failed to find num_samples");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("shuffle") != json_obj.end(), "Failed to find shuffle");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_shards") != json_obj.end(), "Failed to find num_shards");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("shard_id") != json_obj.end(), "Failed to find shard_id");
+  std::vector<std::string> dataset_files = json_obj["dataset_files"];
+  int64_t num_samples = json_obj["num_samples"];
+  ShuffleMode shuffle = static_cast<ShuffleMode>(json_obj["shuffle"]);
+  int32_t num_shards = json_obj["num_shards"];
+  int32_t shard_id = json_obj["shard_id"];
+  std::shared_ptr<DatasetCache> cache = nullptr;
+  RETURN_IF_NOT_OK(DatasetCache::from_json(json_obj, &cache));
+  *ds = std::make_shared<TextFileNode>(dataset_files, num_samples, shuffle, num_shards, shard_id, cache);
+  (*ds)->SetNumWorkers(json_obj["num_parallel_workers"]);
+  return Status::OK();
+}
+
 // Note: The following two functions are common among NonMappableSourceNode and should be promoted to its parent class.
 // TextFile by itself is a non-mappable dataset that does not support sampling.
 // However, if a cache operator is injected at some other place higher in the tree, that cache can
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/text_file_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/text_file_node.h
index 9cea20f09aa..81507dc8441 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/text_file_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/text_file_node.h
@@ -83,6 +83,12 @@ class TextFileNode : public NonMappableSourceNode {
   /// \return Status of the function
   Status to_json(nlohmann::json *out_json) override;
 
+  /// \brief Function to read dataset in json
+  /// \param[in] json_obj The JSON object to be deserialized
+  /// \param[out] ds Deserialized dataset
+  /// \return Status The status code returned
+  static Status from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
+
   /// \brief TextFile by itself is a non-mappable dataset that does not support sampling.
   ///     However, if a cache operator is injected at some other place higher in the tree, that cache can
   ///     inherit this sampler from the leaf, providing sampling support from the caching layer.
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.cc
index b9bf8fec4d9..4ccfe0ade04 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.cc
@@ -22,6 +22,7 @@
 #include <utility>
 #include <vector>
 
+#include "debug/common.h"
 #include "minddata/dataset/engine/datasetops/source/tf_reader_op.h"
 #include "minddata/dataset/engine/jagged_connector.h"
 #include "minddata/dataset/engine/opt/pass.h"
@@ -58,13 +59,9 @@ Status TFRecordNode::ValidateParams() {
   }
 
   for (const auto &f : dataset_files_) {
-    Path dataset_file(f);
-    if (!dataset_file.Exists()) {
-      std::string err_msg = "TFRecordNode: dataset file: [" + f + "] is invalid or does not exist.";
-      MS_LOG(ERROR) << err_msg;
-
-      return Status(StatusCode::kMDSyntaxError, __LINE__, __FILE__, err_msg);
-    }
+    auto realpath = Common::GetRealPath(f);
+    CHECK_FAIL_RETURN_UNEXPECTED(realpath.has_value(),
+                                 "TFRecordNode: dataset file: [" + f + "] is invalid or does not exist.");
   }
 
   if (num_samples_ < 0) {
@@ -107,6 +104,7 @@ Status TFRecordNode::ValidateParams() {
 
 // Function to build TFRecordNode
 Status TFRecordNode::Build(std::vector<std::shared_ptr<DatasetOp>> *const node_ops) {
+  RETURN_UNEXPECTED_IF_NULL(node_ops);
   // Sort the datasets file in a lexicographical order
   std::vector<std::string> sorted_dir_files = dataset_files_;
   std::sort(sorted_dir_files.begin(), sorted_dir_files.end());
@@ -165,6 +163,8 @@ Status TFRecordNode::GetShardId(int32_t *const shard_id) {
 // Get Dataset size
 Status TFRecordNode::GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_getter, bool estimate,
                                     int64_t *dataset_size) {
+  RETURN_UNEXPECTED_IF_NULL(size_getter);
+  RETURN_UNEXPECTED_IF_NULL(dataset_size);
   if (dataset_size_ > 0) {
     *dataset_size = dataset_size_;
     return Status::OK();
@@ -189,6 +189,7 @@ Status TFRecordNode::GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &si
 
 // Get the file list of the specific shard ID
 Status TFRecordNode::GetShardFileList(std::vector<std::string> *shard_filenames) {
+  RETURN_UNEXPECTED_IF_NULL(shard_filenames);
   if (!shard_filenames->empty()) {
     RETURN_STATUS_UNEXPECTED("The initial file list must be empty.");
   }
@@ -201,6 +202,7 @@ Status TFRecordNode::GetShardFileList(std::vector<std::string> *shard_filenames)
 }
 
 Status TFRecordNode::to_json(nlohmann::json *out_json) {
+  RETURN_UNEXPECTED_IF_NULL(out_json);
   nlohmann::json args;
   args["num_parallel_workers"] = num_workers_;
   args["dataset_files"] = dataset_files_;
@@ -229,12 +231,40 @@ Status TFRecordNode::to_json(nlohmann::json *out_json) {
   return Status::OK();
 }
 
+Status TFRecordNode::from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_parallel_workers") != json_obj.end(),
+                               "Failed to find num_parallel_workers");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("dataset_files") != json_obj.end(), "Failed to find dataset_files");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("schema") != json_obj.end(), "Failed to find schema");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("columns_list") != json_obj.end(), "Failed to find columns_list");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_samples") != json_obj.end(), "Failed to find num_samples");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("shuffle") != json_obj.end(), "Failed to find shuffle");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_shards") != json_obj.end(), "Failed to find num_shards");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("shard_id") != json_obj.end(), "Failed to find shard_id");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("shard_equal_rows") != json_obj.end(), "Failed to find shard_equal_rows");
+  std::vector<std::string> dataset_files = json_obj["dataset_files"];
+  std::string schema = json_obj["schema"];
+  std::vector<std::string> columns_list = json_obj["columns_list"];
+  int64_t num_samples = json_obj["num_samples"];
+  ShuffleMode shuffle = static_cast<ShuffleMode>(json_obj["shuffle"]);
+  int32_t num_shards = json_obj["num_shards"];
+  int32_t shard_id = json_obj["shard_id"];
+  bool shard_equal_rows = json_obj["shard_equal_rows"];
+  std::shared_ptr<DatasetCache> cache = nullptr;
+  RETURN_IF_NOT_OK(DatasetCache::from_json(json_obj, &cache));
+  *ds = std::make_shared<TFRecordNode>(dataset_files, schema, columns_list, num_samples, shuffle, num_shards, shard_id,
+                                       shard_equal_rows, cache);
+  (*ds)->SetNumWorkers(json_obj["num_parallel_workers"]);
+  return Status::OK();
+}
+
 // Note: The following two functions are common among NonMappableSourceNode and should be promoted to its parent class.
 // TFRecord by itself is a non-mappable dataset that does not support sampling.
 // However, if a cache operator is injected at some other place higher in the tree, that cache can
 // inherit this sampler from the leaf, providing sampling support from the caching layer.
 // That is why we setup the sampler for a leaf node that does not use sampling.
 Status TFRecordNode::SetupSamplerForCache(std::shared_ptr<SamplerObj> *sampler) {
+  RETURN_UNEXPECTED_IF_NULL(sampler);
   bool shuffle_files = (shuffle_ == ShuffleMode::kGlobal || shuffle_ == ShuffleMode::kFiles);
   *sampler = SelectSampler(num_samples_, shuffle_files, num_shards_, shard_id_);
   return Status::OK();
@@ -254,12 +284,16 @@ Status TFRecordNode::MakeSimpleProducer() {
 
 // Visitor accepting method for IRNodePass
 Status TFRecordNode::Accept(IRNodePass *p, bool *const modified) {
+  RETURN_UNEXPECTED_IF_NULL(p);
+  RETURN_UNEXPECTED_IF_NULL(modified);
   // Downcast shared pointer then call visitor
   return p->Visit(shared_from_base<TFRecordNode>(), modified);
 }
 
 // Visitor accepting method for IRNodePass
 Status TFRecordNode::AcceptAfter(IRNodePass *const p, bool *const modified) {
+  RETURN_UNEXPECTED_IF_NULL(p);
+  RETURN_UNEXPECTED_IF_NULL(modified);
   // Downcast shared pointer then call visitor
   return p->VisitAfter(shared_from_base<TFRecordNode>(), modified);
 }
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.h
index 9a9ccfc0266..9c7e301d73f 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/tf_record_node.h
@@ -126,6 +126,12 @@ class TFRecordNode : public NonMappableSourceNode {
   /// \return Status of the function
   Status to_json(nlohmann::json *out_json) override;
 
+  /// \brief Function to read dataset in json
+  /// \param[in] json_obj The JSON object to be deserialized
+  /// \param[out] ds Deserialized dataset
+  /// \return Status The status code returned
+  static Status from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
+
   /// \brief TFRecord by itself is a non-mappable dataset that does not support sampling.
   ///     However, if a cache operator is injected at some other place higher in the tree, that cache can
   ///     inherit this sampler from the leaf, providing sampling support from the caching layer.
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/voc_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/voc_node.cc
index daef556f75e..ea40c5495cb 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/voc_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/voc_node.cc
@@ -23,6 +23,9 @@
 #include <vector>
 
 #include "minddata/dataset/engine/datasetops/source/voc_op.h"
+#ifndef ENABLE_ANDROID
+#include "minddata/dataset/engine/serdes.h"
+#endif
 
 #include "minddata/dataset/util/status.h"
 namespace mindspore {
@@ -169,6 +172,7 @@ Status VOCNode::to_json(nlohmann::json *out_json) {
   args["usage"] = usage_;
   args["class_indexing"] = class_index_;
   args["decode"] = decode_;
+  args["extra_metadata"] = extra_metadata_;
   if (cache_ != nullptr) {
     nlohmann::json cache_args;
     RETURN_IF_NOT_OK(cache_->to_json(&cache_args));
@@ -177,5 +181,38 @@ Status VOCNode::to_json(nlohmann::json *out_json) {
   *out_json = args;
   return Status::OK();
 }
+
+#ifndef ENABLE_ANDROID
+Status VOCNode::from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_parallel_workers") != json_obj.end(),
+                               "Failed to find num_parallel_workers");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("dataset_dir") != json_obj.end(), "Failed to find dataset_dir");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("task") != json_obj.end(), "Failed to find task");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("usage") != json_obj.end(), "Failed to find usage");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("class_indexing") != json_obj.end(), "Failed to find class_indexing");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("decode") != json_obj.end(), "Failed to find decode");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("sampler") != json_obj.end(), "Failed to find sampler");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("extra_metadata") != json_obj.end(), "Failed to find extra_metadata");
+  std::string dataset_dir = json_obj["dataset_dir"];
+  std::string task = json_obj["task"];
+  std::string usage = json_obj["usage"];
+  std::map<std::string, int32_t> class_indexing;
+  nlohmann::json class_map = json_obj["class_indexing"];
+  for (const auto &class_map_child : class_map) {
+    std::string class_ = class_map_child[0];
+    int32_t indexing = class_map_child[1];
+    class_indexing.insert({class_, indexing});
+  }
+  bool decode = json_obj["decode"];
+  std::shared_ptr<SamplerObj> sampler;
+  RETURN_IF_NOT_OK(Serdes::ConstructSampler(json_obj["sampler"], &sampler));
+  bool extra_metadata = json_obj["extra_metadata"];
+  std::shared_ptr<DatasetCache> cache = nullptr;
+  RETURN_IF_NOT_OK(DatasetCache::from_json(json_obj, &cache));
+  *ds = std::make_shared<VOCNode>(dataset_dir, task, usage, class_indexing, decode, sampler, cache, extra_metadata);
+  (*ds)->SetNumWorkers(json_obj["num_parallel_workers"]);
+  return Status::OK();
+}
+#endif
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/voc_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/voc_node.h
index ba3268b34e4..0fd0b4e5485 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/voc_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/source/voc_node.h
@@ -83,6 +83,14 @@ class VOCNode : public MappableSourceNode {
   /// \return Status of the function
   Status to_json(nlohmann::json *out_json) override;
 
+#ifndef ENABLE_ANDROID
+  /// \brief Function to read dataset in json
+  /// \param[in] json_obj The JSON object to be deserialized
+  /// \param[out] ds Deserialized dataset
+  /// \return Status The status code returned
+  static Status from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
+#endif
+
   /// \brief Sampler getter
   /// \return SamplerObj of the current node
   std::shared_ptr<SamplerObj> Sampler() override { return sampler_; }
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/take_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/take_node.cc
index a3d8752e1ad..36dbeb37722 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/take_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/take_node.cc
@@ -91,5 +91,13 @@ Status TakeNode::to_json(nlohmann::json *out_json) {
   *out_json = args;
   return Status::OK();
 }
+
+Status TakeNode::from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> ds,
+                           std::shared_ptr<DatasetNode> *result) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("count") != json_obj.end(), "Failed to find count");
+  int32_t count = json_obj["count"];
+  *result = std::make_shared<TakeNode>(ds, count);
+  return Status::OK();
+}
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/take_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/take_node.h
index 598ba445983..c6ff10c41f3 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/take_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/take_node.h
@@ -88,6 +88,14 @@ class TakeNode : public DatasetNode {
   /// \return Status of the function
   Status to_json(nlohmann::json *out_json) override;
 
+  /// \brief Function for read dataset operation from json
+  /// \param[in] json_obj The JSON object to be deserialized
+  /// \param[in] ds dataset node constructed
+  /// \param[out] result Deserialized dataset after the operation
+  /// \return Status The status code returned
+  static Status from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> ds,
+                          std::shared_ptr<DatasetNode> *result);
+
  private:
   int32_t take_count_;
 };
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/transfer_node.cc b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/transfer_node.cc
index 2d0bcc6d38d..c45be3031a6 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/transfer_node.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/transfer_node.cc
@@ -126,5 +126,25 @@ Status TransferNode::to_json(nlohmann::json *out_json) {
   *out_json = args;
   return Status::OK();
 }
+
+Status TransferNode::from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> ds,
+                               std::shared_ptr<DatasetNode> *result) {
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("queue_name") != json_obj.end(), "Failed to find queue_name");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("device_type") != json_obj.end(), "Failed to find device_type");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("device_id") != json_obj.end(), "Failed to find device_id");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("send_epoch_end") != json_obj.end(), "Failed to find send_epoch_end");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("total_batch") != json_obj.end(), "Failed to find total_batch");
+  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("create_data_info_queue") != json_obj.end(),
+                               "Failed to find create_data_info_queue");
+  std::string queue_name = json_obj["queue_name"];
+  std::string device_type = json_obj["device_type"];
+  int32_t device_id = json_obj["device_id"];
+  bool send_epoch_end = json_obj["send_epoch_end"];
+  int32_t total_batch = json_obj["total_batch"];
+  bool create_data_info_queue = json_obj["create_data_info_queue"];
+  *result = std::make_shared<TransferNode>(ds, queue_name, device_type, device_id, send_epoch_end, total_batch,
+                                           create_data_info_queue);
+  return Status::OK();
+}
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/transfer_node.h b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/transfer_node.h
index b136ea71bfa..411a40429d6 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/transfer_node.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/ir/datasetops/transfer_node.h
@@ -84,6 +84,14 @@ class TransferNode : public DatasetNode {
   /// \return Status of the function
   Status to_json(nlohmann::json *out_json) override;
 
+  /// \brief Function for read dataset operation from json
+  /// \param[in] json_obj The JSON object to be deserialized
+  /// \param[in] ds dataset node constructed
+  /// \param[out] result Deserialized dataset after the operation
+  /// \return Status The status code returned
+  static Status from_json(nlohmann::json json_obj, std::shared_ptr<DatasetNode> ds,
+                          std::shared_ptr<DatasetNode> *result);
+
  private:
   std::string queue_name_;
   int32_t device_id_;
diff --git a/mindspore/ccsrc/minddata/dataset/engine/jagged_connector.h b/mindspore/ccsrc/minddata/dataset/engine/jagged_connector.h
index dea086fe744..1a610b3f177 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/jagged_connector.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/jagged_connector.h
@@ -43,6 +43,7 @@ class JaggedConnector : public Connector<TensorRow> {
   }
 
   Status Pop(int32_t worker_id, TensorRow *result) noexcept override {
+    RETURN_UNEXPECTED_IF_NULL(result);
     {
       MS_ASSERT(worker_id < num_consumers_);
       std::unique_lock<std::mutex> lock(m_);
@@ -53,7 +54,7 @@ class JaggedConnector : public Connector<TensorRow> {
       }
 
       RETURN_IF_NOT_OK(queues_[pop_from_]->PopFront(result));
-      if (result->eoe()) {
+      if (result != nullptr && result->eoe()) {
         is_queue_finished_[pop_from_] = true;
       }
 
diff --git a/mindspore/ccsrc/minddata/dataset/engine/opt/optional/tensor_op_fusion_pass.cc b/mindspore/ccsrc/minddata/dataset/engine/opt/optional/tensor_op_fusion_pass.cc
index e211f03b228..753fad75296 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/opt/optional/tensor_op_fusion_pass.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/opt/optional/tensor_op_fusion_pass.cc
@@ -32,12 +32,14 @@ namespace mindspore {
 namespace dataset {
 
 Status TensorOpFusionPass::Visit(std::shared_ptr<MapNode> node, bool *const modified) {
+  RETURN_UNEXPECTED_IF_NULL(node);
+  RETURN_UNEXPECTED_IF_NULL(modified);
   std::vector<std::shared_ptr<TensorOperation>> ops = node->operations();
 
   // start temporary code, to deal with pre-built TensorOperation
   std::vector<std::string> pattern = {kDecodeOp, kRandomCropAndResizeOp};
   auto itr = std::search(ops.begin(), ops.end(), pattern.begin(), pattern.end(),
-                         [](auto op, const std::string &nm) { return op->Name() == nm; });
+                         [](auto op, const std::string &nm) { return op != nullptr ? op->Name() == nm : false; });
   if (itr != ops.end()) {
     MS_LOG(WARNING) << "Fusing pre-build Decode and RandomCropResize into one pre-build.";
     auto fused_op = dynamic_cast<RandomCropAndResizeOp *>((*(itr + 1))->Build().get());
@@ -52,7 +54,7 @@ Status TensorOpFusionPass::Visit(std::shared_ptr<MapNode> node, bool *const modi
   // logic below is for non-prebuilt TensorOperation
   pattern = {vision::kDecodeOperation, vision::kRandomResizedCropOperation};
   itr = std::search(ops.begin(), ops.end(), pattern.begin(), pattern.end(),
-                    [](auto op, const std::string &nm) { return op->Name() == nm; });
+                    [](auto op, const std::string &nm) { return op != nullptr ? op->Name() == nm : false; });
 
   // return here if no pattern is found
   RETURN_OK_IF_TRUE(itr == ops.end());
diff --git a/mindspore/ccsrc/minddata/dataset/engine/opt/post/auto_worker_pass.cc b/mindspore/ccsrc/minddata/dataset/engine/opt/post/auto_worker_pass.cc
index ead6bd4d69f..da4d0887321 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/opt/post/auto_worker_pass.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/opt/post/auto_worker_pass.cc
@@ -27,6 +27,8 @@ namespace dataset {
 
 // this will become the RootNode:DatasetNode when it is turned on
 Status AutoWorkerPass::RunOnTree(std::shared_ptr<DatasetNode> root_ir, bool *const modified) {
+  RETURN_UNEXPECTED_IF_NULL(root_ir);
+  RETURN_UNEXPECTED_IF_NULL(modified);
   uint8_t config = GlobalContext::config_manager()->get_auto_worker_config();
 
   OpWeightPass pass(kOpWeightConfigs[config < kOpWeightConfigs.size() ? config : 0]);
@@ -46,6 +48,8 @@ Status AutoWorkerPass::RunOnTree(std::shared_ptr<DatasetNode> root_ir, bool *con
   // get the maximum weight of all the ops, this value is used to ensure the ratio of num_workers between ops
   float max_weight = 0;
   for (const auto &p : pass.weight_profile_) max_weight = std::max(max_weight, p.second);
+
+  CHECK_FAIL_RETURN_UNEXPECTED(max_weight != 0, "Internal error, doesn't allow divide zero.");
   RETURN_IF_NOT_OK(pass.Run(root_ir, modified));
   constexpr size_t max_num_ops = 3;
   if (pass.parallel_ops_.size() > max_num_ops) {
@@ -53,6 +57,7 @@ Status AutoWorkerPass::RunOnTree(std::shared_ptr<DatasetNode> root_ir, bool *con
                     << "1 batch and 1 map. AutoNumWorker may not be optimal for usage on complex pipelines.";
   }
 
+  CHECK_FAIL_RETURN_UNEXPECTED(pass.weight_sum_ != 0, "Internal error, doesn't allow divide zero.");
   for (auto &p : pass.parallel_ops_) {
     // get the num worker via the weight ratio
     int32_t num_workers = std::ceil((thread_cnt_ * p.second) / (pass.weight_sum_ * num_shards));
diff --git a/mindspore/ccsrc/minddata/dataset/engine/opt/post/repeat_pass.cc b/mindspore/ccsrc/minddata/dataset/engine/opt/post/repeat_pass.cc
index 778c1262b5d..a7d98ccc361 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/opt/post/repeat_pass.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/opt/post/repeat_pass.cc
@@ -33,6 +33,8 @@ RepeatPass::RepeatPass()
 
 // Identifies the subtree below this node as being in a repeated path of the tree.
 Status RepeatPass::Visit(std::shared_ptr<RepeatNode> node, bool *const modified) {
+  RETURN_UNEXPECTED_IF_NULL(node);
+  RETURN_UNEXPECTED_IF_NULL(modified);
   // If this is an infinite repeat under infinite repeat/epoch, adjust current num_repeats_.
   // Otherwise, after multiplication it would become positive and this repeat wouldn't run infinitely.
   if (node->Count() == DatasetOp::kInfiniteRepeat && num_repeats_ < 0) {
@@ -56,6 +58,8 @@ Status RepeatPass::Visit(std::shared_ptr<RepeatNode> node, bool *const modified)
 
 // Identifies the subtree below this node as being in a repeated path of the tree.
 Status RepeatPass::Visit(std::shared_ptr<EpochCtrlNode> node, bool *const modified) {
+  RETURN_UNEXPECTED_IF_NULL(node);
+  RETURN_UNEXPECTED_IF_NULL(modified);
   // Get the total number of epochs from the EpochCtrlOp parameter
   num_epochs_ = node->Count();
   // Every node below this EpochCtrlOp should be repeated for num_epochs_ times.
@@ -69,6 +73,8 @@ Status RepeatPass::Visit(std::shared_ptr<EpochCtrlNode> node, bool *const modifi
 #ifndef ENABLE_ANDROID
 // Identifies the subtree below this node as being in a cache merge path
 Status RepeatPass::Visit(std::shared_ptr<CacheMergeNode> node, bool *const modified) {
+  RETURN_UNEXPECTED_IF_NULL(node);
+  RETURN_UNEXPECTED_IF_NULL(modified);
   // Turn on the flag that we're under a merge op
   is_merge_ = true;
   return Status::OK();
@@ -76,6 +82,8 @@ Status RepeatPass::Visit(std::shared_ptr<CacheMergeNode> node, bool *const modif
 
 // Identifies the subtree below this node as being cached
 Status RepeatPass::Visit(std::shared_ptr<CacheNode> node, bool *const modified) {
+  RETURN_UNEXPECTED_IF_NULL(node);
+  RETURN_UNEXPECTED_IF_NULL(modified);
   // Turn on the flag that we're under a merge op
   is_cached_ = true;
   return Status::OK();
@@ -84,6 +92,8 @@ Status RepeatPass::Visit(std::shared_ptr<CacheNode> node, bool *const modified)
 
 // Hooks up any identified eoe nodes under this repeat.
 Status RepeatPass::VisitAfter(std::shared_ptr<RepeatNode> node, bool *const modified) {
+  RETURN_UNEXPECTED_IF_NULL(node);
+  RETURN_UNEXPECTED_IF_NULL(modified);
   // We are a repeat op in the descendant tree of a merge op, then we take the saved lookup up
   // and set its total repeats. It is important that the op is removed from the save area,
   // because the merge op above us may also take action on it later for a different case when
@@ -103,12 +113,16 @@ Status RepeatPass::VisitAfter(std::shared_ptr<RepeatNode> node, bool *const modi
   // The total repeats of nodes above this Repeat(n) have nothing to do with this RepeatOp's parameter n.
   // But num_repeats_ has been multiplied by n during this Repeat(n)'s PreRunOnNode,
   // so we divide num_repeats_ by n to be able to correctly set total repeats for nodes above this RepeatOp.
+  CHECK_FAIL_RETURN_UNEXPECTED(node->Count() != 0, "Invalid data, the number of node can't be 0.");
   num_repeats_ /= node->Count();
   return Status::OK();
 }
 
 // Hooks up any identified eoe nodes under this repeat.
 Status RepeatPass::VisitAfter(std::shared_ptr<EpochCtrlNode> node, bool *const modified) {
+  RETURN_UNEXPECTED_IF_NULL(node);
+  RETURN_UNEXPECTED_IF_NULL(modified);
+  CHECK_FAIL_RETURN_UNEXPECTED(node->Count() != 0, "Invalid data, the number of node can't be 0.");
   node->SetTotalRepeats(num_repeats_);
   node->SetNumEpochs(num_epochs_);
   // We finish the walk of this EpochCtrl's descendent nodes.
@@ -119,6 +133,8 @@ Status RepeatPass::VisitAfter(std::shared_ptr<EpochCtrlNode> node, bool *const m
 // All operators have a flag that might be set related to the repeat and any leaf nodes need to be set up
 // for use with a controlling repeat above it.
 Status RepeatPass::VisitAfter(std::shared_ptr<DatasetNode> node, bool *const modified) {
+  RETURN_UNEXPECTED_IF_NULL(node);
+  RETURN_UNEXPECTED_IF_NULL(modified);
   // If we are under a cache op, then save ourselves to the cached op stack.
   if (is_cached_) {
     AddToCachedNodeStack(node);
@@ -132,6 +148,8 @@ Status RepeatPass::VisitAfter(std::shared_ptr<DatasetNode> node, bool *const mod
 #ifndef ENABLE_ANDROID
 // CacheOp removes previous leaf ops and replaces them with itself
 Status RepeatPass::VisitAfter(std::shared_ptr<CacheNode> node, bool *const modified) {
+  RETURN_UNEXPECTED_IF_NULL(node);
+  RETURN_UNEXPECTED_IF_NULL(modified);
   is_cached_ = false;
 
   // if we are a cache within a repeat path of the tree, then adjust the total repeats and total epochs for cached ops.
@@ -153,6 +171,8 @@ Status RepeatPass::VisitAfter(std::shared_ptr<CacheNode> node, bool *const modif
 
 // Turns off the tracking for operations under merge op
 Status RepeatPass::VisitAfter(std::shared_ptr<CacheMergeNode> node, bool *const modified) {
+  RETURN_UNEXPECTED_IF_NULL(node);
+  RETURN_UNEXPECTED_IF_NULL(modified);
   // If there was not any repeat in the merge cache miss leg, then the cache_lookup
   // would not have been consumed yet.  In that case, we need to set its total repeats for it.
   if (cache_lookup_) {
@@ -168,6 +188,8 @@ Status RepeatPass::VisitAfter(std::shared_ptr<CacheMergeNode> node, bool *const
 
 // Saves the lookup up in case it needs to be referenced by a repeat
 Status RepeatPass::VisitAfter(std::shared_ptr<CacheLookupNode> node, bool *const modified) {
+  RETURN_UNEXPECTED_IF_NULL(node);
+  RETURN_UNEXPECTED_IF_NULL(modified);
   if (!node->IsLeaf()) {
     // By definition, the CacheLookup must be a leaf op.  Make that clear here.
     RETURN_STATUS_UNEXPECTED("CacheLookupOp must be a leaf node!");
@@ -185,6 +207,8 @@ Status RepeatPass::VisitAfter(std::shared_ptr<CacheLookupNode> node, bool *const
 #endif
 
 Status RepeatPass::VisitAfter(std::shared_ptr<TransferNode> node, bool *const modified) {
+  RETURN_UNEXPECTED_IF_NULL(node);
+  RETURN_UNEXPECTED_IF_NULL(modified);
   // Set total repeats and total epochs for the TransferNode
   node->SetTotalRepeats(num_epochs_);
   node->SetNumEpochs(num_epochs_);
@@ -192,7 +216,12 @@ Status RepeatPass::VisitAfter(std::shared_ptr<TransferNode> node, bool *const mo
 }
 
 // Adds an operator to the cached operator stack save area
-void RepeatPass::AddToCachedNodeStack(const std::shared_ptr<DatasetNode> &node) { cached_node_stacks_.push(node); }
+void RepeatPass::AddToCachedNodeStack(const std::shared_ptr<DatasetNode> &node) {
+  if (node == nullptr) {
+    return;
+  }
+  cached_node_stacks_.push(node);
+}
 
 // Pops an operator from the cached operator stack save area
 std::shared_ptr<DatasetNode> RepeatPass::PopFromCachedNodeStack() {
diff --git a/mindspore/ccsrc/minddata/dataset/engine/opt/pre/epoch_ctrl_pass.cc b/mindspore/ccsrc/minddata/dataset/engine/opt/pre/epoch_ctrl_pass.cc
index 302d84e6a79..082557c2ae2 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/opt/pre/epoch_ctrl_pass.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/opt/pre/epoch_ctrl_pass.cc
@@ -29,6 +29,10 @@ EpochCtrlPass::InjectionFinder::InjectionFinder(std::shared_ptr<DatasetNode> nod
 
 // Performs finder work for BuildVocabOp that has special rules about epoch control injection
 Status EpochCtrlPass::InjectionFinder::Visit(std::shared_ptr<RootNode> node, bool *const modified) {
+  RETURN_UNEXPECTED_IF_NULL(node);
+  RETURN_UNEXPECTED_IF_NULL(modified);
+  CHECK_FAIL_RETURN_UNEXPECTED(node->Children().size() > 0,
+                               "Invalid data, the node of child should greater than zero.");
   // The injection is at the child of the root node
   injection_point_ = node->Children()[0];
   num_epochs_ = node->num_epochs();
@@ -37,6 +41,8 @@ Status EpochCtrlPass::InjectionFinder::Visit(std::shared_ptr<RootNode> node, boo
 
 // Performs finder work for BuildVocabOp that has special rules about epoch control injection
 Status EpochCtrlPass::InjectionFinder::Visit(std::shared_ptr<BuildVocabNode> node, bool *const modified) {
+  RETURN_UNEXPECTED_IF_NULL(node);
+  RETURN_UNEXPECTED_IF_NULL(modified);
   injection_point_ = nullptr;
   return Status::OK();
 }
@@ -44,12 +50,18 @@ Status EpochCtrlPass::InjectionFinder::Visit(std::shared_ptr<BuildVocabNode> nod
 #ifndef ENABLE_ANDROID
 // Performs finder work for BuildSentencePieceVocabNode that has special rules about epoch control injection
 Status EpochCtrlPass::InjectionFinder::Visit(std::shared_ptr<BuildSentenceVocabNode> node, bool *const modified) {
+  RETURN_UNEXPECTED_IF_NULL(node);
+  RETURN_UNEXPECTED_IF_NULL(modified);
   injection_point_ = nullptr;
   return Status::OK();
 }
 #endif
 
 Status EpochCtrlPass::InjectionFinder::VisitAfter(std::shared_ptr<TransferNode> node, bool *const modified) {
+  RETURN_UNEXPECTED_IF_NULL(node);
+  RETURN_UNEXPECTED_IF_NULL(modified);
+  CHECK_FAIL_RETURN_UNEXPECTED(node->Children().size() > 0,
+                               "Invalid data, the node of child should greater than zero.");
   // Assumption: There is only one TransferNode in a pipeline. This assumption is not validated here.
   // Move the injection point to the child of this node.
   injection_point_ = node->Children()[0];
@@ -61,6 +73,8 @@ EpochCtrlPass::EpochCtrlPass() {}
 
 // Runs an injection pass to inject in operators needed at the pre pass stage
 Status EpochCtrlPass::RunOnTree(std::shared_ptr<DatasetNode> root_ir, bool *const modified) {
+  RETURN_UNEXPECTED_IF_NULL(root_ir);
+  RETURN_UNEXPECTED_IF_NULL(modified);
   MS_LOG(INFO) << "Pre pass: Injection pass started.";
 
   // First, run the finder to perform any injection info before we can go ahead to drive the op injection work.
diff --git a/mindspore/ccsrc/minddata/dataset/engine/perf/connector_size.cc b/mindspore/ccsrc/minddata/dataset/engine/perf/connector_size.cc
index 14baf948932..b5108f8d804 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/perf/connector_size.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/perf/connector_size.cc
@@ -53,8 +53,8 @@ json ConnectorSize::ParseOpInfo(const DatasetOp &node, const std::vector<int32_t
 
   auto children = node.Children();
   std::vector<int32_t> children_id;
-  std::transform(children.begin(), children.end(), std::back_inserter(children_id),
-                 [](std::shared_ptr<DatasetOp> op) -> int32_t { return op->id(); });
+  (void)std::transform(children.begin(), children.end(), std::back_inserter(children_id),
+                       [](const std::shared_ptr<DatasetOp> &op) -> int32_t { return op->id(); });
   if (!children_id.empty()) {
     json_node["children"] = children_id;
   }
diff --git a/mindspore/ccsrc/minddata/dataset/engine/perf/connector_throughput.cc b/mindspore/ccsrc/minddata/dataset/engine/perf/connector_throughput.cc
index acd80290486..e685b660b78 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/perf/connector_throughput.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/perf/connector_throughput.cc
@@ -29,6 +29,9 @@ namespace dataset {
 
 // temporary helper
 int ConnectorThroughput::InitNodes() {
+  if (tree_ == nullptr) {
+    return 0;
+  }
   auto it = (*tree_).begin();
   return it.NumNodes();
 }
@@ -43,15 +46,16 @@ Status ConnectorThroughput::Sample() {
     out_row_count_row[col] = cur_out_rows_count;
     auto sz = timestamps_.size();
     cur_time = std::chrono::steady_clock::now();
-    double dt = 0;
+    double data_time = 0;
     if (sz > 1) {
-      auto _dt = std::chrono::duration_cast<std::chrono::microseconds>(timestamps_[0][sz - 1] - timestamps_[0][sz - 2]);
-      dt = std::chrono::duration<double>(_dt).count();
+      auto full_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(timestamps_[0][sz - 1] - timestamps_[0][sz - 2]);
+      data_time = std::chrono::duration<double>(full_time).count();
     }
     auto prev_out_rows_count = out_row_count_table_[col][out_row_count_table_.size() - 1];
-    if (dt != 0) {
+    if (data_time != 0) {
       const int32_t multiplier = 1000;
-      auto thr = (cur_out_rows_count - prev_out_rows_count) / (multiplier * dt);
+      auto thr = (cur_out_rows_count - prev_out_rows_count) / (multiplier * data_time);
       throughput_row[col] = thr;
     } else {
       throughput_row[col] = 0;
@@ -70,7 +74,7 @@ json ConnectorThroughput::ParseOpInfo(const DatasetOp &node, const std::vector<d
   auto children = node.Children();
   std::vector<int32_t> children_id;
   std::transform(children.begin(), children.end(), std::back_inserter(children_id),
-                 [](std::shared_ptr<DatasetOp> op) -> int32_t { return op->id(); });
+                 [](const std::shared_ptr<DatasetOp> &op) -> int32_t { return op ? op->id() : 0; });
   json json_node;
   json_node["op_id"] = node.id();
   json_node["op_type"] = node.Name();
@@ -100,8 +104,10 @@ Status ConnectorThroughput::SaveToFile() {
   int col = 0;
   for (auto &node : *tree_) {
     std::vector<double> throughput;
-    for (auto i = 0; i < throughput_.size(); i++) {
-      throughput.push_back(throughput_[col][i]);
+    if (throughput_.size() > col) {
+      for (auto i = 0; i < throughput_[col].size(); i++) {
+        throughput.push_back(throughput_[col][i]);
+      }
     }
 
     if (!path.Exists()) {
diff --git a/mindspore/ccsrc/minddata/dataset/engine/perf/cpu_sampling.cc b/mindspore/ccsrc/minddata/dataset/engine/perf/cpu_sampling.cc
index 5e5c14d11a1..066450848f3 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/perf/cpu_sampling.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/perf/cpu_sampling.cc
@@ -18,9 +18,9 @@
 #if !defined(_WIN32) && !defined(_WIN64) && !defined(__ANDROID__) && !defined(ANDROID) && !defined(__APPLE__)
 #include <sys/syscall.h>
 #endif
-#include <algorithm>
 #include <cmath>
 #include <cstdio>
+#include <algorithm>
 #include <fstream>
 #include <memory>
 #include <string>
@@ -33,8 +33,8 @@
 using json = nlohmann::json;
 namespace mindspore {
 namespace dataset {
-bool BaseCpu::fetched_all_process_shared = false;
-std::unordered_map<int32_t, std::vector<pid_t>> BaseCpu::op_process_shared = {};
+bool BaseCpu::fetched_all_process_shared_ = false;
+std::unordered_map<int32_t, std::vector<pid_t>> BaseCpu::op_process_shared_ = {};
 
 #if !defined(_WIN32) && !defined(_WIN64) && !defined(__ANDROID__) && !defined(ANDROID) && !defined(__APPLE__)
 #define USING_LINUX
@@ -46,8 +46,8 @@ BaseCpu::BaseCpu() {
   pre_cpu_stat_.io_stat_ = 0;
   pre_cpu_stat_.idle_stat_ = 0;
   pre_cpu_stat_.total_stat_ = 0;
-  fetched_all_process = false;
-  pre_fetched_state = false;
+  fetched_all_process_ = false;
+  pre_fetched_state_ = false;
   cpu_processor_num_ = 0;
 }
 
@@ -157,6 +157,7 @@ Status DeviceCpu::Collect(const ExecutionTree *tree) {
   return Status::OK();
 }
 Status DeviceCpu::Analyze(std::string *name, double *utilization, std::string *extra_message) {
+  RETURN_UNEXPECTED_IF_NULL(name);
   name->clear();
   name->append("device_info");
   int total_samples = cpu_util_.size();
@@ -221,6 +222,7 @@ Status DeviceCpu::SaveToFile(const std::string &file_path) {
 
 Status OperatorCpu::ParseCpuInfo(int32_t op_id, int64_t thread_id,
                                  std::unordered_map<int32_t, std::unordered_map<int64_t, CpuOpStat>> *op_stat) {
+  RETURN_UNEXPECTED_IF_NULL(op_stat);
   pid_t pid = 0;
 #if defined(USING_LINUX)
   pid = syscall(SYS_getpid);
@@ -257,11 +259,12 @@ Status OperatorCpu::ParseCpuInfo(int32_t op_id, int64_t thread_id,
 }
 
 Status OperatorCpu::Collect(const ExecutionTree *tree) {
+  RETURN_UNEXPECTED_IF_NULL(tree);
   if (first_collect_) {
     for (auto iter = tree->begin(); iter != tree->end(); ++iter) {
       id_count_++;
-      op_name[iter->id()] = iter->NameWithID();
-      op_parallel_workers[iter->id()] = iter->num_workers();
+      op_name_[iter->id()] = iter->NameWithID();
+      op_parallel_workers_[iter->id()] = iter->num_workers();
     }
 #if defined(USING_LINUX)
     cpu_processor_num_ = get_nprocs_conf();
@@ -269,34 +272,34 @@ Status OperatorCpu::Collect(const ExecutionTree *tree) {
   }
 
   // Obtain the op and thread mapping
-  op_thread.clear();
+  op_thread_.clear();
   List<Task> allTasks = tree->AllTasks()->GetTask();
   for (auto &task1 : allTasks) {
     int32_t op_id = task1.get_operator_id();
-    op_thread[op_id].emplace_back(task1.get_linux_id());
+    op_thread_[op_id].emplace_back(task1.get_linux_id());
   }
 
   // add process id into op_thread
-  if (!fetched_all_process) {
+  if (!fetched_all_process_) {
     {
       py::gil_scoped_acquire gil_acquire;
       py::module ds = py::module::import("mindspore.dataset.engine.datasets");
       py::tuple process_info = ds.attr("_get_operator_process")();
       py::dict sub_process = py::reinterpret_borrow<py::dict>(process_info[0]);
-      fetched_all_process = py::reinterpret_borrow<py::bool_>(process_info[1]);
+      fetched_all_process_ = py::reinterpret_borrow<py::bool_>(process_info[1]);
       // parse dict value
-      op_process = toIntMap(sub_process);
-      BaseCpu::op_process_shared = op_process;
-      BaseCpu::fetched_all_process_shared = fetched_all_process;
+      op_process_ = toIntMap(sub_process);
+      BaseCpu::op_process_shared_ = op_process_;
+      BaseCpu::fetched_all_process_shared_ = fetched_all_process_;
     }
 
     // judge whether there is device_que operator, if so operator id may need increase by one, temp use directly
-    for (auto item : op_process) {
+    for (auto item : op_process_) {
       if (!item.second.empty()) {
-        if (op_thread.find(item.first) != op_thread.end()) {
-          op_thread[item.first].insert(op_thread[item.first].end(), item.second.begin(), item.second.end());
+        if (op_thread_.find(item.first) != op_thread_.end()) {
+          op_thread_[item.first].insert(op_thread_[item.first].end(), item.second.begin(), item.second.end());
         } else {
-          op_thread[item.first] = item.second;
+          op_thread_[item.first] = item.second;
         }
       }
     }
@@ -310,16 +313,15 @@ Status OperatorCpu::Collect(const ExecutionTree *tree) {
   if (!first_collect_) {
     // obtain all the op id in current tasks
     std::vector<int32_t> total_op_id;
-    for (auto iter = op_thread.begin(); iter != op_thread.end(); iter++) {
-      total_op_id.emplace_back(iter->first);
-    }
+    (void)std::transform(op_thread_.begin(), op_thread_.end(), std::back_inserter(total_op_id),
+                         [](const auto &iter) { return iter.first; });
 
     // iter all the op, and obtain the CPU utilization of each operator
     for (auto op_id = -1; op_id < id_count_; op_id++) {
       float user_util = 0, sys_util = 0;
       auto iter = std::find(total_op_id.begin(), total_op_id.end(), op_id);
       if (iter != total_op_id.end()) {
-        for (auto thread_id : op_thread[op_id]) {
+        for (auto thread_id : op_thread_[op_id]) {
           if (ParseCpuInfo(op_id, thread_id, &op_stat_) == Status::OK()) {
             user_util += (op_stat_[op_id][thread_id].user_stat_ - pre_op_stat_[op_id][thread_id].user_stat_) * 1.0 /
                          (total_stat_ - pre_total_stat_) * 100;
@@ -329,7 +331,7 @@ Status OperatorCpu::Collect(const ExecutionTree *tree) {
         }
       }
       CpuOpUtil info;
-      info.op_id = op_id;
+      info.op_id_ = op_id;
       info.sys_utilization_ = sys_util;
       info.user_utilization_ = user_util;
       cpu_step_util_.emplace_back(info);
@@ -337,10 +339,10 @@ Status OperatorCpu::Collect(const ExecutionTree *tree) {
     cpu_op_util_.emplace_back(cpu_step_util_);
   } else {
     // mainly obtain the init CPU execute time in first collect
-    for (auto iter = op_thread.begin(); iter != op_thread.end(); iter++) {
-      int32_t op_id = iter->first;
-      for (auto thread_id_ : iter->second) {
-        // ignore errors in the first collect
+    for (const auto &iter : op_thread_) {
+      int32_t op_id = iter.first;
+      for (auto thread_id_ : iter.second) {
+        // ParseCpuInfo may execute failed for cpu data not ready, but we still get next thread cpu info
         (void)ParseCpuInfo(op_id, thread_id_, &op_stat_);
       }
     }
@@ -355,6 +357,8 @@ Status OperatorCpu::Collect(const ExecutionTree *tree) {
 }
 
 Status OperatorCpu::Analyze(std::string *name, double *utilization, std::string *extra_message) {
+  RETURN_UNEXPECTED_IF_NULL(name);
+  RETURN_UNEXPECTED_IF_NULL(extra_message);
   int total_samples = cpu_op_util_.size();
 
   // Only analyze the middle half of the samples
@@ -374,15 +378,15 @@ Status OperatorCpu::Analyze(std::string *name, double *utilization, std::string
       sum += cpu_op_util_[i][index].sys_utilization_;
     }
     if ((end_analyze - start_analyze) > 0) {
-      op_util = 1.0 * sum * cpu_processor_num_ / (op_parallel_workers[op_id] * (end_analyze - start_analyze));
+      op_util = 1.0 * sum * cpu_processor_num_ / (op_parallel_workers_[op_id] * (end_analyze - start_analyze));
     }
     if (op_util > *utilization) {
       *utilization = op_util;
       name->clear();
-      name->append(op_name[op_id]);
+      (void)name->append(op_name_[op_id]);
     }
-    extra_message->append(op_name[op_id] + " utiliization per thread: " + std::to_string(op_util) + "% (" +
-                          std::to_string(op_parallel_workers[op_id]) + " parallel_workers);  ");
+    (void)extra_message->append(op_name_[op_id] + " utilization per thread: " + std::to_string(op_util) + "% (" +
+                                std::to_string(op_parallel_workers_[op_id]) + " parallel_workers); ");
   }
   return Status::OK();
 }
@@ -428,24 +432,24 @@ Status ProcessCpu::ParseCpuInfo() {
   uint64_t total_stat_;
   RETURN_IF_NOT_OK(GetTotalCpuTime(&total_stat_));
 
-  if (!pre_fetched_state) {
-    process_id.clear();
+  if (!pre_fetched_state_) {
+    process_id_.clear();
     pid_t main_pid = 0;
 #if defined(USING_LINUX)
     main_pid = syscall(SYS_getpid);
 #endif
-    process_id.emplace_back(main_pid);
-    op_process = BaseCpu::op_process_shared;
-    fetched_all_process = BaseCpu::fetched_all_process_shared;
-    for (auto item : op_process) {
-      for (auto id : item.second) {
-        process_id.emplace_back(id);
+    process_id_.emplace_back(main_pid);
+    op_process_ = BaseCpu::op_process_shared_;
+    fetched_all_process_ = BaseCpu::fetched_all_process_shared_;
+    for (const auto &item : op_process_) {
+      for (const auto &id : item.second) {
+        process_id_.emplace_back(id);
       }
     }
   }
 
   float user_util = 0, sys_util = 0;
-  for (auto pid : process_id) {
+  for (const auto &pid : process_id_) {
     std::string stat_path = "/proc/" + std::to_string(pid) + "/stat";
 
     std::ifstream file(stat_path);
@@ -479,11 +483,12 @@ Status ProcessCpu::ParseCpuInfo() {
   }
   pre_total_stat_ = total_stat_;
   first_collect_ = false;
-  pre_fetched_state = fetched_all_process;
+  pre_fetched_state_ = fetched_all_process_;
   return Status::OK();
 }
 
 Status ProcessCpu::Collect(const ExecutionTree *tree) {
+  RETURN_UNEXPECTED_IF_NULL(tree);
   if (first_collect_) {
 #if defined(USING_LINUX)
     cpu_processor_num_ = get_nprocs_conf();
@@ -495,6 +500,9 @@ Status ProcessCpu::Collect(const ExecutionTree *tree) {
 }
 
 Status ProcessCpu::Analyze(std::string *name, double *utilization, std::string *extra_message) {
+  RETURN_UNEXPECTED_IF_NULL(name);
+  RETURN_UNEXPECTED_IF_NULL(utilization);
+  RETURN_UNEXPECTED_IF_NULL(extra_message);
   name->clear();
   name->append("process_info");
   int total_samples = process_util_.size();
diff --git a/mindspore/ccsrc/minddata/dataset/engine/perf/cpu_sampling.h b/mindspore/ccsrc/minddata/dataset/engine/perf/cpu_sampling.h
index 5d12e1a3b87..59ba22e020a 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/perf/cpu_sampling.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/perf/cpu_sampling.h
@@ -49,7 +49,7 @@ typedef struct CpuInfo_s {
 typedef struct CpuOpInfo_s {
   float user_utilization_;
   float sys_utilization_;
-  int32_t op_id;
+  int32_t op_id_;
 } CpuOpUtil;
 
 // CPU utilization of process
@@ -78,11 +78,11 @@ class BaseCpu {
  protected:
   std::vector<CpuUtil> cpu_util_;
   CpuStat pre_cpu_stat_;
-  static bool fetched_all_process_shared;
-  static std::unordered_map<int32_t, std::vector<pid_t>> op_process_shared;
-  bool fetched_all_process;
-  bool pre_fetched_state;
-  std::unordered_map<int32_t, std::vector<pid_t>> op_process;
+  static bool fetched_all_process_shared_;
+  static std::unordered_map<int32_t, std::vector<pid_t>> op_process_shared_;
+  bool fetched_all_process_;
+  bool pre_fetched_state_;
+  std::unordered_map<int32_t, std::vector<pid_t>> op_process_;
   int32_t cpu_processor_num_;
 };
 
@@ -136,9 +136,9 @@ class OperatorCpu : public BaseCpu {
   bool first_collect_;
 
   // Store the id and its corresponding threads.
-  std::unordered_map<int32_t, std::vector<pid_t>> op_thread;
-  std::unordered_map<int32_t, std::string> op_name;
-  std::unordered_map<int32_t, int32_t> op_parallel_workers;
+  std::unordered_map<int32_t, std::vector<pid_t>> op_thread_;
+  std::unordered_map<int32_t, std::string> op_name_;
+  std::unordered_map<int32_t, int32_t> op_parallel_workers_;
   std::unordered_map<int32_t, std::unordered_map<int64_t, CpuOpStat>> pre_op_stat_;
   uint64_t pre_total_stat_;
   int32_t id_count_;
@@ -161,7 +161,7 @@ class ProcessCpu : public BaseCpu {
   std::vector<CpuProcessUtil> process_util_;
   uint64_t pre_total_stat_;
   std::unordered_map<int64_t, CpuOpStat> pre_process_stat_;
-  std::vector<pid_t> process_id;
+  std::vector<pid_t> process_id_;
 };
 
 // Sampling CPU information
diff --git a/mindspore/ccsrc/minddata/dataset/engine/perf/perf_data.h b/mindspore/ccsrc/minddata/dataset/engine/perf/perf_data.h
index 538b84f3468..2a251057236 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/perf/perf_data.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/perf/perf_data.h
@@ -52,7 +52,9 @@ class PerfData {
   void AddSample(const T &row) {
     auto i = 0;
     for (const auto &e : row) {
-      data_[i++].push_back(e);
+      if (data_.size() > i) {
+        data_[i++].push_back(e);
+      }
     }
     counter_++;
   }
@@ -62,7 +64,9 @@ class PerfData {
   auto Row(dsize_t idx) {
     std::vector<V> row(n_cols_);
     for (auto i = 0; i < n_cols_; i++) {
-      row[i] = data_[i][idx];
+      if (data_.size() > i && data_[i].size() > idx) {
+        row[i] = data_[i][idx];
+      }
     }
     return row;
   }
diff --git a/mindspore/ccsrc/minddata/dataset/engine/perf/profiling.cc b/mindspore/ccsrc/minddata/dataset/engine/perf/profiling.cc
index 6d6b3645d2a..3be230ea4af 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/perf/profiling.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/perf/profiling.cc
@@ -51,6 +51,7 @@ Status Tracing::SaveToFile() {
 }
 
 Status Sampling::ReadJson(nlohmann::json *output) {
+  RETURN_UNEXPECTED_IF_NULL(output);
   Path path = Path(file_path_);
   if (path.Exists()) {
     MS_LOG(DEBUG) << file_path_ << " exists";
diff --git a/mindspore/ccsrc/minddata/dataset/engine/serdes.cc b/mindspore/ccsrc/minddata/dataset/engine/serdes.cc
index 5d53483d75c..243a4860050 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/serdes.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/serdes.cc
@@ -25,6 +25,8 @@ std::map<std::string, Status (*)(nlohmann::json json_obj, std::shared_ptr<Tensor
   Serdes::func_ptr_ = Serdes::InitializeFuncPtr();
 
 Status Serdes::SaveToJSON(std::shared_ptr<DatasetNode> node, const std::string &filename, nlohmann::json *out_json) {
+  RETURN_UNEXPECTED_IF_NULL(node);
+  RETURN_UNEXPECTED_IF_NULL(out_json);
   // Dump attributes of current node to json string
   nlohmann::json args;
   RETURN_IF_NOT_OK(node->to_json(&args));
@@ -124,584 +126,97 @@ Status Serdes::CreateNode(std::shared_ptr<DatasetNode> child_ds, nlohmann::json
   return Status::OK();
 }
 
-Status Serdes::CreateCelebADatasetNode(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("dataset_dir") != json_obj.end(), "Failed to find dataset_dir");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("usage") != json_obj.end(), "Failed to find usage");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("sampler") != json_obj.end(), "Failed to find sampler");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("decode") != json_obj.end(), "Failed to find decode");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("extensions") != json_obj.end(), "Failed to find extension");
-  std::string dataset_dir = json_obj["dataset_dir"];
-  std::string usage = json_obj["usage"];
-  std::shared_ptr<SamplerObj> sampler;
-  RETURN_IF_NOT_OK(ConstructSampler(json_obj["sampler"], &sampler));
-  bool decode = json_obj["decode"];
-  std::set<std::string> extension = json_obj["extensions"];
-  // default value for cache - to_json function does not have the output
-  std::shared_ptr<DatasetCache> cache = nullptr;
-  *ds = std::make_shared<CelebANode>(dataset_dir, usage, sampler, decode, extension, cache);
-  return Status::OK();
-}
-
-Status Serdes::CreateCifar10DatasetNode(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("dataset_dir") != json_obj.end(), "Failed to find dataset_dir");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("usage") != json_obj.end(), "Failed to find usage");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("sampler") != json_obj.end(), "Failed to find sampler");
-  std::string dataset_dir = json_obj["dataset_dir"];
-  std::string usage = json_obj["usage"];
-  std::shared_ptr<SamplerObj> sampler;
-  RETURN_IF_NOT_OK(ConstructSampler(json_obj["sampler"], &sampler));
-  // default value for cache - to_json function does not have the output
-  std::shared_ptr<DatasetCache> cache = nullptr;
-  *ds = std::make_shared<Cifar10Node>(dataset_dir, usage, sampler, cache);
-  return Status::OK();
-}
-
-Status Serdes::CreateCifar100DatasetNode(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("dataset_dir") != json_obj.end(), "Failed to find dataset_dir");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("usage") != json_obj.end(), "Failed to find usage");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("sampler") != json_obj.end(), "Failed to find sampler");
-  std::string dataset_dir = json_obj["dataset_dir"];
-  std::string usage = json_obj["usage"];
-  std::shared_ptr<SamplerObj> sampler;
-  RETURN_IF_NOT_OK(ConstructSampler(json_obj["sampler"], &sampler));
-  // default value for cache - to_json function does not have the output
-  std::shared_ptr<DatasetCache> cache = nullptr;
-  *ds = std::make_shared<Cifar100Node>(dataset_dir, usage, sampler, cache);
-  return Status::OK();
-}
-
-Status Serdes::CreateCLUEDatasetNode(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("dataset_dir") != json_obj.end(), "Failed to find dataset_dir");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("task") != json_obj.end(), "Failed to find task");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("usage") != json_obj.end(), "Failed to find usage");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_samples") != json_obj.end(), "Failed to find num_samples");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("shuffle") != json_obj.end(), "Failed to find shuffle");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_shards") != json_obj.end(), "Failed to find num_shards");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("shard_id") != json_obj.end(), "Failed to find shard_id");
-  std::vector<std::string> dataset_files = json_obj["dataset_dir"];
-  std::string task = json_obj["task"];
-  std::string usage = json_obj["usage"];
-  int64_t num_samples = json_obj["num_samples"];
-  ShuffleMode shuffle = static_cast<ShuffleMode>(json_obj["shuffle"]);
-  int32_t num_shards = json_obj["num_shards"];
-  int32_t shard_id = json_obj["shard_id"];
-  // default value for cache - to_json function does not have the output
-  std::shared_ptr<DatasetCache> cache = nullptr;
-  *ds = std::make_shared<CLUENode>(dataset_files, task, usage, num_samples, shuffle, num_shards, shard_id, cache);
-  return Status::OK();
-}
-
-Status Serdes::CreateCocoDatasetNode(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("dataset_dir") != json_obj.end(), "Failed to find dataset_dir");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("annotation_file") != json_obj.end(), "Failed to find annotation_file");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("task") != json_obj.end(), "Failed to find task");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("decode") != json_obj.end(), "Failed to find decode");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("sampler") != json_obj.end(), "Failed to find sampler");
-  std::string dataset_dir = json_obj["dataset_dir"];
-  std::string annotation_file = json_obj["annotation_file"];
-  std::string task = json_obj["task"];
-  bool decode = json_obj["decode"];
-  std::shared_ptr<SamplerObj> sampler;
-  RETURN_IF_NOT_OK(ConstructSampler(json_obj["sampler"], &sampler));
-  // default value for cache and extra_metadata - to_json function does not have the output
-  std::shared_ptr<DatasetCache> cache = nullptr;
-  bool extra_metadata = false;
-  *ds = std::make_shared<CocoNode>(dataset_dir, annotation_file, task, decode, sampler, cache, extra_metadata);
-  return Status::OK();
-}
-
-Status Serdes::CreateCSVDatasetNode(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("dataset_files") != json_obj.end(), "Failed to find dataset_files");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("field_delim") != json_obj.end(), "Failed to find field_delim");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("column_names") != json_obj.end(), "Failed to find column_names");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_samples") != json_obj.end(), "Failed to find num_samples");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("shuffle") != json_obj.end(), "Failed to find shuffle");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_shards") != json_obj.end(), "Failed to find num_shards");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("shard_id") != json_obj.end(), "Failed to find shard_id");
-  std::vector<std::string> dataset_files = json_obj["dataset_files"];
-  std::string field_delim = json_obj["field_delim"];
-  std::vector<std::shared_ptr<CsvBase>> column_defaults = {};
-  std::vector<std::string> column_names = json_obj["column_names"];
-  int64_t num_samples = json_obj["num_samples"];
-  ShuffleMode shuffle = static_cast<ShuffleMode>(json_obj["shuffle"]);
-  int32_t num_shards = json_obj["num_shards"];
-  int32_t shard_id = json_obj["shard_id"];
-  // default value for cache - to_json function does not have the output
-  std::shared_ptr<DatasetCache> cache = nullptr;
-  *ds = std::make_shared<CSVNode>(dataset_files, field_delim.c_str()[0], column_defaults, column_names, num_samples,
-                                  shuffle, num_shards, shard_id, cache);
-  return Status::OK();
-}
-
-Status Serdes::CreateImageFolderDatasetNode(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("dataset_dir") != json_obj.end(), "Failed to find dataset_dir");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("decode") != json_obj.end(), "Failed to find decode");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("sampler") != json_obj.end(), "Failed to find sampler");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("extensions") != json_obj.end(), "Failed to find extension");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("class_indexing") != json_obj.end(), "Failed to find class_indexing");
-  std::string dataset_dir = json_obj["dataset_dir"];
-  bool decode = json_obj["decode"];
-  std::shared_ptr<SamplerObj> sampler;
-  RETURN_IF_NOT_OK(ConstructSampler(json_obj["sampler"], &sampler));
-  // This arg exists in ImageFolderOp, but not externalized (in Python API). The default value is false.
-  bool recursive = false;
-  std::set<std::string> extension = json_obj["extensions"];
-  std::map<std::string, int32_t> class_indexing;
-  nlohmann::json class_map = json_obj["class_indexing"];
-  for (const auto &class_map_child : class_map) {
-    std::string class_ = class_map_child[0];
-    int32_t indexing = class_map_child[1];
-    class_indexing.insert({class_, indexing});
-  }
-  // default value for cache - to_json function does not have the output
-  std::shared_ptr<DatasetCache> cache = nullptr;
-  *ds = std::make_shared<ImageFolderNode>(dataset_dir, decode, sampler, recursive, extension, class_indexing, cache);
-  return Status::OK();
-}
-
-Status Serdes::CreateManifestDatasetNode(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("dataset_file") != json_obj.end(), "Failed to find dataset_file");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("usage") != json_obj.end(), "Failed to find usage");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("sampler") != json_obj.end(), "Failed to find sampler");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("class_indexing") != json_obj.end(), "Failed to find class_indexing");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("decode") != json_obj.end(), "Failed to find decode");
-  std::string dataset_file = json_obj["dataset_file"];
-  std::string usage = json_obj["usage"];
-  std::shared_ptr<SamplerObj> sampler;
-  RETURN_IF_NOT_OK(ConstructSampler(json_obj["sampler"], &sampler));
-  std::map<std::string, int32_t> class_indexing;
-  nlohmann::json class_map = json_obj["class_indexing"];
-  for (const auto &class_map_child : class_map) {
-    std::string class_ = class_map_child[0];
-    int32_t indexing = class_map_child[1];
-    class_indexing.insert({class_, indexing});
-  }
-  bool decode = json_obj["decode"];
-  // default value for cache - to_json function does not have the output
-  std::shared_ptr<DatasetCache> cache = nullptr;
-  *ds = std::make_shared<ManifestNode>(dataset_file, usage, sampler, class_indexing, decode, cache);
-  return Status::OK();
-}
-
-Status Serdes::CreateMnistDatasetNode(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("dataset_dir") != json_obj.end(), "Failed to find dataset_dir");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("usage") != json_obj.end(), "Failed to find usage");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("sampler") != json_obj.end(), "Failed to find sampler");
-  std::string dataset_dir = json_obj["dataset_dir"];
-  std::string usage = json_obj["usage"];
-  std::shared_ptr<SamplerObj> sampler;
-  RETURN_IF_NOT_OK(ConstructSampler(json_obj["sampler"], &sampler));
-  // default value for cache - to_json function does not have the output
-  std::shared_ptr<DatasetCache> cache = nullptr;
-  *ds = std::make_shared<MnistNode>(dataset_dir, usage, sampler, cache);
-  return Status::OK();
-}
-
-Status Serdes::CreateTextFileDatasetNode(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("dataset_files") != json_obj.end(), "Failed to find dataset_files");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_samples") != json_obj.end(), "Failed to find num_samples");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("shuffle") != json_obj.end(), "Failed to find shuffle");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_shards") != json_obj.end(), "Failed to find num_shards");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("shard_id") != json_obj.end(), "Failed to find shard_id");
-  std::vector<std::string> dataset_files = json_obj["dataset_files"];
-  int64_t num_samples = json_obj["num_samples"];
-  ShuffleMode shuffle = static_cast<ShuffleMode>(json_obj["shuffle"]);
-  int32_t num_shards = json_obj["num_shards"];
-  int32_t shard_id = json_obj["shard_id"];
-  // default value for cache - to_json function does not have the output
-  std::shared_ptr<DatasetCache> cache = nullptr;
-  *ds = std::make_shared<TextFileNode>(dataset_files, num_samples, shuffle, num_shards, shard_id, cache);
-  return Status::OK();
-}
-
-Status Serdes::CreateTFRecordDatasetNode(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("dataset_files") != json_obj.end(), "Failed to find dataset_files");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("schema") != json_obj.end(), "Failed to find schema");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("columns_list") != json_obj.end(), "Failed to find columns_list");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_samples") != json_obj.end(), "Failed to find num_samples");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("shuffle") != json_obj.end(), "Failed to find shuffle");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_shards") != json_obj.end(), "Failed to find num_shards");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("shard_id") != json_obj.end(), "Failed to find shard_id");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("shard_equal_rows") != json_obj.end(), "Failed to find shard_equal_rows");
-  std::vector<std::string> dataset_files = json_obj["dataset_files"];
-  std::string schema = json_obj["schema"];
-  std::vector<std::string> columns_list = json_obj["columns_list"];
-  int64_t num_samples = json_obj["num_samples"];
-  ShuffleMode shuffle = static_cast<ShuffleMode>(json_obj["shuffle"]);
-  int32_t num_shards = json_obj["num_shards"];
-  int32_t shard_id = json_obj["shard_id"];
-  bool shard_equal_rows = json_obj["shard_equal_rows"];
-  // default value for cache - to_json function does not have the output
-  std::shared_ptr<DatasetCache> cache = nullptr;
-  *ds = std::make_shared<TFRecordNode>(dataset_files, schema, columns_list, num_samples, shuffle, num_shards, shard_id,
-                                       shard_equal_rows, cache);
-  return Status::OK();
-}
-
-Status Serdes::CreateVOCDatasetNode(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("dataset_dir") != json_obj.end(), "Failed to find dataset_dir");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("task") != json_obj.end(), "Failed to find task");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("usage") != json_obj.end(), "Failed to find usage");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("class_indexing") != json_obj.end(), "Failed to find class_indexing");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("decode") != json_obj.end(), "Failed to find decode");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("sampler") != json_obj.end(), "Failed to find sampler");
-  std::string dataset_dir = json_obj["dataset_dir"];
-  std::string task = json_obj["task"];
-  std::string usage = json_obj["usage"];
-  std::map<std::string, int32_t> class_indexing;
-  nlohmann::json class_map = json_obj["class_indexing"];
-  for (const auto &class_map_child : class_map) {
-    std::string class_ = class_map_child[0];
-    int32_t indexing = class_map_child[1];
-    class_indexing.insert({class_, indexing});
-  }
-  bool decode = json_obj["decode"];
-  std::shared_ptr<SamplerObj> sampler;
-  RETURN_IF_NOT_OK(ConstructSampler(json_obj["sampler"], &sampler));
-  // default value for cache and extra_metadata - to_json function does not have the output
-  std::shared_ptr<DatasetCache> cache = nullptr;
-  bool extra_metadata = false;
-  *ds = std::make_shared<VOCNode>(dataset_dir, task, usage, class_indexing, decode, sampler, cache, extra_metadata);
-  return Status::OK();
-}
-
 Status Serdes::CreateDatasetNode(nlohmann::json json_obj, std::string op_type, std::shared_ptr<DatasetNode> *ds) {
   if (op_type == kCelebANode) {
-    RETURN_IF_NOT_OK(CreateCelebADatasetNode(json_obj, ds));
+    RETURN_IF_NOT_OK(CelebANode::from_json(json_obj, ds));
   } else if (op_type == kCifar10Node) {
-    RETURN_IF_NOT_OK(CreateCifar10DatasetNode(json_obj, ds));
+    RETURN_IF_NOT_OK(Cifar10Node::from_json(json_obj, ds));
   } else if (op_type == kCifar100Node) {
-    RETURN_IF_NOT_OK(CreateCifar100DatasetNode(json_obj, ds));
+    RETURN_IF_NOT_OK(Cifar100Node::from_json(json_obj, ds));
   } else if (op_type == kCLUENode) {
-    RETURN_IF_NOT_OK(CreateCLUEDatasetNode(json_obj, ds));
+    RETURN_IF_NOT_OK(CLUENode::from_json(json_obj, ds));
   } else if (op_type == kCocoNode) {
-    RETURN_IF_NOT_OK(CreateCocoDatasetNode(json_obj, ds));
+    RETURN_IF_NOT_OK(CocoNode::from_json(json_obj, ds));
   } else if (op_type == kCSVNode) {
-    RETURN_IF_NOT_OK(CreateCSVDatasetNode(json_obj, ds));
+    RETURN_IF_NOT_OK(CSVNode::from_json(json_obj, ds));
   } else if (op_type == kImageFolderNode) {
-    RETURN_IF_NOT_OK(CreateImageFolderDatasetNode(json_obj, ds));
+    RETURN_IF_NOT_OK(ImageFolderNode::from_json(json_obj, ds));
   } else if (op_type == kManifestNode) {
-    RETURN_IF_NOT_OK(CreateManifestDatasetNode(json_obj, ds));
+    RETURN_IF_NOT_OK(ManifestNode::from_json(json_obj, ds));
   } else if (op_type == kMnistNode) {
-    RETURN_IF_NOT_OK(CreateMnistDatasetNode(json_obj, ds));
+    RETURN_IF_NOT_OK(MnistNode::from_json(json_obj, ds));
   } else if (op_type == kTextFileNode) {
-    RETURN_IF_NOT_OK(CreateTextFileDatasetNode(json_obj, ds));
+    RETURN_IF_NOT_OK(TextFileNode::from_json(json_obj, ds));
   } else if (op_type == kTFRecordNode) {
-    RETURN_IF_NOT_OK(CreateTFRecordDatasetNode(json_obj, ds));
+    RETURN_IF_NOT_OK(TFRecordNode::from_json(json_obj, ds));
   } else if (op_type == kVOCNode) {
-    RETURN_IF_NOT_OK(CreateVOCDatasetNode(json_obj, ds));
+    RETURN_IF_NOT_OK(VOCNode::from_json(json_obj, ds));
   } else {
     return Status(StatusCode::kMDUnexpectedError, op_type + " is not supported");
   }
   return Status::OK();
 }
 
-Status Serdes::CreateBatchOperationNode(std::shared_ptr<DatasetNode> ds, nlohmann::json json_obj,
-                                        std::shared_ptr<DatasetNode> *result) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("batch_size") != json_obj.end(), "Failed to find batch_size");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("drop_remainder") != json_obj.end(), "Failed to find drop_remainder");
-  int32_t batch_size = json_obj["batch_size"];
-  bool drop_remainder = json_obj["drop_remainder"];
-  *result = std::make_shared<BatchNode>(ds, batch_size, drop_remainder);
-  return Status::OK();
-}
-
-Status Serdes::CreateMapOperationNode(std::shared_ptr<DatasetNode> ds, nlohmann::json json_obj,
-                                      std::shared_ptr<DatasetNode> *result) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_parallel_workers") != json_obj.end(),
-                               "Failed to find num_parallel_workers");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("input_columns") != json_obj.end(), "Failed to find input_columns");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("output_columns") != json_obj.end(), "Failed to find output_columns");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("project_columns") != json_obj.end(), "Failed to find project_columns");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("operations") != json_obj.end(), "Failed to find operations");
-  std::vector<std::string> input_columns = json_obj["input_columns"];
-  std::vector<std::string> output_columns = json_obj["output_columns"];
-  std::vector<std::string> project_columns = json_obj["project_columns"];
-  std::vector<std::shared_ptr<TensorOperation>> operations;
-  RETURN_IF_NOT_OK(ConstructTensorOps(json_obj["operations"], &operations));
-  *result = std::make_shared<MapNode>(ds, operations, input_columns, output_columns, project_columns);
-  (*result)->SetNumWorkers(json_obj["num_parallel_workers"]);
-  return Status::OK();
-}
-
-Status Serdes::CreateProjectOperationNode(std::shared_ptr<DatasetNode> ds, nlohmann::json json_obj,
-                                          std::shared_ptr<DatasetNode> *result) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("columns") != json_obj.end(), "Failed to find columns");
-  std::vector<std::string> columns = json_obj["columns"];
-  *result = std::make_shared<ProjectNode>(ds, columns);
-  return Status::OK();
-}
-
-Status Serdes::CreateRenameOperationNode(std::shared_ptr<DatasetNode> ds, nlohmann::json json_obj,
-                                         std::shared_ptr<DatasetNode> *result) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("input_columns") != json_obj.end(), "Failed to find input_columns");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("output_columns") != json_obj.end(), "Failed to find output_columns");
-  std::vector<std::string> input_columns = json_obj["input_columns"];
-  std::vector<std::string> output_columns = json_obj["output_columns"];
-  *result = std::make_shared<RenameNode>(ds, input_columns, output_columns);
-  return Status::OK();
-}
-
-Status Serdes::CreateRepeatOperationNode(std::shared_ptr<DatasetNode> ds, nlohmann::json json_obj,
-                                         std::shared_ptr<DatasetNode> *result) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("count") != json_obj.end(), "Failed to find count");
-  int32_t count = json_obj["count"];
-  *result = std::make_shared<RepeatNode>(ds, count);
-  return Status::OK();
-}
-
-Status Serdes::CreateShuffleOperationNode(std::shared_ptr<DatasetNode> ds, nlohmann::json json_obj,
-                                          std::shared_ptr<DatasetNode> *result) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("buffer_size") != json_obj.end(), "Failed to find buffer_size");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("reshuffle_each_epoch") != json_obj.end(),
-                               "Failed to find reshuffle_each_epoch");
-  int32_t buffer_size = json_obj["buffer_size"];
-  bool reset_every_epoch = json_obj["reshuffle_each_epoch"];
-  *result = std::make_shared<ShuffleNode>(ds, buffer_size, reset_every_epoch);
-  return Status::OK();
-}
-
-Status Serdes::CreateSkipOperationNode(std::shared_ptr<DatasetNode> ds, nlohmann::json json_obj,
-                                       std::shared_ptr<DatasetNode> *result) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("count") != json_obj.end(), "Failed to find count");
-  int32_t count = json_obj["count"];
-  *result = std::make_shared<SkipNode>(ds, count);
-  return Status::OK();
-}
-
-Status Serdes::CreateTransferOperationNode(std::shared_ptr<DatasetNode> ds, nlohmann::json json_obj,
-                                           std::shared_ptr<DatasetNode> *result) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("queue_name") != json_obj.end(), "Failed to find queue_name");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("device_type") != json_obj.end(), "Failed to find device_type");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("device_id") != json_obj.end(), "Failed to find device_id");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("send_epoch_end") != json_obj.end(), "Failed to find send_epoch_end");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("total_batch") != json_obj.end(), "Failed to find total_batch");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("create_data_info_queue") != json_obj.end(),
-                               "Failed to find create_data_info_queue");
-  std::string queue_name = json_obj["queue_name"];
-  std::string device_type = json_obj["device_type"];
-  int32_t device_id = json_obj["device_id"];
-  bool send_epoch_end = json_obj["send_epoch_end"];
-  int32_t total_batch = json_obj["total_batch"];
-  bool create_data_info_queue = json_obj["create_data_info_queue"];
-  *result = std::make_shared<TransferNode>(ds, queue_name, device_type, device_id, send_epoch_end, total_batch,
-                                           create_data_info_queue);
-  return Status::OK();
-}
-
-Status Serdes::CreateTakeOperationNode(std::shared_ptr<DatasetNode> ds, nlohmann::json json_obj,
-                                       std::shared_ptr<DatasetNode> *result) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("count") != json_obj.end(), "Failed to find count");
-  int32_t count = json_obj["count"];
-  *result = std::make_shared<TakeNode>(ds, count);
-  return Status::OK();
-}
-
 Status Serdes::CreateDatasetOperationNode(std::shared_ptr<DatasetNode> ds, nlohmann::json json_obj, std::string op_type,
                                           std::shared_ptr<DatasetNode> *result) {
   if (op_type == kBatchNode) {
-    RETURN_IF_NOT_OK(CreateBatchOperationNode(ds, json_obj, result));
+    RETURN_IF_NOT_OK(BatchNode::from_json(json_obj, ds, result));
   } else if (op_type == kMapNode) {
-    RETURN_IF_NOT_OK(CreateMapOperationNode(ds, json_obj, result));
+    RETURN_IF_NOT_OK(MapNode::from_json(json_obj, ds, result));
   } else if (op_type == kProjectNode) {
-    RETURN_IF_NOT_OK(CreateProjectOperationNode(ds, json_obj, result));
+    RETURN_IF_NOT_OK(ProjectNode::from_json(json_obj, ds, result));
   } else if (op_type == kRenameNode) {
-    RETURN_IF_NOT_OK(CreateRenameOperationNode(ds, json_obj, result));
+    RETURN_IF_NOT_OK(RenameNode::from_json(json_obj, ds, result));
   } else if (op_type == kRepeatNode) {
-    RETURN_IF_NOT_OK(CreateRepeatOperationNode(ds, json_obj, result));
+    RETURN_IF_NOT_OK(RepeatNode::from_json(json_obj, ds, result));
   } else if (op_type == kShuffleNode) {
-    RETURN_IF_NOT_OK(CreateShuffleOperationNode(ds, json_obj, result));
+    RETURN_IF_NOT_OK(ShuffleNode::from_json(json_obj, ds, result));
   } else if (op_type == kSkipNode) {
-    RETURN_IF_NOT_OK(CreateSkipOperationNode(ds, json_obj, result));
+    RETURN_IF_NOT_OK(SkipNode::from_json(json_obj, ds, result));
   } else if (op_type == kTransferNode) {
-    RETURN_IF_NOT_OK(CreateTransferOperationNode(ds, json_obj, result));
+    RETURN_IF_NOT_OK(TransferNode::from_json(json_obj, ds, result));
   } else if (op_type == kTakeNode) {
-    RETURN_IF_NOT_OK(CreateTakeOperationNode(ds, json_obj, result));
+    RETURN_IF_NOT_OK(TakeNode::from_json(json_obj, ds, result));
   } else {
     return Status(StatusCode::kMDUnexpectedError, op_type + " operation is not supported");
   }
   return Status::OK();
 }
 
-Status Serdes::ConstructDistributedSampler(nlohmann::json json_obj, int64_t num_samples,
-                                           std::shared_ptr<SamplerObj> *sampler) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_shards") != json_obj.end(), "Failed to find num_shards");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("shard_id") != json_obj.end(), "Failed to find shard_id");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("shuffle") != json_obj.end(), "Failed to find shuffle");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("seed") != json_obj.end(), "Failed to find seed");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("offset") != json_obj.end(), "Failed to find offset");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("even_dist") != json_obj.end(), "Failed to find even_dist");
-  int64_t num_shards = json_obj["num_shards"];
-  int64_t shard_id = json_obj["shard_id"];
-  bool shuffle = json_obj["shuffle"];
-  uint32_t seed = json_obj["seed"];
-  int64_t offset = json_obj["offset"];
-  bool even_dist = json_obj["even_dist"];
-  *sampler =
-    std::make_shared<DistributedSamplerObj>(num_shards, shard_id, shuffle, num_samples, seed, offset, even_dist);
-  if (json_obj.find("child_sampler") != json_obj.end()) {
-    std::shared_ptr<SamplerObj> parent_sampler = *sampler;
-    RETURN_IF_NOT_OK(ChildSamplerFromJson(json_obj, parent_sampler, sampler));
-  }
-  return Status::OK();
-}
-
-Status Serdes::ConstructPKSampler(nlohmann::json json_obj, int64_t num_samples, std::shared_ptr<SamplerObj> *sampler) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_val") != json_obj.end(), "Failed to find num_val");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("shuffle") != json_obj.end(), "Failed to find shuffle");
-  int64_t num_val = json_obj["num_val"];
-  bool shuffle = json_obj["shuffle"];
-  *sampler = std::make_shared<PKSamplerObj>(num_val, shuffle, num_samples);
-  if (json_obj.find("child_sampler") != json_obj.end()) {
-    std::shared_ptr<SamplerObj> parent_sampler = *sampler;
-    RETURN_IF_NOT_OK(ChildSamplerFromJson(json_obj, parent_sampler, sampler));
-  }
-  return Status::OK();
-}
-
-Status Serdes::ConstructRandomSampler(nlohmann::json json_obj, int64_t num_samples,
-                                      std::shared_ptr<SamplerObj> *sampler) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("replacement") != json_obj.end(), "Failed to find replacement");
-  bool replacement = json_obj["replacement"];
-  *sampler = std::make_shared<RandomSamplerObj>(replacement, num_samples);
-  if (json_obj.find("child_sampler") != json_obj.end()) {
-    std::shared_ptr<SamplerObj> parent_sampler = *sampler;
-    RETURN_IF_NOT_OK(ChildSamplerFromJson(json_obj, parent_sampler, sampler));
-  }
-  return Status::OK();
-}
-
-Status Serdes::ConstructSequentialSampler(nlohmann::json json_obj, int64_t num_samples,
-                                          std::shared_ptr<SamplerObj> *sampler) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("start_index") != json_obj.end(), "Failed to find start_index");
-  int64_t start_index = json_obj["start_index"];
-  *sampler = std::make_shared<SequentialSamplerObj>(start_index, num_samples);
-  if (json_obj.find("child_sampler") != json_obj.end()) {
-    std::shared_ptr<SamplerObj> parent_sampler = *sampler;
-    RETURN_IF_NOT_OK(ChildSamplerFromJson(json_obj, parent_sampler, sampler));
-  }
-  return Status::OK();
-}
-
-Status Serdes::ConstructSubsetRandomSampler(nlohmann::json json_obj, int64_t num_samples,
-                                            std::shared_ptr<SamplerObj> *sampler) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("indices") != json_obj.end(), "Failed to find indices");
-  std::vector<int64_t> indices = json_obj["indices"];
-  *sampler = std::make_shared<SubsetRandomSamplerObj>(indices, num_samples);
-  if (json_obj.find("child_sampler") != json_obj.end()) {
-    std::shared_ptr<SamplerObj> parent_sampler = *sampler;
-    RETURN_IF_NOT_OK(ChildSamplerFromJson(json_obj, parent_sampler, sampler));
-  }
-  return Status::OK();
-}
-
-Status Serdes::ConstructWeightedRandomSampler(nlohmann::json json_obj, int64_t num_samples,
-                                              std::shared_ptr<SamplerObj> *sampler) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("replacement") != json_obj.end(), "Failed to find replacement");
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("weights") != json_obj.end(), "Failed to find weights");
-  bool replacement = json_obj["replacement"];
-  std::vector<double> weights = json_obj["weights"];
-  *sampler = std::make_shared<WeightedRandomSamplerObj>(weights, num_samples, replacement);
-  if (json_obj.find("child_sampler") != json_obj.end()) {
-    std::shared_ptr<SamplerObj> parent_sampler = *sampler;
-    RETURN_IF_NOT_OK(ChildSamplerFromJson(json_obj, parent_sampler, sampler));
-  }
-  return Status::OK();
-}
-
 Status Serdes::ConstructSampler(nlohmann::json json_obj, std::shared_ptr<SamplerObj> *sampler) {
   CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("num_samples") != json_obj.end(), "Failed to find num_samples");
   CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("sampler_name") != json_obj.end(), "Failed to find sampler_name");
   int64_t num_samples = json_obj["num_samples"];
   std::string sampler_name = json_obj["sampler_name"];
   if (sampler_name == "DistributedSampler") {
-    RETURN_IF_NOT_OK(ConstructDistributedSampler(json_obj, num_samples, sampler));
+    RETURN_IF_NOT_OK(DistributedSamplerObj::from_json(json_obj, num_samples, sampler));
   } else if (sampler_name == "PKSampler") {
-    RETURN_IF_NOT_OK(ConstructPKSampler(json_obj, num_samples, sampler));
+    RETURN_IF_NOT_OK(PKSamplerObj::from_json(json_obj, num_samples, sampler));
   } else if (sampler_name == "RandomSampler") {
-    RETURN_IF_NOT_OK(ConstructRandomSampler(json_obj, num_samples, sampler));
+    RETURN_IF_NOT_OK(RandomSamplerObj::from_json(json_obj, num_samples, sampler));
   } else if (sampler_name == "SequentialSampler") {
-    RETURN_IF_NOT_OK(ConstructSequentialSampler(json_obj, num_samples, sampler));
+    RETURN_IF_NOT_OK(SequentialSamplerObj::from_json(json_obj, num_samples, sampler));
+  } else if (sampler_name == "SubsetSampler") {
+    RETURN_IF_NOT_OK(SubsetSamplerObj::from_json(json_obj, num_samples, sampler));
   } else if (sampler_name == "SubsetRandomSampler") {
-    RETURN_IF_NOT_OK(ConstructSubsetRandomSampler(json_obj, num_samples, sampler));
+    RETURN_IF_NOT_OK(SubsetRandomSamplerObj::from_json(json_obj, num_samples, sampler));
   } else if (sampler_name == "WeightedRandomSampler") {
-    RETURN_IF_NOT_OK(ConstructWeightedRandomSampler(json_obj, num_samples, sampler));
+    RETURN_IF_NOT_OK(WeightedRandomSamplerObj::from_json(json_obj, num_samples, sampler));
   } else {
     return Status(StatusCode::kMDUnexpectedError, sampler_name + "Sampler is not supported");
   }
   return Status::OK();
 }
 
-Status Serdes::ChildSamplerFromJson(nlohmann::json json_obj, std::shared_ptr<SamplerObj> parent_sampler,
-                                    std::shared_ptr<SamplerObj> *sampler) {
-  CHECK_FAIL_RETURN_UNEXPECTED(json_obj.find("child_sampler") != json_obj.end(), "Failed to find child_sampler");
-  for (nlohmann::json child : json_obj["child_sampler"]) {
-    std::shared_ptr<SamplerObj> child_sampler;
-    RETURN_IF_NOT_OK(ConstructSampler(child, &child_sampler));
-    parent_sampler.get()->AddChildSampler(child_sampler);
-  }
-  return Status::OK();
-}
-
-Status Serdes::BoundingBoxAugmentFromJson(nlohmann::json op_params, std::shared_ptr<TensorOperation> *operation) {
-  CHECK_FAIL_RETURN_UNEXPECTED(op_params.find("transform") != op_params.end(), "Failed to find transform");
-  CHECK_FAIL_RETURN_UNEXPECTED(op_params.find("ratio") != op_params.end(), "Failed to find ratio");
-  std::vector<std::shared_ptr<TensorOperation>> transforms;
-  std::vector<nlohmann::json> json_operations = {};
-  json_operations.push_back(op_params["transform"]);
-  RETURN_IF_NOT_OK(ConstructTensorOps(json_operations, &transforms));
-  float ratio = op_params["ratio"];
-  CHECK_FAIL_RETURN_UNEXPECTED(transforms.size() == 1,
-                               "Expect size one of transforms parameter, but got:" + std::to_string(transforms.size()));
-  *operation = std::make_shared<vision::BoundingBoxAugmentOperation>(transforms[0], ratio);
-  return Status::OK();
-}
-
-Status Serdes::RandomSelectSubpolicyFromJson(nlohmann::json op_params, std::shared_ptr<TensorOperation> *operation) {
-  CHECK_FAIL_RETURN_UNEXPECTED(op_params.find("policy") != op_params.end(), "Failed to find policy");
-  nlohmann::json policy_json = op_params["policy"];
-  std::vector<std::vector<std::pair<std::shared_ptr<TensorOperation>, double>>> policy;
-  std::vector<std::pair<std::shared_ptr<TensorOperation>, double>> policy_items;
-  for (nlohmann::json item : policy_json) {
-    for (nlohmann::json item_pair : item) {
-      CHECK_FAIL_RETURN_UNEXPECTED(item_pair.find("prob") != item_pair.end(), "Failed to find prob");
-      CHECK_FAIL_RETURN_UNEXPECTED(item_pair.find("tensor_op") != item_pair.end(), "Failed to find tensor_op");
-      std::vector<std::shared_ptr<TensorOperation>> operations;
-      std::pair<std::shared_ptr<TensorOperation>, double> policy_pair;
-      std::shared_ptr<TensorOperation> operation;
-      nlohmann::json tensor_op_json;
-      double prob = item_pair["prob"];
-      tensor_op_json.push_back(item_pair["tensor_op"]);
-      RETURN_IF_NOT_OK(ConstructTensorOps(tensor_op_json, &operations));
-      CHECK_FAIL_RETURN_UNEXPECTED(operations.size() == 1, "There should be only 1 tensor operation");
-      policy_pair = std::make_pair(operations[0], prob);
-      policy_items.push_back(policy_pair);
-    }
-    policy.push_back(policy_items);
-  }
-  *operation = std::make_shared<vision::RandomSelectSubpolicyOperation>(policy);
-  return Status::OK();
-}
-
-Status Serdes::UniformAugFromJson(nlohmann::json op_params, std::shared_ptr<TensorOperation> *operation) {
-  CHECK_FAIL_RETURN_UNEXPECTED(op_params.find("transforms") != op_params.end(), "Failed to find transforms");
-  CHECK_FAIL_RETURN_UNEXPECTED(op_params.find("num_ops") != op_params.end(), "Failed to find num_ops");
-  std::vector<std::shared_ptr<TensorOperation>> transforms = {};
-  RETURN_IF_NOT_OK(ConstructTensorOps(op_params["transforms"], &transforms));
-  int32_t num_ops = op_params["num_ops"];
-  *operation = std::make_shared<vision::UniformAugOperation>(transforms, num_ops);
-  return Status::OK();
-}
-
-Status Serdes::ConstructTensorOps(nlohmann::json operations, std::vector<std::shared_ptr<TensorOperation>> *result) {
+Status Serdes::ConstructTensorOps(nlohmann::json json_obj, std::vector<std::shared_ptr<TensorOperation>> *result) {
   std::vector<std::shared_ptr<TensorOperation>> output;
-  for (auto op : operations) {
-    CHECK_FAIL_RETURN_UNEXPECTED(op.find("is_python_front_end_op") == op.end(),
+  for (nlohmann::json item : json_obj) {
+    CHECK_FAIL_RETURN_UNEXPECTED(item.find("is_python_front_end_op") == item.end(),
                                  "python operation is not yet supported");
-    CHECK_FAIL_RETURN_UNEXPECTED(op.find("tensor_op_name") != op.end(), "Failed to find tensor_op_name");
-    CHECK_FAIL_RETURN_UNEXPECTED(op.find("tensor_op_params") != op.end(), "Failed to find tensor_op_params");
-    std::string op_name = op["tensor_op_name"];
-    nlohmann::json op_params = op["tensor_op_params"];
+    CHECK_FAIL_RETURN_UNEXPECTED(item.find("tensor_op_name") != item.end(), "Failed to find tensor_op_name");
+    CHECK_FAIL_RETURN_UNEXPECTED(item.find("tensor_op_params") != item.end(), "Failed to find tensor_op_params");
+    std::string op_name = item["tensor_op_name"];
+    nlohmann::json op_params = item["tensor_op_params"];
     std::shared_ptr<TensorOperation> operation = nullptr;
     CHECK_FAIL_RETURN_UNEXPECTED(func_ptr_.find(op_name) != func_ptr_.end(), "Failed to find " + op_name);
     RETURN_IF_NOT_OK(func_ptr_[op_name](op_params, &operation));
@@ -716,7 +231,7 @@ Serdes::InitializeFuncPtr() {
   std::map<std::string, Status (*)(nlohmann::json json_obj, std::shared_ptr<TensorOperation> * operation)> ops_ptr;
   ops_ptr[vision::kAffineOperation] = &(vision::AffineOperation::from_json);
   ops_ptr[vision::kAutoContrastOperation] = &(vision::AutoContrastOperation::from_json);
-  ops_ptr[vision::kBoundingBoxAugmentOperation] = &(BoundingBoxAugmentFromJson);
+  ops_ptr[vision::kBoundingBoxAugmentOperation] = &(vision::BoundingBoxAugmentOperation::from_json);
   ops_ptr[vision::kCenterCropOperation] = &(vision::CenterCropOperation::from_json);
   ops_ptr[vision::kCropOperation] = &(vision::CropOperation::from_json);
   ops_ptr[vision::kCutMixBatchOperation] = &(vision::CutMixBatchOperation::from_json);
@@ -745,7 +260,7 @@ Serdes::InitializeFuncPtr() {
   ops_ptr[vision::kRandomResizedCropOperation] = &(vision::RandomResizedCropOperation::from_json);
   ops_ptr[vision::kRandomResizedCropWithBBoxOperation] = &(vision::RandomResizedCropWithBBoxOperation::from_json);
   ops_ptr[vision::kRandomRotationOperation] = &(vision::RandomRotationOperation::from_json);
-  ops_ptr[vision::kRandomSelectSubpolicyOperation] = &(RandomSelectSubpolicyFromJson);
+  ops_ptr[vision::kRandomSelectSubpolicyOperation] = &(vision::RandomSelectSubpolicyOperation::from_json);
   ops_ptr[vision::kRandomSharpnessOperation] = &(vision::RandomSharpnessOperation::from_json);
   ops_ptr[vision::kRandomSolarizeOperation] = &(vision::RandomSolarizeOperation::from_json);
   ops_ptr[vision::kRandomVerticalFlipOperation] = &(vision::RandomVerticalFlipOperation::from_json);
@@ -766,7 +281,7 @@ Serdes::InitializeFuncPtr() {
     &(vision::SoftDvppDecodeRandomCropResizeJpegOperation::from_json);
   ops_ptr[vision::kSoftDvppDecodeResizeJpegOperation] = &(vision::SoftDvppDecodeResizeJpegOperation::from_json);
   ops_ptr[vision::kSwapRedBlueOperation] = &(vision::SwapRedBlueOperation::from_json);
-  ops_ptr[vision::kUniformAugOperation] = &(UniformAugFromJson);
+  ops_ptr[vision::kUniformAugOperation] = &(vision::UniformAugOperation::from_json);
   ops_ptr[vision::kVerticalFlipOperation] = &(vision::VerticalFlipOperation::from_json);
   ops_ptr[transforms::kFillOperation] = &(transforms::FillOperation::from_json);
   ops_ptr[transforms::kOneHotOperation] = &(transforms::OneHotOperation::from_json);
diff --git a/mindspore/ccsrc/minddata/dataset/engine/serdes.h b/mindspore/ccsrc/minddata/dataset/engine/serdes.h
index 962b622c66d..72c8721af95 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/serdes.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/serdes.h
@@ -159,6 +159,18 @@ class Serdes {
   /// \return Status The status code returned
   static Status ConstructPipeline(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
 
+  /// \brief Helper functions for creating sampler, separate different samplers and call the related function
+  /// \param[in] json_obj The JSON object to be deserialized
+  /// \param[out] sampler Deserialized sampler
+  /// \return Status The status code returned
+  static Status ConstructSampler(nlohmann::json json_obj, std::shared_ptr<SamplerObj> *sampler);
+
+  /// \brief helper function to construct tensor operations
+  /// \param[in] json_obj json object of operations to be deserilized
+  /// \param[out] vector of tensor operation pointer
+  /// \return Status The status code returned
+  static Status ConstructTensorOps(nlohmann::json json_obj, std::vector<std::shared_ptr<TensorOperation>> *result);
+
  protected:
   /// \brief Helper function to save JSON to a file
   /// \param[in] json_string The JSON string to be saved to the file
@@ -189,91 +201,6 @@ class Serdes {
   static Status CreateDatasetOperationNode(std::shared_ptr<DatasetNode> ds, nlohmann::json json_obj,
                                            std::string op_type, std::shared_ptr<DatasetNode> *result);
 
-  /// \brief Helper functions for creating sampler, separate different samplers and call the related function
-  /// \param[in] json_obj The JSON object to be deserialized
-  /// \param[out] sampler Deserialized sampler
-  /// \return Status The status code returned
-  static Status ConstructSampler(nlohmann::json json_obj, std::shared_ptr<SamplerObj> *sampler);
-
-  /// \brief helper function to construct tensor operations
-  /// \param[in] operations operations to be deserilized
-  /// \param[out] vector of tensor operation pointer
-  /// \return Status The status code returned
-  static Status ConstructTensorOps(nlohmann::json operations, std::vector<std::shared_ptr<TensorOperation>> *result);
-
-  /// \brief Helper functions for different datasets
-  /// \param[in] json_obj The JSON object to be deserialized
-  /// \param[out] ds Deserialized dataset
-  /// \return Status The status code returned
-  static Status CreateCelebADatasetNode(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
-  static Status CreateCifar10DatasetNode(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
-  static Status CreateCifar100DatasetNode(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
-  static Status CreateCLUEDatasetNode(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
-  static Status CreateCocoDatasetNode(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
-  static Status CreateCSVDatasetNode(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
-  static Status CreateImageFolderDatasetNode(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
-  static Status CreateManifestDatasetNode(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
-  static Status CreateMnistDatasetNode(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
-  static Status CreateTextFileDatasetNode(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
-  static Status CreateTFRecordDatasetNode(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
-  static Status CreateVOCDatasetNode(nlohmann::json json_obj, std::shared_ptr<DatasetNode> *ds);
-
-  /// \brief Helper functions for different operations
-  /// \param[in] ds dataset node constructed
-  /// \param[in] json_obj The JSON object to be deserialized
-  /// \param[out] result Deserialized dataset after the operation
-  /// \return Status The status code returned
-  static Status CreateBatchOperationNode(std::shared_ptr<DatasetNode> ds, nlohmann::json json_obj,
-                                         std::shared_ptr<DatasetNode> *result);
-  static Status CreateMapOperationNode(std::shared_ptr<DatasetNode> ds, nlohmann::json json_obj,
-                                       std::shared_ptr<DatasetNode> *result);
-  static Status CreateProjectOperationNode(std::shared_ptr<DatasetNode> ds, nlohmann::json json_obj,
-                                           std::shared_ptr<DatasetNode> *result);
-  static Status CreateRenameOperationNode(std::shared_ptr<DatasetNode> ds, nlohmann::json json_obj,
-                                          std::shared_ptr<DatasetNode> *result);
-  static Status CreateRepeatOperationNode(std::shared_ptr<DatasetNode> ds, nlohmann::json json_obj,
-                                          std::shared_ptr<DatasetNode> *result);
-  static Status CreateShuffleOperationNode(std::shared_ptr<DatasetNode> ds, nlohmann::json json_obj,
-                                           std::shared_ptr<DatasetNode> *result);
-  static Status CreateSkipOperationNode(std::shared_ptr<DatasetNode> ds, nlohmann::json json_obj,
-                                        std::shared_ptr<DatasetNode> *result);
-  static Status CreateTransferOperationNode(std::shared_ptr<DatasetNode> ds, nlohmann::json json_obj,
-                                            std::shared_ptr<DatasetNode> *result);
-  static Status CreateTakeOperationNode(std::shared_ptr<DatasetNode> ds, nlohmann::json json_obj,
-                                        std::shared_ptr<DatasetNode> *result);
-
-  /// \brief Helper functions for different samplers
-  /// \param[in] json_obj The JSON object to be deserialized
-  /// \param[out] sampler Deserialized sampler
-  /// \return Status The status code returned
-  static Status ConstructDistributedSampler(nlohmann::json json_obj, int64_t num_samples,
-                                            std::shared_ptr<SamplerObj> *sampler);
-  static Status ConstructPKSampler(nlohmann::json json_obj, int64_t num_samples, std::shared_ptr<SamplerObj> *sampler);
-  static Status ConstructRandomSampler(nlohmann::json json_obj, int64_t num_samples,
-                                       std::shared_ptr<SamplerObj> *sampler);
-  static Status ConstructSequentialSampler(nlohmann::json json_obj, int64_t num_samples,
-                                           std::shared_ptr<SamplerObj> *sampler);
-  static Status ConstructSubsetRandomSampler(nlohmann::json json_obj, int64_t num_samples,
-                                             std::shared_ptr<SamplerObj> *sampler);
-  static Status ConstructWeightedRandomSampler(nlohmann::json json_obj, int64_t num_samples,
-                                               std::shared_ptr<SamplerObj> *sampler);
-
-  /// \brief Helper functions to construct children samplers
-  /// \param[in] json_obj The JSON object to be deserialized
-  /// \param[in] parent_sampler given parent sampler
-  /// \param[out] sampler sampler constructed - parent sampler with children samplers added
-  /// \return Status The status code returned
-  static Status ChildSamplerFromJson(nlohmann::json json_obj, std::shared_ptr<SamplerObj> parent_sampler,
-                                     std::shared_ptr<SamplerObj> *sampler);
-
-  /// \brief Helper functions for vision operations, which requires tensor operations as input
-  /// \param[in] op_params operation parameters for the operation
-  /// \param[out] operation deserialized operation
-  /// \return Status The status code returned
-  static Status BoundingBoxAugmentFromJson(nlohmann::json op_params, std::shared_ptr<TensorOperation> *operation);
-  static Status RandomSelectSubpolicyFromJson(nlohmann::json op_params, std::shared_ptr<TensorOperation> *operation);
-  static Status UniformAugFromJson(nlohmann::json op_params, std::shared_ptr<TensorOperation> *operation);
-
   /// \brief Helper function to map the function pointers
   /// \return map of key to function pointer
   static std::map<std::string, Status (*)(nlohmann::json json_obj, std::shared_ptr<TensorOperation> *operation)>
diff --git a/mindspore/ccsrc/minddata/dataset/engine/tree_adapter.cc b/mindspore/ccsrc/minddata/dataset/engine/tree_adapter.cc
index ee2900cb72f..18171c5bb20 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/tree_adapter.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/tree_adapter.cc
@@ -48,6 +48,7 @@ TreeAdapter::TreeAdapter(UsageFlag usage) : usage_(usage), launched_(false), tre
 }
 
 Status TreeAdapter::PrePass(std::shared_ptr<DatasetNode> ir) {
+  RETURN_UNEXPECTED_IF_NULL(ir);
   // Vector of actions in pre-pass phase
   std::vector<std::unique_ptr<IRPass>> actions;
 
@@ -73,6 +74,7 @@ Status TreeAdapter::PrePass(std::shared_ptr<DatasetNode> ir) {
 }
 
 Status TreeAdapter::Optimize(std::shared_ptr<DatasetNode> ir) {
+  RETURN_UNEXPECTED_IF_NULL(ir);
   // Vector of optimizations
   std::vector<std::unique_ptr<IRNodePass>> optimizations;
   MS_LOG(INFO) << "Running optimization pass loops";
@@ -89,6 +91,7 @@ Status TreeAdapter::Optimize(std::shared_ptr<DatasetNode> ir) {
 }
 
 Status TreeAdapter::PostPass(std::shared_ptr<DatasetNode> ir) {
+  RETURN_UNEXPECTED_IF_NULL(ir);
   // Vector of actions in post-pass phase
   std::vector<std::unique_ptr<IRPass>> actions;
   MS_LOG(INFO) << "Running post pass loops.";
@@ -118,6 +121,9 @@ Status TreeAdapter::PostPass(std::shared_ptr<DatasetNode> ir) {
 }
 
 Status TreeAdapter::BuildExecutionTreeRecur(std::shared_ptr<DatasetNode> ir, std::shared_ptr<DatasetOp> *const op) {
+  RETURN_UNEXPECTED_IF_NULL(ir);
+  RETURN_UNEXPECTED_IF_NULL(op);
+  RETURN_UNEXPECTED_IF_NULL(tree_);
   // Build the DatasetOp ExecutionTree from the optimized IR tree
   std::vector<std::shared_ptr<DatasetOp>> ops;
   RETURN_IF_NOT_OK(ir->Build(&ops));
@@ -133,7 +139,7 @@ Status TreeAdapter::BuildExecutionTreeRecur(std::shared_ptr<DatasetNode> ir, std
   }
 
   // Build the children of IR, once they return, add the return value to *op
-  for (std::shared_ptr<DatasetNode> child_ir : ir->Children()) {
+  for (const std::shared_ptr<DatasetNode> &child_ir : ir->Children()) {
     std::shared_ptr<DatasetOp> child_op;
     RETURN_IF_NOT_OK(BuildExecutionTreeRecur(child_ir, &child_op));
     RETURN_IF_NOT_OK(ops.back()->AddChild(child_op));  // append children to the last of ops
@@ -143,6 +149,7 @@ Status TreeAdapter::BuildExecutionTreeRecur(std::shared_ptr<DatasetNode> ir, std
 }
 
 Status TreeAdapter::Build(std::shared_ptr<DatasetNode> root_ir) {
+  RETURN_UNEXPECTED_IF_NULL(root_ir);
   // This will evolve in the long run
   tree_ = std::make_unique<ExecutionTree>();
   // disable profiling if this is only a getter pass
diff --git a/mindspore/ccsrc/minddata/dataset/engine/tree_adapter_lite.cc b/mindspore/ccsrc/minddata/dataset/engine/tree_adapter_lite.cc
index fb9b39a621e..a6817a9ee3a 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/tree_adapter_lite.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/tree_adapter_lite.cc
@@ -22,6 +22,8 @@ namespace dataset {
 TreeAdapterLite::TreeAdapterLite() : root_(nullptr) { tree_ = std::make_unique<ExecutionTree>(); }
 
 Status TreeAdapterLite::BuildExecutionTreeRecur(std::shared_ptr<DatasetNode> ir, std::shared_ptr<DatasetOp> *const op) {
+  RETURN_UNEXPECTED_IF_NULL(ir);
+  RETURN_UNEXPECTED_IF_NULL(op);
   // Build the DatasetOp ExecutionTree from the optimized IR tree
   std::vector<std::shared_ptr<DatasetOp>> ops;
   RETURN_IF_NOT_OK(ir->Build(&ops));
@@ -41,7 +43,7 @@ Status TreeAdapterLite::BuildExecutionTreeRecur(std::shared_ptr<DatasetNode> ir,
   }
 
   // Build the children of IR, once they return, add the return value to *op
-  for (std::shared_ptr<DatasetNode> child_ir : ir->Children()) {
+  for (const std::shared_ptr<DatasetNode> &child_ir : ir->Children()) {
     std::shared_ptr<DatasetOp> child_op;
     RETURN_IF_NOT_OK(BuildExecutionTreeRecur(child_ir, &child_op));
     RETURN_IF_NOT_OK(ops.back()->AddChild(child_op));  // append children to the last of ops
@@ -60,6 +62,7 @@ Status TreeAdapterLite::BuildTree(std::shared_ptr<DatasetNode> root_ir) {
 Status TreeAdapterLite::GetNextRow(TensorRow *const row) {
   RETURN_UNEXPECTED_IF_NULL(root_);
   RETURN_IF_NOT_OK(root_->GetNextRowPullMode(row));
+  RETURN_UNEXPECTED_IF_NULL(row);
   return Status::OK();
 }
 
diff --git a/mindspore/ccsrc/minddata/dataset/include/dataset/audio.h b/mindspore/ccsrc/minddata/dataset/include/dataset/audio.h
index e3286e968ab..b2fa960ad20 100644
--- a/mindspore/ccsrc/minddata/dataset/include/dataset/audio.h
+++ b/mindspore/ccsrc/minddata/dataset/include/dataset/audio.h
@@ -35,6 +35,7 @@ class TensorOperation;
 
 // Transform operations for performing computer audio.
 namespace audio {
+
 /// \brief Compute the angle of complex tensor input.
 class Angle final : public TensorTransform {
  public:
@@ -98,10 +99,10 @@ class AllpassBiquad final : public TensorTransform {
 class AmplitudeToDB final : public TensorTransform {
  public:
   /// \brief Constructor.
-  /// \param[in] stype ['kPower', 'kMagnitude']
-  /// \param[in] ref_value Calculate db_multiplier
-  /// \param[in] amin Clamp the input waveform
-  /// \param[in] top_db Decibels cut-off value
+  /// \param[in] stype ['kPower', 'kMagnitude'].
+  /// \param[in] ref_value Calculate db_multiplier.
+  /// \param[in] amin Clamp the input waveform.
+  /// \param[in] top_db Decibels cut-off value.
   explicit AmplitudeToDB(ScaleType stype = ScaleType::kPower, float ref_value = 1.0, float amin = 1e-10,
                          float top_db = 80.0);
 
@@ -124,9 +125,9 @@ class BandpassBiquad final : public TensorTransform {
   /// \brief Constructor.
   /// \param[in] sample_rate Sampling rate of the waveform, e.g. 44100 (Hz).
   /// \param[in] central_freq Central frequency (in Hz).
-  /// \param[in] Q Quality factor, https://en.wikipedia.org/wiki/Q_factor  (Default: 0.707).
+  /// \param[in] Q Quality factor, https://en.wikipedia.org/wiki/Q_factor (Default: 0.707).
   /// \param[in] const_skirt_gain, If ``True``, uses a constant skirt gain (peak gain = Q). If ``False``, uses a
-  /// constant 0dB peak gain. (Default: False).
+  ///     constant 0dB peak gain (Default: False).
   explicit BandpassBiquad(int32_t sample_rate, float central_freq, float Q = 0.707, bool const_skirt_gain = false);
 
   /// \brief Destructor.
@@ -187,6 +188,81 @@ class BassBiquad final : public TensorTransform {
   std::shared_ptr<Data> data_;
 };
 
+/// \brief ComplexNorm TensorTransform.
+/// \notes Compute the norm of complex tensor input.
+class ComplexNorm final : public TensorTransform {
+ public:
+  /// \brief Constructor.
+  /// \param[in] power Power of the norm, which must be non-negative (Default: 1.0).
+  explicit ComplexNorm(float power = 1.0);
+
+  /// \brief Destructor.
+  ~ComplexNorm() = default;
+
+ protected:
+  /// \brief Function to convert TensorTransform object into a TensorOperation object.
+  /// \return Shared pointer to TensorOperation object.
+  std::shared_ptr<TensorOperation> Parse() override;
+
+ private:
+  struct Data;
+  std::shared_ptr<Data> data_;
+};
+
+/// \brief FrequencyMasking TensorTransform.
+/// \notes Apply masking to a spectrogram in the frequency domain.
+class FrequencyMasking final : public TensorTransform {
+ public:
+  /// \brief Constructor.
+  /// \param[in] iid_masks Whether to apply different masks to each example.
+  /// \param[in] frequency_mask_param Maximum possible length of the mask.
+  ///     Indices uniformly sampled from [0, frequency_mask_param].
+  ///     Mask width when iid_masks=true.
+  /// \param[in] mask_start Mask start when iid_masks=true.
+  /// \param[in] mask_value Mask value.
+  explicit FrequencyMasking(bool iid_masks = false, int32_t frequency_mask_param = 0, int32_t mask_start = 0,
+                            double mask_value = 0.0);
+
+  /// \brief Destructor.
+  ~FrequencyMasking() = default;
+
+ protected:
+  /// \brief Function to convert TensorTransform object into a TensorOperation object.
+  /// \return Shared pointer to TensorOperation object.
+  std::shared_ptr<TensorOperation> Parse() override;
+
+ private:
+  struct Data;
+  std::shared_ptr<Data> data_;
+};
+
+/// \brief TimeMasking TensorTransform.
+/// \notes Apply masking to a spectrogram in the time domain.
+class TimeMasking final : public TensorTransform {
+ public:
+  /// \brief Constructor.
+  /// \param[in] iid_masks Whether to apply different masks to each example.
+  /// \param[in] time_mask_param Maximum possible length of the mask.
+  ///     Indices uniformly sampled from [0, time_mask_param].
+  ///     Mask width when iid_masks=true.
+  /// \param[in] mask_start Mask start when iid_masks=true.
+  /// \param[in] mask_value Mask value.
+  explicit TimeMasking(bool iid_masks = false, int64_t time_mask_param = 0, int64_t mask_start = 0,
+                       double mask_value = 0.0);
+
+  /// \brief Destructor.
+  ~TimeMasking() = default;
+
+ protected:
+  /// \brief Function to convert TensorTransform object into a TensorOperation object.
+  /// \return Shared pointer to TensorOperation object.
+  std::shared_ptr<TensorOperation> Parse() override;
+
+ private:
+  struct Data;
+  std::shared_ptr<Data> data_;
+};
+
 /// \brief TimeStretch TensorTransform
 /// \notes Stretch STFT in time at a given rate, without changing the pitch.
 class TimeStretch final : public TensorTransform {
diff --git a/mindspore/ccsrc/minddata/dataset/include/dataset/constants.h b/mindspore/ccsrc/minddata/dataset/include/dataset/constants.h
index 7af6fb81267..47f081825e2 100644
--- a/mindspore/ccsrc/minddata/dataset/include/dataset/constants.h
+++ b/mindspore/ccsrc/minddata/dataset/include/dataset/constants.h
@@ -153,9 +153,19 @@ enum class OutputFormat {
 // convenience functions for 32bit int bitmask
 inline bool BitTest(uint32_t bits, uint32_t bitMask) { return (bits & bitMask) == bitMask; }
 
-inline void BitSet(uint32_t *bits, uint32_t bitMask) { *bits |= bitMask; }
+inline void BitSet(uint32_t *bits, uint32_t bitMask) {
+  if (bits == nullptr) {
+    return;
+  }
+  *bits |= bitMask;
+}
 
-inline void BitClear(uint32_t *bits, uint32_t bitMask) { *bits &= (~bitMask); }
+inline void BitClear(uint32_t *bits, uint32_t bitMask) {
+  if (bits == nullptr) {
+    return;
+  }
+  *bits &= (~bitMask);
+}
 
 constexpr int64_t kDeMaxDim = std::numeric_limits<int64_t>::max();
 constexpr int32_t kDeMaxRank = std::numeric_limits<int32_t>::max();
diff --git a/mindspore/ccsrc/minddata/dataset/include/dataset/datasets.h b/mindspore/ccsrc/minddata/dataset/include/dataset/datasets.h
index 90017f22968..d76c39733e0 100644
--- a/mindspore/ccsrc/minddata/dataset/include/dataset/datasets.h
+++ b/mindspore/ccsrc/minddata/dataset/include/dataset/datasets.h
@@ -1545,60 +1545,6 @@ inline std::shared_ptr<ConcatDataset> operator+(const std::shared_ptr<Dataset> &
   return std::make_shared<ConcatDataset>(std::vector({datasets1, datasets2}));
 }
 
-class CmuArcticDataset : public Dataset {
- public:
-  explicit CmuArcticDataset(const std::vector<char> &dataset_dir, const std::vector<char> &usage,
-                        const std::shared_ptr<Sampler> &sampler, const std::shared_ptr<DatasetCache> &cache);
-  explicit CmuArcticDataset(const std::vector<char> &dataset_dir, const std::vector<char> &usage, const Sampler *sampler,
-                        const std::shared_ptr<DatasetCache> &cache);
-  explicit CmuArcticDataset(const std::vector<char> &dataset_dir, const std::vector<char> &usage,
-                        const std::reference_wrapper<Sampler> sampler, const std::shared_ptr<DatasetCache> &cache);
-  ~CmuArcticDataset() = default;
-};
-
-/// \brief Function to create a CmuArcticDataset.
-/// \note The generated dataset has two columns ["audio", "samplerate", "label"].
-/// \param[in] dataset_dir Path to the root directory that contains the dataset.
-/// \param[in] usage Part of dataset of GTZAN, can be "training", "validation", "testing" or "all" (default = "all").
-/// \param[in] sampler Shared pointer to a sampler object used to choose samples from the dataset. If sampler is not
-///     given, a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler()).
-/// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used).
-/// \return Shared pointer to the CmuArcticDataset.
-inline std::shared_ptr<CmuArcticDataset> CmuArctic(const std::string &dataset_dir, const std::string &usage = "all",
-                                           const std::shared_ptr<Sampler> &sampler = std::make_shared<RandomSampler>(),
-                                           const std::shared_ptr<DatasetCache> &cache = nullptr) {
-  return std::make_shared<CmuArcticDataset>(StringToChar(dataset_dir), StringToChar(usage), sampler, cache);
-}
-
-
-/// \brief Function to create a CmuArcticDataset.
-/// \note The generated dataset has two columns ["audio", "samplerate", "label"].
-/// \param[in] dataset_dir Path to the root directory that contains the dataset.
-/// \param[in] usage Part of dataset of GTZAN, can be "training", "validation", "testing" or "all" (default = "all").
-/// \param[in] sampler Raw pointer to a sampler object used to choose samples from the dataset.
-/// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used).
-/// \return Shared pointer to the CmuArcticDataset.
-inline std::shared_ptr<CmuArcticDataset> CmuArctic(const std::string &dataset_dir, const std::string &usage,
-                                           const Sampler *sampler,
-                                           const std::shared_ptr<DatasetCache> &cache = nullptr) {
-  return std::make_shared<CmuArcticDataset>(StringToChar(dataset_dir), StringToChar(usage), sampler, cache);
-}
-
-
-/// \brief Function to create a CmuArcticDataset.
-/// \note The generated dataset has two columns ["audio", "samplerate", "label"].
-/// \param[in] dataset_dir Path to the root directory that contains the dataset.
-/// \param[in] usage Part of dataset of GTZAN, can be "training", "validation", "testing" or "all" (default = "all").
-/// \param[in] sampler Raw pointer to a sampler object used to choose samples from the dataset.
-/// \param[in] cache Tensor cache to use (default=nullptr which means no cache is used).
-/// \return Shared pointer to the CmuArcticDataset.
-inline std::shared_ptr<CmuArcticDataset> CmuArctic(const std::string &dataset_dir, const std::string &usage,
-                                           const std::reference_wrapper<Sampler> sampler,
-                                           const std::shared_ptr<DatasetCache> &cache = nullptr) {
-  return std::make_shared<CmuArcticDataset>(StringToChar(dataset_dir), StringToChar(usage), sampler, cache);
-}
-
-
 class RandomDataDataset : public Dataset {
  public:
   RandomDataDataset(const int32_t &total_rows, std::shared_ptr<SchemaObj> schema,
diff --git a/mindspore/ccsrc/minddata/dataset/include/dataset/samplers.h b/mindspore/ccsrc/minddata/dataset/include/dataset/samplers.h
index 8bb241fdb3a..710e1317247 100644
--- a/mindspore/ccsrc/minddata/dataset/include/dataset/samplers.h
+++ b/mindspore/ccsrc/minddata/dataset/include/dataset/samplers.h
@@ -42,7 +42,6 @@ class Sampler : std::enable_shared_from_this<Sampler> {
   friend class ManifestDataset;
   friend class MindDataDataset;
   friend class MnistDataset;
-  friend class CmuArcticDataset;
   friend class RandomDataDataset;
   friend class TextFileDataset;
   friend class TFRecordDataset;
diff --git a/mindspore/ccsrc/minddata/dataset/include/dataset/vision.h b/mindspore/ccsrc/minddata/dataset/include/dataset/vision.h
index c16b6e9e22b..b9775252570 100644
--- a/mindspore/ccsrc/minddata/dataset/include/dataset/vision.h
+++ b/mindspore/ccsrc/minddata/dataset/include/dataset/vision.h
@@ -36,26 +36,6 @@ class TensorOperation;
 
 // Transform operations for performing computer vision.
 namespace vision {
-/// \brief Apply automatic contrast on the input image.
-class AutoContrast final : public TensorTransform {
- public:
-  /// \brief Constructor.
-  /// \param[in] cutoff Percent of pixels to cut off from the histogram, the valid range of cutoff value is 0 to 50.
-  /// \param[in] ignore Pixel values to ignore.
-  explicit AutoContrast(float cutoff = 0.0, std::vector<uint32_t> ignore = {});
-
-  /// \brief Destructor.
-  ~AutoContrast() = default;
-
- protected:
-  /// \brief The function to convert a TensorTransform object into a TensorOperation object.
-  /// \return Shared pointer to TensorOperation object.
-  std::shared_ptr<TensorOperation> Parse() override;
-
- private:
-  struct Data;
-  std::shared_ptr<Data> data_;
-};
 
 /// \brief AdjustGamma TensorTransform.
 /// \notes Apply gamma correction on input image.
@@ -80,6 +60,27 @@ class AdjustGamma final : public TensorTransform {
   std::shared_ptr<Data> data_;
 };
 
+/// \brief Apply automatic contrast on the input image.
+class AutoContrast final : public TensorTransform {
+ public:
+  /// \brief Constructor.
+  /// \param[in] cutoff Percent of pixels to cut off from the histogram, the valid range of cutoff value is 0 to 50.
+  /// \param[in] ignore Pixel values to ignore.
+  explicit AutoContrast(float cutoff = 0.0, std::vector<uint32_t> ignore = {});
+
+  /// \brief Destructor.
+  ~AutoContrast() = default;
+
+ protected:
+  /// \brief The function to convert a TensorTransform object into a TensorOperation object.
+  /// \return Shared pointer to TensorOperation object.
+  std::shared_ptr<TensorOperation> Parse() override;
+
+ private:
+  struct Data;
+  std::shared_ptr<Data> data_;
+};
+
 /// \brief BoundingBoxAugment TensorTransform.
 /// \note  Apply a given image transform on a random selection of bounding box regions of a given image.
 class BoundingBoxAugment final : public TensorTransform {
diff --git a/mindspore/ccsrc/minddata/dataset/include/dataset/vision_lite.h b/mindspore/ccsrc/minddata/dataset/include/dataset/vision_lite.h
index ff8e26bc397..e47accce1bb 100644
--- a/mindspore/ccsrc/minddata/dataset/include/dataset/vision_lite.h
+++ b/mindspore/ccsrc/minddata/dataset/include/dataset/vision_lite.h
@@ -89,39 +89,6 @@ class CenterCrop final : public TensorTransform {
   std::shared_ptr<Data> data_;
 };
 
-/// \brief RGB2BGR TensorTransform.
-/// \notes Convert the format of input image from RGB to BGR.
-class RGB2BGR final : public TensorTransform {
- public:
-  /// \brief Constructor.
-  RGB2BGR() = default;
-
-  /// \brief Destructor.
-  ~RGB2BGR() = default;
-
- protected:
-  /// \brief The function to convert a TensorTransform object into a TensorOperation object.
-  /// \return Shared pointer to TensorOperation object.
-  std::shared_ptr<TensorOperation> Parse() override;
-};
-
-/// \brief RGB2GRAY TensorTransform.
-/// \note Convert RGB image or color image to grayscale image.
-/// \brief Convert a RGB image or color image to a grayscale one.
-class RGB2GRAY final : public TensorTransform {
- public:
-  /// \brief Constructor.
-  RGB2GRAY() = default;
-
-  /// \brief Destructor.
-  ~RGB2GRAY() = default;
-
- protected:
-  /// \brief The function to convert a TensorTransform object into a TensorOperation object.
-  /// \return Shared pointer to TensorOperation object.
-  std::shared_ptr<TensorOperation> Parse() override;
-};
-
 /// \brief Crop an image based on location and crop size.
 class Crop final : public TensorTransform {
  public:
@@ -308,6 +275,39 @@ class ResizePreserveAR final : public TensorTransform {
   std::shared_ptr<Data> data_;
 };
 
+/// \brief RGB2BGR TensorTransform.
+/// \notes Convert the format of input image from RGB to BGR.
+class RGB2BGR final : public TensorTransform {
+ public:
+  /// \brief Constructor.
+  RGB2BGR() = default;
+
+  /// \brief Destructor.
+  ~RGB2BGR() = default;
+
+ protected:
+  /// \brief The function to convert a TensorTransform object into a TensorOperation object.
+  /// \return Shared pointer to TensorOperation object.
+  std::shared_ptr<TensorOperation> Parse() override;
+};
+
+/// \brief RGB2GRAY TensorTransform.
+/// \note Convert RGB image or color image to grayscale image.
+/// \brief Convert a RGB image or color image to a grayscale one.
+class RGB2GRAY final : public TensorTransform {
+ public:
+  /// \brief Constructor.
+  RGB2GRAY() = default;
+
+  /// \brief Destructor.
+  ~RGB2GRAY() = default;
+
+ protected:
+  /// \brief The function to convert a TensorTransform object into a TensorOperation object.
+  /// \return Shared pointer to TensorOperation object.
+  std::shared_ptr<TensorOperation> Parse() override;
+};
+
 /// \brief Rotate the input image according to parameters.
 class Rotate final : public TensorTransform {
  public:
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/adjust_gamma_op.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/adjust_gamma_op.cc
index 3698482fe56..338d257d547 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/adjust_gamma_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/adjust_gamma_op.cc
@@ -15,21 +15,21 @@
  */
 
 #include "minddata/dataset/kernels/image/adjust_gamma_op.h"
-#include <memory>
+
 #include "minddata/dataset/kernels/data/data_utils.h"
 #include "minddata/dataset/kernels/image/image_utils.h"
 
 namespace mindspore {
 namespace dataset {
 
-const float AdjustGammaOp::kGain = 1.0;
+constexpr float AdjustGammaOp::kGain = 1.0;
 
 Status AdjustGammaOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
   IO_CHECK(input, output);
 
   // typecast
   CHECK_FAIL_RETURN_UNEXPECTED(input->type() != DataType::DE_STRING,
-                               "AdjustGamma: input tensor type should be [int, float, double], but got string.");
+                               "AdjustGamma: input tensor type should be int, float or double, but got: string.");
 
   if (input->type().IsFloat()) {
     std::shared_ptr<Tensor> input_tensor;
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/dvpp/dvpp_normalize_op.h b/mindspore/ccsrc/minddata/dataset/kernels/image/dvpp/dvpp_normalize_op.h
index 7e4dbe09bb5..8e1264bb817 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/dvpp/dvpp_normalize_op.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/dvpp/dvpp_normalize_op.h
@@ -19,6 +19,7 @@
 
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
 #include "minddata/dataset/core/device_tensor.h"
 #include "minddata/dataset/core/device_resource.h"
@@ -30,7 +31,8 @@ namespace mindspore {
 namespace dataset {
 class DvppNormalizeOp : public TensorOp {
  public:
-  explicit DvppNormalizeOp(std::vector<float> mean, std::vector<float> std) : mean_(mean), std_(std) {}
+  explicit DvppNormalizeOp(std::vector<float> mean, std::vector<float> std)
+      : mean_(std::move(mean)), std_(std::move(std)) {}
 
   ~DvppNormalizeOp() = default;
 
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/dvpp/utils/CommonDataType.h b/mindspore/ccsrc/minddata/dataset/kernels/image/dvpp/utils/CommonDataType.h
index cf898815a72..bd5026b972a 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/dvpp/utils/CommonDataType.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/dvpp/utils/CommonDataType.h
@@ -18,7 +18,7 @@
 #ifndef ENABLE_DVPP_INTERFACE
 #define ENABLE_DVPP_INTERFACE
 #endif
-#include <stdio.h>
+#include <cstdio>
 #include <iostream>
 #include <memory>
 #include <vector>
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/dvpp/utils/MDAclProcess.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/dvpp/utils/MDAclProcess.cc
index 55886fcdf80..852eb98ec36 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/dvpp/utils/MDAclProcess.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/dvpp/utils/MDAclProcess.cc
@@ -13,13 +13,14 @@
  * limitations under the License.
  */
 
+#include "minddata/dataset/kernels/image/dvpp/utils/MDAclProcess.h"
+
+#include <thread>
+#include <sys/stat.h>
+#include <sys/time.h>
 #include "minddata/dataset/include/dataset/constants.h"
 #include "minddata/dataset/core/tensor_shape.h"
 #include "minddata/dataset/kernels/image/image_utils.h"
-#include "MDAclProcess.h"
-#include <sys/time.h>
-#include <thread>
-#include <sys/stat.h>
 
 namespace {
 const int BUFFER_SIZE = 2048;
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/dvpp/utils/MDAclProcess.h b/mindspore/ccsrc/minddata/dataset/kernels/image/dvpp/utils/MDAclProcess.h
index 41b790ef938..cd162823f7b 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/dvpp/utils/MDAclProcess.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/dvpp/utils/MDAclProcess.h
@@ -17,25 +17,25 @@
 #define MDACLMANAGER_H
 
 #include <climits>
-#include <string>
-#include <string.h>
+#include <cstdio>
 #include <map>
 #include <iostream>
 #include <memory>
+#include <unistd.h>
+#include <string>
+#include <sys/stat.h>
+#include <sys/types.h>
 #include "acl/acl.h"
-#include "CommonDataType.h"
+
 #include "minddata/dataset/core/tensor_shape.h"
 #include "minddata/dataset/core/data_type.h"
+#include "minddata/dataset/kernels/image/dvpp/utils/CommonDataType.h"
+#include "minddata/dataset/kernels/image/dvpp/utils/DvppCommon.h"
+#include "minddata/dataset/kernels/image/dvpp/utils/ErrorCode.h"
 #include "mindspore/ccsrc/minddata/dataset/core/device_tensor.h"
 #include "mindspore/ccsrc/minddata/dataset/core/tensor.h"
 #include "mindspore/core/utils/log_adapter.h"
 #include "mindspore/ccsrc/minddata/dataset/util/status.h"
-#include "ErrorCode.h"
-#include "DvppCommon.h"
-#include <stdio.h>
-#include <unistd.h>
-#include <sys/stat.h>
-#include <sys/types.h>
 
 mode_t SetFileDefaultUmask();
 
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/dvpp/utils/ResourceManager.h b/mindspore/ccsrc/minddata/dataset/kernels/image/dvpp/utils/ResourceManager.h
index ff5f29099f2..daed1f9faed 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/dvpp/utils/ResourceManager.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/dvpp/utils/ResourceManager.h
@@ -16,17 +16,18 @@
 #ifndef RESOURCEMANAGER_H
 #define RESOURCEMANAGER_H
 
-#include <vector>
-#include <set>
+#include <climits>
 #include <cstring>
 #include <climits>
-#include <unordered_map>
 #include <mutex>
-#include "CommonDataType.h"
-#include "ErrorCode.h"
+#include <set>
 #include <sys/stat.h>
+#include <unordered_map>
+#include <vector>
 #include "mindspore/core/utils/log_adapter.h"
 #include "mindspore/ccsrc/cxx_api/graph/acl/acl_env_guard.h"
+#include "minddata/dataset/kernels/image/dvpp/utils/CommonDataType.h"
+#include "minddata/dataset/kernels/image/dvpp/utils/ErrorCode.h"
 
 enum ModelLoadMethod {
   LOAD_FROM_FILE = 0,       // Loading from file, memory of model and weights are managed by ACL
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/image_utils.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/image_utils.cc
index 1c3b7e35b1a..77d4931d8a3 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/image_utils.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/image_utils.cc
@@ -877,7 +877,7 @@ Status AdjustGamma(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor>
   try {
     int num_channels = 1;
     if (input->Rank() < 2) {
-      RETURN_STATUS_UNEXPECTED("AdjustGamma: image shape is not <...,H,W,C> or <H,W>.");
+      RETURN_STATUS_UNEXPECTED("AdjustGamma: input tensor is not in shape of <...,H,W,C> or <H,W>.");
     }
     if (input->Rank() > 2) {
       num_channels = input->shape()[-1];
@@ -1255,14 +1255,53 @@ Status RgbaToBgr(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *
 
 Status RgbToBgr(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
   try {
-    std::shared_ptr<CVTensor> input_cv = CVTensor::AsCVTensor(std::move(input));
-    if (input_cv->Rank() != 3 || input_cv->shape()[2] != 3) {
-      RETURN_STATUS_UNEXPECTED("RgbToBgr: image shape is not <H,W,C> or channel is not 3.");
+    auto input_type = input->type();
+    std::shared_ptr<CVTensor> input_cv = CVTensor::AsCVTensor(input);
+    if (!input_cv->mat().data) {
+      RETURN_STATUS_UNEXPECTED("RgbToBgr: load image failed.");
     }
-    TensorShape out_shape = TensorShape({input_cv->shape()[0], input_cv->shape()[1], 3});
+    if (input_cv->Rank() != 3 || input_cv->shape()[2] != 3) {
+      RETURN_STATUS_UNEXPECTED("RgbToBgr: input tensor is not in shape of <H,W,C> or channel is not 3.");
+    }
+
+    cv::Mat image = input_cv->mat().clone();
+    if (input_type == DataType::DE_FLOAT16 || input_type == DataType::DE_INT16 || input_type == DataType::DE_UINT16) {
+      for (int i = 0; i < input_cv->mat().rows; ++i) {
+        cv::Vec3s *p1 = input_cv->mat().ptr<cv::Vec3s>(i);
+        cv::Vec3s *p2 = image.ptr<cv::Vec3s>(i);
+        for (int j = 0; j < input_cv->mat().cols; ++j) {
+          p2[j][2] = p1[j][0];
+          p2[j][1] = p1[j][1];
+          p2[j][0] = p1[j][2];
+        }
+      }
+    } else if (input_type == DataType::DE_FLOAT32 || input_type == DataType::DE_INT32) {
+      for (int i = 0; i < input_cv->mat().rows; ++i) {
+        cv::Vec3f *p1 = input_cv->mat().ptr<cv::Vec3f>(i);
+        cv::Vec3f *p2 = image.ptr<cv::Vec3f>(i);
+        for (int j = 0; j < input_cv->mat().cols; ++j) {
+          p2[j][2] = p1[j][0];
+          p2[j][1] = p1[j][1];
+          p2[j][0] = p1[j][2];
+        }
+      }
+    } else if (input_type == DataType::DE_FLOAT64) {
+      for (int i = 0; i < input_cv->mat().rows; ++i) {
+        cv::Vec3d *p1 = input_cv->mat().ptr<cv::Vec3d>(i);
+        cv::Vec3d *p2 = image.ptr<cv::Vec3d>(i);
+        for (int j = 0; j < input_cv->mat().cols; ++j) {
+          p2[j][2] = p1[j][0];
+          p2[j][1] = p1[j][1];
+          p2[j][0] = p1[j][2];
+        }
+      }
+    } else {
+      cv::cvtColor(input_cv->mat(), image, cv::COLOR_RGB2BGR);
+    }
+
     std::shared_ptr<CVTensor> output_cv;
-    RETURN_IF_NOT_OK(CVTensor::CreateEmpty(out_shape, input_cv->type(), &output_cv));
-    cv::cvtColor(input_cv->mat(), output_cv->mat(), static_cast<int>(cv::COLOR_RGB2BGR));
+    RETURN_IF_NOT_OK(CVTensor::CreateFromMat(image, input_cv->Rank(), &output_cv));
+
     *output = std::static_pointer_cast<Tensor>(output_cv);
     return Status::OK();
   } catch (const cv::Exception &e) {
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/canny.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/canny.cc
index 96e4c89e1a4..0bde0e63216 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/canny.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/canny.cc
@@ -48,7 +48,7 @@ static void GetSobelKernel(float *kernel, int flag, int ksize, double scale) {
       buffer[0] = 1, buffer[1] = -2, buffer[2] = 1;
     }
   } else {
-    int old, now;
+    float old, now;
     buffer[0] = 1;
     for (int i = 0; i < ksize; i++) {
       buffer[i + 1] = 0;
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/image_process.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/image_process.cc
index 3e1c6f6fe49..04549c9638e 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/image_process.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/image_process.cc
@@ -571,9 +571,8 @@ bool ConvertTo(const LiteMat &src, LiteMat &dst, double scale) {
 
   if (dst.IsEmpty()) {
     dst.Init(src.width_, src.height_, src.channel_, LDataType::FLOAT32);
-  } else if (src.width_ != dst.width_ || src.height_ != dst.height_ || src.channel_ != dst.channel_) {
-    return false;
-  } else if (dst.data_type_ != LDataType::FLOAT32) {
+  } else if (src.width_ != dst.width_ || src.height_ != dst.height_ || src.channel_ != dst.channel_ ||
+             dst.data_type_ != LDataType::FLOAT32) {
     return false;
   }
 
@@ -662,24 +661,16 @@ bool Crop(const LiteMat &src, LiteMat &dst, int x, int y, int w, int h) {
 }
 
 static bool CheckZero(const std::vector<float> &vs) {
-  for (int i = 0; i < vs.size(); i++) {
-    if (Equal(vs[i], 0.0f)) {
-      return true;
-    }
-  }
-  return false;
+  return std::any_of(vs.begin(), vs.end(), [](const float &v) { return Equal(v, 0.0f); });
 }
 
 static bool CheckZero(const std::vector<size_t> &vs) {
-  for (int i = 0; i < vs.size(); i++) {
-    if (vs[i] == 0) return true;
-  }
-  return false;
+  return std::any_of(vs.begin(), vs.end(), [](const float &v) { return v == 0; });
 }
 
 static bool CheckMeanAndStd(const LiteMat &src, LiteMat &dst, int channel, const std::vector<float> &mean,
                             const std::vector<float> &std) {
-  if (mean.size() == 0 && std.size() == 0) {
+  if (mean.empty() && std.empty()) {
     return false;
   }
   if (src.data_type_ != LDataType::FLOAT32) {
@@ -935,8 +926,8 @@ bool Merge(const std::vector<LiteMat> &mv, LiteMat &dst) {
   LDataType data_type = mv[0].data_type_;
 
   // The arrays in list must be single-channel
-  for (int i = 0; i < mv.size(); i++) {
-    if (mv[i].channel_ != 1) return false;
+  if (std::any_of(mv.begin(), mv.end(), [](const LiteMat &m) { return m.channel_ != 1; })) {
+    return false;
   }
 
   for (int i = 1; i < mv.size(); i++) {
@@ -962,16 +953,23 @@ bool Merge(const std::vector<LiteMat> &mv, LiteMat &dst) {
 
 bool Pad(const LiteMat &src, LiteMat &dst, int top, int bottom, int left, int right, PaddBorderType pad_type,
          uint8_t fill_b_or_gray, uint8_t fill_g, uint8_t fill_r) {
+  RETURN_FALSE_IF_LITEMAT_EMPTY(src);
   if (top < 0 || bottom < 0 || left < 0 || right < 0) {
     return false;
   }
-  if (src.IsEmpty()) {
+  if (src.width_ > std::numeric_limits<int>::max() - left ||
+      src.width_ + left > std::numeric_limits<int>::max() - right) {
+    return false;
+  }
+  if (src.height_ > std::numeric_limits<int>::max() - top ||
+      src.height_ + top > std::numeric_limits<int>::max() - bottom) {
     return false;
   }
   int dst_width = src.width_ + left + right;
   int dst_height = src.height_ + top + bottom;
   if (dst.IsEmpty()) {
     dst.Init(dst_width, dst_height, src.channel_, src.data_type_);
+    RETURN_FALSE_IF_LITEMAT_EMPTY(dst);
   } else if (dst.width_ != dst_width || dst.height_ != dst_height || src.channel_ != dst.channel_) {
     return false;
   } else if (src.data_type_ != dst.data_type_) {
@@ -991,7 +989,7 @@ bool Pad(const LiteMat &src, LiteMat &dst, int top, int bottom, int left, int ri
   return true;
 }
 
-std::vector<std::vector<float>> GetDefaultBoxes(BoxesConfig config) {
+std::vector<std::vector<float>> GetDefaultBoxes(const BoxesConfig config) {
   size_t size = config.num_default.size();
   if (size <= 1 || config.feature_size.size() != size || config.steps.size() != size ||
       config.aspect_rations.size() != size) {
@@ -1109,6 +1107,7 @@ std::vector<int> ApplyNms(const std::vector<std::vector<float>> &all_boxes, std:
       }
     }
     std::vector<int> new_order;
+    new_order.reserve(inds.size());
     for (int k = 0; k < inds.size(); k++) {
       new_order.push_back(order[inds[k]]);
     }
@@ -1544,8 +1543,9 @@ bool GetAffineTransformImpl(LiteMat &src, LiteMat &dst) {
     }
 
     if (std::abs(src.ptr<double>(k)[i]) < DBL_EPSILON * 100) {
-      double x[6] = {0};
-      dst.Init(1, 6, x, LDataType(LDataType::DOUBLE));
+      dst.Init(1, 6, LDataType(LDataType::DOUBLE));
+      (void)memset(dst.data_ptr_, 0, 6 * sizeof(double));
+      RETURN_FALSE_IF_LITEMAT_EMPTY(dst);
       return false;
     }
     if (k != i) {
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/lite_mat.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/lite_mat.cc
index d555a248c29..5d17bc4f51b 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/lite_mat.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/lite_mat.cc
@@ -283,9 +283,7 @@ void LiteMat::Release() {
     if (data_ptr_) {
       AlignFree(data_ptr_);
     }
-    if (ref_count_) {
-      delete[] ref_count_;
-    }
+    delete[] ref_count_;
   }
   data_ptr_ = nullptr;
   elem_size_ = 0;
@@ -293,7 +291,7 @@ void LiteMat::Release() {
   height_ = 0;
   channel_ = 0;
   c_step_ = 0;
-  ref_count_ = 0;
+  ref_count_ = nullptr;
   size_ = 0;
   setSteps(0, 0, 0);
 }
@@ -305,6 +303,7 @@ void *LiteMat::AlignMalloc(unsigned int size) {
   }
   void *p_raw = reinterpret_cast<void *>(malloc(size + length));
   if (p_raw) {
+    release_flag = true;
     void **p_algin = reinterpret_cast<void **>(((size_t)(p_raw) + length) & ~(ALIGN - 1));
     p_algin[-1] = p_raw;
     return p_algin;
@@ -313,8 +312,11 @@ void *LiteMat::AlignMalloc(unsigned int size) {
 }
 
 void LiteMat::AlignFree(void *ptr) {
-  (void)free(reinterpret_cast<void **>(ptr)[-1]);
-  ptr = nullptr;
+  if (release_flag) {
+    (void)free(reinterpret_cast<void **>(ptr)[-1]);
+    ptr = nullptr;
+    release_flag = false;
+  }
 }
 
 inline void LiteMat::InitElemSize(LDataType data_type) { elem_size_ = data_type.SizeInBytes(); }
@@ -414,7 +416,7 @@ inline void SubtractImpl(const uint32_t *src0, const uint32_t *src1, uint32_t *d
 }
 
 inline bool CheckSubstract(const LiteMat &src_a, const LiteMat &src_b, LiteMat *dst) {
-  if (dst == NULL) {
+  if (dst == nullptr) {
     return false;
   }
 
@@ -422,10 +424,7 @@ inline bool CheckSubstract(const LiteMat &src_a, const LiteMat &src_b, LiteMat *
     return false;
   }
 
-  if (src_a.data_type_ != src_b.data_type_) {
-    return false;
-  }
-  return true;
+  return src_a.data_type_ == src_b.data_type_;
 }
 
 bool Subtract(const LiteMat &src_a, const LiteMat &src_b, LiteMat *dst) {
@@ -581,7 +580,7 @@ inline void DivideImpl(const uint32_t *src0, const uint32_t *src1, uint32_t *dst
 }
 
 inline bool CheckDivide(const LiteMat &src_a, const LiteMat &src_b, LiteMat *dst) {
-  if (dst == NULL) {
+  if (dst == nullptr) {
     return false;
   }
 
@@ -589,10 +588,7 @@ inline bool CheckDivide(const LiteMat &src_a, const LiteMat &src_b, LiteMat *dst
     return false;
   }
 
-  if (src_a.data_type_ != src_b.data_type_) {
-    return false;
-  }
-  return true;
+  return src_a.data_type_ == src_b.data_type_;
 }
 
 bool Divide(const LiteMat &src_a, const LiteMat &src_b, LiteMat *dst) {
@@ -689,7 +685,7 @@ inline void MultiplyImpl(const uint32_t *src0, const uint32_t *src1, uint32_t *d
 }
 
 inline bool CheckMultiply(const LiteMat &src_a, const LiteMat &src_b, LiteMat *dst) {
-  if (dst == NULL) {
+  if (dst == nullptr) {
     return false;
   }
 
@@ -697,10 +693,7 @@ inline bool CheckMultiply(const LiteMat &src_a, const LiteMat &src_b, LiteMat *d
     return false;
   }
 
-  if (src_a.data_type_ != src_b.data_type_) {
-    return false;
-  }
-  return true;
+  return src_a.data_type_ == src_b.data_type_;
 }
 
 bool Multiply(const LiteMat &src_a, const LiteMat &src_b, LiteMat *dst) {
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/lite_mat.h b/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/lite_mat.h
index 6acead3ed05..db43b464399 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/lite_mat.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/lite_mat.h
@@ -166,15 +166,9 @@ class LDataType {
   ~LDataType() = default;
 
   inline Type Value() const { return type_; }
-  inline bool operator==(const LDataType &ps) const {
-    if (this->type_ == ps.type_) return true;
-    return false;
-  }
+  inline bool operator==(const LDataType &ps) const { return this->type_ == ps.type_; }
 
-  inline bool operator!=(const LDataType &ps) const {
-    if (this->type_ != ps.type_) return true;
-    return false;
-  }
+  inline bool operator!=(const LDataType &ps) const { return this->type_ != ps.type_; }
 
   uint8_t SizeInBytes() const {
     if (type_ < LDataType::NUM_OF_TYPES)
@@ -304,6 +298,7 @@ class LiteMat {
   LDataType data_type_;
   int *ref_count_;
   size_t steps_[MAX_DIMS];
+  bool release_flag;
 };
 
 /// \brief Calculates the difference between the two images for each element
@@ -315,6 +310,20 @@ bool Divide(const LiteMat &src_a, const LiteMat &src_b, LiteMat *dst);
 /// \brief Calculates the multiply between the two images for each element
 bool Multiply(const LiteMat &src_a, const LiteMat &src_b, LiteMat *dst);
 
+#define RETURN_FALSE_IF_LITEMAT_EMPTY(_m) \
+  do {                                    \
+    if ((_m).IsEmpty()) {                 \
+      return false;                       \
+    }                                     \
+  } while (false)
+
+#define RETURN_IF_LITEMAT_EMPTY(_m) \
+  do {                              \
+    if ((_m).IsEmpty()) {           \
+      return;                       \
+    }                               \
+  } while (false)
+
 }  // namespace dataset
 }  // namespace mindspore
 #endif  // MINI_MAT_H_
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/warp_affine.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/warp_affine.cc
index 2ec3fb0fed3..f8729a99fd5 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/warp_affine.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv/warp_affine.cc
@@ -381,11 +381,9 @@ bool WarpAffineBilinear(const LiteMat &src, LiteMat &dst, const LiteMat &M, int
   }
   if (dst.IsEmpty()) {
     (void)dst.Init(dst_w, dst_h, src.channel_, LDataType::UINT8);
-  } else if (dst.height_ != dst_h || dst.width_ != dst_w || dst.channel_ != src.channel_) {
+  } else if (dst.height_ != dst_h || dst.width_ != dst_w || dst.channel_ != src.channel_ ||
+             dst.data_type_ != LDataType::UINT8) {
     return false;
-  } else if (dst.data_type_ != LDataType::UINT8) {
-    return false;
-  } else {
   }
 
   double IM[6];
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/lite_image_utils.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/lite_image_utils.cc
index 7fa5853db78..2c94e1447b8 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/lite_image_utils.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/lite_image_utils.cc
@@ -182,6 +182,8 @@ Status JpegCropAndDecode(const std::shared_ptr<Tensor> &input, std::shared_ptr<T
   } catch (std::runtime_error &e) {
     return DestroyDecompressAndReturnError(e.what());
   }
+  CHECK_FAIL_RETURN_UNEXPECTED((std::numeric_limits<int32_t>::max() - crop_w) > crop_x, "invalid crop width");
+  CHECK_FAIL_RETURN_UNEXPECTED((std::numeric_limits<int32_t>::max() - crop_h) > crop_y, "invalid crop height");
   if (crop_x == 0 && crop_y == 0 && crop_w == 0 && crop_h == 0) {
     crop_w = cinfo.output_width;
     crop_h = cinfo.output_height;
@@ -190,6 +192,7 @@ Status JpegCropAndDecode(const std::shared_ptr<Tensor> &input, std::shared_ptr<T
     return DestroyDecompressAndReturnError("Decode: invalid crop size");
   }
   const int mcu_size = cinfo.min_DCT_scaled_size;
+  CHECK_FAIL_RETURN_UNEXPECTED(mcu_size != 0, "Invalid data.");
   unsigned int crop_x_aligned = (crop_x / mcu_size) * mcu_size;
   unsigned int crop_w_aligned = crop_w + crop_x - crop_x_aligned;
   try {
@@ -206,8 +209,13 @@ Status JpegCropAndDecode(const std::shared_ptr<Tensor> &input, std::shared_ptr<T
   RETURN_IF_NOT_OK(Tensor::CreateEmpty(ts, DataType(DataType::DE_UINT8), &output_tensor));
   const int buffer_size = output_tensor->SizeInBytes();
   JSAMPLE *buffer = reinterpret_cast<JSAMPLE *>(&(*output_tensor->begin<uint8_t>()));
+  // stride refers to output tensor, which has 3 components at most
+  CHECK_FAIL_RETURN_UNEXPECTED((std::numeric_limits<int32_t>::max() - skipped_scanlines) > crop_h,
+                               "Invalid crop height.");
   const int max_scanlines_to_read = skipped_scanlines + crop_h;
   // stride refers to output tensor, which has 3 components at most
+  CHECK_FAIL_RETURN_UNEXPECTED((std::numeric_limits<int32_t>::max() / crop_w) > kOutNumComponents,
+                               "Invalid crop width.");
   const int stride = crop_w * kOutNumComponents;
   // offset is calculated for scanlines read from the image, therefore
   // has the same number of components as the image
@@ -246,6 +254,8 @@ Status Crop(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *outpu
     RETURN_STATUS_UNEXPECTED("Crop: image datatype is not float32 or uint8");
   }
 
+  CHECK_FAIL_RETURN_UNEXPECTED((std::numeric_limits<int32_t>::max() - y) > h, "Invalid crop height.");
+  CHECK_FAIL_RETURN_UNEXPECTED((std::numeric_limits<int32_t>::max() - x) > w, "Invalid crop width.");
   // account for integer overflow
   if (y < 0 || (y + h) > input->shape()[0] || (y + h) < 0) {
     RETURN_STATUS_UNEXPECTED(
@@ -410,7 +420,10 @@ Status Resize(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *out
 Status ResizePreserve(const TensorRow &inputs, int32_t height, int32_t width, int32_t img_orientation,
                       TensorRow *outputs) {
   outputs->resize(3);
+  CHECK_FAIL_RETURN_UNEXPECTED(inputs.size() > 0,
+                               "Invalid input, should greater than 0, but got " + std::to_string(inputs.size()));
   std::shared_ptr<Tensor> input = inputs[0];
+  CHECK_FAIL_RETURN_UNEXPECTED(input->shape().Size() >= 3, "Invalid input shape, should be greater than 3 dimensions.");
   LiteMat lite_mat_src(input->shape()[1], input->shape()[0], input->shape()[2],
                        const_cast<void *>(reinterpret_cast<const void *>(input->GetBuffer())),
                        GetLiteCVDataType(input->type()));
@@ -537,7 +550,15 @@ Status Pad(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output
 
     std::shared_ptr<Tensor> output_tensor;
 
+    CHECK_FAIL_RETURN_UNEXPECTED((std::numeric_limits<int32_t>::max() - lite_mat_rgb.width_) > pad_left,
+                                 "Invalid pad width.");
+    CHECK_FAIL_RETURN_UNEXPECTED((std::numeric_limits<int32_t>::max() - lite_mat_rgb.width_ + pad_left) > pad_right,
+                                 "Invalid pad width.");
     int pad_width = lite_mat_rgb.width_ + pad_left + pad_right;
+    CHECK_FAIL_RETURN_UNEXPECTED((std::numeric_limits<int32_t>::max() - lite_mat_rgb.height_) > pad_top,
+                                 "Invalid pad height.");
+    CHECK_FAIL_RETURN_UNEXPECTED((std::numeric_limits<int32_t>::max() - lite_mat_rgb.height_ + pad_top) > pad_bottom,
+                                 "Invalid pad height.");
     int pad_height = lite_mat_rgb.height_ + pad_top + pad_bottom;
     TensorShape new_shape = TensorShape({pad_height, pad_width, input->shape()[2]});
     RETURN_IF_NOT_OK(Tensor::CreateEmpty(new_shape, input->type(), &output_tensor));
@@ -721,11 +742,13 @@ Status Affine(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *out
     }
     int height = 0;
     int width = 0;
+    CHECK_FAIL_RETURN_UNEXPECTED(mat.size() <= 6, "Invalid mat shape.");
     double M[6] = {};
     for (int i = 0; i < mat.size(); i++) {
       M[i] = static_cast<double>(mat[i]);
     }
 
+    CHECK_FAIL_RETURN_UNEXPECTED(input->shape().Size() >= 3, "Invalid input shape, should be 3.");
     LiteMat lite_mat_rgb(input->shape()[1], input->shape()[0], input->shape()[2],
                          const_cast<void *>(reinterpret_cast<const void *>(input->GetBuffer())),
                          GetLiteCVDataType(input->type()));
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/resize_preserve_ar_op.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/resize_preserve_ar_op.cc
index 8e09463a35a..2cd13e1ab62 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/resize_preserve_ar_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/resize_preserve_ar_op.cc
@@ -22,7 +22,7 @@
 
 namespace mindspore {
 namespace dataset {
-const int32_t ResizePreserveAROp::kDefImgorientation = 0;
+const int32_t ResizePreserveAROp::kDefImgOrientation = 0;
 
 ResizePreserveAROp::ResizePreserveAROp(int32_t height, int32_t width, int32_t img_orientation)
     : height_(height), width_(width), img_orientation_(img_orientation) {}
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/resize_preserve_ar_op.h b/mindspore/ccsrc/minddata/dataset/kernels/image/resize_preserve_ar_op.h
index d473c80c351..67ca8dbc2b1 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/resize_preserve_ar_op.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/resize_preserve_ar_op.h
@@ -34,9 +34,9 @@ namespace dataset {
 class ResizePreserveAROp : public TensorOp {
  public:
   // Default values, also used by python_bindings.cc
-  static const int32_t kDefImgorientation;
+  static const int32_t kDefImgOrientation;
 
-  ResizePreserveAROp(int32_t height, int32_t width, int32_t img_orientation = kDefImgorientation);
+  ResizePreserveAROp(int32_t height, int32_t width, int32_t img_orientation = kDefImgOrientation);
 
   ~ResizePreserveAROp() override = default;
 
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/resize_with_bbox_op.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/resize_with_bbox_op.cc
index b2ea0aeb14c..0d5fe7ecc98 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/resize_with_bbox_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/resize_with_bbox_op.cc
@@ -35,9 +35,9 @@ Status ResizeWithBBoxOp::Compute(const TensorRow &input, TensorRow *output) {
   int32_t input_w = input[0]->shape()[1];
 
   output->resize(2);
-  (*output)[1] = std::move(input[1]);  // move boxes over to output
+  (*output)[1] = input[1];  // move boxes over to output
 
-  std::shared_ptr<CVTensor> input_cv = CVTensor::AsCVTensor(std::move(input[0]));
+  std::shared_ptr<CVTensor> input_cv = CVTensor::AsCVTensor(input[0]);
 
   RETURN_IF_NOT_OK(ResizeOp::Compute(std::static_pointer_cast<Tensor>(input_cv), &(*output)[0]));
 
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/rgb_to_bgr_op.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/rgb_to_bgr_op.cc
index f5b2b021815..a0b8ffb40d9 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/rgb_to_bgr_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/rgb_to_bgr_op.cc
@@ -25,6 +25,11 @@ namespace dataset {
 
 Status RgbToBgrOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
   IO_CHECK(input, output);
+  auto input_type = input->type();
+  CHECK_FAIL_RETURN_UNEXPECTED(input_type != DataType::DE_UINT32 && input_type != DataType::DE_UINT64 &&
+                                 input_type != DataType::DE_INT64 && input_type != DataType::DE_STRING,
+                               "RgbToBgr: unsupported data type as [uint32, int64, uint64, string].");
+
   return RgbToBgr(input, output);
 }
 
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/rgb_to_bgr_op.h b/mindspore/ccsrc/minddata/dataset/kernels/image/rgb_to_bgr_op.h
index 031bd1982e2..b80940cab00 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/rgb_to_bgr_op.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/rgb_to_bgr_op.h
@@ -17,8 +17,8 @@
 #define MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_RGB_TO_BGR_OP_H_
 
 #include <memory>
-#include <vector>
 #include <string>
+#include <vector>
 
 #include "minddata/dataset/core/tensor.h"
 #include "minddata/dataset/kernels/tensor_op.h"
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/rgba_to_bgr_op.h b/mindspore/ccsrc/minddata/dataset/kernels/image/rgba_to_bgr_op.h
index 0502de73a78..77f215062d3 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/rgba_to_bgr_op.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/rgba_to_bgr_op.h
@@ -29,7 +29,7 @@ namespace mindspore {
 namespace dataset {
 class RgbaToBgrOp : public TensorOp {
  public:
-  RgbaToBgrOp() {}
+  RgbaToBgrOp() = default;
 
   ~RgbaToBgrOp() override = default;
 
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/rgba_to_rgb_op.h b/mindspore/ccsrc/minddata/dataset/kernels/image/rgba_to_rgb_op.h
index 602dd4abd3f..deed2513e6f 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/rgba_to_rgb_op.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/rgba_to_rgb_op.h
@@ -29,7 +29,7 @@ namespace mindspore {
 namespace dataset {
 class RgbaToRgbOp : public TensorOp {
  public:
-  RgbaToRgbOp() {}
+  RgbaToRgbOp() = default;
 
   ~RgbaToRgbOp() override = default;
 
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/sharpness_op.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/sharpness_op.cc
index b24359089ac..c7609601c66 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/sharpness_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/sharpness_op.cc
@@ -42,9 +42,10 @@ Status SharpnessOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_pt
     ///                              1, 5, 1,
     ///                              1, 1, 1
 
-    float filterSum = 13.0;
+    const float filterMid = 5.0;
+    const float filterSum = 13.0;
     cv::Mat filter = cv::Mat(3, 3, CV_32F, cv::Scalar::all(1.0 / filterSum));
-    filter.at<float>(1, 1) = 5.0 / filterSum;
+    filter.at<float>(1, 1) = filterMid / filterSum;
 
     /// applying filter on channels
     cv::Mat result = cv::Mat();
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/soft_dvpp_decode_random_crop_resize_jpeg_op.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/soft_dvpp_decode_random_crop_resize_jpeg_op.cc
index 237dc590dcc..43ca7a43a5c 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/soft_dvpp_decode_random_crop_resize_jpeg_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/soft_dvpp_decode_random_crop_resize_jpeg_op.cc
@@ -57,7 +57,7 @@ Status SoftDvppDecodeRandomCropResizeJpegOp::Compute(const std::shared_ptr<Tenso
   SoftDpCropInfo crop_info;
   RETURN_IF_NOT_OK(GetCropInfo(input, &crop_info));
   try {
-    unsigned char *buffer = const_cast<unsigned char *>(input->GetBuffer());
+    auto buffer = const_cast<unsigned char *>(input->GetBuffer());
     CHECK_FAIL_RETURN_UNEXPECTED(buffer != nullptr,
                                  "SoftDvppDecodeRandomCropResizeJpeg: the input image buffer is empty.");
     SoftDpProcsessInfo info;
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/soft_dvpp_decode_random_crop_resize_jpeg_op.h b/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/soft_dvpp_decode_random_crop_resize_jpeg_op.h
index 1c13433d08d..2672b32ec42 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/soft_dvpp_decode_random_crop_resize_jpeg_op.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/soft_dvpp_decode_random_crop_resize_jpeg_op.h
@@ -21,9 +21,9 @@
 #include <random>
 #include <string>
 
-#include "./utils/external_soft_dp.h"
 #include "minddata/dataset/core/tensor.h"
 #include "minddata/dataset/kernels/image/random_crop_and_resize_op.h"
+#include "minddata/dataset/kernels/image/soft_dvpp/utils/external_soft_dp.h"
 #include "minddata/dataset/util/status.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/soft_dvpp_decode_resize_jpeg_op.h b/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/soft_dvpp_decode_resize_jpeg_op.h
index 21bb54c2225..9bc3381d6a2 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/soft_dvpp_decode_resize_jpeg_op.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/soft_dvpp_decode_resize_jpeg_op.h
@@ -32,7 +32,7 @@ class SoftDvppDecodeResizeJpegOp : public TensorOp {
       : target_height_(target_height), target_width_(target_width) {}
 
   /// \brief Destructor
-  ~SoftDvppDecodeResizeJpegOp() = default;
+  ~SoftDvppDecodeResizeJpegOp() override = default;
 
   Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
   Status OutputShape(const std::vector<TensorShape> &inputs, std::vector<TensorShape> &outputs) override;
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/external_soft_dp.h b/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/external_soft_dp.h
index b703eb35cc6..d7336f0fc32 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/external_soft_dp.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/external_soft_dp.h
@@ -17,7 +17,7 @@
 #ifndef EXTERNAL_SOFTDP_H
 #define EXTERNAL_SOFTDP_H
 
-#include <stdint.h>
+#include <cstdint>
 
 struct SoftDpProcsessInfo {
   uint8_t *input_buffer;       // input buffer
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_dp.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_dp.cc
index 3f90b4cf028..793e4164d0d 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_dp.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_dp.cc
@@ -44,11 +44,10 @@ uint32_t DecodeAndResizeJpeg(SoftDpProcsessInfo *soft_dp_process_info) {
   }
 
   // use vpc interface to resize and convert RGB, give user output buf and output size.
-  SoftDpCropInfo crop;
-  crop.left = 0;
-  crop.right = vpc_input_info.real_width - 1;
-  crop.up = 0;
-  crop.down = vpc_input_info.real_height - 1;
+  auto crop = SoftDpCropInfo{.left = 0,
+                             .right = static_cast<uint32_t>(vpc_input_info.real_width - 1),
+                             .up = 0,
+                             .down = static_cast<uint32_t>(vpc_input_info.real_height - 1)};
 
   VpcInfo output;
   output.addr = soft_dp_process_info->output_buffer;
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_dp.h b/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_dp.h
index 5cfb87cf767..a706c129bf5 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_dp.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_dp.h
@@ -17,8 +17,8 @@
 #ifndef SOFT_DP_H
 #define SOFT_DP_H
 
-#include <stdint.h>
-#include "./external_soft_dp.h"
+#include <cstdint>
+#include "minddata/dataset/kernels/image/soft_dvpp/utils/external_soft_dp.h"
 
 enum JpegdToVpcFormat {
   INPUT_VPC_UNKNOWN = -1,
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_dp_log.h b/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_dp_log.h
index b40d9f5e54d..95a023d0de6 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_dp_log.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_dp_log.h
@@ -25,11 +25,10 @@
 #define DP_EVENT 0x10000
 #define DP_DEBUG_LEVEL (DP_EVENT | DP_ERR | DP_WARNING | DP_INFO | DP_DEBUG)
 
-#include <vector>
-#include <string>
-
 #if defined(DVPP_UTST) || defined(DEBUG)
 #include <stdio.h>
+#include <string>
+#include <vector>
 
 #define DP_LOG(model, level, format, ...)                              \
   do {                                                                 \
@@ -67,6 +66,8 @@
 
 #include <securec.h>
 #include <cstdio>
+#include <vector>
+#include <string>
 #include "glog/logging.h"
 
 template <typename... Args>
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_dp_tools.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_dp_tools.cc
index 7afd61868cb..dfae51e53e6 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_dp_tools.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_dp_tools.cc
@@ -48,9 +48,5 @@ bool IsDirectory(const std::string &path) {
     return false;
   }
 
-  if (S_ISDIR(buf.st_mode)) {
-    return true;
-  } else {
-    return false;
-  }
+  return S_ISDIR(buf.st_mode);
 }
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_dp_tools.h b/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_dp_tools.h
index 549ad4a6ff8..14cc673a9fb 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_dp_tools.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_dp_tools.h
@@ -40,11 +40,7 @@ T1 AlignDown(T1 num, T2 align) {
 
 template <typename T>
 bool IsInTheScope(T num, T left_point, T right_point) {
-  if (num >= left_point && num <= right_point) {
-    return true;
-  }
-
-  return false;
+  return num >= left_point && num <= right_point;
 }
 
 template <typename T>
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_vpc.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_vpc.cc
index 1a67a30e087..d40edbda7e7 100755
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_vpc.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_vpc.cc
@@ -109,19 +109,19 @@ int32_t SoftVpc::CheckParamter() {
 
   uint32_t out_width = out_width_;
   uint32_t out_height = out_height_;
-  bool flag = (out_width * 32 >= crop_width) ? true : false;  // A maximum of 32x zoom-out
+  bool flag = (out_width * 32 >= crop_width);  // A maximum of 32x zoom-out
   VPC_CHECK_COND_FAIL_PRINT_RETURN(flag, dpFail,
                                    "Max reduction multiple is 32. Please check left(%u), right(%u), out_width(%u).",
                                    left_, right_, out_width);  // Up to 16x magnification
-  flag = (crop_width * 16 >= out_width) ? true : false;
+  flag = (crop_width * 16 >= out_width);
   VPC_CHECK_COND_FAIL_PRINT_RETURN(flag, dpFail,
                                    "Max magnification is 16. Please check left(%u), right(%u), out_width(%u).", left_,
                                    right_, out_width);
-  flag = (out_height * 32 >= crop_height) ? true : false;  // A maximum of 32x zoom-out
+  flag = (out_height * 32 >= crop_height);  // A maximum of 32x zoom-out
   VPC_CHECK_COND_FAIL_PRINT_RETURN(flag, dpFail,
                                    "Max reduction multiple is 32. Please check up(%u), down(%u), out_height(%u).", up_,
                                    down_, out_height);
-  flag = (crop_height * 16 >= out_height) ? true : false;  // Up to 16x magnification
+  flag = (crop_height * 16 >= out_height);  // Up to 16x magnification
   VPC_CHECK_COND_FAIL_PRINT_RETURN(
     flag, dpFail, "Max magnification is 16. Please check up(%u), down(%u), out_height(%u).", up_, down_, out_height);
   return dpSucc;
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_vpc.h b/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_vpc.h
index 4622d7d16e3..ed93a2353f7 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_vpc.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/soft_vpc.h
@@ -34,7 +34,7 @@ class SoftVpc {
  public:
   SoftVpc();
 
-  ~SoftVpc() {}
+  ~SoftVpc() = default;
 
   /*
    * @brief : vpc Cropping and Scaling APIs.
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/yuv_scaler_para_set.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/yuv_scaler_para_set.cc
index 1b9bf6399eb..df27cf8e65b 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/yuv_scaler_para_set.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/soft_dvpp/utils/yuv_scaler_para_set.cc
@@ -75,7 +75,7 @@ void GetParaSet(std::string str_line, int32_t *flag_ctl, int32_t *flag_tap, YuvW
 
   // taps_4, the second character in the square brackets is the start address of the array block.
   if ((*flag_ctl - initBracketNum) % arrTypeNum == 2) {
-    while (1) {
+    while (true) {
       ss >> yuv_scaler_paraset->scale[cnt].taps_4[index->first_index++];
       if (ss.fail()) {  // rerad failed.
         index->first_index = index->first_index - 1;
@@ -94,7 +94,7 @@ void GetParaSet(std::string str_line, int32_t *flag_ctl, int32_t *flag_tap, YuvW
 
   // taps_6
   if ((*flag_ctl - initBracketNum) % arrTypeNum == 0) {
-    while (1) {
+    while (true) {
       ss >> yuv_scaler_paraset->scale[cnt].taps_6[index->second_index++];
       if (ss.fail()) {  // read failed.
         index->second_index = index->second_index - 1;
@@ -115,7 +115,6 @@ void GetParaSet(std::string str_line, int32_t *flag_ctl, int32_t *flag_tap, YuvW
 }
 
 int32_t CheckParamater(std::pair<bool, std::string> rlt, uint32_t i) {
-  int32_t ret = dpSucc;
   if (rlt.first == false) {
     API_LOGE("Get real path failed. index = %u", i);
     return dpFail;
@@ -126,7 +125,7 @@ int32_t CheckParamater(std::pair<bool, std::string> rlt, uint32_t i) {
     return dpFail;
   }
 
-  return ret;
+  return dpSucc;
 }
 
 // Read the parameter set file and skip the comments in the file.
@@ -177,7 +176,7 @@ int32_t ParseFileToVar(const std::string *para_set_name, uint32_t yuv_scaler_par
       }
 
       // cale the number of "{",check the location of the data.
-      if (str_line.find("{") != std::string::npos) {
+      if (str_line.find('{') != std::string::npos) {
         flag_ctl++;
         flag_tap = 1;
       }
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/solarize_op.h b/mindspore/ccsrc/minddata/dataset/kernels/image/solarize_op.h
index b69d91106de..ab36e53d359 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/solarize_op.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/solarize_op.h
@@ -19,6 +19,7 @@
 
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
 
 #include "minddata/dataset/core/tensor.h"
@@ -29,9 +30,9 @@ namespace mindspore {
 namespace dataset {
 class SolarizeOp : public TensorOp {
  public:
-  explicit SolarizeOp(std::vector<uint8_t> threshold = {0, 255}) : threshold_(threshold) {}
+  explicit SolarizeOp(std::vector<uint8_t> threshold = {0, 255}) : threshold_(std::move(threshold)) {}
 
-  ~SolarizeOp() = default;
+  ~SolarizeOp() override = default;
 
   Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
 
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/swap_red_blue_op.h b/mindspore/ccsrc/minddata/dataset/kernels/image/swap_red_blue_op.h
index 696d00b33bb..48206e488c2 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/swap_red_blue_op.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/swap_red_blue_op.h
@@ -30,7 +30,7 @@ namespace dataset {
 class SwapRedBlueOp : public TensorOp {
  public:
   /// \brief Constructor
-  SwapRedBlueOp() {}
+  SwapRedBlueOp() = default;
 
   SwapRedBlueOp(const SwapRedBlueOp &rhs) = default;
 
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/uniform_aug_op.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/uniform_aug_op.cc
index 95d75af0f2d..d27b6f9e3aa 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/uniform_aug_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/uniform_aug_op.cc
@@ -22,7 +22,7 @@ namespace dataset {
 const int UniformAugOp::kDefNumOps = 2;
 
 UniformAugOp::UniformAugOp(std::vector<std::shared_ptr<TensorOp>> op_list, int32_t num_ops)
-    : tensor_op_list_(op_list), num_ops_(num_ops) {
+    : tensor_op_list_(std::move(op_list)), num_ops_(num_ops) {
   rnd_.seed(GetSeed());
 }
 
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/ir/data/transforms_ir.cc b/mindspore/ccsrc/minddata/dataset/kernels/ir/data/transforms_ir.cc
index ffb398c61ac..f12e758b1c4 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/ir/data/transforms_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/ir/data/transforms_ir.cc
@@ -70,7 +70,7 @@ Status ComposeOperation::ValidateParams() {
 std::shared_ptr<TensorOp> ComposeOperation::Build() {
   std::vector<std::shared_ptr<TensorOp>> tensor_ops;
   (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops),
-                       [](std::shared_ptr<TensorOperation> op) -> std::shared_ptr<TensorOp> { return op->Build(); });
+                       [](const auto &op) -> std::shared_ptr<TensorOp> { return op->Build(); });
   return std::make_shared<ComposeOp>(tensor_ops);
 }
 
@@ -198,7 +198,7 @@ std::shared_ptr<TensorOp> PadEndOperation::Build() { return std::make_shared<Pad
 #endif
 
 // PreBuiltOperation
-PreBuiltOperation::PreBuiltOperation(std::shared_ptr<TensorOp> tensor_op) : op_(tensor_op) {
+PreBuiltOperation::PreBuiltOperation(std::shared_ptr<TensorOp> tensor_op) : op_(std::move(tensor_op)) {
 #ifdef ENABLE_PYTHON
   auto pyfunc_tensor_op = std::dynamic_pointer_cast<PyFuncOp>(tensor_op);
   if (pyfunc_tensor_op && pyfunc_tensor_op->IsRandom()) random_op_ = true;
@@ -245,7 +245,7 @@ Status RandomChoiceOperation::ValidateParams() {
 std::shared_ptr<TensorOp> RandomChoiceOperation::Build() {
   std::vector<std::shared_ptr<TensorOp>> tensor_ops;
   (void)std::transform(transforms_.begin(), transforms_.end(), std::back_inserter(tensor_ops),
-                       [](std::shared_ptr<TensorOperation> op) -> std::shared_ptr<TensorOp> { return op->Build(); });
+                       [](const auto &op) -> std::shared_ptr<TensorOp> { return op->Build(); });
   return std::make_shared<RandomChoiceOp>(tensor_ops);
 }
 
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/adjust_gamma_ir.cc b/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/adjust_gamma_ir.cc
index 8b81888f965..52c75289141 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/adjust_gamma_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/adjust_gamma_ir.cc
@@ -13,8 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <algorithm>
-
 #include "minddata/dataset/kernels/ir/vision/adjust_gamma_ir.h"
 
 #ifndef ENABLE_ANDROID
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/bounding_box_augment_ir.cc b/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/bounding_box_augment_ir.cc
index e99074c5a49..dc4e4af653f 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/bounding_box_augment_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/bounding_box_augment_ir.cc
@@ -18,6 +18,7 @@
 #include "minddata/dataset/kernels/ir/vision/bounding_box_augment_ir.h"
 
 #ifndef ENABLE_ANDROID
+#include "minddata/dataset/engine/serdes.h"
 #include "minddata/dataset/kernels/image/bounding_box_augment_op.h"
 #endif
 
@@ -56,6 +57,20 @@ Status BoundingBoxAugmentOperation::to_json(nlohmann::json *out_json) {
   *out_json = args;
   return Status::OK();
 }
+
+Status BoundingBoxAugmentOperation::from_json(nlohmann::json op_params, std::shared_ptr<TensorOperation> *operation) {
+  CHECK_FAIL_RETURN_UNEXPECTED(op_params.find("transform") != op_params.end(), "Failed to find transform");
+  CHECK_FAIL_RETURN_UNEXPECTED(op_params.find("ratio") != op_params.end(), "Failed to find ratio");
+  std::vector<std::shared_ptr<TensorOperation>> transforms;
+  std::vector<nlohmann::json> json_operations = {};
+  json_operations.push_back(op_params["transform"]);
+  RETURN_IF_NOT_OK(Serdes::ConstructTensorOps(json_operations, &transforms));
+  float ratio = op_params["ratio"];
+  CHECK_FAIL_RETURN_UNEXPECTED(transforms.size() == 1,
+                               "Expect size one of transforms parameter, but got:" + std::to_string(transforms.size()));
+  *operation = std::make_shared<vision::BoundingBoxAugmentOperation>(transforms[0], ratio);
+  return Status::OK();
+}
 #endif
 }  // namespace vision
 }  // namespace dataset
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/bounding_box_augment_ir.h b/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/bounding_box_augment_ir.h
index f209c659530..a1655a74148 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/bounding_box_augment_ir.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/bounding_box_augment_ir.h
@@ -49,6 +49,8 @@ class BoundingBoxAugmentOperation : public TensorOperation {
 
   Status to_json(nlohmann::json *out_json) override;
 
+  static Status from_json(nlohmann::json op_params, std::shared_ptr<TensorOperation> *operation);
+
  private:
   std::shared_ptr<TensorOperation> transform_;
   float ratio_;
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/random_select_subpolicy_ir.cc b/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/random_select_subpolicy_ir.cc
index 38b7692c5bd..350240e6715 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/random_select_subpolicy_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/random_select_subpolicy_ir.cc
@@ -18,6 +18,7 @@
 #include "minddata/dataset/kernels/ir/vision/random_select_subpolicy_ir.h"
 
 #ifndef ENABLE_ANDROID
+#include "minddata/dataset/engine/serdes.h"
 #include "minddata/dataset/kernels/image/random_select_subpolicy_op.h"
 #endif
 
@@ -100,6 +101,33 @@ Status RandomSelectSubpolicyOperation::to_json(nlohmann::json *out_json) {
   (*out_json)["policy"] = policy_tensor_ops;
   return Status::OK();
 }
+
+Status RandomSelectSubpolicyOperation::from_json(nlohmann::json op_params,
+                                                 std::shared_ptr<TensorOperation> *operation) {
+  CHECK_FAIL_RETURN_UNEXPECTED(op_params.find("policy") != op_params.end(), "Failed to find policy");
+  nlohmann::json policy_json = op_params["policy"];
+  std::vector<std::vector<std::pair<std::shared_ptr<TensorOperation>, double>>> policy;
+  std::vector<std::pair<std::shared_ptr<TensorOperation>, double>> policy_items;
+  for (nlohmann::json item : policy_json) {
+    for (nlohmann::json item_pair : item) {
+      CHECK_FAIL_RETURN_UNEXPECTED(item_pair.find("prob") != item_pair.end(), "Failed to find prob");
+      CHECK_FAIL_RETURN_UNEXPECTED(item_pair.find("tensor_op") != item_pair.end(), "Failed to find tensor_op");
+      std::vector<std::shared_ptr<TensorOperation>> operations;
+      std::pair<std::shared_ptr<TensorOperation>, double> policy_pair;
+      std::shared_ptr<TensorOperation> operation;
+      nlohmann::json tensor_op_json;
+      double prob = item_pair["prob"];
+      tensor_op_json.push_back(item_pair["tensor_op"]);
+      RETURN_IF_NOT_OK(Serdes::ConstructTensorOps(tensor_op_json, &operations));
+      CHECK_FAIL_RETURN_UNEXPECTED(operations.size() == 1, "There should be only 1 tensor operation");
+      policy_pair = std::make_pair(operations[0], prob);
+      policy_items.push_back(policy_pair);
+    }
+    policy.push_back(policy_items);
+  }
+  *operation = std::make_shared<vision::RandomSelectSubpolicyOperation>(policy);
+  return Status::OK();
+}
 #endif
 }  // namespace vision
 }  // namespace dataset
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/random_select_subpolicy_ir.h b/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/random_select_subpolicy_ir.h
index 63cbef029d0..225c7342b55 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/random_select_subpolicy_ir.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/random_select_subpolicy_ir.h
@@ -50,6 +50,8 @@ class RandomSelectSubpolicyOperation : public TensorOperation {
 
   Status to_json(nlohmann::json *out_json) override;
 
+  static Status from_json(nlohmann::json op_params, std::shared_ptr<TensorOperation> *operation);
+
  private:
   std::vector<std::vector<std::pair<std::shared_ptr<TensorOperation>, double>>> policy_;
 };
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/rgb_to_bgr_ir.cc b/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/rgb_to_bgr_ir.cc
index 4f23dcffb07..1295acb837d 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/rgb_to_bgr_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/rgb_to_bgr_ir.cc
@@ -13,14 +13,10 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <algorithm>
-
 #include "minddata/dataset/kernels/ir/vision/rgb_to_bgr_ir.h"
 
 #include "minddata/dataset/kernels/image/rgb_to_bgr_op.h"
 
-#include "minddata/dataset/kernels/ir/validators.h"
-
 namespace mindspore {
 namespace dataset {
 
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/uniform_aug_ir.cc b/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/uniform_aug_ir.cc
index d400cd3127b..f92555775e1 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/uniform_aug_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/uniform_aug_ir.cc
@@ -18,6 +18,7 @@
 #include "minddata/dataset/kernels/ir/vision/uniform_aug_ir.h"
 
 #ifndef ENABLE_ANDROID
+#include "minddata/dataset/engine/serdes.h"
 #include "minddata/dataset/kernels/image/uniform_aug_op.h"
 #endif
 
@@ -74,6 +75,16 @@ Status UniformAugOperation::to_json(nlohmann::json *out_json) {
   *out_json = args;
   return Status::OK();
 }
+
+Status UniformAugOperation::from_json(nlohmann::json op_params, std::shared_ptr<TensorOperation> *operation) {
+  CHECK_FAIL_RETURN_UNEXPECTED(op_params.find("transforms") != op_params.end(), "Failed to find transforms");
+  CHECK_FAIL_RETURN_UNEXPECTED(op_params.find("num_ops") != op_params.end(), "Failed to find num_ops");
+  std::vector<std::shared_ptr<TensorOperation>> transforms = {};
+  RETURN_IF_NOT_OK(Serdes::ConstructTensorOps(op_params["transforms"], &transforms));
+  int32_t num_ops = op_params["num_ops"];
+  *operation = std::make_shared<vision::UniformAugOperation>(transforms, num_ops);
+  return Status::OK();
+}
 #endif
 }  // namespace vision
 }  // namespace dataset
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/uniform_aug_ir.h b/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/uniform_aug_ir.h
index 8189c36a31f..fad559e2f35 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/uniform_aug_ir.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/ir/vision/uniform_aug_ir.h
@@ -49,6 +49,8 @@ class UniformAugOperation : public TensorOperation {
 
   Status to_json(nlohmann::json *out_json) override;
 
+  static Status from_json(nlohmann::json op_params, std::shared_ptr<TensorOperation> *operation);
+
  private:
   std::vector<std::shared_ptr<TensorOperation>> transforms_;
   int32_t num_ops_;
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h b/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h
index 8c4308d41f6..5be990a3329 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h
@@ -145,6 +145,9 @@ constexpr char kBandBiquadOp[] = "BandBiquadOp";
 constexpr char kBandpassBiquadOp[] = "BandpassBiquadOp";
 constexpr char kBandrejectBiquadOp[] = "BandrejectBiquadOp";
 constexpr char kBassBiquadOp[] = "BassBiquadOp";
+constexpr char kComplexNormOp[] = "ComplexNormOp";
+constexpr char kFrequencyMaskingOp[] = "FrequencyMaskingOp";
+constexpr char kTimeMaskingOp[] = "TimeMaskingOp";
 constexpr char kTimeStretchOp[] = "TimeStretchOp";
 
 // data
diff --git a/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.cc b/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.cc
index 028111bfea2..68c4407ceb3 100644
--- a/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/text/ir/kernels/text_ir.cc
@@ -15,7 +15,6 @@
  */
 
 #include <unistd.h>
-
 #include "minddata/dataset/text/ir/kernels/text_ir.h"
 
 #ifndef _WIN32
@@ -316,7 +315,9 @@ Status SentencePieceTokenizerOperation::ValidateParams() {
       RETURN_STATUS_SYNTAX_ERROR(err_msg);
     }
   } else {
-    Path vocab_file(vocab_path_);
+    std::string real_vocab_path;
+    RETURN_IF_NOT_OK(Path::RealPath(vocab_path_, real_vocab_path));
+    Path vocab_file(real_vocab_path);
     if (!vocab_file.Exists() || vocab_file.IsDirectory()) {
       std::string err_msg = "SentencePieceTokenizer : vocab file: [" + vocab_path_ + "] is invalid or does not exist.";
       MS_LOG(ERROR) << err_msg;
diff --git a/mindspore/ccsrc/minddata/dataset/text/kernels/basic_tokenizer_op.cc b/mindspore/ccsrc/minddata/dataset/text/kernels/basic_tokenizer_op.cc
index cee1de58447..1ae12990ae3 100644
--- a/mindspore/ccsrc/minddata/dataset/text/kernels/basic_tokenizer_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/text/kernels/basic_tokenizer_op.cc
@@ -54,10 +54,10 @@ BasicTokenizerOp::BasicTokenizerOp(const bool &lower_case, const bool &keep_whit
     : TokenizerOp(with_offsets),
       lower_case_(lower_case),
       keep_whitespace_(keep_whitespace),
+      normalization_form_(normalization_form),
       preserve_unused_token_(preserve_unused_token),
       case_fold_(std::make_unique<CaseFoldOp>()),
       nfd_normalize_(std::make_unique<NormalizeUTF8Op>(NormalizeForm::kNfd)),
-      normalization_form_(normalization_form),
       common_normalize_(std::make_unique<NormalizeUTF8Op>(normalization_form)),
       replace_accent_chars_(std::make_unique<RegexReplaceOp>("\\p{Mn}", "")),
       replace_control_chars_(std::make_unique<RegexReplaceOp>("\\p{Cc}|\\p{Cf}", " ")) {
@@ -81,6 +81,7 @@ Status BasicTokenizerOp::CaseFoldWithoutUnusedWords(const std::string_view &text
   icu::ErrorCode error;
   const icu::Normalizer2 *nfkc_case_fold = icu::Normalizer2::getNFKCCasefoldInstance(error);
   CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "BasicTokenizer: getNFKCCasefoldInstance failed.");
+  RETURN_UNEXPECTED_IF_NULL(output);
   output->clear();
 
   // 1. get start and end offsets of not case fold strs
@@ -131,7 +132,7 @@ Status BasicTokenizerOp::CaseFoldWithoutUnusedWords(const std::shared_ptr<Tensor
   IO_CHECK(input, output);
   CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "BasicTokenizer: input is not string datatype.");
   std::vector<std::string> strs(input->Size());
-  int i = 0;
+  size_t i = 0;
   for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
     RETURN_IF_NOT_OK(CaseFoldWithoutUnusedWords(*iter, kUnusedWords, &strs[i++]));
   }
diff --git a/mindspore/ccsrc/minddata/dataset/text/kernels/case_fold_op.cc b/mindspore/ccsrc/minddata/dataset/text/kernels/case_fold_op.cc
index a3b93336c3f..f9f7a2790f8 100644
--- a/mindspore/ccsrc/minddata/dataset/text/kernels/case_fold_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/text/kernels/case_fold_op.cc
@@ -31,7 +31,7 @@ Status CaseFoldOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr
   const icu::Normalizer2 *nfkc_case_fold = icu::Normalizer2::getNFKCCasefoldInstance(error);
   CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "CaseFold: getNFKCCasefoldInstance failed.");
   std::vector<std::string> strs(input->Size());
-  int i = 0;
+  size_t i = 0;
   for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
     icu::StringByteSink<std::string> sink(&strs[i++]);
     nfkc_case_fold->normalizeUTF8(0, icu::StringPiece((*iter).data(), (*iter).size()), sink, nullptr, error);
diff --git a/mindspore/ccsrc/minddata/dataset/text/kernels/ngram_op.cc b/mindspore/ccsrc/minddata/dataset/text/kernels/ngram_op.cc
index d9b24eae454..b794b4c00f4 100644
--- a/mindspore/ccsrc/minddata/dataset/text/kernels/ngram_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/text/kernels/ngram_op.cc
@@ -44,7 +44,9 @@ Status NgramOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Te
   offsets.reserve(1 + l_len_ + r_len_ + input->shape().NumOfElements());
   str_buffer.reserve(l_pad_with_sp_.size() * l_len_ + r_pad_with_sp_.size() * r_len_ + input->SizeInBytes());
   offsets.push_back(str_buffer.size());  // insert 0 as the starting pos
-  for (int l_i = 0; l_i < l_len_; l_i++) offsets.push_back((str_buffer += l_pad_with_sp_).size());
+  for (int l_i = 0; l_i < l_len_; l_i++) {
+    offsets.push_back((str_buffer += l_pad_with_sp_).size());
+  }
 
   for (auto itr = input->begin<std::string_view>(); itr != input->end<std::string_view>(); ++itr) {
     str_buffer += (*itr);
@@ -52,7 +54,9 @@ Status NgramOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Te
     offsets.push_back(str_buffer.size());
   }
 
-  for (int r_i = 0; r_i < r_len_; r_i++) offsets.push_back((str_buffer += r_pad_with_sp_).size());
+  for (int r_i = 0; r_i < r_len_; r_i++) {
+    offsets.push_back((str_buffer += r_pad_with_sp_).size());
+  }
 
   for (auto n : ngrams_) {
     CHECK_FAIL_RETURN_UNEXPECTED(n > 0, "Ngram: ngrams needs to be a positive number.\n");
diff --git a/mindspore/ccsrc/minddata/dataset/util/allocator.h b/mindspore/ccsrc/minddata/dataset/util/allocator.h
index 82cf9956fc2..6df5b1d6925 100644
--- a/mindspore/ccsrc/minddata/dataset/util/allocator.h
+++ b/mindspore/ccsrc/minddata/dataset/util/allocator.h
@@ -92,8 +92,9 @@ template <typename T, typename C = std::allocator<T>, typename... Args>
 Status MakeUnique(std::unique_ptr<T[], std::function<void(T *)>> *out, C alloc, size_t n, Args &&... args) {
   RETURN_UNEXPECTED_IF_NULL(out);
   CHECK_FAIL_RETURN_UNEXPECTED(n > 0, "size must be positive");
+  T *data = nullptr;
   try {
-    T *data = alloc.allocate(n);
+    data = alloc.allocate(n);
     // Some of our implementation of allocator (e.g. NumaAllocator) don't throw std::bad_alloc.
     // So we have to catch for null ptr
     if (data == nullptr) {
@@ -114,8 +115,14 @@ Status MakeUnique(std::unique_ptr<T[], std::function<void(T *)>> *out, C alloc,
     };
     *out = std::unique_ptr<T[], std::function<void(T *)>>(data, std::bind(deleter, std::placeholders::_1, alloc, n));
   } catch (const std::bad_alloc &e) {
+    if (data != nullptr) {
+      alloc.deallocate(data, n);
+    }
     return Status(StatusCode::kMDOutOfMemory);
   } catch (const std::exception &e) {
+    if (data != nullptr) {
+      alloc.deallocate(data, n);
+    }
     RETURN_STATUS_UNEXPECTED(e.what());
   }
   return Status::OK();
diff --git a/mindspore/ccsrc/minddata/dataset/util/arena.cc b/mindspore/ccsrc/minddata/dataset/util/arena.cc
index b64b2874f03..3540406a87a 100644
--- a/mindspore/ccsrc/minddata/dataset/util/arena.cc
+++ b/mindspore/ccsrc/minddata/dataset/util/arena.cc
@@ -42,6 +42,7 @@ ArenaImpl::ArenaImpl(void *ptr, size_t sz) : size_in_bytes_(sz), ptr_(ptr) {
 }
 
 Status ArenaImpl::Allocate(size_t n, void **p) {
+  RETURN_UNEXPECTED_IF_NULL(p);
   if (n == 0) {
     *p = nullptr;
     return Status::OK();
@@ -83,6 +84,10 @@ std::pair<std::pair<uint64_t, uint64_t>, bool> ArenaImpl::FindPrevBlk(uint64_t a
 }
 
 void ArenaImpl::Deallocate(void *p) {
+  if (p == nullptr) {
+    MS_LOG(ERROR) << "The pointer[p] is null.";
+    return;
+  }
   auto *q = get_base_addr(p);
   MemHdr hdr(0, 0);
   MemHdr::getHdr(q, &hdr);
@@ -147,8 +152,8 @@ bool ArenaImpl::BlockEnlarge(uint64_t *addr, uint64_t old_sz, uint64_t new_sz) {
 }
 
 Status ArenaImpl::FreeAndAlloc(void **pp, size_t old_sz, size_t new_sz) {
-  MS_ASSERT(pp);
-  MS_ASSERT(*pp);
+  RETURN_UNEXPECTED_IF_NULL(pp);
+  RETURN_UNEXPECTED_IF_NULL(*pp);
   void *p = nullptr;
   void *q = *pp;
   RETURN_IF_NOT_OK(Allocate(new_sz, &p));
@@ -163,8 +168,8 @@ Status ArenaImpl::FreeAndAlloc(void **pp, size_t old_sz, size_t new_sz) {
 }
 
 Status ArenaImpl::Reallocate(void **pp, size_t old_sz, size_t new_sz) {
-  MS_ASSERT(pp);
-  MS_ASSERT(*pp);
+  RETURN_UNEXPECTED_IF_NULL(pp);
+  RETURN_UNEXPECTED_IF_NULL(*pp);
   uint64_t actual_size = static_cast<uint64_t>(new_sz) + ARENA_WALL_OVERHEAD_SZ;
   if (actual_size > this->get_max_size()) {
     RETURN_STATUS_UNEXPECTED("Request size too big : " + std::to_string(new_sz));
@@ -212,6 +217,10 @@ int ArenaImpl::PercentFree() const {
   for (auto &it : tr_) {
     sz += it.priority;
   }
+  if (size_in_bytes_ == 0) {
+    MS_LOG(ERROR) << "size_in_bytes_ can not be zero.";
+    return 0;
+  }
   double ratio = static_cast<double>(sz * ARENA_BLK_SZ) / static_cast<double>(size_in_bytes_);
   return static_cast<int>(ratio * 100.0);
 }
diff --git a/mindspore/ccsrc/minddata/dataset/util/buddy.cc b/mindspore/ccsrc/minddata/dataset/util/buddy.cc
index 2c9c0305d34..bb11771ad0a 100644
--- a/mindspore/ccsrc/minddata/dataset/util/buddy.cc
+++ b/mindspore/ccsrc/minddata/dataset/util/buddy.cc
@@ -65,6 +65,8 @@ Status BuddySpace::Init() {
 }
 
 Status BuddySpace::Alloc(const uint64_t sz, BSpaceDescriptor *desc, addr_t *p) noexcept {
+  RETURN_UNEXPECTED_IF_NULL(desc);
+  RETURN_UNEXPECTED_IF_NULL(p);
   std::lock_guard<std::mutex> lock(mutex_);
   addr_t addr = AllocNoLock(sz, desc);
   if (addr != NOSPACE) {
@@ -100,6 +102,10 @@ void BuddySpace::FreeNoLock(const BSpaceDescriptor *desc) {
 }
 
 void BuddySpace::Free(const BSpaceDescriptor *desc) {
+  if (desc == nullptr) {
+    MS_LOG(ERROR) << "The pointer[desc] is null.";
+    return;
+  }
   std::lock_guard<std::mutex> lock(mutex_);
   return FreeNoLock(desc);
 }
@@ -135,6 +141,18 @@ std::ostream &operator<<(std::ostream &os, const BuddySpace &s) {
   return os;
 }
 
+uint32_t BuddySpace::SizeToBlock(const uint64_t sz) const {
+  if (min_ == 0) {
+    MS_LOG(ERROR) << "min_ can not be zero.";
+    return 0;
+  }
+  uint32_t reqSize = (sz / min_);
+  if (sz % min_) {
+    reqSize++;
+  }
+  return reqSize;
+}
+
 void BuddySpace::GetBuddySegState(const rel_addr_t rel_addr, size_t *rel_sz, STATE *st) const {
   const int32_t kAddrOffset = 4;
   const int32_t kShiftOffset = 2;
diff --git a/mindspore/ccsrc/minddata/dataset/util/buddy.h b/mindspore/ccsrc/minddata/dataset/util/buddy.h
index 97834c1c436..1264001431a 100644
--- a/mindspore/ccsrc/minddata/dataset/util/buddy.h
+++ b/mindspore/ccsrc/minddata/dataset/util/buddy.h
@@ -105,13 +105,7 @@ class BuddySpace {
 
   void FreeNoLock(const BSpaceDescriptor *desc);
 
-  uint32_t SizeToBlock(const uint64_t sz) const {
-    uint32_t reqSize = (sz / min_);
-    if (sz % min_) {
-      reqSize++;
-    }
-    return reqSize;
-  }
+  uint32_t SizeToBlock(const uint64_t sz) const;
 
   void GetBuddySegState(const rel_addr_t rel_addr, size_t *rel_sz, STATE *st) const;
 
diff --git a/mindspore/ccsrc/minddata/dataset/util/json_helper.cc b/mindspore/ccsrc/minddata/dataset/util/json_helper.cc
index ea721d42035..56e5e460bbc 100644
--- a/mindspore/ccsrc/minddata/dataset/util/json_helper.cc
+++ b/mindspore/ccsrc/minddata/dataset/util/json_helper.cc
@@ -31,6 +31,7 @@ namespace dataset {
 Status JsonHelper::CreateAlbum(const std::string &in_dir, const std::string &out_dir) {
   // in check
   Path base_dir = Path(in_dir);
+  RETURN_IF_NOT_OK(RealPath(in_dir));
   if (!base_dir.IsDirectory() || !base_dir.Exists()) {
     RETURN_STATUS_UNEXPECTED("Input dir is not a directory or doesn't exist");
   }
@@ -41,8 +42,8 @@ Status JsonHelper::CreateAlbum(const std::string &in_dir, const std::string &out
   // iterate over in dir and create json for all images
   uint64_t index = 0;
   auto dir_it = Path::DirIterator::OpenDirectory(&base_dir);
-  while (dir_it->hasNext()) {
-    Path v = dir_it->next();
+  while (dir_it->HasNext()) {
+    Path v = dir_it->Next();
     // check if found file fits image extension
 
     // create json file in output dir with the path
@@ -53,6 +54,12 @@ Status JsonHelper::CreateAlbum(const std::string &in_dir, const std::string &out
   return Status::OK();
 }
 
+Status JsonHelper::RealPath(const std::string &path) {
+  std::string real_path;
+  RETURN_IF_NOT_OK(Path::RealPath(path, real_path));
+  return Status::OK();
+}
+
 // A print method typically used for debugging
 void JsonHelper::Print(std::ostream &out) const {
   out << "  Data Helper"
@@ -65,10 +72,16 @@ Status JsonHelper::UpdateArray(const std::string &in_file, const std::string &ke
     Path in = Path(in_file);
     nlohmann::json js;
     if (in.Exists()) {
-      std::ifstream in_stream(in_file);
-      MS_LOG(INFO) << "Filename: " << in_file << ".";
-      in_stream >> js;
-      in_stream.close();
+      RETURN_IF_NOT_OK(RealPath(in_file));
+      try {
+        std::ifstream in_stream(in_file);
+        MS_LOG(INFO) << "Filename: " << in_file << ".";
+        in_stream >> js;
+        in_stream.close();
+      } catch (const std::exception &err) {
+        RETURN_STATUS_UNEXPECTED("Invalid file, failed to open json file: " + in_file +
+                                 ", please delete it and try again!");
+      }
     }
     js[key] = value;
     MS_LOG(INFO) << "Write outfile is: " << js << ".";
@@ -94,12 +107,18 @@ Status JsonHelper::RemoveKey(const std::string &in_file, const std::string &key,
     Path in = Path(in_file);
     nlohmann::json js;
     if (in.Exists()) {
-      std::ifstream in_stream(in_file);
-      MS_LOG(INFO) << "Filename: " << in_file << ".";
-      in_stream >> js;
-      in_stream.close();
+      RETURN_IF_NOT_OK(RealPath(in_file));
+      try {
+        std::ifstream in_stream(in_file);
+        MS_LOG(INFO) << "Filename: " << in_file << ".";
+        in_stream >> js;
+        in_stream.close();
+      } catch (const std::exception &err) {
+        RETURN_STATUS_UNEXPECTED("Invalid file, failed to open json file: " + in_file +
+                                 ", please delete it and try again!");
+      }
     }
-    js.erase(key);
+    (void)js.erase(key);
     MS_LOG(INFO) << "Write outfile is: " << js << ".";
     if (out_file == "") {
       std::ofstream o(in_file, std::ofstream::trunc);
diff --git a/mindspore/ccsrc/minddata/dataset/util/json_helper.h b/mindspore/ccsrc/minddata/dataset/util/json_helper.h
index 26541438794..cfa729a3a5c 100644
--- a/mindspore/ccsrc/minddata/dataset/util/json_helper.h
+++ b/mindspore/ccsrc/minddata/dataset/util/json_helper.h
@@ -70,13 +70,20 @@ class JsonHelper {
       Path in = Path(in_file);
       nlohmann::json js;
       if (in.Exists()) {
-        std::ifstream in(in_file);
-        MS_LOG(INFO) << "Filename: " << in_file << ".";
-        in >> js;
-        in.close();
+        RETURN_IF_NOT_OK(RealPath(in_file));
+        try {
+          std::ifstream in_stream(in_file);
+          MS_LOG(INFO) << "Filename: " << in_file << ".";
+          in_stream >> js;
+          in_stream.close();
+        } catch (const std::exception &err) {
+          RETURN_STATUS_UNEXPECTED("Invalid file, failed to open json file: " + in_file +
+                                   ", please delete it and try again!");
+        }
       }
       js[key] = value;
       MS_LOG(INFO) << "Write outfile is: " << js << ".";
+
       if (out_file == "") {
         std::ofstream o(in_file, std::ofstream::trunc);
         o << js;
@@ -107,10 +114,16 @@ class JsonHelper {
       Path in = Path(in_file);
       nlohmann::json js;
       if (in.Exists()) {
-        std::ifstream in(in_file);
-        MS_LOG(INFO) << "Filename: " << in_file << ".";
-        in >> js;
-        in.close();
+        RETURN_IF_NOT_OK(RealPath(in_file));
+        try {
+          std::ifstream in_stream(in_file);
+          MS_LOG(INFO) << "Filename: " << in_file << ".";
+          in_stream >> js;
+          in_stream.close();
+        } catch (const std::exception &err) {
+          RETURN_STATUS_UNEXPECTED("Invalid file, failed to open json file: " + in_file +
+                                   ", please delete it and try again!");
+        }
       }
       js[key] = value;
       MS_LOG(INFO) << "Write outfile is: " << js << ".";
@@ -161,7 +174,9 @@ class JsonHelper {
   template <typename T>
   Status WriteBinFile(const std::string &in_file, T *data, size_t length) {
     try {
-      std::ofstream o(in_file, std::ios::binary | std::ios::out);
+      std::string real_in_file;
+      RETURN_IF_NOT_OK(Path::RealPath(in_file, real_in_file));
+      std::ofstream o(real_in_file, std::ios::binary | std::ios::out);
       if (!o.is_open()) {
         RETURN_STATUS_UNEXPECTED("Error opening Bin file to write");
       }
@@ -185,7 +200,7 @@ class JsonHelper {
   size_t DumpData(const unsigned char *tensor_addr, const size_t &tensor_size, void *addr, const size_t &buffer_size);
 
   /// \brief Helper function to delete key in json file
-  /// note This function will return okay even if key not found
+  /// \note This function will return okay even if key not found
   /// \param[in] in_file Json file to remove key from
   /// \param[in] key The key to remove
   /// \return Status The status code returned
@@ -195,10 +210,16 @@ class JsonHelper {
   /// \param out - The output stream to write output to
   void Print(std::ostream &out) const;
 
+  /// \brief Helper function to check real path
+  /// \note This function will return okay even if key not found
+  /// \param[in] path Path to Json file
+  /// \return Status The status code returned
+  Status RealPath(const std::string &path);
+
   /// \brief << Stream output operator overload
-  /// \notes This allows you to write the debug print info using stream operators
+  /// \note This allows you to write the debug print info using stream operators
   /// \param out Reference to the output stream being overloaded
-  /// \param ds Reference to the DataSchema to display
+  /// \param dh Reference to the DataSchema to display
   /// \return The output stream must be returned
   friend std::ostream &operator<<(std::ostream &out, const JsonHelper &dh) {
     dh.Print(out);
diff --git a/mindspore/ccsrc/minddata/dataset/util/numa_interface.cc b/mindspore/ccsrc/minddata/dataset/util/numa_interface.cc
index a61bcf75498..47560b71173 100644
--- a/mindspore/ccsrc/minddata/dataset/util/numa_interface.cc
+++ b/mindspore/ccsrc/minddata/dataset/util/numa_interface.cc
@@ -27,6 +27,14 @@ inline void *LoadLibrary(const char *name) {
 }
 
 inline void *GetNumaAdapterFunc(void *handle, const char *name) {
+  if (handle == nullptr) {
+    MS_LOG(ERROR) << "The pointer[handle] is null.";
+    return nullptr;
+  }
+  if (name == nullptr) {
+    MS_LOG(ERROR) << "The pointer[name] is null.";
+    return nullptr;
+  }
   void *func = dlsym(handle, name);
   return func;
 }
diff --git a/mindspore/ccsrc/minddata/dataset/util/numa_interface.h b/mindspore/ccsrc/minddata/dataset/util/numa_interface.h
index daa3c0f0583..19dad6d3a91 100644
--- a/mindspore/ccsrc/minddata/dataset/util/numa_interface.h
+++ b/mindspore/ccsrc/minddata/dataset/util/numa_interface.h
@@ -16,6 +16,7 @@
 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_UTIL_NUMA_INTERFACE_H_
 #define MINDSPORE_CCSRC_MINDDATA_DATASET_UTIL_NUMA_INTERFACE_H_
 
+#include "minddata/dataset/util/log_adapter.h"
 #include "minddata/dataset/util/status.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/minddata/dataset/util/path.cc b/mindspore/ccsrc/minddata/dataset/util/path.cc
index a2764f2a33f..e81680533be 100644
--- a/mindspore/ccsrc/minddata/dataset/util/path.cc
+++ b/mindspore/ccsrc/minddata/dataset/util/path.cc
@@ -20,7 +20,6 @@
 #include <unistd.h>
 #include <new>
 #include <sstream>
-#include <utility>
 
 #include "./securec.h"
 #include "utils/ms_utils.h"
@@ -324,7 +323,7 @@ Path::DirIterator::DirIterator(Path *f) : dir_(f), dp_(nullptr), entry_(nullptr)
   dp_ = opendir(f->toString().c_str());
 }
 
-bool Path::DirIterator::hasNext() {
+bool Path::DirIterator::HasNext() {
   do {
     entry_ = readdir(dp_);
     if (entry_) {
@@ -337,7 +336,25 @@ bool Path::DirIterator::hasNext() {
   return (entry_ != nullptr);
 }
 
-Path Path::DirIterator::next() { return (*(this->dir_) / Path(entry_->d_name)); }
+Path Path::DirIterator::Next() { return (*(this->dir_) / Path(entry_->d_name)); }
+
+Status Path::RealPath(const std::string &path, std::string &realpath_str) {
+  char real_path[PATH_MAX] = {0};
+  // input_path is only file_name
+#if defined(_WIN32) || defined(_WIN64)
+  CHECK_FAIL_RETURN_UNEXPECTED(path.length() < PATH_MAX,
+                               "The length of path: " + path + " exceeds limit: " + std::to_string(PATH_MAX));
+  auto ret = _fullpath(real_path, common::SafeCStr(path), PATH_MAX);
+  CHECK_FAIL_RETURN_UNEXPECTED(ret != nullptr, "The file " + path + " does not exist.");
+#else
+  CHECK_FAIL_RETURN_UNEXPECTED(path.length() < NAME_MAX,
+                               "The length of path: " + path + " exceeds limit: " + std::to_string(NAME_MAX));
+  auto ret = realpath(common::SafeCStr(path), real_path);
+  CHECK_FAIL_RETURN_UNEXPECTED(ret != nullptr, "The file " + path + " does not exist.");
+#endif
+  realpath_str = std::string(real_path);
+  return Status::OK();
+}
 
 std::ostream &operator<<(std::ostream &os, const Path &s) {
   os << s.path_;
diff --git a/mindspore/ccsrc/minddata/dataset/util/path.h b/mindspore/ccsrc/minddata/dataset/util/path.h
index cb131ad5ae0..ea340b07916 100644
--- a/mindspore/ccsrc/minddata/dataset/util/path.h
+++ b/mindspore/ccsrc/minddata/dataset/util/path.h
@@ -32,9 +32,9 @@ class Path {
 
     ~DirIterator();
 
-    bool hasNext();
+    bool HasNext();
 
-    Path next();
+    Path Next();
 
    private:
     explicit DirIterator(Path *f);
@@ -116,6 +116,8 @@ class Path {
 
   std::string Basename();
 
+  static Status RealPath(const std::string &path, std::string &realpath_str);  // NOLINT
+
   friend std::ostream &operator<<(std::ostream &os, const Path &s);
 
  private:
diff --git a/mindspore/ccsrc/minddata/dataset/util/slice.h b/mindspore/ccsrc/minddata/dataset/util/slice.h
index ca76b546a0f..0c3f07f9295 100644
--- a/mindspore/ccsrc/minddata/dataset/util/slice.h
+++ b/mindspore/ccsrc/minddata/dataset/util/slice.h
@@ -105,7 +105,7 @@ class WritableSlice : public ReadableSlice {
   WritableSlice &operator=(const WritableSlice &lhs) {
     if (this != &lhs) {
       mutable_data_ = lhs.mutable_data_;
-      ReadableSlice::operator=(lhs);
+      (void)ReadableSlice::operator=(lhs);
     }
     return *this;
   }
@@ -119,7 +119,7 @@ class WritableSlice : public ReadableSlice {
     if (this != &lhs) {
       mutable_data_ = lhs.mutable_data_;
       lhs.mutable_data_ = nullptr;
-      ReadableSlice::operator=(std::move(lhs));
+      (void)ReadableSlice::operator=(std::move(lhs));
     }
     return *this;
   }
diff --git a/mindspore/ccsrc/minddata/dataset/util/system_pool.h b/mindspore/ccsrc/minddata/dataset/util/system_pool.h
index 789252dc8c8..4e43ef235af 100644
--- a/mindspore/ccsrc/minddata/dataset/util/system_pool.h
+++ b/mindspore/ccsrc/minddata/dataset/util/system_pool.h
@@ -39,9 +39,14 @@ class SystemPool : public MemoryPool {
 
   Status Allocate(size_t n, void **pp) override { return DeMalloc(n, pp, false); }
 
-  void Deallocate(void *p) override { free(p); }
+  void Deallocate(void *p) override {
+    if (p != nullptr) {
+      free(p);
+    }
+  }
 
   Status Reallocate(void **p, size_t old_sz, size_t new_sz) override {
+    RETURN_UNEXPECTED_IF_NULL(p);
     if (old_sz >= new_sz) {
       // Do nothing if we shrink.
       return Status::OK();
diff --git a/mindspore/ccsrc/minddata/dataset/util/task_manager.cc b/mindspore/ccsrc/minddata/dataset/util/task_manager.cc
index 3e7303fbb26..635113cb558 100644
--- a/mindspore/ccsrc/minddata/dataset/util/task_manager.cc
+++ b/mindspore/ccsrc/minddata/dataset/util/task_manager.cc
@@ -53,7 +53,7 @@ Status TaskManager::CreateAsyncTask(const std::string &my_name, const std::funct
   // Track all the TaskGroup. Used for control-c
   {
     LockGuard lck(&tg_lock_);
-    this->grp_list_.insert(vg);
+    (void)this->grp_list_.insert(vg);
   }
   RETURN_IF_NOT_OK((*task)->wp_.Register(vg));
   RETURN_IF_NOT_OK((*task)->Run());
@@ -170,7 +170,7 @@ Status TaskManager::DoServiceStart() {
     watchdog_grp_ = nullptr;
     return rc;
   }
-  grp_list_.erase(watchdog_grp_);
+  (void)grp_list_.erase(watchdog_grp_);
   lru_.Remove(watchdog_);
 #endif
   return Status::OK();
diff --git a/mindspore/ccsrc/minddata/mindrecord/include/shard_header.h b/mindspore/ccsrc/minddata/mindrecord/include/shard_header.h
index fd3aa9d2d87..e2bff12c469 100644
--- a/mindspore/ccsrc/minddata/mindrecord/include/shard_header.h
+++ b/mindspore/ccsrc/minddata/mindrecord/include/shard_header.h
@@ -133,7 +133,7 @@ class __attribute__((visibility("default"))) ShardHeader {
 
   MSRStatus FileToPages(const std::string dump_file_name);
 
-  static MSRStatus initialize(const std::shared_ptr<ShardHeader> *header_ptr, const json &schema,
+  static MSRStatus Initialize(const std::shared_ptr<ShardHeader> *header_ptr, const json &schema,
                               const std::vector<std::string> &index_fields, std::vector<std::string> &blob_fields,
                               uint64_t &schema_id);
 
diff --git a/mindspore/ccsrc/minddata/mindrecord/include/shard_index_generator.h b/mindspore/ccsrc/minddata/mindrecord/include/shard_index_generator.h
index 8b5d58c74d2..474d6bb6d41 100644
--- a/mindspore/ccsrc/minddata/mindrecord/include/shard_index_generator.h
+++ b/mindspore/ccsrc/minddata/mindrecord/include/shard_index_generator.h
@@ -57,7 +57,7 @@ class __attribute__((visibility("default"))) ShardIndexGenerator {
   /// \brief create databases for indexes
   MSRStatus WriteToDatabase();
 
-  static MSRStatus finalize(const std::vector<std::string> file_names);
+  static MSRStatus Finalize(const std::vector<std::string> file_names);
 
  private:
   static int Callback(void *not_used, int argc, char **argv, char **az_col_name);
diff --git a/mindspore/ccsrc/minddata/mindrecord/include/shard_writer.h b/mindspore/ccsrc/minddata/mindrecord/include/shard_writer.h
index afff0ecae7a..d014536ff3b 100644
--- a/mindspore/ccsrc/minddata/mindrecord/include/shard_writer.h
+++ b/mindspore/ccsrc/minddata/mindrecord/include/shard_writer.h
@@ -112,7 +112,7 @@ class __attribute__((visibility("default"))) ShardWriter {
                           const std::map<std::string, std::unique_ptr<std::vector<uint8_t>>> &row_bin_data,
                           std::shared_ptr<std::vector<uint8_t>> *output);
 
-  static MSRStatus initialize(const std::unique_ptr<ShardWriter> *writer_ptr,
+  static MSRStatus Initialize(const std::unique_ptr<ShardWriter> *writer_ptr,
                               const std::vector<std::string> &file_names);
 
  private:
diff --git a/mindspore/ccsrc/minddata/mindrecord/io/shard_index_generator.cc b/mindspore/ccsrc/minddata/mindrecord/io/shard_index_generator.cc
index 59a68116912..4c6681e1516 100644
--- a/mindspore/ccsrc/minddata/mindrecord/io/shard_index_generator.cc
+++ b/mindspore/ccsrc/minddata/mindrecord/io/shard_index_generator.cc
@@ -499,7 +499,6 @@ ROW_DATA ShardIndexGenerator::GenerateRowData(int shard_no, const std::map<int,
         in.seekg(page_size_ * (cur_raw_page->GetPageID()) + header_size_ + cur_raw_page_offset, std::ios::beg);
       if (!io_seekg.good() || io_seekg.fail() || io_seekg.bad()) {
         MS_LOG(ERROR) << "File seekg failed";
-        in.close();
         return {FAILED, {}};
       }
 
@@ -511,7 +510,6 @@ ROW_DATA ShardIndexGenerator::GenerateRowData(int shard_no, const std::map<int,
           auto &io_read = in.read(reinterpret_cast<char *>(&schema_size), kInt64Len);
           if (!io_read.good() || io_read.fail() || io_read.bad()) {
             MS_LOG(ERROR) << "File read failed";
-            in.close();
             return {FAILED, {}};
           }
 
@@ -598,15 +596,21 @@ MSRStatus ShardIndexGenerator::ExecuteTransaction(const int &shard_no, std::pair
     auto sql = GenerateRawSQL(fields_);
     if (sql.first != SUCCESS) {
       MS_LOG(ERROR) << "Generate raw SQL failed";
+      in.close();
+      sqlite3_close(db.second);
       return FAILED;
     }
     auto data = GenerateRowData(shard_no, blob_id_to_page_id, raw_page_id, in);
     if (data.first != SUCCESS) {
       MS_LOG(ERROR) << "Generate raw data failed";
+      in.close();
+      sqlite3_close(db.second);
       return FAILED;
     }
     if (BindParameterExecuteSQL(db.second, sql.second, data.second) == FAILED) {
       MS_LOG(ERROR) << "Execute SQL failed";
+      in.close();
+      sqlite3_close(db.second);
       return FAILED;
     }
     MS_LOG(INFO) << "Insert " << data.second.size() << " rows to index db.";
@@ -690,7 +694,7 @@ void ShardIndexGenerator::DatabaseWriter() {
     shard_no = task_++;
   }
 }
-MSRStatus ShardIndexGenerator::finalize(const std::vector<std::string> file_names) {
+MSRStatus ShardIndexGenerator::Finalize(const std::vector<std::string> file_names) {
   if (file_names.empty()) {
     MS_LOG(ERROR) << "Mindrecord files is empty.";
     return FAILED;
diff --git a/mindspore/ccsrc/minddata/mindrecord/io/shard_reader.cc b/mindspore/ccsrc/minddata/mindrecord/io/shard_reader.cc
index f182d503b1e..ec5bd0436df 100644
--- a/mindspore/ccsrc/minddata/mindrecord/io/shard_reader.cc
+++ b/mindspore/ccsrc/minddata/mindrecord/io/shard_reader.cc
@@ -101,6 +101,7 @@ MSRStatus ShardReader::Init(const std::vector<std::string> &file_paths, bool loa
     sqlite3 *db = nullptr;
     auto ret3 = VerifyDataset(&db, file);
     if (ret3 != SUCCESS) {
+      sqlite3_close(db);
       return FAILED;
     }
 
@@ -154,6 +155,7 @@ MSRStatus ShardReader::VerifyDataset(sqlite3 **db, const string &file) {
   auto rc = sqlite3_open_v2(common::SafeCStr(file + ".db"), db, SQLITE_OPEN_READONLY, nullptr);
   if (rc != SQLITE_OK) {
     MS_LOG(ERROR) << "Invalid file, failed to open database: " << file + ".db, error: " << sqlite3_errmsg(*db);
+    sqlite3_close(*db);
     return FAILED;
   }
   MS_LOG(DEBUG) << "Opened database successfully";
@@ -177,6 +179,7 @@ MSRStatus ShardReader::VerifyDataset(sqlite3 **db, const string &file) {
       return FAILED;
     }
   }
+  sqlite3_free(errmsg);
   return SUCCESS;
 }
 
@@ -400,16 +403,19 @@ MSRStatus ShardReader::ConvertLabelToJson(const std::vector<std::vector<std::str
       }
     } catch (std::out_of_range &e) {
       MS_LOG(ERROR) << "Out of range: " << e.what();
+      fs->close();
       return FAILED;
     } catch (std::invalid_argument &e) {
       MS_LOG(ERROR) << "Invalid argument: " << e.what();
+      fs->close();
       return FAILED;
     } catch (...) {
       MS_LOG(ERROR) << "Exception was caught while convert label to json.";
+      fs->close();
       return FAILED;
     }
   }
-
+  fs->close();
   return SUCCESS;
 }  // namespace mindrecord
 
@@ -499,6 +505,7 @@ void ShardReader::GetClassesInShard(sqlite3 *db, int shard_id, const std::string
   for (int i = 0; i < static_cast<int>(columns.size()); ++i) {
     category_ptr->emplace(columns[i][0]);
   }
+  sqlite3_free(errmsg);
 }
 
 ROW_GROUPS ShardReader::ReadAllRowGroup(const std::vector<std::string> &columns) {
@@ -931,8 +938,8 @@ int64_t ShardReader::GetNumClasses(const std::string &category_field) {
   std::string sql = "SELECT DISTINCT " + ret.second + " FROM INDEXES";
   std::vector<std::thread> threads = std::vector<std::thread>(shard_count);
   auto category_ptr = std::make_shared<std::set<std::string>>();
+  sqlite3 *db = nullptr;
   for (int x = 0; x < shard_count; x++) {
-    sqlite3 *db = nullptr;
     int rc = sqlite3_open_v2(common::SafeCStr(file_paths_[x] + ".db"), &db, SQLITE_OPEN_READONLY, nullptr);
     if (SQLITE_OK != rc) {
       MS_LOG(ERROR) << "Invalid file, failed to open database: " << file_paths_[x] + ".db, error: "
@@ -941,10 +948,10 @@ int64_t ShardReader::GetNumClasses(const std::string &category_field) {
     }
     threads[x] = std::thread(&ShardReader::GetClassesInShard, this, db, x, sql, category_ptr);
   }
-
   for (int x = 0; x < shard_count; x++) {
     threads[x].join();
   }
+  sqlite3_close(db);
   return category_ptr->size();
 }
 
diff --git a/mindspore/ccsrc/minddata/mindrecord/io/shard_writer.cc b/mindspore/ccsrc/minddata/mindrecord/io/shard_writer.cc
index c23e2656084..e80d16c2124 100644
--- a/mindspore/ccsrc/minddata/mindrecord/io/shard_writer.cc
+++ b/mindspore/ccsrc/minddata/mindrecord/io/shard_writer.cc
@@ -569,6 +569,7 @@ int ShardWriter::LockWriter(bool parallel_writer) {
     auto realpath = Common::GetRealPath(file);
     if (!realpath.has_value()) {
       MS_LOG(ERROR) << "Get real path failed, path=" << file;
+      close(fd);
       return -1;
     }
 
@@ -576,6 +577,7 @@ int ShardWriter::LockWriter(bool parallel_writer) {
     fs->open(realpath.value(), std::ios::in | std::ios::out | std::ios::binary);
     if (fs->fail()) {
       MS_LOG(ERROR) << "Invalid file, failed to open file: " << file;
+      close(fd);
       return -1;
     }
     file_streams_.push_back(fs);
@@ -583,6 +585,7 @@ int ShardWriter::LockWriter(bool parallel_writer) {
 
   if (shard_header_->FileToPages(pages_file_) == FAILED) {
     MS_LOG(ERROR) << "Invalid data, failed to read pages from file.";
+    close(fd);
     return -1;
   }
   return fd;
@@ -1212,6 +1215,7 @@ MSRStatus ShardWriter::WriteShardHeader() {
       uint64_t line_len = bin_header.size();
       if (line_len + kInt64Len > header_size_) {
         MS_LOG(ERROR) << "Shard header is too big";
+        file_streams_[shard_id]->close();
         return FAILED;
       }
 
@@ -1304,7 +1308,7 @@ void ShardWriter::SetLastBlobPage(const int &shard_id, std::shared_ptr<Page> &la
   }
 }
 
-MSRStatus ShardWriter::initialize(const std::unique_ptr<ShardWriter> *writer_ptr,
+MSRStatus ShardWriter::Initialize(const std::unique_ptr<ShardWriter> *writer_ptr,
                                   const std::vector<std::string> &file_names) {
   if (writer_ptr == nullptr) {
     MS_LOG(ERROR) << "ShardWriter pointer is NULL.";
diff --git a/mindspore/ccsrc/minddata/mindrecord/meta/shard_column.cc b/mindspore/ccsrc/minddata/mindrecord/meta/shard_column.cc
index 84b8e45d698..25d0463dd81 100644
--- a/mindspore/ccsrc/minddata/mindrecord/meta/shard_column.cc
+++ b/mindspore/ccsrc/minddata/mindrecord/meta/shard_column.cc
@@ -421,6 +421,12 @@ MSRStatus ShardColumn::UncompressInt(const uint64_t &column_id, std::unique_ptr<
 
   auto data = reinterpret_cast<const unsigned char *>(array_data.get());
   *data_ptr = std::make_unique<unsigned char[]>(*num_bytes);
+
+  // field is none. for example: numpy is null
+  if (*num_bytes == 0) {
+    return SUCCESS;
+  }
+
   int ret_code = memcpy_s(data_ptr->get(), *num_bytes, data, *num_bytes);
   if (ret_code != 0) {
     MS_LOG(ERROR) << "Failed to copy data!";
diff --git a/mindspore/ccsrc/minddata/mindrecord/meta/shard_header.cc b/mindspore/ccsrc/minddata/mindrecord/meta/shard_header.cc
index 040aa115e3e..737b6e93c2b 100644
--- a/mindspore/ccsrc/minddata/mindrecord/meta/shard_header.cc
+++ b/mindspore/ccsrc/minddata/mindrecord/meta/shard_header.cc
@@ -372,9 +372,10 @@ std::vector<std::string> ShardHeader::SerializeHeader() {
 std::string ShardHeader::SerializeIndexFields() {
   json j;
   auto fields = index_->GetFields();
-  for (const auto &field : fields) {
-    j.push_back({{"schema_id", field.first}, {"index_field", field.second}});
-  }
+  (void)std::transform(fields.begin(), fields.end(), std::back_inserter(j),
+                       [](const std::pair<uint64_t, std::string> &field) -> json {
+                         return {{"schema_id", field.first}, {"index_field", field.second}};
+                       });
   return j.dump();
 }
 
@@ -382,9 +383,8 @@ std::vector<std::string> ShardHeader::SerializePage() {
   std::vector<string> pages;
   for (auto &shard_pages : pages_) {
     json j;
-    for (const auto &p : shard_pages) {
-      j.emplace_back(p->GetPage());
-    }
+    (void)std::transform(shard_pages.begin(), shard_pages.end(), std::back_inserter(j),
+                         [](const std::shared_ptr<Page> &p) { return p->GetPage(); });
     pages.emplace_back(j.dump());
   }
   return pages;
@@ -392,25 +392,22 @@ std::vector<std::string> ShardHeader::SerializePage() {
 
 std::string ShardHeader::SerializeStatistics() {
   json j;
-  for (const auto &stats : statistics_) {
-    j.emplace_back(stats->GetStatistics());
-  }
+  (void)std::transform(statistics_.begin(), statistics_.end(), std::back_inserter(j),
+                       [](const std::shared_ptr<Statistics> &stats) { return stats->GetStatistics(); });
   return j.dump();
 }
 
 std::string ShardHeader::SerializeSchema() {
   json j;
-  for (const auto &schema : schema_) {
-    j.emplace_back(schema->GetSchema());
-  }
+  (void)std::transform(schema_.begin(), schema_.end(), std::back_inserter(j),
+                       [](const std::shared_ptr<Schema> &schema) { return schema->GetSchema(); });
   return j.dump();
 }
 
 std::string ShardHeader::SerializeShardAddress() {
   json j;
-  for (const auto &addr : shard_addresses_) {
-    j.emplace_back(GetFileName(addr).second);
-  }
+  (void)std::transform(shard_addresses_.begin(), shard_addresses_.end(), std::back_inserter(j),
+                       [](const std::string &addr) { return GetFileName(addr).second; });
   return j.dump();
 }
 
@@ -759,7 +756,7 @@ MSRStatus ShardHeader::FileToPages(const std::string dump_file_name) {
   return SUCCESS;
 }
 
-MSRStatus ShardHeader::initialize(const std::shared_ptr<ShardHeader> *header_ptr, const json &schema,
+MSRStatus ShardHeader::Initialize(const std::shared_ptr<ShardHeader> *header_ptr, const json &schema,
                                   const std::vector<std::string> &index_fields, std::vector<std::string> &blob_fields,
                                   uint64_t &schema_id) {
   if (header_ptr == nullptr) {
@@ -775,9 +772,8 @@ MSRStatus ShardHeader::initialize(const std::shared_ptr<ShardHeader> *header_ptr
   // create index
   std::vector<std::pair<uint64_t, std::string>> id_index_fields;
   if (!index_fields.empty()) {
-    for (auto &el : index_fields) {
-      id_index_fields.emplace_back(schema_id, el);
-    }
+    (void)std::transform(index_fields.begin(), index_fields.end(), std::back_inserter(id_index_fields),
+                         [schema_id](const std::string &el) { return std::make_pair(schema_id, el); });
     if (SUCCESS != (*header_ptr)->AddIndexFields(id_index_fields)) {
       MS_LOG(ERROR) << "Got unexpected error when adding mindrecord index.";
       return FAILED;
diff --git a/mindspore/ccsrc/pipeline/jit/CMakeLists.txt b/mindspore/ccsrc/pipeline/jit/CMakeLists.txt
index 2829aad6069..e1a6b32b0e0 100644
--- a/mindspore/ccsrc/pipeline/jit/CMakeLists.txt
+++ b/mindspore/ccsrc/pipeline/jit/CMakeLists.txt
@@ -8,7 +8,6 @@ file(GLOB_RECURSE _PIPELINE_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
     "pipeline_split.cc"
     "parse/*.cc"
     "static_analysis/*.cc"
-    "prim_bprop_optimizer.cc"
 )
 
 
diff --git a/mindspore/ccsrc/pipeline/jit/action.cc b/mindspore/ccsrc/pipeline/jit/action.cc
index df4f77a8f18..b77878dec0a 100644
--- a/mindspore/ccsrc/pipeline/jit/action.cc
+++ b/mindspore/ccsrc/pipeline/jit/action.cc
@@ -36,6 +36,7 @@
 #include "pipeline/jit/static_analysis/remove_monad.h"
 #include "abstract/abstract_value.h"
 #include "pipeline/jit/static_analysis/static_analysis.h"
+#include "pipeline/jit/static_analysis/async_eval_result.h"
 #include "pipeline/jit/static_analysis/program_specialize.h"
 #include "pipeline/jit/resource.h"
 #include "utils/ms_context.h"
@@ -108,7 +109,7 @@ void ExecuteActionForMindRT(const ResourcePtr &res) {
   // Construct the graph run function ptr.
   compile::VmEvalFuncPtr run =
     std::make_shared<compile::VmEvalFunc>([mindrt_bc_ptr, actor_info](const VectorRef &args) -> BaseRef {
-      MS_LOG(INFO) << "Execute args size " << args.size();
+      MS_LOG(DEBUG) << "Execute args size " << args.size();
       VectorRef outputs;
       mindrt_bc_ptr->RunGraph(actor_info, args, &outputs);
       MS_LOG(DEBUG) << "out size " << outputs.size();
@@ -121,28 +122,6 @@ using CompileGraphs = compile::CompileGraphs;
 using abstract::AnalysisResult;
 using mindspore::abstract::AnalysisContextPtr;
 
-inline bool ResetCNodeFromLoad(const AnfNodePtr &node) {
-  if (node->isa<CNode>() && node->cast<CNodePtr>()->get_load_flag()) {
-    // Process partial("DeadNode",args) when the graph is loaded.
-    auto operatorPtr = node->cast<CNodePtr>()->input(0);
-    // Set abstract of switch(c,f,t) to null
-    auto prim = GetValueNode<PrimitivePtr>(operatorPtr);
-    if (IsPrimitiveEquals(prim::kPrimSwitch, prim) || IsPrimitiveEquals(prim::kPrimSwitchLayer, prim)) {
-      node->set_abstract(nullptr);
-      return true;
-    }
-    // Set abstract of switch(c,f,t)() to null
-    prim = GetCNodePrimitive(operatorPtr);
-    if (IsPrimitiveEquals(prim::kPrimSwitch, prim) || IsPrimitiveEquals(prim::kPrimSwitchLayer, prim)) {
-      node->set_abstract(nullptr);
-      return true;
-    }
-    // Previous inferred value
-    return true;
-  }
-  return false;
-}
-
 abstract::AnalysisResult AbstractAnalyze(const ResourcePtr &res, const FuncGraphPtr &func_graph,
                                          const abstract::AbstractBasePtrList &args_spec, bool clear) {
   MS_LOG(DEBUG) << "AbstractAnalyze start";
@@ -154,24 +133,22 @@ abstract::AnalysisResult AbstractAnalyze(const ResourcePtr &res, const FuncGraph
     engine->Clear();
     for (auto &node : manager->all_nodes()) {
       MS_EXCEPTION_IF_NULL(node);
-      const AbstractBasePtr &prev_inferred = node->abstract();
-
-      // AbstractFunction has context,but contexts in cache have been cleaned.
-      if (prev_inferred != nullptr && prev_inferred->isa<abstract::AbstractFunction>()) {
-        node->set_abstract(nullptr);
-        MS_LOG(DEBUG) << "Abstract of node " << node->ToString() << " is set to nullptr";
-        continue;
-      }
 
       // Handle previous inferred value for CNode if is loaded from MindIR
-      if (res->is_load() && ResetCNodeFromLoad(node)) {
-        continue;
+      if (res->is_load()) {
+        // If the primitive is not defined in front end,keep the inferred value loaded from MindIR.
+        auto primitive = GetCNodePrimitive(node);
+        if (primitive != nullptr && abstract::GetPrimEvaluator(primitive, engine) == nullptr) {
+          MS_LOG(INFO) << "The primitive is not defined in front end. Primitive: " << primitive->ToString();
+          continue;
+        }
       }
 
+      const AbstractBasePtr &prev_inferred = node->abstract();
       // Keep previous inferred value for ValueNode if the inferred value is not AbstractFunction.
       if (!node->isa<ValueNode>() || (prev_inferred != nullptr && prev_inferred->isa<abstract::AbstractFunction>())) {
         node->set_abstract(nullptr);
-        MS_LOG(DEBUG) << "Abstract of node " << node->ToString() << " is set to nullptr";
+        MS_LOG(DEBUG) << "Abstract of node " << node->DebugString() << " is set to nullptr";
       }
     }
   }
@@ -219,80 +196,6 @@ FuncGraphPtr Renormalize(const ResourcePtr &res, const FuncGraphPtr &func_graph,
   return ret;
 }
 
-const FuncGraphPtr GetLoadedGraph(const ResourcePtr &res) {
-  MS_EXCEPTION_IF_NULL(res);
-  auto manager = res->manager();
-  MS_EXCEPTION_IF_NULL(manager);
-  FuncGraphPtr loaded_graph = nullptr;
-  size_t loaded_graph_num = 0;
-  auto all_graphs = manager->func_graphs();
-  for (auto &graph : all_graphs) {
-    MS_EXCEPTION_IF_NULL(graph);
-    if (graph->has_attr("is_load")) {
-      loaded_graph = graph;
-      loaded_graph_num += 1;
-      res->set_is_load(true);
-    }
-  }
-  if (loaded_graph_num == 0) {
-    return nullptr;
-  }
-  if (loaded_graph_num == 1) {
-    return loaded_graph;
-  }
-  MS_LOG(EXCEPTION) << "The loaded sub graph currently should less than 2, but got " << loaded_graph_num;
-}
-
-void CheckRootInputShapeAndType(const ResourcePtr &res, const FuncGraphPtr &loaded_graph) {
-  MS_EXCEPTION_IF_NULL(res);
-  auto manager = res->manager();
-  MS_EXCEPTION_IF_NULL(manager);
-  FuncGraphPtr root_graph = *(manager->roots().begin());
-  auto root_inputs = root_graph->get_inputs();
-  auto loaded_inputs = loaded_graph->get_inputs();
-  MS_LOG(DEBUG) << "root_graph: " << root_graph->ToString();
-  MS_LOG(DEBUG) << "loaded_graph: " << loaded_graph->ToString();
-
-  size_t root_inputs_num = root_inputs.size();
-  size_t loaded_inputs_num = loaded_inputs.size();
-  if (root_inputs_num != loaded_inputs_num) {
-    MS_LOG(EXCEPTION) << "The inputs number " << root_inputs_num << " not equal to the inputs number of loaded graph "
-                      << loaded_inputs_num;
-  }
-  for (size_t index = 0; index < root_inputs_num; index++) {
-    auto root_input = root_inputs[index];
-    auto loaded_input = loaded_inputs[index];
-
-    MS_LOG(DEBUG) << "root_input[" << index << "]: " << root_input->DebugString(1);
-    MS_LOG(DEBUG) << "loaded_input[" << index << "]: " << loaded_input->DebugString(1);
-    MS_LOG(DEBUG) << "root_input abstract[" << index
-                  << "]: " << (root_input->abstract() ? root_input->abstract()->ToString() : "NULL");
-    MS_LOG(DEBUG) << "loaded_input abstract [" << index
-                  << "]: " << (loaded_input->abstract() ? loaded_input->abstract()->ToString() : "NULL");
-
-    auto root_shape = root_input->Shape() == nullptr ? nullptr : dyn_cast<abstract::Shape>(root_input->Shape());
-    auto loaded_shape = loaded_input->Shape() == nullptr ? nullptr : dyn_cast<abstract::Shape>(loaded_input->Shape());
-    auto root_type = root_input->Type() == nullptr ? nullptr : dyn_cast<Type>(root_input->Type());
-    auto loaded_type = loaded_input->Type() == nullptr ? nullptr : dyn_cast<Type>(loaded_input->Type());
-
-    MS_EXCEPTION_IF_NULL(root_shape);
-    MS_EXCEPTION_IF_NULL(loaded_shape);
-    MS_EXCEPTION_IF_NULL(root_type);
-    MS_EXCEPTION_IF_NULL(loaded_type);
-
-    if (root_shape->shape() != loaded_shape->shape()) {
-      MS_EXCEPTION(ValueError) << "The " << index
-                               << " th input shape differ from loaded graph. Input shape: " << root_shape->ToString()
-                               << ", input shape of loaded graph: " << loaded_shape->ToString();
-    }
-    if (root_type->type_id() != loaded_type->type_id()) {
-      MS_EXCEPTION(TypeError) << "The " << std::to_string(index)
-                              << " th input type differ from loaded graph. Input type: " << root_type->ToString()
-                              << ", input type of loaded graph: " << loaded_type->ToString();
-    }
-  }
-}
-
 bool ParseAction(const ResourcePtr &res) {
   MS_EXCEPTION_IF_NULL(res);
   if (!res->input()) {
@@ -475,8 +378,6 @@ bool AbstractSpecializeAction(const ResourcePtr &res) {
   MS_EXCEPTION_IF_NULL(parallel::ParallelContext::GetInstance());
   context->ParallelParameterContextInitShape(func_graph);
 
-  // get original loaded graph to check inputs later
-  auto loaded_graph_ptr = GetLoadedGraph(res);
   // suppose that there is not KeywordArgument for the top graph
   // get the hyper parameter
   for (const auto &param : func_graph->parameters()) {
@@ -513,10 +414,6 @@ bool AbstractSpecializeAction(const ResourcePtr &res) {
       }
     }
   }
-  // check input after abstract when there is a loaded graph
-  if (loaded_graph_ptr != nullptr) {
-    CheckRootInputShapeAndType(res, loaded_graph_ptr);
-  }
   MS_LOG(DEBUG) << "End graph: " << new_fg->ToString() << ", return: " << new_fg->get_return()->DebugString(true);
   return true;
 }
@@ -636,9 +533,19 @@ bool TaskEmitAction(const ResourcePtr &res) {
     context_ptr->set_param<bool>(MS_CTX_ENABLE_LOOP_SINK, false);
   } else if (context_ptr->get_param<int>(MS_CTX_EXECUTION_MODE) != kPynativeMode) {
     std::string device_target = context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET);
-    if (device_target == kAscendDevice && backend != kMsVm) {
+    auto manager = func_graph->manager();
+    auto graphs = manager->func_graphs();
+    bool exist_while =
+      std::any_of(graphs.cbegin(), graphs.cend(), [](const FuncGraphPtr &fg) { return fg->recursive(); });
+    if (device_target == kAscendDevice && backend != kMsVm && !exist_while) {
+      MS_LOG(INFO) << "Run graph mode with multigraph sink.";
       bc_ptr->set_is_multi_graph_sink(true);
       context_ptr->set_param<bool>(MS_CTX_IS_MULTI_GRAPH_SINK, true);
+    } else {
+      MS_LOG(INFO) << "Run graph mode with vm.";
+      bc_ptr->set_is_multi_graph_sink(false);
+      context_ptr->set_param<bool>(MS_CTX_IS_MULTI_GRAPH_SINK, false);
+      context_ptr->set_param<bool>(MS_CTX_ENABLE_LOOP_SINK, false);
     }
   }
 
@@ -752,7 +659,8 @@ bool StartServerAction(const ResourcePtr &res) {
     {"updateModel", true, update_model_time_window, true, update_model_threshold},
     {"getModel"},
     {"pullWeight"},
-    {"pushWeight", false, 3000, true, server_num, true}};
+    {"pushWeight", false, 3000, true, server_num, true},
+    {"pushMetrics", false, 3000, true, 1}};
 
   float share_secrets_ratio = ps::PSContext::instance()->share_secrets_ratio();
   uint64_t cipher_time_window = ps::PSContext::instance()->cipher_time_window();
@@ -834,6 +742,66 @@ bool RemoveValueNodeDuplicationsAction(const ResourcePtr &res) {
 bool PipelineSplitAction(const ResourcePtr &res) { return PipelineSplitPass(res); }
 bool ValidateAction(const ResourcePtr &res) { return ValidatePass(res); }
 
+bool SetMindIRGraphAction(const ResourcePtr &res) {
+  MS_EXCEPTION_IF_NULL(res);
+  res->set_is_load(true);
+  auto cell = py::cast<CellPtr>(res->input());
+  if (cell == nullptr) {
+    MS_LOG(EXCEPTION) << "The graph loaded from mindir is null.";
+  }
+  const std::string mindir_graph = "graph_load_from_mindir";
+  auto obj = cell->GetAttr(mindir_graph);
+  if (obj == nullptr) {
+    MS_LOG(EXCEPTION) << "The graph loaded from mindir is null. The cell has not attribute: " << mindir_graph;
+  }
+  auto fg = GetValue<FuncGraphPtr>(obj);
+  if (fg == nullptr) {
+    MS_LOG(EXCEPTION) << "The graph loaded from mindir is null.";
+  }
+  res->set_func_graph(fg);
+  FuncGraphManagerPtr mng = fg->manager();
+  if (mng == nullptr) {
+    auto res_mng = res->manager();
+    MS_EXCEPTION_IF_NULL(res_mng);
+    res_mng->AddFuncGraph(fg);
+    fg->set_manager(res_mng);
+  }
+  abstract::AbstractBasePtrList broaded_args;
+  const auto &args_spec_list = res->args_spec();
+  (void)std::transform(args_spec_list.begin(), args_spec_list.end(), std::back_inserter(broaded_args),
+                       [](const AbstractBasePtr &arg) -> AbstractBasePtr {
+                         MS_EXCEPTION_IF_NULL(arg);
+                         if (arg->GetValueTrack() != kAnyValue) {
+                           return arg->Broaden();
+                         }
+                         return arg;
+                       });
+
+  // suppose that there is not KeywordArgument for the top graph
+  // get the hyper parameter
+  for (const auto &param : fg->parameters()) {
+    auto param_node = std::static_pointer_cast<Parameter>(param);
+    MS_EXCEPTION_IF_NULL(param_node);
+    if (param_node->has_default()) {
+      auto value = param_node->default_param();
+      MS_EXCEPTION_IF_NULL(value);
+      auto abs_value = value->ToAbstract()->cast<abstract::AbstractTensorPtr>();
+      auto ref_key = std::make_shared<RefKey>(param_node->name());
+      auto abs_ref_key = ref_key->ToAbstract();
+      auto abs_ref = std::make_shared<abstract::AbstractRef>(abs_ref_key, abs_value);
+      broaded_args.push_back(abs_ref);
+    }
+  }
+  auto result = AbstractAnalyze(res, res->func_graph(), broaded_args, true);
+  auto it = abstract::AnalysisResultCacheMgr::GetInstance().begin();
+  auto it_end = abstract::AnalysisResultCacheMgr::GetInstance().end();
+  for (; it != it_end; ++it) {
+    it->first->node()->set_abstract(it->second->abstract());
+  }
+  abstract::AnalysisResultCacheMgr::GetInstance().Clear();
+  return true;
+}
+
 bool ActionPyStub(const ResourcePtr &res, opt::python_pass::Phase phase) {
   MS_EXCEPTION_IF_NULL(res->manager());
   MS_EXCEPTION_IF_NULL(res->func_graph());
@@ -974,7 +942,17 @@ std::vector<ActionItem> BackendPipeline() {
   (void)actions.emplace_back(std::make_pair("execute", ExecuteAction));
   return actions;
 }
-
+std::vector<ActionItem> MindIRPipeline() {
+  std::vector<ActionItem> actions;
+  // Set funcGraph loaded from MindIR to resource.
+  (void)actions.emplace_back(std::make_pair("load_mindir", SetMindIRGraphAction));
+  (void)actions.emplace_back(std::make_pair("validate", ValidateAction));
+  // compile the ANF graph
+  (void)actions.emplace_back(std::make_pair("task_emit", TaskEmitAction));
+  // to execute the graph
+  (void)actions.emplace_back(std::make_pair("execute", ExecuteAction));
+  return actions;
+}
 #if ((defined ENABLE_CPU) && (!defined _WIN32))
 std::vector<ActionItem> ServerPipeline() {
   auto actions = CommonPipeline();
diff --git a/mindspore/ccsrc/pipeline/jit/action.h b/mindspore/ccsrc/pipeline/jit/action.h
index a88044369d6..4e75447e14c 100644
--- a/mindspore/ccsrc/pipeline/jit/action.h
+++ b/mindspore/ccsrc/pipeline/jit/action.h
@@ -49,6 +49,7 @@ bool StartServerAction(const ResourcePtr &res);
 
 std::vector<ActionItem> GePipeline();
 std::vector<ActionItem> VmPipeline();
+std::vector<ActionItem> MindIRPipeline();
 std::vector<ActionItem> BackendPipeline();
 std::vector<ActionItem> PServerPipeline();
 std::vector<ActionItem> ServerPipeline();
diff --git a/mindspore/ccsrc/pipeline/jit/base.h b/mindspore/ccsrc/pipeline/jit/base.h
index 41fbc05bcb2..34e07dbcfb3 100644
--- a/mindspore/ccsrc/pipeline/jit/base.h
+++ b/mindspore/ccsrc/pipeline/jit/base.h
@@ -24,7 +24,6 @@
 
 #include "ir/anf.h"
 #include "pipeline/jit/resource.h"
-#include "utils/ms_context.h"
 
 namespace mindspore {
 namespace pipeline {
@@ -45,20 +44,6 @@ inline std::string GetPhasePrefix(const std::string &phase) {
   }
   return phase.substr(0, pos);
 }
-
-inline std::string GetSaveGraphsPathName(const std::string &file_name) {
-  std::ostringstream oss;
-  auto ms_context = MsContext::GetInstance();
-  if (ms_context == nullptr) {
-    MS_LOG(EXCEPTION) << "ms_context is nullptr";
-  }
-  auto save_graphs_path = ms_context->get_param<std::string>(MS_CTX_SAVE_GRAPHS_PATH);
-  if (save_graphs_path.empty()) {
-    save_graphs_path = ".";
-  }
-  oss << save_graphs_path << "/" << file_name;
-  return oss.str();
-}
 }  // namespace pipeline
 }  // namespace mindspore
 
diff --git a/mindspore/ccsrc/pipeline/jit/init.cc b/mindspore/ccsrc/pipeline/jit/init.cc
index 9544e9ed3d5..08a172fa2e8 100644
--- a/mindspore/ccsrc/pipeline/jit/init.cc
+++ b/mindspore/ccsrc/pipeline/jit/init.cc
@@ -96,7 +96,9 @@ PYBIND11_MODULE(_c_expression, m) {
          py::arg("broadcast_params") = py::dict(), "Build data graph.")
     .def("has_compiled", &ExecutorPy::HasCompiled, py::arg("phase") = py::str(""), "get if cell compiled.")
     .def("run_init_graph", &ExecutorPy::RunInitGraph, "Run init Graph.")
-    .def("set_py_exe_path", &ExecutorPy::PyExePath, py::arg("phase") = py::str(""), "set python executable path.");
+    .def("set_py_exe_path", &ExecutorPy::PyExePath, py::arg("py_exe_path") = py::str(""), "set python executable path.")
+    .def("set_kernel_build_server_dir", &ExecutorPy::KernelBuildServerDir,
+         py::arg("kernel_build_server_dir") = py::str(""), "set kernel build server directory path.");
 
   (void)py::class_<EnvInstance, std::shared_ptr<EnvInstance>>(m, "EnvInstance_").def(py::init());
 
@@ -105,6 +107,8 @@ PYBIND11_MODULE(_c_expression, m) {
   (void)m.def("reset_op_id", &mindspore::pipeline::ResetOpId, "Reset Operator Id");
   (void)m.def("init_hccl", &mindspore::pipeline::InitHccl, "Init Hccl");
   (void)m.def("finalize_hccl", &mindspore::pipeline::FinalizeHccl, "Finalize Hccl");
+  (void)m.def("get_hccl_rank_id", &mindspore::pipeline::GetHcclRankId, "Get Hccl Rank Id");
+  (void)m.def("get_hccl_rank_size", &mindspore::pipeline::GetHcclRankSize, "Get Hccl Rank Size");
   (void)m.def("verify_inputs_signature", &mindspore::pipeline::VerifyInputSignature, "Verify input signature.");
   (void)m.def("init_exec_dataset", &mindspore::pipeline::InitExecDataset, py::arg("queue_name"), py::arg("size"),
               py::arg("batch_size"), py::arg("types"), py::arg("shapes"), py::arg("input_indexs"),
diff --git a/mindspore/ccsrc/pipeline/jit/parse/parse.cc b/mindspore/ccsrc/pipeline/jit/parse/parse.cc
index 37d4cede426..62d2c02741d 100644
--- a/mindspore/ccsrc/pipeline/jit/parse/parse.cc
+++ b/mindspore/ccsrc/pipeline/jit/parse/parse.cc
@@ -193,7 +193,7 @@ void Parser::GenerateArgsNodeForFunction(const FunctionBlockPtr &block, const py
   block_fg->set_has_kwarg(!py::isinstance<py::none>(kw_arg_node));
 
   py::list kwonly_args = python_adapter::GetPyObjAttr(func_args, "kwonlyargs");
-  block_fg->set_kwonlyargs_count(SizeToLong(kwonly_args.size()));
+  block_fg->set_kwonlyargs_count(SizeToInt(kwonly_args.size()));
 
   MS_EXCEPTION_IF_NULL(ast_);
   py::list args = ast_->GetArgs(fn_node);
diff --git a/mindspore/ccsrc/pipeline/jit/pass.cc b/mindspore/ccsrc/pipeline/jit/pass.cc
index 43bde4e9cd5..ccea7fd7209 100644
--- a/mindspore/ccsrc/pipeline/jit/pass.cc
+++ b/mindspore/ccsrc/pipeline/jit/pass.cc
@@ -263,6 +263,7 @@ opt::OptPassConfig GetOptPassA1(const opt::irpass::OptimizeIRPassLib &irpass) {
     irpass.env_get_set_item_eliminate_,
     irpass.env_get_item_depend_swap_,
 
+    irpass.cast_eliminate_,
     irpass.reshape_eliminate_,
     irpass.reduce_eliminate_,
     irpass.tile_eliminate_,
@@ -295,11 +296,11 @@ OptPassGroupMap GetOptPassesA(const opt::irpass::OptimizeIRPassLib &irpass) {
   opt::OptPassConfig a_2 = opt::OptPassConfig(
     {
       irpass.switch_simplify_,
-      irpass.cast_eliminate_,
       irpass.specialize_transform_,
       irpass.merge_addn_,
       irpass.float_tuple_getitem_switch_,
       irpass.float_env_getitem_switch_,
+      irpass.inline_,
       irpass.incorporate_getitem_set_,
       irpass.incorporate_call_,
       irpass.incorporate_call_switch_,
@@ -685,19 +686,19 @@ bool AutoMonadElimOptPass(const FuncGraphPtr &func_graph) {
   // opt::irpass::OptimizeIRPassLib is not used here to avoid double free problems in external calls.
   opt::SubstitutionPtr updatestate_depend_eliminater =
     opt::MakeSubstitution(std::make_shared<opt::irpass::UpdatestateDependEliminater>(), "updatestate_depend_eliminater",
-                          prim::kPrimUpdateState);
+                          prim::kPrimUpdateState, true);
   opt::SubstitutionPtr updatestate_assign_eliminater =
     opt::MakeSubstitution(std::make_shared<opt::irpass::UpdatestateAssignEliminater>(), "updatestate_assign_eliminater",
-                          prim::kPrimUpdateState);
+                          prim::kPrimUpdateState, true);
   opt::SubstitutionPtr updatestate_maketuple_eliminater =
     opt::MakeSubstitution(std::make_shared<opt::irpass::UpdatestateMakeTupleEliminater>(),
-                          "updatestate_maketuple_eliminater", prim::kPrimUpdateState);
+                          "updatestate_maketuple_eliminater", prim::kPrimUpdateState, true);
   opt::SubstitutionPtr updatestate_only_used_node_eliminater =
     opt::MakeSubstitution(std::make_shared<opt::irpass::UpdatestateOnlyUsedNodeEliminater>(),
                           "updatestate_only_used_node_eliminater", prim::kPrimUpdateState);
   opt::SubstitutionPtr updatestate_loads_eliminater =
     opt::MakeSubstitution(std::make_shared<opt::irpass::UpdatestateLoadsEliminater>(), "updatestate_loads_eliminater",
-                          prim::kPrimUpdateState);
+                          prim::kPrimUpdateState, true);
   opt::SubstitutionPtr updatestate_pure_node_eliminater =
     opt::MakeSubstitution(std::make_shared<opt::irpass::UpdatestatePureNodeEliminater>(),
                           "updatestate_pure_node_eliminater", prim::kPrimUpdateState);
diff --git a/mindspore/ccsrc/pipeline/jit/pipeline.cc b/mindspore/ccsrc/pipeline/jit/pipeline.cc
index 6d5cff6cc18..4fc01a93177 100644
--- a/mindspore/ccsrc/pipeline/jit/pipeline.cc
+++ b/mindspore/ccsrc/pipeline/jit/pipeline.cc
@@ -45,13 +45,14 @@
 #include "backend/session/executor_manager.h"
 #include "debug/trace.h"
 #include "debug/draw.h"
+#include "debug/common.h"
 #include "pipeline/pynative/pynative_execute.h"
 #include "frontend/optimizer/py_pass_manager.h"
 #include "pybind_api/pybind_patch.h"
 #include "utils/shape_utils.h"
 #include "utils/info.h"
 #include "load_mindir/load_model.h"
-#include "pipeline/jit/prim_bprop_optimizer.h"
+#include "frontend/optimizer/ad/prim_bprop_optimizer.h"
 #include "runtime/hardware/device_context_manager.h"
 #include "utils/crypto.h"
 
@@ -142,20 +143,21 @@ std::string GetCompileExceptionInfo() {
   return oss.str();
 }
 
-void SetGpuLoopSink(const ResourcePtr &resource) {
+void SetLoopCount(const ResourcePtr &resource) {
   MS_EXCEPTION_IF_NULL(resource);
   auto func_graph = resource->func_graph();
   if (func_graph != nullptr && func_graph->manager() != nullptr) {
     auto manager = func_graph->manager();
     size_t graph_nums = manager->func_graphs().size();
-    int64_t sinksize = ConfigManager::GetInstance().iter_num();
-    if (graph_nums == 1 || MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
-      resource->set_gpu_loopsink(true, sinksize);
-    } else {
-      resource->set_gpu_loopsink(false, sinksize);
+    int64_t loop_size = ConfigManager::GetInstance().iter_num();
+    const auto context_ptr = MsContext::GetInstance();
+    if (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice) {
+      resource->set_vm_loop(!context_ptr->get_param<bool>(MS_CTX_IS_MULTI_GRAPH_SINK), loop_size);
+    } else if (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice) {
+      bool run_with_mind_rt = graph_nums == 1 || context_ptr->get_param<bool>(MS_CTX_ENABLE_MINDRT);
+      resource->set_vm_loop(!run_with_mind_rt, loop_size);
     }
-    MS_LOG(INFO) << "Change gpu_loopsink_flag_ to " << resource->gpu_loopsink_flag() << ", set loopsink size to "
-                 << sinksize;
+    MS_LOG(INFO) << "Change vm_loop_flag to " << resource->vm_loop_flag() << ", set loop_size to " << loop_size;
   }
 }
 
@@ -610,6 +612,11 @@ bool IsPhaseTrain(const std::string &phase_s) {
   return phase_s.rfind(phase_to_train) != std::string::npos;
 }
 
+bool IsPhaseLoadFromMindIR(const std::string &phase_s) {
+  const std::string mindir_graph = "graph_load_from_mindir";
+  return phase_s.rfind(mindir_graph) != std::string::npos;
+}
+
 std::vector<ActionItem> GetPipeline(const ResourcePtr &resource, const std::string &phase_s, bool use_vm) {
   MS_EXCEPTION_IF_NULL(resource);
   bool is_air = IsPhaseExportAir(phase_s);
@@ -644,6 +651,9 @@ std::vector<ActionItem> GetPipeline(const ResourcePtr &resource, const std::stri
         resource->func_graph() != nullptr) {
       return BackendPipeline();
     }
+    if (IsPhaseLoadFromMindIR(phase_s)) {
+      return MindIRPipeline();
+    }
     return VmPipeline();
   }
   return GePipeline();
@@ -827,7 +837,7 @@ void Pipeline::Run(const std::string &phase_s) {
         MS_LOG(DEBUG) << "Action " << action.first << " end.";
       };
       if (action.first == "task_emit") {
-        SetGpuLoopSink(resource_);
+        SetLoopCount(resource_);
       } else if (action.first == "validate") {
         CacheValidateFuncGraph(phase_s, resource_);
       }
@@ -1003,13 +1013,17 @@ py::object ExecutorPy::Run(const py::tuple &args, const py::object &phase) {
     MS_LOG(EXCEPTION) << "Can't find run graph func for " << phase_s;
   }
   // Set loopsink size for each phase.
-  bool is_loopsink = info_[phase_s]->resource->gpu_loopsink_flag();
-  int64_t sinksize = info_[phase_s]->resource->gpu_loopsink_size();
-  ConfigManager::GetInstance().set_gpu_loopsink_size(is_loopsink ? sinksize : 1);
-  // If target is not gpu or is loopsink, keep vmloop 1.
-  bool g = (MsContext::GetInstance()->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice);
-  int64_t vm_loop = (!g || is_loopsink) ? 1 : sinksize;
-  MS_LOG(INFO) << "VM loop size " << vm_loop << ", loopsink size " << (is_loopsink ? sinksize : 1);
+  bool vm_loop_flag = info_[phase_s]->resource->vm_loop_flag();
+  int64_t loop_size = info_[phase_s]->resource->loop_size();
+  int64_t vm_loop = 1;
+  if (vm_loop_flag) {
+    vm_loop = loop_size;
+  } else {
+    // Set the loop size in config if graphs nums is 1(is_loop_sin=True), then there will be a loop embrace
+    // 'Execute(graph)' in GPUSession.
+    ConfigManager::GetInstance().set_gpu_loopsink_size(loop_size);
+  }
+  MS_LOG(INFO) << "VM loop size " << vm_loop << ", loopsink size " << vm_loop;
   py::object ret;
   MS_LOG(DEBUG) << "Eval run" << backend;
   for (int64_t i = 0; i < vm_loop; i++) {
@@ -1055,13 +1069,22 @@ void ExecutorPy::RunInitGraph(const py::dict &init_params, const std::string &ph
 
 void ExecutorPy::PyExePath(const py::object &py_exe_path) {
   if (!py::isinstance<py::str>(py_exe_path)) {
-    MS_LOG(EXCEPTION) << "Failed, phase input is not a str";
+    MS_LOG(EXCEPTION) << "Failed, py_exe_path input is not a str";
   }
   auto py_exe_path_s = py::cast<std::string>(py_exe_path);
   auto ms_context = MsContext::GetInstance();
   ms_context->set_param<std::string>(MS_CTX_PYTHON_EXE_PATH, py_exe_path_s);
 }
 
+void ExecutorPy::KernelBuildServerDir(const py::object &kernel_build_server_dir) {
+  if (!py::isinstance<py::str>(kernel_build_server_dir)) {
+    MS_LOG(EXCEPTION) << "Failed, kernel_build_server_dir input is not a str";
+  }
+  auto kernel_build_server_dir_s = py::cast<std::string>(kernel_build_server_dir);
+  auto ms_context = MsContext::GetInstance();
+  ms_context->set_param<std::string>(MS_CTX_KERNEL_BUILD_SERVER_DIR, kernel_build_server_dir_s);
+}
+
 bool InitExecDataset(const std::string &queue_name, int64_t iter_num, int64_t batch_size,
                      const std::vector<TypePtr> &types, const std::vector<std::vector<int64_t>> &shapes,
                      const std::vector<int64_t> &input_indexes, const std::string &phase, bool need_run) {
@@ -1159,9 +1182,6 @@ bool InitExecDatasetVm(const std::string &queue_name, int64_t size, int64_t batc
   // Convert CNodeList to LinConvertResult.
   auto segment = std::make_shared<GraphSegment>(std::vector<AnfNodePtr>{app_init}, false);
   auto runner = convert_fn(segment, "");
-  if (MsContext::GetInstance()->get_param<int>(MS_CTX_EXECUTION_MODE) != kPynativeMode) {
-    backend->Link(runner.graph_id);
-  }
   ConfigManager::GetInstance().set_iter_num(size);
   // PS cache does not support loop sink.
 #if ((defined ENABLE_CPU) && (!defined _WIN32))
@@ -1227,6 +1247,32 @@ void FinalizeHccl() {
 #endif
 }
 
+auto GetAscendRuntimeInstance() {
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  auto runtime_instance = device::KernelRuntimeManager::Instance().GetCurrentKernelRuntime();
+  MS_EXCEPTION_IF_NULL(runtime_instance);
+  auto backend = ms_context->backend_policy();
+  auto device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
+  if (backend == "ms" && device_target == kAscendDevice) {
+    return runtime_instance;
+  } else {
+    MS_LOG(EXCEPTION) << "Get MindSpore ascend runtime instance failed";
+  }
+}
+
+uint32_t GetHcclRankId() {
+  auto runtime_instance = GetAscendRuntimeInstance();
+  MS_EXCEPTION_IF_NULL(runtime_instance);
+  return runtime_instance->GetRankId();
+}
+
+uint32_t GetHcclRankSize() {
+  auto runtime_instance = GetAscendRuntimeInstance();
+  MS_EXCEPTION_IF_NULL(runtime_instance);
+  return runtime_instance->GetRankSize();
+}
+
 void ExportGraph(const std::string &file_name, const std::string &, const std::string &phase) {
 #if ((defined ENABLE_GE) || (defined ENABLE_D))
   ExportDFGraph(file_name, phase);
@@ -1320,7 +1366,7 @@ void ClearResAtexit() {
   device::DeviceContextManager::GetInstance().ClearDeviceContexts();
   ad::g_k_prims.clear();
   ad::ClearKPynativeCellStaticRes();
-  PrimBpropOptimizer::GetPrimBpropOptimizerInst().Clear();
+  ad::PrimBpropOptimizer::GetPrimBpropOptimizerInst().Clear();
 
   abstract::ClearPrimEvaluatorMap();
   pipeline::GetMethodMap().clear();
diff --git a/mindspore/ccsrc/pipeline/jit/pipeline.h b/mindspore/ccsrc/pipeline/jit/pipeline.h
index 36f5bd433d8..431c4dfa6cb 100644
--- a/mindspore/ccsrc/pipeline/jit/pipeline.h
+++ b/mindspore/ccsrc/pipeline/jit/pipeline.h
@@ -92,7 +92,8 @@ class ExecutorPy : public std::enable_shared_from_this<ExecutorPy> {
   void UpdataParamNodeDefaultInput(const std::string &phase,
                                    const std::unordered_map<std::string, tensor::TensorPtr> &params);
   void RunInitGraph(const py::dict &init_params, const std::string &phase) const;
-  void PyExePath(const py::object &phase);
+  void PyExePath(const py::object &py_exe_path);
+  void KernelBuildServerDir(const py::object &kernel_build_server_dir);
   py::dict GetParameterLayout(const std::string &phase);
   py::dict GetCNodeStrategy(const std::string &phase);
   py::list GetParallelParameterNameList(const std::string &phase);
@@ -140,6 +141,8 @@ bool InitDistribute(const std::map<std::string, std::string> &options);
 void ResetOpId();
 void InitHccl();
 void FinalizeHccl();
+uint32_t GetHcclRankId();
+uint32_t GetHcclRankSize();
 void InitPipeline();
 void FinalizeBackend();
 void ClearResAtexit();
diff --git a/mindspore/ccsrc/pipeline/jit/prim_bprop_optimizer.cc b/mindspore/ccsrc/pipeline/jit/prim_bprop_optimizer.cc
deleted file mode 100644
index 969efd290b0..00000000000
--- a/mindspore/ccsrc/pipeline/jit/prim_bprop_optimizer.cc
+++ /dev/null
@@ -1,372 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <memory>
-#include "ir/func_graph_cloner.h"
-#include "pipeline/jit/prim_bprop_optimizer.h"
-#include "pipeline/jit/pass.h"
-
-namespace mindspore {
-namespace pipeline {
-void PrimBpropOptGraphLevel2Info::TryFreeArgsValue(const ValuePtrList &op_args, const ValuePtr &out) {
-  // args_value_using_info_ contains out
-  if (args_value_using_info_.size() != op_args.size() + 1) {
-    MS_LOG(EXCEPTION) << "param size :" << args_value_using_info_.size()
-                      << " of bp_graph:" << opt_func_graph_->ToString()
-                      << " not match input arguments num:" << op_args.size();
-  }
-
-  ValuePtrList new_args(op_args);
-  (void)new_args.emplace_back(out);
-  TryFreeOneValue(new_args, args_value_using_info_);
-}
-
-void PrimBpropOptGraphLevel2Info::TryFreeOneValue(const ValuePtrList &op_args,
-                                                  const std::vector<ParamUsingInfo> &param_info_vec) {
-  if (param_info_vec.size() != op_args.size()) {
-    MS_LOG(EXCEPTION) << "param size :" << param_info_vec.size() << " of bp_graph:" << opt_func_graph_->ToString()
-                      << " not match input arguments num:" << op_args.size();
-  }
-
-  for (size_t i = 0; i < op_args.size(); ++i) {
-    if (!param_info_vec[i].using_flg_ && !param_info_vec[i].tuple_flg_ && op_args[i]->isa<tensor::Tensor>()) {
-      auto value = op_args[i]->cast<tensor::TensorPtr>();
-      value->set_device_address(nullptr);
-    } else if (param_info_vec[i].tuple_flg_ && op_args[i]->isa<ValueTuple>()) {
-      auto value = op_args[i]->cast<ValueTuplePtr>();
-      MS_EXCEPTION_IF_NULL(value);
-      TryFreeOneValue(value->value(), param_info_vec[i].sub_using_info_);
-    }
-  }
-}
-
-void PrimBpropOptGraphLevel2Info::AnalysisArgUsingInfo(const FuncGraphManagerPtr &manager) {
-  MS_EXCEPTION_IF_NULL(manager);
-  if (analysis_finish_flg_) {
-    return;
-  }
-  MS_EXCEPTION_IF_NULL(opt_func_graph_);
-  auto &params = opt_func_graph_->parameters();
-  const auto &node_users = manager->node_users();
-  args_value_using_info_.resize(params.size() - 1);
-  // analysis value using flg except dout
-  for (size_t i = 0; i < params.size() - 1; ++i) {
-    auto &param = params[i];
-    auto &arg_info = args_value_using_info_[i];
-    ArgInfoRefresh(param, &arg_info);
-    AnalysisNodeUsingInfo(node_users, param, &arg_info);
-  }
-  analysis_finish_flg_ = true;
-}
-
-void PrimBpropOptGraphLevel2Info::AnalysisNodeUsingInfo(const NodeUsersMap &node_users,
-                                                        const std::shared_ptr<AnfNode> &param,
-                                                        ParamUsingInfo *arg_info) const {
-  MS_EXCEPTION_IF_NULL(arg_info);
-  auto iter = node_users.find(param);
-  if (iter == node_users.end()) {
-    arg_info->using_flg_ = false;
-    return;
-  }
-
-  // tensor return directly
-  if (!arg_info->tuple_flg_) {
-    arg_info->using_flg_ = true;
-    return;
-  }
-
-  // specific process for tuple parameter, may only partial items used
-  const auto &users_info = iter->second;
-  for (auto &user_info : users_info) {
-    auto user_node = user_info.first;
-    arg_info->using_flg_ = true;
-    MS_LOG(DEBUG) << "param:" << param->ToString() << " used by node:" << user_node->ToString();
-    if (!IsPrimitiveCNode(user_node, prim::kPrimTupleGetItem)) {
-      for (auto &sub_info : arg_info->sub_using_info_) {
-        sub_info.using_flg_ = true;
-      }
-    } else {
-      AalysisForTupleGetItem(node_users, param, arg_info, user_node);
-    }
-  }
-}
-void PrimBpropOptGraphLevel2Info::AalysisForTupleGetItem(const NodeUsersMap &node_users,
-                                                         const std::shared_ptr<AnfNode> &param,
-                                                         ParamUsingInfo *arg_info, const AnfNodePtr &user_node) const {
-  MS_EXCEPTION_IF_NULL(arg_info);
-  MS_EXCEPTION_IF_NULL(user_node);
-  auto cnode = user_node->cast<CNodePtr>();
-  MS_EXCEPTION_IF_NULL(cnode);
-  const size_t tuple_get_item_size = 3;
-  const size_t index = 2;
-  if (cnode->size() != tuple_get_item_size) {
-    MS_LOG(EXCEPTION) << "TupleGetItem Node:" << user_node->ToString() << " of bp_graph:" << opt_func_graph_->ToString()
-                      << "input size is:" << cnode->size();
-  }
-  auto idx_node = cnode->input(index);
-  if (!idx_node->isa<ValueNode>()) {
-    MS_LOG(EXCEPTION) << "tuple :" << param->ToString() << " of bp_graph:" << opt_func_graph_->ToString()
-                      << " unexpected used by node:" << user_node->ToString()
-                      << " TupleGetItem idx node:" << idx_node->ToString();
-  }
-
-  auto vnode = idx_node->cast<ValueNodePtr>();
-  auto value_ptr = vnode->value();
-  if (value_ptr == nullptr || !value_ptr->isa<Int64Imm>()) {
-    MS_LOG(EXCEPTION) << "tuple :" << param->ToString() << " of bp_graph:" << opt_func_graph_->ToString()
-                      << " unexpected used by node:" << user_node->ToString()
-                      << " TupleGetItem idx node:" << idx_node->ToString() << " idx Value :" << value_ptr;
-  }
-
-  auto idx = LongToSize(value_ptr->cast<Int64ImmPtr>()->value());
-  arg_info->sub_using_info_[idx].using_flg_ = true;
-  ArgInfoRefresh(cnode, &(arg_info->sub_using_info_[idx]));
-
-  if (arg_info->tuple_flg_) {
-    AnalysisNodeUsingInfo(node_users, cnode, &(arg_info->sub_using_info_[idx]));
-  }
-}
-
-void PrimBpropOptGraphLevel2Info::ArgInfoRefresh(const std::shared_ptr<AnfNode> &param,
-                                                 ParamUsingInfo *arg_info) const {
-  MS_EXCEPTION_IF_NULL(arg_info);
-  MS_EXCEPTION_IF_NULL(param);
-  auto abs = param->abstract();
-  MS_EXCEPTION_IF_NULL(abs);
-  if (abs->isa<abstract::AbstractTensor>()) {
-    arg_info->tuple_flg_ = false;
-    MS_LOG(DEBUG) << "param abstract:" << param->ToString() << " is a AbstractTensor";
-  } else if (abs->isa<abstract::AbstractTuple>()) {
-    auto abs_tuple = abs->cast<abstract::AbstractTuplePtr>();
-    MS_LOG(DEBUG) << "param abstract:" << param->ToString() << " is a AbstractTuple";
-    arg_info->tuple_flg_ = true;
-    arg_info->tuple_size_ = abs_tuple->size();
-    arg_info->sub_using_info_.resize(abs_tuple->size());
-  } else {
-    arg_info->tuple_flg_ = false;
-  }
-}
-
-PrimBpropOptimizer &PrimBpropOptimizer::GetPrimBpropOptimizerInst() {
-  static PrimBpropOptimizer g_prim_bprop_opt = PrimBpropOptimizer();
-  return g_prim_bprop_opt;
-}
-
-void PrimBpropOptimizer::Clear() {
-  prim_bprop_cache_.clear();
-  tuple_list_bprop_cache_.clear();
-}
-
-// bprop_fg has the signature:
-// (sens_input1, sens_input2,...)bprop_fg(input1, input2, ..., out, d_out)
-// c_node contains the prim(input 0) and the input parameters of that prim;
-// op_args contains the arguments list of each input parameters, it maybe tensor or tuple
-// out contains the out of c_node;
-FuncGraphPtr PrimBpropOptimizer::OptimizeBPropFuncGraph(const FuncGraphPtr &bprop_fg, const CNodePtr &c_node,
-                                                        const ValuePtrList &op_args, const ValuePtr &out) {
-  MS_EXCEPTION_IF_NULL(bprop_fg);
-  MS_EXCEPTION_IF_NULL(c_node);
-  MS_EXCEPTION_IF_NULL(out);
-  auto &inputs = c_node->inputs();
-  if (inputs.size() < 1 || inputs.size() - 1 != op_args.size()) {
-    MS_LOG(EXCEPTION) << "The parameters num " << inputs.size() - 1 << " not match arguments num " << op_args.size()
-                      << ", CNode:" << c_node->ToString() << " grap:" << bprop_fg->ToString();
-  }
-
-  if (!IsValueNode<Primitive>(inputs[0])) {
-    MS_LOG(EXCEPTION) << "CNode:" << c_node->ToString()
-                      << " not a primitive node, input_0 is:" << inputs[0]->ToString();
-  }
-
-  PrimitivePtr prim = GetValueNode<PrimitivePtr>(inputs[0]);
-  MS_LOG(DEBUG) << "Hash of prim " << prim->ToString() << " is:" << prim->hash();
-
-  //  kPrimHookBackward
-  bool hookback_flg = IsPrimitiveEquals(prim, prim::kPrimHookBackward);
-  if (hookback_flg || IsPrimitiveEquals(prim, prim::kPrimMakeTuple) || IsPrimitiveEquals(prim, prim::kPrimMakeList)) {
-    return GenSpecOptBprop(bprop_fg, op_args, out, prim, hookback_flg);
-  }
-
-  return GetOptBpropFromCache(bprop_fg, op_args, out, prim);
-}
-
-FuncGraphPtr PrimBpropOptimizer::GetOptBpropFromCache(const FuncGraphPtr &bprop_fg, const ValuePtrList &op_args,
-                                                      const ValuePtr &out, const PrimitivePtr &prim) {
-  MS_EXCEPTION_IF_NULL(bprop_fg);
-  abstract::AbstractBasePtrList abs_list;
-  ArgsToAbs(prim, op_args, &abs_list);
-
-  PrimBpropOptGraphLevel2InfoPtr level_2_graph_info;
-  PrimBpropOptGraphInfoPtr level_1_graph_info;
-  ECacheQrtRes cache_res = GetOptBpfgFromCache(prim, abs_list, &level_2_graph_info, &level_1_graph_info);
-
-  MS_LOG(DEBUG) << "Cache match result " << cache_res << ", prim: " << prim->ToString();
-  if (cache_res == E_LEVEL_2) {
-    MS_LOG(DEBUG) << "Level 2 cache matched, prim: " << prim->ToString();
-    level_2_graph_info->TryFreeArgsValue(op_args, out);
-    return BasicClone(level_2_graph_info->opt_func_graph());
-  }
-
-  // do step1 opt
-  if (cache_res == E_NOT_FOUND) {
-    bprop_fg->debug_info()->set_name(prim->ToString());
-    level_1_graph_info = PrimBpropOptStep1(bprop_fg);
-    prim_bprop_cache_[prim] = level_1_graph_info;
-  }
-  FuncGraphPtr level_1_graph = BasicClone(level_1_graph_info->opt_func_graph_);
-
-  // do step2 opt
-  auto new_abs_list = AddOutToAbsList(out, abs_list);
-  level_2_graph_info = PrimBpropOptStep2(level_1_graph, new_abs_list);
-  level_1_graph_info->graph_level_2_cache_[abs_list] = level_2_graph_info;
-  level_2_graph_info->TryFreeArgsValue(op_args, out);
-  return BasicClone(level_2_graph_info->opt_func_graph());
-}
-
-FuncGraphPtr PrimBpropOptimizer::GenSpecOptBprop(const FuncGraphPtr &bprop_fg, const ValuePtrList &op_args,
-                                                 const ValuePtr &out, const PrimitivePtr &prim, bool hook_flg) {
-  MS_EXCEPTION_IF_NULL(bprop_fg);
-  abstract::AbstractBasePtrList abs_list;
-  ArgsToAbs(prim, op_args, &abs_list);
-  if (!hook_flg) {
-    auto iter = tuple_list_bprop_cache_.find(std::pair(prim, abs_list));
-    if (iter != tuple_list_bprop_cache_.end()) {
-      return BasicClone(iter->second);
-    }
-  }
-
-  // do step1 opt
-  bprop_fg->debug_info()->set_name(prim->ToString());
-  auto level_1_graph_info = PrimBpropOptStep1(bprop_fg);
-
-  // do step2 opt
-  auto new_abs_list = AddOutToAbsList(out, abs_list);
-  auto level_2_graph_info = PrimBpropOptStep2(level_1_graph_info->opt_func_graph_, new_abs_list);
-  level_2_graph_info->TryFreeArgsValue(op_args, out);
-
-  if (!hook_flg) {
-    tuple_list_bprop_cache_[std::pair(prim, abs_list)] = BasicClone(level_2_graph_info->opt_func_graph());
-  }
-  return level_2_graph_info->opt_func_graph();
-}
-
-PrimBpropOptGraphInfoPtr PrimBpropOptimizer::PrimBpropOptStep1(const FuncGraphPtr &bprop_fg) {
-  opt::irpass::OptimizeIRPassLib irpass;
-  auto level_1_graph_info = std::make_shared<PrimBpropOptGraphInfo>();
-  auto prim_bprop_opt_res = std::make_shared<pipeline::Resource>();
-  auto prim_bprop_opt_manage = prim_bprop_opt_res->manager();
-  auto graph_for_cache = BasicClone(bprop_fg);
-  prim_bprop_opt_res->set_func_graph(graph_for_cache);
-  prim_bprop_opt_manage->AddFuncGraph(graph_for_cache);
-  auto opt_bprop_fg = PrimBpOptPassStep1(irpass, prim_bprop_opt_res);
-  level_1_graph_info->opt_func_graph_ = opt_bprop_fg;
-  return level_1_graph_info;
-}
-
-void PrimBpropOptimizer::BindAbsToParameters(const FuncGraphPtr &bprop_fg,
-                                             const abstract::AbstractBasePtrList &abs_list_input) {
-  MS_EXCEPTION_IF_NULL(bprop_fg);
-  auto &params = bprop_fg->parameters();
-  if (abs_list_input.size() != params.size()) {
-    MS_LOG(EXCEPTION) << "Param num:" << params.size() << " not match inputs num " << abs_list_input.size();
-  }
-
-  for (size_t i = 0; i < abs_list_input.size(); i++) {
-    params[i]->set_abstract(abs_list_input[i]);
-  }
-}
-
-PrimBpropOptGraphLevel2InfoPtr PrimBpropOptimizer::PrimBpropOptStep2(
-  const FuncGraphPtr &bprop_fg, const abstract::AbstractBasePtrList &abs_list_input) {
-  opt::irpass::OptimizeIRPassLib irpass;
-  BindAbsToParameters(bprop_fg, abs_list_input);
-  pipeline::ResourcePtr resource = std::make_shared<pipeline::Resource>();
-  auto manager = resource->manager();
-  resource->set_func_graph(bprop_fg);
-  manager->AddFuncGraph(bprop_fg);
-  auto opt_bprop_fg = PrimBpOptPassStep2(irpass, resource);
-  auto level_2_graph_info = std::make_shared<PrimBpropOptGraphLevel2Info>(opt_bprop_fg);
-  level_2_graph_info->AnalysisArgUsingInfo(manager);
-  return level_2_graph_info;
-}
-
-FuncGraphPtr PrimBpropOptimizer::BpropGraphFinalOpt(const ResourcePtr &res) const {
-  MS_EXCEPTION_IF_NULL(res);
-  auto after_opt_bg = BpropGraphFinalOptPass(res);
-  return after_opt_bg;
-}
-
-ECacheQrtRes PrimBpropOptimizer::GetOptBpfgFromCache(const PrimitivePtr &prim,
-                                                     const abstract::AbstractBasePtrList &abs_list,
-                                                     PrimBpropOptGraphLevel2InfoPtr *level_2_graph_info,
-                                                     PrimBpropOptGraphInfoPtr *level_1_graph_info) {
-  MS_EXCEPTION_IF_NULL(prim);
-  MS_EXCEPTION_IF_NULL(level_1_graph_info);
-  MS_EXCEPTION_IF_NULL(level_2_graph_info);
-  auto attrs_ = prim->attrs();
-  for (auto &item : attrs_) {
-    MS_LOG(DEBUG) << "prim:" << prim->ToString() << " attr: " << item.first << " value:" << item.second->ToString();
-  }
-
-  auto iter = prim_bprop_cache_.find(prim);
-  if (iter == prim_bprop_cache_.end()) {
-    return E_NOT_FOUND;
-  }
-
-  *level_1_graph_info = iter->second;
-  auto second_iter = (*level_1_graph_info)->graph_level_2_cache_.find(abs_list);
-  if (second_iter == (*level_1_graph_info)->graph_level_2_cache_.end()) {
-    return E_LEVEL_1;
-  }
-  *level_2_graph_info = second_iter->second;
-  return E_LEVEL_2;
-}
-
-void PrimBpropOptimizer::ArgsToAbs(const PrimitivePtr &prim, const ValuePtrList &op_args,
-                                   abstract::AbstractBasePtrList *abs_list) {
-  MS_EXCEPTION_IF_NULL(prim);
-  MS_EXCEPTION_IF_NULL(abs_list);
-  auto const_input_index = prim->get_const_input_indexes();
-  bool have_const_input = !const_input_index.empty();
-  bool is_const_prim = prim->is_const_prim();
-  for (size_t i = 0; i < op_args.size(); ++i) {
-    bool is_const_input =
-      have_const_input && std::find(const_input_index.begin(), const_input_index.end(), i) != const_input_index.end();
-    auto &arg_value = op_args[i];
-    auto arg_abs = arg_value->ToAbstract();
-    if (!is_const_prim && !is_const_input) {
-      arg_abs = arg_abs->PartialBroaden();
-      MS_LOG(DEBUG) << "Broaden for " << prim->ToString();
-    }
-    (void)abs_list->emplace_back(arg_abs);
-  }
-}
-
-abstract::AbstractBasePtrList PrimBpropOptimizer::AddOutToAbsList(const ValuePtr &out,
-                                                                  const abstract::AbstractBasePtrList &abs_list) {
-  MS_EXCEPTION_IF_NULL(out);
-  if (!out->isa<tensor::Tensor>() && !out->isa<ValueTuple>()) {
-    MS_LOG(EXCEPTION) << "Out value not Tensor or Tuple, please check the input arguments.";
-  }
-  abstract::AbstractBasePtrList new_abs_list(abs_list);
-  auto out_abs = out->ToAbstract();
-  out_abs = out_abs->PartialBroaden();
-  (void)new_abs_list.emplace_back(out_abs);
-  (void)new_abs_list.emplace_back(out_abs);
-  return new_abs_list;
-}
-}  // namespace pipeline
-}  // namespace mindspore
diff --git a/mindspore/ccsrc/pipeline/jit/prim_bprop_optimizer.h b/mindspore/ccsrc/pipeline/jit/prim_bprop_optimizer.h
deleted file mode 100644
index be8a8410514..00000000000
--- a/mindspore/ccsrc/pipeline/jit/prim_bprop_optimizer.h
+++ /dev/null
@@ -1,186 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_PIPELINE_JIT_PRIM_BPROP_OPTIMIZER_H
-#define MINDSPORE_CCSRC_PIPELINE_JIT_PRIM_BPROP_OPTIMIZER_H
-
-#include <vector>
-#include <utility>
-#include <unordered_map>
-#include <memory>
-
-#include "frontend/optimizer/irpass.h"
-#include "ir/func_graph.h"
-#include "pipeline/jit/resource.h"
-
-namespace mindspore {
-namespace pipeline {
-struct PrimBpropOptGraphInfo;
-
-class PrimBpropOptGraphLevel2Info;
-
-struct PrimitiveTotalEqual;
-
-struct PrimitiveTupleListHasher;
-
-struct PrimitiveTupleListEqual;
-
-using PrimBpropOptGraphInfoPtr = std::shared_ptr<PrimBpropOptGraphInfo>;
-
-using PrimBpropOptGraphLevel2InfoPtr = std::shared_ptr<PrimBpropOptGraphLevel2Info>;
-
-using PrimBpropCache = std::unordered_map<PrimitivePtr, PrimBpropOptGraphInfoPtr, PrimitiveHasher, PrimitiveTotalEqual>;
-
-using TupleListKey = std::pair<PrimitivePtr, abstract::AbstractBasePtrList>;
-
-using PrimBpropLevel2Cache =
-  std::unordered_map<abstract::AbstractBasePtrList, PrimBpropOptGraphLevel2InfoPtr, abstract::AbstractBasePtrListHasher,
-                     abstract::AbstractBasePtrListEqual>;
-
-using PrimTupleListCache =
-  std::unordered_map<TupleListKey, FuncGraphPtr, PrimitiveTupleListHasher, PrimitiveTupleListEqual>;
-
-struct PrimitiveTupleListHasher {
-  bool operator()(const TupleListKey &key) const {
-    abstract::AbstractBasePtrListHasher hasher;
-    return hasher(key.second);
-  }
-};
-
-struct PrimitiveTupleListEqual {
-  bool operator()(TupleListKey const &t1, TupleListKey const &t2) const {
-    MS_EXCEPTION_IF_NULL(t1.first);
-    MS_EXCEPTION_IF_NULL(t2.first);
-
-    if (!(*t1.first == *t2.first)) {
-      return false;
-    }
-    abstract::AbstractBasePtrListEqual cmp;
-    return cmp(t1.second, t2.second);
-  }
-};
-
-struct PrimitiveTotalEqual {
-  bool operator()(PrimitivePtr const &t1, PrimitivePtr const &t2) const {
-    MS_EXCEPTION_IF_NULL(t1);
-    MS_EXCEPTION_IF_NULL(t2);
-    return *t1 == *t2;
-  }
-};
-
-enum ECacheQrtRes { E_NOT_FOUND, E_LEVEL_1, E_LEVEL_2 };
-
-struct PrimBpropOptGraphInfo {
-  // the level1 opt func_graph without infer, no shape/type info provide
-  FuncGraphPtr opt_func_graph_;
-  // the opt func_graph after infer, func_graph level2 cache
-  PrimBpropLevel2Cache graph_level_2_cache_;
-};
-
-struct ParamUsingInfo {
-  bool using_flg_{false};
-  bool tuple_flg_{false};
-  size_t tuple_size_;
-  std::vector<ParamUsingInfo> sub_using_info_;
-};
-
-class PrimBpropOptGraphLevel2Info {
- public:
-  explicit PrimBpropOptGraphLevel2Info(const FuncGraphPtr &func_graph) : opt_func_graph_(func_graph) {}
-  ~PrimBpropOptGraphLevel2Info() = default;
-
-  const FuncGraphPtr &opt_func_graph() const { return opt_func_graph_; }
-
-  void TryFreeArgsValue(const ValuePtrList &op_args, const ValuePtr &out);
-
-  void AnalysisArgUsingInfo(const FuncGraphManagerPtr &manager);
-
- private:
-  void ArgInfoRefresh(const std::shared_ptr<AnfNode> &param, ParamUsingInfo *arg_info) const;
-
-  void AnalysisNodeUsingInfo(const NodeUsersMap &node_users, const std::shared_ptr<AnfNode> &param,
-                             ParamUsingInfo *arg_info) const;
-
-  void TryFreeOneValue(const ValuePtrList &op_args, const std::vector<ParamUsingInfo> &param_info_vec);
-
-  void AalysisForTupleGetItem(const NodeUsersMap &node_users, const std::shared_ptr<AnfNode> &param,
-                              ParamUsingInfo *arg_info, const AnfNodePtr &user_node) const;
-
- private:
-  // the level2 opt func_graph
-  FuncGraphPtr opt_func_graph_;
-  // to indicate arguments value using or not, if not using should free device memory
-  std::vector<ParamUsingInfo> args_value_using_info_;
-  bool analysis_finish_flg_{false};
-};
-
-class PrimBpropOptimizer {
- public:
-  ~PrimBpropOptimizer() = default;
-
-  void Clear();
-
-  static PrimBpropOptimizer &GetPrimBpropOptimizerInst();
-
-  // bprop_fg has the signature:
-  // (sens_input1, sens_input2,...)bprop_fg(input1, input2, ..., out, d_out)
-  // c_node contains the prim(input 0) and the input parameters of that prim;
-  // op_args contains the arguments list of each input parameters, it maybe tensor or tuple
-  // out contains the out of c_node;
-  FuncGraphPtr OptimizeBPropFuncGraph(const FuncGraphPtr &bprop_fg, const CNodePtr &c_node, const ValuePtrList &op_args,
-                                      const ValuePtr &out);
-
-  // do inline opt for final bprop graph
-  FuncGraphPtr BpropGraphFinalOpt(const ResourcePtr &res) const;
-
- private:
-  PrimBpropOptimizer() = default;
-
-  ECacheQrtRes GetOptBpfgFromCache(const PrimitivePtr &prim, const abstract::AbstractBasePtrList &abs_list,
-                                   PrimBpropOptGraphLevel2InfoPtr *level_2_graph_info,
-                                   PrimBpropOptGraphInfoPtr *level_1_graph_info);
-
-  // converter tensor args to abs value;
-  void ArgsToAbs(const PrimitivePtr &prim, const ValuePtrList &op_args, abstract::AbstractBasePtrList *abs_list);
-
-  // add out && dout to abs list
-  abstract::AbstractBasePtrList AddOutToAbsList(const ValuePtr &out, const abstract::AbstractBasePtrList &abs_list);
-
-  // do opt without input info, no infer
-  PrimBpropOptGraphInfoPtr PrimBpropOptStep1(const FuncGraphPtr &bprop_fg);
-
-  // do opt with input info
-  PrimBpropOptGraphLevel2InfoPtr PrimBpropOptStep2(const FuncGraphPtr &bprop_fg,
-                                                   const abstract::AbstractBasePtrList &abs_list_input);
-
-  void BindAbsToParameters(const FuncGraphPtr &bprop_fg, const abstract::AbstractBasePtrList &abs_list_input);
-
-  FuncGraphPtr GetOptBpropFromCache(const FuncGraphPtr &bprop_fg, const ValuePtrList &op_args, const ValuePtr &out,
-                                    const PrimitivePtr &prim);
-
-  FuncGraphPtr GenSpecOptBprop(const FuncGraphPtr &bprop_fg, const ValuePtrList &op_args, const ValuePtr &out,
-                               const PrimitivePtr &prim, bool hook_flg);
-
- private:
-  // cache optimized bprop graph
-  PrimBpropCache prim_bprop_cache_;
-  PrimTupleListCache tuple_list_bprop_cache_;
-};
-
-}  // namespace pipeline
-}  // namespace mindspore
-
-#endif  // MINDSPORE_CCSRC_PIPELINE_JIT_PRIM_BPROP_OPTIMIZER_H
diff --git a/mindspore/ccsrc/pipeline/jit/resource.h b/mindspore/ccsrc/pipeline/jit/resource.h
index f31bf37376c..9a859efd3f3 100644
--- a/mindspore/ccsrc/pipeline/jit/resource.h
+++ b/mindspore/ccsrc/pipeline/jit/resource.h
@@ -75,14 +75,14 @@ class Resource : public ResourceBase {
   const abstract::AbstractBasePtrList &args_spec() const { return args_spec_; }
   void set_args_spec(const abstract::AbstractBasePtrList &args_spec) { args_spec_ = args_spec; }
 
-  void set_gpu_loopsink(const bool &flag, const int64_t size) {
-    gpu_loopsink_flag_ = flag;
-    gpu_loopsink_size_ = size;
+  void set_vm_loop(const bool &flag, const int64_t size) {
+    vm_loop_flag_ = flag;
+    loop_size_ = size;
   }
   void set_is_load(bool flag) { is_load_ = flag; }
   bool is_load() { return is_load_; }
-  bool gpu_loopsink_flag() { return gpu_loopsink_flag_; }
-  int64_t gpu_loopsink_size() { return gpu_loopsink_size_; }
+  bool vm_loop_flag() { return vm_loop_flag_; }
+  int64_t loop_size() { return loop_size_; }
   // Reclaim resource and clear the cache.
   // ExecutorPy::Compile() can be called multiple times, so cache
   // should be cleared.
@@ -94,10 +94,10 @@ class Resource : public ResourceBase {
   abstract::AbstractBasePtrList args_spec_;
   py::object input_;
   bool is_cleaned_;
-  bool gpu_loopsink_flag_{false};
   // The func_graph_ is loaded from mindir
   bool is_load_{false};
-  int64_t gpu_loopsink_size_{1};
+  bool vm_loop_flag_{false};
+  int64_t loop_size_{1};
 };
 
 using ResourcePtr = std::shared_ptr<pipeline::Resource>;
diff --git a/mindspore/ccsrc/pipeline/jit/static_analysis/async_eval_result.h b/mindspore/ccsrc/pipeline/jit/static_analysis/async_eval_result.h
index 4a5e0bf0681..a9d5ba36cc2 100644
--- a/mindspore/ccsrc/pipeline/jit/static_analysis/async_eval_result.h
+++ b/mindspore/ccsrc/pipeline/jit/static_analysis/async_eval_result.h
@@ -293,6 +293,11 @@ class EvaluatorCacheMgr {
 // AnalysisCache
 class AnalysisResultCacheMgr {
  public:
+  using AnalysisConfigResultMap =
+    std::unordered_map<AnfNodeConfigPtr, EvalResultPtr, AnfNodeConfigHasher, AnfNodeConfigEqual>;
+  using AnalysisConfigResultCache = NormalCache<AnfNodeConfigPtr, EvalResultPtr, AnalysisConfigResultMap>;
+  using const_iterator = typename AnalysisConfigResultCache::const_iterator;
+
   ~AnalysisResultCacheMgr() = default;
   AnalysisResultCacheMgr(const AnalysisResultCacheMgr &) = delete;
   AnalysisResultCacheMgr &operator=(const AnalysisResultCacheMgr &) = delete;
@@ -306,17 +311,14 @@ class AnalysisResultCacheMgr {
   AbstractBasePtr GetSwitchValue(const AnfNodeConfigPtr &conf);
   AbstractBasePtr TryGetSwitchValue(const AnfNodeConfigPtr &conf);
   void SetSwitchValue(const AnfNodeConfigPtr &conf, const AbstractBasePtr &vale);
+  const_iterator begin() { return cache_.begin(); }
+  const_iterator end() { return cache_.end(); }
 
  private:
   using AnalysisConfigAsyncResultMap =
     std::unordered_map<AnfNodeConfigPtr, AsyncAbstractPtr, AnfNodeConfigHasher, AnfNodeConfigEqual>;
   using AnalysisConfigAsyncResultCache =
     MultiThreadCache<AnfNodeConfigPtr, AsyncAbstractPtr, AnalysisConfigAsyncResultMap>;
-
-  using AnalysisConfigResultMap =
-    std::unordered_map<AnfNodeConfigPtr, EvalResultPtr, AnfNodeConfigHasher, AnfNodeConfigEqual>;
-  using AnalysisConfigResultCache = NormalCache<AnfNodeConfigPtr, EvalResultPtr, AnalysisConfigResultMap>;
-
   AnalysisResultCacheMgr() = default;
   static AnalysisResultCacheMgr instance_;
   std::mutex lock_;
diff --git a/mindspore/ccsrc/pipeline/jit/static_analysis/order_enforce.cc b/mindspore/ccsrc/pipeline/jit/static_analysis/order_enforce.cc
index b13061ee351..53f175447d2 100644
--- a/mindspore/ccsrc/pipeline/jit/static_analysis/order_enforce.cc
+++ b/mindspore/ccsrc/pipeline/jit/static_analysis/order_enforce.cc
@@ -76,15 +76,17 @@ class OrderEnforcer {
     }
   }
 
-  bool CheckMakeTupleHaveLoad(const CNodePtr &cnode) {
+  std::unordered_set<AnfNodePtr> CheckMakeTupleHaveLoad(const CNodePtr &cnode) {
+    MS_EXCEPTION_IF_NULL(cnode);
+    std::unordered_set<AnfNodePtr> loads;
     auto inputs = cnode->inputs();
     for (size_t index = 1; index < inputs.size(); index++) {
       auto input = cnode->input(index);
       if (IsPrimitiveCNode(input, prim::kPrimLoad)) {
-        return true;
+        loads.insert(input);
       }
     }
-    return false;
+    return loads;
   }
 
   std::vector<AnfNodePtr> FindUpdateStateUsers(const CNodePtr &cnode) {
@@ -155,23 +157,31 @@ class OrderEnforcer {
   // u3 = UpdateState(u', maketuple2, addn) # need put addn or other-op into u3 inputs
   // assign = Assign(para2, inputs, u3)
   void HandleMakeTupleUsers(const AnfNodePtr &node) {
+    MS_EXCEPTION_IF_NULL(node);
     auto maketuple = node->cast<CNodePtr>();
     MS_EXCEPTION_IF_NULL(maketuple);
-    if (CheckMakeTupleHaveLoad(maketuple)) {
+    std::unordered_set<AnfNodePtr> loads = CheckMakeTupleHaveLoad(maketuple);
+    if (!loads.empty()) {
       auto update_state = FindLastUpdateState(maketuple);
       if (update_state != nullptr) {
         std::unordered_set<AnfNodePtr> maketuple_users = GetSpecialOperatorRealUsers(maketuple);
-        std::unordered_set<AnfNodePtr> no_push_maketuple_users;
+        std::unordered_set<AnfNodePtr> no_push_all_users;
         // Push and Pull at the end of the execution order,
         // In order to ensure push and pull operator cut into the same graph, do not put push operator into updatestate
         for (auto maketuple_user : maketuple_users) {
           if (!IsPrimitiveCNode(maketuple_user, prim::kPrimPush)) {
-            no_push_maketuple_users.insert(maketuple_user);
+            no_push_all_users.insert(maketuple_user);
+          }
+        }
+        for (auto load : loads) {
+          std::unordered_set<AnfNodePtr> load_users = GetSpecialOperatorRealUsers(load);
+          for (auto load_user : load_users) {
+            no_push_all_users.insert(load_user);
           }
         }
         auto update_state_cnode = update_state->cast<CNodePtr>();
         MS_EXCEPTION_IF_NULL(update_state_cnode);
-        AddInputEdges(update_state_cnode, no_push_maketuple_users);
+        AddInputEdges(update_state_cnode, no_push_all_users);
       }
     }
   }
@@ -265,6 +275,8 @@ class OrderEnforcer {
   // Add load users as input edges of the update_state node.
   void AddInputEdges(const CNodePtr &update_state, const std::unordered_set<AnfNodePtr> &load_users) {
     auto sorted_load_users = SortLoadUsers(load_users);
+    auto inputs = update_state->inputs();
+    size_t origin_size = inputs.size();
     for (auto &load_user : sorted_load_users) {
       if (IsPrimitiveCNode(load_user, prim::kPrimMakeTuple) || IsPrimitiveCNode(load_user, prim::kPrimUpdateState)) {
         continue;
@@ -272,10 +284,16 @@ class OrderEnforcer {
       if (!IsDependOn(load_user, update_state)) {
         processed_nodes_.insert(load_user);
         if (!IsInUpdateState(load_user, update_state)) {
-          manager_->AddEdge(update_state, load_user);
+          inputs.emplace_back(load_user);
         }
       }
     }
+    if (inputs.size() > origin_size) {
+      auto new_update_state = func_graph_->NewCNode(inputs);
+      new_update_state->set_abstract(update_state->abstract());
+      new_update_state->set_scope(update_state->scope());
+      manager_->Replace(update_state, new_update_state);
+    }
   }
 
   // Sort load users by their topo sort order.
@@ -373,7 +391,6 @@ class OrderEnforcer {
   std::unordered_map<AnfNodePtr, size_t> topo_sort_map_;
   std::unordered_set<AnfNodePtr> processed_nodes_;
 };
-
 }  // namespace
 
 // Enforce order of execution for Load users node.
diff --git a/mindspore/ccsrc/pipeline/jit/static_analysis/prim.cc b/mindspore/ccsrc/pipeline/jit/static_analysis/prim.cc
index 2cadc158521..2530b4f3330 100644
--- a/mindspore/ccsrc/pipeline/jit/static_analysis/prim.cc
+++ b/mindspore/ccsrc/pipeline/jit/static_analysis/prim.cc
@@ -158,7 +158,6 @@ EvalResultPtr UnpackGraphEvaluator::Run(AnalysisEnginePtr engine, const ConfigPt
                       << " args size should equal to inputs size minus 1, but args size " << args_conf_list.size()
                       << ", inputs size " << out_node_inputs.size();
   }
-  AnfNodePtrList args_inputs{out_node_inputs.begin() + 1, out_node_inputs.end()};
   AbstractBasePtrList args_spec_list;
   (void)std::transform(args_conf_list.begin(), args_conf_list.end(), std::back_inserter(args_spec_list),
                        [](const ConfigPtr &ref) -> AbstractBasePtr {
@@ -512,7 +511,7 @@ py::dict ConvertAbstractToPython(const AbstractBasePtr &abs_base) {
 }
 
 namespace {
-py::tuple PreparePyInputs(const PrimitivePyPtr &prim_py, const AbstractBasePtrList &args) {
+py::tuple PreparePyInputs(const PrimitivePyPtr &, const AbstractBasePtrList &args) {
   // The monad parameter is defined at the end of the parameter and needs to be ignored
   std::size_t size_args = args.size() - GetAbstractMonadNum(args);
   py::tuple py_args(size_args);
@@ -862,8 +861,7 @@ EvalResultPtr StaticGetterInferred(const ValuePtr &value, const ConfigPtr &data_
   return eng->ForwardConfig(old_conf, fn_conf);
 }
 
-EvalResultPtr GetEvaluatedValueForNameSpaceString(const AnalysisEnginePtr &engine,
-                                                  const AbstractBasePtrList &args_spec_list,
+EvalResultPtr GetEvaluatedValueForNameSpaceString(const AnalysisEnginePtr &, const AbstractBasePtrList &args_spec_list,
                                                   const AnfNodeConfigPtr &out_conf) {
   // args_spec_list: same as StaticGetter
   if (args_spec_list.size() < 2) {
diff --git a/mindspore/ccsrc/pipeline/jit/static_analysis/program_specialize.cc b/mindspore/ccsrc/pipeline/jit/static_analysis/program_specialize.cc
index 582bca476f3..f1195940b9b 100644
--- a/mindspore/ccsrc/pipeline/jit/static_analysis/program_specialize.cc
+++ b/mindspore/ccsrc/pipeline/jit/static_analysis/program_specialize.cc
@@ -615,8 +615,9 @@ std::pair<AbstractBasePtrList, AbstractBasePtr> FuncGraphSpecializer::BuildFromB
     MS_LOG(DEBUG) << "Broaded_argvals: " << broaded_argvals.size() << ", " << ::mindspore::ToString(broaded_argvals);
   }
   if (choices.size() == 1) {
-    if (args_vector.size() < 2) {
-      MS_LOG(EXCEPTION) << "Should have 2 more choices, but: " << args_vector.size();
+    constexpr auto args_size = 2;
+    if (args_vector.size() < args_size) {
+      MS_LOG(EXCEPTION) << "Should have " << args_size << " or more choices, but: " << args_vector.size();
     }
     AbstractBasePtrList joined_argvals = args_vector[0];
     for (size_t i = 1; i < args_vector.size(); ++i) {
diff --git a/mindspore/ccsrc/pipeline/jit/static_analysis/static_analysis.cc b/mindspore/ccsrc/pipeline/jit/static_analysis/static_analysis.cc
index 69897e1805b..6616fc738dd 100644
--- a/mindspore/ccsrc/pipeline/jit/static_analysis/static_analysis.cc
+++ b/mindspore/ccsrc/pipeline/jit/static_analysis/static_analysis.cc
@@ -359,7 +359,6 @@ void AnalysisEngine::Clear() {
   root_context_ = nullptr;
 }
 
-namespace {
 EvaluatorPtr GetPrimEvaluator(const PrimitivePtr &prim, const AnalysisEnginePtr &engine) {
   // Custom Primitive with python infer_shape, infer_type
   MS_EXCEPTION_IF_NULL(prim);
@@ -396,7 +395,8 @@ EvaluatorPtr GetPrimEvaluator(const PrimitivePtr &prim, const AnalysisEnginePtr
       engine->prim_py_evaluators_[prim_py] = evaluator;
       return evaluator;
     }
-    MS_LOG(EXCEPTION) << "The primitive with python evaluator should be a python primitive.";
+    MS_LOG(ERROR) << "The primitive with python evaluator should be a python primitive.";
+    return nullptr;
   }
 
   // return a default evaluator
@@ -416,11 +416,10 @@ EvaluatorPtr GetPrimEvaluator(const PrimitivePtr &prim, const AnalysisEnginePtr
     }
   }
   if (evaluator == nullptr) {
-    MS_LOG(EXCEPTION) << "The evaluator of the primitive is not defined (" << prim->name() << ").";
+    MS_LOG(DEBUG) << "The evaluator of the primitive is not defined (" << prim->name() << ").";
   }
   return evaluator;
 }
-}  // namespace
 
 EvaluatorPtr AnalysisEngine::_GetEvaluatorFor(const std::shared_ptr<PrimitiveAbstractClosure> &func) {
   MS_EXCEPTION_IF_NULL(func);
@@ -430,6 +429,9 @@ EvaluatorPtr AnalysisEngine::_GetEvaluatorFor(const std::shared_ptr<PrimitiveAbs
   }
   auto primitive = func->prim();
   auto evaluator = GetPrimEvaluator(primitive, shared_from_this());
+  if (evaluator == nullptr) {
+    MS_LOG(EXCEPTION) << "The evaluator of the primitive is not defined (" << primitive->name() << ").";
+  }
   evaluators_[func] = evaluator;
   return evaluator;
 }
@@ -1012,7 +1014,9 @@ AbstractBasePtr FromValueInside(const ValuePtr &value, bool broaden) {
 
 EvalResultPtr EvalOnePrim(const PrimitivePtr &primitive, const AbstractBasePtrList &arg_specs) {
   auto evaluator = GetPrimEvaluator(primitive, nullptr);
-  MS_EXCEPTION_IF_NULL(evaluator);
+  if (evaluator == nullptr) {
+    MS_LOG(EXCEPTION) << "The evaluator of the primitive is not defined (" << primitive->name() << ").";
+  }
   if (!evaluator->isa<TrivialPrimEvaluator>()) {
     MS_LOG(EXCEPTION) << "Prim " << primitive->ToString() << " should build a TrivialPrimEvaluator, but "
                       << evaluator->ToString();
diff --git a/mindspore/ccsrc/pipeline/jit/static_analysis/static_analysis.h b/mindspore/ccsrc/pipeline/jit/static_analysis/static_analysis.h
index 0f22e48de42..73a3bad5afd 100644
--- a/mindspore/ccsrc/pipeline/jit/static_analysis/static_analysis.h
+++ b/mindspore/ccsrc/pipeline/jit/static_analysis/static_analysis.h
@@ -347,7 +347,7 @@ template <typename T>
 AbstractBasePtr FromValue(const T &value, bool broaden = false) {
   return FromValueInside(MakeValue(value), broaden);
 }
-
+EvaluatorPtr GetPrimEvaluator(const PrimitivePtr &prim, const AnalysisEnginePtr &engine);
 EvalResultPtr EvalOnePrim(const PrimitivePtr &p, const AbstractBasePtrList &arg_specs);
 }  // namespace abstract
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc b/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc
index 8ffa2642bc6..9c312fa0882 100644
--- a/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc
+++ b/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc
@@ -32,6 +32,7 @@
 #include "ir/tensor.h"
 #include "utils/any.h"
 #include "utils/utils.h"
+#include "utils/profile.h"
 #include "utils/ms_context.h"
 #include "utils/check_convert_utils.h"
 #include "utils/context/context_extends.h"
@@ -59,7 +60,7 @@
 #include "pipeline/jit/resource.h"
 #include "pipeline/jit/pass.h"
 #include "frontend/parallel/context.h"
-#include "pipeline/jit/prim_bprop_optimizer.h"
+#include "frontend/optimizer/ad/prim_bprop_optimizer.h"
 
 #ifdef ENABLE_GE
 #include "pipeline/pynative/pynative_execute_ge.h"
@@ -67,6 +68,7 @@
 
 #include "debug/anf_ir_dump.h"
 #include "runtime/hardware/device_context_manager.h"
+#include "runtime/device/pynative_profiling.h"
 
 using mindspore::tensor::TensorPy;
 
@@ -79,6 +81,7 @@ std::mutex PynativeExecutor::instance_lock_;
 namespace {
 const size_t PTR_LEN = 15;
 const size_t ARG_SIZE = 2;
+const size_t MAX_TOP_CELL_COUNTS = 20;
 
 // primitive unable to infer value for constant input in PyNative mode
 const std::set<std::string> kVmOperators = {"make_ref", "HookBackward", "InsertGradientOf", "stop_gradient",
@@ -345,7 +348,7 @@ std::string GetSingleOpGraphInfo(const OpExecInfoPtr &op_exec_info, const std::v
   return graph_info;
 }
 
-py::args FilterTensorArgs(const py::args &args, bool has_sens = false) {
+py::list FilterTensorArgs(const py::args &args, bool has_sens = false) {
   size_t size = args.size();
   if (size == 0 && has_sens) {
     MS_LOG(EXCEPTION) << "The size of args is 0, when the flag of sens is set to True";
@@ -702,6 +705,9 @@ py::object GetDstType(const TypeId &type_id) {
 }  // namespace
 
 py::object RealRunOp(const py::args &args) {
+  auto real_run_op_start = GetTime();
+  auto &profiler_inst = device::PynativeProfiler::GetInstance();
+  profiler_inst.AddRealRunOpIndex();
   CheckPyNativeContext();
   auto executor = PynativeExecutor::GetInstance();
   MS_EXCEPTION_IF_NULL(executor);
@@ -709,6 +715,10 @@ py::object RealRunOp(const py::args &args) {
   MS_EXCEPTION_IF_NULL(op_exec_info);
   py::object ret = py::none();
   PynativeExecutorTry(executor->forward_executor()->RunOpS, &ret, op_exec_info);
+  auto real_run_op_end = GetTime();
+  profiler_inst.SetRealRunOpName(op_exec_info->op_name);
+  profiler_inst.SetRealRunOpTime(std::make_pair(real_run_op_start, real_run_op_end));
+  profiler_inst.SingleOpProfilingData();
   return ret;
 }
 
@@ -975,6 +985,8 @@ void ForwardExecutor::GetOpOutputAbstract(const OpExecInfoPtr &op_exec_info,
 
   if (shape->IsDynamic()) {
     op_exec_info->is_dynamic_shape = true;
+    // Dynamic shape operator in the current top cell, disable backend cache
+    grad()->EnableOpGraphCache(false);
   }
 }
 
@@ -998,7 +1010,7 @@ void ForwardExecutor::GetOpOutput(const OpExecInfoPtr &op_exec_info,
   }
 
   // Add output abstract info into cache, the const value needs to infer evert step
-  if (!prim_cache_hit && !op_exec_info->is_dynamic_shape) {
+  if (grad()->enable_op_cache() && !prim_cache_hit && !op_exec_info->is_dynamic_shape) {
     AbsCacheKey key{prim->name(), prim->Hash(), prim->attrs()};
     auto &out = prim_abs_list_[key];
     out[args_spec_list].abs = op_exec_info->abstract;
@@ -1320,6 +1332,13 @@ TopCellInfoPtr GradExecutor::GetTopCell(const std::string &cell_id) const {
   return nullptr;
 }
 
+void GradExecutor::EnableOpGraphCache(bool is_enable) {
+  enable_op_cache_ = is_enable;
+  const auto inst = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(inst);
+  inst->set_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE, is_enable);
+}
+
 void GradExecutor::RecordGradOpInfo(const OpExecInfoPtr &op_exec_info, const py::object &ret) {
   if (!grad_flag_) {
     MS_LOG(DEBUG) << "Grad flag is set to false, no need to record op info";
@@ -1497,7 +1516,7 @@ void GradExecutor::UpdateForwardTensorInfoInBpropGraph(const OpExecInfoPtr &op_e
   }
 
   // First run top cell
-  if (already_run_top_cell_.find(top_cell_->cell_id()) == already_run_top_cell_.end()) {
+  if (already_run_top_cell_.find(top_cell_->already_run_cell_id()) == already_run_top_cell_.end()) {
     MS_LOG(DEBUG) << "Top cell " << top_cell_->cell_id() << " run firstly";
     if (!need_construct_graph()) {
       MS_LOG(EXCEPTION) << "The cell stack is empty when running a new top cell " << top_cell_->cell_id();
@@ -1505,7 +1524,7 @@ void GradExecutor::UpdateForwardTensorInfoInBpropGraph(const OpExecInfoPtr &op_e
     return;
   }
   // Non-first run
-  const auto &pre_top_cell = already_run_top_cell_.at(top_cell_->cell_id());
+  const auto &pre_top_cell = already_run_top_cell_.at(top_cell_->already_run_cell_id());
   MS_EXCEPTION_IF_NULL(pre_top_cell);
   if (pre_top_cell->op_info_with_tensor_id().find(op_info) == pre_top_cell->op_info_with_tensor_id().end()) {
     MS_LOG(DEBUG) << "Can not find op info " << op_info << " in op info with tensor id map. Top cell "
@@ -1582,7 +1601,7 @@ py::tuple ForwardExecutor::RunOpWithInitBackendPolicy(const OpExecInfoPtr &op_ex
 }
 
 MsBackendPolicy ForwardExecutor::InitEnv(const OpExecInfoPtr &op_exec_info) {
-  MS_LOG(INFO) << "RunOp start, op name is: " << op_exec_info->op_name;
+  MS_LOG(DEBUG) << "RunOp start, op name is: " << op_exec_info->op_name;
   parse::python_adapter::set_python_env_flag(true);
   MsBackendPolicy backend_policy;
 #if (!defined ENABLE_GE)
@@ -1877,13 +1896,12 @@ void GradExecutor::ClearCellRes(const std::string &cell_id) {
   }
   // clear when cell destruction
   for (auto it = top_cell_list_.begin(); it != top_cell_list_.end();) {
-    auto top_cell_id = (*it)->cell_id();
+    const auto &top_cell_id = (*it)->cell_id();
+    const auto &alreay_top_cell_id = (*it)->already_run_cell_id();
     if (IsCellObjIdEq(cell_id, top_cell_id)) {
       (*it)->Clear();
       it = top_cell_list_.erase(it);
-      if (already_run_top_cell_.find(top_cell_id) != already_run_top_cell_.end()) {
-        (void)already_run_top_cell_.erase(top_cell_id);
-      }
+      (void)already_run_top_cell_.erase(alreay_top_cell_id);
       MS_LOG(DEBUG) << "Clear top cell resource. Top cell id " << top_cell_id;
       continue;
     }
@@ -1934,7 +1952,7 @@ void GradExecutor::HandleInputArgsForTopCell(const py::args &args, bool is_bprop
   }
   // Convert input args to parameters for top cell graph in construct.
   std::vector<ValuePtr> input_param_values;
-  py::args only_tensors = FilterTensorArgs(args);
+  const auto &only_tensors = FilterTensorArgs(args);
   auto df_builder = GetDfbuilder(top_cell_->cell_id());
   MS_EXCEPTION_IF_NULL(df_builder);
   for (size_t i = 0; i < only_tensors.size(); ++i) {
@@ -1999,11 +2017,18 @@ void GradExecutor::InitResourceAndDfBuilder(const std::string &cell_id, const py
 
 void GradExecutor::NewGraphInner(py::object *ret, const py::object &cell, const py::args &args) {
   MS_EXCEPTION_IF_NULL(ret);
-  auto cell_id = GetCellId(cell, args);
+  const auto &cell_id = GetCellId(cell, args);
   MS_LOG(DEBUG) << "NewGraphInner start " << args.size() << " " << cell_id;
   if (top_cell_ != nullptr && cell_stack_.empty()) {
+    // Already run top cell need distinguish high order; high order add "0" otherwise "1"
+    std::string already_run_cell_id;
+    if (IsNestedGrad()) {
+      already_run_cell_id = cell_id + "0";
+    } else {
+      already_run_cell_id = cell_id + "1";
+    }
     // Whether it is top and has been run
-    auto top_it = already_run_top_cell_.find(cell_id);
+    auto top_it = already_run_top_cell_.find(already_run_cell_id);
     if (top_it != already_run_top_cell_.end()) {
       // Top cell forward run.
       const auto &pre_top_cell = top_it->second;
@@ -2014,8 +2039,8 @@ void GradExecutor::NewGraphInner(py::object *ret, const py::object &cell, const
         set_top_cell(pre_top_cell);
         return;
       }
-    } else if ((top_cell()->IsSubCell(cell_id) && !IsCellObjIdEq(cell_id, check_graph_cell_id_)) ||
-               GetHighOrderStackSize() >= 1) {
+    } else if ((top_cell()->IsSubCell(cell_id) || GetHighOrderStackSize() >= 1) &&
+               !IsCellObjIdEq(cell_id, check_graph_cell_id_)) {
       // Sub cell ( or may be a temporary cell, but must be non top) forward run in cache process.
       MS_LOG(DEBUG) << "Sub cell no need to run NewGraphInner again";
       return;
@@ -2048,6 +2073,15 @@ void GradExecutor::MakeNewTopGraph(const string &cell_id, const py::args &args,
   if (grad_order_ == 0) {
     ++grad_order_;
   }
+  // The number of top cell exceeds MAX_TOP_CELL_COUNTS, delete the last one to keep the maximum length of the list,
+  // disable backend cache
+  if (top_cell_list_.size() >= MAX_TOP_CELL_COUNTS) {
+    EnableOpGraphCache(false);
+    const auto last_top_cell = top_cell_list_.back();
+    top_cell_list_.pop_back();
+    last_top_cell->Clear();
+    (void)already_run_top_cell_.erase(last_top_cell->already_run_cell_id());
+  }
   // Create top cell
   curr_g_ = std::make_shared<FuncGraph>();
   auto df_builder = std::make_shared<FuncGraph>();
@@ -2360,7 +2394,7 @@ std::vector<AnfNodePtr> GradExecutor::GetWeightsArgs(const py::object &weights,
   return w_args;
 }
 
-abstract::AbstractBasePtrList GradExecutor::GetArgsSpec(const py::args &args, const FuncGraphPtr &bprop_graph) {
+abstract::AbstractBasePtrList GradExecutor::GetArgsSpec(const py::list &args, const FuncGraphPtr &bprop_graph) {
   MS_EXCEPTION_IF_NULL(bprop_graph);
   std::size_t size = args.size();
   abstract::AbstractBasePtrList args_spec;
@@ -2447,7 +2481,7 @@ FuncGraphPtr GradExecutor::GetBpropGraph(const prim::GradOperationPtr &grad, con
   auto manager = resource->manager();
   MS_EXCEPTION_IF_NULL(manager);
   manager->AddFuncGraph(bprop_graph);
-  auto optimized_bg = pipeline::PrimBpropOptimizer::GetPrimBpropOptimizerInst().BpropGraphFinalOpt(resource);
+  auto optimized_bg = ad::PrimBpropOptimizer::GetPrimBpropOptimizerInst().BpropGraphFinalOpt(resource);
 
   if (cell_stack_.empty()) {
     need_renormalize_ = false;
@@ -2506,25 +2540,32 @@ py::object PynativeExecutor::CheckAlreadyRun(const py::object &cell, const py::a
 
 void GradExecutor::CheckNeedCompileGraph() {
   auto new_top_cell = top_cell();
-  std::string top_cell_id = new_top_cell->cell_id();
-  // update top cell by current cell op info
-  if (already_run_top_cell_.find(top_cell_id) == already_run_top_cell_.end()) {
-    MS_LOG(DEBUG) << "Top cell " << top_cell_id << " has never been ran, need compile graph";
-    already_run_top_cell_[top_cell_id] = new_top_cell;
+  const auto &already_top_cell_id = new_top_cell->already_run_cell_id();
+  // Update top cell by current cell op info
+  if (already_run_top_cell_.find(already_top_cell_id) == already_run_top_cell_.end()) {
+    MS_LOG(DEBUG) << "Top cell " << new_top_cell->cell_id() << " has never been ran, need compile graph";
+    already_run_top_cell_[already_top_cell_id] = new_top_cell;
     return;
   }
 
-  MS_LOG(DEBUG) << "Top cell " << top_cell_id << " has been ran";
-  auto pre_top_cell = already_run_top_cell_.at(top_cell_id);
+  MS_LOG(DEBUG) << "Top cell " << new_top_cell->cell_id() << " has been ran";
+  auto pre_top_cell = already_run_top_cell_.at(already_top_cell_id);
   auto pre_all_op_info = pre_top_cell->all_op_info();
   auto new_all_op_info = new_top_cell->all_op_info();
   MS_LOG(DEBUG) << "Pre all op info : " << pre_all_op_info;
   MS_LOG(DEBUG) << "New all op info : " << new_all_op_info;
   if (pre_all_op_info != new_all_op_info) {
     MS_LOG(DEBUG) << "The op info has been changed, need to compile graph again";
+    // The top cell switches exceeds MAX_TOP_CELL_COUNTS under the control flow, disable backend cache
+    if (top_cell_switch_counts_ >= MAX_TOP_CELL_COUNTS) {
+      EnableOpGraphCache(false);
+    } else {
+      // Increase top cell switches counts
+      ++top_cell_switch_counts_;
+    }
     EraseTopCellFromTopCellList(pre_top_cell);
     pre_top_cell->Clear();
-    already_run_top_cell_[top_cell_id] = new_top_cell;
+    already_run_top_cell_[already_top_cell_id] = new_top_cell;
   } else {
     MS_LOG(DEBUG) << "The op info has not been changed, no need to compile graph again";
     pre_top_cell->set_input_args_id(new_top_cell->input_args_id());
@@ -2773,9 +2814,11 @@ void GradExecutor::ClearGrad(const py::object &cell, const py::args &args) {
 void GradExecutor::ClearRes() {
   MS_LOG(DEBUG) << "Clear grad res";
   grad_order_ = 0;
+  top_cell_switch_counts_ = 0;
   grad_flag_ = false;
   need_renormalize_ = false;
   grad_is_running_ = false;
+  enable_op_cache_ = true;
   top_cell_ = nullptr;
   curr_g_ = nullptr;
   bprop_cell_list_.clear();
@@ -2800,6 +2843,24 @@ void PynativeExecutor::set_graph_phase(const std::string &graph_phase) {
   grad_executor()->set_graph_phase(graph_phase);
 }
 
+void PynativeExecutor::set_py_exe_path(const py::object &py_exe_path) {
+  if (!py::isinstance<py::str>(py_exe_path)) {
+    MS_LOG(EXCEPTION) << "Failed, py_exe_path input is not a str";
+  }
+  auto py_exe_path_s = py::cast<std::string>(py_exe_path);
+  auto ms_context = MsContext::GetInstance();
+  ms_context->set_param<std::string>(MS_CTX_PYTHON_EXE_PATH, py_exe_path_s);
+}
+
+void PynativeExecutor::set_kernel_build_server_dir(const py::object &kernel_build_server_dir) {
+  if (!py::isinstance<py::str>(kernel_build_server_dir)) {
+    MS_LOG(EXCEPTION) << "Failed, kernel_build_server_dir input is not a str";
+  }
+  auto kernel_build_server_dir_s = py::cast<std::string>(kernel_build_server_dir);
+  auto ms_context = MsContext::GetInstance();
+  ms_context->set_param<std::string>(MS_CTX_KERNEL_BUILD_SERVER_DIR, kernel_build_server_dir_s);
+}
+
 py::object PynativeExecutor::CheckGraph(const py::object &cell, const py::args &args) {
   return grad_executor()->CheckGraph(cell, args);
 }
@@ -2904,6 +2965,11 @@ REGISTER_PYBIND_DEFINE(PynativeExecutor_, ([](const py::module *m) {
                            .def("__call__", &PynativeExecutor::Run, "pynative executor run grad graph.")
                            .def("set_graph_phase", &PynativeExecutor::set_graph_phase, "pynative set graph phase")
                            .def("set_grad_flag", &PynativeExecutor::set_grad_flag, py::arg("flag") = py::bool_(false),
-                                "Executor set grad flag.");
+                                "Executor set grad flag.")
+                           .def("set_py_exe_path", &PynativeExecutor::set_py_exe_path,
+                                py::arg("py_exe_path") = py::str(""), "set python executable path.")
+                           .def("set_kernel_build_server_dir", &PynativeExecutor::set_kernel_build_server_dir,
+                                py::arg("kernel_build_server_dir") = py::str(""),
+                                "set kernel build server directory path.");
                        }));
 }  // namespace mindspore::pynative
diff --git a/mindspore/ccsrc/pipeline/pynative/pynative_execute.h b/mindspore/ccsrc/pipeline/pynative/pynative_execute.h
index 03655ddfa5c..5d214c28a2b 100644
--- a/mindspore/ccsrc/pipeline/pynative/pynative_execute.h
+++ b/mindspore/ccsrc/pipeline/pynative/pynative_execute.h
@@ -42,7 +42,6 @@
 
 namespace mindspore::pynative {
 namespace py = pybind11;
-using CellId = std::string;
 using MsFunctionGradCache = std::unordered_map<std::string, std::pair<FuncGraphPtr, FuncGraphPtr>>;
 using OpInfoWithTensorId = std::unordered_map<std::string, std::vector<std::string>>;
 using TensorIdWithTensorObject = std::unordered_map<std::string, std::vector<tensor::TensorPtr>>;
@@ -68,7 +67,8 @@ class TopCellInfo {
         grad_order_(grad_order),
         resource_(std::move(r)),
         df_builder_(std::move(df)),
-        cell_id_(std::move(cellid)) {}
+        cell_id_(std::move(cellid)),
+        alread_run_cell_id_(cell_id_ + std::to_string(is_topest_)) {}
 
   bool is_init_kpynative() const { return is_init_kpynative_; }
   void set_init_kpynative(bool init) { is_init_kpynative_ = init; }
@@ -90,9 +90,10 @@ class TopCellInfo {
   size_t op_num() const { return op_num_; }
   void set_op_num(size_t op_num) { op_num_ = op_num; }
   std::string &cell_id() { return cell_id_; }
+  std::string &already_run_cell_id() { return alread_run_cell_id_; }
   std::string &input_args_id() { return input_args_id_; }
   std::string &all_op_info() { return all_op_info_; }
-  void set_input_args_id(const std::string &input_args_id) { input_args_id_ = std::move(input_args_id); }
+  void set_input_args_id(const std::string &input_args_id) { input_args_id_ = input_args_id; }
   std::unordered_set<std::string> &sub_cell_list() { return sub_cell_list_; }
   bool IsSubCell(const std::string &cell_id) const;
   OrderedMap<FuncGraphPtr, GraphInfoPtr> &graph_info_map() { return graph_info_map_; }
@@ -124,6 +125,7 @@ class TopCellInfo {
   FuncGraphPtr df_builder_{nullptr};
   ad::KPynativeCellPtr k_pynative_cell_ptr_{nullptr};
   std::string cell_id_;
+  std::string alread_run_cell_id_;
   std::string input_args_id_;
   std::string all_op_info_;
   OrderedMap<FuncGraphPtr, GraphInfoPtr> graph_info_map_;
@@ -173,7 +175,9 @@ class GradExecutor {
   TopCellInfoPtr top_cell() const;
   void CheckNeedCompileGraph();
   TopCellInfoPtr GetTopCell(const string &cell_id) const;
+  void EnableOpGraphCache(bool is_enable);
   bool need_renormalize() const { return need_renormalize_; }
+  bool enable_op_cache() const { return enable_op_cache_; }
   void set_top_cell(TopCellInfoPtr top_cell) { top_cell_ = std::move(top_cell); }
   bool grad_flag() const { return grad_flag_; }
   void set_grad_flag(bool flag) { grad_flag_ = flag; }
@@ -233,7 +237,7 @@ class GradExecutor {
   FuncGraphPtr GetBpropGraph(const prim::GradOperationPtr &grad, const py::object &cell,
                              const std::vector<AnfNodePtr> &weights, size_t arg_size, const py::args &args);
   std::vector<AnfNodePtr> GetWeightsArgs(const py::object &weights, const FuncGraphPtr &df_builder);
-  abstract::AbstractBasePtrList GetArgsSpec(const py::args &args, const FuncGraphPtr &bprop_graph);
+  abstract::AbstractBasePtrList GetArgsSpec(const py::list &args, const FuncGraphPtr &bprop_graph);
   // Manage resource for construct forward graph.
   std::string &graph_phase() { return graph_phase_; }
   AnfNodePtr GetObjNode(const py::object &obj, const std::string &obj_id);
@@ -242,15 +246,15 @@ class GradExecutor {
                                       const std::vector<int64_t> &index_sequence, bool is_param = false);
   void SetTupleArgsToGraphInfoMap(const FuncGraphPtr &g, const py::object &args, const AnfNodePtr &node,
                                   bool is_param = false);
-  void SetParamNodeMapInGraphInfoMap(const FuncGraphPtr &g, const std::string &id, const ParameterPtr &param) {
+  void SetParamNodeMapInGraphInfoMap(const FuncGraphPtr &g, const std::string &id, const ParameterPtr &param) const {
     top_cell()->graph_info_map()[g]->params[id] = param;
   }
   void SetNodeMapInGraphInfoMap(const FuncGraphPtr &g, const std::string &id, const AnfNodePtr &node,
-                                int64_t index = -1) {
+                                int64_t index = -1) const {
     top_cell()->graph_info_map()[g]->node_map[id] = std::make_pair(node, std::vector<int64_t>{index});
   }
   void SetNodeMapInGraphInfoMap(const FuncGraphPtr &g, const std::string &id, const AnfNodePtr &node,
-                                const std::vector<int64_t> &index) {
+                                const std::vector<int64_t> &index) const {
     top_cell()->graph_info_map()[g]->node_map[id] = std::make_pair(node, index);
   }
   void CreateMakeTupleNodeForMultiOut(const FuncGraphPtr &curr_g, const py::object &out, const std::string &out_id);
@@ -259,8 +263,10 @@ class GradExecutor {
   bool grad_flag_{false};
   bool need_renormalize_{false};
   bool grad_is_running_{false};
+  bool enable_op_cache_{true};
   int custom_bprop_cell_count_{0};
   size_t grad_order_{0};
+  size_t top_cell_switch_counts_{0};
 
   // The graph phase is used to obtain backend graph that is complied by ms_function
   std::string graph_phase_;
@@ -280,7 +286,7 @@ class GradExecutor {
   // Use vector for keep order
   std::vector<TopCellInfoPtr> top_cell_list_;
   // Record all top cell which has been ran
-  std::map<CellId, TopCellInfoPtr> already_run_top_cell_;
+  std::unordered_map<std::string, TopCellInfoPtr> already_run_top_cell_;
   // Use vector for keep order
   ForwardExecutorWeakPtr forward_executor_;
 };
@@ -352,6 +358,8 @@ class PynativeExecutor : public std::enable_shared_from_this<PynativeExecutor> {
 
   void set_grad_flag(bool flag);
   void set_graph_phase(const std::string &graph_phase);
+  void set_py_exe_path(const py::object &py_exe_path);
+  void set_kernel_build_server_dir(const py::object &kernel_build_server_dir);
   void GradMsFunction(const py::object &out, const py::args &args);
   void NewGraph(const py::object &cell, const py::args &args);
   void EndGraph(const py::object &cell, const py::object &out, const py::args &args);
diff --git a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc
index 3553e18bf01..23b0a78bed6 100644
--- a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc
+++ b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.cc
@@ -25,6 +25,7 @@
 #include "pybind_api/api_register.h"
 #include "utils/log_adapter.h"
 #include "utils/utils.h"
+#include "utils/profile.h"
 #include "utils/ms_context.h"
 
 namespace mindspore {
@@ -446,6 +447,12 @@ void GPUProfiler::OpDataProducerBegin(const std::string op_name, void *stream) {
   }
 }
 
+void GPUProfiler::SingleOpLaunchTimeProcess(float op_time_elapsed) {
+  auto launch_end_time = GetTime();
+  double launch_start_time = launch_end_time - op_time_elapsed / kTimeUnit / kTimeUnit;
+  SetSingleOpLaunchTime(std::make_pair(launch_start_time, launch_end_time));
+}
+
 void GPUProfiler::OpDataProducerEnd() {
   float op_time_elapsed = 0;
   if (sync_enable_flag_) {
@@ -459,9 +466,11 @@ void GPUProfiler::OpDataProducerEnd() {
     CHECK_CUDA_RET_WITH_ERROR(cudaEventDestroy(op_event_stop_), "cudaEventDestroy  op event stop failed");
     op_time_elapsed = op_time_elapsed * kTimeUnit;
     op_host_time_stop_ = GetHostTimeStamp();
+    SingleOpLaunchTimeProcess(op_time_elapsed);
   } else {
     op_host_time_stop_ = GetHostTimeStamp();
     op_time_elapsed = (op_host_time_stop_ - op_host_time_start_) / kTimeUnit;
+    SingleOpLaunchTimeProcess(op_time_elapsed);
   }
   MS_LOG(DEBUG) << "Host Time Elapsed(us)," << op_name_ << "," << op_time_elapsed;
   Profiler::SetRunTimeData(op_name_, op_time_elapsed);
diff --git a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h
index 17fdd71b93a..ae79e59ebe3 100644
--- a/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h
+++ b/mindspore/ccsrc/profiler/device/gpu/gpu_profiling.h
@@ -135,6 +135,7 @@ class GPUProfiler : public Profiler {
   std::string ProfileDataPath() const { return profile_data_path_; }
 
  private:
+  void SingleOpLaunchTimeProcess(float op_time_elapsed);
   void OpsParser();
   void EventLog(const Event &event);
   void ClearInst() override;
diff --git a/mindspore/ccsrc/profiler/device/profiling.h b/mindspore/ccsrc/profiler/device/profiling.h
index 6b3dd23676d..95318569ea0 100644
--- a/mindspore/ccsrc/profiler/device/profiling.h
+++ b/mindspore/ccsrc/profiler/device/profiling.h
@@ -79,6 +79,10 @@ class Profiler {
   bool GetEnableFlag() const { return enable_flag_; }
   std::string ProfileDataPath() const { return profile_data_path_; }
   void RecordOneStepStartEndInfo(std::string op_name);
+  std::pair<double, double> GetSingleOpLaunchTime() { return single_op_launch_start_time_end_time_; }
+  void SetSingleOpLaunchTime(const std::pair<double, double> &launch_start_end) {
+    single_op_launch_start_time_end_time_ = launch_start_end;
+  }
 
  protected:
   void SetRunTimeData(const std::string &op_name, const float time_elapsed);
@@ -86,6 +90,7 @@ class Profiler {
   uint64_t GetHostMonoTimeStamp() const;
   virtual void SaveProfileData() = 0;
   virtual void ClearInst() = 0;
+  std::pair<double, double> single_op_launch_start_time_end_time_;
   bool enable_flag_ = false;
   std::string profile_data_path_;
   std::unordered_map<std::string, OpInfo> op_info_map_;
diff --git a/mindspore/ccsrc/ps/CMakeLists.txt b/mindspore/ccsrc/ps/CMakeLists.txt
index f3c5ca3e105..c9b7f749902 100644
--- a/mindspore/ccsrc/ps/CMakeLists.txt
+++ b/mindspore/ccsrc/ps/CMakeLists.txt
@@ -24,6 +24,8 @@ if(NOT ENABLE_CPU OR WIN32)
     list(REMOVE_ITEM _PS_SRC_FILES "parameter_server.cc")
     list(REMOVE_ITEM _PS_SRC_FILES "core/communicator/http_request_handler.cc")
     list(REMOVE_ITEM _PS_SRC_FILES "core/communicator/ssl_wrapper.cc")
+    list(REMOVE_ITEM _PS_SRC_FILES "core/communicator/ssl_http.cc")
+    list(REMOVE_ITEM _PS_SRC_FILES "core/communicator/ssl_client.cc")
     list(REMOVE_ITEM _PS_SRC_FILES "core/leader_scaler.cc")
     list(REMOVE_ITEM _PS_SRC_FILES "core/follower_scaler.cc")
     list(REMOVE_ITEM _PS_SRC_FILES "core/file_configuration.cc")
diff --git a/mindspore/ccsrc/ps/constants.h b/mindspore/ccsrc/ps/constants.h
index 47db975de85..9b0a5e3e4f6 100644
--- a/mindspore/ccsrc/ps/constants.h
+++ b/mindspore/ccsrc/ps/constants.h
@@ -133,6 +133,59 @@ constexpr char kClientCertPath[] = "client_cert_path";
 constexpr char kClientPassword[] = "client_password";
 constexpr char kCaCertPath[] = "ca_cert_path";
 
+constexpr char kCipherList[] = "cipher_list";
+constexpr char kCertCheckInterval[] = "cert_check_interval_in_hour";
+// 7 * 24
+constexpr int64_t kCertCheckIntervalInHour = 168;
+constexpr char kCertExpireWarningTime[] = "cert_expire_warning_time_in_day";
+// 90
+constexpr int64_t kCertExpireWarningTimeInDay = 90;
+constexpr char kConnectionNum[] = "connection_num";
+constexpr int64_t kConnectionNumDefault = 10000;
+constexpr char kLocalIp[] = "127.0.0.1";
+
+constexpr int64_t kJanuary = 1;
+constexpr int64_t kSeventyYear = 70;
+constexpr int64_t kHundredYear = 100;
+constexpr int64_t kThousandYear = 1000;
+constexpr int64_t kBaseYear = 1900;
+constexpr int64_t kMinWarningTime = 7;
+constexpr int64_t kMaxWarningTime = 180;
+
+constexpr char kServerCert[] = "server.p12";
+constexpr char kClientCert[] = "client.p12";
+constexpr char kCaCert[] = "ca.crt";
+constexpr char kColon = ':';
+const std::map<std::string, size_t> kCiphers = {{"ECDHE-RSA-AES128-GCM-SHA256", 0},
+                                                {"ECDHE-ECDSA-AES128-GCM-SHA256", 1},
+                                                {"ECDHE-RSA-AES256-GCM-SHA384", 2},
+                                                {"ECDHE-ECDSA-AES256-GCM-SHA384", 3},
+                                                {"DHE-RSA-AES128-GCM-SHA256", 4},
+                                                {"DHE-DSS-AES128-GCM-SHA256", 5},
+                                                {"ECDHE-RSA-AES128-SHA256", 6},
+                                                {"ECDHE-ECDSA-AES128-SHA256", 7},
+                                                {"ECDHE-RSA-AES128-SHA", 8},
+                                                {"ECDHE-ECDSA-AES128-SHA", 9},
+                                                {"ECDHE-RSA-AES256-SHA384", 10},
+                                                {"ECDHE-ECDSA-AES256-SHA384", 11},
+                                                {"ECDHE-RSA-AES256-SHA", 12},
+                                                {"ECDHE-ECDSA-AES256-SHA", 13},
+                                                {"DHE-RSA-AES128-SHA256", 14},
+                                                {"DHE-RSA-AES128-SHA", 15},
+                                                {"DHE-DSS-AES128-SHA256", 16},
+                                                {"DHE-RSA-AES256-SHA256", 17},
+                                                {"DHE-DSS-AES256-SHA", 18},
+                                                {"DHE-RSA-AES256-SHA", 19},
+                                                {"!aNULL", 20},
+                                                {"!eNULL", 21},
+                                                {"!EXPORT", 22},
+                                                {"!DES", 23},
+                                                {"!RC4", 24},
+                                                {"!3DES", 25},
+                                                {"!MD5", 26},
+                                                {"!PSK", 27},
+                                                {"kEDH+AESGCM", 28}};
+
 using DataPtr = std::shared_ptr<unsigned char[]>;
 using VectorPtr = std::shared_ptr<std::vector<unsigned char>>;
 using Key = uint64_t;
@@ -197,6 +250,7 @@ using HandlerAfterScaleOut = std::function<void(void)>;
 using HandlerAfterScaleIn = std::function<void(void)>;
 
 constexpr char kClusterSafeMode[] = "The cluster is in safemode.";
+constexpr char kJobNotAvailable[] = "The server's training job is disabled or finished.";
 
 enum class CustomEvent { kIterationRunning = 0, kIterationCompleted };
 
diff --git a/mindspore/ccsrc/ps/core/comm_util.cc b/mindspore/ccsrc/ps/core/comm_util.cc
index 91ba81edc27..d58b17d0ab6 100644
--- a/mindspore/ccsrc/ps/core/comm_util.cc
+++ b/mindspore/ccsrc/ps/core/comm_util.cc
@@ -64,7 +64,9 @@ void CommUtil::GetAvailableInterfaceAndIP(std::string *interface, std::string *i
 
   interface->clear();
   ip->clear();
-  getifaddrs(&if_address);
+  if (getifaddrs(&if_address) == -1) {
+    MS_LOG(WARNING) << "Get ifaddrs failed.";
+  }
   for (ifa = if_address; ifa != nullptr; ifa = ifa->ifa_next) {
     if (ifa->ifa_addr == nullptr) {
       continue;
@@ -146,6 +148,7 @@ bool CommUtil::Retry(const std::function<bool()> &func, size_t max_attempts, siz
 }
 
 void CommUtil::LogCallback(int severity, const char *msg) {
+  MS_EXCEPTION_IF_NULL(msg);
   switch (severity) {
     case EVENT_LOG_MSG:
       MS_LOG(INFO) << kLibeventLogPrefix << msg;
@@ -173,7 +176,11 @@ bool CommUtil::IsFileExists(const std::string &file) {
 
 std::string CommUtil::ClusterStateToString(const ClusterState &state) {
   MS_LOG(INFO) << "The cluster state:" << state;
-  return kClusterState.at(state);
+  if (state < SizeToInt(kClusterState.size())) {
+    return kClusterState.at(state);
+  } else {
+    return "";
+  }
 }
 
 std::string CommUtil::ParseConfig(const Configuration &config, const std::string &key) {
@@ -190,6 +197,145 @@ std::string CommUtil::ParseConfig(const Configuration &config, const std::string
   std::string path = config.GetString(key, "");
   return path;
 }
+
+bool CommUtil::VerifyCertTime(const X509 *cert, int64_t time) {
+  MS_EXCEPTION_IF_NULL(cert);
+  ASN1_TIME *start = X509_getm_notBefore(cert);
+  ASN1_TIME *end = X509_getm_notAfter(cert);
+  MS_EXCEPTION_IF_NULL(start);
+  MS_EXCEPTION_IF_NULL(end);
+  int day = 0;
+  int sec = 0;
+  if (!ASN1_TIME_diff(&day, &sec, start, NULL)) {
+    MS_LOG(WARNING) << "ASN1 time diff failed.";
+    return false;
+  }
+
+  if (day < 0 || sec < 0) {
+    MS_LOG(WARNING) << "Cert start time is later than now time.";
+    return false;
+  }
+  day = 0;
+  sec = 0;
+
+  if (!ASN1_TIME_diff(&day, &sec, NULL, end)) {
+    MS_LOG(WARNING) << "ASN1 time diff failed.";
+    return false;
+  }
+
+  int64_t interval = kCertExpireWarningTimeInDay;
+  if (time > 0) {
+    interval = time;
+  }
+
+  if (day < LongToInt(interval) && day >= 0) {
+    MS_LOG(WARNING) << "The certificate will expire in " << day << " days and " << sec << " seconds.";
+  } else if (day < 0 || sec < 0) {
+    MS_LOG(WARNING) << "The certificate has expired.";
+    return false;
+  }
+  return true;
+}
+
+bool CommUtil::VerifyCRL(const X509 *cert, const std::string &crl_path) {
+  MS_ERROR_IF_NULL_W_RET_VAL(cert, false);
+  BIO *bio = BIO_new_file(crl_path.c_str(), "r");
+  MS_ERROR_IF_NULL_W_RET_VAL(bio, false);
+  X509_CRL *root_crl = PEM_read_bio_X509_CRL(bio, nullptr, nullptr, nullptr);
+  MS_ERROR_IF_NULL_W_RET_VAL(root_crl, false);
+  EVP_PKEY *evp_pkey = X509_get_pubkey(const_cast<X509 *>(cert));
+  MS_ERROR_IF_NULL_W_RET_VAL(evp_pkey, false);
+
+  int ret = X509_CRL_verify(root_crl, evp_pkey);
+  BIO_free_all(bio);
+  if (ret == 1) {
+    MS_LOG(WARNING) << "Equip cert in root crl, verify failed";
+    return false;
+  }
+  MS_LOG(INFO) << "VerifyCRL success.";
+  return true;
+}
+
+bool CommUtil::VerifyCommonName(const X509 *cert, const std::string &ca_path) {
+  MS_ERROR_IF_NULL_W_RET_VAL(cert, false);
+  X509 *cert_temp = const_cast<X509 *>(cert);
+  char subject_cn[256] = "";
+  char issuer_cn[256] = "";
+  X509_NAME *subject_name = X509_get_subject_name(cert_temp);
+  X509_NAME *issuer_name = X509_get_issuer_name(cert_temp);
+  MS_ERROR_IF_NULL_W_RET_VAL(subject_name, false);
+  MS_ERROR_IF_NULL_W_RET_VAL(issuer_name, false);
+  if (!X509_NAME_get_text_by_NID(subject_name, NID_commonName, subject_cn, sizeof(subject_cn))) {
+    MS_LOG(WARNING) << "Get text by nid failed.";
+    return false;
+  }
+  if (!X509_NAME_get_text_by_NID(issuer_name, NID_commonName, issuer_cn, sizeof(issuer_cn))) {
+    MS_LOG(WARNING) << "Get text by nid failed.";
+    return false;
+  }
+  MS_LOG(INFO) << "the subject:" << subject_cn << ", the issuer:" << issuer_cn;
+
+  BIO *ca_bio = BIO_new_file(ca_path.c_str(), "r");
+  MS_EXCEPTION_IF_NULL(ca_bio);
+  X509 *ca_cert = PEM_read_bio_X509(ca_bio, nullptr, nullptr, nullptr);
+  MS_EXCEPTION_IF_NULL(ca_cert);
+  char ca_subject_cn[256] = "";
+  char ca_issuer_cn[256] = "";
+  X509_NAME *ca_subject_name = X509_get_subject_name(ca_cert);
+  X509_NAME *ca_issuer_name = X509_get_issuer_name(ca_cert);
+  MS_ERROR_IF_NULL_W_RET_VAL(ca_subject_name, false);
+  MS_ERROR_IF_NULL_W_RET_VAL(ca_issuer_name, false);
+  if (!X509_NAME_get_text_by_NID(ca_subject_name, NID_commonName, ca_subject_cn, sizeof(subject_cn))) {
+    MS_LOG(WARNING) << "Get text by nid failed.";
+    return false;
+  }
+  if (!X509_NAME_get_text_by_NID(ca_issuer_name, NID_commonName, ca_issuer_cn, sizeof(issuer_cn))) {
+    MS_LOG(WARNING) << "Get text by nid failed.";
+    return false;
+  }
+  MS_LOG(INFO) << "the subject:" << ca_subject_cn << ", the issuer:" << ca_issuer_cn;
+  BIO_free_all(ca_bio);
+  if (strcmp(issuer_cn, ca_subject_cn) != 0) {
+    return false;
+  }
+  return true;
+}
+
+std::vector<std::string> CommUtil::Split(const std::string &s, char delim) {
+  std::vector<std::string> res;
+  std::stringstream ss(s);
+  std::string item;
+
+  while (getline(ss, item, delim)) {
+    res.push_back(item);
+  }
+  return res;
+}
+
+bool CommUtil::VerifyCipherList(const std::vector<std::string> &list) {
+  for (auto &item : list) {
+    if (!kCiphers.count(item)) {
+      MS_LOG(WARNING) << "The ciphter:" << item << " is not supported.";
+      return false;
+    }
+  }
+  return true;
+}
+
+void CommUtil::InitOpenSSLEnv() {
+  if (!SSL_library_init()) {
+    MS_LOG(EXCEPTION) << "SSL_library_init failed.";
+  }
+  if (!ERR_load_crypto_strings()) {
+    MS_LOG(EXCEPTION) << "ERR_load_crypto_strings failed.";
+  }
+  if (!SSL_load_error_strings()) {
+    MS_LOG(EXCEPTION) << "SSL_load_error_strings failed.";
+  }
+  if (!OpenSSL_add_all_algorithms()) {
+    MS_LOG(EXCEPTION) << "OpenSSL_add_all_algorithms failed.";
+  }
+}
 }  // namespace core
 }  // namespace ps
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/ps/core/comm_util.h b/mindspore/ccsrc/ps/core/comm_util.h
index 2e127fe1d03..13118b41de0 100644
--- a/mindspore/ccsrc/ps/core/comm_util.h
+++ b/mindspore/ccsrc/ps/core/comm_util.h
@@ -37,6 +37,14 @@
 #include <event2/listener.h>
 #include <event2/util.h>
 
+#include <openssl/ssl.h>
+#include <openssl/rand.h>
+#include <openssl/err.h>
+#include <openssl/evp.h>
+#include <assert.h>
+#include <openssl/pkcs12.h>
+#include <openssl/bio.h>
+
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
@@ -49,6 +57,7 @@
 #include <fstream>
 #include <iostream>
 #include <vector>
+#include <algorithm>
 
 #include "proto/comm.pb.h"
 #include "proto/ps.pb.h"
@@ -104,6 +113,18 @@ class CommUtil {
   // Parse the configuration file according to the key.
   static std::string ParseConfig(const Configuration &config, const std::string &key);
 
+  // verify valid of certificate time
+  static bool VerifyCertTime(const X509 *cert, int64_t time = 0);
+  // verify valid of equip certificate with CRL
+  static bool VerifyCRL(const X509 *cert, const std::string &crl_path);
+  // Check the common name of the certificate
+  static bool VerifyCommonName(const X509 *cert, const std::string &ca_path);
+  // The string is divided according to delim
+  static std::vector<std::string> Split(const std::string &s, char delim);
+  // Check the cipher list of the certificate
+  static bool VerifyCipherList(const std::vector<std::string> &list);
+  static void InitOpenSSLEnv();
+
  private:
   static std::random_device rd;
   static std::mt19937_64 gen;
diff --git a/mindspore/ccsrc/ps/core/communicator/ssl_wrapper.cc b/mindspore/ccsrc/ps/core/communicator/ssl_wrapper.cc
index d0b3f7457a1..4192ca31b3c 100644
--- a/mindspore/ccsrc/ps/core/communicator/ssl_wrapper.cc
+++ b/mindspore/ccsrc/ps/core/communicator/ssl_wrapper.cc
@@ -44,10 +44,7 @@ SSLWrapper::SSLWrapper()
 SSLWrapper::~SSLWrapper() { CleanSSL(); }
 
 void SSLWrapper::InitSSL() {
-  SSL_library_init();
-  ERR_load_crypto_strings();
-  SSL_load_error_strings();
-  OpenSSL_add_all_algorithms();
+  CommUtil::InitOpenSSLEnv();
   int rand = RAND_poll();
   if (rand == 0) {
     MS_LOG(ERROR) << "RAND_poll failed";
diff --git a/mindspore/ccsrc/ps/core/communicator/ssl_wrapper.h b/mindspore/ccsrc/ps/core/communicator/ssl_wrapper.h
index e7870598e02..b975616582d 100644
--- a/mindspore/ccsrc/ps/core/communicator/ssl_wrapper.h
+++ b/mindspore/ccsrc/ps/core/communicator/ssl_wrapper.h
@@ -29,6 +29,7 @@
 #include <string>
 
 #include "utils/log_adapter.h"
+#include "ps/core/comm_util.h"
 
 namespace mindspore {
 namespace ps {
diff --git a/mindspore/ccsrc/ps/core/communicator/tcp_communicator.h b/mindspore/ccsrc/ps/core/communicator/tcp_communicator.h
index f6ef04fc5e2..900c3361366 100644
--- a/mindspore/ccsrc/ps/core/communicator/tcp_communicator.h
+++ b/mindspore/ccsrc/ps/core/communicator/tcp_communicator.h
@@ -54,7 +54,8 @@ enum class TcpUserCommand {
   kEndLastIter,
   kStartFLJob,
   kUpdateModel,
-  kGetModel
+  kGetModel,
+  kPushMetrics
 };
 
 const std::unordered_map<TcpUserCommand, std::string> kUserCommandToMsgType = {
@@ -75,7 +76,8 @@ const std::unordered_map<TcpUserCommand, std::string> kUserCommandToMsgType = {
   {TcpUserCommand::kEndLastIter, "endLastIter"},
   {TcpUserCommand::kStartFLJob, "startFLJob"},
   {TcpUserCommand::kUpdateModel, "updateModel"},
-  {TcpUserCommand::kGetModel, "getModel"}};
+  {TcpUserCommand::kGetModel, "getModel"},
+  {TcpUserCommand::kPushMetrics, "pushMetrics"}};
 
 class TcpCommunicator : public CommunicatorBase {
  public:
diff --git a/mindspore/ccsrc/ps/core/configuration.h b/mindspore/ccsrc/ps/core/configuration.h
index 6651a88cab9..21047f7544e 100644
--- a/mindspore/ccsrc/ps/core/configuration.h
+++ b/mindspore/ccsrc/ps/core/configuration.h
@@ -45,17 +45,20 @@ class Configuration {
   // Determine whether the initialization has been completed.
   virtual bool IsInitialized() const = 0;
 
-  // Get configuration data from database or config file.The returned string is quoted
+  // Get configuration data from database or config file.
   virtual std::string Get(const std::string &key, const std::string &defaultvalue) const = 0;
 
-  // Get configuration data from database or config file.The returned string is not quoted
+  // Get configuration data from database or config file.
   virtual std::string GetString(const std::string &key, const std::string &defaultvalue) const = 0;
 
+  // Get configuration data from database or config file.
+  virtual int64_t GetInt(const std::string &key, int64_t default_value) const = 0;
+
   // Put configuration data to database or config file.
   virtual void Put(const std::string &key, const std::string &defaultvalue) = 0;
 
   // Determine whether the configuration item exists.
-  virtual bool Exists(const std::string &key) = 0;
+  virtual bool Exists(const std::string &key) const = 0;
 };
 }  // namespace core
 }  // namespace ps
diff --git a/mindspore/ccsrc/ps/core/file_configuration.cc b/mindspore/ccsrc/ps/core/file_configuration.cc
index 2b813a0edc2..9c2be9eded7 100644
--- a/mindspore/ccsrc/ps/core/file_configuration.cc
+++ b/mindspore/ccsrc/ps/core/file_configuration.cc
@@ -25,12 +25,13 @@ bool FileConfiguration::Initialize() {
     return false;
   }
 
+  std::ifstream json_file(file_path_);
   try {
-    std::ifstream json_file(file_path_);
     json_file >> js;
     json_file.close();
     is_initialized_ = true;
   } catch (nlohmann::json::exception &e) {
+    json_file.close();
     std::string illegal_exception = e.what();
     MS_LOG(ERROR) << "Parse json file:" << file_path_ << " failed, the exception:" << illegal_exception;
     return false;
@@ -58,6 +59,15 @@ std::string FileConfiguration::GetString(const std::string &key, const std::stri
   return res;
 }
 
+int64_t FileConfiguration::GetInt(const std::string &key, int64_t default_value) const {
+  if (!js.contains(key)) {
+    MS_LOG(WARNING) << "The key:" << key << " is not exist.";
+    return default_value;
+  }
+  int64_t res = js.at(key);
+  return res;
+}
+
 void FileConfiguration::Put(const std::string &key, const std::string &value) {
   std::ofstream output_file(file_path_);
   js[key] = value;
@@ -66,7 +76,7 @@ void FileConfiguration::Put(const std::string &key, const std::string &value) {
   output_file.close();
 }
 
-bool FileConfiguration::Exists(const std::string &key) {
+bool FileConfiguration::Exists(const std::string &key) const {
   if (!js.contains(key)) {
     return false;
   }
diff --git a/mindspore/ccsrc/ps/core/file_configuration.h b/mindspore/ccsrc/ps/core/file_configuration.h
index 8415a4ce5cb..2a2564e9203 100644
--- a/mindspore/ccsrc/ps/core/file_configuration.h
+++ b/mindspore/ccsrc/ps/core/file_configuration.h
@@ -58,9 +58,11 @@ class FileConfiguration : public Configuration {
 
   std::string GetString(const std::string &key, const std::string &defaultvalue) const override;
 
+  int64_t GetInt(const std::string &key, int64_t default_value) const override;
+
   void Put(const std::string &key, const std::string &value) override;
 
-  bool Exists(const std::string &key) override;
+  bool Exists(const std::string &key) const override;
 
  private:
   // The path of the configuration file.
diff --git a/mindspore/ccsrc/ps/core/follower_scaler.cc b/mindspore/ccsrc/ps/core/follower_scaler.cc
index 54de1104d23..ac33ab2a835 100644
--- a/mindspore/ccsrc/ps/core/follower_scaler.cc
+++ b/mindspore/ccsrc/ps/core/follower_scaler.cc
@@ -78,10 +78,18 @@ FollowerScaler::~FollowerScaler() {
   running_ = false;
   scale_out_cv_.notify_all();
   scale_in_cv_.notify_all();
-  process_before_scale_out_thread_.join();
-  process_before_scale_in_thread_.join();
-  process_after_scale_out_thread_.join();
-  process_after_scale_in_thread_.join();
+  if (process_before_scale_out_thread_.joinable()) {
+    process_before_scale_out_thread_.join();
+  }
+  if (process_before_scale_in_thread_.joinable()) {
+    process_before_scale_in_thread_.join();
+  }
+  if (process_after_scale_out_thread_.joinable()) {
+    process_after_scale_out_thread_.join();
+  }
+  if (process_after_scale_in_thread_.joinable()) {
+    process_after_scale_in_thread_.join();
+  }
 }
 
 void FollowerScaler::RegisterScaleEventCallbacks() {
diff --git a/mindspore/ccsrc/ps/ps_context.h b/mindspore/ccsrc/ps/ps_context.h
index 291a7246038..f2896f82957 100644
--- a/mindspore/ccsrc/ps/ps_context.h
+++ b/mindspore/ccsrc/ps/ps_context.h
@@ -46,11 +46,11 @@ constexpr char kNotEncryptType[] = "NOT_ENCRYPT";
 // 2: Server is in mixed training mode.
 // 3: Server enables pairwise encrypt algorithm.
 // For example: 1010 stands for that the server is in federated learning mode and pairwise encrypt algorithm is enabled.
-enum class ResetterRound { kNoNeedToReset, kUpdateModel, kReconstructSeccrets, kPushWeight };
+enum class ResetterRound { kNoNeedToReset, kUpdateModel, kReconstructSeccrets, kPushWeight, kPushMetrics };
 const std::map<uint32_t, ResetterRound> kServerContextToResetRoundMap = {{0b0010, ResetterRound::kUpdateModel},
                                                                          {0b1010, ResetterRound::kReconstructSeccrets},
-                                                                         {0b1100, ResetterRound::kPushWeight},
-                                                                         {0b0100, ResetterRound::kPushWeight}};
+                                                                         {0b1100, ResetterRound::kPushMetrics},
+                                                                         {0b0100, ResetterRound::kPushMetrics}};
 
 class PSContext {
  public:
diff --git a/mindspore/ccsrc/pybind_api/ir/dtype_py.cc b/mindspore/ccsrc/pybind_api/ir/dtype_py.cc
index 46bc8c50e2d..04c0a0186e5 100644
--- a/mindspore/ccsrc/pybind_api/ir/dtype_py.cc
+++ b/mindspore/ccsrc/pybind_api/ir/dtype_py.cc
@@ -109,6 +109,22 @@ REGISTER_PYBIND_DEFINE(
           Float data(t[0].cast<py::int_>());
           return data;
         }));
+    (void)py::class_<Complex, Number, std::shared_ptr<Complex>>(m_sub, "Complex")
+      .def(py::init())
+      .def(py::init<int>(), py::arg("nbits"))
+      .def(py::pickle(
+        [](const Complex &t) {  // __getstate__
+          /* Return a tuple that fully encodes the state of the object */
+          return py::make_tuple(py::int_(t.nbits()));
+        },
+        [](const py::tuple &t) {  // __setstate__
+          if (t.size() != 1) {
+            throw std::runtime_error("Invalid state!");
+          }
+          /* Create a new C++ instance */
+          Complex data(t[0].cast<py::int_>());
+          return data;
+        }));
     (void)py::class_<List, Type, std::shared_ptr<List>>(m_sub, "List")
       .def(py::init())
       .def(py::init<std::vector<TypePtr>>(), py::arg("elements"));
diff --git a/mindspore/ccsrc/pybind_api/ir/primitive_py.cc b/mindspore/ccsrc/pybind_api/ir/primitive_py.cc
index 24226244d13..b3845bcc0ec 100644
--- a/mindspore/ccsrc/pybind_api/ir/primitive_py.cc
+++ b/mindspore/ccsrc/pybind_api/ir/primitive_py.cc
@@ -271,18 +271,18 @@ py::function PrimitivePy::GetComputeFunction() const {
   static const char *const compute_func_name = "vm_impl";
 
   if (py::hasattr(python_obj_, compute_func_name)) {
-    MS_LOG(INFO) << name() << " compute_func_name";
+    MS_LOG(DEBUG) << name() << " compute_func_name";
     py::function fn = python_obj_.attr(compute_func_name).cast<py::function>();
     return fn;
   }
 
   static const std::string vm_module = "mindspore.ops.vm_impl_registry";
   static const std::string get_vm_impl_fn = "get_vm_impl_fn";
-  MS_LOG(INFO) << name() << ": get_vm_impl_fn";
+  MS_LOG(DEBUG) << name() << ": get_vm_impl_fn";
   py::function get_fn = parse::python_adapter::GetPyFn(vm_module, get_vm_impl_fn);
   py::function vm_fn = get_fn(python_obj_);
   if (py::isinstance<py::none>(vm_fn)) {
-    MS_LOG(INFO) << "Cannot find " << python_obj_.attr("__class__").attr("__name__").cast<std::string>();
+    MS_LOG(DEBUG) << "Cannot find " << python_obj_.attr("__class__").attr("__name__").cast<std::string>();
     vm_fn = mindspore::GetComputeFunction(Primitive::name());
   }
   return vm_fn;
diff --git a/mindspore/ccsrc/pybind_api/ir/tensor_py.cc b/mindspore/ccsrc/pybind_api/ir/tensor_py.cc
index 7667ee793d3..dbaabc44124 100644
--- a/mindspore/ccsrc/pybind_api/ir/tensor_py.cc
+++ b/mindspore/ccsrc/pybind_api/ir/tensor_py.cc
@@ -20,6 +20,7 @@
 #include <sstream>
 #include <string>
 #include <utility>
+#include <complex>
 
 #include "pybind_api/api_register.h"
 #include "abstract/abstract_value.h"
@@ -78,9 +79,15 @@ static TypeId GetDataType(const py::buffer_info &buf) {
       case '?':
         return TypeId::kNumberTypeBool;
     }
-  } else if (buf.format.size() >= 2 && buf.format.back() == 'w') {
+  } else if (buf.format.size() >= 2) {
     // Support np.str_ dtype, format: {x}w. {x} is a number that means the maximum length of the string items.
-    return TypeId::kObjectTypeString;
+    if (buf.format.back() == 'w') {
+      return TypeId::kObjectTypeString;
+    } else if (buf.format == "Zf") {
+      return TypeId::kNumberTypeComplex64;
+    } else if (buf.format == "Zd") {
+      return TypeId::kNumberTypeComplex128;
+    }
   }
   MS_LOG(WARNING) << "Unsupported DataType format " << buf.format << ", item size " << buf.itemsize;
   return TypeId::kTypeUnknown;
@@ -114,6 +121,10 @@ static std::string GetPyTypeFormat(TypeId data_type) {
       return py::format_descriptor<bool>::format();
     case TypeId::kObjectTypeString:
       return py::format_descriptor<uint8_t>::format();
+    case TypeId::kNumberTypeComplex64:
+      return py::format_descriptor<std::complex<float>>::format();
+    case TypeId::kNumberTypeComplex128:
+      return py::format_descriptor<std::complex<double>>::format();
     default:
       MS_LOG(WARNING) << "Unsupported DataType " << data_type << ".";
       return "";
diff --git a/mindspore/ccsrc/pybind_api/utils/ms_context_py.cc b/mindspore/ccsrc/pybind_api/utils/ms_context_py.cc
index c431349ed8f..512ef8a0f6a 100644
--- a/mindspore/ccsrc/pybind_api/utils/ms_context_py.cc
+++ b/mindspore/ccsrc/pybind_api/utils/ms_context_py.cc
@@ -100,7 +100,8 @@ REGISTER_PYBIND_DEFINE(MsContextPy, ([](const py::module *m) {
                            .value("graph_kernel_flags", MsCtxParam::MS_CTX_GRAPH_KERNEL_FLAGS)
                            .value("grad_for_scalar", MsCtxParam::MS_CTX_GRAD_FOR_SCALAR)
                            .value("save_compile_cache", MsCtxParam::MS_CTX_SAVE_COMPILE_CACHE)
-                           .value("load_compile_cache", MsCtxParam::MS_CTX_LOAD_COMPILE_CACHE);
+                           .value("load_compile_cache", MsCtxParam::MS_CTX_LOAD_COMPILE_CACHE)
+                           .value("pynative_synchronize", MsCtxParam::MS_CTX_ENABLE_PYNATIVE_SYNCHRONIZE);
                          (void)py::class_<mindspore::MsContext, std::shared_ptr<mindspore::MsContext>>(*m, "MSContext")
                            .def_static("get_instance", &mindspore::MsContext::GetInstance, "Get ms context instance.")
                            .def("get_param", &mindspore::MsCtxGetParameter, "Get value of specified parameter.")
diff --git a/mindspore/ccsrc/runtime/device/CMakeLists.txt b/mindspore/ccsrc/runtime/device/CMakeLists.txt
index 56cd9fe6275..19a7b9f90cf 100644
--- a/mindspore/ccsrc/runtime/device/CMakeLists.txt
+++ b/mindspore/ccsrc/runtime/device/CMakeLists.txt
@@ -1,7 +1,7 @@
 file(GLOB_RECURSE DEVICE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "common/*.cc"
     "kernel_info.cc" "executor/dynamic_kernel.cc" "executor/executor_callback.cc" "kernel_runtime.cc"
     "memory_manager.cc" "kernel_runtime_manager.cc" "convert_tensor_utils.cc"
-    "bucket.cc" "launch_kernel.cc" "launch_mul.cc"
+    "bucket.cc" "launch_kernel.cc" "launch_mul.cc" "pynative_profiling.cc"
 )
 
 if("${ENABLE_HIDDEN}" STREQUAL "OFF")
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_event.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_event.cc
index eb063c54c6a..9e98291fd24 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_event.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_event.cc
@@ -29,6 +29,14 @@ AscendEvent::AscendEvent() {
   }
 }
 
+AscendTimeEvent::AscendTimeEvent() {
+  auto ret = rtEventCreateWithFlag(&event_, RT_EVENT_TIME_LINE);
+  if (ret != RT_ERROR_NONE) {
+    MS_LOG(ERROR) << "rtEventCreate failed, ret:" << ret;
+    event_ = nullptr;
+  }
+}
+
 AscendEvent::~AscendEvent() {
   auto ret = rtEventDestroy(event_);
   if (ret != RT_ERROR_NONE) {
@@ -60,5 +68,24 @@ void AscendEvent::WaitEvent() {
   need_wait_ = false;
 }
 
+void AscendEvent::SyncEvent() {
+  MS_EXCEPTION_IF_NULL(event_);
+  auto ret = rtEventSynchronize(event_);
+  if (ret != RT_ERROR_NONE) {
+    MS_LOG(EXCEPTION) << "rtEventSynchronize failed, ret:" << ret;
+  }
+}
+
+void AscendEvent::ElapsedTime(float *cost_time, DeviceEvent *other) {
+  MS_EXCEPTION_IF_NULL(event_);
+  auto ascend_other = static_cast<AscendEvent *>(other);
+  MS_EXCEPTION_IF_NULL(ascend_other);
+  MS_EXCEPTION_IF_NULL(ascend_other->event_);
+  auto ret = rtEventElapsedTime(cost_time, event_, ascend_other->event_);
+  if (ret != RT_ERROR_NONE) {
+    MS_LOG(EXCEPTION) << "rtEventElapsedTime failed, ret:" << ret;
+  }
+}
+
 bool AscendEvent::NeedWait() { return need_wait_; }
 }  // namespace mindspore::device::ascend
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_event.h b/mindspore/ccsrc/runtime/device/ascend/ascend_event.h
index 059390e8c92..358752cec87 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_event.h
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_event.h
@@ -19,6 +19,7 @@
 
 #include "runtime/base.h"
 #include "ir/device_event.h"
+
 namespace mindspore::device::ascend {
 class AscendEvent : public DeviceEvent {
  public:
@@ -28,14 +29,22 @@ class AscendEvent : public DeviceEvent {
   void WaitEvent() override;
   void RecordEvent() override;
   bool NeedWait() override;
+  void SyncEvent() override;
+  void ElapsedTime(float *cost_time, DeviceEvent *other) override;
   void set_wait_stream(rtStream_t wait_stream) override { wait_stream_ = wait_stream; }
   void set_record_stream(rtStream_t record_stream) override { record_stream_ = record_stream; }
 
- private:
+ protected:
   rtEvent_t event_{nullptr};
   rtStream_t wait_stream_{nullptr};
   rtStream_t record_stream_{nullptr};
   bool need_wait_{false};
 };
+
+class AscendTimeEvent : public AscendEvent {
+ public:
+  AscendTimeEvent();
+  ~AscendTimeEvent() override = default;
+};
 }  // namespace mindspore::device::ascend
 #endif  // MINDSPORE_ASCEND_EVENT_H
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
index 6dad9375810..8be697ae463 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
@@ -78,7 +78,7 @@ constexpr size_t kPathMax = 4096;
 namespace mindspore::device::ascend {
 static thread_local rtContext_t thread_local_rt_context{nullptr};
 namespace {
-std::string GetRankId() {
+std::string GetRankIdStr() {
   auto context_ptr = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(context_ptr);
   if (!context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK)) {
@@ -155,9 +155,7 @@ void AscendKernelRuntime::ClearGraphModelMap() {
   }
 }
 
-void AscendKernelRuntime::ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &,
-                                                    const std::unordered_set<ValueNodePtr> &,
-                                                    const std::vector<CNodePtr> &) {
+void AscendKernelRuntime::ClearGraphRuntimeResource(uint32_t graph_id) {
   SetCurrentContext();
   MS_LOG(DEBUG) << "Clear graph:" << graph_id << " data dumper";
   if (auto dumper_iter = graph_data_dumper_.find(graph_id); dumper_iter != graph_data_dumper_.end()) {
@@ -252,6 +250,8 @@ void AscendKernelRuntime::ReleaseDeviceRes() {
   MS_EXCEPTION_IF_NULL(context_ptr);
   uint32_t device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
 
+  // DestroyHccl must be called before FreeDeviceMemory
+  (void)DestroyHccl();
   if (mem_manager_ != nullptr) {
     mem_manager_->FreeDeviceMemory();
   }
@@ -261,7 +261,6 @@ void AscendKernelRuntime::ReleaseDeviceRes() {
     MS_LOG(EXCEPTION) << "Reg SetTaskFailCallback failed, error: " << rt_ret;
   }
 
-  (void)DestroyHccl();
   (void)ResetDevice(device_id);
   (void)ProfilingManager::GetInstance().StopProfiling();
   current_graph_ = nullptr;
@@ -283,7 +282,32 @@ void AscendKernelRuntime::PreInit() {
   }
 }
 
+uint32_t AscendKernelRuntime::GetRankId() {
+  uint32_t rank_id;
+  auto ret = hccl::HcclAdapter::GetInstance().HcclGetRankId(&rank_id);
+  if (ret != HCCL_SUCCESS) {
+    MS_LOG(EXCEPTION) << "HcclGetRankId failed, ret:" << ret;
+  }
+  return rank_id;
+}
+
+uint32_t AscendKernelRuntime::GetRankSize() {
+  uint32_t rank_size;
+  auto ret = hccl::HcclAdapter::GetInstance().HcclGetRankSize(&rank_size);
+  if (ret != HCCL_SUCCESS) {
+    MS_LOG(EXCEPTION) << "HcclGetRankSize failed, ret:" << ret;
+  }
+  return rank_size;
+}
+
 bool AscendKernelRuntime::Init() {
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  auto execution_mode = ms_context->get_param<int>(MS_CTX_EXECUTION_MODE);
+  auto profiling_flag = ms_context->get_param<bool>(MS_CTX_ENABLE_PROFILING);
+  if (execution_mode == kPynativeMode && profiling_flag) {
+    pynative_mode_profiling_flag_ = true;
+  }
   if (initialized_) {
     SetCurrentContext();
     return true;
@@ -868,7 +892,7 @@ bool AscendKernelRuntime::HcclInit() {
     MS_LOG(ERROR) << "File path oversize";
     return false;
   }
-  std::string rank_id_str = GetRankId();
+  std::string rank_id_str = GetRankIdStr();
   auto full_path = realpath(config_path_str, nullptr);
   if (full_path == nullptr) {
     MS_LOG(ERROR) << "File path " << config_path_str << " does not exist";
@@ -876,7 +900,7 @@ bool AscendKernelRuntime::HcclInit() {
   }
   MS_LOG(INFO) << "MINDSPORE_HCCL_CONFIG_PATH : " << full_path << ", RANK_ID: " << rank_id_str;
   bool ret = hccl::HcclAdapter::GetInstance().InitHccl(context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID), rank_id_str,
-                                                       full_path);
+                                                       full_path, mode == kGraphMode);
   free(full_path);
   if (!ret) {
     MS_LOG(ERROR) << "Hcom init failed.";
@@ -948,6 +972,12 @@ std::shared_ptr<DeviceEvent> AscendKernelRuntime::CreateDeviceEvent() {
   return ascend_event;
 }
 
+std::shared_ptr<DeviceEvent> AscendKernelRuntime::CreateDeviceTimeEvent() {
+  auto ascend_time_event = std::make_shared<AscendTimeEvent>();
+  MS_EXCEPTION_IF_NULL(ascend_time_event);
+  return ascend_time_event;
+}
+
 uint64_t AscendKernelRuntime::GetAvailableMemMaxSize() const {
   auto ascend_mem_manager = std::dynamic_pointer_cast<AscendMemoryManager>(mem_manager_);
   MS_EXCEPTION_IF_NULL(ascend_mem_manager);
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h
index 7e24cde8153..ecfecfccd9c 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h
@@ -39,6 +39,8 @@ class AscendKernelRuntime : public KernelRuntime {
   AscendKernelRuntime() = default;
   ~AscendKernelRuntime() override;
   bool Init() override;
+  uint32_t GetRankId() override;
+  uint32_t GetRankSize() override;
   bool LoadData(session::KernelGraph *graph) override;
   bool GenTask(const session::KernelGraph *graph);
   bool GenDynamicKernel(const session::KernelGraph *graph) override;
@@ -47,9 +49,7 @@ class AscendKernelRuntime : public KernelRuntime {
   bool RunTask(const session::KernelGraph *graph);
   bool Load(session::KernelGraph *graph, bool is_task_sink) override;
   bool Run(session::KernelGraph *graph, bool is_task_sink) override;
-  void ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs,
-                                 const std::unordered_set<ValueNodePtr> &value_nodes,
-                                 const std::vector<CNodePtr> &execution_order) override;
+  void ClearGraphRuntimeResource(uint32_t graph_id) override;
   void ClearGlobalIdleMem() override;
   bool SyncStream() override;
   bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) override;
@@ -60,6 +60,7 @@ class AscendKernelRuntime : public KernelRuntime {
   uint64_t GetAvailableMemMaxSize() const override;
   DeviceAddressType GetTargetDeviceAddressType() const override { return DeviceAddressType::kAscend; };
   std::shared_ptr<DeviceEvent> CreateDeviceEvent() override;
+  std::shared_ptr<DeviceEvent> CreateDeviceTimeEvent() override;
   void *compute_stream() const override { return stream_; }
   void *communication_stream() const override { return communication_stream_; }
 
diff --git a/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc b/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc
index aefcb8cc553..f35d78d336f 100644
--- a/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc
@@ -133,9 +133,11 @@ void DataDumper::SetOpMappingInfo(NotNull<aicpu::dump::OpMappingInfo *> dump_inf
   }
   uint32_t graph_id = kernel_graph_->graph_id();
   uint32_t rank_id = 0;
-  auto env_table_file = common::GetEnv("RANK_TABLE_FILE");
+
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
   auto env_rank_id = common::GetEnv("RANK_ID");
-  if (!(env_table_file.empty() || env_rank_id.empty())) {
+  if (ms_context->get_param<bool>(MS_CTX_ENABLE_HCCL) && !env_rank_id.empty()) {
     // get actual rank id if it's distribution training case.
     if (!CommManager::GetInstance().GetRankID(kHcclWorldGroup, &rank_id)) {
       MS_LOG(INFO) << "Failed to get rank id.";
diff --git a/mindspore/ccsrc/runtime/device/ascend/executor/ai_core_dynamic_kernel.cc b/mindspore/ccsrc/runtime/device/ascend/executor/ai_core_dynamic_kernel.cc
index c37be1a0f43..b4279ea28ca 100644
--- a/mindspore/ccsrc/runtime/device/ascend/executor/ai_core_dynamic_kernel.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/executor/ai_core_dynamic_kernel.cc
@@ -26,6 +26,7 @@
 #include "pipeline/jit/static_analysis/static_analysis.h"
 #include "runtime/device/ascend/executor/tiling/op_tiling_adapter.h"
 #include "common/trans.h"
+#include "backend/kernel_compiler/tbe/tbe_utils.h"
 
 namespace mindspore {
 namespace device {
@@ -71,20 +72,23 @@ void AiCoreDynamicKernel::ParseCompileJson() {
   if (!AnfAlgo::IsDynamicShape(cnode)) {
     return;
   }
-  if (!AnfAlgo::HasNodeAttr(kAttrCompileInfo, cnode)) {
-    MS_LOG(EXCEPTION) << "Get compile_info failed";
-  }
-  auto compile_info_attr = AnfAlgo::GetNodeAttr<std::string>(cnode, kAttrCompileInfo);
-  MS_LOG(INFO) << "Get compile_info:" << compile_info_attr;
-  op_compile_info_.str = compile_info_attr;
-  op_compile_info_.key = "";
 
-  if (AnfAlgo::HasNodeAttr(kAttrFusionType, cnode)) {
-    auto fusion_type = AnfAlgo::GetNodeAttr<std::string>(cnode, kAttrFusionType);
-    MS_LOG(INFO) << "Get fusion_type:" << fusion_type;
-    (*compile_info_json_)["_pattern"] = fusion_type;
-    op_compile_info_.key = std::hash<std::string>{}(fusion_type);
+  MS_LOG(INFO) << "Get compile_info from attr start.";
+  std::string old_build = common::GetEnv("MS_OLD_BUILD_PROCESS");
+  if (!old_build.empty()) {
+    if (!AnfAlgo::HasNodeAttr(kAttrCompileInfo, cnode)) {
+      MS_LOG(EXCEPTION) << "Get compile info failed.";
+    }
+    op_compile_info_ = AnfAlgo::GetNodeAttr<std::string>(cnode, kAttrCompileInfo);
+  } else {
+    bool get_flag = true;
+    TbeUtils::GetCompileInfo(cnode, &op_compile_info_, &get_flag);
+    if (!get_flag) {
+      MS_LOG(EXCEPTION) << "Get compile_info failed. The compile result of [" << AnfAlgo::GetCNodeName(cnode)
+                        << "]maybe not in the json file(dir:./kernel_meta/) or the file had been deleted";
+    }
   }
+  MS_LOG(INFO) << "Get compile_info:" << op_compile_info_;
 }
 
 void AiCoreDynamicKernel::Initialize() {
@@ -135,7 +139,7 @@ void AiCoreDynamicKernel::ComputeTiling() {
   optiling::utils::OpRunInfo op_run_info_v2(-1, true, 0);
   tiling::OpTilingCalculateAdapter converter;
   ge::ComputeGraphPtr ge_graph = std::make_shared<ge::ComputeGraph>("default");
-  auto ge_node = converter.AnfNodeToGeNodeAdapter(cnode, &ge_graph, depend_tensor_map_);
+  auto ge_node = converter.AnfNodeToGeNodeAdapter(cnode, &ge_graph, depend_tensor_map_, op_compile_info_);
   (void)optiling::OpParaCalculateV2(*ge_node, op_run_info_v2);
 
   block_dim_ = op_run_info_v2.GetBlockDim();
diff --git a/mindspore/ccsrc/runtime/device/ascend/executor/ai_core_dynamic_kernel.h b/mindspore/ccsrc/runtime/device/ascend/executor/ai_core_dynamic_kernel.h
index fc19aa0c3ac..09ee5009eaa 100644
--- a/mindspore/ccsrc/runtime/device/ascend/executor/ai_core_dynamic_kernel.h
+++ b/mindspore/ccsrc/runtime/device/ascend/executor/ai_core_dynamic_kernel.h
@@ -71,7 +71,7 @@ class AiCoreDynamicKernel : public DynamicKernel {
   std::vector<int64_t> workspaces_size_;
   std::vector<DeviceAddressPtr> workspace_addr_;
   std::shared_ptr<nlohmann::json> compile_info_json_;
-  optiling::OpCompileInfo op_compile_info_{};
+  std::string op_compile_info_;
   uint32_t tiling_key_{0};
   const std::string origin_key_{""};
 
diff --git a/mindspore/ccsrc/runtime/device/ascend/executor/tiling/op_tiling_adapter.cc b/mindspore/ccsrc/runtime/device/ascend/executor/tiling/op_tiling_adapter.cc
index 5d22d300520..f7ebd026df5 100644
--- a/mindspore/ccsrc/runtime/device/ascend/executor/tiling/op_tiling_adapter.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/executor/tiling/op_tiling_adapter.cc
@@ -138,14 +138,10 @@ void OpTilingCalculateAdapter::ConvertOutputShapeAndType(const CNodePtr &node, g
 void OpTilingCalculateAdapter::ConvertCompileInfo(const CNodePtr &node, ge::OpDescPtr *op_desc) {
   MS_EXCEPTION_IF_NULL(node);
   MS_EXCEPTION_IF_NULL(*op_desc);
-  if (!AnfAlgo::HasNodeAttr(kAttrCompileInfo, node)) {
-    MS_LOG(EXCEPTION) << "Get compile_info failed";
-  }
-  auto compile_info_attr = AnfAlgo::GetNodeAttr<std::string>(node, kAttrCompileInfo);
-  MS_LOG(INFO) << "For op " << op_name_ << ", get compile_info: " << compile_info_attr;
-  std::string compile_info_key = std::to_string(std::hash<std::string>()(compile_info_attr));
+  MS_LOG(INFO) << "For op " << op_name_ << ", get compile_info: " << op_compile_info_;
+  std::string compile_info_key = std::to_string(std::hash<std::string>()(op_compile_info_));
   (void)ge::AttrUtils::SetStr(*(*op_desc), COMPILE_INFO_KEY, compile_info_key);
-  (void)ge::AttrUtils::SetStr(*(*op_desc), COMPILE_INFO_JSON, compile_info_attr);
+  (void)ge::AttrUtils::SetStr(*(*op_desc), COMPILE_INFO_JSON, op_compile_info_);
 }
 
 ge::NodePtr OpTilingCalculateAdapter::NewConstantOp(const CNodePtr &node, const std::string &name,
@@ -269,9 +265,11 @@ void OpTilingCalculateAdapter::InitOpIoName(const CNodePtr &node) {
 }
 
 ge::NodePtr OpTilingCalculateAdapter::AnfNodeToGeNodeAdapter(
-  const CNodePtr &node, ge::ComputeGraphPtr *ge_graph, const std::map<uint32_t, tensor::TensorPtr> &depend_tensor_map) {
+  const CNodePtr &node, ge::ComputeGraphPtr *ge_graph, const std::map<uint32_t, tensor::TensorPtr> &depend_tensor_map,
+  const std::string &op_compile_info) {
   MS_EXCEPTION_IF_NULL(node);
   op_name_ = AnfAlgo::GetCNodeName(node);
+  op_compile_info_ = op_compile_info;
   auto op_type = GetRealOpType(op_name_);
   (void)InitOpIoName(node);
   ge::OpDescPtr op_desc = std::make_shared<ge::OpDesc>(op_name_, op_type);
diff --git a/mindspore/ccsrc/runtime/device/ascend/executor/tiling/op_tiling_adapter.h b/mindspore/ccsrc/runtime/device/ascend/executor/tiling/op_tiling_adapter.h
index 9dbfd7ab8ca..5c92c2bfc0d 100644
--- a/mindspore/ccsrc/runtime/device/ascend/executor/tiling/op_tiling_adapter.h
+++ b/mindspore/ccsrc/runtime/device/ascend/executor/tiling/op_tiling_adapter.h
@@ -37,7 +37,8 @@ class OpTilingCalculateAdapter {
   ~OpTilingCalculateAdapter() = default;
 
   ge::NodePtr AnfNodeToGeNodeAdapter(const CNodePtr &node, ge::ComputeGraphPtr *ge_graph,
-                                     const std::map<uint32_t, tensor::TensorPtr> &depend_tensor_map);
+                                     const std::map<uint32_t, tensor::TensorPtr> &depend_tensor_map,
+                                     const std::string &op_compile_info);
 
  private:
   void ConvertInputShapeAndType(const CNodePtr &node, ge::OpDescPtr *op_desc);
@@ -55,6 +56,7 @@ class OpTilingCalculateAdapter {
   std::string GetOutputName(const CNodePtr &node, size_t index);
   void InitOpIoName(const CNodePtr &node);
   std::string op_name_;
+  std::string op_compile_info_;
   std::vector<std::string> input_names_;
   std::vector<std::string> output_names_;
 };
diff --git a/mindspore/ccsrc/runtime/device/ascend/executor/tiling/op_tiling_calculater.cc b/mindspore/ccsrc/runtime/device/ascend/executor/tiling/op_tiling_calculater.cc
deleted file mode 100644
index 69751aaa5e6..00000000000
--- a/mindspore/ccsrc/runtime/device/ascend/executor/tiling/op_tiling_calculater.cc
+++ /dev/null
@@ -1,205 +0,0 @@
-/**
- * Copyright 2019 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "runtime/device/ascend/executor/tiling/op_tiling_calculater.h"
-#include <dlfcn.h>
-#include <map>
-#include <vector>
-#include <memory>
-#include <string>
-#include <algorithm>
-#include "backend/session/anf_runtime_algorithm.h"
-#include "runtime/device/ascend/ge_types_convert.h"
-#include "utils/utils.h"
-#include "external/graph/tensor.h"
-#include "external/register/op_tiling_registry.h"
-
-namespace mindspore {
-namespace device {
-namespace ascend {
-ge::Tensor MakeTempGeTensor(const TypeId &type_id, const std::vector<size_t> &shape, const std::string &format) {
-  auto ge_type = GeTypesConvert::TransTypeIdToGeDataType(type_id);
-  std::vector<int64_t> int_shape;
-  std::transform(shape.begin(), shape.end(), std::back_inserter(int_shape), SizeToLong);
-  auto ge_format = GeTypesConvert::GetGeFormat(format, shape.size());
-  ge::Tensor ge_tensor;
-  ge_tensor.SetTensorDesc(ge::TensorDesc(ge::Shape(int_shape), ge_format, ge_type));
-  return ge_tensor;
-}
-
-void FeedTeOpTensorInputArg(const NotNull<CNodePtr> &cnode,
-                            NotNull<std::vector<optiling::TeOpTensorArg> *> tensor_arg_list) {
-  MS_LOG(INFO) << "FeedTeOpTensorInputArg start, node:" << cnode->fullname_with_scope();
-  auto input_size = AnfAlgo::GetInputTensorNum(cnode.get());
-
-  // Skip Dynamic Shape Depend Input
-
-  for (size_t i = 0; i < input_size; ++i) {
-    auto input_node_with_index = AnfAlgo::GetPrevNodeOutput(cnode.get(), i);
-    auto input_node = input_node_with_index.first;
-    auto input_index = input_node_with_index.second;
-    auto output_shape = AnfAlgo::GetOutputDeviceShape(input_node, input_index);
-    auto output_ori_shape = AnfAlgo::GetOutputInferShape(input_node, input_index);
-    auto output_format = AnfAlgo::GetOutputFormat(input_node, input_index);
-    auto output_dtype = AnfAlgo::GetOutputDeviceDataType(input_node, input_index);
-    auto iter = type_name_map.find(output_dtype);
-    if (iter == type_name_map.end()) {
-      MS_LOG(EXCEPTION) << "Cannot found typeId:" << output_dtype;
-    }
-    auto ge_output_dtype = iter->second;
-
-    optiling::TeOpTensorArg tensor_arg;
-    optiling::TeOpTensor tensor;
-    tensor_arg.arg_type = optiling::TA_SINGLE;
-    tensor.dtype = ge_output_dtype;
-    tensor.shape.insert(tensor.shape.end(), output_shape.begin(), output_shape.end());
-    tensor.ori_shape.insert(tensor.ori_shape.end(), output_ori_shape.begin(), output_ori_shape.end());
-
-    tensor.format = GeTypesConvert::GetGeTilingFormat(GeTypesConvert::GetGeFormat(output_format, output_shape.size()));
-    MS_LOG(INFO) << "Tiling Format:" << tensor.format;
-    tensor_arg.tensor.emplace_back(tensor);
-    tensor_arg_list->emplace_back(tensor_arg);
-  }
-}
-
-void FeedTeOpTensorOutputArg(const NotNull<CNodePtr> &cnode,
-                             NotNull<std::vector<optiling::TeOpTensorArg> *> tensor_arg_list) {
-  MS_LOG(INFO) << "FeedTeOpTensorOutputArg start, node:" << cnode->fullname_with_scope();
-  auto output_size = AnfAlgo::GetOutputTensorNum(cnode.get());
-  for (size_t i = 0; i < output_size; ++i) {
-    auto output_shape = AnfAlgo::GetOutputDeviceShape(cnode.get(), i);
-    auto output_ori_shape = AnfAlgo::GetOutputInferShape(cnode.get(), i);
-    auto output_format = AnfAlgo::GetOutputFormat(cnode.get(), i);
-    auto data_type = AnfAlgo::GetOutputDeviceDataType(cnode.get(), i);
-    auto iter = type_name_map.find(data_type);
-    if (iter == type_name_map.end()) {
-      MS_LOG(EXCEPTION) << "Cannot found typeId:" << data_type;
-    }
-
-    optiling::TeOpTensorArg tensor_arg;
-    optiling::TeOpTensor tensor;
-    tensor_arg.arg_type = optiling::TA_SINGLE;
-    tensor.dtype = iter->second;
-    tensor.shape.insert(tensor.shape.end(), output_shape.begin(), output_shape.end());
-    tensor.ori_shape.insert(tensor.ori_shape.end(), output_ori_shape.begin(), output_ori_shape.end());
-    tensor.format = GeTypesConvert::GetGeTilingFormat(GeTypesConvert::GetGeFormat(output_format, output_shape.size()));
-    MS_LOG(INFO) << "Tiling Format:" << tensor.format;
-    tensor_arg.tensor.emplace_back(tensor);
-    tensor_arg_list->emplace_back(tensor_arg);
-  }
-}
-
-void FeedTeOpConstTensor(const NotNull<CNodePtr> &cnode, const std::map<uint32_t, tensor::TensorPtr> &depend_tensor_map,
-                         NotNull<std::map<std::string, optiling::TeConstTensorData> *> const_inputs) {
-  MS_LOG(INFO) << "FeedTeOpConstTensor start, node:" << cnode->fullname_with_scope();
-  auto depends_list_me = abstract::GetDependsFormMap(cnode);
-  if (depends_list_me.empty()) {
-    MS_LOG(INFO) << "No input depend found, " << cnode->fullname_with_scope();
-    return;
-  }
-
-  std::vector<int> depends_list;
-  (void)std::transform(depends_list_me.begin(), depends_list_me.end(), std::back_inserter(depends_list),
-                       [](const int64_t &value) { return static_cast<int>(value); });
-  for (auto index : depends_list) {
-    auto iter = depend_tensor_map.find(IntToSize(index));
-    if (iter == depend_tensor_map.end()) {
-      MS_LOG(EXCEPTION) << "Index not found in depend_tensor_map";
-    }
-
-    auto const_tensor = iter->second;
-
-    auto have_input_names_attr = AnfAlgo::HasNodeAttr("input_names", cnode);
-    if (!have_input_names_attr) {
-      MS_LOG(EXCEPTION) << "cnode:" << cnode->fullname_with_scope() << " no input_names attr";
-    }
-    auto input_names_attr = AnfAlgo::GetNodeAttr<std::vector<std::string>>(cnode.get(), "input_names");
-    if (IntToSize(index) >= input_names_attr.size()) {
-      MS_LOG(EXCEPTION) << "input index" << index << " >= input_name_attr.size:" << input_names_attr.size();
-    }
-    auto input_name = input_names_attr[index];
-    MS_LOG(INFO) << "input_name is " << input_name;
-    auto type_id = AnfAlgo::GetPrevNodeOutputDeviceDataType(cnode.get(), IntToSize(index));
-    auto shape = AnfAlgo::GetPrevNodeOutputInferShape(cnode.get(), IntToSize(index));
-    auto format = AnfAlgo::GetPrevNodeOutputFormat(cnode.get(), IntToSize(index));
-    const_inputs->try_emplace(
-      input_name,
-      optiling::TeConstTensorData{static_cast<const uint8_t *>(const_tensor->data_c()),
-                                  IntToSize(const_tensor->DataSize()), MakeTempGeTensor(type_id, shape, format)});
-  }
-  MS_LOG(INFO) << "FeedTeOpConstTensor end";
-}
-
-void OpTilingCalculater::Init() {
-  MS_LOG(INFO) << "Start init OpTilingCalculater";
-  tiling_func_map_ = optiling::OpTilingRegistryInterf::RegisteredOpInterf();
-  if (tiling_func_map_.empty()) {
-    MS_LOG(EXCEPTION) << "Get register tiling func failed.";
-  }
-}
-
-std::string GetRealOpType(const std::string &op_type) {
-  static const std::map<std::string, std::string> kOpTypeMap = {
-    {"SparseApplyFtrl", "SparseApplyFtrlD"},
-    {"SparseApplyProximalAdagrad", "SparseApplyProximalAdagradD"},
-    {"SparseGatherV2", "Gather"},
-    {"Pad", "PadD"},
-    {"Concat", "ConcatD"},
-    {"Softmax", "SoftmaxV2"},
-    {"DropoutDoMask", "DropOutDoMask"},
-  };
-  auto iter = kOpTypeMap.find(op_type);
-  if (iter == kOpTypeMap.end()) {
-    return op_type;
-  }
-  return iter->second;
-}
-
-void OpTilingCalculater::CalculateTiling(const NotNull<CNodePtr> &cnode, const optiling::OpCompileInfo &op_compile_info,
-                                         const std::map<uint32_t, tensor::TensorPtr> &depend_tensor_map,
-                                         const NotNull<optiling::OpRunInfo *> op_run_info) {
-  optiling::TeOpParas op_param;
-  std::string op_type = AnfAlgo::GetCNodeName(cnode.get());
-  MS_LOG(INFO) << "[DynamicShape] calculate tiling, op_type:" << op_type;
-
-  FeedTeOpTensorInputArg(cnode, NOT_NULL(&op_param.inputs));
-  FeedTeOpTensorOutputArg(cnode, NOT_NULL(&op_param.outputs));
-  FeedTeOpConstTensor(cnode, depend_tensor_map, NOT_NULL(&op_param.const_inputs));
-
-  op_type = GetRealOpType(op_type);
-  auto iter = tiling_func_map_.find(op_type);
-  if (iter == tiling_func_map_.end()) {
-    iter = tiling_func_map_.find("AutoTiling");
-    if (iter == tiling_func_map_.end()) {
-      MS_LOG(EXCEPTION) << "AutoTiling Func Not Found";
-    }
-  }
-
-  MS_LOG(INFO) << "Get tiling func:" << iter->first;
-
-  if (iter != tiling_func_map_.end()) {
-    bool ret = (iter->second)(op_param, op_compile_info, *op_run_info);
-    if (!ret) {
-      MS_LOG(EXCEPTION) << "Calculate tiling failed";
-    }
-  } else {
-    MS_LOG(EXCEPTION) << "Tiling func not found";
-  }
-  MS_LOG(INFO) << "CalculateTiling success";
-}
-}  // namespace ascend
-}  // namespace device
-}  // namespace mindspore
diff --git a/mindspore/ccsrc/runtime/device/ascend/executor/tiling/op_tiling_calculater.h b/mindspore/ccsrc/runtime/device/ascend/executor/tiling/op_tiling_calculater.h
deleted file mode 100644
index 17c4262f199..00000000000
--- a/mindspore/ccsrc/runtime/device/ascend/executor/tiling/op_tiling_calculater.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/**
- * Copyright 2019 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_TILING_OP_TILING_CALCULATE_H_
-#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_TILING_OP_TILING_CALCULATE_H_
-
-#include <map>
-#include <memory>
-#include <string>
-#include "utils/ms_utils.h"
-#include "utils/contract.h"
-#include "ir/anf.h"
-#include "ir/tensor.h"
-#include "register/op_tiling.h"
-#include "abstract/primitive_infer_map.h"
-
-namespace mindspore {
-namespace device {
-namespace ascend {
-class OpTilingCalculater {
- public:
-  static OpTilingCalculater &GetInstance() {
-    static OpTilingCalculater instance;
-    return instance;
-  }
-
-  void Init();
-  void CalculateTiling(const NotNull<CNodePtr> &cnode, const optiling::OpCompileInfo &op_compile_info,
-                       const std::map<uint32_t, tensor::TensorPtr> &depend_tensor_map,
-                       NotNull<optiling::OpRunInfo *> op_run_info);
-
- private:
-  OpTilingCalculater() = default;
-  ~OpTilingCalculater() = default;
-  DISABLE_COPY_AND_ASSIGN(OpTilingCalculater);
-
-  std::map<std::string, optiling::OpTilingFunc> tiling_func_map_;
-};
-}  // namespace ascend
-}  // namespace device
-}  // namespace mindspore
-#endif  // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_TILING_OP_TILING_CALCULATE_H_
diff --git a/mindspore/ccsrc/runtime/device/ascend/kernel_build_ascend.cc b/mindspore/ccsrc/runtime/device/ascend/kernel_build_ascend.cc
index d1001dc7e11..51c30ee3441 100644
--- a/mindspore/ccsrc/runtime/device/ascend/kernel_build_ascend.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/kernel_build_ascend.cc
@@ -24,6 +24,7 @@
 #include "runtime/device/ascend/kernel_select_ascend.h"
 #include "runtime/device/kernel_info.h"
 #include "backend/kernel_compiler/kernel.h"
+#include "backend/kernel_compiler/tbe/ascend_kernel_compile.h"
 #include "backend/kernel_compiler/tbe/tbe_kernel_parallel_build.h"
 #include "backend/kernel_compiler/akg/ascend/akg_ascend_kernel_build.h"
 #include "backend/kernel_compiler/aicpu/aicpu_kernel_build.h"
@@ -96,11 +97,28 @@ static bool KernelBuildParallelCompile(const std::vector<CNodePtr> &kernels) {
       }
     }
   }
-  bool tbe_ret = kernel::TbeOpParallelBuild(tbe_nodes);
-  kernel::AkgAscendKernelBuilder akg_ascend_kernel_builder;
-  bool akg_ret = akg_ascend_kernel_builder.AkgKernelParallelBuild(akg_nodes);
+  bool tbe_ret = true;
+  bool akg_ret = true;
   auto bin_map = kernel::tbe::KernelMeta::GetInstance();
-  (void)bin_map->ReadIndex(kernel::kCceKernelMeta);
+  if (!tbe_nodes.empty()) {
+    std::string old_build = common::GetEnv("MS_OLD_BUILD_PROCESS");
+    if (!old_build.empty()) {
+      tbe_ret = kernel::TbeOpParallelBuild(tbe_nodes);
+    } else {
+      auto build_manager = kernel::ascend::AscendKernelCompileManager::GetInstance();
+      MS_EXCEPTION_IF_NULL(build_manager);
+      build_manager->ResetOldTask();
+      tbe_ret = build_manager->AscendSingleOpCompile(tbe_nodes);
+    }
+    auto config_path = TbeUtils::GetOpDebugPath();
+    std::string dir = config_path + "kernel_meta/";
+    (void)bin_map->ReadIndex(dir);
+  }
+  if (!akg_nodes.empty()) {
+    kernel::AkgAscendKernelBuilder akg_ascend_kernel_builder;
+    akg_ret = akg_ascend_kernel_builder.AkgKernelParallelBuild(akg_nodes);
+    (void)bin_map->ReadIndex(kernel::kCceKernelMeta);
+  }
   for (const auto &anf_node : other_nodes) {
     kernel::KernelModPtr kernel_mod_ptr = SerialCompileImpl(anf_node);
     MS_EXCEPTION_IF_NULL(kernel_mod_ptr);
@@ -223,6 +241,11 @@ static bool IsAtomicNode(const CNodePtr &kernel_node) {
       MS_LOG(EXCEPTION) << "Atomic addr clean doesn't support clean input address, input index: " << j;
     }
   }
+
+  if (parameters_indexs.size() < total_num) {
+    MS_LOG(EXCEPTION) << "parameters indexes size: " << parameters_indexs.size()
+                      << " less than total num: " << total_num;
+  }
   // process output
   std::vector<size_t> output_indexs = {};
   if (AnfAlgo::HasNodeAttr(kAttrAtomicOutputIndexs, kernel_node)) {
diff --git a/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_manager.cc b/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_manager.cc
index 666d266bc74..ce5ff4a4be0 100644
--- a/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_manager.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_manager.cc
@@ -146,6 +146,11 @@ Status ProfilingManager::GetProfConf(const NotNull<MsprofGeOptions *> prof) {
 bool ProfilingManager::StartupProfiling(uint32_t device_id) {
   auto is_profiling = IsProfiling();
   if (!is_profiling) {
+    int32_t cb_ret = MsprofInit(0XFF, nullptr, 0);
+    if (cb_ret != UintToInt(PROF_SUCCESS)) {
+      MS_LOG(ERROR) << "Call msprofCtrlCallback failed, ret: " << cb_ret;
+      return false;
+    }
     MS_LOG(INFO) << "No need profiling. please export PROFILING_MODE and in train mode.";
     return true;
   }
@@ -181,15 +186,14 @@ uint32_t GetCurrentDeviceId() {
 bool ProfilingManager::ProfStartUp(const NotNull<MsprofGeOptions *> prof_conf) const {
   MS_LOG(INFO) << "Prof start up. ";
 
-  if (prof_cb_.msprofCtrlCallback == nullptr) {
-    MS_LOG(ERROR) << "MsprofCtrlCallback callback is nullptr.";
-    return false;
+  bool ret = ProfRegisterCtrlCallback();
+  if (ret == false) {
+    return ret;
   }
 
   // call profiling start up api
-  int32_t cb_ret =
-    prof_cb_.msprofCtrlCallback(static_cast<uint32_t>(MsprofCtrlCallbackType::MSPROF_CTRL_INIT_GE_OPTIONS),
-                                static_cast<void *>(prof_conf.get()), sizeof(MsprofGeOptions));
+  int32_t cb_ret = MsprofInit(static_cast<uint32_t>(MsprofCtrlCallbackType::MSPROF_CTRL_INIT_GE_OPTIONS),
+                              static_cast<void *>(prof_conf.get()), sizeof(MsprofGeOptions));
   if (cb_ret != UintToInt(PROF_SUCCESS)) {
     MS_LOG(ERROR) << "Call msprofCtrlCallback failed, ret: " << cb_ret;
     return false;
@@ -199,6 +203,30 @@ bool ProfilingManager::ProfStartUp(const NotNull<MsprofGeOptions *> prof_conf) c
   return true;
 }
 
+bool ProfilingManager::ProfRegisterCtrlCallback() const {
+  rtError_t rt_ret = rtProfRegisterCtrlCallback(GE, CtrlCallbackHandle);
+  if (rt_ret != RT_ERROR_NONE) {
+    MS_LOG(ERROR) << "Call rtProfRegisterCtrlCallback failed.";
+    return false;
+  }
+
+  return true;
+}
+
+rtError_t CtrlCallbackHandle(uint32_t rt_type, void *data, uint32_t len) {
+  if (rt_type == RT_PROF_CTRL_REPORTER) {
+    ProfilingManager::GetInstance().SetMsprofReporterCallback(reinterpret_cast<MsprofReporterCallback>(data));
+    MS_LOG(INFO) << "Set MsprofReporterCallback success.";
+  } else if (rt_type == RT_PROF_CTRL_SWITCH) {
+    Status ret = ProfCtrlSwitchHandle(data);
+    if (ret != PROF_SUCCESS) {
+      MS_LOG(ERROR) << "Start runtime profiler failed.";
+    }
+  }
+
+  return RT_ERROR_NONE;
+}
+
 bool ProfilingManager::StopProfiling() {
   MS_LOG(INFO) << "StopProfiling";
   if (!IsProfiling()) {
@@ -208,26 +236,11 @@ bool ProfilingManager::StopProfiling() {
 
   // plugin unregister
   PluginUnInit();
-  // stop runtime profiler
-  auto module = GetProfilingModule();
-  uint32_t device_ids[kProfilingDeviceNum] = {GetCurrentDeviceId()};
-
-  auto rt_ret = rtProfilerStop(module, kProfilingDeviceNum, device_ids);
-  if (rt_ret != UintToInt(RT_ERROR_NONE)) {
-    MS_LOG(ERROR) << "Call rtProfilerStop failed";
-    return false;
-  }
 
   // stop profiling
-  if (prof_cb_.msprofCtrlCallback == nullptr) {
-    MS_LOG(ERROR) << "MsprofCtrlCallback callback is nullptr.";
-    return false;
-  }
-
-  int32_t cb_ret =
-    prof_cb_.msprofCtrlCallback(static_cast<uint32_t>(MsprofCtrlCallbackType::MSPROF_CTRL_FINALIZE), nullptr, 0);
+  int32_t cb_ret = MsprofFinalize();
   if (cb_ret != 0) {
-    MS_LOG(WARNING) << "Call msprofCtrlCallback failed, ret: " << cb_ret;
+    MS_LOG(WARNING) << "Call MsprofFinalize failed, ret: " << cb_ret;
     return false;
   }
   return true;
@@ -273,28 +286,18 @@ Status RegProfSetDeviceCallback(MsprofSetDeviceCallback func) {
   return PROF_SUCCESS;
 }
 
-Status RegProfReporterCallback(MsprofReporterCallback func) {
-  if (func == nullptr) {
-    MS_LOG(ERROR) << "MsprofReporterCallback callback is nullptr.";
+Status ProfCtrlSwitchHandle(void *data) {
+  if (data == nullptr) {
+    MS_LOG(ERROR) << "Ctrl switch handl data is nullptr.";
     return PROF_FAILED;
   }
-  if (ProfilingManager::GetInstance().GetMsprofCallback().msprofReporterCallback != nullptr) {
-    MS_LOG(WARNING) << "Msprof reporter callback is exist, just ignore it.";
-  } else {
-    MS_LOG(INFO) << "GE register Msprof reporter callback.";
-    ProfilingManager::GetInstance().SetMsprofReporterCallback(func);
-    // Pass MsprofReporterCallback to runtime
-    rtError_t rt_ret = rtSetMsprofReporterCallback(func);
-    if (rt_ret != UintToInt(PROF_SUCCESS)) {
-      MS_LOG(WARNING) << "Pass MsprofReporterCallback to runtime failed, ret: " << rt_ret;
-      return IntToUint(rt_ret);
-    }
-    // Pass MsprofReporterCallback to hccl
-  }
-  return PROF_SUCCESS;
+
+  rtProfCommandHandle_t *prof_config_param = reinterpret_cast<rtProfCommandHandle_t *>(data);
+  auto type = static_cast<ProfCommandHandleType>(prof_config_param->type);
+  return ProfCommandHandle(type);
 }
 
-Status ProfCommandHandle(ProfCommandHandleType type, void *, uint32_t) {
+Status ProfCommandHandle(ProfCommandHandleType type) {
   MS_LOG(INFO) << "ProfCommandHandle start, type:" << type;
   if (type == kProfCommandhandleInit) {
     auto cb_ret = ProfilingManager::GetInstance().PluginInit();
@@ -302,25 +305,10 @@ Status ProfCommandHandle(ProfCommandHandleType type, void *, uint32_t) {
       MS_LOG(ERROR) << "Profiling plugin int failed.";
       return PROF_FAILED;
     }
-
-    // call runtime profiler API
-    auto module = GetProfilingModule();
-    auto device_id = GetCurrentDeviceId();
-    auto ret = rtProfilerStart(module, kProfilingDeviceNum, &device_id);
-    if (ret != RT_ERROR_NONE) {
-      MS_LOG(ERROR) << "Call rtProfilerStart failed, ret:" << ret;
-      return PROF_FAILED;
-    }
   }
+
   return PROF_SUCCESS;
 }
-
-bool DoRegiste() noexcept {
-  MS_LOG(INFO) << "VM profiling register start";
-  return VMCallbackRegister::GetInstance().Register(RegProfCtrlCallback, RegProfSetDeviceCallback,
-                                                    RegProfReporterCallback, ProfCommandHandle);
-}
-static bool doRegiste = DoRegiste();
 }  // namespace ascend
 }  // namespace device
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_manager.h b/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_manager.h
index d6b57f373b9..676cf06f95a 100644
--- a/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_manager.h
+++ b/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_manager.h
@@ -25,6 +25,8 @@
 #include "utils/ms_context.h"
 #include "toolchain/prof_callback.h"
 #include "toolchain/prof_acl_api.h"
+#include "toolchain/slog.h"
+#include "runtime/base.h"
 #include "runtime/device/ascend/profiling/profiling_callback_register.h"
 
 using std::map;
@@ -43,6 +45,7 @@ class ProfilingManager {
   static ProfilingManager &GetInstance();
   uint64_t GetJobId() const;
   bool ReportProfilingData(const map<uint32_t, string> &op_taskId_map) const;
+  bool ProfRegisterCtrlCallback() const;
   bool StartupProfiling(uint32_t device_id);
   bool StopProfiling();
 
@@ -75,7 +78,9 @@ class ProfilingManager {
 Status RegProfCtrlCallback(MsprofCtrlCallback func);
 Status RegProfSetDeviceCallback(MsprofSetDeviceCallback func);
 Status RegProfReporterCallback(MsprofReporterCallback func);
-Status ProfCommandHandle(ProfCommandHandleType type, void *data, uint32_t len);
+Status ProfCommandHandle(ProfCommandHandleType type);
+rtError_t CtrlCallbackHandle(uint32_t rt_type, void *data, uint32_t len);
+Status ProfCtrlSwitchHandle(void *data);
 }  // namespace ascend
 }  // namespace device
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/runtime/device/ascend/tasksink/task_generator.cc b/mindspore/ccsrc/runtime/device/ascend/tasksink/task_generator.cc
index 9c20e049259..65e152807f1 100644
--- a/mindspore/ccsrc/runtime/device/ascend/tasksink/task_generator.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/tasksink/task_generator.cc
@@ -46,9 +46,8 @@ bool TaskGenerator::GenTasks(const std::vector<CNodePtr> &anf_node_list, std::ve
   auto context_ptr = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(context_ptr);
   bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
-  auto save_graphs_path = context_ptr->get_param<std::string>(MS_CTX_SAVE_GRAPHS_PATH);
   if (save_graphs) {
-    std::string file_path = save_graphs_path + "/" + "task_info" + "_graph_" + std::to_string(graph_id) + ".ir";
+    std::string file_path = GetSaveGraphsPathName("task_info_graph_" + std::to_string(graph_id) + ".ir");
     DumpTaskInfo(file_path);
   }
   return true;
diff --git a/mindspore/ccsrc/runtime/device/executor/dynamic_kernel.cc b/mindspore/ccsrc/runtime/device/executor/dynamic_kernel.cc
index d924b0d94e0..1c7041a094c 100644
--- a/mindspore/ccsrc/runtime/device/executor/dynamic_kernel.cc
+++ b/mindspore/ccsrc/runtime/device/executor/dynamic_kernel.cc
@@ -68,7 +68,9 @@ void DynamicKernel::RebuildDependTensor() {
     auto host_type = AnfAlgo::GetOutputInferDataType(pre_node_with_index.first, pre_node_with_index.second);
     auto out_tensor = std::make_shared<tensor::Tensor>(host_type, shapes);
     MS_EXCEPTION_IF_NULL(out_tensor);
-    out_tensor->set_device_address(output_addr);
+    // The second parameter must be false, otherwise the device address cannot be released and allocated, and the
+    // address size will be wrong in the dynamic shape scenario.
+    out_tensor->set_device_address(output_addr, false);
     auto ret = depend_tensor_map_.try_emplace(depend, out_tensor);
     if (!ret.second) {
       MS_LOG(EXCEPTION) << "Insert map failed";
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_event.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_event.cc
index 41ead402a70..d0da14d2e4a 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_event.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_event.cc
@@ -42,5 +42,19 @@ void GpuEvent::RecordEvent() {
   need_wait_ = true;
 }
 
+void GpuEvent::SyncEvent() {
+  MS_EXCEPTION_IF_NULL(event_);
+  CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(cudaEventSynchronize(event_), "cudaEventSynchronize failed");
+}
+
+void GpuEvent::ElapsedTime(float *cost_time, DeviceEvent *other) {
+  MS_EXCEPTION_IF_NULL(event_);
+  auto gpu_event = static_cast<GpuEvent *>(other);
+  MS_EXCEPTION_IF_NULL(gpu_event);
+  MS_EXCEPTION_IF_NULL(gpu_event->event_);
+  CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(cudaEventElapsedTime(cost_time, event_, gpu_event->event_),
+                                     "cudaEventElapsedTime failed");
+}
+
 bool GpuEvent::NeedWait() { return need_wait_; }
 }  // namespace mindspore::device::gpu
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_event.h b/mindspore/ccsrc/runtime/device/gpu/gpu_event.h
index a5cd50e0be0..443f689054e 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_event.h
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_event.h
@@ -29,6 +29,8 @@ class GpuEvent : public DeviceEvent {
   void WaitEvent() override;
   void RecordEvent() override;
   bool NeedWait() override;
+  void SyncEvent() override;
+  void ElapsedTime(float *cost_time, DeviceEvent *other) override;
   void set_wait_stream(void *wait_stream) override { wait_stream_ = static_cast<cudaStream_t>(wait_stream); }
   void set_record_stream(void *record_stream) override { record_stream_ = static_cast<cudaStream_t>(record_stream); }
 
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
index a6ba90f0ee4..082990bb436 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
@@ -16,6 +16,7 @@
 #include "runtime/device/gpu/gpu_kernel_runtime.h"
 #include <algorithm>
 #include <map>
+#include "debug/anf_ir_utils.h"
 #include "runtime/device/gpu/gpu_device_address.h"
 #include "runtime/device/gpu/cuda_driver.h"
 #include "runtime/device/gpu/gpu_event.h"
@@ -150,7 +151,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
     for (size_t j = 0; j < input_size; ++j) {
       auto input_kernel = kernel->input(j + 1);
       MS_EXCEPTION_IF_NULL(input_kernel);
-      std::string input_kernel_name = input_kernel->fullname_with_scope();
+      std::string input_kernel_name = GetKernelNodeName(input_kernel);
       auto addr = kernel_inputs[j];
       auto type = AnfAlgo::GetOutputInferDataType(input_kernel, PARAMETER_OUTPUT_INDEX);
       // For example, this happens with the Depend op
@@ -282,12 +283,8 @@ void GPUKernelRuntime::ReleaseDeviceRes() {
   }
 }
 
-void GPUKernelRuntime::ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs,
-                                                 const std::unordered_set<ValueNodePtr> &value_nodes,
-                                                 const std::vector<CNodePtr> &execution_order) {
+void GPUKernelRuntime::ClearGraphRuntimeResource(uint32_t graph_id) {
   MS_LOG(INFO) << "Clear graph:" << graph_id << " GPU runtime resource";
-  // Clear the output address of graph.
-  ClearOutputAddress(inputs, value_nodes, execution_order);
   graph_output_map_.erase(graph_id);
 }
 
@@ -799,7 +796,8 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
   }
   if (!mock) {
     // collect weights and bias for dump mode
-    debugger_->LoadParametersAndConst();
+    auto kernel_graph_ptr = std::make_shared<session::KernelGraph>(*graph);
+    debugger_->LoadParametersAndConst(kernel_graph_ptr);
     auto context_ptr = MsContext::GetInstance();
     MS_EXCEPTION_IF_NULL(context_ptr);
     if (context_ptr->get_param<int>(MS_CTX_EXECUTION_MODE) != kPynativeMode) {
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h
index 92d461a10fc..ff89a882528 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h
@@ -42,9 +42,7 @@ class GPUKernelRuntime : public KernelRuntime {
   ~GPUKernelRuntime() override = default;
   bool Init() override;
   void ReleaseDeviceRes() override;
-  void ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs,
-                                 const std::unordered_set<ValueNodePtr> &value_nodes,
-                                 const std::vector<CNodePtr> &execution_order) override;
+  void ClearGraphRuntimeResource(uint32_t graph_id) override;
   void AssignMemory(session::KernelGraph *graph) override;
   bool Run(session::KernelGraph *graph, bool is_task_sink) override;
   bool GenDynamicKernel(const session::KernelGraph *graph) override { return true; }
diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.cc b/mindspore/ccsrc/runtime/device/kernel_runtime.cc
index 8b1343d43c8..49d8515a7e1 100644
--- a/mindspore/ccsrc/runtime/device/kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/kernel_runtime.cc
@@ -32,6 +32,7 @@
 #include "utils/utils.h"
 #include "frontend/parallel/context.h"
 #include "debug/env_config_parser.h"
+#include "runtime/device/pynative_profiling.h"
 #if ((defined ENABLE_CPU) && (!defined _WIN32))
 #include "ps/ps_cache/ps_cache_manager.h"
 #endif
@@ -966,6 +967,47 @@ void KernelRuntime::LaunchKernelEvent(const std::vector<std::vector<std::functio
   }
 }
 
+bool KernelRuntime::LaunchKernelWithPynativeProfiling(kernel::KernelMod *kernel_mod, const std::string &op_name,
+                                                      const std::vector<AddressPtr> &inputs,
+                                                      const std::vector<AddressPtr> &workspace,
+                                                      const std::vector<AddressPtr> &outputs, void *stream) {
+  MS_EXCEPTION_IF_NULL(kernel_mod);
+  MS_EXCEPTION_IF_NULL(stream);
+  float cost_time = 0;
+  auto start = CreateDeviceTimeEvent();
+  auto end = CreateDeviceTimeEvent();
+  MS_EXCEPTION_IF_NULL(start);
+  MS_EXCEPTION_IF_NULL(end);
+  start->set_record_stream(stream);
+  end->set_record_stream(stream);
+  start->RecordEvent();
+  bool ret = kernel_mod->Launch(inputs, workspace, outputs, stream);
+  end->RecordEvent();
+  start->SyncEvent();
+  end->SyncEvent();
+  start->ElapsedTime(&cost_time, end.get());
+  auto launch_end_time = GetTime();
+  auto &profiler_inst = PynativeProfiler::GetInstance();
+  double launch_start_time = launch_end_time - cost_time / kBasicTimeTransferUnit;
+  auto op_launch_start_time_end_time = std::make_pair(launch_start_time, launch_end_time);
+  profiler_inst.SetOpNameAndLaunchTime(std::make_pair(op_name, op_launch_start_time_end_time));
+  if (!ret) {
+    MS_LOG(EXCEPTION) << "Launch kernel failed, kernel name is : " << op_name;
+  }
+  return ret;
+}
+
+void KernelRuntime::DebugStreamSync(const CNodePtr &kernel) {
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  auto enable_sync_run = ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_SYNCHRONIZE);
+  if (enable_sync_run) {
+    if (!SyncStream()) {
+      MS_LOG(EXCEPTION) << "Op " << kernel->fullname_with_scope() << " run failed!";
+    }
+  }
+}
+
 bool KernelRuntime::LaunchKernelMod(const session::KernelGraph &graph) {
   const auto &kernels = graph.execution_order();
   std::vector<DynamicKernelPtr> dynamic_kernel_list;
@@ -1017,18 +1059,37 @@ bool KernelRuntime::LaunchKernelMod(const session::KernelGraph &graph) {
       AddressPtrList kernel_inputs;
       AddressPtrList kernel_workspaces;
       AddressPtrList kernel_outputs;
-      GenLaunchArgs(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs);
+      auto ms_context = MsContext::GetInstance();
+      MS_EXCEPTION_IF_NULL(ms_context);
+      if (ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET) != kAscendDevice) {
+        GenLaunchArgs(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs);
+      } else {
+        kernel_inputs = kernel_mod->GetInputsAddr();
+        kernel_workspaces = kernel_mod->GetWorkSpacesAddr();
+        kernel_outputs = kernel_mod->GetOutputsAddr();
+      }
       bool ret;
       if (AnfAlgo::IsCommunicationOp(kernel)) {
-        ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, communication_stream_);
+        if (pynative_mode_profiling_flag_) {
+          ret = LaunchKernelWithPynativeProfiling(kernel_mod, kernel->fullname_with_scope(), kernel_inputs,
+                                                  kernel_workspaces, kernel_outputs, communication_stream_);
+        } else {
+          ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, communication_stream_);
+        }
       } else {
-        ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_);
+        if (pynative_mode_profiling_flag_) {
+          ret = LaunchKernelWithPynativeProfiling(kernel_mod, kernel->fullname_with_scope(), kernel_inputs,
+                                                  kernel_workspaces, kernel_outputs, stream_);
+        } else {
+          ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_);
+        }
       }
       if (!ret) {
         MS_LOG(ERROR) << "Launch kernel failed.";
         return false;
       }
       KernelLaunchProfiling(kernel->fullname_with_scope());
+      DebugStreamSync(kernel);
     }
     LaunchKernelEvent(kernel_post_run_events, i);
   }
@@ -1053,54 +1114,10 @@ bool KernelRuntime::LaunchKernel(const session::KernelGraph *graph) {
   return true;
 }
 
-void KernelRuntime::ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &,
-                                              const std::unordered_set<ValueNodePtr> &, const std::vector<CNodePtr> &) {
+void KernelRuntime::ClearGraphRuntimeResource(uint32_t graph_id) {
   MS_LOG(INFO) << "Clear graph:" << graph_id << " runtime resource";
 }
 
-void KernelRuntime::ClearOutputAddress(const std::vector<AnfNodePtr> &inputs,
-                                       const std::unordered_set<ValueNodePtr> &value_nodes,
-                                       const std::vector<CNodePtr> &execution_order) {
-  // clear input parameter output address.
-  for (const auto &input_node : inputs) {
-    MS_EXCEPTION_IF_NULL(input_node);
-    if (!input_node->isa<Parameter>()) {
-      continue;
-    }
-    auto parameter = input_node->cast<ParameterPtr>();
-    MS_EXCEPTION_IF_NULL(parameter);
-    parameter->DecreaseUsedGraphCount();
-    // Only the parameter has no graph used, then clear the output address.
-    if (parameter->used_graph_count() != 0) {
-      continue;
-    }
-    size_t output_num = AnfAlgo::GetOutputTensorNum(input_node);
-    for (size_t index = 0; index < output_num; ++index) {
-      if (!AnfAlgo::OutputAddrExist(input_node, index)) {
-        continue;
-      }
-      AnfAlgo::SetOutputAddr(nullptr, index, input_node.get());
-    }
-  }
-  // clear input value node output address.
-  for (const auto &value_node : value_nodes) {
-    if (!AnfAlgo::OutputAddrExist(value_node, 0)) {
-      continue;
-    }
-    AnfAlgo::SetOutputAddr(nullptr, 0, value_node.get());
-  }
-  // clear cnode output address.
-  for (const auto &cnode : execution_order) {
-    size_t output_num = AnfAlgo::GetOutputTensorNum(cnode);
-    for (size_t index = 0; index < output_num; ++index) {
-      if (!AnfAlgo::OutputAddrExist(cnode, index)) {
-        continue;
-      }
-      AnfAlgo::SetOutputAddr(nullptr, index, cnode.get());
-    }
-  }
-}
-
 #if ((defined ENABLE_CPU) && (!defined _WIN32))
 void KernelRuntime::GetFirstPSEmbeddingCache(const session::KernelGraph *graph,
                                              AnfNodePtr *const first_cache_input_index,
diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.h b/mindspore/ccsrc/runtime/device/kernel_runtime.h
index d3c7d2b1d1e..b846f697f76 100644
--- a/mindspore/ccsrc/runtime/device/kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/kernel_runtime.h
@@ -52,6 +52,8 @@ class KernelRuntime {
   KernelRuntime() = default;
   virtual ~KernelRuntime();
   virtual bool Init() = 0;
+  virtual uint32_t GetRankId() { MS_LOG(EXCEPTION) << "Not Implement"; }
+  virtual uint32_t GetRankSize() { MS_LOG(EXCEPTION) << "Not Implement"; }
   virtual void AssignMemory(session::KernelGraph *graph);
   void RunOpAssignMemory(const std::vector<tensor::TensorPtr> &input_tensors, session::KernelGraph *graph);
   void RunOpClearMemory(const session::KernelGraph *graph) const;
@@ -65,12 +67,7 @@ class KernelRuntime {
   bool LaunchKernel(const session::KernelGraph *graph);
   virtual void AssignStaticMemoryInput(const session::KernelGraph *graph);
   virtual void AssignStaticMemoryValueNode(session::KernelGraph *graph);
-  virtual void ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs,
-                                         const std::unordered_set<ValueNodePtr> &value_nodes,
-                                         const std::vector<CNodePtr> &execution_order);
-  virtual void ClearOutputAddress(const std::vector<AnfNodePtr> &inputs,
-                                  const std::unordered_set<ValueNodePtr> &value_nodes,
-                                  const std::vector<CNodePtr> &execution_order);
+  virtual void ClearGraphRuntimeResource(uint32_t graph_id);
   virtual bool SyncStream() = 0;
   virtual bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) = 0;
   virtual void ClearGlobalIdleMem() {}
@@ -103,6 +100,7 @@ class KernelRuntime {
   virtual uint64_t GetAvailableMemMaxSize() const { return 0; }
   void GenKernelEvents(const session::KernelGraph *graph);
   virtual std::shared_ptr<DeviceEvent> CreateDeviceEvent() { return nullptr; }
+  virtual std::shared_ptr<DeviceEvent> CreateDeviceTimeEvent() { return nullptr; }
   virtual DeviceAddressType GetTargetDeviceAddressType() const = 0;
   virtual void *compute_stream() const { return nullptr; }
   virtual void *communication_stream() const { return nullptr; }
@@ -132,6 +130,7 @@ class KernelRuntime {
   void AssignStaticMemoryOutput(const session::KernelGraph *graph);
   bool LaunchKernelMod(const session::KernelGraph &graph);
   void LaunchKernelEvent(const std::vector<std::vector<std::function<void()>>> &run_events, size_t index);
+  void DebugStreamSync(const CNodePtr &kernel);
   static void GenAddrCleanLaunchArgs(const CNodePtr &cnode, AddressPtrList *kernel_inputs);
   void RunOpAssignInputMemory(const std::vector<tensor::TensorPtr> &input_tensors, const session::KernelGraph *graph);
   void RunOpAssignOutputMemory(const AnfNodePtr &kernel);
@@ -139,6 +138,10 @@ class KernelRuntime {
   void RunOpAssignOutputNodeMemory(const ValuePtr &pre_output_value, session::KernelGraph *graph);
   void AssignValueNodeTensor(const ValueNodePtr &value_node, const ValuePtr &node_value, size_t output_idx);
   DeviceAddressPtr PreAssignCNodeMemory(const AnfNodePtr &anf_node, size_t index);
+  bool LaunchKernelWithPynativeProfiling(kernel::KernelMod *kernel_mod, const std::string &op_name,
+                                         const std::vector<AddressPtr> &inputs,
+                                         const std::vector<AddressPtr> &workspace,
+                                         const std::vector<AddressPtr> &outputs, void *stream);
 #if (ENABLE_CPU && !_WIN32)
   void GetFirstPSEmbeddingCache(const session::KernelGraph *graph, AnfNodePtr *const first_cache_input_index,
                                 size_t *const first_cache_size);
@@ -148,6 +151,7 @@ class KernelRuntime {
 
  protected:
   uint32_t device_id_{0};
+  bool pynative_mode_profiling_flag_{false};
 #if !defined(_WIN32) && !defined(_WIN64)
   std::shared_ptr<Debugger> debugger_;
 #endif
diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime_manager.cc b/mindspore/ccsrc/runtime/device/kernel_runtime_manager.cc
index 903b4e672df..4a0d440cb6d 100644
--- a/mindspore/ccsrc/runtime/device/kernel_runtime_manager.cc
+++ b/mindspore/ccsrc/runtime/device/kernel_runtime_manager.cc
@@ -37,9 +37,7 @@ void KernelRuntimeManager::ClearRuntimeResource() {
   runtime_map_.clear();
 }
 
-void KernelRuntimeManager::ClearGraphResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs,
-                                              const std::unordered_set<ValueNodePtr> &value_nodes,
-                                              const std::vector<CNodePtr> &execution_order) {
+void KernelRuntimeManager::ClearGraphResource(uint32_t graph_id) {
   std::lock_guard<std::mutex> guard(lock_);
   for (auto &iter : runtime_map_) {
     MS_LOG(INFO) << "Clear device " << iter.first << " graph " << graph_id << " runtime resource";
@@ -47,7 +45,7 @@ void KernelRuntimeManager::ClearGraphResource(uint32_t graph_id, const std::vect
       MS_LOG(ERROR) << "Kernel runtime is nullptr";
       continue;
     }
-    iter.second->ClearGraphRuntimeResource(graph_id, inputs, value_nodes, execution_order);
+    iter.second->ClearGraphRuntimeResource(graph_id);
   }
 }
 
diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime_manager.h b/mindspore/ccsrc/runtime/device/kernel_runtime_manager.h
index df3fe6fe4b2..9a28a6c6b10 100644
--- a/mindspore/ccsrc/runtime/device/kernel_runtime_manager.h
+++ b/mindspore/ccsrc/runtime/device/kernel_runtime_manager.h
@@ -39,9 +39,7 @@ class KernelRuntimeManager {
   KernelRuntime *GetSingleKernelRuntime(const std::string &device_name, uint32_t device_id);
   void ReleaseKernelRuntime(const std::string &device_name, uint32_t device_id);
   void ClearRuntimeResource();
-  void ClearGraphResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs,
-                          const std::unordered_set<ValueNodePtr> &value_nodes,
-                          const std::vector<CNodePtr> &execution_order);
+  void ClearGraphResource(uint32_t graph_id);
 
  private:
   KernelRuntimeManager() = default;
diff --git a/mindspore/ccsrc/runtime/device/memory_manager.cc b/mindspore/ccsrc/runtime/device/memory_manager.cc
index e097f5c9820..f3a93686e3a 100644
--- a/mindspore/ccsrc/runtime/device/memory_manager.cc
+++ b/mindspore/ccsrc/runtime/device/memory_manager.cc
@@ -65,15 +65,11 @@ void MemoryManager::MallocSomasDynamicMem(const session::KernelGraph *graph) {
   (void)mindspore::RDR::RecordString(module, name, somas_reuse_util_ptr_->SomasMemory());
 #endif
   bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
-  auto save_graphs_path = context_ptr->get_param<std::string>(MS_CTX_SAVE_GRAPHS_PATH);
-  if (save_graphs_path.empty()) {
-    save_graphs_path = ".";
-  }
   if (save_graphs) {
-    std::string file_path = save_graphs_path + "/" + "somas_allocate_info_" + std::to_string(graph->graph_id()) + ".ir";
+    std::string file_path = GetSaveGraphsPathName("somas_allocate_info_" + std::to_string(graph->graph_id()) + ".ir");
     somas_reuse_util_ptr_->DumpSomasInfoIR(file_path);
 
-    std::string mem_file_path = save_graphs_path + "/" + "somas_mem_info_" + std::to_string(graph->graph_id()) + ".ir";
+    std::string mem_file_path = GetSaveGraphsPathName("somas_mem_info_" + std::to_string(graph->graph_id()) + ".ir");
     somas_reuse_util_ptr_->DumpSomasMemoryIR(mem_file_path);
   }
 }
diff --git a/mindspore/ccsrc/runtime/framework/actor/actor_common.cc b/mindspore/ccsrc/runtime/framework/actor/actor_common.cc
index 88e8643cac3..cf98d24c220 100644
--- a/mindspore/ccsrc/runtime/framework/actor/actor_common.cc
+++ b/mindspore/ccsrc/runtime/framework/actor/actor_common.cc
@@ -15,17 +15,18 @@
  */
 
 #include "runtime/framework/actor/actor_common.h"
-#include "backend/session/anf_runtime_algorithm.h"
 #include "runtime/framework/device_tensor_store.h"
+#include "backend/session/anf_runtime_algorithm.h"
 #include "utils/ms_context.h"
 
 namespace mindspore {
 namespace runtime {
-void ComputeThreadNums(size_t *actor_thread_num, size_t *OMP_thread_num) {
+void ComputeThreadNums(size_t *actor_thread_num, size_t *OMP_thread_num, size_t *max_thread_num) {
   MS_EXCEPTION_IF_NULL(actor_thread_num);
   MS_EXCEPTION_IF_NULL(OMP_thread_num);
-  size_t cpu_core_num = std::thread::hardware_concurrency();
-
+  MS_EXCEPTION_IF_NULL(max_thread_num);
+  size_t cpu_core_num = std::thread::hardware_concurrency() - 1;
+  const size_t kMaxThreadNum = 23;
   const size_t kActorThreadMaxNum = 5;
   // The MemoryManagerActor binds single thread, and the other actors share one thread at least, so the min num is 2.
   const size_t kActorThreadMinNum = 2;
@@ -41,6 +42,10 @@ void ComputeThreadNums(size_t *actor_thread_num, size_t *OMP_thread_num) {
 
   const size_t kOMPThreadMaxNum = 8;
   *OMP_thread_num = cpu_core_num < kOMPThreadMaxNum ? cpu_core_num : kOMPThreadMaxNum;
+  *max_thread_num = cpu_core_num > *actor_thread_num ? cpu_core_num : (*actor_thread_num + 1);
+  if (*max_thread_num > kMaxThreadNum) {
+    *max_thread_num = kMaxThreadNum;
+  }
 }
 
 bool IsDeviceQueueDSActor(const AnfNodePtr &node, GraphExecutionStrategy strategy) {
@@ -55,6 +60,38 @@ bool IsDeviceQueueDSActor(const AnfNodePtr &node, GraphExecutionStrategy strateg
   return false;
 }
 
+bool IsHostQueueDSActor(const AnfNodePtr &node, const KernelGraphPtr &graph,
+                        const std::vector<AnfNodePtr> &host_parameters, GraphExecutionStrategy strategy) {
+  MS_EXCEPTION_IF_NULL(node);
+
+  bool is_parameter_data = node->isa<Parameter>() && (!AnfAlgo::IsParameterWeight(node->cast<ParameterPtr>()));
+  if (!is_parameter_data) {
+    return false;
+  }
+
+  if (strategy == GraphExecutionStrategy::kStep) {
+    MS_EXCEPTION_IF_NULL(graph);
+    return graph->execution_order().size() > 1;
+  }
+
+  if (graph == nullptr) {
+    return true;
+  }
+
+  // In control flow, only the parameters of the root funcgraph are in the host data source.
+  const auto &front_node = graph->GetFrontAnfByBackendAnf(node);
+  bool is_host = ((front_node == nullptr) || host_parameters.empty() ||
+                  find(host_parameters.begin(), host_parameters.end(), front_node) != host_parameters.end());
+
+  //  Judge whether node is internal parameter.
+  const auto &internal_front_node = graph->GetFrontNodeByInternalParameter(node);
+  if (internal_front_node.first == nullptr && is_host) {
+    return true;
+  }
+
+  return false;
+}
+
 bool IsSwitchActor(const AnfNodePtr &node) { return AnfAlgo::CheckPrimitiveType(node, prim::kPrimSwitch); }
 
 bool IsInternalParameter(const AnfNodePtr &node, const KernelGraphPtr &graph) {
diff --git a/mindspore/ccsrc/runtime/framework/actor/actor_common.h b/mindspore/ccsrc/runtime/framework/actor/actor_common.h
index 2ae5c7fa2fb..3f096cd34c6 100644
--- a/mindspore/ccsrc/runtime/framework/actor/actor_common.h
+++ b/mindspore/ccsrc/runtime/framework/actor/actor_common.h
@@ -43,6 +43,19 @@ enum class GraphExecutionStrategy {
   kStep       // The actor running need be triggered by control in addition.
 };
 
+enum class KernelTransformType {
+  kUnknown,
+  kDeviceDataSourceActor,
+  kHostDataSourceActor,
+  kKernelActor,
+  kCopyActor,
+  kLoopCountActor,
+  kOutputActor,
+  kDeviceTensorStore,
+  // Internal parameter is the output of previous kernel graph which is related to the input of next kernel graph.
+  kInternalParameter
+};
+
 #define SET_OPCONTEXT_FAIL_RET_WITH_ERROR(op_context, message) \
   {                                                            \
     MS_LOG(ERROR) << message;                                  \
@@ -66,10 +79,16 @@ enum class GraphExecutionStrategy {
     return;                                                                          \
   }
 
-void ComputeThreadNums(size_t *actor_thread_num, size_t *OMP_thread_num);
+void ComputeThreadNums(size_t *actor_thread_num, size_t *OMP_thread_num, size_t *max_thread_num);
 
 bool IsDeviceQueueDSActor(const AnfNodePtr &node, GraphExecutionStrategy strategy = GraphExecutionStrategy::kPipeline);
 
+// Host parameters are parameters of root funcgraph, in control flow, only the parameters of the root funcgraph are
+// in the host data source.
+bool IsHostQueueDSActor(const AnfNodePtr &node, const KernelGraphPtr &graph = nullptr,
+                        const std::vector<AnfNodePtr> &host_parameters = {},
+                        GraphExecutionStrategy strategy = GraphExecutionStrategy::kPipeline);
+
 bool IsKernelActor(const AnfNodePtr &node, GraphExecutionStrategy strategy = GraphExecutionStrategy::kPipeline);
 
 bool IsSwitchActor(const AnfNodePtr &node);
diff --git a/mindspore/ccsrc/runtime/framework/actor/copy_actor.cc b/mindspore/ccsrc/runtime/framework/actor/copy_actor.cc
index 9c7adae4938..d7088275104 100644
--- a/mindspore/ccsrc/runtime/framework/actor/copy_actor.cc
+++ b/mindspore/ccsrc/runtime/framework/actor/copy_actor.cc
@@ -24,6 +24,11 @@ namespace runtime {
 const size_t kDeviceTensorNum = 1;
 
 void CopyActor::Init() {
+  // Check device contexts number.
+  if (device_contexts_.size() != device::kDeviceContextsNumTwo) {
+    MS_LOG(EXCEPTION) << "The device contexts number is wrong.";
+  }
+
   input_device_tensor_.resize(kDeviceTensorNum);
   output_device_tensor_.resize(kDeviceTensorNum);
 
@@ -43,7 +48,7 @@ void CopyActor::RunOpData(OpData<DeviceTensor> *const input_data, OpContext<Devi
   auto &sequential_num = context->sequential_num_;
   (void)input_op_datas_[sequential_num].emplace_back(input_data);
   // When all the inputs are collected, then allocate memory and callback copy.
-  if (CheckCopyCondition(context)) {
+  if (CheckRunningCondition(context)) {
     FetchDeviceTensor(context);
     SendMemoryAllocReq(context);
   }
@@ -54,20 +59,20 @@ void CopyActor::RunOpControl(AID *const input_control, OpContext<DeviceTensor> *
   auto &sequential_num = context->sequential_num_;
   (void)input_op_controls_[sequential_num].emplace_back(input_control);
   // When all the inputs are collected, then allocate memory and callback copy.
-  if (CheckCopyCondition(context)) {
+  if (CheckRunningCondition(context)) {
     FetchDeviceTensor(context);
     SendMemoryAllocReq(context);
   }
 }
 
 void CopyActor::SendMemoryAllocReq(OpContext<DeviceTensor> *const context) {
-  Async(memory_manager_aid_, &MemoryManagerActor::AllocateMemory, &output_device_tensor_, output_device_context_,
-        context, GetAID());
+  Async(memory_manager_aid_, &MemoryManagerActor::AllocateMemory, &output_device_tensor_, device_contexts_[1], context,
+        GetAID());
 }
 
 void CopyActor::SendMemoryFreeReq(OpContext<DeviceTensor> *const context) {
-  Async(memory_manager_aid_, &MemoryManagerActor::FreeMemory, &input_device_tensor_, input_device_context_, context);
-  Async(memory_manager_aid_, &MemoryManagerActor::FreeMemory, &output_device_tensor_, output_device_context_, context);
+  Async(memory_manager_aid_, &MemoryManagerActor::FreeMemory, &input_device_tensor_, device_contexts_[0], context);
+  Async(memory_manager_aid_, &MemoryManagerActor::FreeMemory, &output_device_tensor_, device_contexts_[1], context);
 }
 
 void CopyActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *const context) {
@@ -96,50 +101,28 @@ void CopyActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *const context) {
   SendOutput(context);
 }
 
-bool CopyActor::CheckCopyCondition(OpContext<DeviceTensor> *const context) const {
-  MS_EXCEPTION_IF_NULL(context);
-  if (input_datas_num_ != 0) {
-    const auto &data_iter = input_op_datas_.find(context->sequential_num_);
-    if (data_iter == input_op_datas_.end()) {
-      return false;
-    }
-    if (data_iter->second.size() != input_datas_num_) {
-      return false;
-    }
-  }
-
-  if (input_controls_num_ != 0) {
-    const auto &control_iter = input_op_controls_.find(context->sequential_num_);
-    if (control_iter == input_op_controls_.end()) {
-      return false;
-    }
-    if (control_iter->second.size() != input_controls_num_) {
-      return false;
-    }
-  }
-  return true;
-}
-
 void CopyActor::FetchDeviceTensor(OpContext<DeviceTensor> *const context) {
   MS_EXCEPTION_IF_NULL(context);
-  MS_EXCEPTION_IF_NULL(input_device_context_);
+  MS_EXCEPTION_IF_NULL(device_contexts_[0]);
 
-  if (device_tensor_store_key_.second != nullptr) {
-    input_device_tensor_[0] = DeviceTensorStore::GetInstance().Fetch(device_tensor_store_key_.second,
-                                                                     input_device_context_->GetDeviceAddressType());
+  if (device_tensor_store_keys_.size() > 0) {
+    input_device_tensor_[0] = DeviceTensorStore::GetInstance().Fetch(device_tensor_store_keys_[0].second.get(),
+                                                                     device_contexts_[0]->GetDeviceAddressType());
     if (input_device_tensor_[0] == nullptr) {
       std::string error_info =
-        GetAID().Name() + " get device tensor store failed: " + device_tensor_store_key_.second->fullname_with_scope() +
-        ", device type:" + std::to_string(static_cast<int>(input_device_context_->GetDeviceAddressType()));
+        GetAID().Name() +
+        " get device tensor store failed: " + device_tensor_store_keys_[0].second->fullname_with_scope() +
+        ", device type:" + std::to_string(static_cast<int>(device_contexts_[0]->GetDeviceAddressType()));
       SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), error_info);
     }
 
-    output_device_tensor_[0] = DeviceTensorStore::GetInstance().Fetch(device_tensor_store_key_.second,
-                                                                      output_device_context_->GetDeviceAddressType());
+    output_device_tensor_[0] = DeviceTensorStore::GetInstance().Fetch(device_tensor_store_keys_[0].second.get(),
+                                                                      device_contexts_[1]->GetDeviceAddressType());
     if (output_device_tensor_[0] == nullptr) {
       std::string error_info =
-        GetAID().Name() + " get device tensor store failed: " + device_tensor_store_key_.second->fullname_with_scope() +
-        ", device type:" + std::to_string(static_cast<int>(output_device_context_->GetDeviceAddressType()));
+        GetAID().Name() +
+        " get device tensor store failed: " + device_tensor_store_keys_[0].second->fullname_with_scope() +
+        ", device type:" + std::to_string(static_cast<int>(device_contexts_[1]->GetDeviceAddressType()));
       SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), error_info);
     }
   } else {
@@ -178,24 +161,5 @@ void CopyActor::SendOutput(OpContext<DeviceTensor> *const context) const {
     }
   }
 }
-
-void CopyActor::EraseInput(OpContext<DeviceTensor> *const context) {
-  MS_EXCEPTION_IF_NULL(context);
-  if (input_datas_num_ != 0) {
-    auto ret = input_op_datas_.erase(context->sequential_num_);
-    if (ret == 0) {
-      std::string error_info = "Erase input data failed: " + GetAID().Name();
-      SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), error_info);
-    }
-  }
-
-  if (input_controls_num_ != 0) {
-    auto ret = input_op_controls_.erase(context->sequential_num_);
-    if (ret == 0) {
-      std::string error_info = "Erase input controls failed: " + GetAID().Name();
-      SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), error_info);
-    }
-  }
-}
 }  // namespace runtime
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/runtime/framework/actor/copy_actor.h b/mindspore/ccsrc/runtime/framework/actor/copy_actor.h
index d5d244d789a..4cea66bf0e2 100644
--- a/mindspore/ccsrc/runtime/framework/actor/copy_actor.h
+++ b/mindspore/ccsrc/runtime/framework/actor/copy_actor.h
@@ -32,18 +32,12 @@ namespace runtime {
 using mindspore::device::DeviceContext;
 
 // The copy actor is used to receive the device tensors and control info to copy data between input device tensor and
-// output device tensor. The processing flow is RunOpData/RunOpControl -> CheckCopyCondition -> SendMemoryAllocReq
+// output device tensor. The processing flow is RunOpData/RunOpControl -> CheckRunningCondition -> SendMemoryAllocReq
 // -> OnMemoryAllocFinish -> Copy -> SendMemoryFreeReq -> SendOutput.
 class CopyActor : public MemoryAwareActor {
  public:
   CopyActor(const std::string &name, const AID &memory_manager_aid)
-      : MemoryAwareActor(name),
-        memory_manager_aid_(memory_manager_aid),
-        input_datas_num_(0),
-        input_controls_num_(0),
-        input_device_context_(nullptr),
-        output_device_context_(nullptr),
-        output_(nullptr) {}
+      : MemoryAwareActor(name, KernelTransformType::kCopyActor, nullptr, memory_manager_aid), output_(nullptr) {}
   ~CopyActor() override = default;
 
   void Init() override;
@@ -62,34 +56,15 @@ class CopyActor : public MemoryAwareActor {
  private:
   friend class GraphScheduler;
 
-  // Check whether satisfy the condition for copy.
-  bool CheckCopyCondition(OpContext<DeviceTensor> *const context) const;
   // Fetch the device tensor for copy.
   void FetchDeviceTensor(OpContext<DeviceTensor> *const context);
 
   // Send output data and output controls when finish copy.
   void SendOutput(OpContext<DeviceTensor> *const context) const;
-  // Erase input data and input controls when finish copy.
-  void EraseInput(OpContext<DeviceTensor> *const context);
 
-  // The id of memory manager actor. Send message to it for alloc and free memory during the copy.
-  const AID memory_manager_aid_;
-
-  // The dependent input data number.
-  size_t input_datas_num_;
-  // The dependent input controls number.
-  size_t input_controls_num_;
-
-  // Pair<index, anfNode> points to the dependent device tensor store, anfNode is the key of the device tensor store.
-  std::pair<size_t, AnfNode *> device_tensor_store_key_;
-
-  // The device interface for copy.
-  const DeviceContext *input_device_context_;
-  const DeviceContext *output_device_context_;
-
-  // The input device tensor is saved from the input data or fetched by device_tensor_store_key_.
+  // The input device tensor is saved from the input data or fetched by device_tensor_store_keys_.
   std::vector<DeviceTensor *> input_device_tensor_;
-  // The output device tensor is saved from the output or fetched by device_tensor_store_key_.
+  // The output device tensor is saved from the output or fetched by device_tensor_store_keys_.
   std::vector<DeviceTensor *> output_device_tensor_;
 
   //  The output_data_ corresponds to the output_data_arrows_ one by one.
diff --git a/mindspore/ccsrc/runtime/framework/actor/data_source_actor.cc b/mindspore/ccsrc/runtime/framework/actor/data_source_actor.cc
index dab511adbd5..e46437d9266 100644
--- a/mindspore/ccsrc/runtime/framework/actor/data_source_actor.cc
+++ b/mindspore/ccsrc/runtime/framework/actor/data_source_actor.cc
@@ -27,6 +27,11 @@
 namespace mindspore {
 namespace runtime {
 void DataSourceActor::Init() {
+  // Check device contexts number.
+  if (device_contexts_.size() < device::kDeviceContextsNumOne) {
+    MS_LOG(EXCEPTION) << "The device contexts number is wrong.";
+  }
+
   // Init output data.
   for (auto &data_arrow : output_data_arrows_) {
     MS_EXCEPTION_IF_NULL(data_arrow);
@@ -98,6 +103,11 @@ void DataSourceActor::SendOutput(OpContext<DeviceTensor> *const context) {
 }
 
 void DeviceQueueDataSourceActor::Init() {
+  // Check device contexts number.
+  if (device_contexts_.size() != device::kDeviceContextsNumOne) {
+    MS_LOG(EXCEPTION) << "The device contexts number is wrong.";
+  }
+
   // Init output data.
   for (auto &data_arrow : output_data_arrows_) {
     MS_EXCEPTION_IF_NULL(data_arrow);
@@ -126,17 +136,18 @@ void DeviceQueueDataSourceActor::FillDataBuffer() {
 
 void DeviceQueueDataSourceActor::SendMemoryAllocReq(OpContext<DeviceTensor> *const context) {
   auto &device_tensors = buffers_.back();
-  Async(memory_manager_aid_, &MemoryManagerActor::AllocateMemory, &device_tensors, device_context_, context, GetAID());
+  Async(memory_manager_aid_, &MemoryManagerActor::AllocateMemory, &device_tensors, device_contexts_[0], context,
+        GetAID());
 }
 
 void DeviceQueueDataSourceActor::SendMemoryFreeReq(OpContext<DeviceTensor> *const context) {
   auto &device_tensors = buffers_.front();
-  Async(memory_manager_aid_, &MemoryManagerActor::FreeMemory, &device_tensors, device_context_, context);
+  Async(memory_manager_aid_, &MemoryManagerActor::FreeMemory, &device_tensors, device_contexts_[0], context);
 }
 
 void DeviceQueueDataSourceActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *const context) {
   MS_EXCEPTION_IF_NULL(context);
-  MS_EXCEPTION_IF_NULL(device_context_);
+  MS_EXCEPTION_IF_NULL(device_contexts_[0]);
   if (buffers_.size() == 0) {
     SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "The data queue is empty.");
   }
@@ -151,8 +162,8 @@ void DeviceQueueDataSourceActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *co
 
   // Copy data from device queue by data kernel launching.
   try {
-    auto ret = device_context_->LaunchKernel(data_kernel_, launch_info_.inputs_, launch_info_.workspaces_,
-                                             launch_info_.outputs_);
+    auto ret = device_contexts_[0]->LaunchKernel(data_kernel_, launch_info_.inputs_, launch_info_.workspaces_,
+                                                 launch_info_.outputs_);
     if (!ret) {
       std::string error_info = "Launch kernel failed: " + data_kernel_->fullname_with_scope();
       SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), error_info);
@@ -178,7 +189,7 @@ void DeviceQueueDataSourceActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *co
 }
 
 void DeviceQueueDataSourceActor::SendDebugReq(OpContext<DeviceTensor> *const context) {
-  Async(*debug_aid_, &DebugActor::Debug, data_kernel_, &launch_info_, device_context_, context, &GetAID());
+  Async(*debug_aid_, &DebugActor::Debug, data_kernel_, &launch_info_, device_contexts_[0], context, &GetAID());
 }
 
 void DeviceQueueDataSourceActor::OnDebugFinish(OpContext<DeviceTensor> *const context) {
@@ -197,7 +208,7 @@ void DeviceQueueDataSourceActor::SendResult(OpContext<DeviceTensor> *const conte
 void DeviceQueueDataSourceActor::SendRecorderInfo(OpContext<DeviceTensor> *const context) {
   if (recorder_aid_ != nullptr) {
     Async(*recorder_aid_, &RecorderActor::RecordInfo, data_kernel_->fullname_with_scope(), &launch_info_,
-          device_context_, context);
+          device_contexts_[0], context);
   }
 }
 
@@ -294,7 +305,7 @@ void HostQueueDataSourceActor::SendResult(OpContext<DeviceTensor> *const context
   }
 }
 
-size_t HostQueueDataSourceActor::FetchDataNodePosition(const AnfNodePtr &data_node) const {
+size_t HostQueueDataSourceActor::FetchNodePosition(const AnfNodePtr &data_node) const {
   const auto &iter = data_node_position_map_.find(data_node);
   if (iter == data_node_position_map_.end()) {
     MS_LOG(EXCEPTION) << "Data node: " << data_node->fullname_with_scope() << " is not exist.";
diff --git a/mindspore/ccsrc/runtime/framework/actor/data_source_actor.h b/mindspore/ccsrc/runtime/framework/actor/data_source_actor.h
index 68de3d9e289..dadf2a60523 100644
--- a/mindspore/ccsrc/runtime/framework/actor/data_source_actor.h
+++ b/mindspore/ccsrc/runtime/framework/actor/data_source_actor.h
@@ -41,13 +41,9 @@ using mindspore::kernel::KernelLaunchInfo;
 // -> OnMemoryAllocFinish -> SendMemoryFreeReq -> SendOutput.
 class DataSourceActor : public DebugAwareActor {
  public:
-  DataSourceActor(const std::string &name, size_t buffer_capacity, const AID memory_manager_aid, const AID *debug_aid,
-                  const AID *recorder_aid)
-      : DebugAwareActor(name),
-        buffer_capacity_(buffer_capacity),
-        memory_manager_aid_(memory_manager_aid),
-        debug_aid_(debug_aid),
-        recorder_aid_(recorder_aid) {}
+  DataSourceActor(const std::string &name, KernelTransformType type, size_t buffer_capacity,
+                  const AID &memory_manager_aid, const AID *debug_aid, const AID *recorder_aid)
+      : DebugAwareActor(name, type, recorder_aid, memory_manager_aid, debug_aid), buffer_capacity_(buffer_capacity) {}
   virtual ~DataSourceActor() = default;
 
   void Init() override;
@@ -76,20 +72,10 @@ class DataSourceActor : public DebugAwareActor {
   // Send output to downstream actors to trigger computing after fetching data finished.
   void SendOutput(OpContext<DeviceTensor> *const context);
 
-  // The output result arrows of graph output.
-  std::vector<DataArrowPtr> output_result_arrows_;
-
   // The buffers store the device tensors.
   std::queue<std::vector<DeviceTensor *>> buffers_;
   size_t buffer_capacity_;
 
-  // The id of memory manager actor. Send message to it for alloc and free memory during the data processing.
-  const AID memory_manager_aid_;
-  // The id of debug actor. Send message to it for debug after the kernel launch.
-  const AID *debug_aid_;
-  // The id of recorder actor. Send message to it for recording kernel info after the kernel launch.
-  const AID *recorder_aid_;
-
   //  The output_data_ corresponds to the output_data_arrows_ one by one.
   std::vector<OpDataUniquePtr<DeviceTensor>> output_data_;
 };
@@ -97,10 +83,12 @@ class DataSourceActor : public DebugAwareActor {
 // The class represents that the data source is device queue.
 class DeviceQueueDataSourceActor : public DataSourceActor {
  public:
-  DeviceQueueDataSourceActor(std::string name, size_t buffer_capacity, const DeviceContext *device_context,
-                             const AID memory_manager_aid, const AID *debug_aid, const AID *recorder_aid)
-      : DataSourceActor(name, buffer_capacity, memory_manager_aid, debug_aid, recorder_aid),
-        device_context_(device_context) {}
+  DeviceQueueDataSourceActor(const std::string &name, size_t buffer_capacity, const DeviceContext *device_context,
+                             const AID &memory_manager_aid, const AID *debug_aid, const AID *recorder_aid)
+      : DataSourceActor(name, KernelTransformType::kDeviceDataSourceActor, buffer_capacity, memory_manager_aid,
+                        debug_aid, recorder_aid) {
+    (void)device_contexts_.emplace_back(device_context);
+  }
   ~DeviceQueueDataSourceActor() override = default;
 
   void Init() override;
@@ -126,8 +114,6 @@ class DeviceQueueDataSourceActor : public DataSourceActor {
 
   // The kernel launch info is fetched by the device tensors.
   KernelLaunchInfo launch_info_;
-
-  const DeviceContext *device_context_;
 };
 
 // The class represents that the data source is host queue.
@@ -135,14 +121,16 @@ class HostQueueDataSourceActor : public DataSourceActor {
  public:
   HostQueueDataSourceActor(std::string name, size_t buffer_capacity, const AID memory_manager_aid, const AID *debug_aid,
                            const AID *recorder_aid, HostTensorQueuePtr host_queue)
-      : DataSourceActor(name, buffer_capacity, memory_manager_aid, debug_aid, recorder_aid), host_queue_(host_queue) {}
+      : DataSourceActor(name, KernelTransformType::kHostDataSourceActor, buffer_capacity, memory_manager_aid, debug_aid,
+                        recorder_aid),
+        host_queue_(host_queue) {}
   ~HostQueueDataSourceActor() override = default;
 
   void SendMemoryAllocReq(OpContext<DeviceTensor> *const context) override;
   void SendMemoryFreeReq(OpContext<DeviceTensor> *const context) override;
   void OnMemoryAllocFinish(OpContext<DeviceTensor> *const context) override;
 
-  size_t FetchDataNodePosition(const AnfNodePtr &data_node) const;
+  size_t FetchNodePosition(const AnfNodePtr &node) const override;
 
  protected:
   void FillDataBuffer() override;
@@ -157,8 +145,6 @@ class HostQueueDataSourceActor : public DataSourceActor {
   HostTensorQueuePtr host_queue_;
   // Input data nodes fetch data from host queue.
   std::vector<AnfNodePtr> data_nodes_;
-  // The device contexts corresponding to the data nodes.
-  std::vector<const DeviceContext *> device_contexts_;
 
   // The location of the data node in the data source actor.
   std::unordered_map<AnfNodePtr, size_t> data_node_position_map_;
diff --git a/mindspore/ccsrc/runtime/framework/actor/debug_aware_actor.h b/mindspore/ccsrc/runtime/framework/actor/debug_aware_actor.h
index 214f1378f7d..e4a0d9327c5 100644
--- a/mindspore/ccsrc/runtime/framework/actor/debug_aware_actor.h
+++ b/mindspore/ccsrc/runtime/framework/actor/debug_aware_actor.h
@@ -25,10 +25,17 @@ namespace runtime {
 // The actor represents a set of common debug related operations of actor.
 class DebugAwareActor : public MemoryAwareActor {
  public:
-  explicit DebugAwareActor(const std::string &name) : MemoryAwareActor(name) {}
+  explicit DebugAwareActor(const std::string &name, KernelTransformType type, const AID *recorder_aid,
+                           const AID &memory_manager_aid, const AID *debug_aid)
+      : MemoryAwareActor(name, type, recorder_aid, memory_manager_aid), debug_aid_(debug_aid) {}
   virtual ~DebugAwareActor() = default;
+
   virtual void SendDebugReq(OpContext<DeviceTensor> *const context) {}
   virtual void OnDebugFinish(OpContext<DeviceTensor> *const context) {}
+
+ protected:
+  // The id of debug actor. Send message to it for debug.
+  const AID *debug_aid_;
 };
 }  // namespace runtime
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/runtime/framework/actor/kernel_actor.cc b/mindspore/ccsrc/runtime/framework/actor/kernel_actor.cc
index cae678fa23d..f444af5fc9f 100644
--- a/mindspore/ccsrc/runtime/framework/actor/kernel_actor.cc
+++ b/mindspore/ccsrc/runtime/framework/actor/kernel_actor.cc
@@ -25,6 +25,11 @@
 namespace mindspore {
 namespace runtime {
 void KernelActor::Init() {
+  // Check device contexts number.
+  if (device_contexts_.size() != device::kDeviceContextsNumOne) {
+    MS_LOG(EXCEPTION) << "The device contexts number is wrong.";
+  }
+
   // Set the number of actor running dependent messages.
   running_dependent_msg_num_ = SizeToInt(input_datas_num_ + input_controls_num_);
 
@@ -84,10 +89,10 @@ void KernelActor::RunOpData(OpData<DeviceTensor> *const input_data, OpContext<De
     SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), error_info);
   }
   // When all the inputs are collected, then allocate memory and callback launch.
-  if (CheckLaunchCondition(context)) {
+  if (CheckRunningCondition(context)) {
     // Infer kernel shape and update abstract info for dynamic shape kernel.
     if (is_dynamic_shape_) {
-      device_context_->UpdateDynamicShape(kernel_);
+      device_contexts_[0]->UpdateDynamicShape(kernel_);
     }
 
     FetchInputDeviceTensor(context);
@@ -105,10 +110,10 @@ void KernelActor::RunOpControl(AID *const input_control, OpContext<DeviceTensor>
   auto &sequential_num = context->sequential_num_;
   (void)input_op_controls_[sequential_num].emplace_back(input_control);
   // When all the inputs are collected, then allocate memory and callback launch.
-  if (CheckLaunchCondition(context)) {
+  if (CheckRunningCondition(context)) {
     // Infer kernel shape and update abstract info for dynamic shape kernel.
     if (is_dynamic_shape_) {
-      device_context_->UpdateDynamicShape(kernel_);
+      device_contexts_[0]->UpdateDynamicShape(kernel_);
     }
 
     FetchInputDeviceTensor(context);
@@ -130,7 +135,7 @@ void KernelActor::RunOpControlWithInputTensor(AID *const input_control, OpContex
 
   PushInputDeviceTensor(input_tensors);
   // When all the inputs are collected, then allocate memory and callback launch.
-  if (CheckLaunchCondition(context)) {
+  if (CheckRunningCondition(context)) {
     FetchOutputDeviceTensor();
     if (memory_alloc_list_.size() > 0) {
       SendMemoryAllocReq(context);
@@ -181,30 +186,30 @@ void FreeMemory(const std::vector<DeviceTensor *> &free_list, const DeviceContex
 void KernelActor::SendMemoryAllocReq(OpContext<DeviceTensor> *const context) {
   running_dependent_msg_num_ = 1;
   if (strategy_ == GraphExecutionStrategy::kPipeline) {
-    Async(memory_manager_aid_, &MemoryManagerActor::AllocateMemory, &memory_alloc_list_, device_context_, context,
+    Async(memory_manager_aid_, &MemoryManagerActor::AllocateMemory, &memory_alloc_list_, device_contexts_[0], context,
           GetAID());
   } else {
-    AllocateMemory(memory_alloc_list_, device_context_);
+    AllocateMemory(memory_alloc_list_, device_contexts_[0]);
   }
 }
 
 void KernelActor::SendMemoryFreeReq(OpContext<DeviceTensor> *const context) {
   if (strategy_ == GraphExecutionStrategy::kPipeline) {
-    Async(memory_manager_aid_, &MemoryManagerActor::FreeMemory, &memory_free_list_, device_context_, context);
+    Async(memory_manager_aid_, &MemoryManagerActor::FreeMemory, &memory_free_list_, device_contexts_[0], context);
   } else {
-    FreeMemory(memory_free_list_, device_context_);
+    FreeMemory(memory_free_list_, device_contexts_[0]);
   }
 }
 
 void KernelActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *const context) {
   MS_EXCEPTION_IF_NULL(context);
   MS_EXCEPTION_IF_NULL(kernel_);
-  MS_EXCEPTION_IF_NULL(device_context_);
+  MS_EXCEPTION_IF_NULL(device_contexts_[0]);
   PreLaunchKernel(context);
 
   try {
-    auto ret = device_context_->LaunchKernel(kernel_, launch_info_.inputs_, launch_info_.workspaces_,
-                                             launch_info_.outputs_, is_dynamic_shape_);
+    auto ret = device_contexts_[0]->LaunchKernel(kernel_, launch_info_.inputs_, launch_info_.workspaces_,
+                                                 launch_info_.outputs_, is_dynamic_shape_);
     if (!ret) {
       std::string error_info = "Launch kernel failed: " + kernel_->fullname_with_scope();
       SET_OPCONTEXT_FAIL_RET_WITH_ERROR_BY_STRATEGY(strategy_, (*context), error_info);
@@ -226,7 +231,7 @@ void KernelActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *const context) {
 
 void KernelActor::SendDebugReq(OpContext<DeviceTensor> *const context) {
   running_dependent_msg_num_ = 1;
-  Async(*debug_aid_, &DebugActor::Debug, kernel_, &launch_info_, device_context_, context, &GetAID());
+  Async(*debug_aid_, &DebugActor::Debug, kernel_, &launch_info_, device_contexts_[0], context, &GetAID());
 }
 
 void KernelActor::OnDebugFinish(OpContext<DeviceTensor> *context) {
@@ -234,30 +239,6 @@ void KernelActor::OnDebugFinish(OpContext<DeviceTensor> *context) {
   PostLaunchKernel(context);
 }
 
-bool KernelActor::CheckLaunchCondition(OpContext<DeviceTensor> *const context) const {
-  MS_EXCEPTION_IF_NULL(context);
-  if (input_datas_num_ != 0) {
-    const auto &data_iter = input_op_datas_.find(context->sequential_num_);
-    if (data_iter == input_op_datas_.end()) {
-      return false;
-    }
-    if (data_iter->second.size() != input_datas_num_) {
-      return false;
-    }
-  }
-
-  if (input_controls_num_ != 0) {
-    const auto &control_iter = input_op_controls_.find(context->sequential_num_);
-    if (control_iter == input_op_controls_.end()) {
-      return false;
-    }
-    if (control_iter->second.size() != input_controls_num_) {
-      return false;
-    }
-  }
-  return true;
-}
-
 void KernelActor::PushInputDeviceTensor(const std::vector<TensorPtr> *input_tensors) {
   MS_EXCEPTION_IF_NULL(input_tensors);
   if (input_tensors->size() != real_input_num_) {
@@ -279,24 +260,25 @@ void KernelActor::PushInputDeviceTensor(const std::vector<TensorPtr> *input_tens
 void KernelActor::CopyInputDeviceTensor(const OpData<DeviceTensor> *input_data,
                                         OpContext<DeviceTensor> *const context) {
   MS_EXCEPTION_IF_NULL(input_data);
-  if ((input_data->data_ == nullptr) || (input_data->data_->DeviceType() == device_context_->GetDeviceAddressType())) {
+  if ((input_data->data_ == nullptr) ||
+      (input_data->data_->DeviceType() == device_contexts_[0]->GetDeviceAddressType())) {
     return;
   }
 
   MS_LOG(DEBUG) << "Copy from device type: " << input_data->data_->DeviceType()
-                << " to device type: " << device_context_->GetDeviceAddressType() << " in " << GetAID().Name();
+                << " to device type: " << device_contexts_[0]->GetDeviceAddressType() << " in " << GetAID().Name();
   if (copy_input_device_tensors_[input_data->index_] == nullptr) {
-    copy_input_device_tensors_[input_data->index_] = device_context_->CreateDeviceAddress(
+    copy_input_device_tensors_[input_data->index_] = device_contexts_[0]->CreateDeviceAddress(
       nullptr, input_data->data_->GetSize(), input_data->data_->format(), input_data->data_->type_id());
   }
   // Dynamic shape need update size.
   copy_input_device_tensors_[input_data->index_]->SetSize(input_data->data_->GetSize());
 
   if (copy_input_device_tensors_[input_data->index_]->GetPtr() == nullptr) {
-    if (!device_context_->AllocateMemory(copy_input_device_tensors_[input_data->index_].get(),
-                                         copy_input_device_tensors_[input_data->index_]->GetSize())) {
+    if (!device_contexts_[0]->AllocateMemory(copy_input_device_tensors_[input_data->index_].get(),
+                                             copy_input_device_tensors_[input_data->index_]->GetSize())) {
       std::string error_info =
-        "Device(id:" + std::to_string(device_context_->device_context_key().device_id_) +
+        "Device(id:" + std::to_string(device_contexts_[0]->device_context_key().device_id_) +
         ") memory isn't enough and alloc failed, actor name: " + GetAID().Name() +
         ", alloc size: " + std::to_string(copy_input_device_tensors_[input_data->index_]->GetSize());
       SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), error_info);
@@ -315,7 +297,7 @@ void KernelActor::CopyInputDeviceTensor(const OpData<DeviceTensor> *input_data,
 
 void KernelActor::FetchInputDeviceTensor(OpContext<DeviceTensor> *const context) {
   MS_EXCEPTION_IF_NULL(context);
-  MS_EXCEPTION_IF_NULL(device_context_);
+  MS_EXCEPTION_IF_NULL(device_contexts_[0]);
 
   const auto &data_iter = input_op_datas_.find(context->sequential_num_);
   if (data_iter != input_op_datas_.end()) {
@@ -330,12 +312,12 @@ void KernelActor::FetchInputDeviceTensor(OpContext<DeviceTensor> *const context)
   }
 
   for (auto &device_tensor_store_key : device_tensor_store_keys_) {
-    auto device_tensor =
-      DeviceTensorStore::GetInstance().Fetch(device_tensor_store_key.second, device_context_->GetDeviceAddressType());
+    auto device_tensor = DeviceTensorStore::GetInstance().Fetch(device_tensor_store_key.second.get(),
+                                                                device_contexts_[0]->GetDeviceAddressType());
     if (device_tensor == nullptr) {
       std::string error_info =
         GetAID().Name() + " get device tensor store failed: " + device_tensor_store_key.second->fullname_with_scope() +
-        ", device type:" + std::to_string(static_cast<int>(device_context_->GetDeviceAddressType()));
+        ", device type:" + std::to_string(static_cast<int>(device_contexts_[0]->GetDeviceAddressType()));
       SET_OPCONTEXT_FAIL_RET_WITH_ERROR_BY_STRATEGY(strategy_, (*context), error_info);
     }
     if (input_device_tensors_[device_tensor_store_key.first] != device_tensor) {
@@ -439,8 +421,8 @@ void KernelActor::SendOutput(OpContext<DeviceTensor> *const context) const {
 
   // 4.Send recorder info.
   if (recorder_aid_ != nullptr) {
-    Async(*recorder_aid_, &RecorderActor::RecordInfo, kernel_->fullname_with_scope(), &launch_info_, device_context_,
-          context);
+    Async(*recorder_aid_, &RecorderActor::RecordInfo, kernel_->fullname_with_scope(), &launch_info_,
+          device_contexts_[0], context);
   }
 
   // No output.
@@ -449,28 +431,5 @@ void KernelActor::SendOutput(OpContext<DeviceTensor> *const context) const {
     SET_OPCONTEXT_SUCCESS_RET((*context));
   }
 }
-
-void KernelActor::EraseInput(OpContext<DeviceTensor> *const context) {
-  MS_EXCEPTION_IF_NULL(context);
-  if (input_datas_num_ != 0) {
-    auto ret = input_op_datas_.erase(context->sequential_num_);
-    if (ret == 0) {
-      std::string error_info = "Erase input data failed: " + GetAID().Name();
-      // The sequential num may be invalid, can't set the promise value of context.
-      MS_LOG(ERROR) << error_info << ", sequential_num: " << context->sequential_num_;
-      return;
-    }
-  }
-
-  if (input_controls_num_ != 0) {
-    auto ret = input_op_controls_.erase(context->sequential_num_);
-    if (ret == 0) {
-      std::string error_info = "Erase input controls failed: " + GetAID().Name();
-      // The sequential num may be invalid, can't set the promise value of context.
-      MS_LOG(ERROR) << error_info << ", sequential_num: " << context->sequential_num_;
-      return;
-    }
-  }
-}
 }  // namespace runtime
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/runtime/framework/actor/kernel_actor.h b/mindspore/ccsrc/runtime/framework/actor/kernel_actor.h
index 068f59129f4..68509245eab 100644
--- a/mindspore/ccsrc/runtime/framework/actor/kernel_actor.h
+++ b/mindspore/ccsrc/runtime/framework/actor/kernel_actor.h
@@ -39,30 +39,24 @@ using mindspore::kernel::KernelLaunchInfo;
 using mindspore::tensor::TensorPtr;
 
 // The kernel actor is used to receive the device tensors and control info to luanch kernel.
-// The processing flow is RunOpData/RunOpControl -> CheckLaunchCondition -> SendMemoryAllocReq
+// The processing flow is RunOpData/RunOpControl -> CheckRunningCondition -> SendMemoryAllocReq
 // -> OnMemoryAllocFinish -> LaunchKernel -> SendMemoryFreeReq -> SendOutput.
 class KernelActor : public DebugAwareActor {
  public:
   KernelActor(const std::string &name, const CNodePtr &kernel, const DeviceContext *device_context,
-              const AID memory_manager_aid, const AID *debug_aid, const AID *recorder_aid,
+              const AID &memory_manager_aid, const AID *debug_aid, const AID *recorder_aid,
               GraphExecutionStrategy strategy)
-      : DebugAwareActor(name),
+      : DebugAwareActor(name, KernelTransformType::kKernelActor, recorder_aid, memory_manager_aid, debug_aid),
         kernel_(kernel),
         kernel_info_(nullptr),
         is_dynamic_shape_(false),
-        device_context_(device_context),
-        memory_manager_aid_(memory_manager_aid),
-        debug_aid_(debug_aid),
-        recorder_aid_(recorder_aid),
-        input_datas_num_(0),
-        input_controls_num_(0),
         real_input_num_(0),
-        running_dependent_msg_num_(1),
-        strategy_(strategy) {}
+        strategy_(strategy) {
+    (void)device_contexts_.emplace_back(device_context);
+  }
   ~KernelActor() override = default;
 
   void Init() override;
-  bool IsActive(int msg_num) override { return msg_num >= running_dependent_msg_num_ ? true : false; }
 
   // The kernel actor run when receive the input data.
   void RunOpData(OpData<DeviceTensor> *const input_data, OpContext<DeviceTensor> *const context) override;
@@ -86,8 +80,6 @@ class KernelActor : public DebugAwareActor {
  private:
   friend class GraphScheduler;
 
-  // Check whether satisfy the condition for launch.
-  bool CheckLaunchCondition(OpContext<DeviceTensor> *const context) const;
   // Fetch the device tensor for launch.
   void FetchInputDeviceTensor(OpContext<DeviceTensor> *const context);
   void FetchOutputDeviceTensor();
@@ -102,45 +94,20 @@ class KernelActor : public DebugAwareActor {
 
   // Send output data and output controls when finish kernel launch.
   void SendOutput(OpContext<DeviceTensor> *const context) const;
-  // Erase input data and input controls when finish kernel launch.
-  void EraseInput(OpContext<DeviceTensor> *const context);
 
   // The info of kernel.
   CNodePtr kernel_;
   KernelInfo *kernel_info_;
   bool is_dynamic_shape_;
 
-  // The device interface of kernel launch.
-  const DeviceContext *device_context_;
-
-  // The id of memory manager actor. Send message to it for alloc and free memory during the kernel launch.
-  const AID memory_manager_aid_;
-  // The id of debug actor. Send message to it for debug after the kernel launch.
-  const AID *debug_aid_;
-  // The id of recorder actor. Send message to it for recording kernel info after the kernel launch.
-  const AID *recorder_aid_;
-
-  // The dependent input data number.
-  size_t input_datas_num_;
-  // The dependent input controls number.
-  size_t input_controls_num_;
   // The real input number of kernel launch.
   size_t real_input_num_;
-  // The dependent messages number of actor running.
-  int running_dependent_msg_num_;
 
   // The execution strategy of kernel actor.
   // In pipeline mode, kernel actor executes asynchronously.
   // In step mode, kernel actor executes synchronously.
   GraphExecutionStrategy strategy_{GraphExecutionStrategy::kPipeline};
 
-  // The dependent input actors.
-  std::vector<AID> input_data_arrow_aids_;
-  std::vector<AID> input_control_arrow_aids_;
-
-  // Pair<index, anfNode> points to the dependent device tensor store, anfNode is the key of the device tensor store.
-  std::vector<std::pair<size_t, AnfNode *>> device_tensor_store_keys_;
-
   // The device tensors for launch.
   std::vector<DeviceTensor *> input_device_tensors_;
   std::vector<DeviceTensor *> output_device_tensors_;
@@ -160,9 +127,6 @@ class KernelActor : public DebugAwareActor {
   // The kernel launch info is fetched by the device tensors.
   KernelLaunchInfo launch_info_;
 
-  // The output result arrows of graph output.
-  std::vector<DataArrowPtr> output_result_arrows_;
-
   // Cache unique output data by output index to modify the output data effectively.
   std::vector<std::vector<OpDataUniquePtr<DeviceTensor>>> output_data_by_output_index_;
   //  The output_data_ corresponds to the output_data_arrows_ one by one.
diff --git a/mindspore/ccsrc/runtime/framework/actor/loop_count_actor.cc b/mindspore/ccsrc/runtime/framework/actor/loop_count_actor.cc
index 9076974147b..c85cb33a575 100644
--- a/mindspore/ccsrc/runtime/framework/actor/loop_count_actor.cc
+++ b/mindspore/ccsrc/runtime/framework/actor/loop_count_actor.cc
@@ -86,7 +86,7 @@ void LoopCountActor::RunOpControl(AID *const input_control, OpContext<DeviceTens
   MS_EXCEPTION_IF_NULL(context);
   auto sequential_num = context->sequential_num_;
   (void)input_op_controls_[sequential_num].emplace_back(input_control);
-  if (CheckLoopCountIncreaseCondition(context)) {
+  if (CheckRunningCondition(context)) {
     IncreaseLoopCount(context);
   }
 }
@@ -102,12 +102,7 @@ void LoopCountActor::OnDebugFinish(OpContext<DeviceTensor> *const context) {
 
 void LoopCountActor::IncreaseLoopCount(OpContext<DeviceTensor> *const context) {
   MS_EXCEPTION_IF_NULL(context);
-  auto sequential_num = context->sequential_num_;
-  auto ret = input_op_controls_.erase(sequential_num);
-  if (ret == 0) {
-    std::string error_info = "Erase input controls failed: " + GetAID().Name();
-    SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), error_info);
-  }
+  EraseInput(context);
 
   total_running_count_++;
   current_count_++;
@@ -165,12 +160,5 @@ void LoopCountActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *const context)
     Async(kernel_aid, &KernelActor::RunOpControl, source_aid, context);
   }
 }
-
-bool LoopCountActor::CheckLoopCountIncreaseCondition(OpContext<DeviceTensor> *const context) {
-  MS_EXCEPTION_IF_NULL(context);
-  auto sequential_num = context->sequential_num_;
-
-  return input_op_controls_[sequential_num].size() == input_controls_num_;
-}
 }  // namespace runtime
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/runtime/framework/actor/loop_count_actor.h b/mindspore/ccsrc/runtime/framework/actor/loop_count_actor.h
index a6d4efccc3d..214163c7da1 100644
--- a/mindspore/ccsrc/runtime/framework/actor/loop_count_actor.h
+++ b/mindspore/ccsrc/runtime/framework/actor/loop_count_actor.h
@@ -34,16 +34,12 @@ namespace runtime {
 // and decide whether to loop execution by loop count.
 class LoopCountActor : public DebugAwareActor {
  public:
-  LoopCountActor(std::string name, size_t loop_count, const AID memory_manager_aid, const AID *debug_aid,
+  LoopCountActor(const std::string &name, size_t loop_count, const AID &memory_manager_aid, const AID *debug_aid,
                  const AID *recorder_aid)
-      : DebugAwareActor(name),
+      : DebugAwareActor(name, KernelTransformType::kLoopCountActor, recorder_aid, memory_manager_aid, debug_aid),
         loop_count_(loop_count),
         current_count_(0),
-        total_running_count_(0),
-        input_controls_num_(0),
-        memory_manager_aid_(memory_manager_aid),
-        debug_aid_(debug_aid),
-        recorder_aid_(recorder_aid) {}
+        total_running_count_(0) {}
 
   ~LoopCountActor() override = default;
 
@@ -68,30 +64,17 @@ class LoopCountActor : public DebugAwareActor {
   void IncreaseLoopCount(OpContext<DeviceTensor> *const context);
   void SendOutput(OpContext<DeviceTensor> *const context);
 
-  bool CheckLoopCountIncreaseCondition(OpContext<DeviceTensor> *const context);
   // The loop count is constant, the current count is increased after each step running finished.
   size_t loop_count_;
   size_t current_count_;
   // The total running count represents the toal step running count.
   size_t total_running_count_;
 
-  // The dependent input controls number.
-  // In the multi-branch output scenario of the control flow, the control of each branch needs to be recorded
-  // separately with the branch id as the key. When the output has only one branch, the branch id is 0.
-  size_t input_controls_num_;
-
   // The output controls contain the data source actors and the no input kernel actors and output actor.
   std::vector<AID> data_source_aids_;
   std::vector<AID> no_input_kernel_aids_;
   AID output_aid_;
 
-  // The id of memory manager actor. Send message to it for alloc continuous memory before next step running.
-  const AID memory_manager_aid_;
-  // The id of debug actor. Send message to it for debug before loop count actor exits.
-  const AID *debug_aid_;
-  // The id of recorder actor. Send message to it for clearing recorder info before loop count actor exits.
-  const AID *recorder_aid_;
-
   // The nodes need continuous memory, which must allocate in the begin of step running. The first bool of pair
   // expresses the inputs of node need continuous memory, the second bool of pair expresses the outputs of node need
   // continuous memory.
@@ -100,7 +83,6 @@ class LoopCountActor : public DebugAwareActor {
   std::vector<std::vector<DeviceTensorPtr>> continuous_memory_alloc_list_list_;
   std::vector<std::vector<size_t>> size_list_list_;
   std::vector<size_t> total_size_list_;
-  std::vector<const DeviceContext *> device_contexts_;
 };
 
 using LoopCountActorPtr = std::shared_ptr<LoopCountActor>;
diff --git a/mindspore/ccsrc/runtime/framework/actor/memory_aware_actor.h b/mindspore/ccsrc/runtime/framework/actor/memory_aware_actor.h
index f8ce7838584..3c24e48f5b9 100644
--- a/mindspore/ccsrc/runtime/framework/actor/memory_aware_actor.h
+++ b/mindspore/ccsrc/runtime/framework/actor/memory_aware_actor.h
@@ -19,21 +19,28 @@
 
 #include <utility>
 #include <string>
-#include "mindrt/include/actor/op_actor.h"
+#include "runtime/framework/actor/abstract_actor.h"
 #include "runtime/framework/device_tensor_store.h"
 
 namespace mindspore {
 namespace runtime {
 // The actor represents a set of common memory related operations of actor.
-class MemoryAwareActor : public OpActor<DeviceTensor> {
+class MemoryAwareActor : public AbstractActor {
  public:
-  explicit MemoryAwareActor(std::string name) : OpActor(name) {}
+  explicit MemoryAwareActor(const std::string &name, KernelTransformType type, const AID *recorder_aid,
+                            const AID &memory_manager_aid)
+      : AbstractActor(name, type, recorder_aid), memory_manager_aid_(memory_manager_aid) {}
   virtual ~MemoryAwareActor() = default;
+
   virtual void SendMemoryAllocReq(OpContext<DeviceTensor> *const context) {}
   virtual void SendMemoryFreeReq(OpContext<DeviceTensor> *const context) {}
   virtual void OnMemoryAllocFinish(OpContext<DeviceTensor> *const context) {}
 
+ protected:
   friend class GraphScheduler;
+
+  // The id of memory manager actor. Send message to it for alloc and free memory.
+  const AID memory_manager_aid_;
 };
 }  // namespace runtime
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/runtime/framework/actor/output_actor.h b/mindspore/ccsrc/runtime/framework/actor/output_actor.h
index 54963dfd73f..0cfcb5eeb6f 100644
--- a/mindspore/ccsrc/runtime/framework/actor/output_actor.h
+++ b/mindspore/ccsrc/runtime/framework/actor/output_actor.h
@@ -26,6 +26,7 @@
 #include "runtime/framework/control_node_parser.h"
 #include "runtime/framework/device_tensor_store.h"
 #include "runtime/framework/actor/actor_common.h"
+#include "runtime/framework/actor/abstract_actor.h"
 #include "runtime/hardware/device_context.h"
 #include "backend/session/anf_runtime_algorithm.h"
 #include "ir/tensor.h"
@@ -37,16 +38,15 @@ using mindspore::session::KernelWithIndex;
 using mindspore::tensor::TensorPtr;
 
 // The output actor is used to receive the output result of actor which represents the graph output.
-class OutputActor : public OpActor<DeviceTensor> {
+class OutputActor : public AbstractActor {
  public:
   OutputActor(std::string name, size_t loop_count, size_t outputs_num, bool need_loop_count)
-      : OpActor(name),
+      : AbstractActor(name, KernelTransformType::kOutputActor, nullptr),
         loop_count_(loop_count),
         current_count_(0),
         outputs_num_(outputs_num),
         current_outputs_num_(0),
-        need_loop_count_(need_loop_count),
-        running_dependent_msg_num_(1) {
+        need_loop_count_(need_loop_count) {
     outputs_.resize(outputs_num);
     output_nodes_.resize(outputs_num);
     device_contexts_.resize(outputs_num);
@@ -54,7 +54,6 @@ class OutputActor : public OpActor<DeviceTensor> {
   ~OutputActor() override = default;
 
   void Init() override;
-  bool IsActive(int msg_num) override { return msg_num >= running_dependent_msg_num_ ? true : false; }
 
   // The output actor collects loop count when receive the input control of loop count actor.
   void CollectLoopCount(size_t loop_count, OpContext<DeviceTensor> *const context);
@@ -80,15 +79,9 @@ class OutputActor : public OpActor<DeviceTensor> {
   // The outputs.
   std::vector<TensorPtr> outputs_;
   std::vector<KernelWithIndex> output_nodes_;
-  std::vector<const DeviceContext *> device_contexts_;
   size_t outputs_num_;
   size_t current_outputs_num_;
   bool need_loop_count_;
-
-  // The dependent messages number of actor running.
-  int running_dependent_msg_num_;
-
-  std::vector<std::pair<size_t, AnfNodePtr>> device_tensor_store_keys_;
 };
 
 using OutputActorPtr = std::shared_ptr<OutputActor>;
diff --git a/mindspore/ccsrc/runtime/framework/actor/recorder_actor.cc b/mindspore/ccsrc/runtime/framework/actor/recorder_actor.cc
index 49c91b6d29f..9ce7d926652 100644
--- a/mindspore/ccsrc/runtime/framework/actor/recorder_actor.cc
+++ b/mindspore/ccsrc/runtime/framework/actor/recorder_actor.cc
@@ -17,8 +17,10 @@
 #include "runtime/framework/actor/recorder_actor.h"
 #include <string>
 #include <utility>
+#ifdef ENABLE_DUMP_IR
 #include "debug/rdr/recorder_manager.h"
 #include "debug/rdr/mem_address_recorder.h"
+#endif
 #include "utils/log_adapter.h"
 
 namespace mindspore {
diff --git a/mindspore/ccsrc/runtime/framework/graph_compiler.cc b/mindspore/ccsrc/runtime/framework/graph_compiler.cc
index ad225024aee..5184124e07c 100644
--- a/mindspore/ccsrc/runtime/framework/graph_compiler.cc
+++ b/mindspore/ccsrc/runtime/framework/graph_compiler.cc
@@ -527,5 +527,10 @@ void GraphCompiler::Summary(const std::vector<KernelGraphPtr> &graphs) const {
     session_->Summary(graph.get());
   }
 }
+
+void GraphCompiler::EraseSingleOpCache(const GraphInfo &graph_info, const GraphId &graph_id) {
+  run_op_graphs_.erase(graph_info);
+  run_op_graph_output_nodes_.erase(graph_id);
+}
 }  // namespace runtime
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/runtime/framework/graph_compiler.h b/mindspore/ccsrc/runtime/framework/graph_compiler.h
index 1593579727f..0891eee8f23 100644
--- a/mindspore/ccsrc/runtime/framework/graph_compiler.h
+++ b/mindspore/ccsrc/runtime/framework/graph_compiler.h
@@ -113,6 +113,9 @@ class GraphCompiler {
   // Execute graph summary.
   void Summary(const std::vector<KernelGraphPtr> &graphs) const;
 
+  // Remove single op kernel graph cache and output nodes cache.
+  void EraseSingleOpCache(const GraphInfo &graph_info, const GraphId &graph_id);
+
  private:
   DISABLE_COPY_AND_ASSIGN(GraphCompiler);
 
diff --git a/mindspore/ccsrc/runtime/framework/graph_scheduler.cc b/mindspore/ccsrc/runtime/framework/graph_scheduler.cc
index a3ddbd9d0a0..bf96684545f 100644
--- a/mindspore/ccsrc/runtime/framework/graph_scheduler.cc
+++ b/mindspore/ccsrc/runtime/framework/graph_scheduler.cc
@@ -39,15 +39,16 @@
 #include "debug/debugger/debugger.h"
 #endif
 #include "profiler/device/profiling.h"
+#include "debug/common.h"
 
 namespace mindspore {
 namespace runtime {
 namespace {
-bool IsNeedInsertCopyActor(const DeviceContext *from_devcie_context, const DeviceContext *to_devcie_context) {
-  MS_EXCEPTION_IF_NULL(from_devcie_context);
-  MS_EXCEPTION_IF_NULL(to_devcie_context);
+bool IsNeedInsertCopyActor(const DeviceContext *from_device_context, const DeviceContext *to_device_context) {
+  MS_EXCEPTION_IF_NULL(from_device_context);
+  MS_EXCEPTION_IF_NULL(to_device_context);
 
-  if (from_devcie_context->GetDeviceAddressType() == to_devcie_context->GetDeviceAddressType()) {
+  if (from_device_context->GetDeviceAddressType() == to_device_context->GetDeviceAddressType()) {
     return false;
   } else {
     return true;
@@ -74,6 +75,13 @@ void UpdateRefCount(const AnfNodePtr &node, size_t output_idx, bool is_max_ref_c
 AnfNodePtr FetchFrontNodeByBackendNode(const AnfNodePtr &backend_node, const KernelGraphPtr &graph) {
   MS_EXCEPTION_IF_NULL(backend_node);
   MS_EXCEPTION_IF_NULL(graph);
+
+  // Internal parameter ---> front node.
+  auto front_node_with_index = graph->GetFrontNodeByInternalParameter(backend_node);
+  if (front_node_with_index.first != nullptr) {
+    return front_node_with_index.first;
+  }
+
   auto front_node = graph->GetFrontAnfByBackendAnf(backend_node);
   // PyNative forward graph does not has front node, using backend node instead.
   if (front_node == nullptr) {
@@ -306,7 +314,7 @@ void PrepareDataForInputData(const HostQueueDataSourceActor *host_data_source_ac
   MS_EXCEPTION_IF_NULL(tensor);
   // Fill the host tensors for non weighted parameters.
   if (host_data_source_actor != nullptr) {
-    (*host_tensors)[host_data_source_actor->FetchDataNodePosition(node)] = tensor;
+    (*host_tensors)[host_data_source_actor->FetchNodePosition(node)] = tensor;
   }
 
   auto device_address = std::dynamic_pointer_cast<DeviceTensor>(tensor->device_address());
@@ -389,6 +397,87 @@ bool RunInStepMode(const ActorSet *actor_set, const std::vector<TensorPtr> *inpu
   return result_future.IsOK();
 }
 
+// Convert the actors vector by the actor set.
+std::vector<ActorReference> CollectActors(const ActorSet *actor_set) {
+  MS_EXCEPTION_IF_NULL(actor_set);
+  std::vector<ActorReference> actors;
+
+  for (auto &data_source_actor : actor_set->data_source_actors_) {
+    MS_EXCEPTION_IF_NULL(data_source_actor);
+    (void)actors.emplace_back(static_cast<ActorReference>(data_source_actor));
+  }
+  for (auto &kernel_actor : actor_set->kernel_actors_) {
+    MS_EXCEPTION_IF_NULL(kernel_actor);
+    (void)actors.emplace_back(static_cast<ActorReference>(kernel_actor));
+  }
+  for (auto &switch_actor : actor_set->switch_actors_) {
+    MS_EXCEPTION_IF_NULL(switch_actor);
+    (void)actors.emplace_back(static_cast<ActorReference>(switch_actor));
+  }
+  for (auto &gather_actor : actor_set->gather_actors_) {
+    MS_EXCEPTION_IF_NULL(gather_actor);
+    (void)actors.emplace_back(static_cast<ActorReference>(gather_actor));
+  }
+  for (auto &copy_actor : actor_set->copy_actors_) {
+    MS_EXCEPTION_IF_NULL(copy_actor);
+    (void)actors.emplace_back(static_cast<ActorReference>(copy_actor));
+  }
+  if (actor_set->loop_count_actor_ != nullptr) {
+    (void)actors.emplace_back(static_cast<ActorReference>(actor_set->loop_count_actor_));
+  }
+  if (actor_set->output_actor_ != nullptr) {
+    (void)actors.emplace_back(static_cast<ActorReference>(actor_set->output_actor_));
+  }
+
+  return actors;
+}
+
+void ClearNodeInfo(const KernelGraphPtr graph) {
+  MS_EXCEPTION_IF_NULL(graph);
+
+  // Clear input parameter device tensor and device tensor store.
+  for (const auto &input_node : graph->input_nodes()) {
+    MS_EXCEPTION_IF_NULL(input_node);
+    if (!input_node->isa<Parameter>()) {
+      continue;
+    }
+    auto parameter = input_node->cast<ParameterPtr>();
+    MS_EXCEPTION_IF_NULL(parameter);
+    parameter->DecreaseUsedGraphCount();
+    // Only the parameter has no graph used, then clear the device tensor.
+    if (parameter->used_graph_count() != 0) {
+      continue;
+    }
+    auto front_input_node = FetchFrontNodeByBackendNode(input_node, graph);
+    DeviceTensorStore::GetInstance().Remove(front_input_node.get());
+    size_t output_num = AnfAlgo::GetOutputTensorNum(input_node);
+    for (size_t index = 0; index < output_num; ++index) {
+      if (AnfAlgo::OutputAddrExist(input_node, index)) {
+        AnfAlgo::SetOutputAddr(nullptr, index, input_node.get());
+      }
+    }
+  }
+
+  // Clear input value node device tensor and device tensor store.
+  for (const auto &value_node : graph->graph_value_nodes()) {
+    auto front_value_node = FetchFrontNodeByBackendNode(value_node, graph);
+    DeviceTensorStore::GetInstance().Remove(front_value_node.get());
+    if (AnfAlgo::OutputAddrExist(value_node, 0)) {
+      AnfAlgo::SetOutputAddr(nullptr, 0, value_node.get());
+    }
+  }
+
+  // Clear cnode device tensor.
+  for (const auto &cnode : graph->execution_order()) {
+    size_t output_num = AnfAlgo::GetOutputTensorNum(cnode);
+    for (size_t index = 0; index < output_num; ++index) {
+      if (AnfAlgo::OutputAddrExist(cnode, index)) {
+        AnfAlgo::SetOutputAddr(nullptr, index, cnode.get());
+      }
+    }
+  }
+}
+
 #if !defined(_WIN32) && !defined(_WIN64)
 void IntHandler(int, siginfo_t *, void *) {
   int this_pid = getpid();
@@ -398,6 +487,30 @@ void IntHandler(int, siginfo_t *, void *) {
 #endif
 }  // namespace
 
+GraphCompilerInfo::~GraphCompilerInfo() { GraphScheduler::GetInstance().Clear(name_, graphs_); }
+
+void GraphScheduler::Clear(const ActorInfo &actor_info, const std::vector<KernelGraphPtr> &graphs) {
+  // Terminate the actors of actor info.
+  if (actors_.count(actor_info) > 0) {
+    auto actorMgr = ActorMgr::GetActorMgrRef();
+    MS_EXCEPTION_IF_NULL(actorMgr);
+    auto actor_set = actors_[actor_info];
+    auto base_actors = CollectActors(actor_set.get());
+    for (auto &base_actor : base_actors) {
+      actorMgr->Terminate(base_actor->GetAID());
+    }
+  }
+
+  // Clear device tensor and device tensor store.
+  for (auto &graph : graphs) {
+    ClearNodeInfo(graph);
+  }
+
+  // Clear global maps of actor info.
+  (void)actors_.erase(actor_info);
+  (void)actor_to_host_queue_.erase(actor_info);
+}
+
 void GraphScheduler::Clear() {
   // Terminate all actors.
   auto actorMgr = ActorMgr::GetActorMgrRef();
@@ -411,34 +524,36 @@ void GraphScheduler::Clear() {
   actors_.clear();
   actor_name_to_actor_.clear();
   actor_to_host_queue_.clear();
-  device_tensor_to_actor_.clear();
-
-  // Clear local maps and vectors.
-  graph_output_to_actor_.clear();
-  front_node_to_actor_.clear();
-  copy_actors_.clear();
 }
 
-void GraphScheduler::Initialize() {
-  // Local maps and vectors clear.
-  graph_output_to_actor_.clear();
-  front_node_to_actor_.clear();
-  copy_actors_.clear();
+using DataArrowLinkFunc = void (GraphScheduler::*)(AbstractActor *const, KernelActor *const, const KernelWithIndex &,
+                                                   const KernelWithIndex &, const KernelGraphPtr &);
+static std::map<KernelTransformType, DataArrowLinkFunc> kKernelTypeToLinkFunc;
 
+void GraphScheduler::Initialize() {
   if (init_) {
     return;
   }
   init_ = true;
 
+  (void)kKernelTypeToLinkFunc.emplace(KernelTransformType::kDeviceDataSourceActor,
+                                      &GraphScheduler::LinkDataArrowForDeviceDSActor);
+  (void)kKernelTypeToLinkFunc.emplace(KernelTransformType::kHostDataSourceActor,
+                                      &GraphScheduler::LinkDataArrowForHostDSActor);
+  (void)kKernelTypeToLinkFunc.emplace(KernelTransformType::kKernelActor, &GraphScheduler::LinkDataArrowForKernelActor);
+  (void)kKernelTypeToLinkFunc.emplace(KernelTransformType::kDeviceTensorStore,
+                                      &GraphScheduler::LinkDataArrowForDeviceTensorStore);
+  (void)kKernelTypeToLinkFunc.emplace(KernelTransformType::kInternalParameter,
+                                      &GraphScheduler::LinkDataArrowForInternalParameter);
+
   // Create the thread pool of actor runtime and Set the OMP_NUM_THREADS env.
   size_t actor_thread_num = 0;
   size_t OMP_thread_num = 0;
-  ComputeThreadNums(&actor_thread_num, &OMP_thread_num);
-
+  size_t max_thread_num = 0;
+  ComputeThreadNums(&actor_thread_num, &OMP_thread_num, &max_thread_num);
   auto actor_manager = ActorMgr::GetActorMgrRef();
   MS_EXCEPTION_IF_NULL(actor_manager);
-  actor_manager->Initialize(true, actor_thread_num);
-
+  actor_manager->Initialize(true, actor_thread_num, max_thread_num);
   std::string OMP_env = std::to_string(OMP_thread_num);
   (void)common::SetEnv("OMP_NUM_THREADS", OMP_env.c_str(), 0);
   auto OMP_thread_num_used = common::GetEnv("OMP_NUM_THREADS");
@@ -519,36 +634,7 @@ ActorSet *GraphScheduler::Transform(const GraphCompilerInfo &graph_compiler_info
 
 void GraphScheduler::Schedule(const ActorSet *actor_set) {
   MS_EXCEPTION_IF_NULL(actor_set);
-  std::vector<ActorReference> actors;
-
-  // Collect actors.
-  for (auto &data_source_actor : actor_set->data_source_actors_) {
-    MS_EXCEPTION_IF_NULL(data_source_actor);
-    (void)actors.emplace_back(static_cast<ActorReference>(data_source_actor));
-  }
-  for (auto &kernel_actor : actor_set->kernel_actors_) {
-    MS_EXCEPTION_IF_NULL(kernel_actor);
-    (void)actors.emplace_back(static_cast<ActorReference>(kernel_actor));
-  }
-  for (auto &switch_actor : actor_set->switch_actors_) {
-    MS_EXCEPTION_IF_NULL(switch_actor);
-    (void)actors.emplace_back(static_cast<ActorReference>(switch_actor));
-  }
-  for (auto &gather_actor : actor_set->gather_actors_) {
-    MS_EXCEPTION_IF_NULL(gather_actor);
-    (void)actors.emplace_back(static_cast<ActorReference>(gather_actor));
-  }
-  for (auto &copy_actor : actor_set->copy_actors_) {
-    MS_EXCEPTION_IF_NULL(copy_actor);
-    (void)actors.emplace_back(static_cast<ActorReference>(copy_actor));
-  }
-  if (actor_set->loop_count_actor_ != nullptr) {
-    (void)actors.emplace_back(static_cast<ActorReference>(actor_set->loop_count_actor_));
-  }
-  if (actor_set->output_actor_ != nullptr) {
-    (void)actors.emplace_back(static_cast<ActorReference>(actor_set->output_actor_));
-  }
-
+  auto actors = CollectActors(actor_set);
   // Schedule actors.
   auto actorMgr = ActorMgr::GetActorMgrRef();
   MS_EXCEPTION_IF_NULL(actorMgr);
@@ -821,7 +907,7 @@ void GraphScheduler::CacheGraphOutputToActor(const GraphCompilerInfo &graph_comp
         const auto &host_ds_actor = dynamic_cast<HostQueueDataSourceActor *>(actor);
         MS_EXCEPTION_IF_NULL(host_ds_actor);
         // Get the position of output kernel in the data source actor.
-        actor_output_index = host_ds_actor->FetchDataNodePosition(output_kernel);
+        actor_output_index = host_ds_actor->FetchNodePosition(output_kernel);
       } else if (IsPersistentDeviceTensor(output_kernel)) {
         MS_LOG(INFO) << "The graph " << graph->graph_id() << " output node:" << output_kernel->fullname_with_scope()
                      << " is device tensor store.";
@@ -837,7 +923,8 @@ void GraphScheduler::CacheGraphOutputToActor(const GraphCompilerInfo &graph_comp
                    << " with index:" << actor_output_index
                    << ", from front node:" << origin_output_with_index.first->fullname_with_scope()
                    << " with index: " << origin_output_with_index.second;
-      (void)graph_output_to_actor_.emplace(origin_output_with_index, GraphOutputPair(actor, actor_output_index));
+      (void)graph_output_to_actor_.emplace(origin_output_with_index,
+                                           GraphOutputPair(dynamic_cast<AbstractActor *>(actor), actor_output_index));
     }
   }
 }
@@ -1288,18 +1375,15 @@ void GraphScheduler::LinkDataArrow(KernelActor *const to_actor, const GraphCompi
   MS_EXCEPTION_IF_NULL(graph);
 
   auto from_kernel = from_kernel_with_output_idx.first;
-  auto front_node = GetFrontNodeByBackendNode(from_kernel);
-
   if (from_kernel->isa<Parameter>() && graph_compiler_info.control_node_parser_->IsCallInputKernelGraph(graph)) {
     const auto &kernel_with_index = GetFrontNodeByKernelGraph(from_kernel, graph);
     const auto &real_front_node_with_index =
       AnfAlgo::VisitKernelWithReturnType(kernel_with_index.first, SizeToInt(kernel_with_index.second));
     if (HasAbstractRef(real_front_node_with_index.first)) {
       (void)to_actor->device_tensor_store_keys_.emplace_back(to_kernel_with_input_idx.second,
-                                                             real_front_node_with_index.first.get());
+                                                             real_front_node_with_index.first);
       return;
     }
-
     // When there is a call input in the kernel graph, all the inputs of the kernel graph needs to be sent by gather.
     const auto actor_name = graph->ToString();
     auto actor = FetchActor(actor_name);
@@ -1309,12 +1393,8 @@ void GraphScheduler::LinkDataArrow(KernelActor *const to_actor, const GraphCompi
     return;
   }
 
-  if (IsDeviceQueueDSActor(from_kernel, graph_compiler_info.strategy_)) {
-    // Link the data arrows of device queue data source actor.
-    std::string actor_name = graph_compiler_info.name_ + "_DeviceDSActor" + "_" + std::to_string(graph->graph_id());
-    const auto &from_actor = dynamic_cast<DeviceQueueDataSourceActor *>(FetchActor(actor_name));
-    LinkDataArrowForDeviceDSActor(from_actor, to_actor, from_kernel_with_output_idx, to_kernel_with_input_idx);
-  } else if (front_node != nullptr && IsGatherActor(front_node, actor_name_to_actor_)) {
+  auto front_node = GetFrontNodeByBackendNode(from_kernel);
+  if (front_node != nullptr && IsGatherActor(front_node, actor_name_to_actor_)) {
     // Link the data arrows of gather actor.
     auto func_graph = GetFuncgraphByBackendNode(from_kernel);
     if (func_graph == nullptr) {
@@ -1323,42 +1403,44 @@ void GraphScheduler::LinkDataArrow(KernelActor *const to_actor, const GraphCompi
     auto actor_name = func_graph->ToString();
     const auto &from_actor = dynamic_cast<GatherActor *>(FetchActor(actor_name));
     if (HasAbstractRef(from_kernel)) {
-      (void)to_actor->device_tensor_store_keys_.emplace_back(to_kernel_with_input_idx.second, front_node.get());
+      (void)to_actor->device_tensor_store_keys_.emplace_back(to_kernel_with_input_idx.second, front_node);
       return;
     }
     LinkDataArrowForGatherActor(from_actor, to_actor, {front_node, 0}, to_kernel_with_input_idx);
-  } else if (IsHostQueueDSActor(from_kernel, graph, graph_compiler_info.origin_parameters_order_,
-                                graph_compiler_info.strategy_)) {
-    // Link the data arrows of host queue data source actor.
-    std::string actor_name = graph_compiler_info.name_ + "_HostDSActor";
-    const auto &from_actor = dynamic_cast<HostQueueDataSourceActor *>(FetchActor(actor_name));
-    LinkDataArrowForHostDSActor(from_actor, to_actor, from_kernel_with_output_idx, to_kernel_with_input_idx);
-  } else if (IsKernelActor(from_kernel, graph_compiler_info.strategy_)) {
-    // Link the data arrows of kernel actor.
-    const auto &from_actor = dynamic_cast<KernelActor *>(FetchActor(from_kernel->fullname_with_scope()));
-    LinkDataArrowForKernelActor(from_actor, to_actor, from_kernel_with_output_idx, to_kernel_with_input_idx);
-  } else if (IsInternalParameter(from_kernel, graph)) {
-    // Link data arrow for internal parameter, convert internal parameter to actor by internal parameter cache to
-    // link.
-    LinkDataArrowForInternalParameter(from_kernel, graph_compiler_info.origin_parameters_order_, graph, to_actor,
-                                      to_kernel_with_input_idx);
-  } else if (IsPersistentDeviceTensor(from_kernel)) {
-    const auto devcie_tensor_store_key = FetchFrontNodeByBackendNode(from_kernel, graph);
-    (void)to_actor->device_tensor_store_keys_.emplace_back(to_kernel_with_input_idx.second,
-                                                           devcie_tensor_store_key.get());
-  } else {
-    // May exist the from kernel that no need link in the pynative mode.
-    MS_LOG(DEBUG) << "Invalid from kernel: " << from_kernel->fullname_with_scope();
+    return;
+  }
+
+  auto kernel_type = KernelTransformType::kUnknown;
+  std::string kernel_name = "";
+  FetchKernelTransformTypeAndName(from_kernel, graph, graph_compiler_info, &kernel_type, &kernel_name);
+  auto from_actor = dynamic_cast<AbstractActor *>(FetchActor(kernel_name));
+  if (kKernelTypeToLinkFunc.count(kernel_type) > 0) {
+    (this->*kKernelTypeToLinkFunc[kernel_type])(from_actor, to_actor, from_kernel_with_output_idx,
+                                                to_kernel_with_input_idx, graph);
   }
 }
 
-void GraphScheduler::LinkDataArrowForInternalParameter(const AnfNodePtr &internal_parameter,
-                                                       const std::vector<AnfNodePtr> &host_parameters,
-                                                       const KernelGraphPtr &graph, KernelActor *to_actor,
-                                                       const KernelWithIndex &to_kernel_with_input_idx) {
-  MS_EXCEPTION_IF_NULL(internal_parameter);
-  MS_EXCEPTION_IF_NULL(graph);
+void GraphScheduler::LinkDataArrowForDeviceTensorStore(AbstractActor *const, KernelActor *const to_actor,
+                                                       const KernelWithIndex &from_kernel_with_output_idx,
+                                                       const KernelWithIndex &to_kernel_with_input_idx,
+                                                       const KernelGraphPtr &graph) {
   MS_EXCEPTION_IF_NULL(to_actor);
+  MS_EXCEPTION_IF_NULL(graph);
+  auto from_kernel = from_kernel_with_output_idx.first;
+  MS_EXCEPTION_IF_NULL(from_kernel);
+
+  auto device_tensor_store_key = FetchFrontNodeByBackendNode(from_kernel, graph);
+  (void)to_actor->device_tensor_store_keys_.emplace_back(to_kernel_with_input_idx.second, device_tensor_store_key);
+}
+
+void GraphScheduler::LinkDataArrowForInternalParameter(AbstractActor *const, KernelActor *to_actor,
+                                                       const KernelWithIndex &from_kernel_with_output_idx,
+                                                       const KernelWithIndex &to_kernel_with_input_idx,
+                                                       const KernelGraphPtr &graph) {
+  MS_EXCEPTION_IF_NULL(to_actor);
+  MS_EXCEPTION_IF_NULL(graph);
+  auto internal_parameter = from_kernel_with_output_idx.first;
+  MS_EXCEPTION_IF_NULL(internal_parameter);
 
   // Parameter ---> front node.
   auto front_output_with_index = graph->GetFrontNodeByInternalParameter(internal_parameter);
@@ -1371,74 +1453,40 @@ void GraphScheduler::LinkDataArrowForInternalParameter(const AnfNodePtr &interna
     to_actor->input_datas_num_++;
     return;
   }
+
+  auto real_from_kernel_with_output_idx = from_kernel_with_output_idx;
+  AbstractActor *real_from_actor = nullptr;
+  KernelTransformType kernel_type;
   if (IsPersistentDeviceTensor(front_output_node)) {
-    (void)to_actor->device_tensor_store_keys_.emplace_back(to_kernel_with_input_idx.second, front_output_node.get());
-    return;
-  }
-
-  // front node ---> actor.
-  if (graph_output_to_actor_.count(front_output_with_index) == 0) {
-    MS_LOG(EXCEPTION) << "Can't find actor by front node:" << AnfAlgo::GetNodeDebugString(front_output_node)
-                      << ", internal parameter:" << AnfAlgo::GetNodeDebugString(internal_parameter);
-  }
-  auto actor_pair = graph_output_to_actor_[front_output_with_index];
-  MS_EXCEPTION_IF_NULL(actor_pair.first);
-  MS_LOG(INFO) << "Graph " << graph->graph_id() << " internal parameter:" << internal_parameter->DebugString()
-               << ", corresponding front node:" << front_output_node->fullname_with_scope()
-               << " with index:" << front_output_with_index.second
-               << ", from actor:" << actor_pair.first->GetAID().Name() << " with index:" << actor_pair.second
-               << ", to actor:" << to_actor->GetAID().Name() << " with index:" << to_kernel_with_input_idx.second;
-
-  if (IsDeviceQueueDSActor(front_output_node)) {
-    auto from_actor = dynamic_cast<DeviceQueueDataSourceActor *>(actor_pair.first);
-    MS_EXCEPTION_IF_NULL(from_actor);
-    auto from_kernel_with_output_idx = KernelWithIndex(from_actor->data_kernel_, actor_pair.second);
-    LinkDataArrowForDeviceDSActor(from_actor, to_actor, from_kernel_with_output_idx, to_kernel_with_input_idx);
-  } else if (IsKernelActor(front_output_node)) {
-    auto from_actor = dynamic_cast<KernelActor *>(actor_pair.first);
-    MS_EXCEPTION_IF_NULL(from_actor);
-    auto from_kernel_with_output_idx = KernelWithIndex(from_actor->kernel_, actor_pair.second);
-    LinkDataArrowForKernelActor(from_actor, to_actor, from_kernel_with_output_idx, to_kernel_with_input_idx);
-  } else if (IsHostQueueDSActor(front_output_node, graph, host_parameters)) {
-    auto from_actor = dynamic_cast<HostQueueDataSourceActor *>(actor_pair.first);
-    MS_EXCEPTION_IF_NULL(from_actor);
-    auto from_kernel_with_output_idx = KernelWithIndex(from_actor->data_nodes_[actor_pair.second], 0);
-    LinkDataArrowForHostDSActor(from_actor, to_actor, from_kernel_with_output_idx, to_kernel_with_input_idx);
+    kernel_type = KernelTransformType::kDeviceTensorStore;
   } else {
-    MS_LOG(EXCEPTION) << "Invalid internal parameter: " << internal_parameter->DebugString();
+    // front node ---> actor.
+    if (graph_output_to_actor_.count(front_output_with_index) == 0) {
+      MS_LOG(EXCEPTION) << "Can't find actor by front node:" << AnfAlgo::GetNodeDebugString(front_output_node)
+                        << ", internal parameter:" << AnfAlgo::GetNodeDebugString(internal_parameter);
+    }
+    auto actor_pair = graph_output_to_actor_[front_output_with_index];
+    MS_EXCEPTION_IF_NULL(actor_pair.first);
+    MS_LOG(INFO) << "Graph " << graph->graph_id() << " internal parameter:" << internal_parameter->DebugString()
+                 << ", corresponding front node:" << front_output_node->fullname_with_scope()
+                 << " with index:" << front_output_with_index.second
+                 << ", from actor:" << actor_pair.first->GetAID().Name() << " with index:" << actor_pair.second
+                 << ", to actor:" << to_actor->GetAID().Name() << " with index:" << to_kernel_with_input_idx.second;
+    real_from_actor = actor_pair.first;
+    real_from_kernel_with_output_idx = KernelWithIndex(nullptr, actor_pair.second);
+    kernel_type = actor_pair.first->type_;
   }
+
+  if (kKernelTypeToLinkFunc.count(kernel_type) == 0) {
+    MS_LOG(EXCEPTION) << "Invalid internal parameter:" << internal_parameter->DebugString() << ", type:" << kernel_type;
+  }
+  (this->*kKernelTypeToLinkFunc[kernel_type])(real_from_actor, to_actor, real_from_kernel_with_output_idx,
+                                              to_kernel_with_input_idx, graph);
 }
 
-void GraphScheduler::LinkDataArrowForDeviceDSActor(DeviceQueueDataSourceActor *const from_actor,
-                                                   KernelActor *const to_actor,
-                                                   const KernelWithIndex &from_kernel_with_output_idx,
-                                                   const KernelWithIndex &to_kernel_with_input_idx) {
-  MS_EXCEPTION_IF_NULL(from_actor);
-  MS_EXCEPTION_IF_NULL(to_actor);
-
-  auto from_kernel = from_kernel_with_output_idx.first;
-  MS_EXCEPTION_IF_NULL(from_kernel);
-  auto from_output_index = from_kernel_with_output_idx.second;
-  auto to_input_index = to_kernel_with_input_idx.second;
-
-  if (IsNeedInsertCopyActor(from_actor->device_context_, to_actor->device_context_)) {
-    LinkDataArrowForCopyActor(from_actor, to_actor, from_kernel_with_output_idx, to_kernel_with_input_idx);
-  } else {
-    auto to_aid = to_actor->GetAID();
-    auto op_arrow = std::make_shared<DataArrow>(from_output_index, to_aid, to_input_index);
-    (void)from_actor->output_data_arrows_.emplace_back(op_arrow);
-    to_actor->input_datas_num_++;
-    (void)to_actor->input_data_arrow_aids_.emplace_back(from_actor->GetAID());
-
-    // Update the reference count of device tensor.
-    UpdateRefCount(from_kernel, from_output_index);
-  }
-}
-
-void GraphScheduler::LinkDataArrowForHostDSActor(HostQueueDataSourceActor *const from_actor,
-                                                 KernelActor *const to_actor,
-                                                 const KernelWithIndex &from_kernel_with_output_idx,
-                                                 const KernelWithIndex &to_kernel_with_input_idx) {
+void GraphScheduler::LinkDataArrowForBaseActor(AbstractActor *const from_actor, KernelActor *const to_actor,
+                                               const KernelWithIndex &from_kernel_with_output_idx,
+                                               const KernelWithIndex &to_kernel_with_input_idx) {
   MS_EXCEPTION_IF_NULL(from_actor);
   MS_EXCEPTION_IF_NULL(to_actor);
 
@@ -1448,52 +1496,21 @@ void GraphScheduler::LinkDataArrowForHostDSActor(HostQueueDataSourceActor *const
   auto to_input_index = to_kernel_with_input_idx.second;
 
   // Get the position of from kernel in the data source actor.
-  auto position = from_actor->FetchDataNodePosition(from_kernel);
-  if (IsNeedInsertCopyActor(from_actor->device_contexts_[position], to_actor->device_context_)) {
-    LinkDataArrowForCopyActor(from_actor, to_actor, from_kernel_with_output_idx, to_kernel_with_input_idx);
-  } else {
-    auto to_aid = to_actor->GetAID();
-    auto op_arrow = std::make_shared<DataArrow>(position, to_aid, to_input_index);
-    (void)from_actor->output_data_arrows_.emplace_back(op_arrow);
-    to_actor->input_datas_num_++;
-    (void)to_actor->input_data_arrow_aids_.emplace_back(from_actor->GetAID());
-
-    // Update the reference count of device tensor.
-    UpdateRefCount(from_actor->data_nodes_[position], from_output_index);
-  }
-}
-
-void GraphScheduler::LinkDataArrowForKernelActor(KernelActor *from_actor, KernelActor *const to_actor,
-                                                 KernelWithIndex from_kernel_with_output_idx,
-                                                 const KernelWithIndex &to_kernel_with_input_idx) {
-  MS_EXCEPTION_IF_NULL(to_actor);
-  if (IsSkippedKernelActor(from_kernel_with_output_idx.first)) {
-    auto real_kernel_with_index = AnfAlgo::GetPrevNodeOutput(from_kernel_with_output_idx.first, 0);
-    MS_EXCEPTION_IF_NULL(real_kernel_with_index.first);
-    LinkControlArrowBySkippedNode(to_actor, from_kernel_with_output_idx.first);
-
-    // Update the from kernel info by the real node info.
-    MS_LOG(INFO) << "Link data arrow for inplace node, aggregate node: "
-                 << to_kernel_with_input_idx.first->fullname_with_scope()
-                 << ", aggregate input index: " << to_kernel_with_input_idx.second
-                 << ", skip node: " << from_kernel_with_output_idx.first->fullname_with_scope()
-                 << ", real node: " << real_kernel_with_index.first->fullname_with_scope();
-    from_kernel_with_output_idx.first = real_kernel_with_index.first;
-    from_kernel_with_output_idx.second = real_kernel_with_index.second;
-    from_actor = dynamic_cast<KernelActor *>(FetchActor(from_kernel_with_output_idx.first->fullname_with_scope()));
+  auto position = from_actor->FetchNodePosition(from_kernel);
+  if ((from_actor->device_contexts_.size() <= position) || (to_actor->device_contexts_.size() <= 0)) {
+    MS_LOG(EXCEPTION) << "The device contexts size is wrong.";
   }
 
-  MS_EXCEPTION_IF_NULL(from_actor);
-  auto from_kernel = from_kernel_with_output_idx.first;
-  MS_EXCEPTION_IF_NULL(from_kernel);
-  auto from_output_index = from_kernel_with_output_idx.second;
-  auto to_input_index = to_kernel_with_input_idx.second;
-
-  if (IsNeedInsertCopyActor(from_actor->device_context_, to_actor->device_context_)) {
+  if (IsNeedInsertCopyActor(from_actor->device_contexts_[position], to_actor->device_contexts_[0])) {
     LinkDataArrowForCopyActor(from_actor, to_actor, from_kernel_with_output_idx, to_kernel_with_input_idx);
   } else {
     auto to_aid = to_actor->GetAID();
     auto op_arrow = std::make_shared<DataArrow>(from_output_index, to_aid, to_input_index);
+    // If the from actor has the multi nodes, then use the real output position.
+    if (position != 0) {
+      op_arrow->from_output_index_ = SizeToInt(position);
+    }
+
     (void)from_actor->output_data_arrows_.emplace_back(op_arrow);
     to_actor->input_datas_num_++;
     (void)to_actor->input_data_arrow_aids_.emplace_back(from_actor->GetAID());
@@ -1503,15 +1520,82 @@ void GraphScheduler::LinkDataArrowForKernelActor(KernelActor *from_actor, Kernel
   }
 }
 
-void GraphScheduler::LinkDataArrowForCopyActor(OpActor<DeviceTensor> *const from_actor, KernelActor *const to_actor,
+void GraphScheduler::LinkDataArrowForDeviceDSActor(AbstractActor *const from_actor, KernelActor *const to_actor,
+                                                   const KernelWithIndex &from_kernel_with_output_idx,
+                                                   const KernelWithIndex &to_kernel_with_input_idx,
+                                                   const KernelGraphPtr &) {
+  auto real_from_kernel_with_output_idx = from_kernel_with_output_idx;
+  if (real_from_kernel_with_output_idx.first == nullptr) {
+    auto device_ds_actor = dynamic_cast<DeviceQueueDataSourceActor *>(from_actor);
+    MS_EXCEPTION_IF_NULL(device_ds_actor);
+    real_from_kernel_with_output_idx.first = device_ds_actor->data_kernel_;
+  }
+
+  LinkDataArrowForBaseActor(from_actor, to_actor, real_from_kernel_with_output_idx, to_kernel_with_input_idx);
+}
+
+void GraphScheduler::LinkDataArrowForHostDSActor(AbstractActor *const from_actor, KernelActor *const to_actor,
+                                                 const KernelWithIndex &from_kernel_with_output_idx,
+                                                 const KernelWithIndex &to_kernel_with_input_idx,
+                                                 const KernelGraphPtr &) {
+  auto host_ds_actor = dynamic_cast<HostQueueDataSourceActor *>(from_actor);
+  MS_EXCEPTION_IF_NULL(host_ds_actor);
+
+  KernelWithIndex real_from_kernel_with_output_idx;
+  if (from_kernel_with_output_idx.first != nullptr) {
+    // Get the position of from kernel in the data source actor.
+    auto position = host_ds_actor->FetchNodePosition(from_kernel_with_output_idx.first);
+    real_from_kernel_with_output_idx.first = host_ds_actor->data_nodes_[position];
+    real_from_kernel_with_output_idx.second = from_kernel_with_output_idx.second;
+  } else {
+    real_from_kernel_with_output_idx.first = host_ds_actor->data_nodes_[from_kernel_with_output_idx.second];
+    real_from_kernel_with_output_idx.second = 0;
+  }
+
+  LinkDataArrowForBaseActor(from_actor, to_actor, real_from_kernel_with_output_idx, to_kernel_with_input_idx);
+}
+
+void GraphScheduler::LinkDataArrowForKernelActor(AbstractActor *const from_actor, KernelActor *const to_actor,
+                                                 const KernelWithIndex &from_kernel_with_output_idx,
+                                                 const KernelWithIndex &to_kernel_with_input_idx,
+                                                 const KernelGraphPtr &) {
+  auto real_from_actor = from_actor;
+  auto real_from_kernel_with_output_idx = from_kernel_with_output_idx;
+  auto from_kernel = from_kernel_with_output_idx.first;
+  if (from_kernel == nullptr) {
+    auto kernel_actor = dynamic_cast<KernelActor *>(from_actor);
+    MS_EXCEPTION_IF_NULL(kernel_actor);
+    from_kernel = kernel_actor->kernel_;
+    real_from_kernel_with_output_idx.first = kernel_actor->kernel_;
+  }
+
+  MS_EXCEPTION_IF_NULL(from_kernel);
+  if (IsSkippedKernelActor(from_kernel)) {
+    real_from_kernel_with_output_idx = AnfAlgo::GetPrevNodeOutput(from_kernel, 0);
+    MS_EXCEPTION_IF_NULL(real_from_kernel_with_output_idx.first);
+    LinkControlArrowBySkippedNode(to_actor, from_kernel);
+
+    // Update the from kernel info by the real node info.
+    MS_LOG(INFO) << "Link data arrow for inplace node, aggregate node: "
+                 << to_kernel_with_input_idx.first->fullname_with_scope()
+                 << ", aggregate input index: " << to_kernel_with_input_idx.second
+                 << ", skip node: " << from_kernel->fullname_with_scope()
+                 << ", real node: " << real_from_kernel_with_output_idx.first->fullname_with_scope();
+    real_from_actor =
+      dynamic_cast<AbstractActor *>(FetchActor(real_from_kernel_with_output_idx.first->fullname_with_scope()));
+    MS_EXCEPTION_IF_NULL(real_from_actor);
+  }
+
+  LinkDataArrowForBaseActor(real_from_actor, to_actor, real_from_kernel_with_output_idx, to_kernel_with_input_idx);
+}
+
+void GraphScheduler::LinkDataArrowForCopyActor(AbstractActor *const from_actor, KernelActor *const to_actor,
                                                const KernelWithIndex &from_kernel_with_output_idx,
                                                const KernelWithIndex &to_kernel_with_input_idx) {
   MS_EXCEPTION_IF_NULL(from_actor);
   MS_EXCEPTION_IF_NULL(to_actor);
   auto from_kernel = from_kernel_with_output_idx.first;
   MS_EXCEPTION_IF_NULL(from_kernel);
-  auto to_devcie_context = to_actor->device_context_;
-  MS_EXCEPTION_IF_NULL(to_devcie_context);
   auto from_output_index = from_kernel_with_output_idx.second;
   auto to_input_index = to_kernel_with_input_idx.second;
 
@@ -1527,45 +1611,38 @@ void GraphScheduler::LinkDataArrowForCopyActor(OpActor<DeviceTensor> *const from
     MS_EXCEPTION_IF_NULL(copy_actor);
     InsertActor(copy_actor);
 
-    // Link.
-    const DeviceContext *from_devcie_context = nullptr;
-    auto from_device_tensor = AnfAlgo::GetMutableOutputAddr(from_kernel, from_output_index, false);
-    auto op_arrow_to_copy = std::make_shared<DataArrow>(from_output_index, copy_actor->GetAID(), 0);
-    if (IsDeviceQueueDSActor(from_kernel)) {
-      auto real_from_actor = dynamic_cast<DeviceQueueDataSourceActor *>(from_actor);
-      MS_EXCEPTION_IF_NULL(real_from_actor);
-      from_devcie_context = real_from_actor->device_context_;
-      (void)real_from_actor->output_data_arrows_.emplace_back(op_arrow_to_copy);
-    } else if (IsKernelActor(from_kernel)) {
-      auto real_from_actor = dynamic_cast<KernelActor *>(from_actor);
-      MS_EXCEPTION_IF_NULL(real_from_actor);
-      from_devcie_context = real_from_actor->device_context_;
-      (void)real_from_actor->output_data_arrows_.emplace_back(op_arrow_to_copy);
-    } else if (IsHostQueueDSActor(from_kernel)) {
-      auto real_from_actor = dynamic_cast<HostQueueDataSourceActor *>(from_actor);
-      MS_EXCEPTION_IF_NULL(real_from_actor);
-      auto position = real_from_actor->FetchDataNodePosition(from_kernel);
-      from_devcie_context = real_from_actor->device_contexts_[position];
-      op_arrow_to_copy->from_output_index_ = SizeToInt(position);
-      (void)real_from_actor->output_data_arrows_.emplace_back(op_arrow_to_copy);
-      from_device_tensor =
-        AnfAlgo::GetMutableOutputAddr(real_from_actor->data_nodes_[position], from_output_index, false);
+    // Get the position of from kernel in the data source actor.
+    auto position = from_actor->FetchNodePosition(from_kernel);
+    if ((from_actor->device_contexts_.size() <= position) || (to_actor->device_contexts_.size() <= 0)) {
+      MS_LOG(EXCEPTION) << "The device contexts size is wrong.";
     }
+    auto from_device_context = from_actor->device_contexts_[position];
+    auto to_device_context = to_actor->device_contexts_[0];
+    auto from_device_tensor = AnfAlgo::GetMutableOutputAddr(from_kernel, from_output_index, false);
+    MS_EXCEPTION_IF_NULL(from_device_context);
+    MS_EXCEPTION_IF_NULL(to_device_context);
+    MS_EXCEPTION_IF_NULL(from_device_tensor);
+    auto op_arrow_to_copy = std::make_shared<DataArrow>(from_output_index, copy_actor->GetAID(), 0);
+    // If the from actor has the multi nodes, then use the real output position.
+    if (position != 0) {
+      op_arrow_to_copy->from_output_index_ = SizeToInt(position);
+    }
+
+    // Link.
+    (void)from_actor->output_data_arrows_.emplace_back(op_arrow_to_copy);
     copy_actor->input_datas_num_++;
 
     // Set the member of the copy actor.
-    MS_EXCEPTION_IF_NULL(from_device_tensor);
     auto to_kernel_mod = AnfAlgo::GetKernelMod(to_kernel_with_input_idx.first);
     MS_EXCEPTION_IF_NULL(to_kernel_mod);
     auto input_sizes = to_kernel_mod->GetInputSizeList();
     if (to_input_index >= input_sizes.size()) {
       MS_LOG(EXCEPTION) << "To input index(" << to_input_index << ") is out of size: " << input_sizes.size();
     }
-    copy_actor->output_ = to_devcie_context->CreateDeviceAddress(
+    copy_actor->output_ = to_device_context->CreateDeviceAddress(
       nullptr, input_sizes[to_input_index], from_device_tensor->format(), from_device_tensor->type_id());
-    MS_EXCEPTION_IF_NULL(from_devcie_context);
-    copy_actor->input_device_context_ = from_devcie_context;
-    copy_actor->output_device_context_ = to_devcie_context;
+    (void)copy_actor->device_contexts_.emplace_back(from_device_context);
+    (void)copy_actor->device_contexts_.emplace_back(to_device_context);
 
     // Update the reference count of device tensor.
     UpdateRefCount(from_device_tensor.get());
@@ -1871,45 +1948,27 @@ void GraphScheduler::LinkOutputResultArrowForOutputActor(OutputActor *to_actor,
           continue;
         }
 
-        // The graph output is from kernel actor.
-        if (IsKernelActor(output_with_index.first)) {
-          const auto &from_actor =
-            dynamic_cast<KernelActor *>(FetchActor(output_with_index.first->fullname_with_scope()));
-          MS_EXCEPTION_IF_NULL(from_actor);
-          auto op_arrow = std::make_shared<DataArrow>(output_with_index.second, to_actor->GetAID(), output_position);
-          (void)from_actor->output_result_arrows_.emplace_back(op_arrow);
+        // The graph output is from kernel actor or data source actor.
+        auto kernel_type = KernelTransformType::kUnknown;
+        std::string kernel_name = "";
+        FetchKernelTransformTypeAndName(output_with_index.first, graph, graph_compiler_info, &kernel_type,
+                                        &kernel_name);
+        auto from_actor = dynamic_cast<AbstractActor *>(FetchActor(kernel_name));
+        if (from_actor == nullptr) {
           continue;
         }
-
-        // The graph output is from data source actor.
-        std::string actor_name;
-        DataSourceActor *from_actor = nullptr;
-        size_t from_actor_output_index = 0;
-        if (IsHostQueueDSActor(output_with_index.first, graph, graph_compiler_info.origin_parameters_order_,
-                               graph_compiler_info.strategy_)) {
-          actor_name = graph_compiler_info.name_ + "_HostDSActor";
-          const auto &host_queue_ds_actor = dynamic_cast<HostQueueDataSourceActor *>(FetchActor(actor_name));
-          from_actor_output_index = host_queue_ds_actor->FetchDataNodePosition(output_with_index.first);
-          UpdateRefCount(host_queue_ds_actor->data_nodes_[from_actor_output_index], output_with_index.second, true);
-          from_actor = static_cast<DataSourceActor *>(host_queue_ds_actor);
-        } else if (IsDeviceQueueDSActor(output_with_index.first, graph_compiler_info.strategy_)) {
-          actor_name = graph_compiler_info.name_ + "_DeviceDSActor" + "_" + std::to_string(graph->graph_id());
-          from_actor = dynamic_cast<DataSourceActor *>(FetchActor(actor_name));
-          from_actor_output_index = output_with_index.second;
+        auto op_arrow = std::make_shared<DataArrow>(output_with_index.second, to_actor->GetAID(), output_position);
+        auto position = from_actor->FetchNodePosition(output_with_index.first);
+        // If the from actor has the multi nodes, then use the real output position.
+        if (position != 0) {
+          op_arrow->from_output_index_ = SizeToInt(position);
         }
-
-        // When the input is a parameter node, it should be connected by gather actor.
-        if (from_actor == nullptr) {
-          if (output_with_index.first->isa<CNode>()) {
-            MS_LOG(EXCEPTION) << "Cannot find kernel actor for kernel:"
-                              << output_with_index.first->fullname_with_scope();
-          } else {
-            continue;
-          }
-        }
-        MS_EXCEPTION_IF_NULL(from_actor);
-        auto op_arrow = std::make_shared<DataArrow>(from_actor_output_index, to_actor->GetAID(), output_position);
         (void)from_actor->output_result_arrows_.emplace_back(op_arrow);
+        if (kernel_type == KernelTransformType::kHostDataSourceActor) {
+          auto host_queue_ds_actor = dynamic_cast<HostQueueDataSourceActor *>(from_actor);
+          MS_EXCEPTION_IF_NULL(host_queue_ds_actor);
+          UpdateRefCount(host_queue_ds_actor->data_nodes_[position], output_with_index.second, true);
+        }
       }
     }
   }
@@ -1995,7 +2054,7 @@ void GraphScheduler::LinkDeviceTensorStoreForAutoMonadActor(const std::vector<Ke
   for (auto &kernel_actor : auto_monad_actors) {
     MS_EXCEPTION_IF_NULL(kernel_actor);
     for (auto &device_tensor_store_key : kernel_actor->device_tensor_store_keys_) {
-      auto device_tensors = DeviceTensorStore::GetInstance().Fetch(device_tensor_store_key.second);
+      auto device_tensors = DeviceTensorStore::GetInstance().Fetch(device_tensor_store_key.second.get());
       if (device_tensors.size() < kNeedUpdateDeviceTensorStoreNum) {
         continue;
       }
@@ -2012,9 +2071,9 @@ void GraphScheduler::LinkDeviceTensorStoreForAutoMonadActor(const std::vector<Ke
       InsertActor(copy_actor.get());
 
       // Set the member of the copy actor.
-      copy_actor->device_tensor_store_key_ = std::pair<size_t, AnfNode *>(0, device_tensor_store_key.second);
-      auto input_device_context = kernel_actor->device_context_;
-      copy_actor->input_device_context_ = input_device_context;
+      (void)copy_actor->device_tensor_store_keys_.emplace_back(0, device_tensor_store_key.second);
+      auto input_device_context = kernel_actor->device_contexts_[0];
+      (void)copy_actor->device_contexts_.emplace_back(input_device_context);
       auto another_device_tensor = (device_tensors[0]->DeviceType() == input_device_context->GetDeviceAddressType())
                                      ? device_tensors[1]
                                      : device_tensors[0];
@@ -2023,7 +2082,7 @@ void GraphScheduler::LinkDeviceTensorStoreForAutoMonadActor(const std::vector<Ke
       const auto &another_device_context = device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext(
         {device::kDeviceTypeToName.at(another_device_type), input_device_context->device_context_key().device_id_});
       MS_EXCEPTION_IF_NULL(another_device_context);
-      copy_actor->output_device_context_ = another_device_context;
+      (void)copy_actor->device_contexts_.emplace_back(another_device_context);
 
       MS_LOG(INFO) << "The kernel actor: " << kernel_actor->GetAID().Name()
                    << "has control arrows number:" << kernel_actor->output_control_arrows_.size();
@@ -2597,7 +2656,7 @@ bool GraphScheduler::CheckActorValid(const ActorSet *actor_set, GraphExecutionSt
 
     const size_t kCopyActorInputDataNum = 1;
     auto input_data_num = copy_actor->input_datas_num_;
-    size_t device_tensor_store_num = (copy_actor->device_tensor_store_key_.second == nullptr) ? 0 : 1;
+    size_t device_tensor_store_num = copy_actor->device_tensor_store_keys_.size();
     if (input_data_num + device_tensor_store_num != kCopyActorInputDataNum) {
       MS_LOG(ERROR) << "The input building of " << copy_actor->GetAID().Name()
                     << " is wrong, input data num: " << input_data_num
@@ -2701,6 +2760,39 @@ HostTensorQueue *GraphScheduler::FetchHostQueue(const ActorInfo &actor_info) con
   }
 }
 
+void GraphScheduler::FetchKernelTransformTypeAndName(const AnfNodePtr &node, const KernelGraphPtr &graph,
+                                                     const GraphCompilerInfo &graph_compiler_info,
+                                                     KernelTransformType *const kernel_type,
+                                                     std::string *const kernel_name) {
+  MS_EXCEPTION_IF_NULL(node);
+  MS_EXCEPTION_IF_NULL(graph);
+  MS_EXCEPTION_IF_NULL(kernel_type);
+  MS_EXCEPTION_IF_NULL(kernel_name);
+
+  if (IsDeviceQueueDSActor(node, graph_compiler_info.strategy_)) {
+    *kernel_type = KernelTransformType::kDeviceDataSourceActor;
+    *kernel_name = graph_compiler_info.name_ + "_DeviceDSActor" + "_" + std::to_string(graph->graph_id());
+  } else if (IsHostQueueDSActor(node, graph, graph_compiler_info.origin_parameters_order_,
+                                graph_compiler_info.strategy_)) {
+    *kernel_type = KernelTransformType::kHostDataSourceActor;
+    *kernel_name = graph_compiler_info.name_ + "_HostDSActor";
+  } else if (IsKernelActor(node, graph_compiler_info.strategy_)) {
+    *kernel_type = KernelTransformType::kKernelActor;
+    *kernel_name = node->fullname_with_scope();
+  } else if (IsInternalParameter(node, graph)) {
+    *kernel_type = KernelTransformType::kInternalParameter;
+    *kernel_name = "";
+  } else if (IsPersistentDeviceTensor(node)) {
+    *kernel_type = KernelTransformType::kDeviceTensorStore;
+    *kernel_name = "";
+  } else {
+    // May exist the from kernel that no need link in the pynative mode.
+    MS_LOG(DEBUG) << "Invalid from kernel: " << node->fullname_with_scope();
+    *kernel_type = KernelTransformType::kUnknown;
+    *kernel_name = "";
+  }
+}
+
 void GraphScheduler::InsertActor(OpActor<DeviceTensor> *actor) {
   MS_EXCEPTION_IF_NULL(actor);
   if (actor_name_to_actor_.count(actor->GetAID().Name()) > 0) {
@@ -2717,39 +2809,6 @@ OpActor<DeviceTensor> *GraphScheduler::FetchActor(const std::string &actor_name)
   return iter->second;
 }
 
-bool GraphScheduler::IsHostQueueDSActor(const AnfNodePtr &node, const KernelGraphPtr &graph,
-                                        const std::vector<AnfNodePtr> &host_parameters,
-                                        GraphExecutionStrategy strategy) {
-  MS_EXCEPTION_IF_NULL(node);
-
-  bool is_parameter_data = node->isa<Parameter>() && (!AnfAlgo::IsParameterWeight(node->cast<ParameterPtr>()));
-  if (!is_parameter_data) {
-    return false;
-  }
-
-  if (strategy == GraphExecutionStrategy::kStep) {
-    MS_EXCEPTION_IF_NULL(graph);
-    return graph->execution_order().size() > 1;
-  }
-
-  if (graph == nullptr) {
-    return true;
-  }
-
-  // In control flow, only the parameters of the root funcgraph are in the host data source.
-  const auto &front_node = graph->GetFrontAnfByBackendAnf(node);
-  bool is_host = ((front_node == nullptr) || host_parameters.empty() ||
-                  find(host_parameters.begin(), host_parameters.end(), front_node) != host_parameters.end());
-
-  //  Judge whether node is internal parameter.
-  const auto &internal_front_node = graph->GetFrontNodeByInternalParameter(node);
-  if (internal_front_node.first == nullptr && is_host) {
-    return true;
-  }
-
-  return false;
-}
-
 void GraphScheduler::DumpActor(const ActorSet *actor_set, const GraphCompilerInfo &graph_compiler_info) const {
   MS_EXCEPTION_IF_NULL(actor_set);
   const auto &context_ptr = MsContext::GetInstance();
@@ -2758,12 +2817,8 @@ void GraphScheduler::DumpActor(const ActorSet *actor_set, const GraphCompilerInf
   if (!save_graphs) {
     return;
   }
-  auto save_graphs_path = context_ptr->get_param<std::string>(MS_CTX_SAVE_GRAPHS_PATH);
-  if (save_graphs_path.empty()) {
-    save_graphs_path = ".";
-  }
 
-  std::string filename = save_graphs_path + "/actor_set_" + actor_set->name_ + ".ir";
+  std::string filename = GetSaveGraphsPathName("actor_set_" + actor_set->name_ + ".ir");
   std::ofstream ofs(filename);
   if (!ofs.is_open()) {
     MS_LOG(ERROR) << "Open file [" << filename << "] failed!";
@@ -2773,78 +2828,131 @@ void GraphScheduler::DumpActor(const ActorSet *actor_set, const GraphCompilerInf
   ofs << "[Device tensor stores]\n";
   DumpDeviceTensorStore(graph_compiler_info, ofs);
 
-  ofs << "\n\n[Data source actors]\n";
+  ofs << "\n\n[Data source actors:" << actor_set->data_source_actors_.size() << "]\n";
   for (const auto &data_source_actor : actor_set->data_source_actors_) {
     DumpDSActor(data_source_actor.get(), ofs);
   }
 
-  ofs << "\n\n[Kernel actors]\n";
+  ofs << "\n\n[Kernel actors:" << actor_set->kernel_actors_.size() << "]\n";
   for (const auto &kernel_actor : actor_set->kernel_actors_) {
     DumpKernelActor(kernel_actor.get(), ofs);
   }
 
-  ofs << "\n\n[No input kernel actors]\n";
+  ofs << "\n\n[No input kernel actors:" << actor_set->no_input_kernel_actors_.size() << "]\n";
   for (const auto &no_input_kernel_actor : actor_set->no_input_kernel_actors_) {
     DumpKernelActor(no_input_kernel_actor.get(), ofs);
   }
 
-  ofs << "\n\n[Copy actors]\n";
+  ofs << "\n\n[Copy actors:" << actor_set->copy_actors_.size() << "]\n";
   for (const auto &copy_actor : actor_set->copy_actors_) {
     DumpCopyActor(copy_actor.get(), ofs);
   }
 
-  ofs << "\n\n[Gather actors]\n";
+  ofs << "\n\n[Gather actors:" << actor_set->gather_actors_.size() << "]\n";
   for (const auto &gather_actor : actor_set->gather_actors_) {
     DumpGatherActor(gather_actor.get(), ofs);
   }
 
-  ofs << "\n\n[Switch actors]\n";
+  ofs << "\n\n[Switch actors:" << actor_set->switch_actors_.size() << "]\n";
   for (const auto &switch_actor : actor_set->switch_actors_) {
     DumpSwitchActor(switch_actor.get(), ofs);
   }
 
-  ofs << "\n\n[Loop count actor]\n";
   const auto &loop_count_actor = actor_set->loop_count_actor_;
+  ofs << "\n\n[Loop count actor:" << (loop_count_actor != nullptr ? 1 : 0) << "]\n";
   if (loop_count_actor != nullptr) {
     DumpLoopCountActor(loop_count_actor.get(), ofs);
   }
 
-  ofs << "\n\n[Output actor]\n";
   const auto &output_actor = actor_set->output_actor_;
+  ofs << "\n\n[Output actor:" << (output_actor != nullptr ? 1 : 0) << "]\n";
   if (output_actor != nullptr) {
     DumpOutputActor(output_actor.get(), ofs);
   }
 }
 
-void GraphScheduler::DumpBaseActor(const OpActor<DeviceTensor> *actor, std::ofstream &ofs) const {
+void GraphScheduler::DumpAbstractActor(const AbstractActor *actor, std::ofstream &ofs) const {
   MS_EXCEPTION_IF_NULL(actor);
+  ofs << "\t\tdevice_contexts_num:" << actor->device_contexts_.size()
+      << "\tdevice_tensor_store_keys_num:" << actor->device_tensor_store_keys_.size()
+      << "\tinput_data_arrow_actors_num:" << actor->input_datas_num_
+      << "\tinput_control_arrow_actors_num:" << actor->input_controls_num_ << "\n";
+  ofs << "\t\toutput_data_arrows_num:" << actor->output_data_arrows_.size()
+      << "\toutput_control_arrows_num:" << actor->output_control_arrows_.size()
+      << "\toutput_result_arrows_num:" << actor->output_result_arrows_.size() << "\n";
+
+  if (actor->device_contexts_.size() > 0) {
+    ofs << "\t\tdevice_contexts:" << actor->device_contexts_.size() << "\n ";
+    for (const auto &device_context : actor->device_contexts_) {
+      if (device_context == nullptr) {
+        ofs << "\t\t\tdevice_context:" << device_context << "\n";
+        continue;
+      }
+      ofs << "\t\t\tdevice_context:" << device_context->device_context_key().ToString() << "\n";
+    }
+  }
+
+  if (actor->device_tensor_store_keys_.size() > 0) {
+    ofs << "\t\tdevice_tensor_store_keys:" << actor->device_tensor_store_keys_.size() << "\n ";
+    for (const auto &device_tensor_store_key : actor->device_tensor_store_keys_) {
+      MS_EXCEPTION_IF_NULL(device_tensor_store_key.second);
+      ofs << "\t\t\tto_input_index:" << device_tensor_store_key.first
+          << "\tfrom_node_name:" << device_tensor_store_key.second->fullname_with_scope() << "\n";
+    }
+  }
+
+  if (actor->input_data_arrow_aids_.size() > 0) {
+    ofs << "\t\tinput_data_arrow_actors:" << actor->input_data_arrow_aids_.size() << "\n ";
+    for (const auto &input_data_arrow_aid : actor->input_data_arrow_aids_) {
+      ofs << "\t\t\tfrom_actor_name:" << input_data_arrow_aid.Name() << "\n";
+    }
+  }
+
+  if (actor->input_control_arrow_aids_.size() > 0) {
+    ofs << "\t\tinput_control_arrow_actors:" << actor->input_control_arrow_aids_.size() << "\n ";
+    for (const auto &input_control_arrow_aid : actor->input_control_arrow_aids_) {
+      ofs << "\t\t\tfrom_actor_name:" << input_control_arrow_aid.Name() << "\n";
+    }
+  }
 
   const auto &output_data_arrows = actor->output_data_arrows();
-  ofs << "\t\toutput_data_arrows:" << output_data_arrows.size() << "\n ";
-  for (const auto &data_arrow : output_data_arrows) {
-    MS_EXCEPTION_IF_NULL(data_arrow);
-    ofs << "\t\t\tfrom_output_index:" << data_arrow->from_output_index_
-        << "\tto_actor_name:" << data_arrow->to_op_id_.Name() << "\tto_input_index:" << data_arrow->to_input_index_
-        << "\n";
+  if (output_data_arrows.size() > 0) {
+    ofs << "\t\toutput_data_arrows:" << output_data_arrows.size() << "\n ";
+    for (const auto &data_arrow : output_data_arrows) {
+      MS_EXCEPTION_IF_NULL(data_arrow);
+      ofs << "\t\t\tfrom_output_index:" << data_arrow->from_output_index_
+          << "\tto_actor_name:" << data_arrow->to_op_id_.Name() << "\tto_input_index:" << data_arrow->to_input_index_
+          << "\n";
+    }
   }
 
   const auto &output_control_arrows = actor->output_control_arrows();
-  ofs << "\t\toutput_control_arrows:" << output_control_arrows.size() << "\n ";
-  for (const auto &aid : output_control_arrows) {
-    ofs << "\t\t\tto_actor_name:" << aid.Name() << "\n";
+  if (output_control_arrows.size() > 0) {
+    ofs << "\t\toutput_control_arrows:" << output_control_arrows.size() << "\n ";
+    for (const auto &aid : output_control_arrows) {
+      ofs << "\t\t\tto_actor_name:" << aid.Name() << "\n";
+    }
+  }
+
+  if (actor->output_result_arrows_.size() > 0) {
+    ofs << "\t\toutput_result_arrows:" << actor->output_result_arrows_.size() << "\n ";
+    for (const auto &result_arrow : actor->output_result_arrows_) {
+      MS_EXCEPTION_IF_NULL(result_arrow);
+      ofs << "\t\t\tfrom_output_index:" << result_arrow->from_output_index_
+          << "\tto_actor_name:" << result_arrow->to_op_id_.Name()
+          << "\toutput_node_position:" << result_arrow->to_input_index_ << "\n";
+    }
   }
 }
 
 void GraphScheduler::DumpDSActor(const DataSourceActor *actor, std::ofstream &ofs) const {
   MS_EXCEPTION_IF_NULL(actor);
   const auto &actor_name = actor->GetAID().Name();
+  ofs << "\tactor_name:" << actor_name << "\n";
 
   if (actor_name.find("_DeviceDSActor") != string::npos) {
     // Dump the member info of device queue data source actor.
     const auto &device_queue_ds_actor = dynamic_cast<const DeviceQueueDataSourceActor *>(actor);
-    MS_EXCEPTION_IF_NULL(device_queue_ds_actor->device_context_);
-    ofs << "\tactor_name:" << actor_name
-        << "\tdevice_context:" << device_queue_ds_actor->device_context_->device_context_key().ToString() << "\n";
     const auto &data_kernel = device_queue_ds_actor->data_kernel_;
     MS_EXCEPTION_IF_NULL(data_kernel);
     ofs << "\t\tdata_kernel_name:" << data_kernel->fullname_with_scope()
@@ -2858,7 +2966,6 @@ void GraphScheduler::DumpDSActor(const DataSourceActor *actor, std::ofstream &of
     }
   } else if (actor_name.find("_HostDSActor") != string::npos) {
     // Dump the member info of host queue data source actor.
-    ofs << "\tactor_name:" << actor_name << "\n";
     const auto &host_queue_ds_actor = dynamic_cast<const HostQueueDataSourceActor *>(actor);
     ofs << "\t\tdata_nodes:" << host_queue_ds_actor->data_nodes_.size() << "\n";
     for (size_t i = 0; i < host_queue_ds_actor->data_nodes_.size(); ++i) {
@@ -2868,27 +2975,18 @@ void GraphScheduler::DumpDSActor(const DataSourceActor *actor, std::ofstream &of
       MS_EXCEPTION_IF_NULL(device_tensor);
       ofs << "\t\t\tnode_order_number:" << i << "\tnode_name:" << data_node->fullname_with_scope()
           << "\tptr:" << device_tensor->GetPtr() << "\tsize:" << device_tensor->GetSize()
-          << "\toriginal_ref_count:" << device_tensor->original_ref_count()
-          << "\tdevice_context:" << host_queue_ds_actor->device_contexts_[i]->device_context_key().ToString() << "\n";
+          << "\toriginal_ref_count:" << device_tensor->original_ref_count() << "\n";
     }
   }
 
-  DumpBaseActor(actor, ofs);
-
-  ofs << "\t\toutput_result_arrows:" << actor->output_result_arrows_.size() << "\n ";
-  for (const auto &result_arrow : actor->output_result_arrows_) {
-    MS_EXCEPTION_IF_NULL(result_arrow);
-    ofs << "\t\t\tfrom_output_index:" << result_arrow->from_output_index_
-        << "\tto_actor_name:" << result_arrow->to_op_id_.Name()
-        << "\toutput_node_position:" << result_arrow->to_input_index_ << "\n";
-  }
+  DumpAbstractActor(actor, ofs);
   ofs << "\n";
 }
 
 void GraphScheduler::DumpLoopCountActor(const LoopCountActor *actor, std::ofstream &ofs) const {
   MS_EXCEPTION_IF_NULL(actor);
-  ofs << "\tactor_name:" << actor->GetAID().Name() << "\tloop_count:" << actor->loop_count_
-      << "\tinput_controls_num:" << actor->input_controls_num_ << "\n";
+  ofs << "\tactor_name:" << actor->GetAID().Name() << "\tloop_count:" << actor->loop_count_ << "\n";
+  DumpAbstractActor(actor, ofs);
 
   ofs << "\t\toutput_control_arrows:" << (actor->data_source_aids_.size() + actor->no_input_kernel_aids_.size() + 1)
       << "\n ";
@@ -2910,16 +3008,12 @@ void GraphScheduler::DumpLoopCountActor(const LoopCountActor *actor, std::ofstre
 
 void GraphScheduler::DumpKernelActor(const KernelActor *actor, std::ofstream &ofs) const {
   MS_EXCEPTION_IF_NULL(actor);
-  MS_EXCEPTION_IF_NULL(actor->device_context_);
-  ofs << "\tactor_name:" << actor->GetAID().Name()
-      << "\tdevice_context:" << actor->device_context_->device_context_key().ToString()
-      << "\tinput_data_num:" << actor->input_datas_num_ << "\tinput_controls_num:" << actor->input_controls_num_
-      << "\n";
+  ofs << "\tactor_name:" << actor->GetAID().Name() << "\n";
 
   const auto &kernel = actor->kernel_;
   MS_EXCEPTION_IF_NULL(kernel);
-  ofs << "\t\tkernel_name:" << kernel->fullname_with_scope() << "\tinput_number:" << AnfAlgo::GetInputTensorNum(kernel)
-      << "\toutput_number:" << AnfAlgo::GetOutputTensorNum(kernel) << "\n";
+  ofs << "\t\tkernel_name:" << kernel->fullname_with_scope() << "\tinputs_num:" << AnfAlgo::GetInputTensorNum(kernel)
+      << "\toutputs_num:" << AnfAlgo::GetOutputTensorNum(kernel) << "\n";
   for (size_t i = 0; i < AnfAlgo::GetOutputTensorNum(kernel); ++i) {
     const auto &device_tensor = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
     MS_EXCEPTION_IF_NULL(device_tensor);
@@ -2927,22 +3021,7 @@ void GraphScheduler::DumpKernelActor(const KernelActor *actor, std::ofstream &of
         << "\toriginal_ref_count:" << device_tensor->original_ref_count() << "\n ";
   }
 
-  ofs << "\t\tdevice_tensor_stores:" << actor->device_tensor_store_keys_.size() << "\n ";
-  for (const auto &device_tensor_store_key : actor->device_tensor_store_keys_) {
-    MS_EXCEPTION_IF_NULL(device_tensor_store_key.second);
-    ofs << "\t\t\tto_input_index:" << device_tensor_store_key.first
-        << "\tfrom_node_name:" << device_tensor_store_key.second->fullname_with_scope() << "\n";
-  }
-
-  DumpBaseActor(actor, ofs);
-
-  ofs << "\t\toutput_result_arrows:" << actor->output_result_arrows_.size() << "\n ";
-  for (const auto &result_arrow : actor->output_result_arrows_) {
-    MS_EXCEPTION_IF_NULL(result_arrow);
-    ofs << "\t\t\tfrom_output_index:" << result_arrow->from_output_index_
-        << "\tto_actor_name:" << result_arrow->to_op_id_.Name()
-        << "\toutput_node_position:" << result_arrow->to_input_index_ << "\n";
-  }
+  DumpAbstractActor(actor, ofs);
   ofs << "\n";
 }
 
@@ -2950,33 +3029,12 @@ void GraphScheduler::DumpOutputActor(const OutputActor *actor, std::ofstream &of
   MS_EXCEPTION_IF_NULL(actor);
   ofs << "\tactor_name:" << actor->GetAID().Name() << "\tloop_count:" << actor->loop_count_
       << "\toutputs_num:" << actor->outputs_num_ << "\n";
-
-  ofs << "\t\tdevice_tensor_store_keys:" << actor->device_tensor_store_keys_.size() << "\n ";
-  for (const auto &device_tensor_store_key : actor->device_tensor_store_keys_) {
-    MS_EXCEPTION_IF_NULL(device_tensor_store_key.second);
-    ofs << "\t\t\toutput_node_position:" << device_tensor_store_key.first
-        << "\toutput_node_name:" << device_tensor_store_key.second->fullname_with_scope() << "\n";
-  }
-
-  ofs << "\t\tdevice_contexts:" << actor->device_contexts_.size() << "\n ";
-  for (const auto &device_context : actor->device_contexts_) {
-    if (device_context == nullptr) {
-      ofs << "\t\t\tdevice_context:" << device_context << "\n";
-      continue;
-    }
-    ofs << "\t\t\tdevice_context:" << device_context->device_context_key().ToString() << "\n";
-  }
+  DumpAbstractActor(actor, ofs);
 }
 
 void GraphScheduler::DumpCopyActor(const CopyActor *actor, std::ofstream &ofs) const {
   MS_EXCEPTION_IF_NULL(actor);
-  MS_EXCEPTION_IF_NULL(actor->input_device_context_);
-  MS_EXCEPTION_IF_NULL(actor->output_device_context_);
-  ofs << "\tactor_name:" << actor->GetAID().Name()
-      << "\tinput_device_context:" << actor->input_device_context_->device_context_key().ToString()
-      << "\toutput_device_context:" << actor->output_device_context_->device_context_key().ToString()
-      << "\tinput_data_num:" << actor->input_datas_num_ << "\tinput_controls_num:" << actor->input_controls_num_
-      << "\n";
+  ofs << "\tactor_name:" << actor->GetAID().Name() << "\n";
 
   auto device_tensor = actor->output_;
   if (device_tensor != nullptr) {
@@ -2984,13 +3042,7 @@ void GraphScheduler::DumpCopyActor(const CopyActor *actor, std::ofstream &ofs) c
         << "\toriginal_ref_count:" << device_tensor->original_ref_count() << "\n ";
   }
 
-  if (actor->device_tensor_store_key_.second != nullptr) {
-    ofs << "\t\tdevice_tensor_stores:" << 1 << "\n ";
-    ofs << "\t\t\tto_input_index:" << actor->device_tensor_store_key_.first
-        << "\tfrom_node_name:" << actor->device_tensor_store_key_.second->fullname_with_scope() << "\n";
-  }
-
-  DumpBaseActor(actor, ofs);
+  DumpAbstractActor(actor, ofs);
   ofs << "\n";
 }
 
@@ -3007,10 +3059,10 @@ void GraphScheduler::DumpDeviceTensorStore(const GraphCompilerInfo &graph_compil
       const auto &front_node = FetchFrontNodeByBackendNode(value_node, graph);
       MS_EXCEPTION_IF_NULL(front_node);
       const auto device_tensors = DeviceTensorStore::GetInstance().Fetch(front_node.get());
-      ofs << "\t\tdevcie tensor key:" << front_node->DebugString() << "\tvalue size:" << device_tensors.size() << "\n";
+      ofs << "\t\tdevice tensor key:" << front_node->DebugString() << "\tvalue size:" << device_tensors.size() << "\n";
       for (const auto &device_tensor : device_tensors) {
         MS_EXCEPTION_IF_NULL(device_tensor);
-        ofs << "\t\t\tdevcie tensor value:" << device_tensor << "\tptr:" << device_tensor->GetPtr()
+        ofs << "\t\t\tdevice tensor value:" << device_tensor << "\tptr:" << device_tensor->GetPtr()
             << "\tsize:" << device_tensor->GetSize() << "\toriginal_ref_count:" << device_tensor->original_ref_count()
             << "\tdevice_type:" << device_tensor->DeviceType() << "\n ";
       }
@@ -3029,10 +3081,10 @@ void GraphScheduler::DumpDeviceTensorStore(const GraphCompilerInfo &graph_compil
       }
       const auto device_tensors = DeviceTensorStore::GetInstance().Fetch(front_node.get());
       MS_EXCEPTION_IF_NULL(front_node);
-      ofs << "\t\tdevcie tensor key:" << front_node->DebugString() << "\tvalue size:" << device_tensors.size() << "\n";
+      ofs << "\t\tdevice tensor key:" << front_node->DebugString() << "\tvalue size:" << device_tensors.size() << "\n";
       for (const auto &device_tensor : device_tensors) {
         MS_EXCEPTION_IF_NULL(device_tensor);
-        ofs << "\t\t\tdevcie tensor value:" << device_tensor << "\tptr:" << device_tensor->GetPtr()
+        ofs << "\t\t\tdevice tensor value:" << device_tensor << "\tptr:" << device_tensor->GetPtr()
             << "\tsize:" << device_tensor->GetSize() << "\toriginal_ref_count:" << device_tensor->original_ref_count()
             << "\tdevice_type:" << device_tensor->DeviceType() << "\n ";
       }
@@ -3079,6 +3131,7 @@ void GraphScheduler::DumpGatherActor(const GatherActor *actor, std::ofstream &of
   for (const auto &control_arrow : actor->output_control_arrows_) {
     ofs << "\t\t\tto_actor_name:" << control_arrow;
   }
+  ofs << "\n";
 }
 
 void GraphScheduler::DumpSwitchActor(const SwitchActor *actor, std::ofstream &ofs) const {
@@ -3126,6 +3179,7 @@ void GraphScheduler::DumpSwitchActor(const SwitchActor *actor, std::ofstream &of
       ofs << "\t\t\t\t from index:" << arrow << '\n';
     }
   }
+  ofs << "\n";
 }
 }  // namespace runtime
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/runtime/framework/graph_scheduler.h b/mindspore/ccsrc/runtime/framework/graph_scheduler.h
index 63c7fc0572a..a65ace9e26b 100644
--- a/mindspore/ccsrc/runtime/framework/graph_scheduler.h
+++ b/mindspore/ccsrc/runtime/framework/graph_scheduler.h
@@ -49,13 +49,8 @@ using mindspore::session::KernelWithIndex;
 using KernelMapPosition = std::map<KernelWithIndex, std::vector<size_t>, session::KernelWithIndexCmp>;
 using ActorInfo = std::string;
 
-// The second element of pair represents the output index of op actor corresponding to the graph output node.
-using GraphOutputPair = std::pair<OpActor<DeviceTensor> *, size_t>;
-
-// DataArrowPair represent data edge between from actor and to actor.
-// The first element of pair is the AID of from actor, and
-// second element is op arrow between actors.
-using DataArrowPair = std::pair<AID, DataArrowPtr>;
+// The second element of pair represents the output index of abstract actor corresponding to the graph output node.
+using GraphOutputPair = std::pair<AbstractActor *, size_t>;
 
 // The graph compiler info generated by graph compiler is the express of executable graph.
 // The device context is unified interface of interaction with device of corresponding graph.
@@ -65,6 +60,7 @@ using DataArrowPair = std::pair<AID, DataArrowPtr>;
 // The control node parser is used to parse the edge info in control nodes.
 // The origin parameters order is used to correspond to the input args.
 // The origin outputs order is used to correspond to the output args.
+// The need_erase means need erase this GraphCompilerInfo object after run actor set.
 struct GraphCompilerInfo {
   GraphCompilerInfo(const std::vector<KernelGraphPtr> &graphs, const std::vector<DeviceContext *> &device_contexts,
                     const std::vector<std::vector<int64_t> *> &tensors_mask,
@@ -72,7 +68,7 @@ struct GraphCompilerInfo {
                     const std::vector<AnfNodePtr> &control_nodes,
                     const std::vector<AnfNodePtr> &origin_parameters_order, const ControlNodeParserPtr &parser,
                     const KernelMapPosition &origin_outputs_order, const size_t outputs_num, const std::string &name,
-                    GraphExecutionStrategy strategy)
+                    bool need_erase, GraphExecutionStrategy strategy)
       : graphs_(graphs),
         device_contexts_(device_contexts),
         tensors_mask_(tensors_mask),
@@ -83,7 +79,9 @@ struct GraphCompilerInfo {
         origin_outputs_order_(origin_outputs_order),
         outputs_num_(outputs_num),
         name_(name),
+        need_erase_(need_erase),
         strategy_(strategy) {}
+  ~GraphCompilerInfo();
   std::vector<KernelGraphPtr> graphs_;
   std::vector<DeviceContext *> device_contexts_;
   std::vector<std::vector<int64_t> *> tensors_mask_;
@@ -94,6 +92,7 @@ struct GraphCompilerInfo {
   KernelMapPosition origin_outputs_order_;
   size_t outputs_num_;
   std::string name_;
+  bool need_erase_;
   GraphExecutionStrategy strategy_;
 };
 
@@ -137,6 +136,7 @@ class GraphScheduler {
 
   // Clear the members.
   void Clear();
+  void Clear(const ActorInfo &actor_info, const std::vector<KernelGraphPtr> &graphs);
 
   // Transform graph to actor DAG, contains build and link.
   ActorSet *Transform(const GraphCompilerInfo &graph_compiler_info);
@@ -198,23 +198,29 @@ class GraphScheduler {
   void LinkDataArrow(KernelActor *const to_actor, const GraphCompilerInfo &graph_compiler_info,
                      const KernelGraphPtr &graph, const KernelWithIndex &from_kernel_with_output_idx,
                      const KernelWithIndex &to_kernel_with_input_idx);
-  // Link data arrows for internal parameter, convert internal parameter to actor by internal parameter cache to link.
-  void LinkDataArrowForInternalParameter(const AnfNodePtr &internal_parameter,
-                                         const std::vector<AnfNodePtr> &host_parameters, const KernelGraphPtr &graph,
-                                         KernelActor *to_actor, const KernelWithIndex &to_kernel_with_input_idx);
-  // Link data arrows in the copy actor scene, insert the copy actor between from_actor and to_actor.
-  void LinkDataArrowForCopyActor(OpActor<DeviceTensor> *const from_actor, KernelActor *const to_actor,
+  void LinkDataArrowForBaseActor(AbstractActor *const from_actor, KernelActor *const to_actor,
                                  const KernelWithIndex &from_kernel_with_output_idx,
                                  const KernelWithIndex &to_kernel_with_input_idx);
-  void LinkDataArrowForDeviceDSActor(DeviceQueueDataSourceActor *const from_actor, KernelActor *const to_actor,
+  // Link data arrows for internal parameter, convert internal parameter to actor by internal parameter cache to link.
+  void LinkDataArrowForInternalParameter(AbstractActor *const from_actor, KernelActor *const to_actor,
+                                         const KernelWithIndex &from_kernel_with_output_idx,
+                                         const KernelWithIndex &to_kernel_with_input_idx, const KernelGraphPtr &graph);
+  void LinkDataArrowForDeviceTensorStore(AbstractActor *const from_actor, KernelActor *const to_actor,
+                                         const KernelWithIndex &from_kernel_with_output_idx,
+                                         const KernelWithIndex &to_kernel_with_input_idx, const KernelGraphPtr &graph);
+  void LinkDataArrowForDeviceDSActor(AbstractActor *const from_actor, KernelActor *const to_actor,
                                      const KernelWithIndex &from_kernel_with_output_idx,
-                                     const KernelWithIndex &to_to_kernel_with_input_idx);
-  void LinkDataArrowForHostDSActor(HostQueueDataSourceActor *const from_actor, KernelActor *const to_actor,
+                                     const KernelWithIndex &to_kernel_with_input_idx, const KernelGraphPtr &graph);
+  void LinkDataArrowForHostDSActor(AbstractActor *const from_actor, KernelActor *const to_actor,
                                    const KernelWithIndex &from_kernel_with_output_idx,
-                                   const KernelWithIndex &to_kernel_with_input_idx);
-  void LinkDataArrowForKernelActor(KernelActor *from_actor, KernelActor *const to_actor,
-                                   KernelWithIndex from_kernel_with_output_idx,
-                                   const KernelWithIndex &to_kernel_with_input_idx);
+                                   const KernelWithIndex &to_kernel_with_input_idx, const KernelGraphPtr &graph);
+  void LinkDataArrowForKernelActor(AbstractActor *const from_actor, KernelActor *const to_actor,
+                                   const KernelWithIndex &from_kernel_with_output_idx,
+                                   const KernelWithIndex &to_kernel_with_input_idx, const KernelGraphPtr &graph);
+  // Link data arrows in the copy actor scene, insert the copy actor between from_actor and to_actor.
+  void LinkDataArrowForCopyActor(AbstractActor *const from_actor, KernelActor *const to_actor,
+                                 const KernelWithIndex &from_kernel_with_output_idx,
+                                 const KernelWithIndex &to_kernel_with_input_idx);
 
   // 2. The processing of linking control arrows.
   void LinkControlArrowForLoopCountActor(LoopCountActor *loop_count_actor, const ActorSet *actor_set,
@@ -269,15 +275,6 @@ class GraphScheduler {
   // to switch actor.
   void PrepareInputNodeForSwitchActor(const std::vector<AnfNodePtr> &control_nodes);
 
-  // The processing of actors link dynamically.
-  // Analyze necessary input data of current actor, generate and cache op arrow
-  // between current actor and prev actor, the method executes before calling Schedule.
-  void PrepareForDynamiclyLink(ActorSet *actor_set, const CNodePtr &kernel, const AID &aid,
-                               const std::vector<TensorPtr> *input_tensors);
-  // Link to prev actor dynamically, and send message to prev actor to add the
-  // new DataArrow and send output data back, the method must execute after calling Schedule.
-  void LinkDataArrowForKernelActorDynamicly(const ActorSet *actor_set);
-
   // Check whether the actor set is valid.
   bool CheckActorValid(const ActorSet *actor_set,
                        GraphExecutionStrategy strategy = GraphExecutionStrategy::kPipeline) const;
@@ -288,19 +285,18 @@ class GraphScheduler {
   // Fetch the hsot tensor queue by actor info.
   HostTensorQueue *FetchHostQueue(const ActorInfo &actor_info) const;
 
+  // The fetch results are kernel_type and kernel_name.
+  void FetchKernelTransformTypeAndName(const AnfNodePtr &node, const KernelGraphPtr &graph,
+                                       const GraphCompilerInfo &graph_compiler_info,
+                                       KernelTransformType *const kernel_type, std::string *const kernel_name);
+
   // The operation of the map of actor_name_to_actor_.
   void InsertActor(OpActor<DeviceTensor> *actor);
   OpActor<DeviceTensor> *FetchActor(const std::string &actor_name) const;
 
-  // Host parameters are parameters of root funcgraph, in control flow, only the parameters of the root funcgraph are
-  // in the host data source.
-  bool IsHostQueueDSActor(const AnfNodePtr &node, const KernelGraphPtr &graph = nullptr,
-                          const std::vector<AnfNodePtr> &host_parameters = {},
-                          GraphExecutionStrategy strategy = GraphExecutionStrategy::kPipeline);
-
   // Display the actor information of corresponding kernel graph.
   void DumpActor(const ActorSet *actor_set, const GraphCompilerInfo &graph_compiler_info) const;
-  void DumpBaseActor(const OpActor<DeviceTensor> *actor, std::ofstream &ofs) const;
+  void DumpAbstractActor(const AbstractActor *actor, std::ofstream &ofs) const;
   void DumpDSActor(const DataSourceActor *actor, std::ofstream &ofs) const;
   void DumpLoopCountActor(const LoopCountActor *actor, std::ofstream &ofs) const;
   void DumpKernelActor(const KernelActor *actor, std::ofstream &ofs) const;
@@ -314,10 +310,8 @@ class GraphScheduler {
   std::unordered_map<ActorInfo, ActorSetPtr> actors_;
   std::unordered_map<std::string, OpActor<DeviceTensor> *> actor_name_to_actor_;
   std::unordered_map<ActorInfo, HostTensorQueuePtr> actor_to_host_queue_;
-  // The second element of pair represents the output index of op actor corresponding to the device tensor.
-  std::unordered_map<DeviceTensorPtr, GraphOutputPair> device_tensor_to_actor_;
 
-  // The local maps and vectors, will be cleared at the beginning of each graph transform:
+  // The local maps and vectors, will be cleared at the end of each graph transform:
   // 1.The second element of pair represents the output index of op actor corresponding to the graph output front node.
   std::map<KernelWithIndex, GraphOutputPair, session::KernelWithIndexCmp> graph_output_to_actor_;
   // 2.Since the control node does not have a backend node, it can only be connected through the relationship between
diff --git a/mindspore/ccsrc/runtime/hardware/device_context.h b/mindspore/ccsrc/runtime/hardware/device_context.h
index 8fa37581918..256826393d5 100644
--- a/mindspore/ccsrc/runtime/hardware/device_context.h
+++ b/mindspore/ccsrc/runtime/hardware/device_context.h
@@ -30,6 +30,9 @@ namespace device {
 using mindspore::kernel::AddressPtr;
 using mindspore::kernel::KernelMod;
 
+const size_t kDeviceContextsNumOne = 1;
+const size_t kDeviceContextsNumTwo = 2;
+
 struct DeviceContextKey {
   // device type name, such as 'GPU' 'Ascend' 'CPU'.
   std::string device_name_;
diff --git a/mindspore/ccsrc/runtime/hardware/gpu/gpu_device_context.cc b/mindspore/ccsrc/runtime/hardware/gpu/gpu_device_context.cc
index fa92a5aac3f..167c341108c 100644
--- a/mindspore/ccsrc/runtime/hardware/gpu/gpu_device_context.cc
+++ b/mindspore/ccsrc/runtime/hardware/gpu/gpu_device_context.cc
@@ -16,6 +16,8 @@
 
 #include "runtime/hardware/gpu/gpu_device_context.h"
 #include <dlfcn.h>
+#include <utility>
+#include "runtime/device/pynative_profiling.h"
 #include "runtime/device/gpu/kernel_info_setter.h"
 #include "runtime/device/gpu/gpu_kernel_build.h"
 #include "runtime/device/gpu/gpu_device_address.h"
@@ -432,6 +434,11 @@ bool GPUDeviceContext::LaunchKernelWithProfiling(const CNodePtr &kernel, const s
   bool ret = DoLaunchKernel(kernel_mod, inputs, workspace, outputs);
   profiler_inst->OpDataProducerEnd();
 
+  auto op_launch_start_end_time = profiler_inst->GetSingleOpLaunchTime();
+  auto &pynative_profiler = PynativeProfiler::GetInstance();
+  std::string op_name = kernel->fullname_with_scope();
+  pynative_profiler.SetOpNameAndLaunchTime(std::make_pair(op_name, op_launch_start_end_time));
+
   if (profiler_inst->GetSyncEnableFlag()) {
     CHECK_RET_WITH_RETURN_ERROR(SyncStream(), "Profiler SyncStream failed.");
   }
diff --git a/mindspore/ccsrc/runtime/hccl_adapter/hccl_adapter.cc b/mindspore/ccsrc/runtime/hccl_adapter/hccl_adapter.cc
index 3fef5113bdf..04e416cf15d 100644
--- a/mindspore/ccsrc/runtime/hccl_adapter/hccl_adapter.cc
+++ b/mindspore/ccsrc/runtime/hccl_adapter/hccl_adapter.cc
@@ -84,6 +84,8 @@ void HcclAdapter::InitPlugin() {
   get_all_kernel_builder_ = DlsymFuncObj(GetAllKernelBuilder, plugin_handle_);
   init_hccl_comm_ = DlsymFuncObj(HcclCommInitClusterInfo, plugin_handle_);
   finalize_hccl_comm_ = DlsymFuncObj(HcclCommDestroy, plugin_handle_);
+  single_op_hccl_get_rank_id_ = DlsymFuncObj(HcclGetRankId, plugin_handle_);
+  single_op_hccl_get_rank_size_ = DlsymFuncObj(HcclGetRankSize, plugin_handle_);
   launch_hccl_broadcast_ = DlsymFuncObj(HcclBroadcast, plugin_handle_);
   launch_hccl_all_reduce_ = DlsymFuncObj(HcclAllReduce, plugin_handle_);
   hccl_create_group_ = DlsymFuncObj(HcomCreateGroup, plugin_handle_);
@@ -137,43 +139,53 @@ bool HcclAdapter::InitHccl() {
   return true;
 }
 
-bool HcclAdapter::InitHccl(uint32_t device_id, std::string_view rank_id, std::string_view rank_file) {
-  MS_LOG(INFO) << "Start init hccl adapter.";
+bool HcclAdapter::InitHccl(uint32_t device_id, std::string_view rank_id, std::string_view rank_file,
+                           bool is_graph_mode) {
+  MS_LOG(INFO) << "Start init hccl adapter for " << (is_graph_mode ? "graph mode." : "pynative mode.");
   std::lock_guard<std::mutex> lock(init_mutex_);
   if (init_flag_) {
     MS_LOG(INFO) << "Hccl has been inited, skip.";
     return true;
   }
-
+  is_graph_mode_ = is_graph_mode;
   InitPlugin();
-  bool ret = InitKernelInfoStore(device_id, rank_id, rank_file);
-  if (!ret) {
-    return false;
-  }
-  ret = InitHcclComm(rank_id, rank_file);
-  if (!ret) {
-    return false;
-  }
-  ret = InitHcclExec();
-  if (!ret) {
-    return false;
+  if (is_graph_mode_) {
+    bool ret = InitKernelInfoStore(device_id, rank_id, rank_file);
+    if (!ret) {
+      return false;
+    }
+
+    ret = InitHcclExec();
+    if (!ret) {
+      return false;
+    }
+  } else {
+    bool ret = InitHcclComm(rank_id, rank_file);
+    if (!ret) {
+      return false;
+    }
   }
+
   init_flag_ = true;
   MS_LOG(INFO) << "Init hccl adapter success.";
   return true;
 }
 
 bool HcclAdapter::FinalizeHccl() {
-  MS_LOG(INFO) << "Start destroy hccl adapter.";
   std::lock_guard<std::mutex> lock(init_mutex_);
+  MS_LOG(INFO) << "Start destroy hccl adapter for " << (is_graph_mode_ ? "graph mode." : "pynative mode.");
   if (!init_flag_) {
     MS_LOG(INFO) << "Hccl has never been inited, skip.";
     return true;
   }
 
-  (void)FinalizeHcclExec();
-  (void)FinalizeHcclComm();
-  (void)FinalizeKernelInfoStore();
+  if (is_graph_mode_) {
+    (void)FinalizeHcclExec();
+    (void)FinalizeKernelInfoStore();
+  } else {
+    (void)FinalizeHcclComm();
+  }
+
   FinalizePlugin();
   init_flag_ = false;
   MS_LOG(INFO) << "Destroy hccl adapter success.";
@@ -444,6 +456,16 @@ HcclResult HcclAdapter::HcclDestroyGroup(const std::string &group) const {
   return hccl_destroy_group_(group.c_str());
 }
 
+HcclResult HcclAdapter::HcclGetRankId(uint32_t *rank_id) const {
+  MS_EXCEPTION_IF_NULL(single_op_hccl_get_rank_id_);
+  return single_op_hccl_get_rank_id_(hccl_comm_, rank_id);
+}
+
+HcclResult HcclAdapter::HcclGetRankSize(uint32_t *rank_size) const {
+  MS_EXCEPTION_IF_NULL(single_op_hccl_get_rank_size_);
+  return single_op_hccl_get_rank_size_(hccl_comm_, rank_size);
+}
+
 HcclResult HcclAdapter::HcclGetRankId(const std::string &group, uint32_t *rank_id) const {
   MS_EXCEPTION_IF_NULL(hccl_get_rank_id_);
   return hccl_get_rank_id_(group.c_str(), rank_id);
diff --git a/mindspore/ccsrc/runtime/hccl_adapter/hccl_adapter.h b/mindspore/ccsrc/runtime/hccl_adapter/hccl_adapter.h
index 2ed8685d9fd..6f1d5c40f74 100644
--- a/mindspore/ccsrc/runtime/hccl_adapter/hccl_adapter.h
+++ b/mindspore/ccsrc/runtime/hccl_adapter/hccl_adapter.h
@@ -42,7 +42,7 @@ class HcclAdapter {
   static HcclAdapter &GetInstance();
 
   // common
-  bool InitHccl(uint32_t device_id, std::string_view rank_id, std::string_view rank_file);
+  bool InitHccl(uint32_t device_id, std::string_view rank_id, std::string_view rank_file, bool is_graph_mode);
   bool InitHccl();
   bool FinalizeHccl();
 
@@ -51,6 +51,9 @@ class HcclAdapter {
   HcclResult HcclGetRankId(const std::string &group, uint32_t *rank_id) const;
   HcclResult HcclGetRankSize(const std::string &group, uint32_t *rank_size) const;
 
+  HcclResult HcclGetRankId(uint32_t *rank_id) const;
+  HcclResult HcclGetRankSize(uint32_t *rank_size) const;
+
   // for ge node
   bool GenTask(const AnfNodePtr &node, HcclDataType datatype, std::vector<HcclTaskInfo> *task_info_lists) const;
   int64_t CalcWorkspaceSize(const AnfNodePtr &node, HcclDataType datatype) const;
@@ -104,6 +107,8 @@ class HcclAdapter {
   HcclAllGatherFunObj launch_hccl_all_gather_ = nullptr;
   HcclSendFunObj launch_hccl_send_ = nullptr;
   HcclRecvFunObj launch_hccl_recv_ = nullptr;
+  HcclGetRankIdFunObj single_op_hccl_get_rank_id_ = nullptr;
+  HcclGetRankSizeFunObj single_op_hccl_get_rank_size_ = nullptr;
 
   HcomCreateGroupFunObj hccl_create_group_ = nullptr;
   HcomDestroyGroupFunObj hccl_destroy_group_ = nullptr;
@@ -121,6 +126,7 @@ class HcclAdapter {
   std::shared_ptr<::ge::OpsKernelBuilder> ops_kernel_builder_ = nullptr;
 
   bool init_flag_ = false;
+  bool is_graph_mode_ = false;
   std::mutex init_mutex_;
 };
 }  // namespace mindspore::hccl
diff --git a/mindspore/ccsrc/runtime/hccl_adapter/plugin/hccl_plugin.h b/mindspore/ccsrc/runtime/hccl_adapter/plugin/hccl_plugin.h
index 82e0156abe5..15481269f5d 100644
--- a/mindspore/ccsrc/runtime/hccl_adapter/plugin/hccl_plugin.h
+++ b/mindspore/ccsrc/runtime/hccl_adapter/plugin/hccl_plugin.h
@@ -55,6 +55,9 @@ ORIGIN_METHOD(HcclRecv, HcclResult, void *, uint64_t, HcclDataType, uint32_t, Hc
 
 ORIGIN_METHOD(HcclCommInitClusterInfo, HcclResult, const char *, uint32_t, HcclComm *);
 ORIGIN_METHOD(HcclCommDestroy, HcclResult, HcclComm);
+ORIGIN_METHOD(HcclGetRankId, HcclResult, void *, uint32_t *);
+ORIGIN_METHOD(HcclGetRankSize, HcclResult, void *, uint32_t *);
+
 ORIGIN_METHOD(HcomCreateGroup, HcclResult, const char *, uint32_t, uint32_t *);
 ORIGIN_METHOD(HcomDestroyGroup, HcclResult, const char *);
 ORIGIN_METHOD(HcomGetRankId, HcclResult, const char *, uint32_t *);
diff --git a/mindspore/ccsrc/transform/express_ir/mindir_exporter.cc b/mindspore/ccsrc/transform/express_ir/mindir_exporter.cc
index 4ee7217e8a9..2979885f393 100644
--- a/mindspore/ccsrc/transform/express_ir/mindir_exporter.cc
+++ b/mindspore/ccsrc/transform/express_ir/mindir_exporter.cc
@@ -137,10 +137,11 @@ class IrExportBuilder {
   mind_ir::ModelProto model_;
   mind_ir::NodeProto *last_node_{nullptr};
   std::list<FuncGraphPtr> todo_;
-  std::map<AnfNodePtr, size_t> node_index_map_;
+  std::map<AnfNodePtr, std::string> node_index_map_;
   std::set<std::string> nodeName_;
   size_t node_index_{0};
   size_t shape_index_{0};
+  bool top_graph{true};
 };
 
 using IrExporterPtr = std::shared_ptr<IrExporter>;
@@ -177,6 +178,7 @@ void IrExportBuilder::BuildModelInfo() {
 }
 
 void IrExportBuilder::BuildModel(const FuncGraphPtr &func_graph, bool save_tensor_data) {
+  MS_EXCEPTION_IF_NULL(func_graph);
   mind_ir::GraphProto *graph_proto = model_.mutable_graph();
   graph_proto->set_name(func_graph->ToString());
   graph_proto->set_bprop_hash(func_graph->bprop_hash());
@@ -185,9 +187,11 @@ void IrExportBuilder::BuildModel(const FuncGraphPtr &func_graph, bool save_tenso
   nodeName_.clear();
   // Build the main funcGraph
   nodeName_.insert(func_graph->ToString());
+  top_graph = true;
   BuildFuncGraph(func_graph, graph_proto, save_tensor_data);
   std::set<FuncGraphPtr> graphVisited;
   graphVisited.insert(func_graph);
+  top_graph = false;
   while (!todo_.empty()) {
     FuncGraphPtr fg = todo_.back();
     todo_.pop_back();
@@ -204,6 +208,7 @@ void IrExportBuilder::BuildModel(const FuncGraphPtr &func_graph, bool save_tenso
   }
   // Release resource
   nodeName_.clear();
+  node_index_map_.clear();
 }
 
 void IrExportBuilder::BuildFuncGraph(const FuncGraphPtr &func_graph, mind_ir::GraphProto *const graph_proto,
@@ -221,14 +226,17 @@ void IrExportBuilder::BuildFuncGraph(const FuncGraphPtr &func_graph, mind_ir::Gr
 
 void IrExportBuilder::BuildParameters(const FuncGraphPtr &func_graph, mind_ir::GraphProto *const graph_proto,
                                       bool save_tensor_data) {
+  MS_EXCEPTION_IF_NULL(func_graph);
+  MS_EXCEPTION_IF_NULL(graph_proto);
   for (auto &item : func_graph->parameters()) {
+    MS_EXCEPTION_IF_NULL(item);
     auto param = item->cast<ParameterPtr>();
     if (param == nullptr) {
       MS_LOG(EXCEPTION) << "Parameter: '" << item->ToString() << "' could not cast to parameter.";
     }
     std::string param_name = GetUniqueNodeName(param);
-    if (param->has_default()) {
-      MS_LOG(DEBUG) << "Parameter: '" << item->ToString() << "' has default.";
+    if (top_graph && param->has_default()) {
+      MS_LOG(DEBUG) << "Parameter: '" << item->DebugString() << "' has default. address: " << (size_t)param.get();
       mind_ir::TensorProto *parameter_proto = graph_proto->add_parameter();
       parameter_proto->set_name(param_name);
       SetParamToTensorProto(param, parameter_proto);
@@ -292,6 +300,7 @@ void IrExportBuilder::SetValueInfoProto(const AnfNodePtr &node, mind_ir::ValueIn
   }
   if (type->isa<TensorType>() && shape->isa<abstract::Shape>()) {
     auto tensor = type->cast<TensorTypePtr>();
+    MS_EXCEPTION_IF_NULL(tensor);
     auto elem_type = tensor->element();
     const auto &dims = shape->cast<abstract::ShapePtr>()->shape();
     mind_ir::TensorProto *tensor_proto = value_proto->add_tensor();
@@ -308,11 +317,10 @@ void IrExportBuilder::SetValueInfoProto(const AnfNodePtr &node, mind_ir::ValueIn
   } else if (type->isa<Tuple>()) {
     auto tup_shape = shape->cast<abstract::TupleShapePtr>();
     value_proto->set_denotation(type->type_name() + ":" + std::to_string(tup_shape->shape().size()));
-  } else if (type->isa<Number>() || type->isa<String>()) {
-    value_proto->set_denotation(type->type_name());
   } else {
-    MS_LOG(EXCEPTION) << "Value type: " << type->type_name() << " is not supported!";
+    value_proto->set_denotation(type->type_name());
   }
+  MS_LOG(DEBUG) << "Value type: " << type->type_name();
 }
 
 void IrExportBuilder::SetTensorToAttributeProto(const ValuePtr &value, mind_ir::AttributeProto *const attr_proto) {
@@ -324,6 +332,7 @@ void IrExportBuilder::SetTensorToAttributeProto(const ValuePtr &value, mind_ir::
   mind_ir::TensorProto *tensor_proto = attr_proto->add_tensors();
   tensor_proto->set_name("value0");
   auto data = value->cast<tensor::TensorPtr>();
+  MS_EXCEPTION_IF_NULL(data);
   tensor_proto->set_raw_data(data->data_c(), static_cast<size_t>(data->data().nbytes()));
   auto dtype = data->data_type();
   auto shape = data->shape_c();
@@ -356,34 +365,31 @@ void IrExportBuilder::SetParamToTensorProto(const ParameterPtr &param, mind_ir::
 
 void IrExportBuilder::BuildNodes(const FuncGraphPtr &func_graph, mind_ir::GraphProto *const graph_proto) {
   std::vector<AnfNodePtr> nodes = TopoSort(func_graph->get_return(), SuccIncoming, AlwaysInclude);
-  bool is_only_return = true;
   for (const AnfNodePtr &node : nodes) {
+    MS_EXCEPTION_IF_NULL(node);
     if (!node->isa<CNode>()) {
       MS_LOG(DEBUG) << "Node: '" << node->ToString() << "' is not cnode";
       continue;
     }
     auto cnode = node->cast<CNodePtr>();
     if (cnode == func_graph->get_return()) {
-      if (is_only_return) {
-        MS_LOG(EXCEPTION) << "Only has return node, can't convert to binary model!";
-      }
       BuildOutput(cnode, graph_proto);
     } else {
       BuildCNode(cnode, graph_proto);
-      is_only_return = false;
     }
   }
 }
 
 void IrExportBuilder::BuildOutput(const CNodePtr &node, mind_ir::GraphProto *const graph_proto) {
-  if (node->size() != 2) {
+  MS_EXCEPTION_IF_NULL(node);
+  const int OutputSize = 2;
+  if (node->size() != OutputSize) {
     MS_LOG(EXCEPTION) << "Number of inputs of return node is not equal to 2.";
   }
   AnfNodePtr arg = node->input(1);
+  std::string node_name = BuildInputNode(arg, graph_proto);
   mind_ir::ValueInfoProto *output_proto = graph_proto->add_output();
-  std::string output_name = GetUniqueNodeName(node);
-  output_proto->set_name(output_name);
-  last_node_->set_output(0, output_name);
+  output_proto->set_name(node_name);
   SetValueInfoProto(arg, output_proto);
 }
 
@@ -392,9 +398,11 @@ std::string IrExportBuilder::GetOpTypeName(const AnfNodePtr &node) {
   std::string type_name = "";
   if (IsValueNode<Primitive>(node)) {
     PrimitivePtr prim = GetValueNode<PrimitivePtr>(node);
+    MS_EXCEPTION_IF_NULL(prim);
     type_name = prim->ToString();
   } else if (IsValueNode<FuncGraph>(node)) {
     FuncGraphPtr fg = GetValueNode<FuncGraphPtr>(node);
+    MS_EXCEPTION_IF_NULL(fg);
     todo_.push_back(fg);
     type_name = "REF::" + fg->ToString();
   } else if (node->isa<CNode>() || node->isa<Parameter>()) {
@@ -412,10 +420,9 @@ std::string IrExportBuilder::GetOpTypeName(const AnfNodePtr &node) {
 
 void IrExportBuilder::SetShapeToNodeProto(const TypePtr &type, const BaseShapePtr &shape,
                                           mind_ir::AttributeProto *const attr_proto, std::string *const seq_string) {
-  if (seq_string == nullptr) {
-    MS_LOG(EXCEPTION) << "seq_string is nullptr.";
-  }
-
+  MS_EXCEPTION_IF_NULL(type);
+  MS_EXCEPTION_IF_NULL(shape);
+  MS_EXCEPTION_IF_NULL(seq_string);
   if (type->isa<Tuple>()) {
     *seq_string += "Tuple[";
     auto elements = type->cast<TuplePtr>()->elements();
@@ -541,32 +548,24 @@ std::string IrExportBuilder::GetUniqueNodeName(const AnfNodePtr &node) {
   // Naming anfnode
   // 1. parameter is unique in one func_graph
   // 2. cnode and valuenode may be reduplicative, so add index to identify.
-  std::string node_name = "";
-  if (node->isa<Parameter>()) {
-    node_name = GetNodeName(node);
-  } else if (node->isa<CNode>()) {
-    auto iter = node_index_map_.find(node);
-    if (iter != node_index_map_.end()) {
-      node_name = GetNodeName(node) + ":" + std::to_string(iter->second);
-    } else {
-      auto node_idx = GetNodeIndex();
-      node_index_map_[node] = node_idx;
-      node_name = GetNodeName(node) + ":" + std::to_string(node_idx);
-    }
-  } else if (node->isa<ValueNode>()) {
-    auto node_idx = GetNodeIndex();
-    node_index_map_[node] = node_idx;
-    node_name = GetNodeName(node) + ":" + std::to_string(node_idx);
+  auto iter = node_index_map_.find(node);
+  if (iter != node_index_map_.end()) {
+    return iter->second;
   } else {
-    MS_LOG(EXCEPTION) << "Can not support type of node:" << node->ToString();
+    std::string node_name = GetNodeName(node);
+    while (nodeName_.count(node_name) > 0) {
+      auto node_idx = GetNodeIndex();
+      node_name = node_name + ":" + std::to_string(node_idx);
+    }
+    node_index_map_[node] = node_name;
+    return node_name;
   }
-  MS_LOG(DEBUG) << "Node name: " << node_name;
-  return node_name;
 }
 
 std::string IrExportBuilder::GetNodeName(const AnfNodePtr &node) {
+  MS_EXCEPTION_IF_NULL(node);
   std::string node_name = "";
-  if ((node != nullptr) && (node->func_graph() != nullptr)) {
+  if (node->func_graph() != nullptr) {
     node_name = node->func_graph()->ToString() + ":";
   }
   if (node->isa<ValueNode>()) {
@@ -583,7 +582,9 @@ void IrExportBuilder::SetAttributeProto(const AnfNodePtr &node, mind_ir::NodePro
   if (node == nullptr || node_proto == nullptr) {
     MS_LOG(EXCEPTION) << "AnfNode or NodeProto is null!";
   }
-  auto value = node->cast<ValueNodePtr>()->value();
+  auto value_node = node->cast<ValueNodePtr>();
+  MS_EXCEPTION_IF_NULL(value_node);
+  auto value = value_node->value();
   node_proto->set_op_type("Constant");
   mind_ir::AttributeProto *attr_proto = node_proto->add_attribute();
   attr_proto->set_name("value");
@@ -668,6 +669,9 @@ void IrExportBuilder::SetValueToAttributeProto(const ValuePtr &value, mind_ir::A
 }
 
 void IrExportBuilder::SetScalarToAttributeProto_ir(const ValuePtr &value, mind_ir::AttributeProto *const attr_proto) {
+  if (value == nullptr || attr_proto == nullptr) {
+    MS_LOG(EXCEPTION) << "ValuePtr or AttributeProto is null!";
+  }
   attr_proto->set_ref_attr_name("scalar:value0");
   if (value->isa<StringImm>()) {
     attr_proto->set_type(mind_ir::AttributeProto_AttributeType_STRING);
@@ -714,6 +718,9 @@ void IrExportBuilder::SetScalarToAttributeProto_ir(const ValuePtr &value, mind_i
 }
 
 void IrExportBuilder::SetScalarToAttributeProto_irs(const ValuePtr &value, mind_ir::AttributeProto *const attr_proto) {
+  if (value == nullptr || attr_proto == nullptr) {
+    MS_LOG(EXCEPTION) << "ValuePtr or AttributeProto is null!";
+  }
   if (value->isa<Int>()) {
     attr_proto->set_type(mind_ir::AttributeProto_AttributeType_TENSORS);
     mind_ir::TensorProto *tensor_proto = attr_proto->add_tensors();
@@ -808,6 +815,7 @@ void IrExportBuilder::SetSequenceToAttributeProto(const ValueSequeuePtr &value,
       return;
     }
     for (const auto &item : list_value->value()) {
+      MS_EXCEPTION_IF_NULL(item);
       if (item->isa<ValueList>()) {
         SetSequenceToAttributeProto(item->cast<ValueListPtr>(), attr_proto, seq_string);
       } else {
diff --git a/mindspore/ccsrc/transform/graph_ir/convert.cc b/mindspore/ccsrc/transform/graph_ir/convert.cc
index 9b504abbaf0..4521af5ea98 100644
--- a/mindspore/ccsrc/transform/graph_ir/convert.cc
+++ b/mindspore/ccsrc/transform/graph_ir/convert.cc
@@ -55,6 +55,7 @@ using Data = ge::op::Data;
 
 namespace {
 std::vector<AnfNodePtr> GetOrderedCNodes(const FuncGraphPtr fg) {
+  MS_EXCEPTION_IF_NULL(fg);
   auto BelongSameGraph = std::bind(IncludeBelongGraph, fg, std::placeholders::_1);
   auto succ_include_fv = [&fg](const AnfNodePtr &node) -> std::vector<AnfNodePtr> {
     std::vector<AnfNodePtr> vecs;
@@ -132,6 +133,7 @@ OpAdapterPtr DfGraphConvertor::FindAdapter(const AnfNodePtr node, bool train) {
 }
 
 void DfGraphConvertor::InitLoopVar(std::vector<ge::Operator> *init_input) {
+  MS_EXCEPTION_IF_NULL(init_input);
   if (this->training_) {
     GeTensorDesc desc(GeShape(), ge::FORMAT_NCHW, ge::DT_INT64);
     auto var_iter_num = std::make_shared<Variable>("npu_runconfig/iterations_per_loop");
@@ -237,6 +239,7 @@ void DfGraphConvertor::SetupParamInitSubGraph(const TensorOrderMap &tensors, std
   std::vector<AnfNodePtr> nodes = GetOrderedCNodes(anf_graph_);
 
   for (auto &it : nodes) {
+    MS_EXCEPTION_IF_NULL(it);
     if (it->isa<ValueNode>()) {
       if (IsValueNode<SymbolicKeyInstance>(it)) {
         auto symbolic = GetValueNode<SymbolicKeyInstancePtr>(it);
@@ -251,6 +254,7 @@ void DfGraphConvertor::SetupParamInitSubGraph(const TensorOrderMap &tensors, std
         }
       } else if (IsValueNode<RefKey>(it)) {
         auto refkey = GetValueNode<RefKeyPtr>(it);
+        MS_EXCEPTION_IF_NULL(refkey);
         auto name = refkey->tag();
         auto iter = vars_.find(name);  // get corresponding variable op
         if (iter != vars_.end()) {
@@ -771,9 +775,10 @@ void DfGraphConvertor::GetCaseNodeInput(const CNodePtr node, const CNodePtr inpu
     case_inputs.emplace_back(node->input(i));
   }
   auto bnode = input_node->input(2)->cast<CNodePtr>();
-
+  MS_EXCEPTION_IF_NULL(bnode);
   for (size_t i = 1; i < bnode->inputs().size(); i++) {
     auto branch_node = bnode->input(i)->cast<CNodePtr>();
+    MS_EXCEPTION_IF_NULL(branch_node);
     for (size_t j = 2; j < branch_node->inputs().size(); j++) {
       if (std::find(case_inputs.begin(), case_inputs.end(), branch_node->input(j)) == case_inputs.end()) {
         case_inputs.emplace_back(branch_node->input(j));
@@ -1073,7 +1078,9 @@ void DfGraphConvertor::AddEdgeForLoad(const AnfNodePtr &node) {
   }
   auto manager = func_graph->manager();
   MS_EXCEPTION_IF_NULL(manager);
-
+  if (manager->node_users().find(node) == manager->node_users().end()) {
+    MS_LOG(EXCEPTION) << "Can't find node in nodes_users.";
+  }
   auto &users = manager->node_users()[node];
   std::shared_ptr<std::vector<AnfNodePtr>> src_node_list = std::make_shared<std::vector<AnfNodePtr>>();
   std::shared_ptr<std::vector<AnfNodePtr>> dst_node_list = std::make_shared<std::vector<AnfNodePtr>>();
@@ -1101,6 +1108,7 @@ void DfGraphConvertor::AddEdgeForLoad(const AnfNodePtr &node) {
 
 void DfGraphConvertor::FindDestOps(const AnfNodePtr &node, const std::shared_ptr<std::vector<AnfNodePtr>> &node_list,
                                    bool top) {
+  MS_EXCEPTION_IF_NULL(node);
   auto func_graph = node->func_graph();
   MS_EXCEPTION_IF_NULL(func_graph);
   auto mng = func_graph->manager();
@@ -1356,6 +1364,7 @@ void DfGraphConvertor::ProcessSubgraph(AnfNodePtr node, const std::vector<AnfNod
     return;
   }
   auto graph_node = node->cast<CNodePtr>()->input(1)->cast<ValueNodePtr>();
+  MS_EXCEPTION_IF_NULL(graph_node);
   FuncGraphPtr anf_graph = graph_node->value()->cast<FuncGraphPtr>();
   DfGraphConvertor converter(anf_graph);
   converter.use_inputs_ = true;
@@ -1449,13 +1458,16 @@ void DfGraphConvertor::ConvertMakeTuple(const CNodePtr node) {
 }
 
 void DfGraphConvertor::ConvertTopK(const CNodePtr node) {
+  MS_EXCEPTION_IF_NULL(node);
   MS_LOG(INFO) << "Convert TopK second input's type from int64 to int32.";
   auto value_ptr = node->input(2)->cast<ValueNodePtr>();
   std::ostringstream ss;
   ss << "op" << value_ptr.get();
   op_draw_name_[value_ptr.get()] = ss.str();
   compute_sout_ << ss.str() << "[label= \"" << value_ptr->value()->ToString() << "\" shape=ellipse]" << endl;
-  auto int64_value = value_ptr->value()->cast<Int64ImmPtr>()->value();
+  MS_EXCEPTION_IF_NULL(value_ptr);
+  auto input_value = value_ptr->value();
+  auto int64_value = GetValue<int64_t>(input_value);
   OpAdapterPtr adpt = FindAdapter(value_ptr, training_);
   auto op = adpt->generate(value_ptr);
   adpt->setAttr(op, "value", static_cast<int32_t>(int64_value));
diff --git a/mindspore/ccsrc/utils/comm_manager.cc b/mindspore/ccsrc/utils/comm_manager.cc
index 6b943ffc4f7..a351c53cd44 100644
--- a/mindspore/ccsrc/utils/comm_manager.cc
+++ b/mindspore/ccsrc/utils/comm_manager.cc
@@ -16,6 +16,9 @@
 
 #include "utils/comm_manager.h"
 #include "utils/convert_utils.h"
+#include "utils/ms_context.h"
+#include "frontend/parallel/context.h"
+#include "frontend/parallel/group_manager.h"
 
 #ifndef NO_DLIB
 #include "runtime/hccl_adapter/hccl_adapter.h"
@@ -174,4 +177,28 @@ bool CommManager::GetRankSize(const string &group, unsigned int *rank_size) cons
 
 bool CommManager::DestroyGroup(const string &group) const { return true; }
 #endif
+
+uint32_t GetRank() {
+  uint32_t rank_id = 0;
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  std::string world_group;
+  std::string backend = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
+  if (backend == kAscendDevice) {
+    world_group = parallel::HCCL_WORLD_GROUP;
+  } else if (backend == kGPUDevice) {
+    world_group = parallel::NCCL_WORLD_GROUP;
+  } else {
+    // Other backends like CPU not support parallel, return rank_id with default 0.
+    return rank_id;
+  }
+  auto parallel_context = parallel::ParallelContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(parallel_context);
+  if (parallel_context->parallel_mode() != parallel::STAND_ALONE) {
+    if (!CommManager::GetInstance().GetRankID(world_group, &rank_id)) {
+      MS_LOG(EXCEPTION) << "Get rank id failed.";
+    }
+  }
+  return rank_id;
+}
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/utils/comm_manager.h b/mindspore/ccsrc/utils/comm_manager.h
index 002d0c35e9c..a8bc661249b 100644
--- a/mindspore/ccsrc/utils/comm_manager.h
+++ b/mindspore/ccsrc/utils/comm_manager.h
@@ -44,6 +44,6 @@ class CommManager {
   string backend_;
 };
 
+uint32_t GetRank();
 }  // namespace mindspore
-
 #endif  // MINDSPORE_CCSRC_UTILS_COMMUNICATION_MANAGER_H
diff --git a/mindspore/ccsrc/utils/context/context_extends.cc b/mindspore/ccsrc/utils/context/context_extends.cc
index 7cc7d71381b..10d548e0409 100644
--- a/mindspore/ccsrc/utils/context/context_extends.cc
+++ b/mindspore/ccsrc/utils/context/context_extends.cc
@@ -81,7 +81,7 @@ bool OpenTsd(const std::shared_ptr<MsContext> &ms_context_ptr) {
   }
 
   MS_LOG(INFO) << "Device id = " << device_id << ", rank size = " << rank_size << ".";
-  auto ret = rtSetDevice(device_id);
+  auto ret = rtSetDevice(static_cast<int32_t>(device_id));
   if (ret != RT_ERROR_NONE) {
     MS_LOG(EXCEPTION) << "Device " << device_id << " call rtSetDevice failed, ret[" << static_cast<int>(ret) << "]";
   }
@@ -111,7 +111,7 @@ bool CloseTsd(const std::shared_ptr<MsContext> &ms_context_ptr, bool force) {
     ms_context_ptr->DestroyTensorPrintThread();
 #endif
     uint32_t device_id = ms_context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
-    auto ret = rtDeviceReset(device_id);
+    auto ret = rtDeviceReset(static_cast<int32_t>(device_id));
     if (ret != RT_ERROR_NONE) {
       MS_LOG(EXCEPTION) << "Device " << device_id << " call rtDeviceReset failed, ret[" << static_cast<int>(ret) << "]";
       return false;
@@ -362,6 +362,8 @@ struct DeviceTypeSetRegister {
 #endif
     });
   }
+  DeviceTypeSetRegister(DeviceTypeSetRegister &) = delete;
+  DeviceTypeSetRegister &operator=(const DeviceTypeSetRegister &) = delete;
   ~DeviceTypeSetRegister() = default;
 } device_type_set_regsiter;
 }  // namespace context
diff --git a/mindspore/ccsrc/utils/tensorprint_utils.cc b/mindspore/ccsrc/utils/tensorprint_utils.cc
index f642d0301c0..e64aa3388a8 100644
--- a/mindspore/ccsrc/utils/tensorprint_utils.cc
+++ b/mindspore/ccsrc/utils/tensorprint_utils.cc
@@ -279,6 +279,7 @@ void TensorPrint::operator()() {
       acltdtDataset *acl_dataset = acltdtCreateDataset();
       if (acl_dataset == nullptr) {
         MS_LOG(ERROR) << "Failed to create acl dateaset.";
+        break;
       }
       if (acltdtReceiveTensor(acl_handle_, acl_dataset, -1 /* no timeout */) != ACL_SUCCESS) {
         MS_LOG(ERROR) << "AclHandle failed to receive tensor.";
@@ -295,6 +296,7 @@ void TensorPrint::operator()() {
       acltdtDataset *acl_dataset = acltdtCreateDataset();
       if (acl_dataset == nullptr) {
         MS_LOG(ERROR) << "Failed to create acl dateaset.";
+        break;
       }
       if (acltdtReceiveTensor(acl_handle_, acl_dataset, -1 /* no timeout */) != ACL_SUCCESS) {
         MS_LOG(ERROR) << "Acltdt failed to receive tensor.";
diff --git a/mindspore/ccsrc/utils/utils.h b/mindspore/ccsrc/utils/utils.h
index a4983b310a4..57729bf395d 100644
--- a/mindspore/ccsrc/utils/utils.h
+++ b/mindspore/ccsrc/utils/utils.h
@@ -129,6 +129,7 @@ constexpr auto kBNTrainingReduceGradOpName = "BNTrainingReduceGrad";
 constexpr auto kSquareSumV1OpName = "SquareSumV1";
 constexpr auto kSquareSumV2OpName = "SquareSumV2";
 constexpr auto kClipByNormNoDivSumOpName = "ClipByNormNoDivSum";
+constexpr auto kPReluOpName = "PReLU";
 constexpr auto kGreaterOpName = "Greater";
 constexpr auto kSqrtOpName = "Sqrt";
 constexpr auto kRsqrtOpName = "Rsqrt";
@@ -274,6 +275,7 @@ constexpr auto kDynamicRNNOpName = "DynamicRNN";
 constexpr auto kLSTMInputGradOpName = "LSTMInputGrad";
 constexpr auto kDynamicGRUV2OpName = "DynamicGRUV2";
 constexpr auto kGRUV2HiddenGradOpName = "GRUV2HiddenGrad";
+constexpr auto kGRUV2HiddenGradCellOpName = "GRUV2HiddenGradCell";
 constexpr auto kFusedSparseFtrlName = "FusedSparseFtrl";
 constexpr auto kFusedSparseProximalAdagradName = "FusedSparseProximalAdagrad";
 constexpr auto kFusedSparseLazyAdamName = "FusedSparseLazyAdam";
@@ -345,6 +347,7 @@ constexpr auto kAttrAtomicOutputIndexs = "atomic_output_clean_indexs";
 constexpr auto kAttrAtomicWorkspaceIndexs = "atomic_workspace_clean_indexs";
 constexpr auto kAttrSwitchCondition = "switch_condition";
 constexpr auto kAttrDataType = "data_type";
+constexpr auto kAttrDType = "dtype";
 constexpr auto kAttrActiveTarget = "active_target";
 constexpr auto kAttrActiveStreamId = "active_stream_id";
 constexpr auto kAttrActiveStreamList = "active_stream_list";
@@ -370,6 +373,7 @@ constexpr auto kAttrFpBpEnd = "fpbp_end";
 constexpr auto kAttrFusion = "fusion";
 constexpr auto kAttrGroup = "group";
 constexpr auto kAttrGroups = "groups";
+constexpr auto kAttrGroupBack = "group_back";
 constexpr auto kAttrFracZGroup = "fracz_group";
 constexpr auto kAttrFracZGroupIdx = "fracz_group_idx";
 constexpr auto kAttrOp = "op";
@@ -507,6 +511,8 @@ constexpr auto kUpdateStateRealInput = 2;
 // index define of Load
 constexpr auto kLoadRealInput = 1;
 constexpr auto kLoadStateInput = 2;
+// time transfer unit
+constexpr int kBasicTimeTransferUnit = 1000;
 // index of input or output
 enum Index : size_t {
   kIndex0 = 0,
@@ -631,6 +637,8 @@ const std::set<std::string> kComputeDepend = {kUniqueOpName,       kComputeAccid
 const std::set<std::string> k3DFormatSet = {kOpFormat_NCDHW, kOpFormat_NDC1HWC0, kOpFormat_FRACTAL_Z_3D,
                                             kOpFormat_NDHWC, kOpFormat_DHWCN,    kOpFormat_DHWNC};
 
+const std::set<std::string> kNoPaddingFormatSet = {kOpFormat_ChannelLast, kOpFormat_FRAC_NZ};
+
 const std::set<std::string> DynamicShapeConstInputToAttr = {
   kCastOpName,       kExpandDimsOpName, kReshapeOpName,   kEmbeddingLookupOpName, kTransposeOpName, kReduceMinOpName,
   kReduceMeanOpName, kReduceMaxOpName,  kReduceAllOpName, kReduceAnyOpName,       kConcatOpName};
diff --git a/mindspore/ccsrc/vm/backend.cc b/mindspore/ccsrc/vm/backend.cc
index 61885e7c65d..46329a4b3f5 100644
--- a/mindspore/ccsrc/vm/backend.cc
+++ b/mindspore/ccsrc/vm/backend.cc
@@ -46,7 +46,10 @@
 
 namespace mindspore {
 namespace compile {
-bool Backend::GetCond(const BaseRef &c, bool *const value) { return BaseRefToBool(c, value); }
+bool Backend::GetCond(const BaseRef &c, bool *const value) {
+  mindspore::ScopedLongRunning long_running;
+  return BaseRefToBool(c, value);
+}
 bool Backend::GetIndex(const BaseRef &c, int64_t *const value) { return BaseRefToInt(utils::cast<ValuePtr>(c), value); }
 
 Backend::Backend(const std::string &name) : name_(name) {
@@ -289,14 +292,6 @@ VectorRef MsBackend::MsRunGraph(const GraphId &g, const VectorRef &args, const s
   return outputs;
 }
 
-void MsBackend::Link(GraphId graph_id) {
-  MS_EXCEPTION_IF_NULL(target_sess_);
-  if (graph_id == kInvalidGraphId) {
-    graph_id = target_sess_->GetFinalRunGraph();
-  }
-  target_sess_->BuildGraph(graph_id);
-}
-
 MsBackend::MsBackend(const std::string &name, const std::string &target, uint32_t device_id) : Backend(name) {
   convert_fn_ = std::bind(&MsBackend::MsConvert, this, std::placeholders::_1, std::placeholders::_2);
   target_sess_ = session::SessionFactory::Get().Create(target);
@@ -364,8 +359,9 @@ MindRTBackend::MindRTBackend(const std::string &backend_name, const std::string
 const ActorInfo &MindRTBackend::CompileGraphs(const FuncGraphPtr &func_graph) {
   MS_EXCEPTION_IF_NULL(graph_compiler_);
   MS_EXCEPTION_IF_NULL(func_graph);
-  root_graph_ = WrapPrimitives(func_graph);
-  MS_EXCEPTION_IF_NULL(root_graph_);
+  auto root_graph = WrapPrimitives(func_graph);
+  MS_EXCEPTION_IF_NULL(root_graph);
+  root_graph_ = root_graph.get();
   // Register a summary callback function, which is called in the final stages of summary.
   graph_compiler_->RegisterSummaryCallBackFunc(callbacks::SummarySaveCallback);
 
@@ -377,11 +373,11 @@ const ActorInfo &MindRTBackend::CompileGraphs(const FuncGraphPtr &func_graph) {
   // Compile root graph.
   graph_id_to_device_context_.clear();
   control_nodes_.clear();
-  CompileGraph(root_graph_);
+  CompileGraph(root_graph);
 
   // Compile sub graphs.
-  MS_EXCEPTION_IF_NULL(root_graph_->manager());
-  FuncGraphSet sub_graphs = root_graph_->manager()->func_graphs();
+  MS_EXCEPTION_IF_NULL(root_graph->manager());
+  FuncGraphSet sub_graphs = root_graph->manager()->func_graphs();
   for (auto sub_graph : sub_graphs) {
     if (sub_graph != func_graph && sub_graph != nullptr) {
       CompileGraph(sub_graph);
@@ -389,7 +385,7 @@ const ActorInfo &MindRTBackend::CompileGraphs(const FuncGraphPtr &func_graph) {
   }
 
   // Construct the graph compiler info.
-  auto graph_compiler_info = ConstructGraphCompilerInfo(root_graph_);
+  auto graph_compiler_info = ConstructGraphCompilerInfo(root_graph);
 
   if (real_execution_mode_ == kGraphMode) {
     // Transform graph to actor DAG, and schedule the actor DAG.
@@ -486,7 +482,10 @@ const ActorInfo &MindRTBackend::CompileGraph(const OpRunInfo &op_run_info, const
   graph_info_to_device_context_.clear();
   graph_info_to_device_context_[graph_info] = device_context;
 
-  auto graph_compiler_info = ConstructGraphCompilerInfo(actor_info, tensors_mask, input_tensors);
+  auto context_ptr = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context_ptr);
+  bool enable_cache = context_ptr->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE);
+  auto graph_compiler_info = ConstructGraphCompilerInfo(actor_info, tensors_mask, input_tensors, !enable_cache);
   const auto actor_set = runtime::GraphScheduler::GetInstance().Transform(*graph_compiler_info);
   runtime::GraphScheduler::GetInstance().Schedule(actor_set);
   MS_EXCEPTION_IF_NULL(graph_compiler_info);
@@ -778,7 +777,7 @@ void MindRTBackend::RunGraphBySingleOp(const std::vector<KernelGraphPtr> &graphs
 }
 
 void MindRTBackend::RunGraph(const ActorInfo &actor_info, const VectorRef &args, VectorRef *outputs) {
-  MS_LOG(INFO) << "Run actor begin, actor name: " << actor_info;
+  MS_LOG(DEBUG) << "Run actor begin, actor name: " << actor_info;
   MS_EXCEPTION_IF_NULL(root_graph_);
   if (IsGraphOutputValueNodeOrParameter(root_graph_->output(), args, outputs)) {
     return;
@@ -978,13 +977,13 @@ std::unique_ptr<GraphCompilerInfo> MindRTBackend::ConstructGraphCompilerInfo(con
   std::vector<std::vector<int64_t> *> tensors_mask;
   std::vector<std::vector<tensor::TensorPtr> *> input_tensors;
   return std::make_unique<GraphCompilerInfo>(graphs, device_contexts, tensors_mask, input_tensors, control_nodes_,
-                                             root_graph->parameters(), parser, outputs_order, outputs_num, name,
+                                             root_graph->parameters(), parser, outputs_order, outputs_num, name, false,
                                              runtime::GraphExecutionStrategy::kPipeline);
 }
 
 std::unique_ptr<GraphCompilerInfo> MindRTBackend::ConstructGraphCompilerInfo(
   const ActorInfo &actor_info, const std::vector<int64_t> *tensors_mask,
-  const std::vector<tensor::TensorPtr> *input_tensors) {
+  const std::vector<tensor::TensorPtr> *input_tensors, bool need_erase) {
   MS_EXCEPTION_IF_NULL(graph_compiler_);
   std::vector<KernelGraphPtr> graphs;
   std::vector<DeviceContext *> device_contexts;
@@ -1013,10 +1012,33 @@ std::unique_ptr<GraphCompilerInfo> MindRTBackend::ConstructGraphCompilerInfo(
   auto parser = std::make_shared<ControlNodeParser>();
   return std::make_unique<GraphCompilerInfo>(graphs, device_contexts, tensors_mask_list, input_tensors_list,
                                              std::vector<AnfNodePtr>(), std::vector<AnfNodePtr>(), parser,
-                                             outputs_order, outputs_order.size(), actor_info,
+                                             outputs_order, outputs_order.size(), actor_info, need_erase,
                                              runtime::GraphExecutionStrategy::kStep);
 }
 
+void MindRTBackend::EraseSingleOpCache(const ActorInfo &actor_info, const KernelGraphPtr &graph) {
+  if (graph_info_to_device_context_.empty()) {
+    MS_LOG(EXCEPTION) << "The map graph_info_to_device_context_ is empty.";
+  }
+  const auto &graph_info = graph_info_to_device_context_.begin()->first;
+  graph_compiler_->EraseSingleOpCache(graph_info, graph->graph_id());
+  actor_to_graph_compiler_info_.erase(actor_info);
+}
+
+void DebugStreamSync(const GraphCompilerInfo &graph_compiler_info) {
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  auto enable_sync_run = ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_SYNCHRONIZE);
+  if (enable_sync_run) {
+    if (!graph_compiler_info.device_contexts_.empty()) {
+      MS_EXCEPTION_IF_NULL(graph_compiler_info.device_contexts_[0]);
+      if (!graph_compiler_info.device_contexts_[0]->SyncStream()) {
+        MS_LOG(EXCEPTION) << "Sync stream failed!";
+      }
+    }
+  }
+}
+
 void MindRTBackend::RunGraph(const ActorInfo &actor_info, OpRunInfo *op_run_info,
                              const std::vector<int64_t> *tensors_mask,
                              const std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs) {
@@ -1056,6 +1078,9 @@ void MindRTBackend::RunGraph(const ActorInfo &actor_info, OpRunInfo *op_run_info
     MS_LOG(EXCEPTION) << "The actor runs failed, actor name: " << actor_set->name_;
   }
 
+  // Debug for pynative
+  DebugStreamSync(graph_compiler_info);
+
   // Fetch outputs.
   const auto &graph = graph_compiler_info.graphs_.front();
   MS_EXCEPTION_IF_NULL(graph);
@@ -1084,6 +1109,10 @@ void MindRTBackend::RunGraph(const ActorInfo &actor_info, OpRunInfo *op_run_info
   // Update device address for input and output of graph.
   UpdateOutputDeviceAddress(output_nodes, graph_compiler_info.device_contexts_.front());
   UpdateInputDeviceAddress(graph);
+
+  if (graph_compiler_info.need_erase_) {
+    EraseSingleOpCache(actor_info, graph);
+  }
 }
 }  // namespace compile
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/vm/backend.h b/mindspore/ccsrc/vm/backend.h
index 005880eb35e..ba6e025c99e 100644
--- a/mindspore/ccsrc/vm/backend.h
+++ b/mindspore/ccsrc/vm/backend.h
@@ -61,7 +61,6 @@ class Backend {
   virtual bool GetCond(const BaseRef &c, bool *value);
   virtual bool GetIndex(const BaseRef &c, int64_t *value);
   virtual GraphId CompileGraph(NotNull<FuncGraphPtr> fg) { return kInvalidGraphId; }
-  virtual void Link(GraphId) {}
   virtual void SetDebugger() {}
 
   bool is_multi_graph_sink() const { return is_multi_graph_sink_; }
@@ -82,7 +81,6 @@ class MsBackend : public Backend {
   VectorRef MsRunGraph(const GraphId &g, const VectorRef &args, const std::string &target = "");
 
   VectorRef MsSimuRunGraph(const GraphId &g);
-  void Link(GraphId) override;
   GraphId CompileGraph(NotNull<FuncGraphPtr> fg) override;
   VectorRef RunGraph(GraphId graph_id, const VectorRef &args);
   void ClearSessionGraphs();
@@ -139,7 +137,12 @@ class MindRTBackend : public Backend {
   // Construct the GraphCompilerInfo by the compilation results of graph, used in PyNative mode.
   std::unique_ptr<GraphCompilerInfo> ConstructGraphCompilerInfo(const ActorInfo &actor_info,
                                                                 const std::vector<int64_t> *tensors_mask,
-                                                                const std::vector<tensor::TensorPtr> *input_tensors);
+                                                                const std::vector<tensor::TensorPtr> *input_tensors,
+                                                                bool need_erase);
+
+  // In PyNative mode, the size of single op cache list will be increasing, which lead to memory cost increasing,
+  // so the latest single op cache should be erased when cache list size exceeds threshold value.
+  void EraseSingleOpCache(const ActorInfo &actor_info, const KernelGraphPtr &graph);
 
   // Split complete kernel graph to single op graph in PyNative back
   // propagation, then compile and run single op graph.
@@ -158,7 +161,7 @@ class MindRTBackend : public Backend {
   // Cache output tensor ref count of kernels for back propagation graph in PyNative mode.
   std::map<GraphId, std::map<KernelWithIndex, size_t>> cnode_ref_counts_;
 
-  FuncGraphPtr root_graph_;
+  FuncGraph *root_graph_;
   GraphPartitionPtr graph_partition_;
   std::shared_ptr<GraphCompiler> graph_compiler_;
   std::string device_name_;
diff --git a/mindspore/ccsrc/vm/graph_partition.cc b/mindspore/ccsrc/vm/graph_partition.cc
index dc619a5da12..e61e03e2b6b 100644
--- a/mindspore/ccsrc/vm/graph_partition.cc
+++ b/mindspore/ccsrc/vm/graph_partition.cc
@@ -452,6 +452,31 @@ void AddSegmentDependency(const FuncGraphPtr &graph, const std::map<AnfNodePtr,
   }
 }
 
+void RemoveUselessDependency(std::vector<GraphSegmentPtr> *segments) {
+  MS_EXCEPTION_IF_NULL(segments);
+  for (auto &segment : *segments) {
+    MS_EXCEPTION_IF_NULL(segment);
+    if (segment->is_cut_) {
+      continue;
+    }
+    bool total_virtual_node = true;
+    for (auto &node : segment->nodes_) {
+      if (IsPrimitiveCNode(node, prim::kPrimImageSummary) || IsPrimitiveCNode(node, prim::kPrimScalarSummary) ||
+          IsPrimitiveCNode(node, prim::kPrimTensorSummary) || IsPrimitiveCNode(node, prim::kPrimHistogramSummary) ||
+          IsPrimitiveCNode(node, prim::kPrimDepend) || IsPrimitiveCNode(node, prim::kPrimLoad) ||
+          IsPrimitiveCNode(node, prim::kPrimUpdateState) || IsPrimitiveCNode(node, prim::kPrimMakeTuple) ||
+          IsPrimitiveCNode(node, prim::kPrimTupleGetItem)) {
+        continue;
+      }
+      total_virtual_node = false;
+      break;
+    }
+    if (total_virtual_node) {
+      segment->pre_segments_.clear();
+    }
+  }
+}
+
 bool IsSubGraph(const AnfNodePtr &node) {
   MS_EXCEPTION_IF_NULL(node);
   if (node->isa<CNode>()) {
@@ -691,6 +716,7 @@ std::vector<GraphSegmentPtr> GraphPartition::Partition(const FuncGraphPtr &graph
   MS_LOG(DEBUG) << "Segment size:" << segments.size();
   if (contain_multi_target) {
     AddSegmentDependency(graph, node_to_segment);
+    RemoveUselessDependency(&segments);
   }
   return segments;
 }
diff --git a/mindspore/ccsrc/vm/transform.cc b/mindspore/ccsrc/vm/transform.cc
index 4a363b65cbb..404b87d6e45 100644
--- a/mindspore/ccsrc/vm/transform.cc
+++ b/mindspore/ccsrc/vm/transform.cc
@@ -580,9 +580,6 @@ BackendPtr CreateBackend() {
       if (MsContext::GetInstance()->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) {
         backend->set_is_multi_graph_sink(false);
         context_ptr->set_param<bool>(MS_CTX_IS_MULTI_GRAPH_SINK, false);
-      } else {
-        backend->set_is_multi_graph_sink(true);
-        context_ptr->set_param<bool>(MS_CTX_IS_MULTI_GRAPH_SINK, true);
       }
     }
     return backend;
@@ -613,7 +610,7 @@ void SetMindRTEnable() {
   }
 #endif
 
-  MS_LOG(INFO) << "Enable mindRT.";
+  MS_LOG(DEBUG) << "Enable mindRT.";
   context_ptr->set_param<bool>(MS_CTX_ENABLE_MINDRT, true);
 }
 }  // namespace compile
diff --git a/mindspore/common/api.py b/mindspore/common/api.py
index 6d3e6fb9071..43973fabc21 100644
--- a/mindspore/common/api.py
+++ b/mindspore/common/api.py
@@ -17,11 +17,13 @@
 """Providing interface methods."""
 import types
 import sys
+import os
 from collections import OrderedDict
 from functools import wraps
 
 from mindspore import context
 from mindspore import log as logger
+from mindspore._extends.remote import kernel_build_server
 from .tensor import Tensor as MsTensor
 from .._c_expression import generate_key, Executor_, Tensor, MetaTensor, PynativeExecutor_
 from .._c_expression import verify_inputs_signature, init_exec_dataset, _set_dataset_mode_config, init_pipeline
@@ -173,7 +175,7 @@ class _MindSporeFunction:
             self.obj.__parse_method__ = method_name
             generate_name = self.obj.__module__ + "."
             if self.obj.__class__.__name__ != "ClipByNorm":
-                generate_name = generate_name + str(self.obj.create_time)
+                generate_name = generate_name + str(self.obj.create_time) + '.' + self.fn.__name__
         if self.identify_obj is not None:
             generate_name = generate_name + str(id(self.identify_obj))
 
@@ -376,6 +378,8 @@ class _PynativeExecutor:
 
     def __init__(self):
         self._executor = PynativeExecutor_.get_instance()
+        self._executor.set_py_exe_path(sys.executable)
+        self._executor.set_kernel_build_server_dir(os.path.split(kernel_build_server.__file__)[0] + os.sep)
 
     def new_graph(self, obj, *args, **kwargs):
         self._executor.new_graph(obj, *args, *(kwargs.values()))
@@ -445,6 +449,7 @@ class _Executor:
         self._executor = Executor_.get_instance()
         self.compile_cache = {}
         self._executor.set_py_exe_path(sys.executable)
+        self._executor.set_kernel_build_server_dir(os.path.split(kernel_build_server.__file__)[0] + os.sep)
         self.queue_name = ""
 
     def init_dataset(self, queue_name, dataset_size, batch_size, dataset_types, dataset_shapes,
diff --git a/mindspore/common/dtype.py b/mindspore/common/dtype.py
index f4f34fd5ffc..7ab147ba06a 100644
--- a/mindspore/common/dtype.py
+++ b/mindspore/common/dtype.py
@@ -38,7 +38,8 @@ __dtype__ = [
     "number", "tensor",
     "string", "type_none",
     "tensor_type",
-    "Type", "Int"
+    "Type", "Int",
+    "complex64", "complex128"
 ]
 
 __method__ = [
@@ -77,6 +78,8 @@ float32 = typing.Float(32)
 single = float32
 float64 = typing.Float(64)
 double = float64
+complex64 = typing.Complex(64)
+complex128 = typing.Complex(128)
 
 number = typing.Number()
 int_ = typing.Int()
@@ -124,14 +127,16 @@ number_type = (int8,
                uint64,
                float16,
                float32,
-               float64,)
+               float64,
+               complex64,
+               complex128,)
 
 int_type = (int8, int16, int32, int64,)
 uint_type = (uint8, uint16, uint32, uint64,)
 float_type = (float16, float32, float64,)
 
 implicit_conversion_seq = {t: idx for idx, t in enumerate((
-    bool_, int8, uint8, int16, int32, int64, float16, float32, float64))}
+    bool_, int8, uint8, int16, int32, int64, float16, float32, float64, complex64, complex128))}
 
 _simple_types = {
     list: list_,
@@ -140,6 +145,7 @@ _simple_types = {
     bool: bool_,
     int: int64,
     float: float64,
+    complex: complex128,
     str: string,
     np.bool_: bool_,
     np.str: string,
@@ -180,10 +186,10 @@ def pytype_to_dtype(obj):
 
 def get_py_obj_dtype(obj):
     """
-    Get the MindSpore data type which corresponds to python type or variable.
+    Get the MindSpore data type, which corresponds to python type or variable.
 
     Args:
-        obj (type): An object of python type, or a variable in python type.
+        obj (type): An object of python type, or a variable of python type.
 
     Returns:
         Type of MindSpore type.
@@ -228,6 +234,8 @@ def dtype_to_nptype(type_):
         float16: np.float16,
         float32: np.float32,
         float64: np.float64,
+        complex64: np.complex64,
+        complex128: np.complex128,
     }[type_]
 
 
@@ -260,6 +268,8 @@ def dtype_to_pytype(type_):
         list_: list,
         tuple_: tuple,
         string: str,
+        complex64: complex,
+        complex128: complex,
         type_none: type(None)
     }[type_]
 
diff --git a/mindspore/common/initializer.py b/mindspore/common/initializer.py
index d8b6db77f49..57d6c1ce76a 100644
--- a/mindspore/common/initializer.py
+++ b/mindspore/common/initializer.py
@@ -101,6 +101,13 @@ class Zero(Initializer):
 
     Returns:
         Array, an array after being assigned.
+
+
+    Examples:
+        >>> import mindspore
+        >>> from mindspore.common.initializer import initializer, Zero
+        >>> tensor1 = initializer(Zero(), [1, 2, 3], mindspore.float32)
+        >>> tensor2 = initializer('zeros', [1, 2, 3], mindspore.float32)
     """
     def _initialize(self, arr):
         _assignment(arr, 0)
@@ -116,6 +123,13 @@ class One(Initializer):
 
     Returns:
         Array, assigned array.
+
+
+    Examples:
+        >>> import mindspore
+        >>> from mindspore.common.initializer import initializer, One
+        >>> tensor1 = initializer(One(), [1, 2, 3], mindspore.float32)
+        >>> tensor2 = initializer('ones', [1, 2, 3], mindspore.float32)
     """
     def _initialize(self, arr):
         _assignment(arr, 1)
@@ -236,11 +250,21 @@ class XavierUniform(Initializer):
     - where :math:`n_{in}` is the number of input units in the weight tensor.
     - where :math:`n_{out}` is the number of output units in the weight tensor.
 
+    For details of XavierUniform algorithm, please check
+    `<http://proceedings.mlr.press/v9/glorot10a.html>`_.
+
     Args:
         gain (float): An optional scaling factor. Default: 1.
 
     Returns:
         Array, assigned array.
+
+
+    Examples:
+        >>> import mindspore
+        >>> from mindspore.common.initializer import initializer, XavierUniform
+        >>> tensor1 = initializer(XavierUniform(), [1, 2, 3], mindspore.float32)
+        >>> tensor2 = initializer('xavier_uniform', [1, 2, 3], mindspore.float32)
     """
     def __init__(self, gain=1):
         super(XavierUniform, self).__init__(gain=gain)
@@ -265,7 +289,7 @@ class HeUniform(Initializer):
         boundary = \sqrt{\frac{6}{(1 + a^2) \times \text{fan_in}}}
 
     Args:
-        negative_slope (int, float, bool): The negativa slope of the rectifier used after this layer
+        negative_slope (int, float, bool): The negative slope of the rectifier used after this layer
             (only used when `nonlinearity` is 'leaky_relu'). Default: 0.
         mode (str): Either 'fan_in' or 'fan_out'. Choosing 'fan_in' preserves the magnitude of the
             variance of the weights in the forward pass. Choosing 'fan_out' preserves the magnitudes
@@ -275,6 +299,13 @@ class HeUniform(Initializer):
 
     Returns:
         Array, assigned array.
+
+
+    Examples:
+        >>> import mindspore
+        >>> from mindspore.common.initializer import initializer, HeUniform
+        >>> tensor1 = initializer(HeUniform(), [1, 2, 3], mindspore.float32)
+        >>> tensor2 = initializer('he_uniform', [1, 2, 3], mindspore.float32)
     """
     def __init__(self, negative_slope=0, mode='fan_in', nonlinearity='leaky_relu'):
         super(HeUniform, self).__init__(negative_slope=negative_slope, mode=mode, nonlinearity=nonlinearity)
@@ -299,7 +330,7 @@ class HeNormal(Initializer):
     N(0, sigma).
 
     Args:
-        negative_slope (int, float, bool): The negativa slope of the rectifier used after this layer
+        negative_slope (int, float, bool): The negative slope of the rectifier used after this layer
             (only used when `nonlinearity` is 'leaky_relu'). Default: 0.
         mode (str): Either 'fan_in' or 'fan_out'. Choosing 'fan_in' preserves the magnitude of the
             variance of the weights in the forward pass. Choosing 'fan_out' preserves the magnitudes
@@ -309,6 +340,13 @@ class HeNormal(Initializer):
 
     Returns:
         Array, assigned array.
+
+
+    Examples:
+        >>> import mindspore
+        >>> from mindspore.common.initializer import initializer, HeNormal
+        >>> tensor1 = initializer(HeNormal(), [1, 2, 3], mindspore.float32)
+        >>> tensor2 = initializer('he_normal', [1, 2, 3], mindspore.float32)
     """
     def __init__(self, negative_slope=0, mode='fan_in', nonlinearity='leaky_relu'):
         super(HeNormal, self).__init__(negative_slope=negative_slope, mode=mode, nonlinearity=nonlinearity)
@@ -334,6 +372,13 @@ class Constant(Initializer):
 
     Returns:
         Array, an array after being assigned.
+
+
+    Examples:
+        >>> import mindspore
+        >>> from mindspore.common.initializer import initializer
+        >>> tensor1 = initializer(0, [1, 2, 3], mindspore.float32)
+        >>> tensor2 = initializer(5, [1, 2, 3], mindspore.float32)
     """
     def __init__(self, value):
         super(Constant, self).__init__(value=value)
@@ -354,6 +399,13 @@ class Uniform(Initializer):
 
     Returns:
         Array, uniform array.
+
+
+    Examples:
+        >>> import mindspore
+        >>> from mindspore.common.initializer import initializer, Uniform
+        >>> tensor1 = initializer(Uniform(), [1, 2, 3], mindspore.float32)
+        >>> tensor2 = initializer('uniform', [1, 2, 3], mindspore.float32)
     """
     def __init__(self, scale=0.07):
         super(Uniform, self).__init__(scale=scale)
@@ -376,6 +428,13 @@ class Normal(Initializer):
 
     Returns:
         Array, normal array.
+
+
+    Examples:
+        >>> import mindspore
+        >>> from mindspore.common.initializer import initializer, Normal
+        >>> tensor1 = initializer(Normal(), [1, 2, 3], mindspore.float32)
+        >>> tensor2 = initializer('normal', [1, 2, 3], mindspore.float32)
     """
     def __init__(self, sigma=0.01, mean=0.0):
         super(Normal, self).__init__(sigma=sigma, mean=mean)
@@ -400,6 +459,13 @@ class TruncatedNormal(Initializer):
 
     Returns:
         Array, truncated normal array.
+
+
+    Examples:
+        >>> import mindspore
+        >>> from mindspore.common.initializer import initializer, TruncatedNormal
+        >>> tensor1 = initializer(TruncatedNormal(), [1, 2, 3], mindspore.float32)
+        >>> tensor2 = initializer('truncatedNormal', [1, 2, 3], mindspore.float32)
     """
     def __init__(self, sigma=0.01):
         super(TruncatedNormal, self).__init__(sigma=sigma)
@@ -435,9 +501,9 @@ def initializer(init, shape=None, dtype=mstype.float32):
     Examples:
         >>> import mindspore
         >>> from mindspore.common.initializer import initializer, One
-        >>> tensor = initializer('ones', [1, 2, 3], mindspore.float32)
-        >>> tensor = initializer(One(), [1, 2, 3], mindspore.float32)
-        >>> tensor = initializer(0, [1, 2, 3], mindspore.float32)
+        >>> tensor1 = initializer('ones', [1, 2, 3], mindspore.float32)
+        >>> tensor2 = initializer(One(), [1, 2, 3], mindspore.float32)
+        >>> tensor3 = initializer(0, [1, 2, 3], mindspore.float32)
     """
     if not isinstance(init, (Tensor, numbers.Number, str, Initializer)):
         raise TypeError("Unsupported init type '{}'.".format(type(init)))
diff --git a/mindspore/common/parameter.py b/mindspore/common/parameter.py
index 018ebaf5190..26c1cf53d66 100644
--- a/mindspore/common/parameter.py
+++ b/mindspore/common/parameter.py
@@ -105,7 +105,7 @@ class Parameter(Tensor_):
         >>> x = Tensor(np.ones((2, 1)), mindspore.float32)
         >>> print(net(x))
         [[2.]]
-        >>> _ = net.weight.set_data(Tensor(np.zeros((1, 2)), mindspore.float32))
+        >>> net.weight.set_data(Tensor(np.zeros((1, 2)), mindspore.float32))
         >>> print(net(x))
         [[0.]]
     """
diff --git a/mindspore/common/tensor.py b/mindspore/common/tensor.py
index 12b11905f87..1b059c3ffaf 100644
--- a/mindspore/common/tensor.py
+++ b/mindspore/common/tensor.py
@@ -26,7 +26,7 @@ from .._checkparam import Validator as validator
 __all__ = ['Tensor', 'RowTensor', 'SparseTensor']
 np_types = (np.int8, np.int16, np.int32, np.int64,
             np.uint8, np.uint16, np.uint32, np.uint64, np.float16,
-            np.float32, np.float64, np.bool_)
+            np.float32, np.float64, np.bool_, np.complex64, np.complex128)
 
 
 class Tensor(Tensor_):
@@ -91,7 +91,7 @@ class Tensor(Tensor_):
             validator.check_value_type('input_data', input_data, (Tensor_, np.ndarray, list, tuple, float, int, bool),
                                        'Tensor')
             valid_dtypes = (np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64,
-                            np.float16, np.float32, np.float64, np.bool_, np.str_)
+                            np.float16, np.float32, np.float64, np.bool_, np.str_, np.complex64, np.complex128)
             if isinstance(input_data, np.ndarray) and input_data.dtype not in valid_dtypes and \
                 input_data.dtype.kind != 'U':  # Support dtype np.str_
                 raise TypeError(f"For Tensor, the input_data is a numpy array, "
@@ -1232,7 +1232,7 @@ class Tensor(Tensor_):
             raise ValueError(msg)
 
         class seed_context:
-            '''set and restore seed'''
+            """Set and restore seed."""
 
             def __init__(self, init):
                 self.init = init
diff --git a/mindspore/communication/_hccl_management.py b/mindspore/communication/_hccl_management.py
index 5d1fc577cd3..67bc136ef37 100644
--- a/mindspore/communication/_hccl_management.py
+++ b/mindspore/communication/_hccl_management.py
@@ -16,6 +16,8 @@
 """HCCL management API"""
 import ctypes
 import os
+from mindspore import context
+from .._c_expression import get_hccl_rank_id, get_hccl_rank_size
 
 MAX_GROUP_NAME_LEN = 127
 MAX_RANK_NUM = 4096
@@ -149,6 +151,10 @@ def get_rank_size(group="hccl_world_group"):
     Returns:
         An integer scalar with the num of ranks.
     """
+
+    if context.get_context("mode") == context.PYNATIVE_MODE:
+        return get_hccl_rank_size()
+
     check_group(group)
     c_group = c_str(group)
     c_rank_size = ctypes.c_uint()
@@ -166,6 +172,10 @@ def get_rank_id(group="hccl_world_group"):
     Returns:
         An integer scalar with the rank id of the calling process.
     """
+
+    if context.get_context("mode") == context.PYNATIVE_MODE:
+        return get_hccl_rank_id()
+
     check_group(group)
     c_group = c_str(group)
     c_rank_id = ctypes.c_uint()
@@ -176,6 +186,7 @@ def get_rank_id(group="hccl_world_group"):
     return c_rank_id.value
 
 
+
 def get_local_rank_size(group="hccl_world_group"):
     """
     A function that returns the number of local ranks within the given collection communication group.
diff --git a/mindspore/communication/management.py b/mindspore/communication/management.py
index 9e79ac82f16..0ad0f870cec 100755
--- a/mindspore/communication/management.py
+++ b/mindspore/communication/management.py
@@ -232,8 +232,7 @@ def get_world_rank_from_group_rank(group, group_rank_id):
         This method should be used after init().
 
     Args:
-        group (str): The communication group to work on. The group is created by create_group
-                     or the default world communication group.
+        group (str): The communication group to work on. The group is created by create_group.
         group_rank_id (int): A rank ID in the communication group.
 
     Returns:
@@ -269,8 +268,7 @@ def get_group_rank_from_world_rank(world_rank_id, group):
 
     Args:
         world_rank_id (int): A rank ID in the world communication group.
-        group (str): The communication group to work on. The group is created by create_group
-                     or the default world communication group.
+        group (str): The communication group to work on. The group is created by create_group.
 
     Returns:
         int, the rank ID in the user communication group.
diff --git a/mindspore/compression/common/__init__.py b/mindspore/compression/common/__init__.py
index 5ed90b1eef5..c382f47e87b 100644
--- a/mindspore/compression/common/__init__.py
+++ b/mindspore/compression/common/__init__.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 # ============================================================================
 """
-Compression common module.
+Common module for various compression algorithms, now only including datatype definition for quantization.
 """
 
 from .constant import QuantDtype
diff --git a/mindspore/compression/quant/__init__.py b/mindspore/compression/quant/__init__.py
index 713970270bf..e2b8cf0f83d 100644
--- a/mindspore/compression/quant/__init__.py
+++ b/mindspore/compression/quant/__init__.py
@@ -13,7 +13,8 @@
 # limitations under the License.
 # ============================================================================
 """
-Compression quant module.
+Quantization module, including base class of the quantizer, the quantization aware training algorithm,
+and quantization utils.
 """
 
 from .quantizer import OptimizeOption
diff --git a/mindspore/compression/quant/qat.py b/mindspore/compression/quant/qat.py
index 95bd821afd4..3c8ccbcae56 100644
--- a/mindspore/compression/quant/qat.py
+++ b/mindspore/compression/quant/qat.py
@@ -44,25 +44,25 @@ def create_quant_config(quant_observer=(nn.FakeQuantWithMinMaxObserver, nn.FakeQ
                         narrow_range=(False, False),
                         mode="DEFAULT"):
     r"""
-    Config the observer type of weights and data flow with quant params.
+    Config the observer type of weights and data flow with quant parameters.
 
     Args:
         quant_observer (Union[Observer, list, tuple]): The types of observer for quantization. The first element
-            applies to weights and second applies to data flow. Currently, only
+            applies to weights and the second applies to data flow. Currently, only
             :class:`FakeQuantWithMinMaxObserver` supported.
             Default: (nn.FakeQuantWithMinMaxObserver, nn.FakeQuantWithMinMaxObserver).
         quant_delay (Union[int, list, tuple]): Number of steps after which weights and activations are quantized
-            during train and eval. The first element represents weights and second element represents data flow.
+            during train and eval. The first element represents weights and the second element represents data flow.
             Default: (0, 0).
-        quant_dtype (Union[QuantDtype, list, tuple]): Datatype to use for quantize weights and activations. The first
-            element represents weights and second element represents data flow.
+        quant_dtype (Union[QuantDtype, list, tuple]): Datatype used to quantize weights and activations. The first
+            element represents weights and the second element represents data flow.
             Default: (QuantDtype.INT8, QuantDtype.INT8).
         per_channel (Union[bool, list, tuple]):  Quantization granularity based on layer or on channel. If `True`
-            then base on per channel otherwise base on per layer. The first element represents weights
-            and second element represents data flow, and second element must be `False` now.
+            then base on per channel, otherwise base on per layer. The first element represents weights
+            and the second element represents data flow, and the second element must be `False` now.
             Default: (False, False).
         symmetric (Union[bool, list, tuple]): Whether the quantization algorithm is symmetric or not. If `True` then
-            base on symmetric otherwise base on asymmetric. The first element represents weights and second
+            base on symmetric, otherwise base on asymmetric. The first element represents weights and the second
             element represents data flow. Default: (False, False).
         narrow_range (Union[bool, list, tuple]): Whether the quantization algorithm uses narrow range or not.
             The first element represents weights and the second element represents data flow.
@@ -147,17 +147,17 @@ class QuantizationAwareTraining(Quantizer):
         freeze_bn (int): Number of steps after which BatchNorm OP parameters fixed to global mean and variance.
             Default: 1e7.
         quant_delay (Union[int, list, tuple]): Number of steps after which weights and activations are quantized
-            during train and eval. The first element represents weights and second element represents data flow.
+            during train and eval. The first element represents weights and the second element represents data flow.
             Default: (0, 0).
-        quant_dtype (Union[QuantDtype, list, tuple]): Datatype to use for quantize weights and activations. The first
-            element represents weights and second element represents data flow. It is necessary to consider the
+        quant_dtype (Union[QuantDtype, list, tuple]): Datatype used to quantize weights and activations. The first
+            element represents weights and the second element represents data flow. It is necessary to consider the
             precision support of hardware devices in the practical quantization infer scenario.
             Default: (QuantDtype.INT8, QuantDtype.INT8).
         per_channel (Union[bool, list, tuple]):  Quantization granularity based on layer or on channel. If `True`
-            then base on per channel otherwise base on per layer. The first element represents weights and second
-            element represents data flow, and second element must be `False` now. Default: (False, False).
+            then base on per channel, otherwise base on per layer. The first element represents weights and the
+            second element represents data flow, and the second element must be `False` now. Default: (False, False).
         symmetric (Union[bool, list, tuple]): Whether the quantization algorithm is symmetric or not. If `True` then
-            base on symmetric otherwise base on asymmetric. The first element represents weights and second
+            base on symmetric, otherwise base on asymmetric. The first element represents weights and the second
             element represents data flow. Default: (False, False).
         narrow_range (Union[bool, list, tuple]): Whether the quantization algorithm uses narrow range or not.
             The first element represents weights and the second element represents data flow.
@@ -165,8 +165,8 @@ class QuantizationAwareTraining(Quantizer):
         optimize_option (Union[OptimizeOption, list, tuple]): Specifies the quant algorithm and options, currently
             only support `QAT` and `LEARNED_SCALE` (Note that, if both `QAT` and `LEARNED_SCALE` are configured,
             `LEARNED_SCALE` has a higher priority. `LEARNED_SCALE` currently only work under some constraints, which
-            includes: freeze_bn=0, quant_delay=0, symmetric=Ture, narrow_range=True, More specifically, for operators
-            such as ReLu and ReLu6, which only have positive values, we add a negative truncation to optimize this
+            includes: freeze_bn=0, quant_delay=0, symmetric=True, narrow_range=True, More specifically, for operators
+            such as Relu and Relu6, which only have positive values, we add a negative truncation to optimize this
             scenario, and narrow_range will automatically match to False). Default: OptimizeOption.QAT.
         one_conv_fold (bool): Whether to use one conv bn fold ops for simulation inference operation. Default: True.
 
diff --git a/mindspore/compression/quant/quant_utils.py b/mindspore/compression/quant/quant_utils.py
index c20b7488793..0813a4ad07d 100644
--- a/mindspore/compression/quant/quant_utils.py
+++ b/mindspore/compression/quant/quant_utils.py
@@ -280,8 +280,8 @@ def compute_kl_threshold(data, bitwidth):
 def query_quant_layers(network):
     r"""
     Query the network's quantization strategy of each quantized layer and print it to the screen, note that all the
-    quantization layers are queried before graph compile optimization in the graph mode, thus may be appear some
-    redundant quantized layers, which are not exist in practical execution.
+    quantization layers are queried before graph compile optimization in the graph mode, thus, some redundant quantized
+    layers, which not exist in practical execution, may appear.
 
     Args:
         network (Cell): input network
diff --git a/mindspore/context.py b/mindspore/context.py
index c6262fd4e79..85700e45405 100644
--- a/mindspore/context.py
+++ b/mindspore/context.py
@@ -22,6 +22,7 @@ import time
 import threading
 from collections import namedtuple
 from types import FunctionType
+
 from mindspore import log as logger
 from mindspore._c_expression import MSContext, ms_ctx_param
 from mindspore._checkparam import args_type_check, Validator
@@ -369,7 +370,7 @@ def set_auto_parallel_context(**kwargs):
     gradients_mean               auto_parallel_search_mode
     parallel_mode                strategy_ckpt_load_file
     all_reduce_fusion_config     strategy_ckpt_save_file
-    enable_parallel_optimizer    full_batch
+    enable_parallel_optimizer    dataset_strategy
                \                 pipeline_stages
                \                 grad_accumulation_step
     ===========================  ===========================
@@ -379,9 +380,8 @@ def set_auto_parallel_context(**kwargs):
         global_rank (int): Global rank id, the value must be in [0, 4095]. Default: 0.
         gradients_mean (bool): Whether to perform mean operator after allreduce of gradients.
                      "stand_alone" do not support gradients_mean. Default: False.
-        gradient_fp32_sync (bool): Run allreduce of gradients in fp32.
-                     "stand_alone", "data_parallel" and "hybrid_parallel" do not support
-                     gradient_fp32_sync. Default: True.
+        gradient_fp32_sync (bool): Run allreduce of gradients in fp32. "stand_alone", "data_parallel"
+                     and "hybrid_parallel" do not support gradient_fp32_sync. Default: True.
         parallel_mode (str): There are five kinds of parallel modes, "stand_alone", "data_parallel",
                      "hybrid_parallel", "semi_auto_parallel" and "auto_parallel". Default: "stand_alone".
 
@@ -391,8 +391,7 @@ def set_auto_parallel_context(**kwargs):
 
                      - hybrid_parallel: Achieves data parallelism and model parallelism manually.
 
-                     - semi_auto_parallel: Achieves data parallelism and model parallelism by
-                       setting parallel strategies.
+                     - semi_auto_parallel: Achieves data and model parallelism by setting parallel strategies.
 
                      - auto_parallel: Achieving parallelism automatically.
         auto_parallel_search_mode (str): There are two kinds of shard strategy search modes, "recursive_programming"
@@ -410,17 +409,21 @@ def set_auto_parallel_context(**kwargs):
         strategy_ckpt_load_file (str): The path to load parallel strategy checkpoint. Default: ''
         strategy_ckpt_save_file (str): The path to save parallel strategy checkpoint. Default: ''
         full_batch (bool): If you load whole batch datasets in auto_parallel mode, this parameter
-                       should be set as True. Default: False.
+                       should be set as True. Default: False. The interface is not be recommended currently,
+                       it is better using 'dataset_strategy' to replace it.
+        dataset_strategy Union[str, tuple]: Dataset sharding strategy. Default: "data_parallel".
+                       dataset_strategy="data_parallel" is equal to full_batch=False, dataset_strategy="full_batch" is
+                       equal to full_batch=True. For dataset load into net by model parallel strategy likes
+                       ds_stra ((1, 8), (1, 8)), it requires using set_auto_parallel_context(dataset_strategy=ds_stra).
         enable_parallel_optimizer (bool): This is a developing feature, which shards the weight update computation for
                        data parallel training in the benefit of time and memory saving. Currently, auto and semi auto
                        parallel mode support all optimizers in both Ascend and GPU. Data parallel mode only supports
                        `Lamb` and `AdamWeightDecay` in Ascend . Default: False.
         all_reduce_fusion_config (list): Set allreduce fusion strategy by parameters indices. Only support ReduceOp.SUM
                        and HCCL_WORLD_GROUP/NCCL_WORLD_GROUP. No Default, if it is not set, the fusion is closed.
-        pipeline_stages (int): Set the stage information for pipeline parallel. This indicates how
-                        the devices are distributed alone the pipeline. The total devices will be divided into
-                        'pipeline_stags' stages. Currently this could only be used when
-                        parallel mode semi_auto_parallel is enabled. Default: 1.
+        pipeline_stages (int): Set the stage information for pipeline parallel. This indicates how the devices are
+                         distributed alone the pipeline. The total devices will be divided into 'pipeline_stags' stages.
+                        Currently this could only be used when parallel mode semi_auto_parallel is enabled. Default: 1.
         grad_accumulation_step (int): Set the accumulation steps of gradients in auto and semi auto parallel mode.
                         This should be a positive int. Default: 1.
 
@@ -437,14 +440,13 @@ def set_auto_parallel_context(**kwargs):
         >>> context.set_auto_parallel_context(parameter_broadcast=False)
         >>> context.set_auto_parallel_context(strategy_ckpt_load_file="./strategy_stage1.ckpt")
         >>> context.set_auto_parallel_context(strategy_ckpt_save_file="./strategy_stage1.ckpt")
-        >>> context.set_auto_parallel_context(full_batch=True)
+        >>> context.set_auto_parallel_context(dataset_strategy=((1, 8), (1, 8)))
         >>> context.set_auto_parallel_context(enable_parallel_optimizer=False)
         >>> context.set_auto_parallel_context(all_reduce_fusion_config=[8, 160])
         >>> context.set_auto_parallel_context(pipeline_stages=2)
     """
     _set_auto_parallel_context(**kwargs)
 
-
 def get_auto_parallel_context(attr_key):
     """
     Get auto parallel context attribute value according to the key.
@@ -644,7 +646,7 @@ def set_context(**kwargs):
             suffix to the file. Default: ''.
         enable_sparse (bool): Whether to enable sparsity feature. Default: False.
             For details of sparsity and sparse tensor, please check
-            `<https://www.mindspore.cn/doc/programming_guide/zh-CN/master/tensor.html>`_.
+            `<https://www.mindspore.cn/docs/programming_guide/zh-CN/master/tensor.html>`_.
         max_call_depth (int): Specify the maximum depth of function call. Must be positive integer. Default: 1000.
         env_config_path (str): Config path for DFX.
         auto_tune_mode (str): The mode of auto tune when op building, get the best tiling performance,
diff --git a/mindspore/core/abstract/abstract_function.h b/mindspore/core/abstract/abstract_function.h
index 0d59421155c..1e6bce66b93 100644
--- a/mindspore/core/abstract/abstract_function.h
+++ b/mindspore/core/abstract/abstract_function.h
@@ -28,7 +28,7 @@
 
 namespace mindspore {
 namespace abstract {
-class AbstractFuncAtom : public AbstractFunction {
+class MS_CORE_API AbstractFuncAtom : public AbstractFunction {
  public:
   AbstractFuncAtom() = default;
   ~AbstractFuncAtom() override = default;
@@ -42,7 +42,7 @@ class AbstractFuncAtom : public AbstractFunction {
   std::size_t hash() const override { return tid(); }
 };
 
-class AbstractFuncUnion : public AbstractFunction {
+class MS_CORE_API AbstractFuncUnion : public AbstractFunction {
  public:
   explicit AbstractFuncUnion(const AbstractFuncAtomPtrList &func_list);
   AbstractFuncUnion(const AbstractFunctionPtr &first, const AbstractFunctionPtr &second);
@@ -63,7 +63,7 @@ class AbstractFuncUnion : public AbstractFunction {
   AbstractFuncAtomPtrList func_list_;
 };
 
-class PrimitiveAbstractClosure : public AbstractFuncAtom {
+class MS_CORE_API PrimitiveAbstractClosure : public AbstractFuncAtom {
  public:
   // Represents a Primitive.
   // prim: The primitive
@@ -96,7 +96,7 @@ class PrimitiveAbstractClosure : public AbstractFuncAtom {
 };
 using PrimitiveAbstractClosurePtr = std::shared_ptr<PrimitiveAbstractClosure>;
 
-class FuncGraphAbstractClosure : public AbstractFuncAtom {
+class MS_CORE_API FuncGraphAbstractClosure : public AbstractFuncAtom {
  public:
   // Represents a Graph in a certain Context.
   // context: The context, or Context.empty()
@@ -140,7 +140,7 @@ class FuncGraphAbstractClosure : public AbstractFuncAtom {
 };
 using FuncGraphAbstractClosurePtr = std::shared_ptr<FuncGraphAbstractClosure>;
 
-class MetaFuncGraphAbstractClosure : public AbstractFuncAtom {
+class MS_CORE_API MetaFuncGraphAbstractClosure : public AbstractFuncAtom {
  public:
   explicit MetaFuncGraphAbstractClosure(const MetaFuncGraphPtr &meta_func_graph,
                                         const AnfNodePtr &tracking_id = nullptr, const ScopePtr &scope = kDefaultScope)
@@ -173,7 +173,7 @@ class MetaFuncGraphAbstractClosure : public AbstractFuncAtom {
 };
 using MetaFuncGraphAbstractClosurePtr = std::shared_ptr<MetaFuncGraphAbstractClosure>;
 
-class PartialAbstractClosure : public AbstractFuncAtom {
+class MS_CORE_API PartialAbstractClosure : public AbstractFuncAtom {
  public:
   // Represents a partial application.
   // args_spec_list: The first few arguments of that function
@@ -204,7 +204,7 @@ class PartialAbstractClosure : public AbstractFuncAtom {
 };
 using PartialAbstractClosurePtr = std::shared_ptr<PartialAbstractClosure>;
 
-class JTransformedAbstractClosure : public AbstractFuncAtom {
+class MS_CORE_API JTransformedAbstractClosure : public AbstractFuncAtom {
  public:
   // Represents a Function transformed through the application of J.
   explicit JTransformedAbstractClosure(const AbstractFuncAtomPtr &fn) : fn_(fn) {}
@@ -222,7 +222,7 @@ class JTransformedAbstractClosure : public AbstractFuncAtom {
   AbstractFuncAtomPtr fn_;
 };
 
-class VirtualAbstractClosure : public AbstractFuncAtom {
+class MS_CORE_API VirtualAbstractClosure : public AbstractFuncAtom {
  public:
   // Represents some function with an explicitly fixed type signature.
   // args_spec_list: The arguments as abstract value given to the function
@@ -251,7 +251,7 @@ class VirtualAbstractClosure : public AbstractFuncAtom {
 };
 using VirtualAbstractClosurePtr = std::shared_ptr<VirtualAbstractClosure>;
 
-class TypedPrimitiveAbstractClosure : public AbstractFuncAtom {
+class MS_CORE_API TypedPrimitiveAbstractClosure : public AbstractFuncAtom {
  public:
   // Represents a Primitive with an explicitly fixed type signature.
   // args_spec_list: The arguments as abstract value given to the Primitive
@@ -280,7 +280,7 @@ class TypedPrimitiveAbstractClosure : public AbstractFuncAtom {
 };
 
 // Represents a function that can't be called.
-class DummyAbstractClosure : public AbstractFuncAtom {
+class MS_CORE_API DummyAbstractClosure : public AbstractFuncAtom {
  public:
   DummyAbstractClosure() = default;
   ~DummyAbstractClosure() override = default;
@@ -292,14 +292,14 @@ class DummyAbstractClosure : public AbstractFuncAtom {
   std::string ToString() const override { return "DummyAbstractClosure()"; }
 };
 
-struct AbstractFunctionHasher {
+struct MS_CORE_API AbstractFunctionHasher {
   std::size_t operator()(const AbstractFunctionPtr &t) const {
     std::size_t hash = t->hash();
     return hash;
   }
 };
 
-struct AbstractFunctionEqual {
+struct MS_CORE_API AbstractFunctionEqual {
   bool operator()(const AbstractFunctionPtr &lhs, const AbstractFunctionPtr &rhs) const { return *lhs == *rhs; }
 };
 }  // namespace abstract
diff --git a/mindspore/core/abstract/abstract_value.h b/mindspore/core/abstract/abstract_value.h
index d3b4355bd94..c34526cf209 100644
--- a/mindspore/core/abstract/abstract_value.h
+++ b/mindspore/core/abstract/abstract_value.h
@@ -43,7 +43,7 @@ using AbstractBasePtrList = std::vector<AbstractBasePtr>;
 
 // The base class for abstract value. The abstract value is used in evaluating
 // to express the type, shape, and value of the real value.
-class AbstractBase : public Base {
+class MS_CORE_API AbstractBase : public Base {
  public:
   using TraceNodeProvider = std::function<void(AnfNodePtr *node)>;
 
@@ -101,7 +101,7 @@ class AbstractBase : public Base {
   std::string value_desc_;  // store initial value description for error report
 };
 
-class AbstractScalar : public AbstractBase {
+class MS_CORE_API AbstractScalar : public AbstractBase {
  public:
   AbstractScalar() : AbstractBase(kAnyValue, kAnyType) {}
   explicit AbstractScalar(const ValuePtr &value, const TypePtr &type) : AbstractBase(value, type) {}
@@ -127,7 +127,7 @@ class AbstractScalar : public AbstractBase {
 };
 using AbstractScalarPtr = std::shared_ptr<AbstractScalar>;
 
-class AbstractType : public AbstractBase {
+class MS_CORE_API AbstractType : public AbstractBase {
  public:
   explicit AbstractType(const TypePtr &type) : AbstractBase(type, kTypeType) {
     if (type == nullptr) {
@@ -146,7 +146,7 @@ class AbstractType : public AbstractBase {
 };
 using AbstractTypePtr = std::shared_ptr<AbstractType>;
 
-class AbstractError : public AbstractBase {
+class MS_CORE_API AbstractError : public AbstractBase {
  public:
   explicit AbstractError(const StringImmPtr &err, const AnfNodePtr &node) : AbstractBase(err), node_(node) {
     if (err == nullptr || node == nullptr) {
@@ -181,7 +181,7 @@ class AbstractFuncAtom;
 using AbstractFuncAtomPtr = std::shared_ptr<AbstractFuncAtom>;
 using AbstractFuncAtomPtrList = std::vector<AbstractFuncAtomPtr>;
 
-class AbstractFunction : public AbstractBase {
+class MS_CORE_API AbstractFunction : public AbstractBase {
  public:
   AbstractFunction() = default;
   ~AbstractFunction() override = default;
@@ -215,7 +215,7 @@ class AbstractFunction : public AbstractBase {
 using AbstractFunctionPtrList = std::vector<AbstractFunctionPtr>;
 
 // Represents a key-value pair used in function's parameters.
-class AbstractKeywordArg : public AbstractBase {
+class MS_CORE_API AbstractKeywordArg : public AbstractBase {
  public:
   AbstractKeywordArg(const std::string &key, const AbstractBasePtr &argument) : arg_name_(key), arg_value_(argument) {}
   ~AbstractKeywordArg() override = default;
@@ -242,7 +242,7 @@ class AbstractKeywordArg : public AbstractBase {
 };
 using AbstractKeywordArgPtr = std::shared_ptr<AbstractKeywordArg>;
 
-class AbstractUndetermined : public AbstractBase {
+class MS_CORE_API AbstractUndetermined : public AbstractBase {
  public:
   // shape and type are all unknown
   AbstractUndetermined() : AbstractBase(kAnyValue) {}
@@ -291,7 +291,7 @@ class AbstractUndetermined : public AbstractBase {
   AbstractBasePtr element_;
 };
 
-class AbstractTensor : public AbstractUndetermined {
+class MS_CORE_API AbstractTensor : public AbstractUndetermined {
  public:
   // only element_ and value, shape track are valid member, type track are unknown.
   explicit AbstractTensor(const AbstractBasePtr &element, const BaseShapePtr &shape = std::make_shared<Shape>())
@@ -340,7 +340,7 @@ class AbstractTensor : public AbstractUndetermined {
 using AbstractTensorPtr = std::shared_ptr<AbstractTensor>;
 using AbstractTensorPtrList = std::vector<AbstractTensorPtr>;
 
-class AbstractSequeue : public AbstractBase {
+class MS_CORE_API AbstractSequeue : public AbstractBase {
  public:
   explicit AbstractSequeue(const AbstractBasePtrList &elements) : elements_(elements) {}
   ~AbstractSequeue() override = default;
@@ -371,7 +371,7 @@ class AbstractSequeue : public AbstractBase {
 };
 using AbstractSequeuePtr = std::shared_ptr<AbstractSequeue>;
 
-class AbstractTuple : public AbstractSequeue {
+class MS_CORE_API AbstractTuple : public AbstractSequeue {
  public:
   explicit AbstractTuple(const AbstractBasePtrList &elements) : AbstractSequeue(elements) {}
 
@@ -400,7 +400,7 @@ class AbstractTuple : public AbstractSequeue {
 };
 using AbstractTuplePtr = std::shared_ptr<AbstractTuple>;
 
-class AbstractList : public AbstractSequeue {
+class MS_CORE_API AbstractList : public AbstractSequeue {
  public:
   explicit AbstractList(const AbstractBasePtrList &elements) : AbstractSequeue(elements) {}
 
@@ -430,7 +430,7 @@ class AbstractList : public AbstractSequeue {
 };
 using AbstractListPtr = std::shared_ptr<AbstractList>;
 
-class AbstractClass : public AbstractBase {
+class MS_CORE_API AbstractClass : public AbstractBase {
  public:
   AbstractClass(const Named &tag, const std::vector<AbstractAttribute> &attributes,
                 const std::unordered_map<std::string, ValuePtr> &methods)
@@ -462,7 +462,7 @@ class AbstractClass : public AbstractBase {
 };
 using AbstractClassPtr = std::shared_ptr<AbstractClass>;
 
-class AbstractDictionary : public AbstractBase {
+class MS_CORE_API AbstractDictionary : public AbstractBase {
  public:
   explicit AbstractDictionary(const std::vector<AbstractAttribute> &key_values) : key_values_(key_values) {}
   ~AbstractDictionary() override = default;
@@ -485,7 +485,7 @@ class AbstractDictionary : public AbstractBase {
 };
 using AbstractDictionaryPtr = std::shared_ptr<AbstractDictionary>;
 
-class AbstractSlice : public AbstractBase {
+class MS_CORE_API AbstractSlice : public AbstractBase {
  public:
   AbstractSlice(const AbstractBasePtr &start, const AbstractBasePtr &stop, const AbstractBasePtr &step)
       : start_(start), stop_(stop), step_(step) {}
@@ -513,7 +513,7 @@ class AbstractSlice : public AbstractBase {
 };
 using AbstractSlicePtr = std::shared_ptr<AbstractSlice>;
 
-class AbstractJTagged : public AbstractBase {
+class MS_CORE_API AbstractJTagged : public AbstractBase {
  public:
   explicit AbstractJTagged(const AbstractBasePtr &element) : element_(element) {}
 
@@ -536,7 +536,7 @@ class AbstractJTagged : public AbstractBase {
 };
 using AbstractJTaggedPtr = std::shared_ptr<AbstractJTagged>;
 
-class AbstractNone : public AbstractBase {
+class MS_CORE_API AbstractNone : public AbstractBase {
  public:
   AbstractNone() : AbstractBase() { set_type(std::make_shared<TypeNone>()); }
   ~AbstractNone() override = default;
@@ -554,7 +554,7 @@ class AbstractNone : public AbstractBase {
 using AbstractNonePtr = std::shared_ptr<AbstractNone>;
 
 // the un assigned state value for variable, which means the variable is not assigned
-class AbstractNull : public AbstractBase {
+class MS_CORE_API AbstractNull : public AbstractBase {
  public:
   AbstractNull() : AbstractBase(kNull) { set_type(std::make_shared<TypeNull>()); }
   ~AbstractNull() override = default;
@@ -569,7 +569,7 @@ class AbstractNull : public AbstractBase {
 using AbstractNullPtr = std::shared_ptr<AbstractNull>;
 
 // the timeout state value for variable, which means the variable is not assigned because it is  timeout
-class AbstractTimeOut : public AbstractBase {
+class MS_CORE_API AbstractTimeOut : public AbstractBase {
  public:
   AbstractTimeOut() : AbstractBase(kNull) { set_type(std::make_shared<TypeNull>()); }
   ~AbstractTimeOut() override = default;
@@ -583,7 +583,7 @@ class AbstractTimeOut : public AbstractBase {
 };
 using AbstractTimeOutPtr = std::shared_ptr<AbstractTimeOut>;
 
-class AbstractEllipsis : public AbstractBase {
+class MS_CORE_API AbstractEllipsis : public AbstractBase {
  public:
   AbstractEllipsis() : AbstractBase(kEllipsis) { set_type(std::make_shared<TypeEllipsis>()); }
   ~AbstractEllipsis() override = default;
@@ -597,7 +597,7 @@ class AbstractEllipsis : public AbstractBase {
 };
 using AbstractEllipsisPtr = std::shared_ptr<AbstractEllipsis>;
 
-class AbstractRefKey : public AbstractBase {
+class MS_CORE_API AbstractRefKey : public AbstractBase {
  public:
   AbstractRefKey() : AbstractBase(), ref_key_value_(nullptr) { set_type(std::make_shared<RefKeyType>()); }
   ~AbstractRefKey() override = default;
@@ -627,7 +627,7 @@ class AbstractRefKey : public AbstractBase {
 };
 using AbstractRefKeyPtr = std::shared_ptr<AbstractRefKey>;
 
-class AbstractRef : public AbstractTensor {
+class MS_CORE_API AbstractRef : public AbstractTensor {
  public:
   AbstractRef(const AbstractBasePtr &ref_key, const AbstractTensorPtr &ref_value);
   ~AbstractRef() override = default;
@@ -669,19 +669,19 @@ class AbstractRef : public AbstractTensor {
 };
 using AbstractRefPtr = std::shared_ptr<AbstractRef>;
 
-struct AbstractBasePtrListHasher {
+struct MS_CORE_API AbstractBasePtrListHasher {
   std::size_t operator()(const AbstractBasePtrList &args_spec_list) const;
 };
 
-struct AbstractBasePtrListEqual {
+struct MS_CORE_API AbstractBasePtrListEqual {
   bool operator()(const AbstractBasePtrList &lhs, const AbstractBasePtrList &rhs) const;
 };
 
-std::size_t AbstractBasePtrListHash(const AbstractBasePtrList &args_spec_list);
-bool AbstractBasePtrListDeepEqual(const AbstractBasePtrList &lhs, const AbstractBasePtrList &rhs);
+MS_CORE_API std::size_t AbstractBasePtrListHash(const AbstractBasePtrList &args_spec_list);
+MS_CORE_API bool AbstractBasePtrListDeepEqual(const AbstractBasePtrList &lhs, const AbstractBasePtrList &rhs);
 
 // RowTensor
-class AbstractRowTensor : public AbstractUndetermined {
+class MS_CORE_API AbstractRowTensor : public AbstractUndetermined {
  public:
   explicit AbstractRowTensor(const AbstractBasePtr &element, const BaseShapePtr &shape = std::make_shared<Shape>())
       : AbstractUndetermined(element, shape) {}
@@ -710,7 +710,7 @@ class AbstractRowTensor : public AbstractUndetermined {
 };
 
 // SparseTensor
-class AbstractSparseTensor : public AbstractUndetermined {
+class MS_CORE_API AbstractSparseTensor : public AbstractUndetermined {
  public:
   explicit AbstractSparseTensor(const AbstractBasePtr &element, const BaseShapePtr &shape = std::make_shared<Shape>())
       : AbstractUndetermined(element, shape) {}
diff --git a/mindspore/core/abstract/dshape.h b/mindspore/core/abstract/dshape.h
index 071c3cd1a16..a2d751fd785 100644
--- a/mindspore/core/abstract/dshape.h
+++ b/mindspore/core/abstract/dshape.h
@@ -37,7 +37,7 @@ class BaseShape;
 using BaseShapePtr = std::shared_ptr<BaseShape>;
 using BaseShapePtrList = std::vector<BaseShapePtr>;
 
-class BaseShape : public Base {
+class MS_CORE_API BaseShape : public Base {
  public:
   BaseShape() = default;
   ~BaseShape() override = default;
@@ -53,7 +53,7 @@ class BaseShape : public Base {
   virtual void Broaden() {}
 };
 
-class NoShape : public BaseShape {
+class MS_CORE_API NoShape : public BaseShape {
  public:
   MS_DECLARE_PARENT(NoShape, BaseShape)
   BaseShapePtr Clone() const override { return std::make_shared<NoShape>(); }
@@ -62,7 +62,7 @@ class NoShape : public BaseShape {
 };
 extern const std::shared_ptr<NoShape> kNoShape;
 
-class Shape : public BaseShape {
+class MS_CORE_API Shape : public BaseShape {
  public:
   static const int64_t SHP_ANY = -1;
   Shape() : shape_() {}
@@ -93,7 +93,7 @@ class Shape : public BaseShape {
 using ShapePtr = std::shared_ptr<Shape>;
 using ShapePtrList = std::vector<ShapePtr>;
 
-class SequeueShape : public BaseShape {
+class MS_CORE_API SequeueShape : public BaseShape {
  public:
   SequeueShape() : p_shapes_() {}
   explicit SequeueShape(const BaseShapePtrList &shapes) : p_shapes_(shapes) {}
@@ -118,7 +118,7 @@ class SequeueShape : public BaseShape {
 };
 using SequeueShapePtr = std::shared_ptr<SequeueShape>;
 
-class TupleShape : public SequeueShape {
+class MS_CORE_API TupleShape : public SequeueShape {
  public:
   TupleShape() : SequeueShape() {}
   explicit TupleShape(const BaseShapePtrList &shapes) : SequeueShape(shapes) {}
@@ -133,7 +133,7 @@ class TupleShape : public SequeueShape {
 };
 using TupleShapePtr = std::shared_ptr<TupleShape>;
 
-class ListShape : public SequeueShape {
+class MS_CORE_API ListShape : public SequeueShape {
  public:
   ListShape() : SequeueShape() {}
   explicit ListShape(const BaseShapePtrList &shapes) : SequeueShape(shapes) {}
diff --git a/mindspore/core/abstract/primitive_infer_map.cc b/mindspore/core/abstract/primitive_infer_map.cc
index f56fbd7ba80..38f1d98c63d 100644
--- a/mindspore/core/abstract/primitive_infer_map.cc
+++ b/mindspore/core/abstract/primitive_infer_map.cc
@@ -235,7 +235,7 @@ PrimitiveEvalImplMap &GetPrimitiveToBackendEvalImplMap() {
     {prim::kPrimConcat, {InferImplConcat, nullptr, true}},
     {prim::kPrimArgMaxWithValue, {InferImplArgMaxWithValue, nullptr, true}},
     {prim::kPrimFusedSparseAdam, {InferImplFusedSparseAdam, nullptr, true}},
-    {prim::KPrimTransData, {InferImplTransData, nullptr, true}},
+    {prim::kPrimTransData, {InferImplTransData, nullptr, true}},
   };
   return prim_backend_eval_implement_map;
 }
diff --git a/mindspore/core/abstract/utils.cc b/mindspore/core/abstract/utils.cc
index 7b8e27e958b..1ae85cceb3d 100644
--- a/mindspore/core/abstract/utils.cc
+++ b/mindspore/core/abstract/utils.cc
@@ -27,11 +27,12 @@
 
 namespace mindspore {
 namespace abstract {
-const std::map<TypeId, size_t> type_map = {{kNumberTypeBool, 1},    {kNumberTypeInt, 4},     {kNumberTypeInt8, 1},
-                                           {kNumberTypeInt16, 2},   {kNumberTypeInt32, 4},   {kNumberTypeInt64, 8},
-                                           {kNumberTypeUInt, 4},    {kNumberTypeUInt8, 1},   {kNumberTypeUInt16, 2},
-                                           {kNumberTypeUInt32, 4},  {kNumberTypeUInt64, 8},  {kNumberTypeFloat, 4},
-                                           {kNumberTypeFloat16, 2}, {kNumberTypeFloat32, 4}, {kNumberTypeFloat64, 8}};
+const std::map<TypeId, size_t> type_map = {
+  {kNumberTypeBool, 1},       {kNumberTypeInt, 4},     {kNumberTypeInt8, 1},    {kNumberTypeInt16, 2},
+  {kNumberTypeInt32, 4},      {kNumberTypeInt64, 8},   {kNumberTypeUInt, 4},    {kNumberTypeUInt8, 1},
+  {kNumberTypeUInt16, 2},     {kNumberTypeUInt32, 4},  {kNumberTypeUInt64, 8},  {kNumberTypeFloat, 4},
+  {kNumberTypeFloat16, 2},    {kNumberTypeFloat32, 4}, {kNumberTypeFloat64, 8}, {kNumberTypeComplex64, 8},
+  {kNumberTypeComplex128, 16}};
 
 ValuePtr ValueJoin(const ValuePtr &value1, const ValuePtr &value2) {
   MS_EXCEPTION_IF_NULL(value1);
diff --git a/mindspore/core/api/ir/func_graph.h b/mindspore/core/api/ir/func_graph.h
index c2fd0d8e8ab..d1e9c6bbb02 100644
--- a/mindspore/core/api/ir/func_graph.h
+++ b/mindspore/core/api/ir/func_graph.h
@@ -21,11 +21,12 @@
 #include <memory>
 #include <string>
 
+#include "utils/visible.h"
 #include "api/ir/func_graph_manager.h"
 
 namespace mindspore::api {
 
-class FuncGraph {
+class MS_CORE_API FuncGraph {
  public:
   FuncGraph() = default;
   virtual ~FuncGraph() = default;
@@ -45,6 +46,8 @@ class FuncGraph {
   virtual void set_attr(const std::string &key, const ValuePtr &value) = 0;
 
   virtual FuncGraphManagerPtr get_manager() const = 0;
+
+  static std::vector<AnfNodePtr> TopoSort(const AnfNodePtr &node);
 };
 }  // namespace mindspore::api
 #endif  // MINDSPORE_CORE_API_IR_FUNC_GRAPH_H_
diff --git a/mindspore/core/api/ir/func_graph_manager.h b/mindspore/core/api/ir/func_graph_manager.h
index e1dbe4952bc..f399d4e7240 100644
--- a/mindspore/core/api/ir/func_graph_manager.h
+++ b/mindspore/core/api/ir/func_graph_manager.h
@@ -20,6 +20,7 @@
 #include <memory>
 #include <utility>
 
+#include "utils/visible.h"
 #include "utils/ordered_set.h"
 #include "utils/ordered_map.h"
 #include "ir/anf.h"
@@ -32,13 +33,13 @@ using FuncGraphPtr = std::shared_ptr<FuncGraph>;
 class FuncGraphManager;
 using FuncGraphManagerPtr = std::shared_ptr<FuncGraphManager>;
 
-struct AnfNodeIndexPairHasher {
+struct MS_CORE_API AnfNodeIndexPairHasher {
   std::size_t operator()(const std::pair<AnfNodePtr, int> &p1) const {
     return std::hash<const AnfNode *>{}(p1.first.get());
   }
 };
 
-struct AnfNodeIndexPairEqual {
+struct MS_CORE_API AnfNodeIndexPairEqual {
   bool operator()(const std::pair<AnfNodePtr, int> &lhs, const std::pair<AnfNodePtr, int> &rhs) const {
     return lhs == rhs;
   }
@@ -47,7 +48,7 @@ struct AnfNodeIndexPairEqual {
 using AnfNodeIndexSet = OrderedSet<std::pair<AnfNodePtr, int>, AnfNodeIndexPairHasher, AnfNodeIndexPairEqual>;
 using NodeUsersMap = OrderedMap<AnfNodePtr, AnfNodeIndexSet>;
 
-class FuncGraphManager {
+class MS_CORE_API FuncGraphManager {
  public:
   FuncGraphManager() = default;
   virtual ~FuncGraphManager() = default;
diff --git a/mindspore/core/base/base.h b/mindspore/core/base/base.h
index 1bc579d6207..2ca0c3088ba 100644
--- a/mindspore/core/base/base.h
+++ b/mindspore/core/base/base.h
@@ -37,7 +37,7 @@ struct is_shared_ptr : public std::false_type {};
 template <typename T>
 struct is_shared_ptr<std::shared_ptr<T>> : public std::true_type {};
 
-class Base : public std::enable_shared_from_this<Base> {
+class MS_CORE_API Base : public std::enable_shared_from_this<Base> {
  public:
   constexpr Base() = default;
   Base(const Base &other) : std::enable_shared_from_this<Base>(other) {}
diff --git a/mindspore/core/base/core_ops.h b/mindspore/core/base/core_ops.h
index 9ad67236fc6..30652190a27 100644
--- a/mindspore/core/base/core_ops.h
+++ b/mindspore/core/base/core_ops.h
@@ -78,6 +78,7 @@ constexpr auto kFastGeLU = "FastGeLU";
 constexpr auto kFastGeLUGrad = "FastGeLUGrad";
 constexpr auto kStridedSlice = "StridedSlice";
 constexpr auto kZerosLike = "ZerosLike";
+constexpr auto kOnes = "Ones";
 constexpr auto kOnesLike = "OnesLike";
 constexpr auto kDiag = "Diag";
 constexpr auto kDiagPart = "DiagPart";
@@ -93,6 +94,7 @@ constexpr auto kDropoutDoMask = "DropoutDoMask";
 constexpr auto kDropout = "Dropout";
 constexpr auto kDropoutGrad = "DropoutGrad";
 constexpr auto kConv2DTranspose = "Conv2DTranspose";
+constexpr auto kRoll = "Roll";
 
 // Here list all primitives used in backend or some special primitives used by core.
 // GetNext
@@ -202,7 +204,7 @@ inline const PrimitivePtr kPrimSliceFusion = std::make_shared<Primitive>("SliceF
 inline const PrimitivePtr kPrimTile = std::make_shared<Primitive>(kTile);
 inline const PrimitivePtr kPrimAddN = std::make_shared<Primitive>("AddN");
 inline const PrimitivePtr kPrimAccumulateNV2 = std::make_shared<Primitive>("AccumulateNV2");
-inline const PrimitivePtr KPrimTransData = std::make_shared<Primitive>("TransData");
+inline const PrimitivePtr kPrimTransData = std::make_shared<Primitive>("TransData");
 inline const PrimitivePtr kPrimNMSWithMask = std::make_shared<Primitive>("NMSWithMask");
 inline const PrimitivePtr kPrimPad = std::make_shared<Primitive>("Pad");
 inline const PrimitivePtr kPrimArgMaxWithValue = std::make_shared<Primitive>("ArgMaxWithValue");
@@ -283,6 +285,7 @@ inline const PrimitivePtr kPrimCTCLossV2Grad = std::make_shared<Primitive>("CTCL
 inline const PrimitivePtr kPrimCTCLoss = std::make_shared<Primitive>(kCTCLoss);
 inline const PrimitivePtr kPrimFullConnection = std::make_shared<Primitive>("FullConnection");
 inline const PrimitivePtr kPrimConv2DTranspose = std::make_shared<Primitive>(kConv2DTranspose);
+inline const PrimitivePtr kPrimRoll = std::make_shared<Primitive>(kRoll);
 inline const PrimitivePtr kPrimGroupConv2DGradInput = std::make_shared<Primitive>("GroupConv2DGradInput");
 inline const PrimitivePtr kPrimBatchNorm = std::make_shared<Primitive>("BatchNorm");
 inline const PrimitivePtr kPrimBatchNormGrad = std::make_shared<Primitive>("BatchNormGrad");
@@ -351,6 +354,7 @@ inline const PrimitivePtr kPrimSoftplus = std::make_shared<Primitive>("Softplus"
 inline const PrimitivePtr kPrimSoftplusGrad = std::make_shared<Primitive>("SoftplusGrad");
 inline const PrimitivePtr kPrimZeros = std::make_shared<Primitive>("Zeros");
 inline const PrimitivePtr kPrimZerosLike = std::make_shared<Primitive>(kZerosLike);
+inline const PrimitivePtr kPrimOnes = std::make_shared<Primitive>(kOnes);
 inline const PrimitivePtr kPrimOnesLike = std::make_shared<Primitive>(kOnesLike);
 inline const PrimitivePtr kPrimBpropCut = std::make_shared<Primitive>("bprop_cut");
 inline const PrimitivePtr kPrimFakeQuantPerLayer = std::make_shared<Primitive>("FakeQuantPerLayer");
diff --git a/mindspore/core/ir/anf.cc b/mindspore/core/ir/anf.cc
index 3ef25ab473b..6178d1be3df 100644
--- a/mindspore/core/ir/anf.cc
+++ b/mindspore/core/ir/anf.cc
@@ -419,7 +419,7 @@ std::string GetVirtualNodeTargetFromInputs(const AnfNodePtr &node) {
     }
     std::string first_input_target = kTargetUnDefined;
     bool has_diff_target =
-      std::any_of(std::begin(real_inputs), std::end(real_inputs), [&first_input_target](const AnfNodePtr &n) {
+      std::any_of(std::rbegin(real_inputs), std::rend(real_inputs), [&first_input_target](const AnfNodePtr &n) {
         auto target = GetOriginNodeTarget(n);
         if (target == kTargetUnDefined) {
           return false;
diff --git a/mindspore/core/ir/anf.h b/mindspore/core/ir/anf.h
index 7d4a2607525..8d1f611923c 100644
--- a/mindspore/core/ir/anf.h
+++ b/mindspore/core/ir/anf.h
@@ -96,7 +96,7 @@ using ParamInfoPtr = std::shared_ptr<ParamInfo>;
 // input of other CNodes, you can get the related info by this method.
 // debug_info: return the information retrieved from parser. Set it using set_debug_info.
 // fullname_with_scope: return the detailed debug info.
-class AnfNode : public Base {
+class MS_CORE_API AnfNode : public Base {
  public:
   explicit AnfNode(const FuncGraphPtr &func_graph)
       : func_graph_(FuncGraphWeakPtr(func_graph)),
@@ -117,7 +117,7 @@ class AnfNode : public Base {
   virtual void accept(AnfIrVisitor *) {}
   FuncGraphPtr func_graph() const { return func_graph_.lock(); }
 
-  void set_func_graph(const FuncGraphPtr &func_graph) { func_graph_ = FuncGraphWeakPtr(func_graph); }
+  virtual void set_func_graph(const FuncGraphPtr &func_graph) { func_graph_ = FuncGraphWeakPtr(func_graph); }
 
   ScopePtr scope() { return scope_; }
   void set_scope(const ScopePtr &scope) { scope_ = scope; }
@@ -234,7 +234,7 @@ class AnfNode : public Base {
 // stop_gradient_: a flag used to stop gradient.
 // Using stop_gradient() to get this flag, mainly used in ad.
 // Using set_stop_gradient() to set this flag.
-class CNode : public AnfNode, public EffectInfoHolder {
+class MS_CORE_API CNode : public AnfNode, public EffectInfoHolder {
  public:
   CNode(const std::vector<AnfNodePtr> &inputs, const FuncGraphPtr &func_graph);
   CNode(const std::vector<AnfNodePtr> &inputs, const VarPtr &func_graph_as_var)
@@ -365,7 +365,7 @@ class CNode : public AnfNode, public EffectInfoHolder {
 };
 
 // ANode represents the atomic node. It's derived Parameter and ValueNode.
-class ANode : public AnfNode {
+class MS_CORE_API ANode : public AnfNode {
  public:
   ANode() : AnfNode(nullptr) {}
   explicit ANode(const FuncGraphPtr &func_graph) : AnfNode(func_graph) {}
@@ -377,7 +377,7 @@ class ANode : public AnfNode {
 // Parameter represents the parameter inputs of a function. They have no value.
 // Attributes:
 // default_param_value_: used to hold the inputting tensor of the model.
-class Parameter : public ANode {
+class MS_CORE_API Parameter : public ANode {
  public:
   explicit Parameter(const FuncGraphPtr &func_graph)
       : ANode(func_graph), name_(""), has_default_(false), default_param_(nullptr), used_graph_count_(0) {}
@@ -443,7 +443,7 @@ using ParameterPtr = std::shared_ptr<Parameter>;
 
 // Value is used to represent the atomic expression mentioned in BNF.
 // It mainly be stored in ValueNode. Value and ValueNode is related definition.
-class Value : public Base {
+class MS_CORE_API Value : public Base {
  public:
   Value() = default;
   explicit Value(const TypePtr t) : type_(t) {}
@@ -469,12 +469,16 @@ class Value : public Base {
 
 // ValueNode is used to hold value. Unlike CNode and Parameter, ValueNode
 // does not belong to any particular function graph.
-class ValueNode : public ANode {
+class MS_CORE_API ValueNode : public ANode {
  public:
   explicit ValueNode(const ValuePtr &value) : value_(value) {}
   ~ValueNode() override = default;
   MS_DECLARE_PARENT(ValueNode, ANode);
 
+  void set_func_graph(const FuncGraphPtr &func_graph) override {
+    MS_EXCEPTION(ValueError) << "ValueNode should not set its func_graph.";
+  }
+
   void accept(AnfIrVisitor *v) override;
   void set_value(const ValuePtr &value) { value_ = value; }
   const ValuePtr &value() const { return value_; }
diff --git a/mindspore/core/ir/cell.h b/mindspore/core/ir/cell.h
index 29fcc93fef3..c0d1c655ad3 100644
--- a/mindspore/core/ir/cell.h
+++ b/mindspore/core/ir/cell.h
@@ -31,7 +31,7 @@ using abstract::AbstractBasePtr;
 using abstract::AbstractBasePtrList;
 // value for Cell
 
-class Cell : public Named {
+class MS_CORE_API Cell : public Named {
  public:
   explicit Cell(const std::string &name) : Named(name) {}
   MS_DECLARE_PARENT(Cell, Named);
diff --git a/mindspore/core/ir/device_event.h b/mindspore/core/ir/device_event.h
index 8309d2b4e37..5c855bbf3a4 100644
--- a/mindspore/core/ir/device_event.h
+++ b/mindspore/core/ir/device_event.h
@@ -24,6 +24,8 @@ class DeviceEvent {
   virtual void WaitEvent() = 0;
   virtual void RecordEvent() = 0;
   virtual bool NeedWait() = 0;
+  virtual void SyncEvent() = 0;
+  virtual void ElapsedTime(float *cost_time, DeviceEvent *other) = 0;
   virtual void set_wait_stream(void *stream) = 0;
   virtual void set_record_stream(void *stream) = 0;
 };
diff --git a/mindspore/core/ir/dtype.h b/mindspore/core/ir/dtype.h
index ad00dde44b9..38b798a186a 100644
--- a/mindspore/core/ir/dtype.h
+++ b/mindspore/core/ir/dtype.h
@@ -42,9 +42,9 @@
 /* namespace to support intermediate representation definition */
 namespace mindspore {
 // Only few type supported now.
-TypePtr TypeIdToType(TypeId id);
+MS_CORE_API TypePtr TypeIdToType(TypeId id);
 
-class String : public Object {
+class MS_CORE_API String : public Object {
  public:
   String() : Object(kObjectTypeString, false) {}
   ~String() override = default;
@@ -59,7 +59,7 @@ class String : public Object {
 };
 using StringPtr = std::shared_ptr<String>;
 
-class Keyword : public Object {
+class MS_CORE_API Keyword : public Object {
  public:
   Keyword() : Object(kObjectTypeKeyword, false), key_(""), value_(nullptr) {}
   Keyword(const std::string &key, const TypePtr &value) : Object(kObjectTypeKeyword, false), key_(key), value_(value) {}
@@ -83,7 +83,7 @@ class Keyword : public Object {
 };
 using KeywordPtr = std::shared_ptr<Keyword>;
 
-class Slice : public Object {
+class MS_CORE_API Slice : public Object {
  public:
   Slice() : Object(kObjectTypeSlice), start_(nullptr), stop_(nullptr), step_(nullptr) {}
   Slice(const TypePtr &start, const TypePtr &stop, const TypePtr &step)
@@ -110,7 +110,7 @@ class Slice : public Object {
 };
 using SlicePtr = std::shared_ptr<Slice>;
 
-class Function : public Object {
+class MS_CORE_API Function : public Object {
  public:
   Function();
   Function(const std::vector<TypePtr> &args, const TypePtr retval);
@@ -135,7 +135,7 @@ class Function : public Object {
 };
 using FunctionPtr = std::shared_ptr<Function>;
 
-class JTagged : public Object {
+class MS_CORE_API JTagged : public Object {
  public:
   JTagged() : Object(kObjectTypeJTagged) {}
   explicit JTagged(const TypePtr &subtype) : Object(kObjectTypeJTagged, false), subtype_(subtype) {}
@@ -153,7 +153,7 @@ class JTagged : public Object {
 };
 using JTaggedPtr = std::shared_ptr<JTagged>;
 
-class SymbolicKeyType : public Object {
+class MS_CORE_API SymbolicKeyType : public Object {
  public:
   SymbolicKeyType() : Object(kObjectTypeSymbolicKeyType) {}
   ~SymbolicKeyType() override = default;
@@ -165,7 +165,7 @@ class SymbolicKeyType : public Object {
   std::string DumpText() const override { return "SymType"; }
 };
 
-class EnvType : public Object {
+class MS_CORE_API EnvType : public Object {
  public:
   EnvType() : Object(kObjectTypeEnvType) {}
   ~EnvType() override = default;
@@ -177,7 +177,7 @@ class EnvType : public Object {
 };
 using EnvTypePtr = std::shared_ptr<EnvType>;
 
-class TypeType : public Type {
+class MS_CORE_API TypeType : public Type {
  public:
   TypeType() : Type(kMetaTypeTypeType) {}
   ~TypeType() override = default;
@@ -190,7 +190,7 @@ class TypeType : public Type {
 };
 using TypeTypePtr = std::shared_ptr<TypeType>;
 
-class Problem : public Type {
+class MS_CORE_API Problem : public Type {
  public:
   Problem() : Type(kMetaTypeProblem), kind_(Named("unknown")) {}
   explicit Problem(const Named &kind) : Type(kMetaTypeProblem), kind_(kind) {}
@@ -209,7 +209,7 @@ class Problem : public Type {
 };
 using ProblemPtr = std::shared_ptr<Problem>;
 
-class External : public Type {
+class MS_CORE_API External : public Type {
  public:
   External() : Type(kMetaTypeExternal) {}
   ~External() override = default;
@@ -230,39 +230,39 @@ TypePtr Clone(const T &t) {
   return t.Clone();
 }
 
-TypePtr StringToType(const std::string &type_name);
+MS_CORE_API TypePtr StringToType(const std::string &type_name);
 
 // Judge whether x is predicate or is a subclass of predicate.
-bool IsIdentidityOrSubclass(TypePtr const &x, TypePtr const &base_type);
+MS_CORE_API bool IsIdentidityOrSubclass(TypePtr const &x, TypePtr const &base_type);
 
 // Whether t1 is identity or a subclass of t2.
-bool IsSubType(TypePtr const &t1, TypePtr const &t2 = nullptr);
+MS_CORE_API bool IsSubType(TypePtr const &t1, TypePtr const &t2 = nullptr);
 
-struct TypeHasher {
+struct MS_CORE_API TypeHasher {
   std::size_t operator()(TypePtr const &type) const;
 };
-struct TypeListHasher {
+struct MS_CORE_API TypeListHasher {
   std::size_t operator()(const TypePtrList &type_list) const;
 };
-struct TypeEqual {
+struct MS_CORE_API TypeEqual {
   bool operator()(TypePtr const &t1, TypePtr const &t2) const;
 };
-struct TypeListEqual {
+struct MS_CORE_API TypeListEqual {
   bool operator()(TypePtrList const &lhs, TypePtrList const &rhs) const;
 };
 
-extern const TypePtr kTypeExternal;
-extern const TypePtr kTypeEnv;
-extern const TypePtr kTypeType;
-extern const TypePtr kString;
-extern const TypePtr kList;
-extern const TypePtr kTuple;
-extern const TypePtr kDict;
-extern const TypePtr kSlice;
-extern const TypePtr kKeyword;
-extern const TypePtr kTensorType;
-extern const TypePtr kTensorTypeFP16;
-extern const TypePtr kTensorTypeFP32;
+MS_CORE_API extern const TypePtr kTypeExternal;
+MS_CORE_API extern const TypePtr kTypeEnv;
+MS_CORE_API extern const TypePtr kTypeType;
+MS_CORE_API extern const TypePtr kString;
+MS_CORE_API extern const TypePtr kList;
+MS_CORE_API extern const TypePtr kTuple;
+MS_CORE_API extern const TypePtr kDict;
+MS_CORE_API extern const TypePtr kSlice;
+MS_CORE_API extern const TypePtr kKeyword;
+MS_CORE_API extern const TypePtr kTensorType;
+MS_CORE_API extern const TypePtr kTensorTypeFP16;
+MS_CORE_API extern const TypePtr kTensorTypeFP32;
 }  // namespace mindspore
 
 #endif  // MINDSPORE_CORE_IR_DTYPE_H_
diff --git a/mindspore/core/ir/dtype/container.h b/mindspore/core/ir/dtype/container.h
index a6aa07e6f7f..8ce91bc6df8 100644
--- a/mindspore/core/ir/dtype/container.h
+++ b/mindspore/core/ir/dtype/container.h
@@ -37,7 +37,7 @@ namespace mindspore {
 // TypeRefKey type
 
 // List
-class List : public Object {
+class MS_CORE_API List : public Object {
  public:
   List() : Object(kObjectTypeList) {}
   List(const std::initializer_list<TypePtr> &objs)
@@ -65,7 +65,7 @@ using ListPtr = std::shared_ptr<List>;
 
 using ClassAttrVector = std::vector<std::pair<std::string, TypePtr>>;
 
-class Class : public Object {
+class MS_CORE_API Class : public Object {
  public:
   Class() : Object(kObjectTypeClass), tag_(Named("Class")) {}
   Class(const Named &tag, const ClassAttrVector &attributes, const std::unordered_map<std::string, ValuePtr> &methods);
@@ -95,7 +95,7 @@ class Class : public Object {
 };
 using ClassPtr = std::shared_ptr<Class>;
 
-class Tuple : public Object {
+class MS_CORE_API Tuple : public Object {
  public:
   Tuple() : Object(kObjectTypeTuple) {}
   // usage : Tuple t = {std::make_shared<Bool>(), std::make_shared<Int>(32)};
@@ -125,7 +125,7 @@ class Tuple : public Object {
 };
 using TuplePtr = std::shared_ptr<Tuple>;
 
-class Dictionary : public Object {
+class MS_CORE_API Dictionary : public Object {
  public:
   Dictionary() : Object(kObjectTypeDictionary) {}
   explicit Dictionary(const std::vector<std::pair<std::string, TypePtr>> &key_values)
diff --git a/mindspore/core/ir/dtype/empty.h b/mindspore/core/ir/dtype/empty.h
index d2422f8fc3c..bdbbe5c9c1a 100644
--- a/mindspore/core/ir/dtype/empty.h
+++ b/mindspore/core/ir/dtype/empty.h
@@ -34,7 +34,7 @@
 #include "ir/dtype/type.h"
 
 namespace mindspore {
-class TypeAnything : public Type {
+class MS_CORE_API TypeAnything : public Type {
  public:
   TypeAnything() : Type(kMetaTypeAnything) {}
   ~TypeAnything() override {}
@@ -46,7 +46,7 @@ class TypeAnything : public Type {
 };
 using TypeAnythingPtr = std::shared_ptr<TypeAnything>;
 
-class TypeNone : public Type {
+class MS_CORE_API TypeNone : public Type {
  public:
   TypeNone() : Type(kMetaTypeNone) {}
   ~TypeNone() override {}
@@ -59,7 +59,7 @@ class TypeNone : public Type {
 };
 using TypeNonePtr = std::shared_ptr<TypeNone>;
 
-class TypeNull : public Type {
+class MS_CORE_API TypeNull : public Type {
  public:
   TypeNull() : Type(kMetaTypeNull) {}
   ~TypeNull() override {}
@@ -71,7 +71,7 @@ class TypeNull : public Type {
 };
 using TypeNullPtr = std::shared_ptr<TypeNull>;
 
-class TypeEllipsis : public Type {
+class MS_CORE_API TypeEllipsis : public Type {
  public:
   TypeEllipsis() : Type(kMetaTypeEllipsis) {}
   ~TypeEllipsis() override {}
@@ -84,10 +84,10 @@ class TypeEllipsis : public Type {
 };
 using TypeEllipsisPtr = std::shared_ptr<TypeEllipsis>;
 
-extern const TypePtr kTypeNone;
-extern const TypePtr kTypeNull;
-extern const TypePtr kTypeEllipsis;
-extern const TypePtr kAnyType;
+MS_CORE_API extern const TypePtr kTypeNone;
+MS_CORE_API extern const TypePtr kTypeNull;
+MS_CORE_API extern const TypePtr kTypeEllipsis;
+MS_CORE_API extern const TypePtr kAnyType;
 }  // namespace mindspore
 
 #endif  // MINDSPORE_CORE_IR_DTYPE_EMPTY_H_
diff --git a/mindspore/core/ir/dtype/number.cc b/mindspore/core/ir/dtype/number.cc
index 1c5a185023a..e47b21288bb 100644
--- a/mindspore/core/ir/dtype/number.cc
+++ b/mindspore/core/ir/dtype/number.cc
@@ -46,4 +46,10 @@ Float::Float(const int nbits) : Number(FloatBitsToTypeId(nbits), nbits, false) {
     MS_LOG(EXCEPTION) << "Wrong number of bits.";
   }
 }
+
+Complex::Complex(const int nbits) : Number(ComplexBitsToTypeId(nbits), nbits, false) {
+  if (nbits != 64 && nbits != 128) {
+    MS_LOG(EXCEPTION) << "Wrong number of bits.";
+  }
+}
 }  // namespace mindspore
diff --git a/mindspore/core/ir/dtype/number.h b/mindspore/core/ir/dtype/number.h
index d1f1698ae63..e46ea41fcf3 100644
--- a/mindspore/core/ir/dtype/number.h
+++ b/mindspore/core/ir/dtype/number.h
@@ -35,7 +35,7 @@
 
 namespace mindspore {
 // Number, abstract class.
-class Number : public Object {
+class MS_CORE_API Number : public Object {
  public:
   Number() : Object(kObjectTypeNumber), number_type_(kObjectTypeNumber), nbits_(0) {}
   Number(const TypeId number_type, const int nbits, bool is_generic = true)
@@ -71,7 +71,7 @@ class Number : public Object {
 using NumberPtr = std::shared_ptr<Number>;
 
 // Bool
-class Bool : public Number {
+class MS_CORE_API Bool : public Number {
  public:
   Bool() : Number(kNumberTypeBool, 8) {}
   ~Bool() override = default;
@@ -85,7 +85,7 @@ class Bool : public Number {
 };
 
 // Int
-class Int : public Number {
+class MS_CORE_API Int : public Number {
  public:
   Int() : Number(kNumberTypeInt, 0) {}
   explicit Int(const int nbits);
@@ -106,7 +106,7 @@ class Int : public Number {
 };
 
 // UInt
-class UInt : public Number {
+class MS_CORE_API UInt : public Number {
  public:
   UInt() : Number(kNumberTypeUInt, 0) {}
   explicit UInt(const int nbits);
@@ -129,7 +129,7 @@ class UInt : public Number {
 };
 
 // Float
-class Float : public Number {
+class MS_CORE_API Float : public Number {
  public:
   Float() : Number(kNumberTypeFloat, 0) {}
   explicit Float(const int nbits);
@@ -150,20 +150,19 @@ class Float : public Number {
   }
 };
 
-// Complex64
-class Complex64 : public Number {
+// Complex
+class MS_CORE_API Complex : public Number {
  public:
-  Complex64() : Number(kNumberTypeComplex64, 64, false) {}
-  ~Complex64() override {}
-  MS_DECLARE_PARENT(Complex64, Number)
+  Complex() : Number(kNumberTypeComplex64, 64, false) {}
+  explicit Complex(const int nbits);
+  ~Complex() override {}
+  MS_DECLARE_PARENT(Complex, Number)
 
   TypeId generic_type_id() const override { return kNumberTypeComplex64; }
-  TypePtr DeepCopy() const override { return std::make_shared<Complex64>(); }
+  TypePtr DeepCopy() const override { return std::make_shared<Complex>(nbits()); }
   std::string ToString() const override { return GetTypeName("Complex"); }
-  std::string ToReprString() const override { return nbits() == 0 ? "complex64_" : GetTypeName("complex64"); }
-  std::string DumpText() const override {
-    return nbits() == 0 ? std::string("Complex64") : std::string("C") + std::to_string(nbits());
-  }
+  std::string ToReprString() const override { return GetTypeName("complex"); }
+  std::string DumpText() const override { return std::string("C") + std::to_string(nbits()); }
 };
 
 inline const TypePtr kBool = std::make_shared<Bool>();
@@ -182,7 +181,8 @@ inline const TypePtr kInt = std::make_shared<Int>();
 inline const TypePtr kUInt = std::make_shared<UInt>();
 inline const TypePtr kFloat = std::make_shared<Float>();
 inline const TypePtr kNumber = std::make_shared<Number>();
-inline const TypePtr kComplex64 = std::make_shared<Complex64>();
+inline const TypePtr kComplex64 = std::make_shared<Complex>(64);
+inline const TypePtr kComplex128 = std::make_shared<Complex>(128);
 }  // namespace mindspore
 
 #endif  // MINDSPORE_CORE_IR_DTYPE_NUMBER_H_
diff --git a/mindspore/core/ir/dtype/ref.h b/mindspore/core/ir/dtype/ref.h
index ccdcb6cf6b3..e428c3a6eca 100644
--- a/mindspore/core/ir/dtype/ref.h
+++ b/mindspore/core/ir/dtype/ref.h
@@ -27,7 +27,7 @@
 
 namespace mindspore {
 // TypeRefKey type
-class RefKeyType : public Object {
+class MS_CORE_API RefKeyType : public Object {
  public:
   RefKeyType() : Object(kObjectTypeRefKey) {}
   ~RefKeyType() override {}
@@ -40,7 +40,7 @@ class RefKeyType : public Object {
 };
 
 // TypeRef type
-class RefType : public TensorType {
+class MS_CORE_API RefType : public TensorType {
  public:
   RefType() : TensorType() {}
   explicit RefType(const TensorTypePtr &subtype) : TensorType(subtype->element()) {}
@@ -53,8 +53,8 @@ class RefType : public TensorType {
 };
 using RefTypePtr = std::shared_ptr<RefType>;
 
-extern const TypePtr kRefKeyType;
-extern const TypePtr kRefType;
+MS_CORE_API extern const TypePtr kRefKeyType;
+MS_CORE_API extern const TypePtr kRefType;
 }  // namespace mindspore
 
 #endif  // MINDSPORE_CORE_IR_DTYPE_REF_H_
diff --git a/mindspore/core/ir/dtype/tensor_type.h b/mindspore/core/ir/dtype/tensor_type.h
index 7fb2b911f78..0c3d48da0f1 100644
--- a/mindspore/core/ir/dtype/tensor_type.h
+++ b/mindspore/core/ir/dtype/tensor_type.h
@@ -34,7 +34,7 @@
 #include "ir/dtype/type.h"
 
 namespace mindspore {
-class UndeterminedType : public Object {
+class MS_CORE_API UndeterminedType : public Object {
  public:
   UndeterminedType() : Object(kObjectTypeUndeterminedType) {}
   explicit UndeterminedType(const TypePtr &ele)
@@ -57,7 +57,7 @@ class UndeterminedType : public Object {
 };
 using MetaTensorTypePtr = std::shared_ptr<UndeterminedType>;
 
-class TensorType : public Object {
+class MS_CORE_API TensorType : public Object {
  public:
   TensorType() : Object(kObjectTypeTensorType, kObjectTypeUndeterminedType) {}
   explicit TensorType(const TypePtr &ele)
@@ -80,7 +80,7 @@ class TensorType : public Object {
 };
 using TensorTypePtr = std::shared_ptr<TensorType>;
 
-class RowTensorType : public Object {
+class MS_CORE_API RowTensorType : public Object {
  public:
   RowTensorType() : Object(kObjectTypeRowTensorType, kObjectTypeUndeterminedType) {}
   explicit RowTensorType(const TypePtr &ele)
@@ -103,7 +103,7 @@ class RowTensorType : public Object {
 };
 using RowTensorTypePtr = std::shared_ptr<RowTensorType>;
 
-class SparseTensorType : public Object {
+class MS_CORE_API SparseTensorType : public Object {
  public:
   SparseTensorType() : Object(kObjectTypeSparseTensorType, kObjectTypeUndeterminedType) {}
   explicit SparseTensorType(const TypePtr &ele)
diff --git a/mindspore/core/ir/dtype/type.cc b/mindspore/core/ir/dtype/type.cc
index dc3624fad76..b733b6095cf 100644
--- a/mindspore/core/ir/dtype/type.cc
+++ b/mindspore/core/ir/dtype/type.cc
@@ -87,6 +87,7 @@ enum class BitsNum : int {
   eBits16 = 16,
   eBits32 = 32,
   eBits64 = 64,
+  eBits128 = 128,
 };
 TypeId IntBitsToTypeId(const int nbits) {
   switch (nbits) {
@@ -131,6 +132,17 @@ TypeId FloatBitsToTypeId(const int nbits) {
   }
 }
 
+TypeId ComplexBitsToTypeId(const int nbits) {
+  switch (nbits) {
+    case static_cast<int>(BitsNum::eBits64):
+      return kNumberTypeComplex64;
+    case static_cast<int>(BitsNum::eBits128):
+      return kNumberTypeComplex128;
+    default:
+      MS_LOG(EXCEPTION) << "Wrong number of bits:" << nbits;
+  }
+}
+
 const std::string &TypeIdLabel(const TypeId &v) {
   static const std::string unknown("[Unknown Type Id]");
   auto iter = g_type_2_lable.find(v);
diff --git a/mindspore/core/ir/dtype/type.h b/mindspore/core/ir/dtype/type.h
index 6cff3df1899..73e63164ce5 100644
--- a/mindspore/core/ir/dtype/type.h
+++ b/mindspore/core/ir/dtype/type.h
@@ -41,6 +41,7 @@ namespace mindspore {
 TypeId IntBitsToTypeId(const int nbits);
 TypeId UIntBitsToTypeId(const int nbits);
 TypeId FloatBitsToTypeId(const int nbits);
+TypeId ComplexBitsToTypeId(const int nbits);
 const std::string &TypeIdLabel(const TypeId &v);
 TypeId NormalizeTypeId(const TypeId type_id);
 bool IsSameObjectType(const Type &lhs, const Type &rhs);
@@ -49,7 +50,7 @@ size_t GetTypeByte(const TypePtr &type_ptr);
 // Base class for all types
 // forward declaration.
 
-class Type : public Value {
+class MS_CORE_API Type : public Value {
  public:
   Type() : meta_type_(kMetaTypeType), is_generic_(true) {}
   explicit Type(TypeId t, bool is_generic = true) : meta_type_(t), is_generic_(is_generic) {}
@@ -94,7 +95,7 @@ using TypePtrList = std::vector<TypePtr>;
 //
 // Base class for normal objects
 //
-class Object : public Type {
+class MS_CORE_API Object : public Type {
  public:
   Object() : Type(kMetaTypeObject), object_type_(kMetaTypeObject), parent_type_(kMetaTypeObject) {}
   explicit Object(const TypeId object_type, bool is_generic = true)
@@ -132,7 +133,7 @@ const std::unordered_map<TypeId, int> type_priority_map = {
   {kNumberTypeInt16, 3},   {kNumberTypeInt32, 4},   {kNumberTypeInt64, 5},
   {kNumberTypeFloat16, 6}, {kNumberTypeFloat32, 7}, {kNumberTypeFloat64, 8}};
 
-std::ostream &operator<<(std::ostream &os, const TypePtrList &types);
+MS_CORE_API std::ostream &operator<<(std::ostream &os, const TypePtrList &types);
 }  // namespace mindspore
 
 #endif  // MINDSPORE_CORE_IR_DTYPE_TYPE_H_
diff --git a/mindspore/core/ir/dtype/type_id.h b/mindspore/core/ir/dtype/type_id.h
index 46209b8ba43..bb3a58c57e0 100644
--- a/mindspore/core/ir/dtype/type_id.h
+++ b/mindspore/core/ir/dtype/type_id.h
@@ -79,6 +79,8 @@ enum TypeId : int {
   kNumberTypeFloat32,
   kNumberTypeFloat64,
   kNumberTypeComplex64,
+  kNumberTypeComplex128,
+  kNumberTypeInt4,
   kNumberTypeEnd,
   //
   // Monad Types
diff --git a/mindspore/core/ir/dtype_extends.cc b/mindspore/core/ir/dtype_extends.cc
index 14173909552..76f4e8e3693 100644
--- a/mindspore/core/ir/dtype_extends.cc
+++ b/mindspore/core/ir/dtype_extends.cc
@@ -61,41 +61,20 @@ bool TypeListEqual::operator()(TypePtrList const &lhs, TypePtrList const &rhs) c
 }
 
 TypePtr TypeIdToType(TypeId id) {
-  static std::unordered_map<TypeId, TypePtr> type_id_to_type = {{kNumberTypeFloat16, kFloat16},
-                                                                {kNumberTypeFloat, kFloat32},
-                                                                {kNumberTypeFloat32, kFloat32},
-                                                                {kNumberTypeFloat64, kFloat64},
-                                                                {kNumberTypeComplex64, kComplex64},
-                                                                {kNumberTypeInt8, kInt8},
-                                                                {kNumberTypeInt16, kInt16},
-                                                                {kNumberTypeInt32, kInt32},
-                                                                {kNumberTypeInt, kInt32},
-                                                                {kNumberTypeInt64, kInt64},
-                                                                {kNumberTypeUInt8, kUInt8},
-                                                                {kNumberTypeUInt16, kUInt16},
-                                                                {kNumberTypeUInt32, kUInt32},
-                                                                {kNumberTypeUInt64, kUInt64},
-                                                                {kNumberTypeBool, kBool},
-                                                                {kMetaTypeExternal, kTypeExternal},
-                                                                {kMetaTypeAnything, kAnyType},
-                                                                {kMetaTypeNone, kTypeNone},
-                                                                {kMetaTypeNull, kTypeNull},
-                                                                {kMetaTypeEllipsis, kTypeEllipsis},
-                                                                {kObjectTypeEnvType, kTypeEnv},
-                                                                {kObjectTypeRefKey, kRefKeyType},
-                                                                {kObjectTypeRef, kRefType},
-                                                                {kMetaTypeTypeType, kTypeType},
-                                                                {kObjectTypeString, kString},
-                                                                {kObjectTypeList, kList},
-                                                                {kObjectTypeTuple, kTuple},
-                                                                {kObjectTypeDictionary, kDict},
-                                                                {kObjectTypeSlice, kSlice},
-                                                                {kObjectTypeKeyword, kKeyword},
-                                                                {kObjectTypeTensorType, kTensorType},
-                                                                {kObjectTypeUMonad, kUMonadType},
-                                                                {kObjectTypeIOMonad, kIOMonadType},
-                                                                {kTypeUnknown, kTypeNone},
-                                                                {kMetaTypeProblem, kTypeNone}};
+  static std::unordered_map<TypeId, TypePtr> type_id_to_type = {
+    {kNumberTypeFloat16, kFloat16},     {kNumberTypeFloat, kFloat32},         {kNumberTypeFloat32, kFloat32},
+    {kNumberTypeFloat64, kFloat64},     {kNumberTypeComplex64, kComplex64},   {kNumberTypeInt8, kInt8},
+    {kNumberTypeInt16, kInt16},         {kNumberTypeInt32, kInt32},           {kNumberTypeInt, kInt32},
+    {kNumberTypeInt64, kInt64},         {kNumberTypeUInt8, kUInt8},           {kNumberTypeUInt16, kUInt16},
+    {kNumberTypeUInt32, kUInt32},       {kNumberTypeUInt64, kUInt64},         {kNumberTypeBool, kBool},
+    {kNumberTypeComplex64, kComplex64}, {kNumberTypeComplex128, kComplex128}, {kMetaTypeExternal, kTypeExternal},
+    {kMetaTypeAnything, kAnyType},      {kMetaTypeNone, kTypeNone},           {kMetaTypeNull, kTypeNull},
+    {kMetaTypeEllipsis, kTypeEllipsis}, {kObjectTypeEnvType, kTypeEnv},       {kObjectTypeRefKey, kRefKeyType},
+    {kObjectTypeRef, kRefType},         {kMetaTypeTypeType, kTypeType},       {kObjectTypeString, kString},
+    {kObjectTypeList, kList},           {kObjectTypeTuple, kTuple},           {kObjectTypeDictionary, kDict},
+    {kObjectTypeSlice, kSlice},         {kObjectTypeKeyword, kKeyword},       {kObjectTypeTensorType, kTensorType},
+    {kObjectTypeUMonad, kUMonadType},   {kObjectTypeIOMonad, kIOMonadType},   {kTypeUnknown, kTypeNone},
+    {kMetaTypeProblem, kTypeNone}};
   const auto &it = type_id_to_type.find(id);
   if (it == type_id_to_type.end()) {
     MS_LOG(EXCEPTION) << "Not support the type: " << id;
diff --git a/mindspore/core/ir/func_graph.cc b/mindspore/core/ir/func_graph.cc
index 703b679fe40..1abf10b099d 100644
--- a/mindspore/core/ir/func_graph.cc
+++ b/mindspore/core/ir/func_graph.cc
@@ -632,7 +632,7 @@ std::list<CNodePtr> FuncGraph::GetOrderedCnodes() {
   auto SuccDepends = std::bind(SuccIncludeFV, this_ptr, std::placeholders::_1);
 
   std::list<CNodePtr> cnodes;
-  auto nodes = TopoSort(get_return(), SuccDepends, BelongSameGraph);
+  auto nodes = mindspore::TopoSort(get_return(), SuccDepends, BelongSameGraph);
   for (const auto &node : nodes) {
     auto cnode = dyn_cast<CNode>(node);
     if (cnode) {
@@ -727,7 +727,7 @@ bool FuncGraph::ContainMultiTarget() const {
   MS_EXCEPTION_IF_NULL(graph_manager);
   FuncGraphSet graphs = graph_manager->func_graphs();
   for (auto &g : graphs) {
-    auto nodes = TopoSort(g->get_return());
+    auto nodes = mindspore::TopoSort(g->get_return());
     if (mindspore::ContainMultiTarget(nodes)) {
       return true;
     }
@@ -740,5 +740,8 @@ size_t NewFgSeenGeneration() {
   return ++fg_seen_generation;
 }
 
+// Implement TopoSort api.
+std::vector<AnfNodePtr> api::FuncGraph::TopoSort(const AnfNodePtr &node) { return mindspore::TopoSort(node); }
+
 const PrimitivePtr FuncGraphTransform::func_graph_prim_ = std::make_shared<Primitive>("FuncGraph");
 }  // namespace mindspore
diff --git a/mindspore/core/ir/func_graph_cloner.cc b/mindspore/core/ir/func_graph_cloner.cc
index b036672f55c..261d90e8775 100644
--- a/mindspore/core/ir/func_graph_cloner.cc
+++ b/mindspore/core/ir/func_graph_cloner.cc
@@ -758,13 +758,9 @@ FuncGraphPtr TransformableClone(const FuncGraphPtr &func_graph, const TraceInfoP
   for (auto &item : func_graph->parameter_default_value()) {
     new_func_graph->set_param_default_value(item.first, cloner[item.second]);
   }
-
-  if (MsContext::GetInstance()->get_param<bool>(MS_CTX_IS_MULTI_GRAPH_SINK)) {
-    if (func_graph->has_flag(FUNC_GRAPH_FLAG_IGNORE_VALUES)) {
-      new_func_graph->set_flag(FUNC_GRAPH_FLAG_IGNORE_VALUES, true);
-    }
+  if (func_graph->has_flag(FUNC_GRAPH_FLAG_IGNORE_VALUES)) {
+    new_func_graph->set_flag(FUNC_GRAPH_FLAG_IGNORE_VALUES, true);
   }
-
   if (func_graph->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)) {
     new_func_graph->set_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL, func_graph->get_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL));
   }
diff --git a/mindspore/core/ir/meta_tensor.h b/mindspore/core/ir/meta_tensor.h
index f542baca869..96c860855c1 100644
--- a/mindspore/core/ir/meta_tensor.h
+++ b/mindspore/core/ir/meta_tensor.h
@@ -55,7 +55,7 @@ struct DeviceInfo {
 //
 // Includes the metadata information of a tensor, such as data type, shape
 // and so on. But it does not contain values of a tensor.
-class MetaTensor : public Value {
+class MS_CORE_API MetaTensor : public Value {
  public:
   // Construction
   MetaTensor();
diff --git a/mindspore/core/ir/named.h b/mindspore/core/ir/named.h
index 041bef12b05..62855a502df 100644
--- a/mindspore/core/ir/named.h
+++ b/mindspore/core/ir/named.h
@@ -24,7 +24,7 @@
 #include "ir/anf.h"
 
 namespace mindspore {
-class Named : public Value {
+class MS_CORE_API Named : public Value {
  public:
   explicit Named(const std::string &name) : name_(name) { hash_id_ = std::hash<std::string>{}(name); }
   Named(const Named &other) : Value(other) {
@@ -62,14 +62,14 @@ class Named : public Value {
 };
 using NamedPtr = std::shared_ptr<Named>;
 
-struct NamedHasher {
+struct MS_CORE_API NamedHasher {
   std::size_t operator()(NamedPtr const &name) const {
     std::size_t hash = name->Hash();
     return hash;
   }
 };
 
-struct NamedEqual {
+struct MS_CORE_API NamedEqual {
   bool operator()(NamedPtr const &t1, NamedPtr const &t2) const {
     MS_EXCEPTION_IF_NULL(t1);
     MS_EXCEPTION_IF_NULL(t2);
@@ -77,31 +77,31 @@ struct NamedEqual {
   }
 };
 
-class None : public Named {
+class MS_CORE_API None : public Named {
  public:
   None() : Named("None") {}
   ~None() override = default;
   MS_DECLARE_PARENT(None, Named);
   abstract::AbstractBasePtr ToAbstract() override;
 };
-extern const NamedPtr kNone;
+MS_CORE_API extern const NamedPtr kNone;
 
-class Null : public Named {
+class MS_CORE_API Null : public Named {
  public:
   Null() : Named("Null") {}
   ~Null() override = default;
   MS_DECLARE_PARENT(Null, Named);
   abstract::AbstractBasePtr ToAbstract() override;
 };
-extern const NamedPtr kNull;
+MS_CORE_API extern const NamedPtr kNull;
 
-class Ellipsis : public Named {
+class MS_CORE_API Ellipsis : public Named {
  public:
   Ellipsis() : Named("Ellipsis") {}
   ~Ellipsis() override = default;
   MS_DECLARE_PARENT(Ellipsis, Named);
   abstract::AbstractBasePtr ToAbstract() override;
 };
-extern const NamedPtr kEllipsis;
+MS_CORE_API extern const NamedPtr kEllipsis;
 }  // namespace mindspore
 #endif  // MINDSPORE_CORE_IR_NAMED_H_
diff --git a/mindspore/core/ir/primitive.h b/mindspore/core/ir/primitive.h
index d875fe53eb3..c1d47d20fac 100644
--- a/mindspore/core/ir/primitive.h
+++ b/mindspore/core/ir/primitive.h
@@ -38,7 +38,7 @@ enum PrimType {
   kPrimTypePyCheck  // Primitive operator with input args checking method
 };
 
-class Primitive : public Named {
+class MS_CORE_API Primitive : public Named {
  public:
   explicit Primitive(const std::string &name, const bool is_base = true, const PrimType prim_type = kPrimTypeBuiltIn);
   Primitive(const std::string &name, const std::unordered_map<std::string, ValuePtr> &attrs);
@@ -142,7 +142,7 @@ inline std::ostream &operator<<(std::ostream &os, const PrimitivePtr &p) {
   return os;
 }
 
-struct PrimitiveEqual {
+struct MS_CORE_API PrimitiveEqual {
   bool operator()(PrimitivePtr const &t1, PrimitivePtr const &t2) const {
     MS_EXCEPTION_IF_NULL(t1);
     MS_EXCEPTION_IF_NULL(t2);
@@ -150,14 +150,14 @@ struct PrimitiveEqual {
   }
 };
 
-struct PrimitiveHasher {
+struct MS_CORE_API PrimitiveHasher {
   std::size_t operator()(PrimitivePtr const &prim) const {
     MS_EXCEPTION_IF_NULL(prim);
     return prim->Hash();
   }
 };
 
-struct PrimitiveTotalEqual {
+struct MS_CORE_API PrimitiveTotalEqual {
   bool operator()(PrimitivePtr const &t1, PrimitivePtr const &t2) const {
     MS_EXCEPTION_IF_NULL(t1);
     MS_EXCEPTION_IF_NULL(t2);
diff --git a/mindspore/core/ir/scalar.h b/mindspore/core/ir/scalar.h
index 7d76bcc1c51..200b3664977 100644
--- a/mindspore/core/ir/scalar.h
+++ b/mindspore/core/ir/scalar.h
@@ -35,7 +35,7 @@
 using std::fabs;
 
 namespace mindspore {
-class Scalar : public Value {
+class MS_CORE_API Scalar : public Value {
  public:
   Scalar() = default;
   explicit Scalar(const TypePtr t) : Value(t) {}
@@ -50,7 +50,7 @@ class Scalar : public Value {
 };
 using ScalarPtr = std::shared_ptr<Scalar>;
 
-class BoolImm : public Scalar {
+class MS_CORE_API BoolImm : public Scalar {
  public:
   explicit BoolImm(bool b) : Scalar(kBool), v_(b) { hash_ = hash_combine({tid(), std::hash<bool>{}(v_)}); }
   ~BoolImm() override = default;
@@ -81,7 +81,7 @@ class BoolImm : public Scalar {
 using BoolImmPtr = std::shared_ptr<BoolImm>;
 IMM_TRAITS(BoolImmPtr, bool)
 
-class IntergerImm : public Scalar {
+class MS_CORE_API IntergerImm : public Scalar {
  public:
   IntergerImm() = default;
   explicit IntergerImm(const TypePtr &t) : Scalar(t) {}
@@ -89,7 +89,7 @@ class IntergerImm : public Scalar {
   MS_DECLARE_PARENT(IntergerImm, Scalar)
 };
 
-class Int8Imm : public IntergerImm {
+class MS_CORE_API Int8Imm : public IntergerImm {
  public:
   Int8Imm() : IntergerImm(kInt8), v_(0) {}
   explicit Int8Imm(int8_t v) : IntergerImm(kInt8), v_(v) { hash_ = hash_combine({tid(), std::hash<int>{}(v_)}); }
@@ -115,7 +115,7 @@ class Int8Imm : public IntergerImm {
 using Int8ImmPtr = std::shared_ptr<Int8Imm>;
 IMM_TRAITS(Int8ImmPtr, int8_t)
 
-class Int16Imm : public IntergerImm {
+class MS_CORE_API Int16Imm : public IntergerImm {
  public:
   Int16Imm() : IntergerImm(kInt16), v_(0) {}
   explicit Int16Imm(int16_t v) : IntergerImm(kInt16), v_(v) { hash_ = hash_combine({tid(), std::hash<int>{}(v_)}); }
@@ -141,7 +141,7 @@ class Int16Imm : public IntergerImm {
 using Int16ImmPtr = std::shared_ptr<Int16Imm>;
 IMM_TRAITS(Int16ImmPtr, int16_t)
 
-class Int32Imm : public IntergerImm {
+class MS_CORE_API Int32Imm : public IntergerImm {
  public:
   Int32Imm() : IntergerImm(kInt32), v_(0) {}
   explicit Int32Imm(int v) : IntergerImm(kInt32), v_(v) { hash_ = hash_combine({tid(), std::hash<int>{}(v_)}); }
@@ -167,7 +167,7 @@ class Int32Imm : public IntergerImm {
 using Int32ImmPtr = std::shared_ptr<Int32Imm>;
 IMM_TRAITS(Int32ImmPtr, int32_t)
 
-class Int64Imm : public IntergerImm {
+class MS_CORE_API Int64Imm : public IntergerImm {
  public:
   Int64Imm() : IntergerImm(kInt64), v_(0) {}
   explicit Int64Imm(int64_t v) : IntergerImm(kInt64), v_(v) { hash_ = hash_combine({tid(), std::hash<int64_t>{}(v_)}); }
@@ -193,7 +193,7 @@ class Int64Imm : public IntergerImm {
 using Int64ImmPtr = std::shared_ptr<Int64Imm>;
 IMM_TRAITS(Int64ImmPtr, int64_t)
 
-class UInt8Imm : public IntergerImm {
+class MS_CORE_API UInt8Imm : public IntergerImm {
  public:
   UInt8Imm() : IntergerImm(kUInt8), v_(0) {}
   explicit UInt8Imm(uint8_t v) : IntergerImm(kUInt8), v_(v) {
@@ -221,7 +221,7 @@ class UInt8Imm : public IntergerImm {
 using UInt8ImmPtr = std::shared_ptr<UInt8Imm>;
 IMM_TRAITS(UInt8ImmPtr, uint8_t);
 
-class UInt16Imm : public IntergerImm {
+class MS_CORE_API UInt16Imm : public IntergerImm {
  public:
   UInt16Imm() : IntergerImm(kUInt16), v_(0) {}
   explicit UInt16Imm(uint16_t v) : IntergerImm(kUInt16), v_(v) {
@@ -249,7 +249,7 @@ class UInt16Imm : public IntergerImm {
 using UInt16ImmPtr = std::shared_ptr<UInt16Imm>;
 IMM_TRAITS(UInt16ImmPtr, uint16_t);
 
-class UInt32Imm : public IntergerImm {
+class MS_CORE_API UInt32Imm : public IntergerImm {
  public:
   UInt32Imm() : IntergerImm(kUInt32), v_(0) {}
   explicit UInt32Imm(uint32_t v) : IntergerImm(kUInt32), v_(v) {
@@ -277,7 +277,7 @@ class UInt32Imm : public IntergerImm {
 using UInt32ImmPtr = std::shared_ptr<UInt32Imm>;
 IMM_TRAITS(UInt32ImmPtr, uint32_t);
 
-class UInt64Imm : public IntergerImm {
+class MS_CORE_API UInt64Imm : public IntergerImm {
  public:
   UInt64Imm() : IntergerImm(kUInt64), v_(0) {}
   explicit UInt64Imm(uint64_t v) : IntergerImm(kUInt64), v_(v) {
@@ -305,7 +305,7 @@ class UInt64Imm : public IntergerImm {
 using UInt64ImmPtr = std::shared_ptr<UInt64Imm>;
 IMM_TRAITS(UInt64ImmPtr, uint64_t);
 
-class FloatImm : public Scalar {
+class MS_CORE_API FloatImm : public Scalar {
  public:
   FloatImm() = default;
   explicit FloatImm(const TypePtr &t) : Scalar(t) {}
@@ -314,7 +314,7 @@ class FloatImm : public Scalar {
 };
 using FloatImmPtr = std::shared_ptr<FloatImm>;
 
-class FP32Imm : public FloatImm {
+class MS_CORE_API FP32Imm : public FloatImm {
  public:
   FP32Imm() : FloatImm(kFloat32), v_(0.0) {}
   explicit FP32Imm(float v) : FloatImm(kFloat32), v_(v) { hash_ = hash_combine({tid(), std::hash<float>{}(v_)}); }
@@ -340,7 +340,7 @@ class FP32Imm : public FloatImm {
 using FP32ImmPtr = std::shared_ptr<FP32Imm>;
 IMM_TRAITS(FP32ImmPtr, float)
 
-class FP64Imm : public FloatImm {
+class MS_CORE_API FP64Imm : public FloatImm {
  public:
   FP64Imm() : FloatImm(kFloat64), v_(0.0) {}
   explicit FP64Imm(double v) : FloatImm(kFloat64), v_(v) { hash_ = hash_combine({tid(), std::hash<double>{}(v_)}); }
diff --git a/mindspore/core/ir/scope.h b/mindspore/core/ir/scope.h
index c66949867d5..5e0302770ac 100644
--- a/mindspore/core/ir/scope.h
+++ b/mindspore/core/ir/scope.h
@@ -19,6 +19,7 @@
 #include <string>
 #include <memory>
 #include <stack>
+
 namespace mindspore {
 class Scope;
 using ScopePtr = std::shared_ptr<Scope>;
diff --git a/mindspore/core/ir/tensor.cc b/mindspore/core/ir/tensor.cc
index 84ad5cf3dbf..ef116a4f753 100644
--- a/mindspore/core/ir/tensor.cc
+++ b/mindspore/core/ir/tensor.cc
@@ -31,6 +31,7 @@
 
 #include "abstract/utils.h"
 #include "abstract/abstract_value.h"
+#include "base/complex_storage.h"
 
 namespace mindspore {
 namespace tensor {
@@ -73,7 +74,10 @@ std::unique_ptr<T[]> NewData(const U *input, size_t size) {
     return nullptr;
   }
   auto data = std::make_unique<T[]>(size);
-  if constexpr (!std::is_same<T, U>::value && (std::is_same<T, float16>::value || std::is_same<U, float16>::value)) {
+  if constexpr (!std::is_same<T, U>::value &&
+                (std::is_same<T, float16>::value || std::is_same<U, float16>::value ||
+                 std::is_same<T, ComplexStorage<float>>::value || std::is_same<U, ComplexStorage<float>>::value ||
+                 std::is_same<T, ComplexStorage<double>>::value || std::is_same<U, ComplexStorage<double>>::value)) {
     // Because float16 do not support implicit cast from/to other types,
     // We can not use std::copy() on array of float16, use a loop here.
     for (size_t i = 0; i < size; ++i) {
@@ -146,7 +150,11 @@ std::unique_ptr<T[]> CopyData(const ShapeVector &shape, void *const data, TypeId
       return NewData<T>(buf, size);
     }
     case kNumberTypeComplex64: {
-      auto buf = static_cast<double *>(data);
+      auto buf = static_cast<ComplexStorage<float> *>(data);
+      return NewData<T>(buf, size);
+    }
+    case kNumberTypeComplex128: {
+      auto buf = static_cast<ComplexStorage<double> *>(data);
       return NewData<T>(buf, size);
     }
     case kObjectTypeString: {
@@ -233,7 +241,8 @@ class TensorDataImpl : public TensorData {
       std::is_same<T, bool>::value || std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value ||
       std::is_same<T, int16_t>::value || std::is_same<T, int32_t>::value || std::is_same<T, int64_t>::value ||
       std::is_same<T, uint16_t>::value || std::is_same<T, uint32_t>::value || std::is_same<T, uint64_t>::value ||
-      std::is_same<T, float16>::value || std::is_same<T, float>::value || std::is_same<T, double>::value;
+      std::is_same<T, float16>::value || std::is_same<T, float>::value || std::is_same<T, double>::value ||
+      std::is_same<T, ComplexStorage<float>>::value || std::is_same<T, ComplexStorage<double>>::value;
     static_assert(valid, "Type is invalid");
     if (data_size_ == 0) {
       return "";
@@ -302,10 +311,14 @@ class TensorDataImpl : public TensorData {
     constexpr auto isBool = std::is_same<T, bool>::value;
     constexpr auto isFloat =
       std::is_same<T, float16>::value || std::is_same<T, float>::value || std::is_same<T, double>::value;
+    constexpr auto isComplex =
+      std::is_same<T, ComplexStorage<float>>::value || std::is_same<T, ComplexStorage<double>>::value;
     constexpr int linefeedThreshold = isFloat ? kThreshold1DFloat : (isBool ? kThreshold1DBool : kThreshold1DInt);
     for (ssize_t i = start; i < end && (cursor + i) < static_cast<ssize_t>(data_size_); i++) {
       const auto value = data_[cursor + i];
-      if constexpr (isFloat) {
+      if constexpr (isComplex) {
+        ss << value;
+      } else if constexpr (isFloat) {
         OutputFloatDataString(ss, isScalar, value);
       } else if (isBool) {
         OutputBoolDataString(ss, isScalar, value);
@@ -458,7 +471,9 @@ TensorDataPtr MakeTensorData(TypeId data_type, const ShapeVector &shape, const A
     case kNumberTypeFloat64:
       return std::make_shared<TensorDataImpl<double>>(shape, args...);
     case kNumberTypeComplex64:
-      return std::make_shared<TensorDataImpl<double>>(shape, args...);
+      return std::make_shared<TensorDataImpl<ComplexStorage<float>>>(shape, args...);
+    case kNumberTypeComplex128:
+      return std::make_shared<TensorDataImpl<ComplexStorage<double>>>(shape, args...);
     case kObjectTypeString:
       return std::make_shared<TensorDataImpl<uint8_t>>(shape, args...);
     case kObjectTypeTensorType:
diff --git a/mindspore/core/ir/tensor.h b/mindspore/core/ir/tensor.h
index b94757ea403..51241bf4d61 100644
--- a/mindspore/core/ir/tensor.h
+++ b/mindspore/core/ir/tensor.h
@@ -42,7 +42,7 @@ enum TensorSyncStatus { kNoNeedSync, kNeedSyncHostToDevice, kNeedSyncDeviceToHos
 // A sub namespace in ME to support tensor related definition.
 namespace tensor {
 // Tensor data interface.
-class TensorData {
+class MS_CORE_API TensorData {
  public:
   /// virtual destructor is required for base classes.
   virtual ~TensorData() = default;
@@ -111,7 +111,7 @@ class WaitEvent : public ExceptionListener {
 };
 
 // Tensor entity class
-class Tensor : public MetaTensor {
+class MS_CORE_API Tensor : public MetaTensor {
  public:
   abstract::AbstractBasePtr ToAbstract() override;
 
@@ -286,10 +286,13 @@ class Tensor : public MetaTensor {
   void set_init_flag(bool flag) { init_flag_ = flag; }
 
   DeviceSyncPtr device_address() const { return device_sync_; }
-  void set_device_address(const DeviceSyncPtr &device_sync) {
+  // If need_update_ref_count is true, the device address cannot be released and reused,
+  // so the feature map should set false when set device address of tensor.
+  void set_device_address(const DeviceSyncPtr &device_sync, bool need_update_ref_count = true) {
     device_sync_ = device_sync;
-    // To support the old and new runtime coexistence.
-    if (device_sync_ != nullptr) {
+    // To support the old and new runtime coexistence, the output of old runtime may be the input of new runtime, so the
+    // device address cannot be released through ref count and set max ref count in this scenario.
+    if (need_update_ref_count && (device_sync_ != nullptr)) {
       device_sync_->set_original_ref_count(SIZE_MAX);
       device_sync_->ResetRefCount();
     }
diff --git a/mindspore/core/ir/value.h b/mindspore/core/ir/value.h
index c2db08c7a0d..4da4474b008 100644
--- a/mindspore/core/ir/value.h
+++ b/mindspore/core/ir/value.h
@@ -34,7 +34,7 @@
 #include "utils/ms_utils.h"
 
 namespace mindspore {
-class ValueSequeue : public Value {
+class MS_CORE_API ValueSequeue : public Value {
  public:
   explicit ValueSequeue(const ValuePtrList &elements) : elements_(elements) {
     TypePtrList t_list;
@@ -69,7 +69,7 @@ class ValueSequeue : public Value {
 };
 using ValueSequeuePtr = std::shared_ptr<ValueSequeue>;
 
-class ValueTuple : public ValueSequeue {
+class MS_CORE_API ValueTuple : public ValueSequeue {
  public:
   explicit ValueTuple(const std::vector<ValuePtr> &elements) : ValueSequeue(elements) {}
   ValueTuple(const std::initializer_list<ValuePtr> &elements) : ValueSequeue(elements) {}
@@ -82,7 +82,7 @@ class ValueTuple : public ValueSequeue {
 };
 using ValueTuplePtr = std::shared_ptr<ValueTuple>;
 
-class ValueList : public ValueSequeue {
+class MS_CORE_API ValueList : public ValueSequeue {
  public:
   explicit ValueList(const std::vector<ValuePtr> &elements) : ValueSequeue(elements) {}
   ValueList(const std::initializer_list<ValuePtr> &elements) : ValueSequeue(elements) {}
@@ -110,7 +110,7 @@ ValuePtr MakeValue(const T &vec) {
   return std::make_shared<ValueTuple>(list);
 }
 
-class ValueSlice : public Value {
+class MS_CORE_API ValueSlice : public Value {
  public:
   ValueSlice(const ValuePtr &start, const ValuePtr &stop, const ValuePtr &step)
       : start_(start), stop_(stop), step_(step) {}
@@ -135,7 +135,7 @@ class ValueSlice : public Value {
 };
 using ValueSlicePtr = std::shared_ptr<ValueSlice>;
 
-class KeywordArg : public Value {
+class MS_CORE_API KeywordArg : public Value {
  public:
   KeywordArg(const std::string &key, const ValuePtr &value) : key_(key), value_(value) {}
   ~KeywordArg() override = default;
@@ -156,7 +156,7 @@ class KeywordArg : public Value {
 };
 using KeywordArgPtr = std::shared_ptr<KeywordArg>;
 
-class ValueDictionary : public Value {
+class MS_CORE_API ValueDictionary : public Value {
  public:
   explicit ValueDictionary(const std::vector<std::pair<std::string, ValuePtr>> &key_values) : key_values_(key_values) {}
   ~ValueDictionary() override = default;
@@ -197,7 +197,7 @@ class ValueDictionary : public Value {
 };
 using ValueDictionaryPtr = std::shared_ptr<ValueDictionary>;
 
-class StringImm : public Value {
+class MS_CORE_API StringImm : public Value {
  public:
   explicit StringImm(const std::string &str) : Value(kString), str_(str), hash_(std::hash<std::string>{}(str_)) {}
 
@@ -224,7 +224,7 @@ using StringImmPtr = std::shared_ptr<StringImm>;
 IMM_TRAITS(StringImmPtr, std::string)
 IMM_TRAITS(StringImmPtr, const char *)
 
-class RefKey : public Named {
+class MS_CORE_API RefKey : public Named {
  public:
   explicit RefKey(const std::string &tag) : Named(tag) {}
 
@@ -242,7 +242,7 @@ class RefKey : public Named {
 };
 using RefKeyPtr = std::shared_ptr<RefKey>;
 
-class AnyValue : public Value {
+class MS_CORE_API AnyValue : public Value {
  public:
   AnyValue() = default;
   ~AnyValue() override = default;
@@ -253,7 +253,7 @@ class AnyValue : public Value {
 };
 extern const ValuePtr kAnyValue;
 
-class Monad : public Value {
+class MS_CORE_API Monad : public Value {
  public:
   ~Monad() override = default;
   MS_DECLARE_PARENT(Monad, Value)
@@ -263,7 +263,7 @@ class Monad : public Value {
   explicit Monad(TypePtr type) : Value(type) {}
 };
 
-class UMonad : public Monad {
+class MS_CORE_API UMonad : public Monad {
  public:
   UMonad() : Monad(kUMonadType) {}
   ~UMonad() override = default;
@@ -276,7 +276,7 @@ class UMonad : public Monad {
 using UMonadPtr = std::shared_ptr<UMonad>;
 extern const ValuePtr kUMonad;
 
-class IOMonad : public Monad {
+class MS_CORE_API IOMonad : public Monad {
  public:
   IOMonad() : Monad(kIOMonadType) {}
   ~IOMonad() override = default;
diff --git a/mindspore/core/load_mindir/anf_model_parser.cc b/mindspore/core/load_mindir/anf_model_parser.cc
index c38868d0d42..bfb6c6576fe 100644
--- a/mindspore/core/load_mindir/anf_model_parser.cc
+++ b/mindspore/core/load_mindir/anf_model_parser.cc
@@ -307,14 +307,16 @@ bool MSANFModelParser::BuildInputForFuncGraph(const ParameterPtr &node, const mi
   node->set_debug_info(debug_info_ptr);
   node->set_name(debug_info_name);
 
+  // Set abstract of the parameter
   if (value_proto.tensor_size() > 0) {
     const mind_ir::TensorProto &tensor_proto = value_proto.tensor(0);
     tensor::TensorPtr tensor_info = BuildTensorInfoForFuncGraph(tensor_proto);
     MS_EXCEPTION_IF_NULL(tensor_info);
     auto tensor_abstract = tensor_info->ToAbstract();
     node->set_abstract(tensor_abstract);
+  } else if (value_proto.has_denotation()) {
+    MS_LOG(DEBUG) << "Not tensor. parameter type: " << value_proto.denotation();
   }
-
   anfnode_build_map_[value_proto.name()] = node;
   return true;
 }
@@ -493,6 +495,7 @@ bool MSANFModelParser::ObtainCNodeAttrInTensorForm(const PrimitivePtr &prim,
     shape.push_back(attr_tensor.dims(i));
   }
   tensor::TensorPtr tensor_info = std::make_shared<tensor::Tensor>(kDefaultValueSwitchMap[attr_tensor_type], shape);
+  MS_EXCEPTION_IF_NULL(tensor_info);
   const std::string &tensor_buf = attr_tensor.raw_data();
   auto *tensor_data_buf = reinterpret_cast<uint8_t *>(tensor_info->data_c());
   auto ret = memcpy_s(tensor_data_buf, tensor_info->data().nbytes(), tensor_buf.data(), tensor_buf.size());
@@ -570,6 +573,7 @@ bool MSANFModelParser::ObtainValueNodeInTensorForm(const std::string &value_node
     shape.push_back(attr_tensor.dims(i));
   }
   tensor::TensorPtr tensor_info = std::make_shared<tensor::Tensor>(kDefaultValueSwitchMap[attr_tensor_type], shape);
+  MS_EXCEPTION_IF_NULL(tensor_info);
   const std::string &tensor_buf = attr_tensor.raw_data();
   auto *tensor_data_buf = reinterpret_cast<uint8_t *>(tensor_info->data_c());
   auto ret = memcpy_s(tensor_data_buf, tensor_info->data().nbytes(), tensor_buf.data(), tensor_buf.size());
@@ -774,11 +778,11 @@ AnfNodePtr MSANFModelParser::BuildOperatorNode(const mind_ir::NodeProto &node_pr
   // Operator maybe CNode,FuncGraph or Parameter.
 
   if (node_type.size() > kOpTypeFlagSize && node_type.substr(0, kOpTypeFlagSize) == kOperatorTypeFlag) {
-    auto it = anfnode_build_map_.find(node_type.substr(kOpTypeFlagSize));
-    if (it != anfnode_build_map_.end()) {
-      return it->second;
+    auto anfNode = GetAnfNode(node_type.substr(kOpTypeFlagSize));
+    if (anfNode == nullptr) {
+      MS_LOG(EXCEPTION) << "Can't find the ref:" << node_type;
     }
-    MS_LOG(EXCEPTION) << "Can't find the ref:" << node_type;
+    return anfNode;
   }
 
   // Operator is  primitive.
@@ -790,9 +794,12 @@ AnfNodePtr MSANFModelParser::BuildOperatorNode(const mind_ir::NodeProto &node_pr
     if (node_type.compare(0, strlen(kDoSignaturePrimitivePrefix), kDoSignaturePrimitivePrefix) == 0) {
       auto op_name = node_type.substr(strlen(kDoSignaturePrimitivePrefix));
       prim = std::make_shared<prim::DoSignaturePrimitive>(op_name, std::make_shared<Primitive>(op_name));
+      MS_EXCEPTION_IF_NULL(prim);
       prim->set_instance_name(op_name);
     } else {
+      MS_LOG(DEBUG) << "Special node_type: " << node_type;
       prim = std::make_shared<Primitive>(node_type);
+      MS_EXCEPTION_IF_NULL(prim);
       prim->set_instance_name(node_type);
     }
   }
@@ -824,9 +831,10 @@ void MSANFModelParser::SetCNodeAbastract(const mind_ir::NodeProto &node_proto, C
     cnode_ptr->set_abstract(nullptr);
     return;
   }
-  // Set abstract of switch(c,f,t)() to null
-  prim = GetCNodePrimitive(operatorPtr);
-  if (IsPrimitiveEquals(prim::kPrimSwitch, prim) || IsPrimitiveEquals(prim::kPrimSwitchLayer, prim)) {
+
+  // If the operator is not a primitive, the abstract will been set to null.
+  // Because there are not some operators in front end, the abstract of primitive should be reserved.
+  if (prim == nullptr) {
     cnode_ptr->set_abstract(nullptr);
     return;
   }
@@ -894,12 +902,12 @@ CNodePtr MSANFModelParser::BuildCNodeForFuncGraph(const FuncGraphPtr &outputFunc
   std::vector<AnfNodePtr> inputs;
   inputs.push_back(BuildOperatorNode(node_proto));
   for (int i = 0; i < node_proto.input_size(); ++i) {
-    const std::string &input_name = node_proto.input(i);
-    if (anfnode_build_map_.find(input_name) == anfnode_build_map_.end()) {
-      MS_LOG(ERROR) << node_name << " input " << i << input_name << "can't find in nodes have parsed";
+    auto anfNode = GetAnfNode(node_proto.input(i));
+    if (anfNode == nullptr) {
+      MS_LOG(ERROR) << node_name << " input " << i << node_proto.input(i) << "can't find in nodes have parsed";
       return nullptr;
     }
-    inputs.push_back(anfnode_build_map_[input_name]);
+    inputs.push_back(anfNode);
   }
 
   CNodePtr cnode_ptr = outputFuncGraph->NewCNode(inputs);
@@ -920,9 +928,8 @@ CNodePtr MSANFModelParser::BuildCNodeForFuncGraph(const FuncGraphPtr &outputFunc
 }
 
 bool MSANFModelParser::BuildReturnForFuncGraph(const FuncGraphPtr &outputFuncGraph,
-                                               const mind_ir::GraphProto &importProto, const CNodePtr &cnode_ptr) {
+                                               const mind_ir::GraphProto &importProto) {
   MS_EXCEPTION_IF_NULL(outputFuncGraph);
-  MS_EXCEPTION_IF_NULL(cnode_ptr);
   if (importProto.output_size() < 0 || importProto.output_size() > INT_MAX) {
     MS_LOG(ERROR) << "importProto.output_size is : " << importProto.output_size();
     return false;
@@ -935,10 +942,16 @@ bool MSANFModelParser::BuildReturnForFuncGraph(const FuncGraphPtr &outputFuncGra
     for (int out_size = 0; out_size < importProto.output_size(); ++out_size) {
       const mind_ir::ValueInfoProto &output_node = importProto.output(out_size);
       const std::string &out_tuple = output_node.name();
-      inputs.push_back(anfnode_build_map_[out_tuple]);
-      elem.push_back(anfnode_build_map_[out_tuple]->abstract());
+      auto anfNode = GetAnfNode(out_tuple);
+      if (anfNode == nullptr) {
+        MS_LOG(ERROR) << "Miss return node: " << out_tuple;
+        return false;
+      }
+      inputs.push_back(anfNode);
+      elem.push_back(anfNode->abstract());
     }
     auto maketuple_ptr = outputFuncGraph->NewCNode(inputs);
+    MS_EXCEPTION_IF_NULL(maketuple_ptr);
     maketuple_ptr->set_abstract(std::make_shared<abstract::AbstractTuple>(elem));
     inputs.clear();
     inputs.push_back(NewValueNode(prim::kPrimReturn));
@@ -947,16 +960,22 @@ bool MSANFModelParser::BuildReturnForFuncGraph(const FuncGraphPtr &outputFuncGra
     MS_EXCEPTION_IF_NULL(return_node);
     return_node->set_load_flag(true);
     outputFuncGraph->set_return(return_node);
-    MS_LOG(INFO) << "Construct funcgraph finined, all success.";
+    MS_LOG(DEBUG) << "Construct funcgraph finined, all success.";
   } else {
     inputs.clear();
     inputs.push_back(NewValueNode(prim::kPrimReturn));
-    inputs.push_back(cnode_ptr);
+    auto nodeName = importProto.output(0).name();
+    auto anfNode = GetAnfNode(nodeName);
+    if (anfNode == nullptr) {
+      MS_LOG(ERROR) << "Miss return node: " << nodeName;
+      return false;
+    }
+    inputs.push_back(anfNode);
     auto return_node = outputFuncGraph->NewCNode(inputs);
     MS_EXCEPTION_IF_NULL(return_node);
     return_node->set_load_flag(true);
     outputFuncGraph->set_return(return_node);
-    MS_LOG(INFO) << "Construct funcgraph finined, all success!";
+    MS_LOG(DEBUG) << "Construct funcgraph finined, all success!";
   }
   return true;
 }
@@ -968,7 +987,7 @@ bool MSANFModelParser::ImportNodesForGraph(const FuncGraphPtr &outputFuncGraph,
     MS_LOG(ERROR) << "importProto.node_size is : " << importProto.node_size();
     return false;
   }
-  MS_LOG(INFO) << "The CNdoe size : " << importProto.node_size();
+  MS_LOG(DEBUG) << "The node size : " << importProto.node_size();
   CNodePtr cnode_ptr = nullptr;
   for (int i = 0; i < importProto.node_size(); ++i) {
     const mind_ir::NodeProto &node_proto = importProto.node(i);
@@ -987,8 +1006,7 @@ bool MSANFModelParser::ImportNodesForGraph(const FuncGraphPtr &outputFuncGraph,
     }
   }
 
-  BuildReturnForFuncGraph(outputFuncGraph, importProto, cnode_ptr);
-  return true;
+  return BuildReturnForFuncGraph(outputFuncGraph, importProto);
 }
 
 bool MSANFModelParser::BuildFuncGraph(const FuncGraphPtr &outputFuncGraph, const mind_ir::GraphProto &importProto) {
@@ -1079,4 +1097,17 @@ FuncGraphPtr MSANFModelParser::Parse(const mind_ir::ModelProto &model_proto) {
   anfnode_build_map_.clear();
   return dstGraph;
 }
+
+AnfNodePtr MSANFModelParser::GetAnfNode(const std::string &node_name) {
+  auto it = anfnode_build_map_.find(node_name);
+  if (it == anfnode_build_map_.end()) {
+    return nullptr;
+  }
+  FuncGraphPtr func_graph_ptr = GetValueNode<FuncGraphPtr>(it->second);
+  if (func_graph_ptr) {
+    return NewValueNode(func_graph_ptr);
+  } else {
+    return it->second;
+  }
+}
 }  // namespace mindspore
diff --git a/mindspore/core/load_mindir/anf_model_parser.h b/mindspore/core/load_mindir/anf_model_parser.h
index dffc78deeff..abc92c0958f 100644
--- a/mindspore/core/load_mindir/anf_model_parser.h
+++ b/mindspore/core/load_mindir/anf_model_parser.h
@@ -52,8 +52,7 @@ class MSANFModelParser {
   bool BuildInputForFuncGraph(const ParameterPtr &node, const mind_ir::ValueInfoProto &value_proto);
   tensor::TensorPtr BuildTensorInfoForFuncGraph(const mind_ir::TensorProto &tensor_proto);
   CNodePtr BuildCNodeForFuncGraph(const FuncGraphPtr &outputFuncGraph, const mind_ir::NodeProto &node_proto);
-  bool BuildReturnForFuncGraph(const FuncGraphPtr &outputFuncGraph, const mind_ir::GraphProto &importProto,
-                               const CNodePtr &cnode_ptr);
+  bool BuildReturnForFuncGraph(const FuncGraphPtr &outputFuncGraph, const mind_ir::GraphProto &importProto);
   bool GetAttrValueForCNode(const PrimitivePtr &prim, const mind_ir::AttributeProto &attr_proto);
   bool ObtainCNodeAttrInTypeForm(const PrimitivePtr &prim, const mind_ir::AttributeProto &attr_proto);
   void ObtainCNodeAttrInScalarForm(const mind_ir::AttributeProto &attr_proto,
@@ -72,6 +71,7 @@ class MSANFModelParser {
   bool ObtainValueNodeInMonadForm(const std::string &value_node_name, const mind_ir::AttributeProto &attr_proto);
   std::unordered_map<std::string, abstract::AbstractBasePtr> GetAbstractForCNode(
     const mind_ir::AttributeProto &attr_proto);
+  AnfNodePtr GetAnfNode(const std::string &node_name);
 
   std::string producer_name_;
   std::string model_version_;
diff --git a/mindspore/core/mindrt/CMakeLists.txt b/mindspore/core/mindrt/CMakeLists.txt
index 1a966e89ee5..f2c9b455eed 100644
--- a/mindspore/core/mindrt/CMakeLists.txt
+++ b/mindspore/core/mindrt/CMakeLists.txt
@@ -11,4 +11,8 @@ file(GLOB MINDRT_SRC
     ${CMAKE_CURRENT_SOURCE_DIR}/src/thread/*.cc
     )
 
+if(CMAKE_SYSTEM_NAME MATCHES "Windows")
+    add_compile_definitions(BUILDING_DLL)
+endif()
+
 add_library(mindrt_mid OBJECT ${MINDRT_SRC})
diff --git a/mindspore/core/mindrt/src/actor/actormgr.cc b/mindspore/core/mindrt/src/actor/actormgr.cc
index 4c28eea3de4..c5c19f6c29d 100644
--- a/mindspore/core/mindrt/src/actor/actormgr.cc
+++ b/mindspore/core/mindrt/src/actor/actormgr.cc
@@ -44,9 +44,14 @@ ActorMgr::ActorMgr() : actors(), procotols(), urls() {
   urls.clear();
 }
 
-ActorMgr::~ActorMgr() {}
+ActorMgr::~ActorMgr() {
+  if (inner_pool_ != nullptr) {
+    delete inner_pool_;
+    inner_pool_ = nullptr;
+  }
+}
 
-void ActorMgr::Initialize(bool use_inner_pool, size_t thread_num) {
+void ActorMgr::Initialize(bool use_inner_pool, size_t actor_thread_num, size_t max_thread_num) {
   bool expected = false;
   if (!initialized_.compare_exchange_strong(expected, true)) {
     MS_LOG(DEBUG) << "Actor Manager has been initialized before";
@@ -54,7 +59,14 @@ void ActorMgr::Initialize(bool use_inner_pool, size_t thread_num) {
   }
   // create inner thread pool only when specified use_inner_pool
   if (use_inner_pool) {
-    inner_pool_ = ActorThreadPool::CreateThreadPool(thread_num);
+    if (max_thread_num <= actor_thread_num) {
+      inner_pool_ = ActorThreadPool::CreateThreadPool(actor_thread_num);
+    } else {
+      inner_pool_ = ActorThreadPool::CreateThreadPool(actor_thread_num, max_thread_num, {});
+      inner_pool_->SetActorThreadNum(actor_thread_num);
+      inner_pool_->DisableOccupiedActorThread();
+      inner_pool_->SetKernelThreadNum(max_thread_num - actor_thread_num);
+    }
   }
 }
 
diff --git a/mindspore/core/mindrt/src/actor/actormgr.h b/mindspore/core/mindrt/src/actor/actormgr.h
index 967b77a0b3e..65782687312 100644
--- a/mindspore/core/mindrt/src/actor/actormgr.h
+++ b/mindspore/core/mindrt/src/actor/actormgr.h
@@ -48,12 +48,14 @@ class ActorMgr {
     (void)ActorMgr::GetActorMgrRef()->Send(AID(to), std::move(msg));
   }
 
+  ActorThreadPool *GetActorThreadPool() { return inner_pool_; }
+
   ActorMgr();
   ~ActorMgr();
 
   void Finalize();
   // initialize actor manager resource, do not create inner thread pool by default
-  void Initialize(bool use_inner_pool = false, size_t thread_num = 1);
+  void Initialize(bool use_inner_pool = false, size_t actor_thread_num = 1, size_t max_thread_num = 1);
 
   void RemoveActor(const std::string &name);
   ActorBase *GetActor(const AID &id);
diff --git a/mindspore/core/mindrt/src/thread/actor_threadpool.cc b/mindspore/core/mindrt/src/thread/actor_threadpool.cc
index 58966fca13b..d2be7d52c30 100644
--- a/mindspore/core/mindrt/src/thread/actor_threadpool.cc
+++ b/mindspore/core/mindrt/src/thread/actor_threadpool.cc
@@ -140,7 +140,7 @@ int ActorThreadPool::CreateThreads(size_t actor_thread_num, size_t all_thread_nu
   size_t core_num = std::thread::hardware_concurrency();
   THREAD_INFO("ThreadInfo, Actor: [%zu], All: [%zu], CoreNum: [%zu]", actor_thread_num, all_thread_num, core_num);
   actor_thread_num_ = actor_thread_num < core_num ? actor_thread_num : core_num;
-  if (actor_thread_num_ <= 0 || actor_thread_num > all_thread_num) {
+  if (actor_thread_num > all_thread_num) {
     THREAD_ERROR("thread num is invalid");
     return THREAD_ERROR;
   }
diff --git a/mindspore/core/mindrt/src/thread/actor_threadpool.h b/mindspore/core/mindrt/src/thread/actor_threadpool.h
index bb4bc4f57ba..037440d3c02 100644
--- a/mindspore/core/mindrt/src/thread/actor_threadpool.h
+++ b/mindspore/core/mindrt/src/thread/actor_threadpool.h
@@ -23,6 +23,7 @@
 #include <atomic>
 #include <condition_variable>
 #include "thread/threadpool.h"
+#include "thread/core_affinity.h"
 #include "actor/actor.h"
 #include "thread/hqueue.h"
 #define USE_HQUEUE
diff --git a/mindspore/core/mindrt/src/thread/threadpool.cc b/mindspore/core/mindrt/src/thread/threadpool.cc
index fec5dedfd8f..66c08a9227d 100644
--- a/mindspore/core/mindrt/src/thread/threadpool.cc
+++ b/mindspore/core/mindrt/src/thread/threadpool.cc
@@ -175,11 +175,11 @@ int ThreadPool::ParallelLaunch(const Func &func, Content content, int task_num)
   return THREAD_OK;
 }
 
-void ThreadPool::SyncRunTask(Task *task, int task_num) const {
+void ThreadPool::SyncRunTask(Task *task, int start_num, int task_num) const {
   // run task sequentially
   // if the current thread is not the actor thread
-  float per_scale = kMaxScale / task_num;
-  for (int i = 0; i < task_num; ++i) {
+  float per_scale = kMaxScale / (task_num - start_num);
+  for (int i = start_num; i < task_num; ++i) {
     float lhs_scale = i * per_scale;
     float rhs_scale = (i + 1) * per_scale;
     rhs_scale = i == task_num - 1 ? kMaxScale : rhs_scale;
@@ -197,7 +197,11 @@ void ThreadPool::DistributeTask(Task *task, int task_num) const {
   int sum_frequency = 0;
   std::vector<Worker *> assigned;
   int num = static_cast<int>(workers_.size()) - 1;
-  for (int i = num; i >= 0 && count < num_assigned; --i) {
+  int offset = 0;
+  if (!occupied_actor_thread_) {
+    offset = static_cast<int>(actor_thread_num_);
+  }
+  for (int i = num; i >= offset && count < num_assigned; --i) {
     if (workers_[i]->available()) {
       assigned.push_back(workers_[i]);
       sum_frequency += workers_[i]->frequency();
@@ -212,7 +216,9 @@ void ThreadPool::DistributeTask(Task *task, int task_num) const {
       sum_frequency += curr->frequency();
     }
   } else if (assigned.size() != static_cast<size_t>(task_num)) {
-    SyncRunTask(task, task_num);
+    CalculateScales(assigned, sum_frequency);
+    ActiveWorkers(assigned, task, assigned.size(), curr);
+    SyncRunTask(task, assigned.size(), task_num);
     return;
   }
   CalculateScales(assigned, sum_frequency);
diff --git a/mindspore/core/mindrt/src/thread/threadpool.h b/mindspore/core/mindrt/src/thread/threadpool.h
index 4db2c8e4aea..dda874711b7 100644
--- a/mindspore/core/mindrt/src/thread/threadpool.h
+++ b/mindspore/core/mindrt/src/thread/threadpool.h
@@ -24,6 +24,7 @@
 #include <atomic>
 #include <condition_variable>
 #include <mutex>
+#include <functional>
 #include "thread/threadlog.h"
 #include "thread/core_affinity.h"
 
@@ -40,7 +41,7 @@ enum ThreadStatus {
 
 // used in scenarios with unequal division of task
 // the parameters indicate the start and end coefficients
-using Func = int (*)(void *, int, float, float);
+using Func = std::function<int(void *, int, float, float)>;
 using Content = void *;
 
 typedef struct Task {
@@ -113,6 +114,10 @@ class ThreadPool {
   int SetProcessAffinity(BindMode bind_mode) const;
 
   int ParallelLaunch(const Func &func, Content content, int task_num) const;
+  void DisableOccupiedActorThread() { occupied_actor_thread_ = false; }
+  void SetActorThreadNum(size_t actor_thread_num) { actor_thread_num_ = actor_thread_num; }
+  void SetKernelThreadNum(size_t kernel_thread_num) { kernel_thread_num_ = kernel_thread_num; }
+  size_t GetKernelThreadNum() const { return kernel_thread_num_; }
 
  protected:
   ThreadPool() = default;
@@ -121,7 +126,7 @@ class ThreadPool {
 
   int InitAffinityInfo();
 
-  void SyncRunTask(Task *task, int task_num) const;
+  void SyncRunTask(Task *task, int start_num, int task_num) const;
 
   void DistributeTask(Task *task, int task_num) const;
   void CalculateScales(const std::vector<Worker *> &workers, int sum_frequency) const;
@@ -132,6 +137,9 @@ class ThreadPool {
   std::mutex pool_mutex_;
   std::vector<Worker *> workers_;
   CoreAffinity *affinity_{nullptr};
+  size_t actor_thread_num_{0};
+  size_t kernel_thread_num_{0};
+  bool occupied_actor_thread_{true};
 };
 
 }  // namespace mindspore
diff --git a/mindspore/core/ops/LayerNormBetaGammaBackprop.h b/mindspore/core/ops/LayerNormBetaGammaBackprop.h
index 456a281cb01..8385149595b 100644
--- a/mindspore/core/ops/LayerNormBetaGammaBackprop.h
+++ b/mindspore/core/ops/LayerNormBetaGammaBackprop.h
@@ -27,7 +27,7 @@
 
 namespace mindspore {
 namespace ops {
-class LayerNormBetaGammaBackprop : public PrimitiveC {
+class MS_CORE_API LayerNormBetaGammaBackprop : public PrimitiveC {
  public:
   LayerNormBetaGammaBackprop() : PrimitiveC(prim::kPrimLayerNormBetaGammaBackprop->name()) {}
   ~LayerNormBetaGammaBackprop() = default;
diff --git a/mindspore/core/ops/LayerNormXBackprop.h b/mindspore/core/ops/LayerNormXBackprop.h
index d5029db6666..f6ab576df3b 100644
--- a/mindspore/core/ops/LayerNormXBackprop.h
+++ b/mindspore/core/ops/LayerNormXBackprop.h
@@ -27,7 +27,7 @@
 
 namespace mindspore {
 namespace ops {
-class LayerNormXBackprop : public PrimitiveC {
+class MS_CORE_API LayerNormXBackprop : public PrimitiveC {
  public:
   LayerNormXBackprop() : PrimitiveC(prim::kPrimLayerNormXBackprop->name()) {}
   ~LayerNormXBackprop() = default;
diff --git a/mindspore/core/ops/abs.h b/mindspore/core/ops/abs.h
index 1e8a1683d02..f9aad08073e 100644
--- a/mindspore/core/ops/abs.h
+++ b/mindspore/core/ops/abs.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameAbs = "Abs";
-class Abs : public PrimitiveC {
+class MS_CORE_API Abs : public PrimitiveC {
  public:
   Abs() : PrimitiveC(kNameAbs) { InitIOName({"input_x"}, {"output"}); }
   ~Abs() = default;
diff --git a/mindspore/core/ops/adam.h b/mindspore/core/ops/adam.h
index 1767b7e342c..ed2c7073147 100644
--- a/mindspore/core/ops/adam.h
+++ b/mindspore/core/ops/adam.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameAdam = "Adam";
-class Adam : public PrimitiveC {
+class MS_CORE_API Adam : public PrimitiveC {
  public:
   Adam() : PrimitiveC(kNameAdam) {}
   ~Adam() = default;
diff --git a/mindspore/core/ops/add.h b/mindspore/core/ops/add.h
index cc334e2e54f..5dbc35679cc 100644
--- a/mindspore/core/ops/add.h
+++ b/mindspore/core/ops/add.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameAdd = prim::kAdd;
-class Add : public PrimitiveC {
+class MS_CORE_API Add : public PrimitiveC {
  public:
   Add() : PrimitiveC(kNameAdd) { InitIOName({"x", "y"}, {"output"}); }
   explicit Add(const std::string k_name) : PrimitiveC(k_name) { InitIOName({"x", "y"}, {"output"}); }
diff --git a/mindspore/core/ops/adder.h b/mindspore/core/ops/adder.h
index 4fe36cb5967..6f0ffd21d86 100644
--- a/mindspore/core/ops/adder.h
+++ b/mindspore/core/ops/adder.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameAdder = "Adder";
-class Adder : public PrimitiveC {
+class MS_CORE_API Adder : public PrimitiveC {
  public:
   explicit Adder(const std::string &k_name = kNameAdder) : PrimitiveC(k_name) {}
   ~Adder() = default;
diff --git a/mindspore/core/ops/addn.h b/mindspore/core/ops/addn.h
index 7459ac28066..dd17a518513 100644
--- a/mindspore/core/ops/addn.h
+++ b/mindspore/core/ops/addn.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameAddN = "AddN";
-class AddN : public PrimitiveC {
+class MS_CORE_API AddN : public PrimitiveC {
  public:
   AddN() : PrimitiveC(kNameAddN) { InitIOName({"inputs"}, {"sum"}); }
   ~AddN() = default;
diff --git a/mindspore/core/ops/affine.h b/mindspore/core/ops/affine.h
index 30b800b8a9f..7568252eb4b 100644
--- a/mindspore/core/ops/affine.h
+++ b/mindspore/core/ops/affine.h
@@ -29,7 +29,7 @@ constexpr auto kNameAffine = "Affine";
 constexpr auto kAffineContext = "context";
 constexpr auto kAffineOutputDim = "output_dim";
 
-class Affine : public PrimitiveC {
+class MS_CORE_API Affine : public PrimitiveC {
  public:
   Affine() : PrimitiveC(kNameAffine) { InitIOName({"x1", "x2"}, {"outputs"}); }
   ~Affine() = default;
diff --git a/mindspore/core/ops/all.h b/mindspore/core/ops/all.h
index c8035874c8a..f34bb519f28 100644
--- a/mindspore/core/ops/all.h
+++ b/mindspore/core/ops/all.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameAll = "All";
-class All : public PrimitiveC {
+class MS_CORE_API All : public PrimitiveC {
  public:
   All() : PrimitiveC(kNameAll) {}
   ~All() = default;
diff --git a/mindspore/core/ops/apply_momentum.h b/mindspore/core/ops/apply_momentum.h
index 388bec9dd87..3a8fc42033f 100644
--- a/mindspore/core/ops/apply_momentum.h
+++ b/mindspore/core/ops/apply_momentum.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameApplyMomentum = "ApplyMomentum";
-class ApplyMomentum : public PrimitiveC {
+class MS_CORE_API ApplyMomentum : public PrimitiveC {
  public:
   ApplyMomentum() : PrimitiveC(kNameApplyMomentum) {
     InitIOName({"variable", "accumulation", "learning_rate", "gradient", "momentum"}, {"output"});
diff --git a/mindspore/core/ops/arg_max.h b/mindspore/core/ops/arg_max.h
index 75440f0b0a4..969fb497fb0 100644
--- a/mindspore/core/ops/arg_max.h
+++ b/mindspore/core/ops/arg_max.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameArgMax = "Argmax";
-class ArgMax : public PrimitiveC {
+class MS_CORE_API ArgMax : public PrimitiveC {
  public:
   ArgMax() : PrimitiveC(kNameArgMax) { InitIOName({"x"}, {"output"}); }
   explicit ArgMax(const std::string k_name) : PrimitiveC(k_name) { InitIOName({"x"}, {"output"}); }
diff --git a/mindspore/core/ops/arg_min.h b/mindspore/core/ops/arg_min.h
index 6872d7f0db1..90dc47c7bfc 100644
--- a/mindspore/core/ops/arg_min.h
+++ b/mindspore/core/ops/arg_min.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameArgMin = "ArgMin";
-class ArgMin : public PrimitiveC {
+class MS_CORE_API ArgMin : public PrimitiveC {
  public:
   ArgMin() : PrimitiveC(kNameArgMin) { InitIOName({"x"}, {"output"}); }
   explicit ArgMin(const std::string k_name) : PrimitiveC(k_name) { InitIOName({"x"}, {"output"}); }
diff --git a/mindspore/core/ops/asin.h b/mindspore/core/ops/asin.h
index ebed63649b3..e4a79af2c15 100644
--- a/mindspore/core/ops/asin.h
+++ b/mindspore/core/ops/asin.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameAsin = "Asin";
-class Asin : public PrimitiveC {
+class MS_CORE_API Asin : public PrimitiveC {
  public:
   Asin() : PrimitiveC(kNameAsin) {}
   ~Asin() = default;
diff --git a/mindspore/core/ops/assert.h b/mindspore/core/ops/assert.h
index c4488bde7a9..19470a6be03 100644
--- a/mindspore/core/ops/assert.h
+++ b/mindspore/core/ops/assert.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameAssert = "Assert";
-class Assert : public PrimitiveC {
+class MS_CORE_API Assert : public PrimitiveC {
  public:
   Assert() : PrimitiveC(kNameAssert) {}
   ~Assert() = default;
diff --git a/mindspore/core/ops/assign.h b/mindspore/core/ops/assign.h
index a0072725a3e..e088f826f9f 100644
--- a/mindspore/core/ops/assign.h
+++ b/mindspore/core/ops/assign.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameAssign = "Assign";
-class Assign : public PrimitiveC {
+class MS_CORE_API Assign : public PrimitiveC {
  public:
   Assign() : PrimitiveC(kNameAssign) { InitIOName({"ref", "value"}, {"output"}); }
   ~Assign() = default;
diff --git a/mindspore/core/ops/assign_add.h b/mindspore/core/ops/assign_add.h
index 645d39bb38f..2936c75eb21 100644
--- a/mindspore/core/ops/assign_add.h
+++ b/mindspore/core/ops/assign_add.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameAssignAdd = "AssignAdd";
-class AssignAdd : public PrimitiveC {
+class MS_CORE_API AssignAdd : public PrimitiveC {
  public:
   AssignAdd() : PrimitiveC(kNameAssignAdd) { InitIOName({"ref", "value"}, {"output"}); }
   ~AssignAdd() = default;
diff --git a/mindspore/core/ops/atan.h b/mindspore/core/ops/atan.h
index cf726611263..dfb5ed4c83b 100644
--- a/mindspore/core/ops/atan.h
+++ b/mindspore/core/ops/atan.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameAtan = "Atan";
-class Atan : public PrimitiveC {
+class MS_CORE_API Atan : public PrimitiveC {
  public:
   Atan() : PrimitiveC(kNameAtan) {}
   ~Atan() = default;
diff --git a/mindspore/core/ops/attention.h b/mindspore/core/ops/attention.h
index 1d74fe0dfc6..5e204f3b008 100644
--- a/mindspore/core/ops/attention.h
+++ b/mindspore/core/ops/attention.h
@@ -27,7 +27,7 @@ namespace mindspore {
 namespace ops {
 constexpr auto kNameAttention = "Attention";
 // Attention MultiHeadAttention
-class Attention : public PrimitiveC {
+class MS_CORE_API Attention : public PrimitiveC {
  public:
   Attention() : PrimitiveC(kNameAttention) {
     InitIOName(
diff --git a/mindspore/core/ops/audio_spectrogram.h b/mindspore/core/ops/audio_spectrogram.h
index 54173ccc3e3..031134b464b 100644
--- a/mindspore/core/ops/audio_spectrogram.h
+++ b/mindspore/core/ops/audio_spectrogram.h
@@ -29,7 +29,7 @@ namespace ops {
 constexpr auto kNameAudioSpectrogram = "AudioSpectrogram";
 int64_t Log2Ceil(int64_t length);
 int64_t GetFftLength(int64_t length);
-class AudioSpectrogram : public PrimitiveC {
+class MS_CORE_API AudioSpectrogram : public PrimitiveC {
  public:
   AudioSpectrogram() : PrimitiveC(kNameAudioSpectrogram) {}
   ~AudioSpectrogram() = default;
diff --git a/mindspore/core/ops/avg_pool.h b/mindspore/core/ops/avg_pool.h
index 4985519938d..ac429594a9f 100644
--- a/mindspore/core/ops/avg_pool.h
+++ b/mindspore/core/ops/avg_pool.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameAvgPool = "AvgPool";
-class AvgPool : public PrimitiveC {
+class MS_CORE_API AvgPool : public PrimitiveC {
  public:
   AvgPool() : PrimitiveC(kNameAvgPool) { InitIOName({"x"}, {"output"}); }
   explicit AvgPool(const std::string k_name) : PrimitiveC(k_name) { InitIOName({"x"}, {"output"}); }
diff --git a/mindspore/core/ops/avg_pool_3d.h b/mindspore/core/ops/avg_pool_3d.h
index ea68327a295..105858edfd6 100644
--- a/mindspore/core/ops/avg_pool_3d.h
+++ b/mindspore/core/ops/avg_pool_3d.h
@@ -27,7 +27,7 @@
 
 namespace mindspore {
 namespace ops {
-class AvgPool3D : public PrimitiveC {
+class MS_CORE_API AvgPool3D : public PrimitiveC {
  public:
   AvgPool3D() : PrimitiveC(prim::kPrimAvgPool3D->name()) { InitIOName({"input"}, {"output"}); }
   ~AvgPool3D() = default;
diff --git a/mindspore/core/ops/batch_matmul.h b/mindspore/core/ops/batch_matmul.h
index ad7cef3d42c..a3c6dab7aff 100644
--- a/mindspore/core/ops/batch_matmul.h
+++ b/mindspore/core/ops/batch_matmul.h
@@ -24,7 +24,7 @@
 
 namespace mindspore {
 namespace ops {
-class BatchMatmul : public PrimitiveC {
+class MS_CORE_API BatchMatmul : public PrimitiveC {
  public:
   BatchMatmul() : PrimitiveC(prim::kPrimBatchMatMul->name()) { InitIOName({"x1", "x2"}, {"output"}); }
   ~BatchMatmul() = default;
diff --git a/mindspore/core/ops/batch_norm.h b/mindspore/core/ops/batch_norm.h
index 06aa5cae59a..a4937f647f3 100644
--- a/mindspore/core/ops/batch_norm.h
+++ b/mindspore/core/ops/batch_norm.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameBatchNorm = "BatchNorm";
-class BatchNorm : public PrimitiveC {
+class MS_CORE_API BatchNorm : public PrimitiveC {
  public:
   BatchNorm() : PrimitiveC(kNameBatchNorm) {
     InitIOName({"x", "scale", "offset", "mean", "variance"},
diff --git a/mindspore/core/ops/batch_to_space.h b/mindspore/core/ops/batch_to_space.h
index 8812999e02f..0d776389b67 100644
--- a/mindspore/core/ops/batch_to_space.h
+++ b/mindspore/core/ops/batch_to_space.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameBatchToSpace = "BatchToSpace";
-class BatchToSpace : public PrimitiveC {
+class MS_CORE_API BatchToSpace : public PrimitiveC {
  public:
   BatchToSpace() : PrimitiveC(kNameBatchToSpace) {}
   ~BatchToSpace() = default;
diff --git a/mindspore/core/ops/batch_to_space_nd.h b/mindspore/core/ops/batch_to_space_nd.h
index 99df67a6ba7..8c1d4bf62d5 100644
--- a/mindspore/core/ops/batch_to_space_nd.h
+++ b/mindspore/core/ops/batch_to_space_nd.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameBatchToSpaceND = "BatchToSpaceND";
-class BatchToSpaceND : public PrimitiveC {
+class MS_CORE_API BatchToSpaceND : public PrimitiveC {
  public:
   BatchToSpaceND() : PrimitiveC(kNameBatchToSpaceND) {}
   ~BatchToSpaceND() = default;
diff --git a/mindspore/core/ops/bias_add.h b/mindspore/core/ops/bias_add.h
index 7a89b46a85d..dbaf3518bd0 100644
--- a/mindspore/core/ops/bias_add.h
+++ b/mindspore/core/ops/bias_add.h
@@ -29,7 +29,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameBiasAdd = prim::kBiasAdd;
-class BiasAdd : public PrimitiveC {
+class MS_CORE_API BiasAdd : public PrimitiveC {
  public:
   BiasAdd() : PrimitiveC(prim::kPrimBiasAdd->name()) { InitIOName({"x", "b"}, {"output"}); }
   ~BiasAdd() = default;
diff --git a/mindspore/core/ops/binary_cross_entropy.h b/mindspore/core/ops/binary_cross_entropy.h
index ce7a771d688..74e5416f5d0 100644
--- a/mindspore/core/ops/binary_cross_entropy.h
+++ b/mindspore/core/ops/binary_cross_entropy.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameBinaryCrossEntropy = "BinaryCrossEntropy";
-class BinaryCrossEntropy : public PrimitiveC {
+class MS_CORE_API BinaryCrossEntropy : public PrimitiveC {
  public:
   BinaryCrossEntropy() : PrimitiveC(kNameBinaryCrossEntropy) {}
   ~BinaryCrossEntropy() = default;
diff --git a/mindspore/core/ops/broadcast.h b/mindspore/core/ops/broadcast.h
index e124b1e95e6..080881ebbdb 100644
--- a/mindspore/core/ops/broadcast.h
+++ b/mindspore/core/ops/broadcast.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameBroadcast = "Broadcast";
-class Broadcast : public PrimitiveC {
+class MS_CORE_API Broadcast : public PrimitiveC {
  public:
   Broadcast() : PrimitiveC(kNameBroadcast) {}
   ~Broadcast() = default;
diff --git a/mindspore/core/ops/broadcast_to.h b/mindspore/core/ops/broadcast_to.h
index eff8abdcd01..dba6dd12f1f 100644
--- a/mindspore/core/ops/broadcast_to.h
+++ b/mindspore/core/ops/broadcast_to.h
@@ -27,7 +27,7 @@
 
 namespace mindspore {
 namespace ops {
-class BroadcastTo : public PrimitiveC {
+class MS_CORE_API BroadcastTo : public PrimitiveC {
  public:
   BroadcastTo() : PrimitiveC(prim::kPrimBroadcastTo->name()) {}
   ~BroadcastTo() = default;
diff --git a/mindspore/core/ops/call.h b/mindspore/core/ops/call.h
index a316b5dd0be..68134d407bc 100644
--- a/mindspore/core/ops/call.h
+++ b/mindspore/core/ops/call.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameCall = "call";
-class Call : public PrimitiveC {
+class MS_CORE_API Call : public PrimitiveC {
  public:
   Call() : PrimitiveC(kNameCall) {}
   ~Call() = default;
diff --git a/mindspore/core/ops/cast.h b/mindspore/core/ops/cast.h
index d543c5dcf19..b941fc17125 100644
--- a/mindspore/core/ops/cast.h
+++ b/mindspore/core/ops/cast.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameCast = "Cast";
-class Cast : public PrimitiveC {
+class MS_CORE_API Cast : public PrimitiveC {
  public:
   Cast() : PrimitiveC(kNameCast) { InitIOName({"x", "dst_type"}, {"output"}); }
   ~Cast() = default;
diff --git a/mindspore/core/ops/ceil.h b/mindspore/core/ops/ceil.h
index 2b6df01ab0d..9a7c51c2e4e 100644
--- a/mindspore/core/ops/ceil.h
+++ b/mindspore/core/ops/ceil.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameCeil = "Ceil";
-class Ceil : public PrimitiveC {
+class MS_CORE_API Ceil : public PrimitiveC {
  public:
   Ceil() : PrimitiveC(kNameCeil) { InitIOName({"x"}, {"y"}); }
   ~Ceil() = default;
diff --git a/mindspore/core/ops/clip.h b/mindspore/core/ops/clip.h
index a62768643f5..f0fb3a7b7dd 100644
--- a/mindspore/core/ops/clip.h
+++ b/mindspore/core/ops/clip.h
@@ -24,7 +24,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameClip = "Clip";
-class Clip : public PrimitiveC {
+class MS_CORE_API Clip : public PrimitiveC {
  public:
   Clip() : PrimitiveC(kNameClip) {}
   ~Clip() = default;
diff --git a/mindspore/core/ops/concat.h b/mindspore/core/ops/concat.h
index 1a13537905f..4570dcccf8b 100644
--- a/mindspore/core/ops/concat.h
+++ b/mindspore/core/ops/concat.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameConcat = "Concat";
-class Concat : public PrimitiveC {
+class MS_CORE_API Concat : public PrimitiveC {
  public:
   Concat() : PrimitiveC(kNameConcat) {}
   ~Concat() = default;
diff --git a/mindspore/core/ops/constant_of_shape.h b/mindspore/core/ops/constant_of_shape.h
index 71e7e02fe8b..537d2f3700c 100644
--- a/mindspore/core/ops/constant_of_shape.h
+++ b/mindspore/core/ops/constant_of_shape.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameConstantOfShape = "ConstantOfShape";
-class ConstantOfShape : public PrimitiveC {
+class MS_CORE_API ConstantOfShape : public PrimitiveC {
  public:
   ConstantOfShape() : PrimitiveC(kNameConstantOfShape) {}
   ~ConstantOfShape() = default;
diff --git a/mindspore/core/ops/control_depend.h b/mindspore/core/ops/control_depend.h
index 91feede28b8..f6cd3755064 100644
--- a/mindspore/core/ops/control_depend.h
+++ b/mindspore/core/ops/control_depend.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameControlDepend = "ControlDepend";
-class ControlDepend : public PrimitiveC {
+class MS_CORE_API ControlDepend : public PrimitiveC {
  public:
   ControlDepend() : PrimitiveC(kNameControlDepend) {}
   ~ControlDepend() = default;
diff --git a/mindspore/core/ops/conv2d.h b/mindspore/core/ops/conv2d.h
index 9639d8383da..13446a96e47 100644
--- a/mindspore/core/ops/conv2d.h
+++ b/mindspore/core/ops/conv2d.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameConv2D = "Conv2D";
-class Conv2D : public PrimitiveC {
+class MS_CORE_API Conv2D : public PrimitiveC {
  public:
   Conv2D() : PrimitiveC(kNameConv2D) { InitIOName({"x", "w"}, {"output"}); }
   explicit Conv2D(const std::string k_name) : PrimitiveC(k_name) { InitIOName({"x", "w"}, {"output"}); }
diff --git a/mindspore/core/ops/conv2d_transpose.h b/mindspore/core/ops/conv2d_transpose.h
index a88e50a3a97..2c4720efee3 100644
--- a/mindspore/core/ops/conv2d_transpose.h
+++ b/mindspore/core/ops/conv2d_transpose.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameConv2DTranspose = "Conv2DTranspose";
-class Conv2DTranspose : public PrimitiveC {
+class MS_CORE_API Conv2DTranspose : public PrimitiveC {
  public:
   Conv2DTranspose() : PrimitiveC(kNameConv2DTranspose) {
     InitIOName({"out_backprop", "filter", "input_sizes"}, {"output"});
diff --git a/mindspore/core/ops/cos.h b/mindspore/core/ops/cos.h
index 136e4f96e57..4430947fc9e 100644
--- a/mindspore/core/ops/cos.h
+++ b/mindspore/core/ops/cos.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameCos = "Cos";
-class Cos : public PrimitiveC {
+class MS_CORE_API Cos : public PrimitiveC {
  public:
   Cos() : PrimitiveC(kNameCos) {}
   ~Cos() = default;
diff --git a/mindspore/core/ops/crop.h b/mindspore/core/ops/crop.h
index 676df602cc8..af48b4c8a0d 100644
--- a/mindspore/core/ops/crop.h
+++ b/mindspore/core/ops/crop.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameCrop = "Crop";
-class Crop : public PrimitiveC {
+class MS_CORE_API Crop : public PrimitiveC {
  public:
   Crop() : PrimitiveC(kNameCrop) {}
   ~Crop() = default;
diff --git a/mindspore/core/ops/crop_and_resize.h b/mindspore/core/ops/crop_and_resize.h
index 18c7d6c7517..04d712209ed 100644
--- a/mindspore/core/ops/crop_and_resize.h
+++ b/mindspore/core/ops/crop_and_resize.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameCropAndResize = "CropAndResize";
-class CropAndResize : public PrimitiveC {
+class MS_CORE_API CropAndResize : public PrimitiveC {
  public:
   CropAndResize() : PrimitiveC(kNameCropAndResize) { InitIOName({"x", "boxes", "box_index", "crop_size"}, {"y"}); }
   ~CropAndResize() = default;
diff --git a/mindspore/core/ops/ctc_loss_v2.h b/mindspore/core/ops/ctc_loss_v2.h
index 0be3a7fb46b..4c331da78c5 100644
--- a/mindspore/core/ops/ctc_loss_v2.h
+++ b/mindspore/core/ops/ctc_loss_v2.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameCTCLossV2 = "CTCLossV2";
-class CTCLossV2 : public PrimitiveC {
+class MS_CORE_API CTCLossV2 : public PrimitiveC {
  public:
   CTCLossV2() : PrimitiveC(kNameCTCLossV2) {
     InitIOName({"log_probs", "targets", "input_lengths", "target_lengths"}, {"neg_log_likelihood", "log_alpha"});
diff --git a/mindspore/core/ops/ctc_loss_v2_grad.h b/mindspore/core/ops/ctc_loss_v2_grad.h
index d7fa4446ec4..7ab2519225f 100644
--- a/mindspore/core/ops/ctc_loss_v2_grad.h
+++ b/mindspore/core/ops/ctc_loss_v2_grad.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameCTCLossV2Grad = "CTCLossV2Grad";
-class CTCLossV2Grad : public PrimitiveC {
+class MS_CORE_API CTCLossV2Grad : public PrimitiveC {
  public:
   CTCLossV2Grad() : PrimitiveC(kNameCTCLossV2Grad) {
     InitIOName(
diff --git a/mindspore/core/ops/ctcloss.h b/mindspore/core/ops/ctcloss.h
index ae251df463e..4e24a95b0c7 100644
--- a/mindspore/core/ops/ctcloss.h
+++ b/mindspore/core/ops/ctcloss.h
@@ -24,7 +24,7 @@
 
 namespace mindspore {
 namespace ops {
-class CTCLoss : public PrimitiveC {
+class MS_CORE_API CTCLoss : public PrimitiveC {
  public:
   CTCLoss() : PrimitiveC(prim::kPrimCTCLoss->name()) {}
   ~CTCLoss() = default;
diff --git a/mindspore/core/ops/cumsum.h b/mindspore/core/ops/cumsum.h
index d458187e3d3..f84dc72d927 100644
--- a/mindspore/core/ops/cumsum.h
+++ b/mindspore/core/ops/cumsum.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameCumSum = "CumSum";
-class CumSum : public PrimitiveC {
+class MS_CORE_API CumSum : public PrimitiveC {
  public:
   CumSum() : PrimitiveC(kNameCumSum) {}
   ~CumSum() = default;
diff --git a/mindspore/core/ops/custom.h b/mindspore/core/ops/custom.h
index 52a52049458..8dc62efd9a6 100644
--- a/mindspore/core/ops/custom.h
+++ b/mindspore/core/ops/custom.h
@@ -29,7 +29,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameCustom = "Custom";
-class Custom : public PrimitiveC {
+class MS_CORE_API Custom : public PrimitiveC {
  public:
   Custom() : PrimitiveC(kNameCustom) {}
   ~Custom() override = default;
diff --git a/mindspore/core/ops/custom_extract_features.h b/mindspore/core/ops/custom_extract_features.h
index f9976cdea4f..f2ec0b3814e 100644
--- a/mindspore/core/ops/custom_extract_features.h
+++ b/mindspore/core/ops/custom_extract_features.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameCustomExtractFeatures = "CustomExtractFeatures";
-class CustomExtractFeatures : public PrimitiveC {
+class MS_CORE_API CustomExtractFeatures : public PrimitiveC {
  public:
   CustomExtractFeatures() : PrimitiveC(kNameCustomExtractFeatures) {}
   ~CustomExtractFeatures() = default;
diff --git a/mindspore/core/ops/custom_normalize.h b/mindspore/core/ops/custom_normalize.h
index 21256921c2a..5348c572fd5 100644
--- a/mindspore/core/ops/custom_normalize.h
+++ b/mindspore/core/ops/custom_normalize.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameCustomNormalize = "CustomNormalize";
-class CustomNormalize : public PrimitiveC {
+class MS_CORE_API CustomNormalize : public PrimitiveC {
  public:
   CustomNormalize() : PrimitiveC(kNameCustomNormalize) {}
   ~CustomNormalize() = default;
diff --git a/mindspore/core/ops/custom_predict.h b/mindspore/core/ops/custom_predict.h
index aadb72b2602..6b22a01bea5 100644
--- a/mindspore/core/ops/custom_predict.h
+++ b/mindspore/core/ops/custom_predict.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameCustomPredict = "CustomPredict";
-class CustomPredict : public PrimitiveC {
+class MS_CORE_API CustomPredict : public PrimitiveC {
  public:
   CustomPredict() : PrimitiveC(kNameCustomPredict) {}
   ~CustomPredict() = default;
diff --git a/mindspore/core/ops/depend.h b/mindspore/core/ops/depend.h
index 8d138124555..97232aabaa2 100644
--- a/mindspore/core/ops/depend.h
+++ b/mindspore/core/ops/depend.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameDepend = "Depend";
-class Depend : public PrimitiveC {
+class MS_CORE_API Depend : public PrimitiveC {
  public:
   Depend() : PrimitiveC(kNameDepend) {}
   ~Depend() = default;
diff --git a/mindspore/core/ops/depth_to_space.h b/mindspore/core/ops/depth_to_space.h
index c8cb0263fff..35db7f015c2 100644
--- a/mindspore/core/ops/depth_to_space.h
+++ b/mindspore/core/ops/depth_to_space.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameDepthToSpace = "DepthToSpace";
-class DepthToSpace : public PrimitiveC {
+class MS_CORE_API DepthToSpace : public PrimitiveC {
  public:
   DepthToSpace() : PrimitiveC(kNameDepthToSpace) { InitIOName({"x"}, {"y"}); }
   ~DepthToSpace() = default;
diff --git a/mindspore/core/ops/detection_post_process.h b/mindspore/core/ops/detection_post_process.h
index e6308858ce2..fc31dce6881 100644
--- a/mindspore/core/ops/detection_post_process.h
+++ b/mindspore/core/ops/detection_post_process.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameDetectionPostProcess = "DetectionPostProcess";
-class DetectionPostProcess : public PrimitiveC {
+class MS_CORE_API DetectionPostProcess : public PrimitiveC {
  public:
   DetectionPostProcess() : PrimitiveC(kNameDetectionPostProcess) {}
   ~DetectionPostProcess() = default;
diff --git a/mindspore/core/ops/diag.h b/mindspore/core/ops/diag.h
index 916b8ad50df..11a8479e813 100644
--- a/mindspore/core/ops/diag.h
+++ b/mindspore/core/ops/diag.h
@@ -24,7 +24,7 @@
 
 namespace mindspore {
 namespace ops {
-class Diag : public PrimitiveC {
+class MS_CORE_API Diag : public PrimitiveC {
  public:
   Diag() : PrimitiveC(prim::kPrimDiag->name()) { InitIOName({"input_x"}, {"output"}); }
   ~Diag() = default;
diff --git a/mindspore/core/ops/diag_part.h b/mindspore/core/ops/diag_part.h
index 393b2329165..cdd5f002419 100644
--- a/mindspore/core/ops/diag_part.h
+++ b/mindspore/core/ops/diag_part.h
@@ -24,7 +24,7 @@
 
 namespace mindspore {
 namespace ops {
-class DiagPart : public PrimitiveC {
+class MS_CORE_API DiagPart : public PrimitiveC {
  public:
   DiagPart() : PrimitiveC(prim::kPrimDiagPart->name()) { InitIOName({"input_x"}, {"output"}); }
   ~DiagPart() = default;
diff --git a/mindspore/core/ops/div.h b/mindspore/core/ops/div.h
index 5ee3ebf57c1..d89ecf6c16c 100644
--- a/mindspore/core/ops/div.h
+++ b/mindspore/core/ops/div.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameDiv = "Div";
-class Div : public PrimitiveC {
+class MS_CORE_API Div : public PrimitiveC {
  public:
   Div() : PrimitiveC(kNameDiv) { InitIOName({"x", "y"}, {"output"}); }
   explicit Div(const std::string k_name) : PrimitiveC(k_name) { InitIOName({"x", "y"}, {"output"}); }
diff --git a/mindspore/core/ops/dropout.h b/mindspore/core/ops/dropout.h
index e8e19400c13..f7c8285afff 100644
--- a/mindspore/core/ops/dropout.h
+++ b/mindspore/core/ops/dropout.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameDropout = "Dropout";
-class Dropout : public PrimitiveC {
+class MS_CORE_API Dropout : public PrimitiveC {
  public:
   Dropout() : PrimitiveC(kNameDropout) {}
   ~Dropout() = default;
diff --git a/mindspore/core/ops/dropout_do_mask.h b/mindspore/core/ops/dropout_do_mask.h
index b728b116ff3..188686dc13d 100644
--- a/mindspore/core/ops/dropout_do_mask.h
+++ b/mindspore/core/ops/dropout_do_mask.h
@@ -25,7 +25,7 @@
 
 namespace mindspore {
 namespace ops {
-class DropoutDoMask : public PrimitiveC {
+class MS_CORE_API DropoutDoMask : public PrimitiveC {
  public:
   DropoutDoMask() : PrimitiveC(prim::kPrimDropoutDoMask->name()) {}
   ~DropoutDoMask() = default;
diff --git a/mindspore/core/ops/dropout_gen_mask.h b/mindspore/core/ops/dropout_gen_mask.h
index 7f485d24cf6..d466dee4d88 100644
--- a/mindspore/core/ops/dropout_gen_mask.h
+++ b/mindspore/core/ops/dropout_gen_mask.h
@@ -25,7 +25,7 @@
 
 namespace mindspore {
 namespace ops {
-class DropoutGenMask : public PrimitiveC {
+class MS_CORE_API DropoutGenMask : public PrimitiveC {
  public:
   DropoutGenMask() : PrimitiveC(prim::kPrimDropoutGenMask->name()) {}
   ~DropoutGenMask() = default;
diff --git a/mindspore/core/ops/dtype.h b/mindspore/core/ops/dtype.h
index e7029a6676c..e2818a8e73d 100644
--- a/mindspore/core/ops/dtype.h
+++ b/mindspore/core/ops/dtype.h
@@ -26,7 +26,7 @@
 
 namespace mindspore {
 namespace ops {
-class DType : public PrimitiveC {
+class MS_CORE_API DType : public PrimitiveC {
  public:
   DType() : PrimitiveC(prim::kPrimDType->name()) { InitIOName({"x"}, {"output"}); }
   ~DType() = default;
diff --git a/mindspore/core/ops/dynamic_broadcast_gradient_args.h b/mindspore/core/ops/dynamic_broadcast_gradient_args.h
index ce1d1863d05..37b1d14aee8 100644
--- a/mindspore/core/ops/dynamic_broadcast_gradient_args.h
+++ b/mindspore/core/ops/dynamic_broadcast_gradient_args.h
@@ -23,7 +23,7 @@
 
 namespace mindspore {
 namespace ops {
-class DynamicBroadcastGradientArgs : public PrimitiveC {
+class MS_CORE_API DynamicBroadcastGradientArgs : public PrimitiveC {
  public:
   DynamicBroadcastGradientArgs() : PrimitiveC(prim::kPrimDynamicBroadcastGradientArgs->name()) {}
   ~DynamicBroadcastGradientArgs() = default;
diff --git a/mindspore/core/ops/eltwise.h b/mindspore/core/ops/eltwise.h
index ec0347a9f2b..1de61eff97f 100644
--- a/mindspore/core/ops/eltwise.h
+++ b/mindspore/core/ops/eltwise.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameEltwise = "Eltwise";
-class Eltwise : public PrimitiveC {
+class MS_CORE_API Eltwise : public PrimitiveC {
  public:
   Eltwise() : PrimitiveC(kNameEltwise) {}
   ~Eltwise() = default;
diff --git a/mindspore/core/ops/elu.h b/mindspore/core/ops/elu.h
index 3da8c1b202d..39d1126aec7 100644
--- a/mindspore/core/ops/elu.h
+++ b/mindspore/core/ops/elu.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameElu = "Elu";
-class Elu : public PrimitiveC {
+class MS_CORE_API Elu : public PrimitiveC {
  public:
   Elu() : PrimitiveC(kNameElu) {}
   ~Elu() = default;
diff --git a/mindspore/core/ops/embedding_lookup.h b/mindspore/core/ops/embedding_lookup.h
index 9997232fc1f..6a742a1d8f8 100644
--- a/mindspore/core/ops/embedding_lookup.h
+++ b/mindspore/core/ops/embedding_lookup.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameEmbeddingLookup = "EmbeddingLookup";
-class EmbeddingLookup : public PrimitiveC {
+class MS_CORE_API EmbeddingLookup : public PrimitiveC {
  public:
   EmbeddingLookup() : PrimitiveC(kNameEmbeddingLookup) { InitIOName({"params", "indices", "offset"}, {"output"}); }
   ~EmbeddingLookup() = default;
diff --git a/mindspore/core/ops/equal.h b/mindspore/core/ops/equal.h
index aebeae317b4..22382505a38 100644
--- a/mindspore/core/ops/equal.h
+++ b/mindspore/core/ops/equal.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameEqual = prim::kEqual;
-class Equal : public PrimitiveC {
+class MS_CORE_API Equal : public PrimitiveC {
  public:
   Equal() : PrimitiveC(prim::kPrimEqual->name()) { InitIOName({"x", "y"}, {"output"}); }
   ~Equal() = default;
diff --git a/mindspore/core/ops/erf.h b/mindspore/core/ops/erf.h
index 75b7d18c2e4..b186515f7d5 100644
--- a/mindspore/core/ops/erf.h
+++ b/mindspore/core/ops/erf.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameErf = "Erf";
-class Erf : public PrimitiveC {
+class MS_CORE_API Erf : public PrimitiveC {
  public:
   Erf() : PrimitiveC(kNameErf) { InitIOName({"x"}, {"y"}); }
   ~Erf() = default;
diff --git a/mindspore/core/ops/exp.h b/mindspore/core/ops/exp.h
index 84bf600e325..299d3e6b3b2 100644
--- a/mindspore/core/ops/exp.h
+++ b/mindspore/core/ops/exp.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameExp = prim::kExp;
-class Exp : public PrimitiveC {
+class MS_CORE_API Exp : public PrimitiveC {
  public:
   Exp() : PrimitiveC(prim::kPrimExp->name()) { InitIOName({"x"}, {"y"}); }
   explicit Exp(const std::string k_name) : PrimitiveC(k_name) { InitIOName({"x"}, {"y"}); }
diff --git a/mindspore/core/ops/expand_dims.h b/mindspore/core/ops/expand_dims.h
index 084d38d7b88..ce0a20f7164 100644
--- a/mindspore/core/ops/expand_dims.h
+++ b/mindspore/core/ops/expand_dims.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameExpandDims = "ExpandDims";
-class ExpandDims : public PrimitiveC {
+class MS_CORE_API ExpandDims : public PrimitiveC {
  public:
   ExpandDims() : PrimitiveC(kNameExpandDims) { InitIOName({"x", "axis"}, {"output"}); }
   ~ExpandDims() = default;
diff --git a/mindspore/core/ops/fake_quant_with_min_max_vars.h b/mindspore/core/ops/fake_quant_with_min_max_vars.h
index fb7091a69d5..d69c06b65d2 100644
--- a/mindspore/core/ops/fake_quant_with_min_max_vars.h
+++ b/mindspore/core/ops/fake_quant_with_min_max_vars.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameFakeQuantWithMinMaxVars = "FakeQuantWithMinMaxVars";
-class FakeQuantWithMinMaxVars : public PrimitiveC {
+class MS_CORE_API FakeQuantWithMinMaxVars : public PrimitiveC {
  public:
   FakeQuantWithMinMaxVars() : PrimitiveC(kNameFakeQuantWithMinMaxVars) {}
   ~FakeQuantWithMinMaxVars() = default;
diff --git a/mindspore/core/ops/fake_quant_with_min_max_vars_per_channel.h b/mindspore/core/ops/fake_quant_with_min_max_vars_per_channel.h
index ebfae11f5ca..95ae3256182 100644
--- a/mindspore/core/ops/fake_quant_with_min_max_vars_per_channel.h
+++ b/mindspore/core/ops/fake_quant_with_min_max_vars_per_channel.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameFakeQuantWithMinMaxVarsPerChannel = "FakeQuantWithMinMaxVarsPerChannel";
-class FakeQuantWithMinMaxVarsPerChannel : public PrimitiveC {
+class MS_CORE_API FakeQuantWithMinMaxVarsPerChannel : public PrimitiveC {
  public:
   FakeQuantWithMinMaxVarsPerChannel() : PrimitiveC(kNameFakeQuantWithMinMaxVarsPerChannel) {}
   ~FakeQuantWithMinMaxVarsPerChannel() = default;
diff --git a/mindspore/core/ops/fft_imag.h b/mindspore/core/ops/fft_imag.h
index c0a3d2301a3..c72a60cbb0e 100644
--- a/mindspore/core/ops/fft_imag.h
+++ b/mindspore/core/ops/fft_imag.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameFftImag = "FftImag";
-class FftImag : public PrimitiveC {
+class MS_CORE_API FftImag : public PrimitiveC {
  public:
   FftImag() : PrimitiveC(kNameFftImag) {}
   ~FftImag() = default;
diff --git a/mindspore/core/ops/fft_real.h b/mindspore/core/ops/fft_real.h
index 5aee6082196..ffae2ea2367 100644
--- a/mindspore/core/ops/fft_real.h
+++ b/mindspore/core/ops/fft_real.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameFftReal = "FftReal";
-class FftReal : public PrimitiveC {
+class MS_CORE_API FftReal : public PrimitiveC {
  public:
   FftReal() : PrimitiveC(kNameFftReal) {}
   ~FftReal() = default;
diff --git a/mindspore/core/ops/fill.h b/mindspore/core/ops/fill.h
index c983c5a54d6..79db0a55a63 100644
--- a/mindspore/core/ops/fill.h
+++ b/mindspore/core/ops/fill.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameFill = "Fill";
-class Fill : public PrimitiveC {
+class MS_CORE_API Fill : public PrimitiveC {
  public:
   Fill() : PrimitiveC(kNameFill) {}
   ~Fill() = default;
diff --git a/mindspore/core/ops/flatten.h b/mindspore/core/ops/flatten.h
index 164e7ccda6a..da6cb8233f2 100644
--- a/mindspore/core/ops/flatten.h
+++ b/mindspore/core/ops/flatten.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameFlatten = "Flatten";
-class Flatten : public PrimitiveC {
+class MS_CORE_API Flatten : public PrimitiveC {
  public:
   Flatten() : PrimitiveC(kNameFlatten) {}
   ~Flatten() = default;
diff --git a/mindspore/core/ops/floor.h b/mindspore/core/ops/floor.h
index 67f8c222f7a..eae5d791411 100644
--- a/mindspore/core/ops/floor.h
+++ b/mindspore/core/ops/floor.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameFloor = "Floor";
-class Floor : public PrimitiveC {
+class MS_CORE_API Floor : public PrimitiveC {
  public:
   Floor() : PrimitiveC(kNameFloor) { InitIOName({"x"}, {"y"}); }
   ~Floor() = default;
diff --git a/mindspore/core/ops/floor_div.h b/mindspore/core/ops/floor_div.h
index 66a0ed94322..052947b831f 100644
--- a/mindspore/core/ops/floor_div.h
+++ b/mindspore/core/ops/floor_div.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameFloorDiv = "FloorDiv";
-class FloorDiv : public PrimitiveC {
+class MS_CORE_API FloorDiv : public PrimitiveC {
  public:
   FloorDiv() : PrimitiveC(kNameFloorDiv) { InitIOName({"x", "y"}, {"output"}); }
   ~FloorDiv() = default;
diff --git a/mindspore/core/ops/floor_mod.h b/mindspore/core/ops/floor_mod.h
index 0410873412c..3b7561663e3 100644
--- a/mindspore/core/ops/floor_mod.h
+++ b/mindspore/core/ops/floor_mod.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameFloorMod = "FloorMod";
-class FloorMod : public PrimitiveC {
+class MS_CORE_API FloorMod : public PrimitiveC {
  public:
   FloorMod() : PrimitiveC(kNameFloorMod) { InitIOName({"x", "y"}, {"output"}); }
   ~FloorMod() = default;
diff --git a/mindspore/core/ops/fused_batch_norm.h b/mindspore/core/ops/fused_batch_norm.h
index 0642ab73198..bce32464873 100644
--- a/mindspore/core/ops/fused_batch_norm.h
+++ b/mindspore/core/ops/fused_batch_norm.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameFusedBatchNorm = "FusedBatchNorm";
-class FusedBatchNorm : public PrimitiveC {
+class MS_CORE_API FusedBatchNorm : public PrimitiveC {
  public:
   FusedBatchNorm() : PrimitiveC(kNameFusedBatchNorm) {
     InitIOName({"x", "scale", "b", "mean", "variance"},
diff --git a/mindspore/core/ops/fusion/activation.h b/mindspore/core/ops/fusion/activation.h
index b153197b9e2..b893e4b3133 100644
--- a/mindspore/core/ops/fusion/activation.h
+++ b/mindspore/core/ops/fusion/activation.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameActivation = "Activation";
-class Activation : public PrimitiveC {
+class MS_CORE_API Activation : public PrimitiveC {
  public:
   Activation() : PrimitiveC(kNameActivation) {}
   ~Activation() = default;
diff --git a/mindspore/core/ops/fusion/add_fusion.h b/mindspore/core/ops/fusion/add_fusion.h
index 2131549dc84..f52a6717abd 100644
--- a/mindspore/core/ops/fusion/add_fusion.h
+++ b/mindspore/core/ops/fusion/add_fusion.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameAddFusion = "AddFusion";
-class AddFusion : public Add {
+class MS_CORE_API AddFusion : public Add {
  public:
   AddFusion() : Add(kNameAddFusion) { InitIOName({"x", "y"}, {"output"}); }
   ~AddFusion() = default;
diff --git a/mindspore/core/ops/fusion/adder_fusion.h b/mindspore/core/ops/fusion/adder_fusion.h
index 1d01ff4ccf8..633aecc7055 100644
--- a/mindspore/core/ops/fusion/adder_fusion.h
+++ b/mindspore/core/ops/fusion/adder_fusion.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameAdderFusion = "AdderFusion";
-class AdderFusion : public Adder {
+class MS_CORE_API AdderFusion : public Adder {
  public:
   AdderFusion() : Adder(kNameAdderFusion) {}
   ~AdderFusion() = default;
diff --git a/mindspore/core/ops/fusion/arg_max_fusion.h b/mindspore/core/ops/fusion/arg_max_fusion.h
index 8ccf011914c..25edea542d3 100644
--- a/mindspore/core/ops/fusion/arg_max_fusion.h
+++ b/mindspore/core/ops/fusion/arg_max_fusion.h
@@ -24,7 +24,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameArgMaxFusion = "ArgMaxFusion";
-class ArgMaxFusion : public ArgMax {
+class MS_CORE_API ArgMaxFusion : public ArgMax {
  public:
   ArgMaxFusion() : ArgMax(kNameArgMaxFusion) { InitIOName({"x"}, {"output"}); }
   ~ArgMaxFusion() = default;
diff --git a/mindspore/core/ops/fusion/arg_min_fusion.h b/mindspore/core/ops/fusion/arg_min_fusion.h
index f165cd530c4..10e18e9b337 100644
--- a/mindspore/core/ops/fusion/arg_min_fusion.h
+++ b/mindspore/core/ops/fusion/arg_min_fusion.h
@@ -24,7 +24,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameArgMinFusion = "ArgMinFusion";
-class ArgMinFusion : public ArgMin {
+class MS_CORE_API ArgMinFusion : public ArgMin {
  public:
   ArgMinFusion() : ArgMin(kNameArgMinFusion) { InitIOName({"x"}, {"output"}); }
   ~ArgMinFusion() = default;
diff --git a/mindspore/core/ops/fusion/avg_pool_fusion.h b/mindspore/core/ops/fusion/avg_pool_fusion.h
index 04f2e929dd1..c6e3e679c36 100644
--- a/mindspore/core/ops/fusion/avg_pool_fusion.h
+++ b/mindspore/core/ops/fusion/avg_pool_fusion.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameAvgPoolFusion = "AvgPoolFusion";
-class AvgPoolFusion : public AvgPool {
+class MS_CORE_API AvgPoolFusion : public AvgPool {
  public:
   AvgPoolFusion() : AvgPool(kNameAvgPoolFusion) { InitIOName({"x"}, {"output"}); }
   ~AvgPoolFusion() = default;
diff --git a/mindspore/core/ops/fusion/conv2d_backprop_filter_fusion.h b/mindspore/core/ops/fusion/conv2d_backprop_filter_fusion.h
index de8c7851bb5..c68b3c313de 100644
--- a/mindspore/core/ops/fusion/conv2d_backprop_filter_fusion.h
+++ b/mindspore/core/ops/fusion/conv2d_backprop_filter_fusion.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameConv2DBackpropFilterFusion = "Conv2DBackpropFilterFusion";
-class Conv2DBackpropFilterFusion : public Conv2DBackpropFilter {
+class MS_CORE_API Conv2DBackpropFilterFusion : public Conv2DBackpropFilter {
  public:
   Conv2DBackpropFilterFusion() : Conv2DBackpropFilter(kNameConv2DBackpropFilterFusion) {
     InitIOName({"out_backprop", "input", "filter_sizes"}, {"output"});
diff --git a/mindspore/core/ops/fusion/conv2d_backprop_input_fusion.h b/mindspore/core/ops/fusion/conv2d_backprop_input_fusion.h
index f76858e3f37..3bec6953bd1 100644
--- a/mindspore/core/ops/fusion/conv2d_backprop_input_fusion.h
+++ b/mindspore/core/ops/fusion/conv2d_backprop_input_fusion.h
@@ -24,7 +24,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameConv2DBackpropInputFusion = "Conv2DBackpropInputFusion";
-class Conv2DBackpropInputFusion : public Conv2DBackpropInput {
+class MS_CORE_API Conv2DBackpropInputFusion : public Conv2DBackpropInput {
  public:
   Conv2DBackpropInputFusion() : Conv2DBackpropInput(kNameConv2DBackpropInputFusion) {}
   ~Conv2DBackpropInputFusion() = default;
diff --git a/mindspore/core/ops/fusion/conv2d_fusion.h b/mindspore/core/ops/fusion/conv2d_fusion.h
index 9d4a4561de1..60fbfb628b9 100644
--- a/mindspore/core/ops/fusion/conv2d_fusion.h
+++ b/mindspore/core/ops/fusion/conv2d_fusion.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameConv2DFusion = "Conv2DFusion";
-class Conv2DFusion : public Conv2D {
+class MS_CORE_API Conv2DFusion : public Conv2D {
  public:
   Conv2DFusion() : Conv2D(kNameConv2DFusion) {}
   ~Conv2DFusion() = default;
diff --git a/mindspore/core/ops/fusion/conv2d_transpose_fusion.h b/mindspore/core/ops/fusion/conv2d_transpose_fusion.h
index daec73ccee2..2d5c3f58d63 100644
--- a/mindspore/core/ops/fusion/conv2d_transpose_fusion.h
+++ b/mindspore/core/ops/fusion/conv2d_transpose_fusion.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameConv2dTransposeFusion = "Conv2dTransposeFusion";
-class Conv2dTransposeFusion : public Conv2DTranspose {
+class MS_CORE_API Conv2dTransposeFusion : public Conv2DTranspose {
  public:
   Conv2dTransposeFusion() : Conv2DTranspose(kNameConv2dTransposeFusion) {
     InitIOName({"out_backprop", "filter", "input_sizes"}, {"output"});
diff --git a/mindspore/core/ops/fusion/div_fusion.h b/mindspore/core/ops/fusion/div_fusion.h
index c04ec4c0ed9..8e2b63ee2f9 100644
--- a/mindspore/core/ops/fusion/div_fusion.h
+++ b/mindspore/core/ops/fusion/div_fusion.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameDivFusion = "DivFusion";
-class DivFusion : public Div {
+class MS_CORE_API DivFusion : public Div {
  public:
   DivFusion() : Div(kNameDivFusion) {}
   ~DivFusion() = default;
diff --git a/mindspore/core/ops/fusion/embedding_lookup_fusion.h b/mindspore/core/ops/fusion/embedding_lookup_fusion.h
index a88c9e05aa8..62e686361af 100644
--- a/mindspore/core/ops/fusion/embedding_lookup_fusion.h
+++ b/mindspore/core/ops/fusion/embedding_lookup_fusion.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameEmbeddingLookupFusion = "EmbeddingLookupFusion";
-class EmbeddingLookupFusion : public PrimitiveC {
+class MS_CORE_API EmbeddingLookupFusion : public PrimitiveC {
  public:
   EmbeddingLookupFusion() : PrimitiveC(kNameEmbeddingLookupFusion) {
     InitIOName({"params", "indices", "offset"}, {"output"});
diff --git a/mindspore/core/ops/fusion/exp_fusion.h b/mindspore/core/ops/fusion/exp_fusion.h
index be0b7bb3132..21a0675311d 100644
--- a/mindspore/core/ops/fusion/exp_fusion.h
+++ b/mindspore/core/ops/fusion/exp_fusion.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameExpFusion = "ExpFusion";
-class ExpFusion : public Exp {
+class MS_CORE_API ExpFusion : public Exp {
  public:
   ExpFusion() : Exp(kNameExpFusion) { InitIOName({"x"}, {"y"}); }
   ~ExpFusion() = default;
diff --git a/mindspore/core/ops/fusion/full_connection.h b/mindspore/core/ops/fusion/full_connection.h
index 80984b03a8a..ea9ce4b083e 100644
--- a/mindspore/core/ops/fusion/full_connection.h
+++ b/mindspore/core/ops/fusion/full_connection.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameFullConnection = "FullConnection";
-class FullConnection : public PrimitiveC {
+class MS_CORE_API FullConnection : public PrimitiveC {
  public:
   FullConnection() : PrimitiveC(kNameFullConnection) { InitIOName({"x1", "x2", "b"}, {"output"}); }
   ~FullConnection() = default;
diff --git a/mindspore/core/ops/fusion/l2_normalize_fusion.h b/mindspore/core/ops/fusion/l2_normalize_fusion.h
index 6afa60b77de..430ebe4489e 100644
--- a/mindspore/core/ops/fusion/l2_normalize_fusion.h
+++ b/mindspore/core/ops/fusion/l2_normalize_fusion.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameL2NormalizeFusion = "L2NormalizeFusion";
-class L2NormalizeFusion : public L2Normalize {
+class MS_CORE_API L2NormalizeFusion : public L2Normalize {
  public:
   L2NormalizeFusion() : L2Normalize(kNameL2NormalizeFusion) {}
   ~L2NormalizeFusion() = default;
diff --git a/mindspore/core/ops/fusion/layer_norm_fusion.h b/mindspore/core/ops/fusion/layer_norm_fusion.h
index f83f1721cc7..0c256074e24 100644
--- a/mindspore/core/ops/fusion/layer_norm_fusion.h
+++ b/mindspore/core/ops/fusion/layer_norm_fusion.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameLayerNormFusion = "LayerNormFusion";
-class LayerNormFusion : public LayerNorm {
+class MS_CORE_API LayerNormFusion : public LayerNorm {
  public:
   LayerNormFusion() : LayerNorm(kNameLayerNormFusion) {}
   ~LayerNormFusion() = default;
diff --git a/mindspore/core/ops/fusion/max_pool_fusion.h b/mindspore/core/ops/fusion/max_pool_fusion.h
index bf2ea3db28c..6397a9817e8 100644
--- a/mindspore/core/ops/fusion/max_pool_fusion.h
+++ b/mindspore/core/ops/fusion/max_pool_fusion.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameMaxPoolFusion = "MaxPoolFusion";
-class MaxPoolFusion : public MaxPool {
+class MS_CORE_API MaxPoolFusion : public MaxPool {
  public:
   MaxPoolFusion() : MaxPool(kNameMaxPoolFusion) { InitIOName({"x"}, {"output"}); }
   ~MaxPoolFusion() = default;
diff --git a/mindspore/core/ops/fusion/mul_fusion.h b/mindspore/core/ops/fusion/mul_fusion.h
index 592865978ca..106d0b6c0fa 100644
--- a/mindspore/core/ops/fusion/mul_fusion.h
+++ b/mindspore/core/ops/fusion/mul_fusion.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameMulFusion = "MulFusion";
-class MulFusion : public Mul {
+class MS_CORE_API MulFusion : public Mul {
  public:
   MulFusion() : Mul(kNameMulFusion) { InitIOName({"x", "y"}, {"output"}); }
   ~MulFusion() = default;
diff --git a/mindspore/core/ops/fusion/pad_fusion.h b/mindspore/core/ops/fusion/pad_fusion.h
index e64e2cf3461..96707868b92 100644
--- a/mindspore/core/ops/fusion/pad_fusion.h
+++ b/mindspore/core/ops/fusion/pad_fusion.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNamePadFusion = "PadFusion";
-class PadFusion : public Pad {
+class MS_CORE_API PadFusion : public Pad {
  public:
   PadFusion() : Pad(kNamePadFusion) { InitIOName({"x"}, {"y"}); }
   ~PadFusion() = default;
diff --git a/mindspore/core/ops/fusion/partial_fusion.h b/mindspore/core/ops/fusion/partial_fusion.h
index 4acbf776d6e..e7d0b521037 100644
--- a/mindspore/core/ops/fusion/partial_fusion.h
+++ b/mindspore/core/ops/fusion/partial_fusion.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNamePartialFusion = "PartialFusion";
-class PartialFusion : public PrimitiveC {
+class MS_CORE_API PartialFusion : public PrimitiveC {
  public:
   PartialFusion() : PrimitiveC(kNamePartialFusion) {}
   ~PartialFusion() = default;
diff --git a/mindspore/core/ops/fusion/pow_fusion.h b/mindspore/core/ops/fusion/pow_fusion.h
index 695f46a74e8..a06d5560e0f 100644
--- a/mindspore/core/ops/fusion/pow_fusion.h
+++ b/mindspore/core/ops/fusion/pow_fusion.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNamePowFusion = "PowFusion";
-class PowFusion : public Pow {
+class MS_CORE_API PowFusion : public Pow {
  public:
   PowFusion() : Pow(kNamePowFusion) {}
   ~PowFusion() = default;
diff --git a/mindspore/core/ops/fusion/prelu_fusion.h b/mindspore/core/ops/fusion/prelu_fusion.h
index 8f4243c6e1d..a054ea639ca 100644
--- a/mindspore/core/ops/fusion/prelu_fusion.h
+++ b/mindspore/core/ops/fusion/prelu_fusion.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNamePReLUFusion = "PReLUFusion";
-class PReLUFusion : public PReLU {
+class MS_CORE_API PReLUFusion : public PReLU {
  public:
   PReLUFusion() : PReLU(kNamePReLUFusion) {}
   ~PReLUFusion() = default;
diff --git a/mindspore/core/ops/fusion/reduce_fusion.h b/mindspore/core/ops/fusion/reduce_fusion.h
index 18657ae9643..81793fb7a58 100644
--- a/mindspore/core/ops/fusion/reduce_fusion.h
+++ b/mindspore/core/ops/fusion/reduce_fusion.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameReduceFusion = "ReduceFusion";
-class ReduceFusion : public Reduce {
+class MS_CORE_API ReduceFusion : public Reduce {
  public:
   ReduceFusion() : Reduce(kNameReduceFusion) {}
   ~ReduceFusion() = default;
diff --git a/mindspore/core/ops/fusion/scale_fusion.h b/mindspore/core/ops/fusion/scale_fusion.h
index f9571c7f2f0..fd5282aaed8 100644
--- a/mindspore/core/ops/fusion/scale_fusion.h
+++ b/mindspore/core/ops/fusion/scale_fusion.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameScaleFusion = "ScaleFusion";
-class ScaleFusion : public Scale {
+class MS_CORE_API ScaleFusion : public Scale {
  public:
   ScaleFusion() : Scale(kNameScaleFusion) {}
   ~ScaleFusion() = default;
diff --git a/mindspore/core/ops/fusion/slice_fusion.h b/mindspore/core/ops/fusion/slice_fusion.h
index 61155136cb3..e924e285e58 100644
--- a/mindspore/core/ops/fusion/slice_fusion.h
+++ b/mindspore/core/ops/fusion/slice_fusion.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSliceFusion = "SliceFusion";
-class SliceFusion : public PrimitiveC {
+class MS_CORE_API SliceFusion : public PrimitiveC {
  public:
   SliceFusion() : PrimitiveC(kNameSliceFusion) { InitIOName({"x", "begin", "size"}, {"output"}); }
   ~SliceFusion() = default;
diff --git a/mindspore/core/ops/fusion/sub_fusion.h b/mindspore/core/ops/fusion/sub_fusion.h
index 9bbb7976896..07d060409a7 100644
--- a/mindspore/core/ops/fusion/sub_fusion.h
+++ b/mindspore/core/ops/fusion/sub_fusion.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSubFusion = "SubFusion";
-class SubFusion : public Sub {
+class MS_CORE_API SubFusion : public Sub {
  public:
   SubFusion() : Sub(kNameSubFusion) {}
   ~SubFusion() = default;
diff --git a/mindspore/core/ops/fusion/tile_fusion.h b/mindspore/core/ops/fusion/tile_fusion.h
index def292c62d9..02338810107 100644
--- a/mindspore/core/ops/fusion/tile_fusion.h
+++ b/mindspore/core/ops/fusion/tile_fusion.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameTileFusion = "TileFusion";
-class TileFusion : public Tile {
+class MS_CORE_API TileFusion : public Tile {
  public:
   TileFusion() : Tile(kNameTileFusion) {}
   ~TileFusion() = default;
diff --git a/mindspore/core/ops/fusion/topk_fusion.h b/mindspore/core/ops/fusion/topk_fusion.h
index 03d7801d19f..47e953b12bf 100644
--- a/mindspore/core/ops/fusion/topk_fusion.h
+++ b/mindspore/core/ops/fusion/topk_fusion.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameTopKFusion = "TopKFusion";
-class TopKFusion : public TopK {
+class MS_CORE_API TopKFusion : public TopK {
  public:
   TopKFusion() : TopK(kNameTopKFusion) {}
   ~TopKFusion() = default;
diff --git a/mindspore/core/ops/gather.h b/mindspore/core/ops/gather.h
index ea46370cf3f..89e31321d17 100644
--- a/mindspore/core/ops/gather.h
+++ b/mindspore/core/ops/gather.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameGather = "Gather";
-class Gather : public PrimitiveC {
+class MS_CORE_API Gather : public PrimitiveC {
  public:
   Gather() : PrimitiveC(kNameGather) { InitIOName({"param", "indices", "axis"}, {"output"}); }
   ~Gather() = default;
diff --git a/mindspore/core/ops/gather_d.h b/mindspore/core/ops/gather_d.h
index 76021af100f..dacc2c26f6b 100644
--- a/mindspore/core/ops/gather_d.h
+++ b/mindspore/core/ops/gather_d.h
@@ -27,7 +27,7 @@
 
 namespace mindspore {
 namespace ops {
-class GatherD : public PrimitiveC {
+class MS_CORE_API GatherD : public PrimitiveC {
  public:
   GatherD() : PrimitiveC(prim::kPrimGatherD->name()) { InitIOName({"x", "dim", "index"}, {"output"}); }
   ~GatherD() = default;
diff --git a/mindspore/core/ops/gather_nd.h b/mindspore/core/ops/gather_nd.h
index 413c9d8f4e2..834ce4404f7 100644
--- a/mindspore/core/ops/gather_nd.h
+++ b/mindspore/core/ops/gather_nd.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameGatherNd = "GatherNd";
-class GatherNd : public PrimitiveC {
+class MS_CORE_API GatherNd : public PrimitiveC {
  public:
   GatherNd() : PrimitiveC(kNameGatherNd) { InitIOName({"input_x", "indices"}, {"y"}); }
   ~GatherNd() = default;
diff --git a/mindspore/core/ops/gelu.h b/mindspore/core/ops/gelu.h
index 17d83ac7e4b..2a00410b316 100644
--- a/mindspore/core/ops/gelu.h
+++ b/mindspore/core/ops/gelu.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameGeLU = prim::kGeLU;
-class GeLU : public PrimitiveC {
+class MS_CORE_API GeLU : public PrimitiveC {
  public:
   GeLU() : PrimitiveC(kNameGeLU) { InitIOName({"x"}, {"output"}); }
   ~GeLU() = default;
diff --git a/mindspore/core/ops/getnext.h b/mindspore/core/ops/getnext.h
index 78acd30f76f..582ce34873d 100644
--- a/mindspore/core/ops/getnext.h
+++ b/mindspore/core/ops/getnext.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameGetNext = prim::kGetNext;
-class GetNext : public PrimitiveC {
+class MS_CORE_API GetNext : public PrimitiveC {
  public:
   GetNext() : PrimitiveC(prim::kPrimGetNext->name()) {}
   ~GetNext() = default;
diff --git a/mindspore/core/ops/glu.h b/mindspore/core/ops/glu.h
index f8929d7b9b7..8a55d864ff0 100644
--- a/mindspore/core/ops/glu.h
+++ b/mindspore/core/ops/glu.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameGLU = prim::kGLU;
-class GLU : public PrimitiveC {
+class MS_CORE_API GLU : public PrimitiveC {
  public:
   GLU() : PrimitiveC(kNameGLU) { InitIOName({"x"}, {"output"}); }
   ~GLU() = default;
diff --git a/mindspore/core/ops/grad/abs_grad.h b/mindspore/core/ops/grad/abs_grad.h
index 82c7ed13572..7439158c913 100644
--- a/mindspore/core/ops/grad/abs_grad.h
+++ b/mindspore/core/ops/grad/abs_grad.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameAbsGrad = "AbsGrad";
-class AbsGrad : public PrimitiveC {
+class MS_CORE_API AbsGrad : public PrimitiveC {
  public:
   AbsGrad() : PrimitiveC(kNameAbsGrad) {}
   ~AbsGrad() = default;
diff --git a/mindspore/core/ops/grad/activation_grad.h b/mindspore/core/ops/grad/activation_grad.h
index d1f71b5b738..9ef3709198d 100644
--- a/mindspore/core/ops/grad/activation_grad.h
+++ b/mindspore/core/ops/grad/activation_grad.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameActivationGrad = "ActivationGrad";
-class ActivationGrad : public PrimitiveC {
+class MS_CORE_API ActivationGrad : public PrimitiveC {
  public:
   ActivationGrad() : PrimitiveC(kNameActivationGrad) {}
   ~ActivationGrad() = default;
diff --git a/mindspore/core/ops/grad/add_grad.h b/mindspore/core/ops/grad/add_grad.h
index ff5afd99c3b..ebe8ff71cb7 100644
--- a/mindspore/core/ops/grad/add_grad.h
+++ b/mindspore/core/ops/grad/add_grad.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameAddGrad = "AddGrad";
-class AddGrad : public PrimitiveC {
+class MS_CORE_API AddGrad : public PrimitiveC {
  public:
   AddGrad() : PrimitiveC(kNameAddGrad) {}
   ~AddGrad() = default;
diff --git a/mindspore/core/ops/grad/avg_pool_3d_grad.h b/mindspore/core/ops/grad/avg_pool_3d_grad.h
index 697cd26ee09..c25df35262f 100644
--- a/mindspore/core/ops/grad/avg_pool_3d_grad.h
+++ b/mindspore/core/ops/grad/avg_pool_3d_grad.h
@@ -27,7 +27,7 @@
 
 namespace mindspore {
 namespace ops {
-class AvgPool3DGrad : public PrimitiveC {
+class MS_CORE_API AvgPool3DGrad : public PrimitiveC {
  public:
   AvgPool3DGrad() : PrimitiveC(prim::kPrimAvgPool3DGrad->name()) {
     InitIOName({"origin_input_size", "grad"}, {"output"});
diff --git a/mindspore/core/ops/grad/avg_pool_grad.h b/mindspore/core/ops/grad/avg_pool_grad.h
index b408aa84e04..8f2abd5fe35 100644
--- a/mindspore/core/ops/grad/avg_pool_grad.h
+++ b/mindspore/core/ops/grad/avg_pool_grad.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameAvgPoolGrad = "AvgPoolGrad";
-class AvgPoolGrad : public PoolGrad {
+class MS_CORE_API AvgPoolGrad : public PoolGrad {
  public:
   AvgPoolGrad() : PoolGrad(kNameAvgPoolGrad) { InitIOName({"x_origin", "out_origin", "grad"}, {"output"}); }
   ~AvgPoolGrad() = default;
diff --git a/mindspore/core/ops/grad/batch_norm_grad.h b/mindspore/core/ops/grad/batch_norm_grad.h
index adc1a157f67..e92dbb678a0 100644
--- a/mindspore/core/ops/grad/batch_norm_grad.h
+++ b/mindspore/core/ops/grad/batch_norm_grad.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameBatchNormGrad = "BatchNormGrad";
-class BatchNormGrad : public PrimitiveC {
+class MS_CORE_API BatchNormGrad : public PrimitiveC {
  public:
   BatchNormGrad() : PrimitiveC(kNameBatchNormGrad) {}
   ~BatchNormGrad() = default;
diff --git a/mindspore/core/ops/grad/bias_add_grad.h b/mindspore/core/ops/grad/bias_add_grad.h
index b1df50986de..f49e2daecdb 100644
--- a/mindspore/core/ops/grad/bias_add_grad.h
+++ b/mindspore/core/ops/grad/bias_add_grad.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameBiasAddGrad = prim::kBiasAddGrad;
-class BiasAddGrad : public PrimitiveC {
+class MS_CORE_API BiasAddGrad : public PrimitiveC {
  public:
   BiasAddGrad() : PrimitiveC(prim::kPrimBiasAddGrad->name()) { InitIOName({"x"}, {"output"}); }
   ~BiasAddGrad() = default;
diff --git a/mindspore/core/ops/grad/binary_cross_entropy_grad.h b/mindspore/core/ops/grad/binary_cross_entropy_grad.h
index 3378febfb10..7bce73a760d 100644
--- a/mindspore/core/ops/grad/binary_cross_entropy_grad.h
+++ b/mindspore/core/ops/grad/binary_cross_entropy_grad.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameBinaryCrossEntropyGrad = "BinaryCrossEntropyGrad";
-class BinaryCrossEntropyGrad : public PrimitiveC {
+class MS_CORE_API BinaryCrossEntropyGrad : public PrimitiveC {
  public:
   BinaryCrossEntropyGrad() : PrimitiveC(kNameBinaryCrossEntropyGrad) {}
   ~BinaryCrossEntropyGrad() = default;
diff --git a/mindspore/core/ops/grad/bn_grad.h b/mindspore/core/ops/grad/bn_grad.h
index 2dee03a8b05..38ce31f6bdf 100644
--- a/mindspore/core/ops/grad/bn_grad.h
+++ b/mindspore/core/ops/grad/bn_grad.h
@@ -24,7 +24,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameBNGrad = "BNGrad";
-class BNGrad : public PrimitiveC {
+class MS_CORE_API BNGrad : public PrimitiveC {
  public:
   BNGrad() : PrimitiveC(kNameBNGrad) {}
   ~BNGrad() = default;
diff --git a/mindspore/core/ops/grad/conv2d_backprop_filter.h b/mindspore/core/ops/grad/conv2d_backprop_filter.h
index 3dac2274aa5..51c80dff9e1 100644
--- a/mindspore/core/ops/grad/conv2d_backprop_filter.h
+++ b/mindspore/core/ops/grad/conv2d_backprop_filter.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameConv2DBackpropFilter = "Conv2DBackpropFilter";
-class Conv2DBackpropFilter : public PrimitiveC {
+class MS_CORE_API Conv2DBackpropFilter : public PrimitiveC {
  public:
   Conv2DBackpropFilter() : PrimitiveC(kNameConv2DBackpropFilter) {
     InitIOName({"out_backprop", "input", "filter_sizes"}, {"output"});
diff --git a/mindspore/core/ops/grad/conv2d_backprop_input.h b/mindspore/core/ops/grad/conv2d_backprop_input.h
index 1889c2d5b22..b7ff96627c2 100644
--- a/mindspore/core/ops/grad/conv2d_backprop_input.h
+++ b/mindspore/core/ops/grad/conv2d_backprop_input.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameConv2DBackpropInput = "Conv2DBackpropInput";
-class Conv2DBackpropInput : public PrimitiveC {
+class MS_CORE_API Conv2DBackpropInput : public PrimitiveC {
  public:
   explicit Conv2DBackpropInput(const std::string &k_name = kNameConv2DBackpropInput) : PrimitiveC(k_name) {
     InitIOName({"out_backprop", "filter", "input_sizes"}, {"output"});
diff --git a/mindspore/core/ops/grad/de_conv2d_grad_filter.h b/mindspore/core/ops/grad/de_conv2d_grad_filter.h
index 24ee57084f3..9c9be83281e 100644
--- a/mindspore/core/ops/grad/de_conv2d_grad_filter.h
+++ b/mindspore/core/ops/grad/de_conv2d_grad_filter.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameDeConv2DGradFilter = "DeConv2DGradFilter";
-class DeConv2DGradFilter : public PrimitiveC {
+class MS_CORE_API DeConv2DGradFilter : public PrimitiveC {
  public:
   DeConv2DGradFilter() : PrimitiveC(kNameDeConv2DGradFilter) {}
   ~DeConv2DGradFilter() = default;
diff --git a/mindspore/core/ops/grad/div_grad.h b/mindspore/core/ops/grad/div_grad.h
index 1ec463808ae..dd79e0c79d7 100644
--- a/mindspore/core/ops/grad/div_grad.h
+++ b/mindspore/core/ops/grad/div_grad.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameDivGrad = "DivGrad";
-class DivGrad : public PrimitiveC {
+class MS_CORE_API DivGrad : public PrimitiveC {
  public:
   DivGrad() : PrimitiveC(kNameDivGrad) {}
   ~DivGrad() = default;
diff --git a/mindspore/core/ops/grad/dropout_grad.h b/mindspore/core/ops/grad/dropout_grad.h
index dcdd5fb6ee5..c7edff172aa 100644
--- a/mindspore/core/ops/grad/dropout_grad.h
+++ b/mindspore/core/ops/grad/dropout_grad.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameDropoutGrad = "DropoutGrad";
-class DropoutGrad : public PrimitiveC {
+class MS_CORE_API DropoutGrad : public PrimitiveC {
  public:
   DropoutGrad() : PrimitiveC(kNameDropoutGrad) {}
   ~DropoutGrad() = default;
diff --git a/mindspore/core/ops/grad/flatten_grad.h b/mindspore/core/ops/grad/flatten_grad.h
index 1a28b9213e5..f492a53efb6 100644
--- a/mindspore/core/ops/grad/flatten_grad.h
+++ b/mindspore/core/ops/grad/flatten_grad.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameFlattenGrad = "FlattenGrad";
-class FlattenGrad : public PrimitiveC {
+class MS_CORE_API FlattenGrad : public PrimitiveC {
  public:
   FlattenGrad() : PrimitiveC(kNameFlattenGrad) { InitIOName({"x", "shape"}, {"output"}); }
   ~FlattenGrad() = default;
diff --git a/mindspore/core/ops/grad/group_conv2d_grad_input.h b/mindspore/core/ops/grad/group_conv2d_grad_input.h
index 998228e188f..bb95f977ac3 100644
--- a/mindspore/core/ops/grad/group_conv2d_grad_input.h
+++ b/mindspore/core/ops/grad/group_conv2d_grad_input.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameGroupConv2DGradInput = "GroupConv2DGradInput";
-class GroupConv2DGradInput : public PrimitiveC {
+class MS_CORE_API GroupConv2DGradInput : public PrimitiveC {
  public:
   GroupConv2DGradInput() : PrimitiveC(kNameGroupConv2DGradInput) {}
   ~GroupConv2DGradInput() = default;
diff --git a/mindspore/core/ops/grad/hshrink_grad.h b/mindspore/core/ops/grad/hshrink_grad.h
index 210b8b47965..45e92b79b33 100644
--- a/mindspore/core/ops/grad/hshrink_grad.h
+++ b/mindspore/core/ops/grad/hshrink_grad.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameHShrinkGrad = "HShrinkGrad";
-class HShrinkGrad : public PrimitiveC {
+class MS_CORE_API HShrinkGrad : public PrimitiveC {
  public:
   HShrinkGrad() : PrimitiveC(kNameHShrinkGrad) { InitIOName({"gradients", "features"}, {"backprops"}); }
   ~HShrinkGrad() = default;
diff --git a/mindspore/core/ops/grad/hsigmoid_grad.h b/mindspore/core/ops/grad/hsigmoid_grad.h
index eb1ec65a41e..076fa642069 100644
--- a/mindspore/core/ops/grad/hsigmoid_grad.h
+++ b/mindspore/core/ops/grad/hsigmoid_grad.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameHSigmoidGrad = "HSigmoidGrad";
-class HSigmoidGrad : public PrimitiveC {
+class MS_CORE_API HSigmoidGrad : public PrimitiveC {
  public:
   HSigmoidGrad() : PrimitiveC(kNameHSigmoidGrad) { InitIOName({"grads", "input_x"}, {"output"}); }
   ~HSigmoidGrad() = default;
diff --git a/mindspore/core/ops/grad/layer_norm_grad.h b/mindspore/core/ops/grad/layer_norm_grad.h
index 0008ce2eb95..b99b0c681b5 100644
--- a/mindspore/core/ops/grad/layer_norm_grad.h
+++ b/mindspore/core/ops/grad/layer_norm_grad.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameLayerNormGrad = prim::kLayerNormGrad;
-class LayerNormGrad : public PrimitiveC {
+class MS_CORE_API LayerNormGrad : public PrimitiveC {
  public:
   LayerNormGrad() : PrimitiveC(kNameLayerNormGrad) {}
   explicit LayerNormGrad(const std::string k_name) : PrimitiveC(k_name) {}
diff --git a/mindspore/core/ops/grad/log_grad.h b/mindspore/core/ops/grad/log_grad.h
index 0c8223e5fb5..2c62cf814c4 100644
--- a/mindspore/core/ops/grad/log_grad.h
+++ b/mindspore/core/ops/grad/log_grad.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameLogGrad = "LogGrad";
-class LogGrad : public PrimitiveC {
+class MS_CORE_API LogGrad : public PrimitiveC {
  public:
   LogGrad() : PrimitiveC(kNameLogGrad) {}
   ~LogGrad() = default;
diff --git a/mindspore/core/ops/grad/lstm_grad.h b/mindspore/core/ops/grad/lstm_grad.h
index f91323d2c2b..0c5402e4e16 100644
--- a/mindspore/core/ops/grad/lstm_grad.h
+++ b/mindspore/core/ops/grad/lstm_grad.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameLSTMGrad = "LSTMGrad";
-class LSTMGrad : public PrimitiveC {
+class MS_CORE_API LSTMGrad : public PrimitiveC {
  public:
   LSTMGrad() : PrimitiveC(kNameLSTMGrad) {}
   ~LSTMGrad() = default;
diff --git a/mindspore/core/ops/grad/max_pool_grad.h b/mindspore/core/ops/grad/max_pool_grad.h
index 9ce85f601cb..3cefec3db0e 100644
--- a/mindspore/core/ops/grad/max_pool_grad.h
+++ b/mindspore/core/ops/grad/max_pool_grad.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameMaxPoolGrad = "MaxPoolGrad";
-class MaxPoolGrad : public PoolGrad {
+class MS_CORE_API MaxPoolGrad : public PoolGrad {
  public:
   MaxPoolGrad() : PoolGrad(kNameMaxPoolGrad) { InitIOName({"x_origin", "out_origin", "grad"}, {"output"}); }
   ~MaxPoolGrad() = default;
diff --git a/mindspore/core/ops/grad/maximum_grad.h b/mindspore/core/ops/grad/maximum_grad.h
index 149f925a7bb..944324e45dc 100644
--- a/mindspore/core/ops/grad/maximum_grad.h
+++ b/mindspore/core/ops/grad/maximum_grad.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameMaximumGrad = "MaximumGrad";
-class MaximumGrad : public PrimitiveC {
+class MS_CORE_API MaximumGrad : public PrimitiveC {
  public:
   MaximumGrad() : PrimitiveC(kNameMaximumGrad) {}
   ~MaximumGrad() = default;
diff --git a/mindspore/core/ops/grad/minimum_grad.h b/mindspore/core/ops/grad/minimum_grad.h
index c07b84ae10a..0be842c255b 100644
--- a/mindspore/core/ops/grad/minimum_grad.h
+++ b/mindspore/core/ops/grad/minimum_grad.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameMinimumGrad = "MinimumGrad";
-class MinimumGrad : public PrimitiveC {
+class MS_CORE_API MinimumGrad : public PrimitiveC {
  public:
   MinimumGrad() : PrimitiveC(kNameMinimumGrad) {}
   ~MinimumGrad() = default;
diff --git a/mindspore/core/ops/grad/mul_grad.h b/mindspore/core/ops/grad/mul_grad.h
index 879492e1de9..92e41d76b8d 100644
--- a/mindspore/core/ops/grad/mul_grad.h
+++ b/mindspore/core/ops/grad/mul_grad.h
@@ -24,7 +24,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameMulGrad = "MulGrad";
-class MulGrad : public PrimitiveC {
+class MS_CORE_API MulGrad : public PrimitiveC {
  public:
   MulGrad() : PrimitiveC(kNameMulGrad) {}
   ~MulGrad() = default;
diff --git a/mindspore/core/ops/grad/neg_grad.h b/mindspore/core/ops/grad/neg_grad.h
index a46a70f5fb4..f2d8c656a97 100644
--- a/mindspore/core/ops/grad/neg_grad.h
+++ b/mindspore/core/ops/grad/neg_grad.h
@@ -31,7 +31,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameNegGrad = "NegGrad";
-class NegGrad : public PrimitiveC {
+class MS_CORE_API NegGrad : public PrimitiveC {
  public:
   NegGrad() : PrimitiveC(kNameNegGrad) {}
   ~NegGrad() = default;
diff --git a/mindspore/core/ops/grad/pool_grad.h b/mindspore/core/ops/grad/pool_grad.h
index 40bf1fec2fb..3ceb81927de 100644
--- a/mindspore/core/ops/grad/pool_grad.h
+++ b/mindspore/core/ops/grad/pool_grad.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNamePoolGrad = "PoolGrad";
-class PoolGrad : public PrimitiveC {
+class MS_CORE_API PoolGrad : public PrimitiveC {
  public:
   PoolGrad() : PrimitiveC(kNamePoolGrad) { InitIOName({"x_origin", "out_origin", "grad"}, {"output"}); }
   explicit PoolGrad(const std::string k_name) : PrimitiveC(k_name) {
diff --git a/mindspore/core/ops/grad/pooling_grad.h b/mindspore/core/ops/grad/pooling_grad.h
index f6e542950e2..b54feee1e27 100644
--- a/mindspore/core/ops/grad/pooling_grad.h
+++ b/mindspore/core/ops/grad/pooling_grad.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNamePoolingGrad = "PoolingGrad";
-class PoolingGrad : public PrimitiveC {
+class MS_CORE_API PoolingGrad : public PrimitiveC {
  public:
   PoolingGrad() : PrimitiveC(kNamePoolingGrad) {}
   ~PoolingGrad() = default;
diff --git a/mindspore/core/ops/grad/power_grad.h b/mindspore/core/ops/grad/power_grad.h
index 719a3d9aecf..8581203786e 100644
--- a/mindspore/core/ops/grad/power_grad.h
+++ b/mindspore/core/ops/grad/power_grad.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNamePowerGrad = "PowerGrad";
-class PowerGrad : public PrimitiveC {
+class MS_CORE_API PowerGrad : public PrimitiveC {
  public:
   PowerGrad() : PrimitiveC(kNamePowerGrad) {}
   ~PowerGrad() = default;
diff --git a/mindspore/core/ops/grad/relu_grad.h b/mindspore/core/ops/grad/relu_grad.h
index e8195c47880..24c53be6641 100644
--- a/mindspore/core/ops/grad/relu_grad.h
+++ b/mindspore/core/ops/grad/relu_grad.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameReLUGrad = prim::kReLUGrad;
-class ReLUGrad : public PrimitiveC {
+class MS_CORE_API ReLUGrad : public PrimitiveC {
  public:
   ReLUGrad() : PrimitiveC(prim::kPrimReluGrad->name()) { InitIOName({"x"}, {"output"}); }
   ~ReLUGrad() = default;
diff --git a/mindspore/core/ops/grad/relu_grad_v2.h b/mindspore/core/ops/grad/relu_grad_v2.h
index 9117cdbd5f0..3ab9f6f9050 100644
--- a/mindspore/core/ops/grad/relu_grad_v2.h
+++ b/mindspore/core/ops/grad/relu_grad_v2.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameReLUGradV2 = prim::kReLUGradV2;
-class ReLUGradV2 : public PrimitiveC {
+class MS_CORE_API ReLUGradV2 : public PrimitiveC {
  public:
   ReLUGradV2() : PrimitiveC(prim::kPrimReluGradV2->name()) { InitIOName({"x"}, {"output"}); }
   ~ReLUGradV2() = default;
diff --git a/mindspore/core/ops/grad/resize_grad.h b/mindspore/core/ops/grad/resize_grad.h
index da41b61f0da..f3b8e536f53 100644
--- a/mindspore/core/ops/grad/resize_grad.h
+++ b/mindspore/core/ops/grad/resize_grad.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameResizeGrad = "ResizeGrad";
-class ResizeGrad : public PrimitiveC {
+class MS_CORE_API ResizeGrad : public PrimitiveC {
  public:
   ResizeGrad() : PrimitiveC(kNameResizeGrad) {}
   ~ResizeGrad() = default;
diff --git a/mindspore/core/ops/grad/rsqrt_grad.h b/mindspore/core/ops/grad/rsqrt_grad.h
index df6f9795fb0..718ab0f51e4 100644
--- a/mindspore/core/ops/grad/rsqrt_grad.h
+++ b/mindspore/core/ops/grad/rsqrt_grad.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameRsqrtGrad = "RsqrtGrad";
-class RsqrtGrad : public PrimitiveC {
+class MS_CORE_API RsqrtGrad : public PrimitiveC {
  public:
   RsqrtGrad() : PrimitiveC(kNameRsqrtGrad) { InitIOName({"out_backprop", "input"}, {"output"}); }
   ~RsqrtGrad() = default;
diff --git a/mindspore/core/ops/grad/sigmoid_cross_entropy_with_logits_grad.h b/mindspore/core/ops/grad/sigmoid_cross_entropy_with_logits_grad.h
index d25440ab072..36cc693e3ca 100644
--- a/mindspore/core/ops/grad/sigmoid_cross_entropy_with_logits_grad.h
+++ b/mindspore/core/ops/grad/sigmoid_cross_entropy_with_logits_grad.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSigmoidCrossEntropyWithLogitsGrad = "SigmoidCrossEntropyWithLogitsGrad";
-class SigmoidCrossEntropyWithLogitsGrad : public PrimitiveC {
+class MS_CORE_API SigmoidCrossEntropyWithLogitsGrad : public PrimitiveC {
  public:
   SigmoidCrossEntropyWithLogitsGrad() : PrimitiveC(kNameSigmoidCrossEntropyWithLogitsGrad) {
     InitIOName({"x", "y", "dout"}, {"x_grad"});
diff --git a/mindspore/core/ops/grad/smooth_l1_loss_grad.h b/mindspore/core/ops/grad/smooth_l1_loss_grad.h
index 4f87218a0ca..50b907134ef 100644
--- a/mindspore/core/ops/grad/smooth_l1_loss_grad.h
+++ b/mindspore/core/ops/grad/smooth_l1_loss_grad.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSmoothL1LossGrad = "SmoothL1LossGrad";
-class SmoothL1LossGrad : public PrimitiveC {
+class MS_CORE_API SmoothL1LossGrad : public PrimitiveC {
  public:
   SmoothL1LossGrad() : PrimitiveC(kNameSmoothL1LossGrad) {}
   ~SmoothL1LossGrad() = default;
diff --git a/mindspore/core/ops/grad/soft_margin_loss_grad.h b/mindspore/core/ops/grad/soft_margin_loss_grad.h
index 152ff646fe6..e5a47350ab1 100644
--- a/mindspore/core/ops/grad/soft_margin_loss_grad.h
+++ b/mindspore/core/ops/grad/soft_margin_loss_grad.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSoftMarginLossGrad = "SoftMarginLossGrad";
-class SoftMarginLossGrad : public PrimitiveC {
+class MS_CORE_API SoftMarginLossGrad : public PrimitiveC {
  public:
   SoftMarginLossGrad() : PrimitiveC(kNameSoftMarginLossGrad) { InitIOName({"predict", "label", "dout"}, {"gradient"}); }
   ~SoftMarginLossGrad() = default;
diff --git a/mindspore/core/ops/grad/soft_shrink_grad.h b/mindspore/core/ops/grad/soft_shrink_grad.h
index 248e6983162..26297273439 100644
--- a/mindspore/core/ops/grad/soft_shrink_grad.h
+++ b/mindspore/core/ops/grad/soft_shrink_grad.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSoftShrinkGrad = "SoftShrinkGrad";
-class SoftShrinkGrad : public PrimitiveC {
+class MS_CORE_API SoftShrinkGrad : public PrimitiveC {
  public:
   SoftShrinkGrad() : PrimitiveC(kNameSoftShrinkGrad) { InitIOName({"input_grad", "input_x"}, {"output"}); }
   ~SoftShrinkGrad() = default;
diff --git a/mindspore/core/ops/grad/sqrt_grad.h b/mindspore/core/ops/grad/sqrt_grad.h
index 4ff484fc8b8..443a5121d01 100644
--- a/mindspore/core/ops/grad/sqrt_grad.h
+++ b/mindspore/core/ops/grad/sqrt_grad.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSqrtGrad = "SqrtGrad";
-class SqrtGrad : public PrimitiveC {
+class MS_CORE_API SqrtGrad : public PrimitiveC {
  public:
   SqrtGrad() : PrimitiveC(kNameSqrtGrad) { InitIOName({"out_backprop", "input"}, {"output"}); }
   ~SqrtGrad() = default;
diff --git a/mindspore/core/ops/grad/strided_slice_grad.h b/mindspore/core/ops/grad/strided_slice_grad.h
index 0cbedd43f8e..521696cee55 100644
--- a/mindspore/core/ops/grad/strided_slice_grad.h
+++ b/mindspore/core/ops/grad/strided_slice_grad.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameStridedSliceGrad = "StridedSliceGrad";
-class StridedSliceGrad : public PrimitiveC {
+class MS_CORE_API StridedSliceGrad : public PrimitiveC {
  public:
   StridedSliceGrad() : PrimitiveC(kNameStridedSliceGrad) {}
   ~StridedSliceGrad() = default;
diff --git a/mindspore/core/ops/grad/sub_grad.h b/mindspore/core/ops/grad/sub_grad.h
index c7b0f93f0d5..282e12aac2b 100644
--- a/mindspore/core/ops/grad/sub_grad.h
+++ b/mindspore/core/ops/grad/sub_grad.h
@@ -24,7 +24,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSubGrad = "SubGrad";
-class SubGrad : public PrimitiveC {
+class MS_CORE_API SubGrad : public PrimitiveC {
  public:
   SubGrad() : PrimitiveC(kNameSubGrad) {}
   ~SubGrad() = default;
diff --git a/mindspore/core/ops/greater.h b/mindspore/core/ops/greater.h
index 06751f94012..cff59d18dfe 100644
--- a/mindspore/core/ops/greater.h
+++ b/mindspore/core/ops/greater.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameGreater = "Greater";
-class Greater : public PrimitiveC {
+class MS_CORE_API Greater : public PrimitiveC {
  public:
   Greater() : PrimitiveC(kNameGreater) { InitIOName({"x", "y"}, {"output"}); }
   ~Greater() = default;
diff --git a/mindspore/core/ops/greater_equal.h b/mindspore/core/ops/greater_equal.h
index d8151d3983a..30adfe4276b 100644
--- a/mindspore/core/ops/greater_equal.h
+++ b/mindspore/core/ops/greater_equal.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameGreaterEqual = "GreaterEqual";
-class GreaterEqual : public PrimitiveC {
+class MS_CORE_API GreaterEqual : public PrimitiveC {
  public:
   GreaterEqual() : PrimitiveC(kNameGreaterEqual) {}
   ~GreaterEqual() = default;
diff --git a/mindspore/core/ops/gru.h b/mindspore/core/ops/gru.h
index 5fcf49fc8a6..2e953358fa5 100644
--- a/mindspore/core/ops/gru.h
+++ b/mindspore/core/ops/gru.h
@@ -31,7 +31,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameGRU = "GRU";
-class GRU : public PrimitiveC {
+class MS_CORE_API GRU : public PrimitiveC {
  public:
   GRU() : PrimitiveC(kNameGRU) {
     InitIOName({"x", "weight_input", "weight_hidden", "bias_input", "bias_hidden", "seq_length", "init_h"},
diff --git a/mindspore/core/ops/hashtable_lookup.h b/mindspore/core/ops/hashtable_lookup.h
index 6ab20abb442..ae017be8c8b 100644
--- a/mindspore/core/ops/hashtable_lookup.h
+++ b/mindspore/core/ops/hashtable_lookup.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameHashtableLookup = "HashtableLookup";
-class HashtableLookup : public PrimitiveC {
+class MS_CORE_API HashtableLookup : public PrimitiveC {
  public:
   HashtableLookup() : PrimitiveC(kNameHashtableLookup) {}
   ~HashtableLookup() = default;
diff --git a/mindspore/core/ops/hshrink.h b/mindspore/core/ops/hshrink.h
index 582e8847dea..5bff01a8319 100644
--- a/mindspore/core/ops/hshrink.h
+++ b/mindspore/core/ops/hshrink.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameHShrink = "HShrink";
-class HShrink : public PrimitiveC {
+class MS_CORE_API HShrink : public PrimitiveC {
  public:
   HShrink() : PrimitiveC(kNameHShrink) { InitIOName({"input_x"}, {"output"}); }
   ~HShrink() = default;
diff --git a/mindspore/core/ops/hsigmoid.h b/mindspore/core/ops/hsigmoid.h
index e6615c6c934..3ce1312c9fd 100644
--- a/mindspore/core/ops/hsigmoid.h
+++ b/mindspore/core/ops/hsigmoid.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameHSigmoid = "HSigmoid";
-class HSigmoid : public PrimitiveC {
+class MS_CORE_API HSigmoid : public PrimitiveC {
  public:
   HSigmoid() : PrimitiveC(kNameHSigmoid) { InitIOName({"input_x"}, {"output"}); }
   ~HSigmoid() = default;
diff --git a/mindspore/core/ops/identity.h b/mindspore/core/ops/identity.h
index 164b9d805b5..a973cad1eed 100644
--- a/mindspore/core/ops/identity.h
+++ b/mindspore/core/ops/identity.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameIdentity = "Identity";
-class Identity : public PrimitiveC {
+class MS_CORE_API Identity : public PrimitiveC {
  public:
   Identity() : PrimitiveC(kNameIdentity) {}
   ~Identity() = default;
diff --git a/mindspore/core/ops/instance_norm.h b/mindspore/core/ops/instance_norm.h
index d1cd8cc63b8..7382826f2f0 100644
--- a/mindspore/core/ops/instance_norm.h
+++ b/mindspore/core/ops/instance_norm.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameInstanceNorm = "InstanceNorm";
-class InstanceNorm : public PrimitiveC {
+class MS_CORE_API InstanceNorm : public PrimitiveC {
  public:
   InstanceNorm() : PrimitiveC(kNameInstanceNorm) {}
   ~InstanceNorm() = default;
diff --git a/mindspore/core/ops/invert_permutation.h b/mindspore/core/ops/invert_permutation.h
index 6507e9f0ace..6b133ea0c31 100644
--- a/mindspore/core/ops/invert_permutation.h
+++ b/mindspore/core/ops/invert_permutation.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameInvertPermutation = "InvertPermutation";
-class InvertPermutation : public PrimitiveC {
+class MS_CORE_API InvertPermutation : public PrimitiveC {
  public:
   InvertPermutation() : PrimitiveC(kNameInvertPermutation) {}
   ~InvertPermutation() = default;
diff --git a/mindspore/core/ops/is_finite.h b/mindspore/core/ops/is_finite.h
index 8f73b7c24a3..eb32ce0b099 100644
--- a/mindspore/core/ops/is_finite.h
+++ b/mindspore/core/ops/is_finite.h
@@ -24,7 +24,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameIsFinite = "IsFinite";
-class IsFinite : public PrimitiveC {
+class MS_CORE_API IsFinite : public PrimitiveC {
  public:
   IsFinite() : PrimitiveC(kNameIsFinite) {}
   ~IsFinite() = default;
diff --git a/mindspore/core/ops/l2_normalize.h b/mindspore/core/ops/l2_normalize.h
index 0f9cd207d99..3219ac01e90 100644
--- a/mindspore/core/ops/l2_normalize.h
+++ b/mindspore/core/ops/l2_normalize.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameL2Normalize = "L2Normalize";
-class L2Normalize : public PrimitiveC {
+class MS_CORE_API L2Normalize : public PrimitiveC {
  public:
   explicit L2Normalize(const std::string &name = kNameL2Normalize) : PrimitiveC(name) {}
   ~L2Normalize() = default;
diff --git a/mindspore/core/ops/layer_norm.h b/mindspore/core/ops/layer_norm.h
index 7ba475e5029..3852fe3407e 100644
--- a/mindspore/core/ops/layer_norm.h
+++ b/mindspore/core/ops/layer_norm.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameLayerNorm = prim::kLayerNorm;
-class LayerNorm : public PrimitiveC {
+class MS_CORE_API LayerNorm : public PrimitiveC {
  public:
   LayerNorm() : PrimitiveC(kNameLayerNorm) {}
   explicit LayerNorm(const std::string k_name) : PrimitiveC(k_name) {}
diff --git a/mindspore/core/ops/leaky_relu.h b/mindspore/core/ops/leaky_relu.h
index 09dc63915be..a907b8b3263 100644
--- a/mindspore/core/ops/leaky_relu.h
+++ b/mindspore/core/ops/leaky_relu.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameLeakyRelu = "LeakyRelu";
-class LeakyRelu : public PrimitiveC {
+class MS_CORE_API LeakyRelu : public PrimitiveC {
  public:
   LeakyRelu() : PrimitiveC(kNameLeakyRelu) {}
   ~LeakyRelu() = default;
diff --git a/mindspore/core/ops/less.h b/mindspore/core/ops/less.h
index c5dd51835df..db2dfb7a56c 100644
--- a/mindspore/core/ops/less.h
+++ b/mindspore/core/ops/less.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameLess = "Less";
-class Less : public PrimitiveC {
+class MS_CORE_API Less : public PrimitiveC {
  public:
   Less() : PrimitiveC(kNameLess) { InitIOName({"x", "y"}, {"output"}); }
   ~Less() = default;
diff --git a/mindspore/core/ops/less_equal.h b/mindspore/core/ops/less_equal.h
index 70f228b6c52..71d40bfb8d4 100644
--- a/mindspore/core/ops/less_equal.h
+++ b/mindspore/core/ops/less_equal.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameLessEqual = "LessEqual";
-class LessEqual : public PrimitiveC {
+class MS_CORE_API LessEqual : public PrimitiveC {
  public:
   LessEqual() : PrimitiveC(kNameLessEqual) { InitIOName({"x", "y"}, {"output"}); }
   ~LessEqual() = default;
diff --git a/mindspore/core/ops/lin_space.h b/mindspore/core/ops/lin_space.h
index 43b9bdf9364..fa042a2440e 100644
--- a/mindspore/core/ops/lin_space.h
+++ b/mindspore/core/ops/lin_space.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameLinSpace = "LinSpace";
-class LinSpace : public PrimitiveC {
+class MS_CORE_API LinSpace : public PrimitiveC {
  public:
   LinSpace() : PrimitiveC(kNameLinSpace) { InitIOName({"start", "stop", "num"}, {"output"}); }
   ~LinSpace() = default;
diff --git a/mindspore/core/ops/log.h b/mindspore/core/ops/log.h
index 43c1ea74154..19f955a10b3 100644
--- a/mindspore/core/ops/log.h
+++ b/mindspore/core/ops/log.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameLog = prim::kLog;
-class Log : public PrimitiveC {
+class MS_CORE_API Log : public PrimitiveC {
  public:
   Log() : PrimitiveC(prim::kPrimLog->name()) { InitIOName({"x"}, {"y"}); }
   ~Log() = default;
diff --git a/mindspore/core/ops/log1p.h b/mindspore/core/ops/log1p.h
index 58a8a0004e2..dfc7daa2374 100644
--- a/mindspore/core/ops/log1p.h
+++ b/mindspore/core/ops/log1p.h
@@ -27,7 +27,7 @@
 
 namespace mindspore {
 namespace ops {
-class Log1p : public PrimitiveC {
+class MS_CORE_API Log1p : public PrimitiveC {
  public:
   Log1p() : PrimitiveC(prim::kPrimLog1p->name()) { InitIOName({"x"}, {"y"}); }
   ~Log1p() = default;
diff --git a/mindspore/core/ops/log_softmax.h b/mindspore/core/ops/log_softmax.h
index 4815dbaf076..5b3238fca97 100644
--- a/mindspore/core/ops/log_softmax.h
+++ b/mindspore/core/ops/log_softmax.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameLogSoftmax = "LogSoftmax";
-class LogSoftmax : public PrimitiveC {
+class MS_CORE_API LogSoftmax : public PrimitiveC {
  public:
   LogSoftmax() : PrimitiveC(kNameLogSoftmax) { InitIOName({"x"}, {"output"}); }
   ~LogSoftmax() = default;
diff --git a/mindspore/core/ops/logical_and.h b/mindspore/core/ops/logical_and.h
index e05099afdb1..325bf9b4400 100644
--- a/mindspore/core/ops/logical_and.h
+++ b/mindspore/core/ops/logical_and.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameLogicalAnd = "LogicalAnd";
-class LogicalAnd : public PrimitiveC {
+class MS_CORE_API LogicalAnd : public PrimitiveC {
  public:
   LogicalAnd() : PrimitiveC(kNameLogicalAnd) { InitIOName({"x", "y"}, {"output"}); }
   ~LogicalAnd() = default;
diff --git a/mindspore/core/ops/logical_not.h b/mindspore/core/ops/logical_not.h
index 8fea01be8b6..9b70e49b789 100644
--- a/mindspore/core/ops/logical_not.h
+++ b/mindspore/core/ops/logical_not.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameLogicalNot = "LogicalNot";
-class LogicalNot : public PrimitiveC {
+class MS_CORE_API LogicalNot : public PrimitiveC {
  public:
   LogicalNot() : PrimitiveC(kNameLogicalNot) { InitIOName({"x"}, {"output"}); }
   ~LogicalNot() = default;
diff --git a/mindspore/core/ops/logical_or.h b/mindspore/core/ops/logical_or.h
index 0e3fc4b8c14..a687f4d7bbb 100644
--- a/mindspore/core/ops/logical_or.h
+++ b/mindspore/core/ops/logical_or.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameLogicalOr = "LogicalOr";
-class LogicalOr : public PrimitiveC {
+class MS_CORE_API LogicalOr : public PrimitiveC {
  public:
   LogicalOr() : PrimitiveC(kNameLogicalOr) { InitIOName({"x", "y"}, {"output"}); }
   ~LogicalOr() = default;
diff --git a/mindspore/core/ops/logical_xor.h b/mindspore/core/ops/logical_xor.h
index c765f7b3c60..020e314e585 100644
--- a/mindspore/core/ops/logical_xor.h
+++ b/mindspore/core/ops/logical_xor.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameLogicalXor = "LogicalXor";
-class LogicalXor : public PrimitiveC {
+class MS_CORE_API LogicalXor : public PrimitiveC {
  public:
   LogicalXor() : PrimitiveC(kNameLogicalXor) {}
   ~LogicalXor() = default;
diff --git a/mindspore/core/ops/lp_normalization.h b/mindspore/core/ops/lp_normalization.h
index f3851233de5..73e3c2a649c 100644
--- a/mindspore/core/ops/lp_normalization.h
+++ b/mindspore/core/ops/lp_normalization.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameLpNormalization = "LpNormalization";
-class LpNormalization : public PrimitiveC {
+class MS_CORE_API LpNormalization : public PrimitiveC {
  public:
   LpNormalization() : PrimitiveC(kNameLpNormalization) {}
   ~LpNormalization() = default;
diff --git a/mindspore/core/ops/lrn.h b/mindspore/core/ops/lrn.h
index 460ea584e0b..1ccbba332e0 100644
--- a/mindspore/core/ops/lrn.h
+++ b/mindspore/core/ops/lrn.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameLRN = "LRN";
-class LRN : public PrimitiveC {
+class MS_CORE_API LRN : public PrimitiveC {
  public:
   LRN() : PrimitiveC(kNameLRN) { InitIOName({"x"}, {"y"}); }
   ~LRN() = default;
diff --git a/mindspore/core/ops/lsh_projection.h b/mindspore/core/ops/lsh_projection.h
index a122f7001d1..55656d43c37 100644
--- a/mindspore/core/ops/lsh_projection.h
+++ b/mindspore/core/ops/lsh_projection.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameLshProjection = "LshProjection";
-class LshProjection : public PrimitiveC {
+class MS_CORE_API LshProjection : public PrimitiveC {
  public:
   LshProjection() : PrimitiveC(kNameLshProjection) {}
   ~LshProjection() = default;
diff --git a/mindspore/core/ops/lstm.h b/mindspore/core/ops/lstm.h
index 4d128e8896d..106a891aa5f 100644
--- a/mindspore/core/ops/lstm.h
+++ b/mindspore/core/ops/lstm.h
@@ -31,7 +31,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameLSTM = "LSTM";
-class LSTM : public PrimitiveC {
+class MS_CORE_API LSTM : public PrimitiveC {
  public:
   LSTM() : PrimitiveC(kNameLSTM) {}
   ~LSTM() = default;
diff --git a/mindspore/core/ops/mat_mul.h b/mindspore/core/ops/mat_mul.h
index d1c8a04fd2c..0371c7ce22f 100644
--- a/mindspore/core/ops/mat_mul.h
+++ b/mindspore/core/ops/mat_mul.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameMatMul = "MatMul";
-class MatMul : public PrimitiveC {
+class MS_CORE_API MatMul : public PrimitiveC {
  public:
   MatMul() : PrimitiveC(kNameMatMul) { InitIOName({"x1", "x2"}, {"output"}); }
   ~MatMul() = default;
diff --git a/mindspore/core/ops/max_pool.h b/mindspore/core/ops/max_pool.h
index c0c6e93e243..3355d03602c 100644
--- a/mindspore/core/ops/max_pool.h
+++ b/mindspore/core/ops/max_pool.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameMaxPool = "MaxPool";
-class MaxPool : public PrimitiveC {
+class MS_CORE_API MaxPool : public PrimitiveC {
  public:
   MaxPool() : PrimitiveC(kNameMaxPool) { InitIOName({"x"}, {"output"}); }
   explicit MaxPool(const std::string k_name) : PrimitiveC(k_name) { InitIOName({"x"}, {"output"}); }
diff --git a/mindspore/core/ops/maximum.h b/mindspore/core/ops/maximum.h
index 3550d80b901..0b026f0722d 100644
--- a/mindspore/core/ops/maximum.h
+++ b/mindspore/core/ops/maximum.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameMaximum = "Maximum";
-class Maximum : public PrimitiveC {
+class MS_CORE_API Maximum : public PrimitiveC {
  public:
   Maximum() : PrimitiveC(kNameMaximum) { InitIOName({"x", "y"}, {"output"}); }
   ~Maximum() = default;
diff --git a/mindspore/core/ops/merge.h b/mindspore/core/ops/merge.h
index 7268e36fa45..d0f0264644e 100644
--- a/mindspore/core/ops/merge.h
+++ b/mindspore/core/ops/merge.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameMerge = "Merge";
-class Merge : public PrimitiveC {
+class MS_CORE_API Merge : public PrimitiveC {
  public:
   Merge() : PrimitiveC(kNameMerge) {}
   ~Merge() = default;
diff --git a/mindspore/core/ops/mfcc.h b/mindspore/core/ops/mfcc.h
index 0975fd61f69..0791ab24783 100644
--- a/mindspore/core/ops/mfcc.h
+++ b/mindspore/core/ops/mfcc.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameMfcc = "Mfcc";
-class Mfcc : public PrimitiveC {
+class MS_CORE_API Mfcc : public PrimitiveC {
  public:
   Mfcc() : PrimitiveC(kNameMfcc) {}
   ~Mfcc() = default;
diff --git a/mindspore/core/ops/minimum.h b/mindspore/core/ops/minimum.h
index 4dccb391bb8..29446abceaa 100644
--- a/mindspore/core/ops/minimum.h
+++ b/mindspore/core/ops/minimum.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameMinimum = "Minimum";
-class Minimum : public PrimitiveC {
+class MS_CORE_API Minimum : public PrimitiveC {
  public:
   Minimum() : PrimitiveC(kNameMinimum) { InitIOName({"x", "y"}, {"output"}); }
   ~Minimum() = default;
diff --git a/mindspore/core/ops/mod.h b/mindspore/core/ops/mod.h
index 1b2af255134..921ac34cab6 100644
--- a/mindspore/core/ops/mod.h
+++ b/mindspore/core/ops/mod.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameMod = "Mod";
-class Mod : public PrimitiveC {
+class MS_CORE_API Mod : public PrimitiveC {
  public:
   Mod() : PrimitiveC(kNameMod) { InitIOName({"x", "y"}, {"output"}); }
   ~Mod() = default;
diff --git a/mindspore/core/ops/mul.h b/mindspore/core/ops/mul.h
index d12e72e0d0f..7067a0ac955 100644
--- a/mindspore/core/ops/mul.h
+++ b/mindspore/core/ops/mul.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameMul = prim::kMul;
-class Mul : public PrimitiveC {
+class MS_CORE_API Mul : public PrimitiveC {
  public:
   Mul() : PrimitiveC(kNameMul) { InitIOName({"x", "y"}, {"output"}); }
   explicit Mul(const std::string k_name) : PrimitiveC(k_name) { InitIOName({"x", "y"}, {"output"}); }
diff --git a/mindspore/core/ops/neg.h b/mindspore/core/ops/neg.h
index f9a8c8c344f..8b7a009828c 100644
--- a/mindspore/core/ops/neg.h
+++ b/mindspore/core/ops/neg.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameNeg = prim::kNeg;
-class Neg : public PrimitiveC {
+class MS_CORE_API Neg : public PrimitiveC {
  public:
   Neg() : PrimitiveC(prim::kPrimNeg->name()) { InitIOName({"x"}, {"y"}); }
   ~Neg() = default;
diff --git a/mindspore/core/ops/neighborexchange.cc b/mindspore/core/ops/neighborexchange.cc
index b4c47454bc5..23e17ab382a 100644
--- a/mindspore/core/ops/neighborexchange.cc
+++ b/mindspore/core/ops/neighborexchange.cc
@@ -15,18 +15,125 @@
  */
 
 #include "ops/neighborexchange.h"
-#include "ops/op_utils.h"
+#include <string>
 #include "utils/check_convert_utils.h"
 #include "abstract/primitive_infer_map.h"
 
 namespace mindspore {
 namespace ops {
-abstract::TupleShapePtr InferShape(const PrimitivePtr &primitive, const std::vector<AbstractBasePtr> &input_args) {
+namespace {
+constexpr auto kRecvShapes = "recv_shapes";
+constexpr auto kRecvRankIds = "recv_rank_ids";
+constexpr auto kRecvType = "recv_type";
+constexpr auto kSendShapes = "send_shapes";
+constexpr auto kSendRankIds = "send_rank_ids";
+constexpr auto kGroup = "group";
+
+inline std::string GetShapeStr(const std::vector<int64_t> &shape) {
+  std::string shape_str = "[";
+  for (size_t i = 0; i < shape.size(); ++i) {
+    if (i == 0) {
+      shape_str += std::to_string(shape[i]);
+    } else {
+      shape_str += "," + std::to_string(shape[i]);
+    }
+  }
+  return shape_str + "]";
+}
+
+void CheckAttr(const PrimitivePtr &primitive, const std::string &shape_attr_name,
+               const std::string &rank_ids_attr_name) {
+  MS_EXCEPTION_IF_NULL(primitive);
+  // size of send/recv_rank_ids equal to size of send/recv_shapes
+  ValuePtrList attr_shapes;
+  try {
+    auto attr = primitive->GetAttr(shape_attr_name);
+    attr_shapes = GetValue<ValuePtrList>(attr);
+  } catch (const std::exception &) {
+    MS_EXCEPTION(TypeError) << "Attr " << shape_attr_name << " should be a tuple(list, list, ...).";
+  }
+  if (!attr_shapes.empty()) {
+    auto ele = attr_shapes[0]->cast<ValueSequeuePtr>();
+    if (ele == nullptr) {
+      MS_EXCEPTION(TypeError) << "Attr " << shape_attr_name << " must be a tuple.";
+    }
+  }
+  std::vector<int64_t> attr_rank_ids;
+  try {
+    auto attr = primitive->GetAttr(rank_ids_attr_name);
+    attr_rank_ids = GetValue<std::vector<int64_t>>(attr);
+  } catch (const std::exception &) {
+    MS_EXCEPTION(TypeError) << "Attr " << rank_ids_attr_name << " should be a list[int, int, ...].";
+  }
+  if (attr_shapes.size() != attr_rank_ids.size()) {
+    MS_EXCEPTION(ValueError) << "Invalid " << primitive->name() << " attr " << shape_attr_name << " size "
+                             << attr_shapes.size() << " must be equal to attr " << rank_ids_attr_name << " size "
+                             << attr_rank_ids.size();
+  }
+}
+
+void Check(const PrimitivePtr &primitive, const std::vector<AbstractBasePtr> &input_args) {
   MS_EXCEPTION_IF_NULL(primitive);
   auto prim_name = primitive->name();
+  CheckAttr(primitive, kRecvShapes, kRecvRankIds);
+  CheckAttr(primitive, kSendShapes, kSendRankIds);
+  // check recv type
+  auto recv_type_attr = primitive->GetAttr(kRecvType);
+  MS_EXCEPTION_IF_NULL(recv_type_attr);
+  if (!recv_type_attr->isa<Type>()) {
+    MS_EXCEPTION(TypeError) << "Attr " << kRecvType << " should be a mindspore data type.";
+  }
+  // check group
+  auto group_attr = primitive->GetAttr(kGroup);
+  try {
+    MS_EXCEPTION_IF_NULL(group_attr);
+    (void)GetValue<std::string>(group_attr);
+  } catch (const std::exception &) {
+    MS_EXCEPTION(TypeError) << "Attr " << kGroup << " should be a str.";
+  }
+  // check empty input
+  auto send_rank_ids = GetValue<std::vector<int64_t>>(primitive->GetAttr(kSendRankIds));
+  if (send_rank_ids.empty()) {
+    (void)CheckAndConvertUtils::CheckInteger("input_numbers", input_args.size(), kEqual, 0, prim_name);
+    return;
+  }
+  // check input shape & attr send shape
   (void)CheckAndConvertUtils::CheckInteger("input_numbers", input_args.size(), kEqual, 1, prim_name);
   CheckAndConvertUtils::CheckArgs<abstract::AbstractTuple>(prim_name, input_args, 0);
-  auto recv_shapes = primitive->GetAttr(RecvShapes);
+  auto abstract_tuple = input_args[0]->cast<abstract::AbstractTuplePtr>();
+  MS_EXCEPTION_IF_NULL(abstract_tuple);
+  auto abstract_element = abstract_tuple->elements();
+  auto send_shapes = GetValue<ValuePtrList>(primitive->GetAttr(kSendShapes));
+  if (abstract_element.size() != send_shapes.size()) {
+    MS_EXCEPTION(ArgumentError) << "Input tuple size " << abstract_element.size() << " must be equal to attr "
+                                << kSendShapes << " size " << send_shapes.size();
+  }
+  for (size_t i = 0; i < abstract_element.size(); ++i) {
+    // get attr shape
+    MS_EXCEPTION_IF_NULL(send_shapes[i]);
+    auto send_shape_value = send_shapes[i]->cast<ValueSequeuePtr>();
+    MS_EXCEPTION_IF_NULL(send_shape_value);
+    std::vector<int64_t> send_shape = GetValue<std::vector<int64_t>>(send_shape_value);
+    // get input tensor shape
+    MS_EXCEPTION_IF_NULL(abstract_element[i]);
+    auto arg_base_shape = abstract_element[i]->BuildShape();
+    MS_EXCEPTION_IF_NULL(arg_base_shape);
+    auto shape = arg_base_shape->cast<abstract::ShapePtr>();
+    if (shape == nullptr) {
+      MS_EXCEPTION(ArgumentError) << "Input " << i << " should be a tensor.";
+    }
+    // comp two shape
+    auto shape_vec = shape->shape();
+    if (shape_vec != send_shape) {
+      MS_EXCEPTION(ArgumentError) << "Input " << i << " shape: " << GetShapeStr(shape_vec)
+                                  << " but attr shape : " << GetShapeStr(send_shape);
+    }
+  }
+}
+
+abstract::TupleShapePtr InferShape(const PrimitivePtr &primitive, const std::vector<AbstractBasePtr> &input_args) {
+  MS_EXCEPTION_IF_NULL(primitive);
+  auto recv_shapes = primitive->GetAttr(kRecvShapes);
   MS_EXCEPTION_IF_NULL(recv_shapes);
   auto shapes_seq = recv_shapes->cast<ValueSequeuePtr>();
   MS_EXCEPTION_IF_NULL(shapes_seq);
@@ -49,25 +156,25 @@ TypePtr InferType(const PrimitivePtr &primitive, const std::vector<AbstractBaseP
   (void)CheckAndConvertUtils::CheckInteger("NeighborExchange infer", SizeToLong(input_args.size()), kEqual, 1,
                                            prim_name);
   MS_EXCEPTION_IF_NULL(input_args[0]);
-  auto recv_shapes = primitive->GetAttr(RecvShapes);
+  auto recv_shapes = primitive->GetAttr(kRecvShapes);
   MS_EXCEPTION_IF_NULL(recv_shapes);
   auto shapes_seq = recv_shapes->cast<ValueSequeuePtr>();
   MS_EXCEPTION_IF_NULL(shapes_seq);
   auto shapes_value = shapes_seq->value();
   auto out_num = shapes_value.size();
-  auto recv_type = primitive->GetAttr(RecvType)->cast<TypePtr>();
+  auto recv_type = primitive->GetAttr(kRecvType)->cast<TypePtr>();
   MS_EXCEPTION_IF_NULL(recv_type);
   std::vector<TypePtr> type_vec(out_num, recv_type);
   return std::make_shared<Tuple>(type_vec);
 }
-
+}  // namespace
 AbstractBasePtr NeighborExchangeInfer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive,
                                       const std::vector<AbstractBasePtr> &input_args) {
+  Check(primitive, input_args);
   auto type = InferType(primitive, input_args);
   auto shape = InferShape(primitive, input_args);
   return abstract::MakeAbstract(shape, type);
 }
-
 REGISTER_PRIMITIVE_EVAL_IMPL(NeighborExchange, prim::kPrimNeighborExchange, NeighborExchangeInfer, nullptr, true);
 }  // namespace ops
 }  // namespace mindspore
diff --git a/mindspore/core/ops/neighborexchange.h b/mindspore/core/ops/neighborexchange.h
index 58f1e53da42..d3ff559f2ca 100644
--- a/mindspore/core/ops/neighborexchange.h
+++ b/mindspore/core/ops/neighborexchange.h
@@ -25,9 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameNeighborExchange = "NeighborExchange";
-constexpr auto RecvShapes = "recv_shapes";
-constexpr auto RecvType = "recv_type";
-class NeighborExchange : public PrimitiveC {
+class MS_CORE_API NeighborExchange : public PrimitiveC {
  public:
   NeighborExchange() : PrimitiveC(kNameNeighborExchange) {}
   ~NeighborExchange() = default;
diff --git a/mindspore/core/ops/non_max_suppression.h b/mindspore/core/ops/non_max_suppression.h
index fac7d7261d7..b8ef1953d53 100644
--- a/mindspore/core/ops/non_max_suppression.h
+++ b/mindspore/core/ops/non_max_suppression.h
@@ -31,7 +31,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameNonMaxSuppression = "NonMaxSuppression";
-class NonMaxSuppression : public PrimitiveC {
+class MS_CORE_API NonMaxSuppression : public PrimitiveC {
  public:
   NonMaxSuppression() : PrimitiveC(kNameNonMaxSuppression) {}
   ~NonMaxSuppression() = default;
diff --git a/mindspore/core/ops/non_zero.h b/mindspore/core/ops/non_zero.h
index 0900d7b6526..a0cd982e1eb 100644
--- a/mindspore/core/ops/non_zero.h
+++ b/mindspore/core/ops/non_zero.h
@@ -24,7 +24,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameNonZero = "NonZero";
-class NonZero : public PrimitiveC {
+class MS_CORE_API NonZero : public PrimitiveC {
  public:
   NonZero() : PrimitiveC(kNameNonZero) {}
   ~NonZero() = default;
diff --git a/mindspore/core/ops/not_equal.h b/mindspore/core/ops/not_equal.h
index 852dc3ecc34..89c28a51a42 100644
--- a/mindspore/core/ops/not_equal.h
+++ b/mindspore/core/ops/not_equal.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameNotEqual = prim::kNotEqual;
-class NotEqual : public PrimitiveC {
+class MS_CORE_API NotEqual : public PrimitiveC {
  public:
   NotEqual() : PrimitiveC(prim::kPrimNotEqual->name()) { InitIOName({"x", "y"}, {"output"}); }
   ~NotEqual() = default;
diff --git a/mindspore/core/ops/one_hot.h b/mindspore/core/ops/one_hot.h
index b953224bb62..f33b2013d50 100644
--- a/mindspore/core/ops/one_hot.h
+++ b/mindspore/core/ops/one_hot.h
@@ -25,7 +25,7 @@
 
 namespace mindspore {
 namespace ops {
-class OneHot : public PrimitiveC {
+class MS_CORE_API OneHot : public PrimitiveC {
  public:
   OneHot() : PrimitiveC(prim::kPrimOneHot->name()) {
     InitIOName({"indices", "depth", "on_value", "off_value"}, {"output"});
diff --git a/mindspore/core/ops/ones_like.h b/mindspore/core/ops/ones_like.h
index cff0b8650a8..989f9320f63 100644
--- a/mindspore/core/ops/ones_like.h
+++ b/mindspore/core/ops/ones_like.h
@@ -24,7 +24,7 @@
 
 namespace mindspore {
 namespace ops {
-class OnesLike : public PrimitiveC {
+class MS_CORE_API OnesLike : public PrimitiveC {
  public:
   OnesLike() : PrimitiveC(prim::kPrimOnesLike->name()) {}
   ~OnesLike() = default;
diff --git a/mindspore/core/ops/pack.h b/mindspore/core/ops/pack.h
index 732311616e4..3ce7c8d38a7 100644
--- a/mindspore/core/ops/pack.h
+++ b/mindspore/core/ops/pack.h
@@ -31,7 +31,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNamePack = "Pack";
-class Pack : public PrimitiveC {
+class MS_CORE_API Pack : public PrimitiveC {
  public:
   Pack() : PrimitiveC(kNamePack) {}
   ~Pack() = default;
diff --git a/mindspore/core/ops/pad.h b/mindspore/core/ops/pad.h
index 5337371fa40..d294bdbd385 100644
--- a/mindspore/core/ops/pad.h
+++ b/mindspore/core/ops/pad.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNamePad = "Pad";
-class Pad : public PrimitiveC {
+class MS_CORE_API Pad : public PrimitiveC {
  public:
   Pad() : PrimitiveC(kNamePad) { InitIOName({"x"}, {"y"}); }
   explicit Pad(const std::string k_name) : PrimitiveC(k_name) { InitIOName({"x"}, {"y"}); }
diff --git a/mindspore/core/ops/partial.h b/mindspore/core/ops/partial.h
index 66d2da58a66..f0dd2856d2d 100644
--- a/mindspore/core/ops/partial.h
+++ b/mindspore/core/ops/partial.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNamePartial = "Partial";
-class Partial : public PrimitiveC {
+class MS_CORE_API Partial : public PrimitiveC {
  public:
   Partial() : PrimitiveC(kNamePartial) {}
   ~Partial() = default;
diff --git a/mindspore/core/ops/pow.h b/mindspore/core/ops/pow.h
index bea006585b9..088f43469e8 100644
--- a/mindspore/core/ops/pow.h
+++ b/mindspore/core/ops/pow.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNamePow = "Pow";
-class Pow : public PrimitiveC {
+class MS_CORE_API Pow : public PrimitiveC {
  public:
   explicit Pow(const std::string &k_name = kNamePow) : PrimitiveC(k_name) { InitIOName({"x", "y"}, {"output"}); }
   ~Pow() = default;
diff --git a/mindspore/core/ops/prelu.h b/mindspore/core/ops/prelu.h
index deae1b6034f..8b467318f8a 100644
--- a/mindspore/core/ops/prelu.h
+++ b/mindspore/core/ops/prelu.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNamePReLU = "PReLU";
-class PReLU : public PrimitiveC {
+class MS_CORE_API PReLU : public PrimitiveC {
  public:
   PReLU() : PrimitiveC(kNamePReLU) { InitIOName({"x"}, {"y"}); }
   explicit PReLU(const std::string k_name) : PrimitiveC(k_name) { InitIOName({"x"}, {"y"}); }
diff --git a/mindspore/core/ops/primitive_c.h b/mindspore/core/ops/primitive_c.h
index ad7ec8c1a65..0334aa25fcf 100644
--- a/mindspore/core/ops/primitive_c.h
+++ b/mindspore/core/ops/primitive_c.h
@@ -25,7 +25,7 @@
 #include "ir/value.h"
 namespace mindspore {
 namespace ops {
-class PrimitiveC : public Primitive {
+class MS_CORE_API PrimitiveC : public Primitive {
  public:
   explicit PrimitiveC(const std::string &name) : Primitive(name) {}
   MS_DECLARE_PARENT(PrimitiveC, Primitive);
@@ -37,7 +37,7 @@ class PrimitiveC : public Primitive {
 };
 
 using OpPrimCDefineFunc = std::function<std::shared_ptr<PrimitiveC>()>;
-class OpPrimCRegister {
+class MS_CORE_API OpPrimCRegister {
  public:
   ~OpPrimCRegister() {}
   static OpPrimCRegister &GetInstance();
@@ -49,7 +49,7 @@ class OpPrimCRegister {
   std::map<std::string, OpPrimCDefineFunc> op_primc_fns_;
 };
 
-class OpPrimCRegisterHelper {
+class MS_CORE_API OpPrimCRegisterHelper {
  public:
   OpPrimCRegisterHelper(const std::string &kname, const OpPrimCDefineFunc &fn) {
     OpPrimCRegister::GetInstance().SetPrimCMap(kname, fn);
diff --git a/mindspore/core/ops/prior_box.h b/mindspore/core/ops/prior_box.h
index a40b2671181..09971d3d65f 100644
--- a/mindspore/core/ops/prior_box.h
+++ b/mindspore/core/ops/prior_box.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNamePriorBox = "PriorBox";
-class PriorBox : public PrimitiveC {
+class MS_CORE_API PriorBox : public PrimitiveC {
  public:
   PriorBox() : PrimitiveC(kNamePriorBox) {}
   ~PriorBox() = default;
diff --git a/mindspore/core/ops/proposal.h b/mindspore/core/ops/proposal.h
index 462bd2fb673..e5ae7c2228f 100644
--- a/mindspore/core/ops/proposal.h
+++ b/mindspore/core/ops/proposal.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameProposal = "Proposal";
-class Proposal : public PrimitiveC {
+class MS_CORE_API Proposal : public PrimitiveC {
  public:
   Proposal() : PrimitiveC(kNameProposal) {}
   ~Proposal() = default;
diff --git a/mindspore/core/ops/quant_dtype_cast.h b/mindspore/core/ops/quant_dtype_cast.h
index 3b1631b27ac..b3afc7039c4 100644
--- a/mindspore/core/ops/quant_dtype_cast.h
+++ b/mindspore/core/ops/quant_dtype_cast.h
@@ -31,7 +31,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameQuantDTypeCast = "QuantDTypeCast";
-class QuantDTypeCast : public PrimitiveC {
+class MS_CORE_API QuantDTypeCast : public PrimitiveC {
  public:
   QuantDTypeCast() : PrimitiveC(kNameQuantDTypeCast) {}
   ~QuantDTypeCast() = default;
diff --git a/mindspore/core/ops/ragged_range.h b/mindspore/core/ops/ragged_range.h
index cfafa9ece8f..896d097a21b 100644
--- a/mindspore/core/ops/ragged_range.h
+++ b/mindspore/core/ops/ragged_range.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameRaggedRange = "RaggedRange";
-class RaggedRange : public PrimitiveC {
+class MS_CORE_API RaggedRange : public PrimitiveC {
  public:
   RaggedRange() : PrimitiveC(kNameRaggedRange) {}
   ~RaggedRange() = default;
diff --git a/mindspore/core/ops/random_standard_normal.h b/mindspore/core/ops/random_standard_normal.h
index 16e29a69e34..0880fa3b846 100644
--- a/mindspore/core/ops/random_standard_normal.h
+++ b/mindspore/core/ops/random_standard_normal.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameRandomStandardNormal = "RandomStandardNormal";
-class RandomStandardNormal : public PrimitiveC {
+class MS_CORE_API RandomStandardNormal : public PrimitiveC {
  public:
   RandomStandardNormal() : PrimitiveC(kNameRandomStandardNormal) {}
   ~RandomStandardNormal() = default;
diff --git a/mindspore/core/ops/range.h b/mindspore/core/ops/range.h
index aba04823a4b..92c8701df8a 100644
--- a/mindspore/core/ops/range.h
+++ b/mindspore/core/ops/range.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameRange = "Range";
-class Range : public PrimitiveC {
+class MS_CORE_API Range : public PrimitiveC {
  public:
   Range() : PrimitiveC(kNameRange) {}
   ~Range() = default;
diff --git a/mindspore/core/ops/rank.h b/mindspore/core/ops/rank.h
index 506e6d0172e..724860210ce 100644
--- a/mindspore/core/ops/rank.h
+++ b/mindspore/core/ops/rank.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameRank = "Rank";
-class Rank : public PrimitiveC {
+class MS_CORE_API Rank : public PrimitiveC {
  public:
   Rank() : PrimitiveC(kNameRank) { auto prim_name = name(); }
   ~Rank() = default;
diff --git a/mindspore/core/ops/real_div.h b/mindspore/core/ops/real_div.h
index dabd2c170aa..b0fa183b956 100644
--- a/mindspore/core/ops/real_div.h
+++ b/mindspore/core/ops/real_div.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameRealDiv = prim::kRealDiv;
-class RealDiv : public PrimitiveC {
+class MS_CORE_API RealDiv : public PrimitiveC {
  public:
   RealDiv() : PrimitiveC(kNameRealDiv) { InitIOName({"x", "y"}, {"output"}); }
   ~RealDiv() = default;
diff --git a/mindspore/core/ops/reciprocal.h b/mindspore/core/ops/reciprocal.h
index 85cd39fd57a..b11cdd6e30f 100644
--- a/mindspore/core/ops/reciprocal.h
+++ b/mindspore/core/ops/reciprocal.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameReciprocal = prim::kReciprocal;
-class Reciprocal : public PrimitiveC {
+class MS_CORE_API Reciprocal : public PrimitiveC {
  public:
   Reciprocal() : PrimitiveC(prim::kPrimReciprocal->name()) { InitIOName({"x"}, {"y"}); }
   ~Reciprocal() = default;
diff --git a/mindspore/core/ops/reduce.h b/mindspore/core/ops/reduce.h
index 29a821ff527..e1afe2f8804 100644
--- a/mindspore/core/ops/reduce.h
+++ b/mindspore/core/ops/reduce.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameReduce = "Reduce";
-class Reduce : public PrimitiveC {
+class MS_CORE_API Reduce : public PrimitiveC {
  public:
   Reduce() : PrimitiveC(kNameReduce) { InitIOName({"input_x", "axis"}, {"y"}); }
   explicit Reduce(const std::string k_name) : PrimitiveC(k_name) { InitIOName({"input_x", "axis"}, {"y"}); }
diff --git a/mindspore/core/ops/reduce_all.h b/mindspore/core/ops/reduce_all.h
index 83de77ad6cc..fbccc63080b 100644
--- a/mindspore/core/ops/reduce_all.h
+++ b/mindspore/core/ops/reduce_all.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameReduceAll = "ReduceAll";
-class ReduceAll : public Reduce {
+class MS_CORE_API ReduceAll : public Reduce {
  public:
   ReduceAll() : Reduce(kNameReduceAll) { InitIOName({"input_x", "axis"}, {"y"}); }
   ~ReduceAll() = default;
diff --git a/mindspore/core/ops/reduce_any.h b/mindspore/core/ops/reduce_any.h
index 3957ee10995..2c0875342d6 100644
--- a/mindspore/core/ops/reduce_any.h
+++ b/mindspore/core/ops/reduce_any.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameReduceAny = "ReduceAny";
-class ReduceAny : public Reduce {
+class MS_CORE_API ReduceAny : public Reduce {
  public:
   ReduceAny() : Reduce(kNameReduceAny) { InitIOName({"input_x", "axis"}, {"y"}); }
   ~ReduceAny() = default;
diff --git a/mindspore/core/ops/reduce_asum.h b/mindspore/core/ops/reduce_asum.h
index 9e6e36889c5..c15fed454a1 100644
--- a/mindspore/core/ops/reduce_asum.h
+++ b/mindspore/core/ops/reduce_asum.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameReduceASum = "ReduceASum";
-class ReduceASum : public Reduce {
+class MS_CORE_API ReduceASum : public Reduce {
  public:
   ReduceASum() : Reduce(kNameReduceASum) { InitIOName({"input_x", "axis"}, {"y"}); }
   ~ReduceASum() = default;
diff --git a/mindspore/core/ops/reduce_max.h b/mindspore/core/ops/reduce_max.h
index a7cbed8517b..a2728a54827 100644
--- a/mindspore/core/ops/reduce_max.h
+++ b/mindspore/core/ops/reduce_max.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameReduceMax = "ReduceMax";
-class ReduceMax : public Reduce {
+class MS_CORE_API ReduceMax : public Reduce {
  public:
   ReduceMax() : Reduce(kNameReduceMax) { InitIOName({"input_x", "axis"}, {"y"}); }
   ~ReduceMax() = default;
diff --git a/mindspore/core/ops/reduce_mean.h b/mindspore/core/ops/reduce_mean.h
index 3bf643d8626..9d1d1c410ca 100644
--- a/mindspore/core/ops/reduce_mean.h
+++ b/mindspore/core/ops/reduce_mean.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameReduceMean = "ReduceMean";
-class ReduceMean : public Reduce {
+class MS_CORE_API ReduceMean : public Reduce {
  public:
   ReduceMean() : Reduce(kNameReduceMean) { InitIOName({"input_x", "axis"}, {"y"}); }
   ~ReduceMean() = default;
diff --git a/mindspore/core/ops/reduce_min.h b/mindspore/core/ops/reduce_min.h
index 15972cafcec..e0697b55a00 100644
--- a/mindspore/core/ops/reduce_min.h
+++ b/mindspore/core/ops/reduce_min.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameReduceMin = "ReduceMin";
-class ReduceMin : public Reduce {
+class MS_CORE_API ReduceMin : public Reduce {
  public:
   ReduceMin() : Reduce(kNameReduceMin) { InitIOName({"input_x", "axis"}, {"y"}); }
   ~ReduceMin() = default;
diff --git a/mindspore/core/ops/reduce_prod.h b/mindspore/core/ops/reduce_prod.h
index f6c7f6506d0..35155c182fe 100644
--- a/mindspore/core/ops/reduce_prod.h
+++ b/mindspore/core/ops/reduce_prod.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameReduceProd = "ReduceProd";
-class ReduceProd : public Reduce {
+class MS_CORE_API ReduceProd : public Reduce {
  public:
   ReduceProd() : Reduce(kNameReduceProd) { InitIOName({"input_x", "axis"}, {"y"}); }
   ~ReduceProd() = default;
diff --git a/mindspore/core/ops/reduce_sum.h b/mindspore/core/ops/reduce_sum.h
index 3c67e181b79..b94c55e0a04 100644
--- a/mindspore/core/ops/reduce_sum.h
+++ b/mindspore/core/ops/reduce_sum.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameReduceSum = "ReduceSum";
-class ReduceSum : public Reduce {
+class MS_CORE_API ReduceSum : public Reduce {
  public:
   ReduceSum() : Reduce(kNameReduceSum) { InitIOName({"x", "axis"}, {"y"}); }
   ~ReduceSum() = default;
diff --git a/mindspore/core/ops/reduce_sum_square.h b/mindspore/core/ops/reduce_sum_square.h
index b5c4620fb6a..a7f70a3b153 100644
--- a/mindspore/core/ops/reduce_sum_square.h
+++ b/mindspore/core/ops/reduce_sum_square.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameReduceSumSquare = "ReduceSumSquare";
-class ReduceSumSquare : public Reduce {
+class MS_CORE_API ReduceSumSquare : public Reduce {
  public:
   ReduceSumSquare() : Reduce(kNameReduceSumSquare) { InitIOName({"input_x", "axis"}, {"y"}); }
   ~ReduceSumSquare() = default;
diff --git a/mindspore/core/ops/relu.h b/mindspore/core/ops/relu.h
index 3c21bb610f2..cb417303162 100644
--- a/mindspore/core/ops/relu.h
+++ b/mindspore/core/ops/relu.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameReLU = prim::kReLU;
-class ReLU : public PrimitiveC {
+class MS_CORE_API ReLU : public PrimitiveC {
  public:
   ReLU() : PrimitiveC(kNameReLU) { InitIOName({"x"}, {"output"}); }
   ~ReLU() = default;
diff --git a/mindspore/core/ops/relu6.h b/mindspore/core/ops/relu6.h
index de76b6f6861..f2cb544d0ef 100644
--- a/mindspore/core/ops/relu6.h
+++ b/mindspore/core/ops/relu6.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameReLU6 = prim::kReLU6;
-class ReLU6 : public PrimitiveC {
+class MS_CORE_API ReLU6 : public PrimitiveC {
  public:
   ReLU6() : PrimitiveC(kNameReLU6) { InitIOName({"x"}, {"output"}); }
   ~ReLU6() = default;
diff --git a/mindspore/core/ops/reluv2.h b/mindspore/core/ops/reluv2.h
index 399ff62cbeb..af48acb9800 100644
--- a/mindspore/core/ops/reluv2.h
+++ b/mindspore/core/ops/reluv2.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameReLUV2 = prim::kReLUV2;
-class ReLUV2 : public PrimitiveC {
+class MS_CORE_API ReLUV2 : public PrimitiveC {
  public:
   ReLUV2() : PrimitiveC(prim::kPrimReluV2->name()) { InitIOName({"x"}, {"output", "mask"}); }
   explicit ReLUV2(const std::string k_name) : PrimitiveC(k_name) { InitIOName({"x"}, {"output", "mask"}); }
diff --git a/mindspore/core/ops/reshape.h b/mindspore/core/ops/reshape.h
index 38c4b0db5a0..bccebbbc633 100644
--- a/mindspore/core/ops/reshape.h
+++ b/mindspore/core/ops/reshape.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameReshape = "Reshape";
-class Reshape : public PrimitiveC {
+class MS_CORE_API Reshape : public PrimitiveC {
  public:
   Reshape() : PrimitiveC(kNameReshape) { InitIOName({"tensor", "shape"}, {"output"}); }
   ~Reshape() = default;
diff --git a/mindspore/core/ops/resize.h b/mindspore/core/ops/resize.h
index 940a97501b6..c613453d547 100644
--- a/mindspore/core/ops/resize.h
+++ b/mindspore/core/ops/resize.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameResize = "Resize";
-class Resize : public PrimitiveC {
+class MS_CORE_API Resize : public PrimitiveC {
  public:
   Resize() : PrimitiveC(kNameResize) {}
   ~Resize() = default;
diff --git a/mindspore/core/ops/resize_bilinear.h b/mindspore/core/ops/resize_bilinear.h
index e615509eec0..1a4f9c0d9c1 100644
--- a/mindspore/core/ops/resize_bilinear.h
+++ b/mindspore/core/ops/resize_bilinear.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameResizeBilinear = "ResizeBilinear";
-class ResizeBilinear : public PrimitiveC {
+class MS_CORE_API ResizeBilinear : public PrimitiveC {
  public:
   ResizeBilinear() : PrimitiveC(kNameResizeBilinear) {}
   ~ResizeBilinear() = default;
diff --git a/mindspore/core/ops/resize_nearest_neighbor.h b/mindspore/core/ops/resize_nearest_neighbor.h
index aa4a15affd4..7e1c3f97c25 100644
--- a/mindspore/core/ops/resize_nearest_neighbor.h
+++ b/mindspore/core/ops/resize_nearest_neighbor.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameResizeNearestNeighbor = "ResizeNearestNeighbor";
-class ResizeNearestNeighbor : public PrimitiveC {
+class MS_CORE_API ResizeNearestNeighbor : public PrimitiveC {
  public:
   ResizeNearestNeighbor() : PrimitiveC(kNameResizeNearestNeighbor) {}
   ~ResizeNearestNeighbor() = default;
diff --git a/mindspore/core/ops/reverse_sequence.h b/mindspore/core/ops/reverse_sequence.h
index 6fde1a86144..c0fa39929ce 100644
--- a/mindspore/core/ops/reverse_sequence.h
+++ b/mindspore/core/ops/reverse_sequence.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameReverseSequence = "ReverseSequence";
-class ReverseSequence : public PrimitiveC {
+class MS_CORE_API ReverseSequence : public PrimitiveC {
  public:
   ReverseSequence() : PrimitiveC(kNameReverseSequence) { InitIOName({"x", "seq_lengths"}, {"y"}); }
   ~ReverseSequence() = default;
diff --git a/mindspore/core/ops/reverse_v2.h b/mindspore/core/ops/reverse_v2.h
index 6df7990ddb1..914684cd2a2 100644
--- a/mindspore/core/ops/reverse_v2.h
+++ b/mindspore/core/ops/reverse_v2.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameReverseV2 = "ReverseV2";
-class ReverseV2 : public PrimitiveC {
+class MS_CORE_API ReverseV2 : public PrimitiveC {
  public:
   ReverseV2() : PrimitiveC(kNameReverseV2) {}
   ~ReverseV2() = default;
diff --git a/mindspore/core/ops/rfft.h b/mindspore/core/ops/rfft.h
index 1edf6b4fba1..1b042f9c571 100644
--- a/mindspore/core/ops/rfft.h
+++ b/mindspore/core/ops/rfft.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameRfft = "Rfft";
-class Rfft : public PrimitiveC {
+class MS_CORE_API Rfft : public PrimitiveC {
  public:
   Rfft() : PrimitiveC(kNameRfft) {}
   ~Rfft() = default;
diff --git a/mindspore/core/ops/roi_pooling.h b/mindspore/core/ops/roi_pooling.h
index 3fe61323df1..5f255bdf5f2 100644
--- a/mindspore/core/ops/roi_pooling.h
+++ b/mindspore/core/ops/roi_pooling.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameROIPooling = "ROIPooling";
-class ROIPooling : public PrimitiveC {
+class MS_CORE_API ROIPooling : public PrimitiveC {
  public:
   ROIPooling() : PrimitiveC(kNameROIPooling) {}
   ~ROIPooling() = default;
diff --git a/mindspore/core/ops/round.h b/mindspore/core/ops/round.h
index bec87da0731..11956d6fad8 100644
--- a/mindspore/core/ops/round.h
+++ b/mindspore/core/ops/round.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameRound = "Round";
-class Round : public PrimitiveC {
+class MS_CORE_API Round : public PrimitiveC {
  public:
   Round() : PrimitiveC(kNameRound) { InitIOName({"input_x"}, {"output"}); }
   ~Round() = default;
diff --git a/mindspore/core/ops/rsqrt.h b/mindspore/core/ops/rsqrt.h
index 96b7d3c02be..547edb91595 100644
--- a/mindspore/core/ops/rsqrt.h
+++ b/mindspore/core/ops/rsqrt.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameRsqrt = "Rsqrt";
-class Rsqrt : public PrimitiveC {
+class MS_CORE_API Rsqrt : public PrimitiveC {
  public:
   Rsqrt() : PrimitiveC(kNameRsqrt) { InitIOName({"x"}, {"output"}); }
   ~Rsqrt() = default;
diff --git a/mindspore/core/ops/scalar_summary.h b/mindspore/core/ops/scalar_summary.h
index c688f3f7b81..32bd58cb023 100644
--- a/mindspore/core/ops/scalar_summary.h
+++ b/mindspore/core/ops/scalar_summary.h
@@ -27,7 +27,7 @@
 
 namespace mindspore {
 namespace ops {
-class ScalarSummary : public PrimitiveC {
+class MS_CORE_API ScalarSummary : public PrimitiveC {
  public:
   ScalarSummary() : PrimitiveC(prim::kPrimScalarSummary->name()) {}
   ~ScalarSummary() = default;
diff --git a/mindspore/core/ops/scale.h b/mindspore/core/ops/scale.h
index 3b224f44dd9..512c029803f 100644
--- a/mindspore/core/ops/scale.h
+++ b/mindspore/core/ops/scale.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameScale = "Scale";
-class Scale : public PrimitiveC {
+class MS_CORE_API Scale : public PrimitiveC {
  public:
   Scale() : PrimitiveC(kNameScale) {}
   explicit Scale(const std::string k_name) : PrimitiveC(k_name) {}
diff --git a/mindspore/core/ops/scatter_nd.h b/mindspore/core/ops/scatter_nd.h
index dfa678bfd56..7e5be0f7256 100644
--- a/mindspore/core/ops/scatter_nd.h
+++ b/mindspore/core/ops/scatter_nd.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameScatterNd = "ScatterNd";
-class ScatterNd : public PrimitiveC {
+class MS_CORE_API ScatterNd : public PrimitiveC {
  public:
   ScatterNd() : PrimitiveC(kNameScatterNd) { InitIOName({"indices", "update", "shape"}, {"output"}); }
   ~ScatterNd() = default;
diff --git a/mindspore/core/ops/scatter_nd_update.h b/mindspore/core/ops/scatter_nd_update.h
index 5909f0ef48a..03b42cc86b7 100644
--- a/mindspore/core/ops/scatter_nd_update.h
+++ b/mindspore/core/ops/scatter_nd_update.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameScatterNdUpdate = "ScatterNdUpdate";
-class ScatterNdUpdate : public PrimitiveC {
+class MS_CORE_API ScatterNdUpdate : public PrimitiveC {
  public:
   ScatterNdUpdate() : PrimitiveC(kNameScatterNdUpdate) { InitIOName({"input_x", "indices", "update"}, {"output"}); }
   ~ScatterNdUpdate() = default;
diff --git a/mindspore/core/ops/select.h b/mindspore/core/ops/select.h
index 79a3b1b38df..44ea7819347 100644
--- a/mindspore/core/ops/select.h
+++ b/mindspore/core/ops/select.h
@@ -31,7 +31,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSelect = "Select";
-class Select : public PrimitiveC {
+class MS_CORE_API Select : public PrimitiveC {
  public:
   Select() : PrimitiveC(kNameSelect) { InitIOName({"condition", "x", "y"}, {"output"}); }
   ~Select() = default;
diff --git a/mindspore/core/ops/sgd.h b/mindspore/core/ops/sgd.h
index 20025232055..69d87a18f59 100644
--- a/mindspore/core/ops/sgd.h
+++ b/mindspore/core/ops/sgd.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSGD = "SGD";
-class SGD : public PrimitiveC {
+class MS_CORE_API SGD : public PrimitiveC {
  public:
   SGD() : PrimitiveC(kNameSGD) {}
   ~SGD() = default;
diff --git a/mindspore/core/ops/shape.h b/mindspore/core/ops/shape.h
index d359eebcd40..4962ee7e32b 100644
--- a/mindspore/core/ops/shape.h
+++ b/mindspore/core/ops/shape.h
@@ -26,7 +26,7 @@
 
 namespace mindspore {
 namespace ops {
-class Shape : public PrimitiveC {
+class MS_CORE_API Shape : public PrimitiveC {
  public:
   Shape() : PrimitiveC(prim::kPrimShape->name()) {}
   ~Shape() = default;
diff --git a/mindspore/core/ops/sigmoid.h b/mindspore/core/ops/sigmoid.h
index b67a7dc3653..097d0560946 100644
--- a/mindspore/core/ops/sigmoid.h
+++ b/mindspore/core/ops/sigmoid.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSigmoid = "Sigmoid";
-class Sigmoid : public PrimitiveC {
+class MS_CORE_API Sigmoid : public PrimitiveC {
  public:
   Sigmoid() : PrimitiveC(kNameSigmoid) { InitIOName({"x"}, {"output"}); }
   ~Sigmoid() = default;
diff --git a/mindspore/core/ops/sigmoid_cross_entropy_with_logits.h b/mindspore/core/ops/sigmoid_cross_entropy_with_logits.h
index d34987cbe9b..4ad8e17a3ec 100644
--- a/mindspore/core/ops/sigmoid_cross_entropy_with_logits.h
+++ b/mindspore/core/ops/sigmoid_cross_entropy_with_logits.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSigmoidCrossEntropyWithLogits = "SigmoidCrossEntropyWithLogits";
-class SigmoidCrossEntropyWithLogits : public PrimitiveC {
+class MS_CORE_API SigmoidCrossEntropyWithLogits : public PrimitiveC {
  public:
   SigmoidCrossEntropyWithLogits() : PrimitiveC(kNameSigmoidCrossEntropyWithLogits) {
     InitIOName({"predict", "target"}, {"loss"});
diff --git a/mindspore/core/ops/sin.h b/mindspore/core/ops/sin.h
index a6e050ae5f4..6db737b1784 100644
--- a/mindspore/core/ops/sin.h
+++ b/mindspore/core/ops/sin.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSin = "Sin";
-class Sin : public PrimitiveC {
+class MS_CORE_API Sin : public PrimitiveC {
  public:
   Sin() : PrimitiveC(kNameSin) {}
   ~Sin() = default;
diff --git a/mindspore/core/ops/size.h b/mindspore/core/ops/size.h
index b5396bab958..354dfa4498d 100644
--- a/mindspore/core/ops/size.h
+++ b/mindspore/core/ops/size.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSize = "Size";
-class Size : public PrimitiveC {
+class MS_CORE_API Size : public PrimitiveC {
  public:
   Size() : PrimitiveC(kNameSize) {}
   ~Size() = default;
diff --git a/mindspore/core/ops/skip_gram.h b/mindspore/core/ops/skip_gram.h
index 8fdbb64514b..2d1e09b9ac1 100644
--- a/mindspore/core/ops/skip_gram.h
+++ b/mindspore/core/ops/skip_gram.h
@@ -31,7 +31,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSkipGram = "SkipGram";
-class SkipGram : public PrimitiveC {
+class MS_CORE_API SkipGram : public PrimitiveC {
  public:
   SkipGram() : PrimitiveC(kNameSkipGram) {}
   ~SkipGram() = default;
diff --git a/mindspore/core/ops/slice.h b/mindspore/core/ops/slice.h
index da040f4c4e3..446fd6124ed 100644
--- a/mindspore/core/ops/slice.h
+++ b/mindspore/core/ops/slice.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSlice = "Slice";
-class Slice : public PrimitiveC {
+class MS_CORE_API Slice : public PrimitiveC {
  public:
   Slice() : PrimitiveC(kNameSlice) { InitIOName({"x", "begin", "size"}, {"output"}); }
   ~Slice() = default;
diff --git a/mindspore/core/ops/smooth_l1_loss.h b/mindspore/core/ops/smooth_l1_loss.h
index 972a27aae14..8e5f951705e 100644
--- a/mindspore/core/ops/smooth_l1_loss.h
+++ b/mindspore/core/ops/smooth_l1_loss.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSmoothL1Loss = "SmoothL1Loss";
-class SmoothL1Loss : public PrimitiveC {
+class MS_CORE_API SmoothL1Loss : public PrimitiveC {
  public:
   SmoothL1Loss() : PrimitiveC(kNameSmoothL1Loss) { InitIOName({"prediction", "target"}, {"output"}); }
   ~SmoothL1Loss() = default;
diff --git a/mindspore/core/ops/soft_margin_loss.h b/mindspore/core/ops/soft_margin_loss.h
index e670d99dc51..53f63fa38be 100644
--- a/mindspore/core/ops/soft_margin_loss.h
+++ b/mindspore/core/ops/soft_margin_loss.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSoftMarginLoss = "SoftMarginLoss";
-class SoftMarginLoss : public PrimitiveC {
+class MS_CORE_API SoftMarginLoss : public PrimitiveC {
  public:
   SoftMarginLoss() : PrimitiveC(kNameSoftMarginLoss) { InitIOName({"predict", "label"}, {"loss"}); }
   ~SoftMarginLoss() = default;
diff --git a/mindspore/core/ops/soft_shrink.h b/mindspore/core/ops/soft_shrink.h
index ce9531d6324..c93f5e7c819 100644
--- a/mindspore/core/ops/soft_shrink.h
+++ b/mindspore/core/ops/soft_shrink.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSoftShrink = "SoftShrink";
-class SoftShrink : public PrimitiveC {
+class MS_CORE_API SoftShrink : public PrimitiveC {
  public:
   SoftShrink() : PrimitiveC(kNameSoftShrink) { InitIOName({"input_x"}, {"output"}); }
   ~SoftShrink() = default;
diff --git a/mindspore/core/ops/softmax.h b/mindspore/core/ops/softmax.h
index 464ed7c572e..bd1995f9f64 100644
--- a/mindspore/core/ops/softmax.h
+++ b/mindspore/core/ops/softmax.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSoftmax = "Softmax";
-class Softmax : public PrimitiveC {
+class MS_CORE_API Softmax : public PrimitiveC {
  public:
   Softmax() : PrimitiveC(kNameSoftmax) { InitIOName({"x"}, {"output"}); }
   ~Softmax() = default;
diff --git a/mindspore/core/ops/softmax_cross_entropy_with_logits.h b/mindspore/core/ops/softmax_cross_entropy_with_logits.h
index 61d10de048b..feb5e953fca 100644
--- a/mindspore/core/ops/softmax_cross_entropy_with_logits.h
+++ b/mindspore/core/ops/softmax_cross_entropy_with_logits.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSoftmaxCrossEntropyWithLogits = "SoftmaxCrossEntropyWithLogits";
-class SoftmaxCrossEntropyWithLogits : public PrimitiveC {
+class MS_CORE_API SoftmaxCrossEntropyWithLogits : public PrimitiveC {
  public:
   SoftmaxCrossEntropyWithLogits() : PrimitiveC(kNameSoftmaxCrossEntropyWithLogits) {}
   ~SoftmaxCrossEntropyWithLogits() = default;
diff --git a/mindspore/core/ops/softplus.cc b/mindspore/core/ops/softplus.cc
index e77999a9439..0f6077329a6 100644
--- a/mindspore/core/ops/softplus.cc
+++ b/mindspore/core/ops/softplus.cc
@@ -39,7 +39,7 @@ TypePtr SoftplusInferType(const PrimitivePtr &prim, const std::vector<AbstractBa
   MS_EXCEPTION_IF_NULL(prim);
   auto prim_name = prim->name();
   // check
-  std::set<TypePtr> valid_index_types = {kFloat16, kFloat32, kFloat64};
+  std::set<TypePtr> valid_index_types = {kFloat16, kFloat32};
   auto x_type = input_args[0]->BuildType();
   (void)CheckAndConvertUtils::CheckTensorTypeValid("x", x_type, valid_index_types, prim_name);
   return x_type;
diff --git a/mindspore/core/ops/softplus.h b/mindspore/core/ops/softplus.h
index 42bc40518bb..4906c7d87e2 100644
--- a/mindspore/core/ops/softplus.h
+++ b/mindspore/core/ops/softplus.h
@@ -27,7 +27,7 @@
 
 namespace mindspore {
 namespace ops {
-class Softplus : public PrimitiveC {
+class MS_CORE_API Softplus : public PrimitiveC {
  public:
   Softplus() : PrimitiveC(prim::kPrimSoftplus->name()) { InitIOName({"x"}, {"output"}); }
   ~Softplus() = default;
diff --git a/mindspore/core/ops/space_to_batch.h b/mindspore/core/ops/space_to_batch.h
index 0e2197af522..e9afa33c770 100644
--- a/mindspore/core/ops/space_to_batch.h
+++ b/mindspore/core/ops/space_to_batch.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSpaceToBatch = "SpaceToBatch";
-class SpaceToBatch : public PrimitiveC {
+class MS_CORE_API SpaceToBatch : public PrimitiveC {
  public:
   SpaceToBatch() : PrimitiveC(kNameSpaceToBatch) {}
   ~SpaceToBatch() = default;
diff --git a/mindspore/core/ops/space_to_batch_nd.h b/mindspore/core/ops/space_to_batch_nd.h
index dafd345d262..8820fb16af4 100644
--- a/mindspore/core/ops/space_to_batch_nd.h
+++ b/mindspore/core/ops/space_to_batch_nd.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSpaceToBatchND = "SpaceToBatchND";
-class SpaceToBatchND : public PrimitiveC {
+class MS_CORE_API SpaceToBatchND : public PrimitiveC {
  public:
   SpaceToBatchND() : PrimitiveC(kNameSpaceToBatchND) {}
   ~SpaceToBatchND() = default;
diff --git a/mindspore/core/ops/space_to_depth.h b/mindspore/core/ops/space_to_depth.h
index edc50abfbe6..c307f225a15 100644
--- a/mindspore/core/ops/space_to_depth.h
+++ b/mindspore/core/ops/space_to_depth.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSpaceToDepth = "SpaceToDepth";
-class SpaceToDepth : public PrimitiveC {
+class MS_CORE_API SpaceToDepth : public PrimitiveC {
  public:
   SpaceToDepth() : PrimitiveC(kNameSpaceToDepth) { InitIOName({"x"}, {"y"}); }
   ~SpaceToDepth() = default;
diff --git a/mindspore/core/ops/sparse_softmax_cross_entropy_with_logits.h b/mindspore/core/ops/sparse_softmax_cross_entropy_with_logits.h
index 56528354e0e..569f0bf57d9 100644
--- a/mindspore/core/ops/sparse_softmax_cross_entropy_with_logits.h
+++ b/mindspore/core/ops/sparse_softmax_cross_entropy_with_logits.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSparseSoftmaxCrossEntropyWithLogits = "SparseSoftmaxCrossEntropyWithLogits";
-class SparseSoftmaxCrossEntropyWithLogits : public PrimitiveC {
+class MS_CORE_API SparseSoftmaxCrossEntropyWithLogits : public PrimitiveC {
  public:
   SparseSoftmaxCrossEntropyWithLogits() : PrimitiveC(kNameSparseSoftmaxCrossEntropyWithLogits) {}
   ~SparseSoftmaxCrossEntropyWithLogits() = default;
diff --git a/mindspore/core/ops/sparse_to_dense.h b/mindspore/core/ops/sparse_to_dense.h
index 27820a60ef7..28ba3d489ff 100644
--- a/mindspore/core/ops/sparse_to_dense.h
+++ b/mindspore/core/ops/sparse_to_dense.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSparseToDense = "SparseToDense";
-class SparseToDense : public PrimitiveC {
+class MS_CORE_API SparseToDense : public PrimitiveC {
  public:
   SparseToDense() : PrimitiveC(kNameSparseToDense) { InitIOName({"indices", "values", "dense_shape"}, {"output"}); }
   ~SparseToDense() = default;
diff --git a/mindspore/core/ops/splice.h b/mindspore/core/ops/splice.h
index b9f1f69305b..137ee0b8de2 100644
--- a/mindspore/core/ops/splice.h
+++ b/mindspore/core/ops/splice.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSplice = "Splice";
-class Splice : public PrimitiveC {
+class MS_CORE_API Splice : public PrimitiveC {
  public:
   Splice() : PrimitiveC(kNameSplice) { InitIOName({"inputs"}, {"outputs"}); }
   ~Splice() = default;
diff --git a/mindspore/core/ops/split.h b/mindspore/core/ops/split.h
index 2a745180d8e..7844eb195c9 100644
--- a/mindspore/core/ops/split.h
+++ b/mindspore/core/ops/split.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSplit = "Split";
-class Split : public PrimitiveC {
+class MS_CORE_API Split : public PrimitiveC {
  public:
   Split() : PrimitiveC(kNameSplit) {}
   ~Split() = default;
diff --git a/mindspore/core/ops/split_with_overlap.h b/mindspore/core/ops/split_with_overlap.h
index 37853ae92be..8a76f525e60 100644
--- a/mindspore/core/ops/split_with_overlap.h
+++ b/mindspore/core/ops/split_with_overlap.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSplitWithOverlap = "SplitWithOverlap";
-class SplitWithOverlap : public PrimitiveC {
+class MS_CORE_API SplitWithOverlap : public PrimitiveC {
  public:
   SplitWithOverlap() : PrimitiveC(kNameSplitWithOverlap) {}
   ~SplitWithOverlap() = default;
diff --git a/mindspore/core/ops/sqrt.h b/mindspore/core/ops/sqrt.h
index 5de82345508..a679108a491 100644
--- a/mindspore/core/ops/sqrt.h
+++ b/mindspore/core/ops/sqrt.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSqrt = "Sqrt";
-class Sqrt : public PrimitiveC {
+class MS_CORE_API Sqrt : public PrimitiveC {
  public:
   Sqrt() : PrimitiveC(kNameSqrt) { InitIOName({"x"}, {"output"}); }
   ~Sqrt() = default;
diff --git a/mindspore/core/ops/square.h b/mindspore/core/ops/square.h
index 4c39c1dccc1..cc1cc393a48 100644
--- a/mindspore/core/ops/square.h
+++ b/mindspore/core/ops/square.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSquare = "Square";
-class Square : public PrimitiveC {
+class MS_CORE_API Square : public PrimitiveC {
  public:
   Square() : PrimitiveC(kNameSquare) { InitIOName({"input_x"}, {"y"}); }
   ~Square() = default;
diff --git a/mindspore/core/ops/squared_difference.h b/mindspore/core/ops/squared_difference.h
index c5362e09699..56541954f21 100644
--- a/mindspore/core/ops/squared_difference.h
+++ b/mindspore/core/ops/squared_difference.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSquaredDifference = "SquaredDifference";
-class SquaredDifference : public PrimitiveC {
+class MS_CORE_API SquaredDifference : public PrimitiveC {
  public:
   SquaredDifference() : PrimitiveC(kNameSquaredDifference) { InitIOName({"x", "y"}, {"output"}); }
   ~SquaredDifference() = default;
diff --git a/mindspore/core/ops/squeeze.h b/mindspore/core/ops/squeeze.h
index 6a467a7c1db..c78c8448f6f 100644
--- a/mindspore/core/ops/squeeze.h
+++ b/mindspore/core/ops/squeeze.h
@@ -31,7 +31,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSqueeze = "Squeeze";
-class Squeeze : public PrimitiveC {
+class MS_CORE_API Squeeze : public PrimitiveC {
  public:
   Squeeze() : PrimitiveC(kNameSqueeze) { InitIOName({"x"}, {"output"}); }
   ~Squeeze() = default;
diff --git a/mindspore/core/ops/stack.h b/mindspore/core/ops/stack.h
index 09a1b21e478..81328e3ce73 100644
--- a/mindspore/core/ops/stack.h
+++ b/mindspore/core/ops/stack.h
@@ -31,7 +31,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameStack = "Stack";
-class Stack : public PrimitiveC {
+class MS_CORE_API Stack : public PrimitiveC {
  public:
   Stack() : PrimitiveC(kNameStack) {}
   ~Stack() = default;
diff --git a/mindspore/core/ops/strided_slice.h b/mindspore/core/ops/strided_slice.h
index dcbb0ba66a1..a05f8e7c30c 100644
--- a/mindspore/core/ops/strided_slice.h
+++ b/mindspore/core/ops/strided_slice.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameStridedSlice = prim::kStridedSlice;
-class StridedSlice : public PrimitiveC {
+class MS_CORE_API StridedSlice : public PrimitiveC {
  public:
   StridedSlice() : PrimitiveC(prim::kPrimStridedSlice->name()) {
     InitIOName({"x", "begin", "end", "strides"}, {"output"});
diff --git a/mindspore/core/ops/sub.h b/mindspore/core/ops/sub.h
index 89c96538379..50a847a5052 100644
--- a/mindspore/core/ops/sub.h
+++ b/mindspore/core/ops/sub.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSub = prim::kSub;
-class Sub : public PrimitiveC {
+class MS_CORE_API Sub : public PrimitiveC {
  public:
   Sub() : PrimitiveC(kNameSub) { InitIOName({"x", "y"}, {"output"}); }
   explicit Sub(const std::string k_name) : PrimitiveC(k_name) { InitIOName({"x", "y"}, {"output"}); }
diff --git a/mindspore/core/ops/switch.h b/mindspore/core/ops/switch.h
index 5782b30f9d2..ecd82b03b8b 100644
--- a/mindspore/core/ops/switch.h
+++ b/mindspore/core/ops/switch.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameSwitch = "Switch";
-class Switch : public PrimitiveC {
+class MS_CORE_API Switch : public PrimitiveC {
  public:
   Switch() : PrimitiveC(kNameSwitch) {}
   ~Switch() = default;
diff --git a/mindspore/core/ops/tan.h b/mindspore/core/ops/tan.h
index 85d914867e1..e70c320387f 100644
--- a/mindspore/core/ops/tan.h
+++ b/mindspore/core/ops/tan.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameTan = "Tan";
-class Tan : public PrimitiveC {
+class MS_CORE_API Tan : public PrimitiveC {
  public:
   Tan() : PrimitiveC(kNameTan) {}
   ~Tan() = default;
diff --git a/mindspore/core/ops/tanh.h b/mindspore/core/ops/tanh.h
index e4a15a1dad2..f329dc82b35 100644
--- a/mindspore/core/ops/tanh.h
+++ b/mindspore/core/ops/tanh.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameTanh = "Tanh";
-class Tanh : public PrimitiveC {
+class MS_CORE_API Tanh : public PrimitiveC {
  public:
   Tanh() : PrimitiveC(kNameTanh) {}
   ~Tanh() = default;
diff --git a/mindspore/core/ops/tensor_array.h b/mindspore/core/ops/tensor_array.h
index f633c3249e1..8dfc5dc3727 100644
--- a/mindspore/core/ops/tensor_array.h
+++ b/mindspore/core/ops/tensor_array.h
@@ -25,7 +25,7 @@ namespace ops {
 
 constexpr auto kNameTensorArray = "TensorArray";
 
-class TensorArray : public PrimitiveC {
+class MS_CORE_API TensorArray : public PrimitiveC {
  public:
   TensorArray() : PrimitiveC(kNameTensorArray) { InitIOName({"size"}, {"handle", "flow"}); }
   ~TensorArray() = default;
diff --git a/mindspore/core/ops/tensor_array_read.h b/mindspore/core/ops/tensor_array_read.h
index 18d4af00d5b..ec28194d81e 100644
--- a/mindspore/core/ops/tensor_array_read.h
+++ b/mindspore/core/ops/tensor_array_read.h
@@ -25,7 +25,7 @@ namespace ops {
 
 constexpr auto kNameTensorArrayRead = "TensorArrayRead";
 
-class TensorArrayRead : public PrimitiveC {
+class MS_CORE_API TensorArrayRead : public PrimitiveC {
  public:
   TensorArrayRead() : PrimitiveC(kNameTensorArrayRead) { InitIOName({"handle", "index", "flow_in"}, {"tensor"}); }
   ~TensorArrayRead() = default;
diff --git a/mindspore/core/ops/tensor_array_write.h b/mindspore/core/ops/tensor_array_write.h
index efee0ae62bf..8035cc28095 100644
--- a/mindspore/core/ops/tensor_array_write.h
+++ b/mindspore/core/ops/tensor_array_write.h
@@ -25,7 +25,7 @@ namespace ops {
 
 constexpr auto kNameTensorArrayWrite = "TensorArrayWrite";
 
-class TensorArrayWrite : public PrimitiveC {
+class MS_CORE_API TensorArrayWrite : public PrimitiveC {
  public:
   TensorArrayWrite() : PrimitiveC(kNameTensorArrayWrite) {
     InitIOName({"handle", "index", "value", "flow_in"}, {"flow_out"});
diff --git a/mindspore/core/ops/tensor_list_from_tensor.h b/mindspore/core/ops/tensor_list_from_tensor.h
index 62ba4a63a1f..905370e8350 100644
--- a/mindspore/core/ops/tensor_list_from_tensor.h
+++ b/mindspore/core/ops/tensor_list_from_tensor.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameTensorListFromTensor = "TensorListFromTensor";
-class TensorListFromTensor : public PrimitiveC {
+class MS_CORE_API TensorListFromTensor : public PrimitiveC {
  public:
   TensorListFromTensor() : PrimitiveC(kNameTensorListFromTensor) {}
   ~TensorListFromTensor() = default;
diff --git a/mindspore/core/ops/tensor_list_get_item.h b/mindspore/core/ops/tensor_list_get_item.h
index a86cf0b3451..b65acdb07e8 100644
--- a/mindspore/core/ops/tensor_list_get_item.h
+++ b/mindspore/core/ops/tensor_list_get_item.h
@@ -24,7 +24,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameTensorListGetItem = "TensorListGetItem";
-class TensorListGetItem : public PrimitiveC {
+class MS_CORE_API TensorListGetItem : public PrimitiveC {
  public:
   TensorListGetItem() : PrimitiveC(kNameTensorListGetItem) {}
   ~TensorListGetItem() = default;
diff --git a/mindspore/core/ops/tensor_list_reserve.h b/mindspore/core/ops/tensor_list_reserve.h
index adcf1ebe463..9f0f680e77d 100644
--- a/mindspore/core/ops/tensor_list_reserve.h
+++ b/mindspore/core/ops/tensor_list_reserve.h
@@ -24,7 +24,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameTensorListReserve = "TensorListReserve";
-class TensorListReserve : public PrimitiveC {
+class MS_CORE_API TensorListReserve : public PrimitiveC {
  public:
   TensorListReserve() : PrimitiveC(kNameTensorListReserve) {}
   ~TensorListReserve() = default;
diff --git a/mindspore/core/ops/tensor_list_set_item.h b/mindspore/core/ops/tensor_list_set_item.h
index 045824449ed..7b265e33df7 100644
--- a/mindspore/core/ops/tensor_list_set_item.h
+++ b/mindspore/core/ops/tensor_list_set_item.h
@@ -24,7 +24,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameTensorListSetItem = "TensorListSetItem";
-class TensorListSetItem : public PrimitiveC {
+class MS_CORE_API TensorListSetItem : public PrimitiveC {
  public:
   TensorListSetItem() : PrimitiveC(kNameTensorListSetItem) {}
   ~TensorListSetItem() = default;
diff --git a/mindspore/core/ops/tensor_list_stack.h b/mindspore/core/ops/tensor_list_stack.h
index ad51e3ec791..67837002b64 100644
--- a/mindspore/core/ops/tensor_list_stack.h
+++ b/mindspore/core/ops/tensor_list_stack.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameTensorListStack = "TensorListStack";
-class TensorListStack : public PrimitiveC {
+class MS_CORE_API TensorListStack : public PrimitiveC {
  public:
   TensorListStack() : PrimitiveC(kNameTensorListStack) {}
   ~TensorListStack() = default;
diff --git a/mindspore/core/ops/tensor_summary.h b/mindspore/core/ops/tensor_summary.h
index 317e22f6689..666c7a31de4 100644
--- a/mindspore/core/ops/tensor_summary.h
+++ b/mindspore/core/ops/tensor_summary.h
@@ -27,7 +27,7 @@
 
 namespace mindspore {
 namespace ops {
-class TensorSummary : public PrimitiveC {
+class MS_CORE_API TensorSummary : public PrimitiveC {
  public:
   TensorSummary() : PrimitiveC(prim::kPrimTensorSummary->name()) {}
   ~TensorSummary() = default;
diff --git a/mindspore/core/ops/tile.h b/mindspore/core/ops/tile.h
index 6150649d0ae..ca3be5763f0 100644
--- a/mindspore/core/ops/tile.h
+++ b/mindspore/core/ops/tile.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameTile = prim::kTile;
-class Tile : public PrimitiveC {
+class MS_CORE_API Tile : public PrimitiveC {
  public:
   Tile() : PrimitiveC(kNameTile) { InitIOName({"x", "multiples"}, {"output"}); }
   explicit Tile(const std::string k_name) : PrimitiveC(k_name) { InitIOName({"x", "multiples"}, {"output"}); }
diff --git a/mindspore/core/ops/to_format.h b/mindspore/core/ops/to_format.h
index 3e438c168cd..141d285ceb4 100644
--- a/mindspore/core/ops/to_format.h
+++ b/mindspore/core/ops/to_format.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameToFormat = "ToFormat";
-class ToFormat : public PrimitiveC {
+class MS_CORE_API ToFormat : public PrimitiveC {
  public:
   ToFormat() : PrimitiveC(kNameToFormat) {}
   ~ToFormat() = default;
diff --git a/mindspore/core/ops/topk.h b/mindspore/core/ops/topk.h
index 94a5cf8c126..16258ce6e81 100644
--- a/mindspore/core/ops/topk.h
+++ b/mindspore/core/ops/topk.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameTopK = "TopK";
-class TopK : public PrimitiveC {
+class MS_CORE_API TopK : public PrimitiveC {
  public:
   explicit TopK(const std::string &k_name = kNameTopK) : PrimitiveC(k_name) {
     InitIOName({"input", "k"}, {"values", "indices"});
diff --git a/mindspore/core/ops/transpose.h b/mindspore/core/ops/transpose.h
index 56b4c25f327..230b4d970d9 100644
--- a/mindspore/core/ops/transpose.h
+++ b/mindspore/core/ops/transpose.h
@@ -25,7 +25,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameTranspose = prim::kTranspose;
-class Transpose : public PrimitiveC {
+class MS_CORE_API Transpose : public PrimitiveC {
  public:
   Transpose() : PrimitiveC(prim::kTranspose) { InitIOName({"x", "perm"}, {"output"}); }
   ~Transpose() = default;
diff --git a/mindspore/core/ops/uniform_real.h b/mindspore/core/ops/uniform_real.h
index 4ca1b366de4..34dd9fccfd3 100644
--- a/mindspore/core/ops/uniform_real.h
+++ b/mindspore/core/ops/uniform_real.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameUniformReal = "UniformReal";
-class UniformReal : public PrimitiveC {
+class MS_CORE_API UniformReal : public PrimitiveC {
  public:
   UniformReal() : PrimitiveC(kNameUniformReal) {}
   ~UniformReal() = default;
diff --git a/mindspore/core/ops/unique.h b/mindspore/core/ops/unique.h
index d0d797c0611..e0526aae9bc 100644
--- a/mindspore/core/ops/unique.h
+++ b/mindspore/core/ops/unique.h
@@ -23,7 +23,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameUnique = "Unique";
-class Unique : public PrimitiveC {
+class MS_CORE_API Unique : public PrimitiveC {
  public:
   Unique() : PrimitiveC(kNameUnique) { InitIOName({"x", "y"}, {"output"}); }
   ~Unique() = default;
diff --git a/mindspore/core/ops/unpack.h b/mindspore/core/ops/unpack.h
index ee53d711e60..79820bfc8f2 100644
--- a/mindspore/core/ops/unpack.h
+++ b/mindspore/core/ops/unpack.h
@@ -31,7 +31,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameUnpack = "Unpack";
-class Unpack : public PrimitiveC {
+class MS_CORE_API Unpack : public PrimitiveC {
  public:
   Unpack() : PrimitiveC(kNameUnpack) {}
   ~Unpack() = default;
diff --git a/mindspore/core/ops/unsorted_segment_sum.h b/mindspore/core/ops/unsorted_segment_sum.h
index 986c3b730cb..5bd13acf6f2 100644
--- a/mindspore/core/ops/unsorted_segment_sum.h
+++ b/mindspore/core/ops/unsorted_segment_sum.h
@@ -28,7 +28,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameUnsortedSegmentSum = "UnsortedSegmentSum";
-class UnsortedSegmentSum : public PrimitiveC {
+class MS_CORE_API UnsortedSegmentSum : public PrimitiveC {
  public:
   UnsortedSegmentSum() : PrimitiveC(kNameUnsortedSegmentSum) {
     InitIOName({"x", "segment_ids", "num_segments"}, {"y"});
diff --git a/mindspore/core/ops/unsqueeze.h b/mindspore/core/ops/unsqueeze.h
index a207d51db2b..7dd2f44be03 100644
--- a/mindspore/core/ops/unsqueeze.h
+++ b/mindspore/core/ops/unsqueeze.h
@@ -27,7 +27,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameUnsqueeze = "Unsqueeze";
-class Unsqueeze : public PrimitiveC {
+class MS_CORE_API Unsqueeze : public PrimitiveC {
  public:
   Unsqueeze() : PrimitiveC(kNameUnsqueeze) {}
   ~Unsqueeze() = default;
diff --git a/mindspore/core/ops/unstack.h b/mindspore/core/ops/unstack.h
index 3657f5dc7dd..91712c62359 100644
--- a/mindspore/core/ops/unstack.h
+++ b/mindspore/core/ops/unstack.h
@@ -31,7 +31,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameUnstack = "Unstack";
-class Unstack : public PrimitiveC {
+class MS_CORE_API Unstack : public PrimitiveC {
  public:
   Unstack() : PrimitiveC(kNameUnstack) {}
   ~Unstack() = default;
diff --git a/mindspore/core/ops/where.h b/mindspore/core/ops/where.h
index 5a0b8c37a88..c957e8b275d 100644
--- a/mindspore/core/ops/where.h
+++ b/mindspore/core/ops/where.h
@@ -26,7 +26,7 @@
 namespace mindspore {
 namespace ops {
 constexpr auto kNameWhere = "Where";
-class Where : public PrimitiveC {
+class MS_CORE_API Where : public PrimitiveC {
  public:
   Where() : PrimitiveC(kNameWhere) { InitIOName({"condition"}, {"output"}); }
   ~Where() = default;
diff --git a/mindspore/core/ops/zeros.h b/mindspore/core/ops/zeros.h
index d0c23bfc329..d6b70ec8d75 100644
--- a/mindspore/core/ops/zeros.h
+++ b/mindspore/core/ops/zeros.h
@@ -27,7 +27,7 @@
 
 namespace mindspore {
 namespace ops {
-class Zeros : public PrimitiveC {
+class MS_CORE_API Zeros : public PrimitiveC {
  public:
   Zeros() : PrimitiveC(prim::kPrimZeros->name()) {}
   ~Zeros() = default;
diff --git a/mindspore/core/ops/zeros_like.h b/mindspore/core/ops/zeros_like.h
index 7dde20d6876..0b404bcfb47 100644
--- a/mindspore/core/ops/zeros_like.h
+++ b/mindspore/core/ops/zeros_like.h
@@ -25,7 +25,7 @@
 
 namespace mindspore {
 namespace ops {
-class ZerosLike : public PrimitiveC {
+class MS_CORE_API ZerosLike : public PrimitiveC {
  public:
   ZerosLike() : PrimitiveC(prim::kPrimZerosLike->name()) { InitIOName({"x"}, {"y"}); }
   ~ZerosLike() = default;
diff --git a/mindspore/core/utils/log_adapter.cc b/mindspore/core/utils/log_adapter.cc
index 1bd1c7888fb..a31501075c3 100644
--- a/mindspore/core/utils/log_adapter.cc
+++ b/mindspore/core/utils/log_adapter.cc
@@ -34,7 +34,7 @@ static std::string GetProcName() {
 #else
   const std::string appname = "?";
 #endif
-  // some times, the appname is an absolute path, its too long
+  // sometimes, the app name is an absolute path, it is too long
   std::string app_name(appname);
   std::size_t pos = app_name.rfind("/");
   if (pos == std::string::npos) {
@@ -420,29 +420,49 @@ __attribute__((constructor)) void common_log_init(void) {
 void common_log_init(void) {
 #endif
 #ifdef USE_GLOG
-  // do not use glog predefined log prefix
+  // Do not use glog predefined log prefix
   FLAGS_log_prefix = false;
+  // Write log to files real-time
   FLAGS_logbufsecs = 0;
-  // set default log level to WARNING
+  // Set default log level to WARNING
   if (mindspore::GetEnv("GLOG_v").empty()) {
     FLAGS_v = mindspore::WARNING;
   }
 
-  // set default log file mode to 0640
+  // Set default log file mode to 0640
   if (mindspore::GetEnv("GLOG_logfile_mode").empty()) {
     FLAGS_logfile_mode = 0640;
   }
+  // Set default log file max size to 50 MB
+  FLAGS_max_log_size = 50;
+  std::string max_log_size = mindspore::GetEnv("GLOG_max_log_size");
+  if (!max_log_size.empty()) {
+    FLAGS_max_log_size = std::stoi(max_log_size);
+  }
   std::string logtostderr = mindspore::GetEnv("GLOG_logtostderr");
-  // default print log to screen
+  // Default print log to screen
   if (logtostderr.empty()) {
     FLAGS_logtostderr = true;
-  } else if (logtostderr == "0" && mindspore::GetEnv("GLOG_log_dir").empty()) {
-    MS_LOG(ERROR) << "`GLOG_log_dir` is empty, it must be set while 'logtostderr' equals to 0.";
-    // Here can not throw exception and use python to catch, because the PYBIND11_MODULE is not yet been initialed.
-    exit(EXIT_FAILURE);
+  } else if (logtostderr == "0") {
+    if (mindspore::GetEnv("GLOG_log_dir").empty()) {
+      MS_LOG(ERROR) << "`GLOG_log_dir` is empty, it must be set while 'logtostderr' equals to 0.";
+      // Here can not throw exception and use python to catch, because the PYBIND11_MODULE is not yet been initialed.
+      exit(EXIT_FAILURE);
+    } else {
+      // Set log dir from GLOG_log_dir with RANK_ID or OMPI_COMM_WORLD_RANK.
+      std::string rank_id = mindspore::GetEnv("RANK_ID");
+      std::string gpu_rank_id = mindspore::GetEnv("OMPI_COMM_WORLD_RANK");
+      std::string rank = "0";
+      if ((!rank_id.empty() && gpu_rank_id.empty()) || (!rank_id.empty() && !gpu_rank_id.empty())) {
+        rank = rank_id;
+      } else if (rank_id.empty() && !gpu_rank_id.empty()) {
+        rank = gpu_rank_id;
+      }
+      FLAGS_log_dir = mindspore::GetEnv("GLOG_log_dir") + "/rank_" + rank + "/logs";
+    }
   }
 
-  // default GLOG_stderrthreshold level to WARNING
+  // Default GLOG_stderrthreshold level to WARNING
   auto threshold = mindspore::GetEnv("GLOG_stderrthreshold");
   FLAGS_stderrthreshold = mindspore::GetThresholdLevel(threshold);
 
diff --git a/mindspore/core/utils/log_adapter.h b/mindspore/core/utils/log_adapter.h
index 60766cc6d31..3214e9648ac 100644
--- a/mindspore/core/utils/log_adapter.h
+++ b/mindspore/core/utils/log_adapter.h
@@ -25,6 +25,7 @@
 #include <map>
 #include <thread>
 #include <functional>
+#include "utils/visible.h"
 #include "utils/overload.h"
 #include "./securec.h"
 #ifdef USE_GLOG
@@ -44,7 +45,7 @@ static constexpr size_t GetRelPathPos() noexcept {
 }
 
 namespace mindspore {
-extern std::map<void **, std::thread *> acl_handle_map __attribute__((visibility("default")));
+MS_CORE_API extern std::map<void **, std::thread *> acl_handle_map;
 #define FILE_NAME                                                                             \
   (sizeof(__FILE__) > GetRelPathPos() ? static_cast<const char *>(__FILE__) + GetRelPathPos() \
                                       : static_cast<const char *>(__FILE__))
@@ -146,25 +147,13 @@ enum SubModuleId : int {
 #define SUBMODULE_ID mindspore::SubModuleId::SM_ME
 #endif
 
-#if defined(_WIN32) || defined(_WIN64)
-extern const std::string GetSubModuleName(SubModuleId module_id) __attribute__((dllexport));
-#else
-extern const std::string GetSubModuleName(SubModuleId module_id) __attribute__((visibility("default")));
-#endif
+MS_EXPORT const std::string GetSubModuleName(SubModuleId module_id);
 
 const char *EnumStrForMsLogLevel(MsLogLevel level);
 
-#if defined(_WIN32) || defined(_WIN64)
-extern std::string GetTimeString() __attribute__((dllexport));
-#else
-extern std::string GetTimeString() __attribute__((visibility("default")));
-#endif
+MS_EXPORT std::string GetTimeString();
 
-#if defined(_WIN32) || defined(_WIN64)
-extern int g_ms_submodule_log_levels[] __attribute__((dllexport));
-#else
-extern int g_ms_submodule_log_levels[] __attribute__((visibility("default")));
-#endif
+MS_EXPORT extern int g_ms_submodule_log_levels[];
 
 class LogWriter {
  public:
@@ -176,8 +165,8 @@ class LogWriter {
       : location_(location), log_level_(log_level), submodule_(submodule), exception_type_(excp_type) {}
   ~LogWriter() = default;
 
-  void operator<(const LogStream &stream) const noexcept __attribute__((visibility("default")));
-  void operator^(const LogStream &stream) const __attribute__((noreturn, visibility("default")));
+  MS_CORE_API void operator<(const LogStream &stream) const noexcept;
+  MS_CORE_API void operator^(const LogStream &stream) const __attribute__((noreturn));
 
   static void set_exception_handler(ExceptionHandler exception_handler) { exception_handler_ = exception_handler; }
   static void set_trace_provider(TraceProvider trace_provider) { trace_provider_ = trace_provider; }
diff --git a/mindspore/core/utils/ms_context.cc b/mindspore/core/utils/ms_context.cc
index ae4d047c4f6..e598dd984ce 100644
--- a/mindspore/core/utils/ms_context.cc
+++ b/mindspore/core/utils/ms_context.cc
@@ -35,6 +35,7 @@ MsContext::MsContext(const std::string &policy, const std::string &target) {
   set_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG, false);
   set_param<std::string>(MS_CTX_SAVE_GRAPHS_PATH, ".");
   set_param<std::string>(MS_CTX_PYTHON_EXE_PATH, "python");
+  set_param<std::string>(MS_CTX_KERNEL_BUILD_SERVER_DIR, "");
   set_param<bool>(MS_CTX_ENABLE_DUMP, false);
   set_param<std::string>(MS_CTX_SAVE_DUMP_PATH, ".");
   set_param<std::string>(MS_CTX_ENV_CONFIG_PATH, "");
@@ -88,6 +89,8 @@ MsContext::MsContext(const std::string &policy, const std::string &target) {
   set_param<bool>(MS_CTX_LOAD_COMPILE_CACHE, false);
   set_param<bool>(MS_CTX_ENABLE_MINDRT, false);
   set_param<bool>(MS_CTX_ALREADY_SET_ENABLE_MINDRT, false);
+  set_param<bool>(MS_CTX_ENABLE_PYNATIVE_SYNCHRONIZE, false);
+  set_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE, true);
 
   backend_policy_ = policy_map_[policy];
 }
diff --git a/mindspore/core/utils/ms_context.h b/mindspore/core/utils/ms_context.h
index 1eb49942cb3..a80a346902f 100644
--- a/mindspore/core/utils/ms_context.h
+++ b/mindspore/core/utils/ms_context.h
@@ -90,6 +90,8 @@ enum MsCtxParam : unsigned {
   MS_CTX_LOAD_COMPILE_CACHE,
   MS_CTX_ENABLE_MINDRT,
   MS_CTX_ALREADY_SET_ENABLE_MINDRT,
+  MS_CTX_ENABLE_PYNATIVE_SYNCHRONIZE,
+  MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE,
   MS_CTX_TYPE_BOOL_END,
 
   // parameter of type int
@@ -120,6 +122,7 @@ enum MsCtxParam : unsigned {
   MS_CTX_SAVE_GRAPHS_PATH,
   MS_CTX_VARIABLE_MEMORY_MAX_SIZE,
   MS_CTX_PYTHON_EXE_PATH,
+  MS_CTX_KERNEL_BUILD_SERVER_DIR,
   MS_CTX_ENV_CONFIG_PATH,
   MS_CTX_TUNE_MODE,
   MS_CTX_GRAPH_KERNEL_FLAGS,
diff --git a/mindspore/core/utils/tensor_construct_utils.cc b/mindspore/core/utils/tensor_construct_utils.cc
index 601e7a1223f..a852fd6f51e 100644
--- a/mindspore/core/utils/tensor_construct_utils.cc
+++ b/mindspore/core/utils/tensor_construct_utils.cc
@@ -14,8 +14,10 @@
  * limitations under the License.
  */
 #include "utils/tensor_construct_utils.h"
-#include <vector>
 #include <memory>
+#include <vector>
+#include <map>
+#include <functional>
 namespace mindspore {
 tensor::TensorPtr TensorConstructUtils::CreateZerosTensor(const TypePtr &type_ptr, const std::vector<int64_t> &shape) {
   MS_EXCEPTION_IF_NULL(type_ptr);
@@ -34,13 +36,41 @@ tensor::TensorPtr TensorConstructUtils::CreateOnesTensor(const TypePtr &type_ptr
   MS_EXCEPTION_IF_NULL(type_ptr);
   auto type_id = ExtractTypeId(type_ptr);
   tensor::TensorPtr tensor = std::make_shared<tensor::Tensor>(type_id, shape);
-  size_t mem_size = IntToSize(tensor->ElementsNum());
-  if (tensor->data_type() == kNumberTypeFloat32) {
-    SetTensorData<float>(tensor->data_c(), 1.0, mem_size);
-  } else if (tensor->data_type() == kNumberTypeInt) {
-    SetTensorData<int>(tensor->data_c(), 1, mem_size);
+  const size_t &mem_size = IntToSize(tensor->ElementsNum());
+  auto tensor_data = tensor->data_c();
+  std::map<TypeId, std::function<void()>> type_dict{
+    {kNumberTypeBool, [&tensor_data, mem_size]() { SetTensorData<bool>(tensor_data, true, mem_size); }},
+    {kNumberTypeInt8,
+     [&tensor_data, mem_size]() { SetTensorData<int8_t>(tensor_data, static_cast<int8_t>(1), mem_size); }},
+    {kNumberTypeInt16,
+     [&tensor_data, mem_size]() { SetTensorData<int16_t>(tensor_data, static_cast<int16_t>(1), mem_size); }},
+    {kNumberTypeInt32,
+     [&tensor_data, mem_size]() { SetTensorData<int32_t>(tensor_data, static_cast<int32_t>(1), mem_size); }},
+    {kNumberTypeInt64,
+     [&tensor_data, mem_size]() { SetTensorData<int64_t>(tensor_data, static_cast<int64_t>(1), mem_size); }},
+    {kNumberTypeUInt8,
+     [&tensor_data, mem_size]() { SetTensorData<uint8_t>(tensor_data, static_cast<uint8_t>(1), mem_size); }},
+    {kNumberTypeUInt16,
+     [&tensor_data, mem_size]() { SetTensorData<uint16_t>(tensor_data, static_cast<uint16_t>(1), mem_size); }},
+    {kNumberTypeUInt32,
+     [&tensor_data, mem_size]() { SetTensorData<uint32_t>(tensor_data, static_cast<uint32_t>(1), mem_size); }},
+    {kNumberTypeUInt64,
+     [&tensor_data, mem_size]() { SetTensorData<uint64_t>(tensor_data, static_cast<uint64_t>(1), mem_size); }},
+    {kNumberTypeFloat16,
+     [&tensor_data, mem_size]() { SetTensorData<float16>(tensor_data, static_cast<float16>(1.0), mem_size); }},
+    {kNumberTypeFloat32,
+     [&tensor_data, mem_size]() { SetTensorData<float>(tensor_data, static_cast<float>(1.0), mem_size); }},
+    {kNumberTypeFloat64,
+     [&tensor_data, mem_size]() { SetTensorData<double>(tensor_data, static_cast<double>(1.0), mem_size); }},
+  };
+
+  const auto &tensor_type = tensor->data_type();
+  if (type_dict.count(tensor_type)) {
+    type_dict[tensor_type]();
+    return tensor;
+  } else {
+    MS_LOG(EXCEPTION) << "unsupported data type: " << tensor_type;
   }
-  return tensor;
 }
 
 tensor::TensorPtr TensorConstructUtils::CreateTensor(const TypePtr &type_ptr, const std::vector<int64_t> &shape,
diff --git a/mindspore/core/utils/visible.h b/mindspore/core/utils/visible.h
index afa9b4a46b6..f12a6a14369 100644
--- a/mindspore/core/utils/visible.h
+++ b/mindspore/core/utils/visible.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2019 Huawei Technologies Co., Ltd
+ * Copyright 2019-2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,27 +17,19 @@
 #ifndef MINDSPORE_CORE_UTILS_VISIBLE_H_
 #define MINDSPORE_CORE_UTILS_VISIBLE_H_
 
-namespace mindspore {
-// refer to https://gcc.gnu.org/wiki/Visibility
-#if defined _WIN32 || defined __CYGWIN__
+#if (defined(_WIN32) || defined(__WIN32__) || defined(WIN32) || defined(__CYGWIN__))
 #ifdef BUILDING_DLL
-#ifdef __GNUC__
-#define MS_EXPORT __attribute__((dllexport))
+#define MS_CORE_API __declspec(dllexport)
+#define MS_EXPORT __declspec(dllexport)
 #else
-#define MS_EXPORT __declspec(dllexport)  // Note: actually gcc seems to also supports this syntax.
-#endif
-#else
-#ifdef __GNUC__
-#define MS_EXPORT __attribute__((dllimport))
-#else
-#define MS_EXPORT __declspec(dllimport)  // Note: actually gcc seems to also supports this syntax.
-#endif
+#define MS_CORE_API __declspec(dllimport)
+#define MS_EXPORT __declspec(dllimport)
 #endif
 #define MS_LOCAL
 #else
+#define MS_CORE_API __attribute__((visibility("default")))
 #define MS_EXPORT __attribute__((visibility("default")))
 #define MS_LOCAL __attribute__((visibility("hidden")))
 #endif
-}  // namespace mindspore
 
 #endif  // MINDSPORE_CORE_UTILS_VISIBLE_H_
diff --git a/mindspore/dataset/audio/__init__.py b/mindspore/dataset/audio/__init__.py
index 2d695cb34a3..067cd2414d4 100644
--- a/mindspore/dataset/audio/__init__.py
+++ b/mindspore/dataset/audio/__init__.py
@@ -11,7 +11,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# ==============================================================================
 """
 This module is to support audio augmentations.
 """
 from . import transforms
+from . import utils
diff --git a/mindspore/dataset/audio/transforms.py b/mindspore/dataset/audio/transforms.py
index aff46d944f4..0bbc0191b08 100644
--- a/mindspore/dataset/audio/transforms.py
+++ b/mindspore/dataset/audio/transforms.py
@@ -11,18 +11,21 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# ==============================================================================
 """
-The module audio.transforms is inherited from _c_dataengine.
-and is implemented based on  C++. It's a high performance module to
-process audio. Users can apply suitable augmentations on audio data
-to improve their training models.
+The module audio.transforms is inherited from _c_dataengine and is
+implemented based on C++. It's a high performance module to process
+audio. Users can apply suitable augmentations on audio data to improve
+their training models.
 """
-import mindspore._c_dataengine as cde
+
 import numpy as np
+
+import mindspore._c_dataengine as cde
 from ..transforms.c_transforms import TensorOperation
 from .utils import ScaleType
 from .validators import check_allpass_biquad, check_amplitude_to_db, check_band_biquad, check_bandpass_biquad, \
-    check_bandreject_biquad, check_bass_biquad, check_time_stretch
+    check_bandreject_biquad, check_bass_biquad, check_complex_norm, check_masking, check_time_stretch
 
 
 class AudioTensorOperation(TensorOperation):
@@ -33,38 +36,31 @@ class AudioTensorOperation(TensorOperation):
     def __call__(self, *input_tensor_list):
         for tensor in input_tensor_list:
             if not isinstance(tensor, (np.ndarray,)):
-                raise TypeError(
-                    "Input should be NumPy audio, got {}.".format(type(tensor)))
+                raise TypeError("Input should be NumPy audio, got {}.".format(type(tensor)))
         return super().__call__(*input_tensor_list)
 
     def parse(self):
-        raise NotImplementedError(
-            "AudioTensorOperation has to implement parse() method.")
+        raise NotImplementedError("AudioTensorOperation has to implement parse() method.")
 
 
 class AllpassBiquad(AudioTensorOperation):
     """
-    Design two-pole all-pass filter for audio waveform of dimension of `(..., time)`
+    Design two-pole all-pass filter for audio waveform of dimension of (..., time).
 
-        Args:
-            sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz),
-                the value must be greater than 0 .
-            central_freq (float): central frequency (in Hz),
-                the value must be greater than 0 .
-            Q(float, optional): Quality factor,https://en.wikipedia.org/wiki/Q_factor,
-                Range: (0, 1] (Default=0.707).
+    Args:
+        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz), the value must be greater than 0.
+        central_freq (float): central frequency (in Hz), the value must be greater than 0.
+        Q(float, optional): Quality factor, https://en.wikipedia.org/wiki/Q_factor, range: (0, 1] (default=0.707).
 
-        Examples:
-            >>> import mindspore.dataset.audio.transforms as audio
-            >>> import numpy as np
-
-            >>> waveform = np.array([[2.716064453125e-03, 6.34765625e-03],[9.246826171875e-03, 1.0894775390625e-02]])
-            >>> allpasspass_biquad_op = audio.AllpassBiquad(44100, 200.0)
-            >>> waveform_filtered = allpass_biquad_op(waveform)
-
-        References:
-            https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
+    Examples:
+        >>> import numpy as np
+        >>>
+        >>> waveform = np.array([[2.716064453125e-03, 6.34765625e-03], [9.246826171875e-03, 1.0894775390625e-02]])
+        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
+        >>> transforms = [audio.AllpassBiquad(44100, 200.0)]
+        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
     """
+
     @check_allpass_biquad
     def __init__(self, sample_rate, central_freq, Q=0.707):
         self.sample_rate = sample_rate
@@ -84,23 +80,22 @@ class AmplitudeToDB(AudioTensorOperation):
     Converts the input tensor from amplitude/power scale to decibel scale.
 
     Args:
-        stype (ScaleType, optional): Scale of the input tensor. (Default="ScaleType.POWER").
-        It can be any of [ScaleType.MAGNITUDE, ScaleType.POWER].
+        stype (ScaleType, optional): Scale of the input tensor (default=ScaleType.POWER).
+            It can be one of ScaleType.MAGNITUDE or ScaleType.POWER.
         ref_value (float, optional): Param for generate db_multiplier.
         amin (float, optional): Lower bound to clamp the input waveform.
-        top_db (float, optional): Minimum cut-off decibels. The range of values is non-negative. Commonly set at 80.
-            (Default=80.0)
+        top_db (float, optional): Minimum cut-off decibels. The range of values is non-negative.
+            Commonly set at 80 (default=80.0).
     Examples:
-        >>> channel = 1
-        >>> n_fft = 400
-        >>> n_frame = 30
-        >>> specrogram = np.random.random([channel, n_fft//2+1, n_frame])
-        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=specrogram, column_names=["audio"])
+        >>> import numpy as np
+        >>>
+        >>> waveform = np.random.random([1, 400//2+1, 30])
+        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
         >>> transforms = [audio.AmplitudeToDB(stype=ScaleType.POWER)]
         >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
     """
 
-    @ check_amplitude_to_db
+    @check_amplitude_to_db
     def __init__(self, stype=ScaleType.POWER, ref_value=1.0, amin=1e-10, top_db=80.0):
         self.stype = stype
         self.ref_value = ref_value
@@ -115,15 +110,14 @@ class Angle(AudioTensorOperation):
     """
     Calculate the angle of the complex number sequence of shape (..., 2).
     The first dimension represents the real part while the second represents the imaginary.
-    Args:
 
     Examples:
-        >>> import mindspore.dataset.audio.transforms as audio
         >>> import numpy as np
-
-        >>> input_complex = np.array([[1.43, 5.434], [23.54, 89.38]])
-        >>> angle_op = audio.Angle()
-        >>> angles = angle_op(input_complex)
+        >>>
+        >>> waveform = np.array([[1.43, 5.434], [23.54, 89.38]])
+        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
+        >>> transforms = [audio.Angle()]
+        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
     """
 
     def parse(self):
@@ -132,24 +126,24 @@ class Angle(AudioTensorOperation):
 
 class BandBiquad(AudioTensorOperation):
     """
-    Design two-pole band filter for audio waveform of dimension of `(..., time)`
+    Design two-pole band filter for audio waveform of dimension of (..., time).
 
     Args:
-        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz), the value can't be zero.
-        central_freq (float): central frequency (in Hz),
-        Q(float, optional): Quality factor, https://en.wikipedia.org/wiki/Q_factor, Range: (0, 1] (Default=0.707).
-        noise (bool, optional) : If ``True``, uses the alternate mode for un-pitched audio (e.g. percussion).
-            If ``False``, uses mode oriented to pitched audio, i.e. voice, singing,
-            or instrumental music (Default: ``False``).
+        sample_rate (int): Sampling rate of the waveform, e.g. 44100 (Hz), the value can't be zero.
+        central_freq (float): Central frequency (in Hz).
+        Q(float, optional): Quality factor, https://en.wikipedia.org/wiki/Q_factor, range: (0, 1] (default=0.707).
+        noise (bool, optional) : If True, uses the alternate mode for un-pitched audio (e.g. percussion).
+            If False, uses mode oriented to pitched audio, i.e. voice, singing, or instrumental music (default=False).
 
     Examples:
-        >>> import mindspore.dataset.audio.transforms as audio
         >>> import numpy as np
-
-        >>> waveform = np.array([[2.716064453125e-03, 6.34765625e-03],[9.246826171875e-03, 1.0894775390625e-02]])
-        >>> band_biquad_op = audio.BandBiquad(44100, 200.0)
-        >>> waveform_filtered = band_biquad_op(waveform)
+        >>>
+        >>> waveform = np.array([[2.716064453125e-03, 6.34765625e-03], [9.246826171875e-03, 1.0894775390625e-02]])
+        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
+        >>> transforms = [audio.BandBiquad(44100, 200.0)]
+        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
     """
+
     @check_band_biquad
     def __init__(self, sample_rate, central_freq, Q=0.707, noise=False):
         self.sample_rate = sample_rate
@@ -161,25 +155,26 @@ class BandBiquad(AudioTensorOperation):
         return cde.BandBiquadOperation(self.sample_rate, self.central_freq, self.Q, self.noise)
 
 
-class BandpassBiquad(TensorOperation):
+class BandpassBiquad(AudioTensorOperation):
     """
-    Design two-pole band-pass filter.  Similar to SoX implementation.
+    Design two-pole band-pass filter. Similar to SoX implementation.
 
     Args:
-        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
-        central_freq (float): central frequency (in Hz)
-        Q (float, optional): https://en.wikipedia.org/wiki/Q_factor Range: (0,1] (Default=0.707).
-        const_skirt_gain (bool, optional) : If ``True``, uses a constant skirt gain (peak gain = Q).
-            If ``False``, uses a constant 0dB peak gain. (Default: ``False``)
+        sample_rate (int): Sampling rate of the waveform, e.g. 44100 (Hz).
+        central_freq (float): Central frequency (in Hz).
+        Q (float, optional): Quality factor, https://en.wikipedia.org/wiki/Q_factor, range: (0,1] (default=0.707).
+        const_skirt_gain (bool, optional) : If True, uses a constant skirt gain (peak gain = Q).
+            If False, uses a constant 0dB peak gain (default=False).
 
     Examples:
-        >>> import mindspore.dataset.audio.transforms as audio
         >>> import numpy as np
-
-        >>> waveform = np.array([[2.716064453125e-03, 6.34765625e-03],[9.246826171875e-03, 1.0894775390625e-02]])
-        >>> bandpass_biquad_op = audio.BandpassBiquad(44100, 200.0)
-        >>> waveform_filtered = bandpass_biquad_op(waveform)
+        >>>
+        >>> waveform = np.array([[2.716064453125e-03, 6.34765625e-03], [9.246826171875e-03, 1.0894775390625e-02]])
+        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
+        >>> transforms = [audio.BandpassBiquad(44100, 200.0)]
+        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
     """
+
     @check_bandpass_biquad
     def __init__(self, sample_rate, central_freq, Q=0.707, const_skirt_gain=False):
         self.sample_rate = sample_rate
@@ -193,23 +188,20 @@ class BandpassBiquad(TensorOperation):
 
 class BandrejectBiquad(AudioTensorOperation):
     """
-    Design two-pole band filter for audio waveform of dimension of `(..., time)`
+    Design two-pole band filter for audio waveform of dimension of (..., time).
 
     Args:
-        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz),
-            the value must be greater than 0 .
-        central_freq (float): central frequency (in Hz),
-            the value must be greater than 0 .
-        Q(float, optional): Quality factor,https://en.wikipedia.org/wiki/Q_factor,
-            Range: (0, 1] (Default=0.707).
+        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz), the value must be greater than 0.
+        central_freq (float): central frequency (in Hz), the value must be greater than 0.
+        Q(float, optional): Quality factor, https://en.wikipedia.org/wiki/Q_factor, range: (0, 1] (default=0.707).
 
     Examples:
-        >>> import mindspore.dataset.audio.transforms as audio
         >>> import numpy as np
-
+        >>>
         >>> waveform = np.array([[2.716064453125e-03, 6.34765625e-03],[9.246826171875e-03, 1.0894775390625e-02]])
-        >>> band_biquad_op = audio.BandBiquad(44100, 200.0)
-        >>> waveform_filtered = band_biquad_op(waveform)
+        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
+        >>> transforms = [audio.BandrejectBiquad(44100, 200.0)]
+        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
     """
 
     @check_bandreject_biquad
@@ -224,22 +216,23 @@ class BandrejectBiquad(AudioTensorOperation):
 
 class BassBiquad(AudioTensorOperation):
     """
-    Design a bass tone-control effect for audio waveform of dimension of `(..., time)`
+    Design a bass tone-control effect for audio waveform of dimension of (..., time).
 
     Args:
-        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
-        gain (float): desired gain at the boost (or attenuation) in dB.
-        central_freq (float): central frequency (in Hz)(Default=100.0).
-        Q(float, optional): Quality factor, https://en.wikipedia.org/wiki/Q_factor, Range: (0, 1] (Default=0.707).
+        sample_rate (int): Sampling rate of the waveform, e.g. 44100 (Hz).
+        gain (float): Desired gain at the boost (or attenuation) in dB.
+        central_freq (float): Central frequency (in Hz) (default=100.0).
+        Q(float, optional): Quality factor, https://en.wikipedia.org/wiki/Q_factor, range: (0, 1] (default=0.707).
 
     Examples:
-        >>> import mindspore.dataset.audio.transforms as audio
         >>> import numpy as np
-
-        >>> waveform = np.array([[2.716064453125e-03, 6.34765625e-03],[9.246826171875e-03, 1.0894775390625e-02]])
-        >>> bass_biquad_op = audio.BassBiquad(44100, 100.0)
-        >>> waveform_filtered = bass_biquad_op(waveform)
+        >>>
+        >>> waveform = np.array([[2.716064453125e-03, 6.34765625e-03], [9.246826171875e-03, 1.0894775390625e-02]])
+        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
+        >>> transforms = [audio.BassBiquad(44100, 100.0)]
+        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
     """
+
     @check_bass_biquad
     def __init__(self, sample_rate, gain, central_freq=100.0, Q=0.707):
         self.sample_rate = sample_rate
@@ -251,6 +244,91 @@ class BassBiquad(AudioTensorOperation):
         return cde.BassBiquadOperation(self.sample_rate, self.gain, self.central_freq, self.Q)
 
 
+class ComplexNorm(AudioTensorOperation):
+    """
+    Compute the norm of complex tensor input.
+
+    Args:
+        power (float, optional): Power of the norm, which must be non-negative (default=1.0).
+
+    Examples:
+        >>> import numpy as np
+        >>>
+        >>> waveform = np.random.random([2, 4, 2])
+        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
+        >>> transforms = [audio.ComplexNorm()]
+        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
+    """
+    @check_complex_norm
+    def __init__(self, power=1.0):
+        self.power = power
+
+    def parse(self):
+        return cde.ComplexNormOperation(self.power)
+
+
+class FrequencyMasking(AudioTensorOperation):
+    """
+    Apply masking to a spectrogram in the frequency domain.
+
+    Args:
+        iid_masks (bool, optional): Whether to apply different masks to each example (default=false).
+        frequency_mask_param (int): Maximum possible length of the mask (default=0).
+            Indices uniformly sampled from [0, frequency_mask_param].
+        mask_start (int): Mask start when iid_masks=true (default=0).
+        mask_value (double): Mask value (default=0.0).
+
+    Examples:
+        >>> import numpy as np
+        >>>
+        >>> waveform = np.random.random([1, 3, 2])
+        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
+        >>> transforms = [audio.FrequencyMasking(frequency_mask_param=1)]
+        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
+    """
+    @check_masking
+    def __init__(self, iid_masks=False, frequency_mask_param=0, mask_start=0, mask_value=0.0):
+        self.iid_masks = iid_masks
+        self.frequency_mask_param = frequency_mask_param
+        self.mask_start = mask_start
+        self.mask_value = mask_value
+
+    def parse(self):
+        return cde.FrequencyMaskingOperation(self.iid_masks, self.frequency_mask_param, self.mask_start,
+                                             self.mask_value)
+
+
+class TimeMasking(AudioTensorOperation):
+    """
+    Apply masking to a spectrogram in the time domain.
+
+    Args:
+        iid_masks (bool, optional): Whether to apply different masks to each example (default=false).
+        time_mask_param (int): Maximum possible length of the mask (default=0).
+            Indices uniformly sampled from [0, time_mask_param].
+        mask_start (int): Mask start takes effect when iid_masks=true (default=0).
+        mask_value (double): Mask value (default=0.0).
+
+    Examples:
+        >>> import numpy as np
+        >>>
+        >>> waveform = np.random.random([1, 3, 2])
+        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
+        >>> transforms = [audio.TimeMasking(time_mask_param=1)]
+        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
+    """
+
+    @check_masking
+    def __init__(self, iid_masks=False, time_mask_param=0, mask_start=0, mask_value=0.0):
+        self.iid_masks = iid_masks
+        self.time_mask_param = time_mask_param
+        self.mask_start = mask_start
+        self.mask_value = mask_value
+
+    def parse(self):
+        return cde.TimeMaskingOperation(self.iid_masks, self.time_mask_param, self.mask_start, self.mask_value)
+
+
 class TimeStretch(AudioTensorOperation):
     """
     Stretch STFT in time at a given rate, without changing the pitch.
diff --git a/mindspore/dataset/audio/utils.py b/mindspore/dataset/audio/utils.py
index 1bf00f2da0d..3b1f42579eb 100644
--- a/mindspore/dataset/audio/utils.py
+++ b/mindspore/dataset/audio/utils.py
@@ -11,9 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# ==============================================================================
 """
 enum for audio ops
 """
+
 from enum import Enum
 
 
diff --git a/mindspore/dataset/audio/validators.py b/mindspore/dataset/audio/validators.py
index ad10b842b68..ccb55943463 100644
--- a/mindspore/dataset/audio/validators.py
+++ b/mindspore/dataset/audio/validators.py
@@ -15,14 +15,17 @@
 """
 Validators for TensorOps.
 """
+
 from functools import wraps
-from mindspore.dataset.core.validator_helpers import check_not_zero, check_int32, check_float32, check_value, \
-    check_value_normalize_std, check_value_ratio, FLOAT_MAX_INTEGER, INT64_MAX, parse_user_args, type_check
+
+from mindspore.dataset.core.validator_helpers import check_float32, check_int32_not_zero, \
+    check_non_negative_float32, check_non_negative_float64, check_pos_float32, check_pos_int64, check_value, \
+    parse_user_args, type_check
 from .utils import ScaleType
 
 
 def check_amplitude_to_db(method):
-    """Wrapper method to check the parameters of amplitude_to_db."""
+    """Wrapper method to check the parameters of AmplitudeToDB."""
 
     @wraps(method)
     def new_method(self, *args, **kwargs):
@@ -34,30 +37,30 @@ def check_amplitude_to_db(method):
         # type check ref_value
         type_check(ref_value, (int, float), "ref_value")
         # value check ref_value
-        if not ref_value is None:
-            check_value_ratio(ref_value, (0, FLOAT_MAX_INTEGER), "ref_value")
+        if ref_value is not None:
+            check_pos_float32(ref_value, "ref_value")
 
         # type check amin
         type_check(amin, (int, float), "amin")
         # value check amin
-        if not amin is None:
-            check_value_ratio(amin, (0, FLOAT_MAX_INTEGER), "amin")
+        if amin is not None:
+            check_pos_float32(amin, "amin")
 
         # type check top_db
         type_check(top_db, (int, float), "top_db")
         # value check top_db
-        if not top_db is None:
-            check_value_ratio(top_db, (0, FLOAT_MAX_INTEGER), "top_db")
+        if top_db is not None:
+            check_pos_float32(top_db, "top_db")
 
         return method(self, *args, **kwargs)
+
     return new_method
 
 
 def check_biquad_sample_rate(sample_rate):
     """Wrapper method to check the parameters of sample_rate."""
     type_check(sample_rate, (int,), "sample_rate")
-    check_int32(sample_rate, "sample_rate")
-    check_not_zero(sample_rate, "sample_rate")
+    check_int32_not_zero(sample_rate, "sample_rate")
 
 
 def check_biquad_central_freq(central_freq):
@@ -69,7 +72,7 @@ def check_biquad_central_freq(central_freq):
 def check_biquad_Q(Q):
     """Wrapper method to check the parameters of Q."""
     type_check(Q, (float, int), "Q")
-    check_value_normalize_std(Q, [0, 1], "Q")
+    check_value(Q, [0, 1], "Q", True)
 
 
 def check_biquad_noise(noise):
@@ -105,7 +108,7 @@ def check_band_biquad(method):
 
 
 def check_allpass_biquad(method):
-    """Wrapper method to check the parameters of CutMixBatch."""
+    """Wrapper method to check the parameters of AllpassBiquad."""
 
     @wraps(method)
     def new_method(self, *args, **kwargs):
@@ -122,7 +125,7 @@ def check_allpass_biquad(method):
 def check_bandpass_biquad(method):
     """Wrapper method to check the parameters of BandpassBiquad."""
 
-    @ wraps(method)
+    @wraps(method)
     def new_method(self, *args, **kwargs):
         [sample_rate, central_freq, Q, const_skirt_gain], _ = parse_user_args(
             method, *args, **kwargs)
@@ -151,7 +154,7 @@ def check_bandreject_biquad(method):
 
 
 def check_bass_biquad(method):
-    """Wrapper method to check the parameters of CutMixBatch."""
+    """Wrapper method to check the parameters of BassBiquad."""
 
     @wraps(method)
     def new_method(self, *args, **kwargs):
@@ -167,22 +170,52 @@ def check_bass_biquad(method):
 
 
 def check_time_stretch(method):
-    """Wrapper method to check the parameters of time_stretch."""
+    """Wrapper method to check the parameters of TimeStretch."""
+
     @wraps(method)
     def new_method(self, *args, **kwargs):
         [hop_length, n_freq, fixed_rate], _ = parse_user_args(method, *args, **kwargs)
-        # type check
-        type_check(hop_length, (int, type(None)), "hop_length")
-        type_check(n_freq, (int,), "n_freq")
-        type_check(fixed_rate, (int, float, type(None)), "fixed_rate")
 
-        # value check
         if hop_length is not None:
-            check_value(hop_length, (1, INT64_MAX), "hop_length")
-        check_value(n_freq, (1, INT64_MAX), "n_freq")
-        if fixed_rate is not None:
-            check_value_ratio(fixed_rate, (0, FLOAT_MAX_INTEGER), "fixed_rate")
+            type_check(hop_length, (int,), "hop_length")
+            check_pos_int64(hop_length, "hop_length")
 
+        type_check(n_freq, (int,), "n_freq")
+        check_pos_int64(n_freq, "n_freq")
+
+        if fixed_rate is not None:
+            type_check(fixed_rate, (int, float), "fixed_rate")
+            check_pos_float32(fixed_rate, "fixed_rate")
+        return method(self, *args, **kwargs)
+
+    return new_method
+
+
+def check_masking(method):
+    """Wrapper method to check the parameters of time_masking and FrequencyMasking"""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        [iid_masks, mask_param, mask_start, mask_value], _ = parse_user_args(
+            method, *args, **kwargs)
+        type_check(iid_masks, (bool,), "iid_masks")
+        type_check(mask_param, (int,), "mask_param")
+        check_non_negative_float32(mask_param, "mask_param")
+        type_check(mask_start, (int,), "mask_start")
+        check_non_negative_float32(mask_start, "mask_start")
+        type_check(mask_value, (int, float), "mask_value")
+        check_non_negative_float64(mask_value, "mask_value")
+        return method(self, *args, **kwargs)
+
+    return new_method
+
+
+def check_complex_norm(method):
+    """Wrapper method to check the parameters of ComplexNorm."""
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        [power], _ = parse_user_args(method, *args, **kwargs)
+        check_non_negative_float32(power, "power")
         return method(self, *args, **kwargs)
 
     return new_method
diff --git a/mindspore/dataset/core/validator_helpers.py b/mindspore/dataset/core/validator_helpers.py
index 7ca1696e857..f30454314d0 100644
--- a/mindspore/dataset/core/validator_helpers.py
+++ b/mindspore/dataset/core/validator_helpers.py
@@ -92,20 +92,38 @@ def pad_arg_name(arg_name):
     return arg_name
 
 
-def check_value(value, valid_range, arg_name=""):
+def check_value(value, valid_range, arg_name="", left_open_interval=False, right_open_interval=False):
     """
-    Validates a value is within a desired range [inclusive, inclusive].
+    Validates a value is within a desired range with left and right interval open or close.
 
-    :param value: the value to be validated
-    :param valid_range: the desired range
-    :param arg_name: arg_name: arg_name: name of the variable to be validated
+    :param value: the value to be validated.
+    :param valid_range: the desired range.
+    :param arg_name: name of the variable to be validated.
+    :param left_open_interval: True for left interval open and False for close.
+    :param right_open_interval: True for right interval open and False for close.
     :return: Exception: when the validation fails, nothing otherwise.
     """
     arg_name = pad_arg_name(arg_name)
-    if value < valid_range[0] or value > valid_range[1]:
-        raise ValueError(
-            "Input {0}is not within the required interval of [{1}, {2}].".format(arg_name, valid_range[0],
-                                                                                 valid_range[1]))
+    if not left_open_interval and not right_open_interval:
+        if value < valid_range[0] or value > valid_range[1]:
+            raise ValueError(
+                "Input {0}is not within the required interval of [{1}, {2}].".format(arg_name, valid_range[0],
+                                                                                     valid_range[1]))
+    elif left_open_interval and not right_open_interval:
+        if value <= valid_range[0] or value > valid_range[1]:
+            raise ValueError(
+                "Input {0}is not within the required interval of ({1}, {2}].".format(arg_name, valid_range[0],
+                                                                                     valid_range[1]))
+    elif not left_open_interval and right_open_interval:
+        if value < valid_range[0] or value >= valid_range[1]:
+            raise ValueError(
+                "Input {0}is not within the required interval of [{1}, {2}).".format(arg_name, valid_range[0],
+                                                                                     valid_range[1]))
+    else:
+        if value <= valid_range[0] or value >= valid_range[1]:
+            raise ValueError(
+                "Input {0}is not within the required interval of ({1}, {2}).".format(arg_name, valid_range[0],
+                                                                                     valid_range[1]))
 
 
 def check_value_cutoff(value, valid_range, arg_name=""):
@@ -117,11 +135,7 @@ def check_value_cutoff(value, valid_range, arg_name=""):
     :param arg_name: arg_name: arg_name: name of the variable to be validated
     :return: Exception: when the validation fails, nothing otherwise.
     """
-    arg_name = pad_arg_name(arg_name)
-    if value < valid_range[0] or value >= valid_range[1]:
-        raise ValueError(
-            "Input {0}is not within the required interval of [{1}, {2}).".format(arg_name, valid_range[0],
-                                                                                 valid_range[1]))
+    check_value(value, valid_range, arg_name, False, True)
 
 
 def check_value_ratio(value, valid_range, arg_name=""):
@@ -133,11 +147,7 @@ def check_value_ratio(value, valid_range, arg_name=""):
     :param arg_name: arg_name: name of the variable to be validated
     :return: Exception: when the validation fails, nothing otherwise.
     """
-    arg_name = pad_arg_name(arg_name)
-    if value <= valid_range[0] or value > valid_range[1]:
-        raise ValueError(
-            "Input {0}is not within the required interval of ({1}, {2}].".format(arg_name, valid_range[0],
-                                                                                 valid_range[1]))
+    check_value(value, valid_range, arg_name, True, False)
 
 
 def check_value_normalize_std(value, valid_range, arg_name=""):
@@ -149,11 +159,7 @@ def check_value_normalize_std(value, valid_range, arg_name=""):
     :param arg_name: arg_name: name of the variable to be validated
     :return: Exception: when the validation fails, nothing otherwise.
     """
-    arg_name = pad_arg_name(arg_name)
-    if value <= valid_range[0] or value > valid_range[1]:
-        raise ValueError(
-            "Input {0}is not within the required interval of ({1}, {2}].".format(arg_name, valid_range[0],
-                                                                                 valid_range[1]))
+    check_value(value, valid_range, arg_name, True, False)
 
 
 def check_range(values, valid_range, arg_name=""):
@@ -185,10 +191,12 @@ def check_positive(value, arg_name=""):
         raise ValueError("Input {0}must be greater than 0.".format(arg_name))
 
 
-def check_not_zero(value, arg_name=""):
+def check_int32_not_zero(value, arg_name=""):
     arg_name = pad_arg_name(arg_name)
-    if value == 0:
-        raise ValueError("Input {0}can not be 0.".format(arg_name))
+    type_check(value, (int,), arg_name)
+    if value < INT32_MIN or value > INT32_MAX or value == 0:
+        raise ValueError(
+            "Input {0}is not within the required interval of [-2147483648, 0) and (0, 2147483647].".format(arg_name))
 
 
 def check_odd(value, arg_name=""):
@@ -211,6 +219,13 @@ def check_2tuple(value, arg_name=""):
 
 
 def check_int32(value, arg_name=""):
+    """
+    Validates the value of a variable is within the range of int32.
+
+    :param value: the value of the variable
+    :param arg_name: name of the variable to be validated
+    :return: Exception: when the validation fails, nothing otherwise.
+    """
     type_check(value, (int,), arg_name)
     check_value(value, [INT32_MIN, INT32_MAX], arg_name)
 
@@ -284,7 +299,7 @@ def check_pos_int64(value, arg_name=""):
     :return: Exception: when the validation fails, nothing otherwise.
     """
     type_check(value, (int,), arg_name)
-    check_value(value, [UINT64_MIN, INT64_MAX])
+    check_value(value, [POS_INT_MIN, INT64_MAX])
 
 
 def check_float32(value, arg_name=""):
@@ -317,7 +332,7 @@ def check_pos_float32(value, arg_name=""):
     :param arg_name: name of the variable to be validated
     :return: Exception: when the validation fails, nothing otherwise.
     """
-    check_value(value, [UINT32_MIN, FLOAT_MAX_INTEGER], arg_name)
+    check_value(value, [UINT32_MIN, FLOAT_MAX_INTEGER], arg_name, True)
 
 
 def check_pos_float64(value, arg_name=""):
@@ -328,7 +343,29 @@ def check_pos_float64(value, arg_name=""):
     :param arg_name: name of the variable to be validated
     :return: Exception: when the validation fails, nothing otherwise.
     """
-    check_value(value, [UINT64_MIN, DOUBLE_MAX_INTEGER], arg_name)
+    check_value(value, [UINT64_MIN, DOUBLE_MAX_INTEGER], arg_name, True)
+
+
+def check_non_negative_float32(value, arg_name=""):
+    """
+    Validates the value of a variable is within the range of non negative float32.
+
+    :param value: the value of the variable
+    :param arg_name: name of the variable to be validated
+    :return: Exception: when the validation fails, nothing otherwise.
+    """
+    check_value(value, [UINT32_MIN, FLOAT_MAX_INTEGER], arg_name)
+
+
+def check_non_negative_float64(value, arg_name=""):
+    """
+    Validates the value of a variable is within the range of non negative float64.
+
+    :param value: the value of the variable
+    :param arg_name: name of the variable to be validated
+    :return: Exception: when the validation fails, nothing otherwise.
+    """
+    check_value(value, [UINT32_MIN, DOUBLE_MAX_INTEGER], arg_name)
 
 
 def check_valid_detype(type_):
@@ -680,4 +717,3 @@ def check_c_tensor_op(param, param_name):
 def replace_none(value, default):
     """ replaces None with a default value."""
     return value if value is not None else default
-    
\ No newline at end of file
diff --git a/mindspore/dataset/engine/__init__.py b/mindspore/dataset/engine/__init__.py
index c445542f630..ac104f232df 100644
--- a/mindspore/dataset/engine/__init__.py
+++ b/mindspore/dataset/engine/__init__.py
@@ -33,7 +33,6 @@ from .serializer_deserializer import compare, deserialize, serialize, show
 
 __all__ = ["CelebADataset", "Cifar100Dataset", "Cifar10Dataset", "CLUEDataset", "CocoDataset", "CSVDataset",
            "GeneratorDataset", "GraphData", "ImageFolderDataset", "ManifestDataset", "MindDataset", "MnistDataset",
-           "CmuArcticDataset",
            "NumpySlicesDataset", "PaddedDataset", "TextFileDataset", "TFRecordDataset", "VOCDataset",
            "DistributedSampler", "PKSampler", "RandomSampler", "SequentialSampler", "SubsetRandomSampler",
            "WeightedRandomSampler", "SubsetSampler",
diff --git a/mindspore/dataset/engine/datasets.py b/mindspore/dataset/engine/datasets.py
index eab49e87cbd..9050e434954 100644
--- a/mindspore/dataset/engine/datasets.py
+++ b/mindspore/dataset/engine/datasets.py
@@ -41,6 +41,8 @@ import weakref
 import platform
 import psutil
 import numpy as np
+from scipy.io import loadmat
+from PIL import Image
 
 import mindspore._c_dataengine as cde
 from mindspore._c_expression import typing
@@ -58,10 +60,11 @@ from .queue import _SharedQueue
 from .validators import check_batch, check_shuffle, check_map, check_filter, check_repeat, check_skip, check_zip, \
     check_rename, check_numpyslicesdataset, check_device_send, check_take, check_project, check_imagefolderdataset, \
     check_mnist_cifar_dataset, check_manifestdataset, check_tfrecorddataset, check_vocdataset, check_cocodataset, \
-    check_celebadataset, check_minddataset,check_cmu_arctic_dataset, check_generatordataset, check_sync_wait, check_zip_dataset, \
+    check_celebadataset, check_minddataset, check_generatordataset, check_sync_wait, check_zip_dataset, \
     check_add_column, check_textfiledataset, check_concat, check_random_dataset, check_split, \
     check_bucket_batch_by_length, check_cluedataset, check_save, check_csvdataset, check_paddeddataset, \
-    check_tuple_iterator, check_dict_iterator, check_schema, check_to_device_send
+    check_tuple_iterator, check_dict_iterator, check_schema, check_to_device_send, check_flickr_dataset, \
+    check_sb_dataset, check_flowers102dataset
 from ..core.config import get_callback_timeout, _init_device_info, get_enable_shared_mem, get_num_parallel_workers, \
     get_prefetch_size
 from ..core.datatypes import mstype_to_detype, mstypelist_to_detypelist
@@ -814,7 +817,7 @@ class Dataset:
             count (int): Number of elements in the dataset to be skipped.
 
         Returns:
-            SkipDataset, dataset skipped.
+            SkipDataset, dataset that containing rows like origin rows subtract skipped rows.
 
         Examples:
             >>> # dataset is an instance of Dataset object.
@@ -1709,8 +1712,11 @@ class Dataset:
                 (isinstance(num_batch, int) and num_batch <= 0):
             # throwing exception, disable all sync_wait in pipeline
             self.disable_sync()
-            raise RuntimeError("Sync_update batch size can only be positive, got : {}.".format(num_batch))
+            raise RuntimeError("Sync_update batch size can only be positive integer, got : {}.".format(num_batch))
         notifiers_dict = self.get_sync_notifiers()
+        if not isinstance(condition_name, str):
+            raise TypeError("Argument condition_name with value {} is not of type str, but got {}."
+                            .format(condition_name, type(condition_name)))
         if condition_name not in notifiers_dict:
             # throwing exception, disable all sync_wait in pipeline
             self.disable_sync()
@@ -2145,11 +2151,15 @@ class BatchDataset(Dataset):
         Per iterator bootstrap callback.
         """
         if self.python_multiprocessing:
+            if self.per_batch_map is None:
+                logger.warning("per_batch_map is None so python_multiprocessing does not work.")
+                return
             arg_q_list = []
             res_q_list = []
 
-            # Register clean zombie subprocesses signal here
-            signal.signal(signal.SIGCHLD, wait_child_processes)
+            if platform.system().lower() != 'windows':
+                # Register clean zombie subprocesses signal here
+                signal.signal(signal.SIGCHLD, wait_child_processes)
 
             # If user didn't specify num_parallel_workers, set it to default
             if self.num_parallel_workers is not None:
@@ -2647,7 +2657,8 @@ class MapDataset(Dataset):
 
             if callable_list:
                 # Register clean zombie subprocesses signal here
-                signal.signal(signal.SIGCHLD, wait_child_processes)
+                if platform.system().lower() != 'windows':
+                    signal.signal(signal.SIGCHLD, wait_child_processes)
 
                 # Construct pool with the callable list
                 # The callable list and _pyfunc_worker_init are used to pass lambda function in to subprocesses
@@ -3577,6 +3588,24 @@ def _check_shm_usage(num_worker, queue_size, max_rowsize, num_queues=1):
             logger.warning("Expected /dev/shm to exist.")
 
 
+def _watch_dog(pids, eof):
+    """
+    This thread is for get hang in SamplerFn.Process
+    """
+    exit_num = 0
+    while not eof.is_set():
+        for pid in pids:
+            if not psutil.pid_exists(pid):
+                exit_num += 1
+        if exit_num == 0:
+            continue
+        else:
+            ## multiprocessing.queue may hang in .get() forever when put() process was killed.
+            ## We have to exit main process otherwise main process will hang.
+            logger.error("The subprocess of GeneratorDataset may exit unexpected or be killed, main process will exit.")
+            os.kill(os.getpid(), signal.SIGTERM)
+
+
 class SamplerFn:
     """
     Multiprocessing or multithread generator function wrapper master process.
@@ -3591,8 +3620,9 @@ class SamplerFn:
         self.pid = []
         # Event for end of epoch
         if multi_process is True:
-            # Register clean zombie subprocesses signal here
-            signal.signal(signal.SIGCHLD, wait_child_processes)
+            if platform.system().lower() != 'windows':
+                # Register clean zombie subprocesses signal here
+                signal.signal(signal.SIGCHLD, wait_child_processes)
 
             try:
                 self.eof = multiprocessing.Event()
@@ -3628,6 +3658,10 @@ class SamplerFn:
                 worker = _GeneratorWorkerMt(dataset, self.eof)
                 worker.daemon = True
             self.workers.append(worker)
+        if multi_process is True:
+            self.watch_dog = threading.Thread(target=_watch_dog, args=(self.pid, self.eof))
+            self.watch_dog.daemon = True
+            self.watch_dog.start()
 
     def process(self, indices):
         """
@@ -3651,6 +3685,9 @@ class SamplerFn:
             if self.eof.is_set():
                 self._stop_subprocess()
                 return
+            if self.multi_process is True and not psutil.pid_exists(self.workers[i % self.num_worker].pid):
+                self._stop_subprocess()
+                return
             # Fetch result and put index
             try:
                 result = self.workers[i % self.num_worker].get()
@@ -3675,7 +3712,9 @@ class SamplerFn:
             self.eof.set()
             self.need_join = False
             for w in self.workers:
-                w.join()
+                if psutil.pid_exists(w.pid):
+                    w.join()
+            self.watch_dog.join()
 
     def __del__(self):
         self._stop_subprocess()
@@ -4369,20 +4408,6 @@ class Cifar10Dataset(MappableDataset):
         return cde.Cifar10Node(self.dataset_dir, self.usage, self.sampler)
 
 
-class CmuArcticDataset(MappableDataset):
-
-    @check_cmu_arctic_dataset
-    def __init__(self, dataset_dir, usage=None, num_samples=None, num_parallel_workers=None, shuffle=None,
-                 sampler=None, num_shards=None, shard_id=None, cache=None):
-        super().__init__(num_parallel_workers=num_parallel_workers, sampler=sampler, num_samples=num_samples,
-                         shuffle=shuffle, num_shards=num_shards, shard_id=shard_id, cache=cache)
-
-        self.dataset_dir = dataset_dir
-        self.usage = replace_none(usage, "aew")
-
-    def parse(self, children=None):
-        return cde.CmuArcticNode(self.dataset_dir, self.usage, self.sampler)
-
 class Cifar100Dataset(MappableDataset):
     """
     A source dataset for reading and parsing Cifar100 dataset.
@@ -5423,6 +5448,232 @@ class CSVDataset(SourceDataset):
                            self.num_samples, self.shuffle_flag, self.num_shards, self.shard_id)
 
 
+class _Flowers102Dataset:
+    """
+    Mainly for loading Flowers102 Dataset, and return one row each time.
+    """
+    def __init__(self, dataset_dir, task, usage, decode):
+        self.dataset_dir = os.path.realpath(dataset_dir)
+        self.task = task
+        self.usage = usage
+        self.decode = decode
+
+        if self.task == "Classification":
+            self.column_names = ["image", "label"]
+        else:
+            self.column_names = ["image", "segmentation", "label"]
+
+        labels_path = os.path.join(self.dataset_dir, "imagelabels.mat")
+        setid_path = os.path.join(self.dataset_dir, "setid.mat")
+        # minus one to transform 1~102 to 0 ~ 101
+        self.labels = (loadmat(labels_path)["labels"][0] - 1).astype(np.uint32)
+        self.setid = loadmat(setid_path)
+
+        if self.usage == 'train':
+            self.indices = self.setid["trnid"][0].tolist()
+        elif self.usage == 'test':
+            self.indices = self.setid["tstid"][0].tolist()
+        elif self.usage == 'valid':
+            self.indices = self.setid["valid"][0].tolist()
+        elif self.usage == 'all':
+            self.indices = self.setid["trnid"][0].tolist()
+            self.indices += self.setid["tstid"][0].tolist()
+            self.indices += self.setid["valid"][0].tolist()
+        else:
+            raise ValueError("Input usage is not within the valid set of ['train', 'valid', 'test', 'all'].")
+
+    def __getitem__(self, index):
+        # range: 1 ~ 8189
+        image_path = os.path.join(self.dataset_dir, "jpg", "image_" + str(self.indices[index]).zfill(5) + ".jpg")
+        if not os.path.exists(image_path):
+            raise RuntimeError("Can not find image file: " + image_path)
+
+        if self.decode is True:
+            image = np.asarray(Image.open(image_path).convert("RGB"))
+        else:
+            image = np.fromfile(image_path, dtype=np.uint8)
+
+        label = self.labels[self.indices[index] - 1]
+
+        if self.task == "Segmentation":
+            segmentation_path = \
+                os.path.join(self.dataset_dir, "segmim", "segmim_" + str(self.indices[index]).zfill(5) + ".jpg")
+            if not os.path.exists(segmentation_path):
+                raise RuntimeError("Can not find segmentation file: " + segmentation_path)
+            if self.decode is True:
+                segmentation = np.asarray(Image.open(segmentation_path).convert("RGB"))
+            else:
+                segmentation = np.fromfile(segmentation_path, dtype=np.uint8)
+            return image, segmentation, label
+
+        return image, label
+
+    def __len__(self):
+        return len(self.indices)
+
+
+class Flowers102Dataset(GeneratorDataset):
+    """
+    A source dataset for reading and parsing Flowers102 dataset.
+
+    The generated dataset has two columns :py:obj:`[image, label]` or three :py:obj:`[image, segmentation, label]`.
+    The tensor of column :py:obj:`image` is of the uint8 type.
+    The tensor of column :py:obj:`segmentation` is of the uint8 type.
+    The tensor of column :py:obj:`label` is a scalar or a tensor of the uint32 type.
+
+    Args:
+        dataset_dir (str): Path to the root directory that contains the dataset.
+        task (str): Specify the 'Classification' or 'Segmentation' task (default='Classification').
+        usage (str): Specify the 'train', 'valid', 'test' part or 'all' parts of dataset
+            (default='all', will read all samples).
+        num_samples (int, optional): The number of samples to be included in the dataset (default=None, all images).
+        num_parallel_workers (int, optional): Number of subprocesses used to fetch the dataset in parallel (default=1).
+        shuffle (bool, optional): Whether or not to perform shuffle on the dataset. Random accessible input is required.
+            (default=None, expected order behavior shown in the table).
+        decode (bool, optional): Whether or not to decode the images and segmentations after reading (default=False).
+        sampler (Union[Sampler, Iterable], optional): Object used to choose samples from the dataset. Random accessible
+            input is required (default=None, expected order behavior shown in the table).
+        num_shards (int, optional): Number of shards that the dataset will be divided into (default=None).
+            Random accessible input is required. When this argument is specified, 'num_samples' reflects the max
+            sample number of per shard.
+        shard_id (int, optional): The shard ID within num_shards (default=None). This argument must be specified only
+            when num_shards is also specified. Random accessible input is required.
+
+    Raises:
+        RuntimeError: If dataset_dir does not contain data files.
+        RuntimeError: If num_parallel_workers exceeds the max thread numbers.
+        RuntimeError: If sampler and shuffle are specified at the same time.
+        RuntimeError: If sampler and sharding are specified at the same time.
+        RuntimeError: If num_shards is specified but shard_id is None.
+        RuntimeError: If shard_id is specified but num_shards is None.
+        ValueError: If shard_id is invalid (< 0 or >= num_shards).
+
+    Note:
+        - This dataset can take in a sampler. 'sampler' and 'shuffle' are mutually exclusive.
+          The table below shows what input arguments are allowed and their expected behavior.
+
+    .. list-table:: Expected Order Behavior of Using 'sampler' and 'shuffle'
+       :widths: 25 25 50
+       :header-rows: 1
+
+       * - Parameter 'sampler'
+         - Parameter 'shuffle'
+         - Expected Order Behavior
+       * - None
+         - None
+         - random order
+       * - None
+         - True
+         - random order
+       * - None
+         - False
+         - sequential order
+       * - Sampler object
+         - None
+         - order defined by sampler
+       * - Sampler object
+         - True
+         - not allowed
+       * - Sampler object
+         - False
+         - not allowed
+
+    Examples:
+        >>> flowers102_dataset_dir = "/path/to/flowers102_dataset_directory"
+        >>> dataset = ds.Flowers102Dataset(dataset_dir=flowers102_dataset_dir,
+        ...                                task="Classification",
+        ...                                usage="all",
+        ...                                decode=True)
+
+    About Flowers102 dataset:
+
+    Flowers102 dataset consists of 102 flower categories.
+    The flowers commonly occur in the United Kingdom.
+    Each class consists of between 40 and 258 images.
+
+    Here is the original Flowers102 dataset structure.
+    You can unzip the dataset files into this directory structure and read by MindSpore's API.
+
+    .. code-block::
+        .
+        └── flowes102_dataset_dir
+             ├── imagelabels.mat
+             ├── setid.mat
+             ├── jpg
+                  ├── image_00001.jpg
+                  ├── image_00002.jpg
+                  ├── ...
+             ├── segmim
+                  ├── segmim_00001.jpg
+                  ├── segmim_00002.jpg
+                  ├── ...
+
+    Citation:
+
+    .. code-block::
+
+        @InProceedings{Nilsback08,
+          author       = "Maria-Elena Nilsback and Andrew Zisserman",
+          title        = "Automated Flower Classification over a Large Number of Classes",
+          booktitle    = "Indian Conference on Computer Vision, Graphics and Image Processing",
+          month        = "Dec",
+          year         = "2008",
+        }
+    """
+
+    @check_flowers102dataset
+    def __init__(self, dataset_dir, task="Classification", usage="all", num_samples=None, num_parallel_workers=1,
+                 shuffle=None, decode=False, sampler=None, num_shards=None, shard_id=None):
+        self.dataset_dir = os.path.realpath(dataset_dir)
+        self.task = replace_none(task, "Classification")
+        self.usage = replace_none(usage, "all")
+        self.decode = replace_none(decode, False)
+        dataset = _Flowers102Dataset(self.dataset_dir, self.task, self.usage, self.decode)
+        super().__init__(dataset, column_names=dataset.column_names, num_samples=num_samples,
+                         num_parallel_workers=num_parallel_workers, shuffle=shuffle, sampler=sampler,
+                         num_shards=num_shards, shard_id=shard_id)
+
+    def get_class_indexing(self):
+        """
+        Get the class index.
+
+        Returns:
+            dict, a str-to-int mapping from label name to index.
+        """
+        class_names = [
+            "pink primrose", "hard-leaved pocket orchid", "canterbury bells",
+            "sweet pea", "english marigold", "tiger lily", "moon orchid",
+            "bird of paradise", "monkshood", "globe thistle", "snapdragon",
+            "colt's foot", "king protea", "spear thistle", "yellow iris",
+            "globe-flower", "purple coneflower", "peruvian lily", "balloon flower",
+            "giant white arum lily", "fire lily", "pincushion flower", "fritillary",
+            "red ginger", "grape hyacinth", "corn poppy", "prince of wales feathers",
+            "stemless gentian", "artichoke", "sweet william", "carnation",
+            "garden phlox", "love in the mist", "mexican aster", "alpine sea holly",
+            "ruby-lipped cattleya", "cape flower", "great masterwort", "siam tulip",
+            "lenten rose", "barbeton daisy", "daffodil", "sword lily", "poinsettia",
+            "bolero deep blue", "wallflower", "marigold", "buttercup", "oxeye daisy",
+            "common dandelion", "petunia", "wild pansy", "primula", "sunflower",
+            "pelargonium", "bishop of llandaff", "gaura", "geranium", "orange dahlia",
+            "pink-yellow dahlia?", "cautleya spicata", "japanese anemone",
+            "black-eyed susan", "silverbush", "californian poppy", "osteospermum",
+            "spring crocus", "bearded iris", "windflower", "tree poppy", "gazania",
+            "azalea", "water lily", "rose", "thorn apple", "morning glory",
+            "passion flower", "lotus", "toad lily", "anthurium", "frangipani",
+            "clematis", "hibiscus", "columbine", "desert-rose", "tree mallow",
+            "magnolia", "cyclamen", "watercress", "canna lily", "hippeastrum",
+            "bee balm", "ball moss", "foxglove", "bougainvillea", "camellia", "mallow",
+            "mexican petunia", "bromelia", "blanket flower", "trumpet creeper",
+            "blackberry lily"
+        ]
+
+        class_dict = {}
+        for i, class_name in enumerate(class_names):
+            class_dict[class_name] = i
+
+        return class_dict
+
+
 class TextFileDataset(SourceDataset):
     """
     A source dataset that reads and parses datasets stored on disk in text format.
@@ -5679,3 +5930,384 @@ class PaddedDataset(GeneratorDataset):
         super().__init__(dataset, column_names=dataset.column_names, num_shards=None, shard_id=None, shuffle=False)
         self._dataset_size = len(dataset.padded_samples)
         self.padded_samples = padded_samples
+
+
+class FlickrDataset(MappableDataset):
+    """
+    A source dataset for reading and parsing Flickr8k and Flickr30k dataset.
+
+    The generated dataset has two columns :py:obj:`[image, annotation]`.
+    The tensor of column :py:obj:`image` is of the uint8 type.
+    The tensor of column :py:obj:`annotation` is a tensor which contains 5 annotations string,
+    such as ["a", "b", "c", "d", "e"].
+
+    Args:
+        dataset_dir (str): Path to the root directory that contains the dataset.
+        annotation_file (str): Path to the root directory that contains the annotation.
+        num_samples (int, optional): The number of images to be included in the dataset.
+            (default=None, all images).
+        num_parallel_workers (int, optional): Number of workers to read the data
+            (default=None, number set in the config).
+        shuffle (bool, optional): Whether to perform shuffle on the dataset (default=None, expected
+            order behavior shown in the table).
+        decode (bool, optional): Decode the images after reading (default=False).
+        sampler (Sampler, optional): Object used to choose samples from the
+            dataset (default=None, expected order behavior shown in the table).
+        num_shards (int, optional): Number of shards that the dataset will be divided
+            into (default=None). When this argument is specified, `num_samples` reflects
+            the max sample number of per shard.
+        shard_id (int, optional): The shard ID within num_shards (default=None). This
+            argument can only be specified when num_shards is also specified.
+        cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
+            (default=None, which means no cache is used).
+
+    Raises:
+        RuntimeError: If dataset_dir is not valid or does not contain data files.
+        RuntimeError: If num_parallel_workers exceeds the max thread numbers.
+        RuntimeError: If sampler and shuffle are specified at the same time.
+        RuntimeError: If sampler and sharding are specified at the same time.
+        RuntimeError: If num_shards is specified but shard_id is None.
+        RuntimeError: If shard_id is specified but num_shards is None.
+        ValueError: If dataset_dir is not exist.
+        ValueError: If annotation_file is not exist.
+        ValueError: If shard_id is invalid (< 0 or >= num_shards).
+
+    Note:
+        - This dataset can take in a `sampler`. `sampler` and `shuffle` are mutually exclusive.
+          The table below shows what input arguments are allowed and their expected behavior.
+
+    .. list-table:: Expected Order Behavior of Using `sampler` and `shuffle`
+       :widths: 25 25 50
+       :header-rows: 1
+
+       * - Parameter `sampler`
+         - Parameter `shuffle`
+         - Expected Order Behavior
+       * - None
+         - None
+         - random order
+       * - None
+         - True
+         - random order
+       * - None
+         - False
+         - sequential order
+       * - Sampler object
+         - None
+         - order defined by sampler
+       * - Sampler object
+         - True
+         - not allowed
+       * - Sampler object
+         - False
+         - not allowed
+
+    Examples:
+        >>> flickr_dataset_dir = "/path/to/flickr_dataset_directory"
+        >>> annotation_file = "/path/to/flickr_annotation_file"
+        >>>
+        >>> # 1) Get all samples from FLICKR dataset in sequence
+        >>> dataset = ds.FlickrDataset(dataset_dir=flickr_dataset_dir,
+        ...                            annotation_file=annotation_file,
+        ...                            shuffle=False)
+        >>>
+        >>> # 2) Randomly select 350 samples from FLICKR dataset
+        >>> dataset = ds.FlickrDataset(dataset_dir=flickr_dataset_dir,
+        ...                            annotation_file=annotation_file,
+        ...                            num_samples=350,
+        ...                            shuffle=True)
+        >>>
+        >>> # 3) Get samples from FLICKR dataset for shard 0 in a 2-way distributed training
+        >>> dataset = ds.FlickrDataset(dataset_dir=flickr_dataset_dir,
+        ...                            annotation_file=annotation_file,
+        ...                            num_shards=2,
+        ...                            shard_id=0)
+        >>>
+        >>> # In FLICKR dataset, each dictionary has keys "image" and "annotation"
+
+    About Flickr8k dataset:
+
+    The Flickr8k dataset consists of 8092 colour images. There are 40460 annotations in the Flickr8k.token.txt,
+    each image has 5 annotations.
+
+    You can unzip the dataset files into the following directory structure and read by MindSpore's API.
+
+    .. code-block::
+
+        .
+        └── Flickr8k
+             ├── Flickr8k_Dataset
+             │    ├── 1000268201_693b08cb0e.jpg
+             │    ├── 1001773457_577c3a7d70.jpg
+             │    ├── ...
+             └── Flickr8k.token.txt
+
+    Citation:
+
+    .. code-block::
+
+        @article{DBLP:journals/jair/HodoshYH13,
+        author    = {Micah Hodosh and Peter Young and Julia Hockenmaier},
+        title     = {Framing Image Description as a Ranking Task: Data, Models and Evaluation Metrics},
+        journal   = {J. Artif. Intell. Res.},
+        volume    = {47},
+        pages     = {853--899},
+        year      = {2013},
+        url       = {https://doi.org/10.1613/jair.3994},
+        doi       = {10.1613/jair.3994},
+        timestamp = {Mon, 21 Jan 2019 15:01:17 +0100},
+        biburl    = {https://dblp.org/rec/journals/jair/HodoshYH13.bib},
+        bibsource = {dblp computer science bibliography, https://dblp.org}
+        }
+
+    About Flickr30k dataset:
+
+    The Flickr30k dataset consists of 31783 colour images. There are 158915 annotations in
+    the results_20130124.token, each image has 5 annotations.
+
+    You can unzip the dataset files into the following directory structure and read by MindSpore's API.
+
+    Citation:
+
+    .. code-block::
+
+        .
+        └── Flickr30k
+             ├── flickr30k-images
+             │    ├── 1000092795.jpg
+             │    ├── 10002456.jpg
+             │    ├── ...
+             └── results_20130124.token
+
+    .. code-block::
+
+        @article{DBLP:journals/tacl/YoungLHH14,
+        author    = {Peter Young and Alice Lai and Micah Hodosh and Julia Hockenmaier},
+        title     = {From image descriptions to visual denotations: New similarity metrics
+                     for semantic inference over event descriptions},
+        journal   = {Trans. Assoc. Comput. Linguistics},
+        volume    = {2},
+        pages     = {67--78},
+        year      = {2014},
+        url       = {https://tacl2013.cs.columbia.edu/ojs/index.php/tacl/article/view/229},
+        timestamp = {Wed, 17 Feb 2021 21:55:25 +0100},
+        biburl    = {https://dblp.org/rec/journals/tacl/YoungLHH14.bib},
+        bibsource = {dblp computer science bibliography, https://dblp.org}
+        }
+    """
+
+    @check_flickr_dataset
+    def __init__(self, dataset_dir, annotation_file, num_samples=None, num_parallel_workers=None, shuffle=None,
+                 decode=None, sampler=None, num_shards=None, shard_id=None, cache=None):
+        super().__init__(num_parallel_workers=num_parallel_workers, sampler=sampler, num_samples=num_samples,
+                         shuffle=shuffle, num_shards=num_shards, shard_id=shard_id, cache=cache)
+
+        self.dataset_dir = dataset_dir
+        self.annotation_file = annotation_file
+        self.decode = replace_none(decode, False)
+
+    def parse(self, children=None):
+        return cde.FlickrNode(self.dataset_dir, self.annotation_file, self.decode, self.sampler)
+
+
+class SBDataset(GeneratorDataset):
+    """
+    A source dataset for reading and parsing Semantic Boundaries Dataset.
+
+    The generated dataset has two columns: :py:obj:`[image, task]`.
+    The tensor of column :py:obj:`image` is of the uint8 type.
+    The tensor of column :py:obj:`task` contains 20 images of the uint8 type if `task` is `Boundaries` otherwise
+    contains 1 image of the uint8 type.
+
+    Args:
+        dataset_dir (str): Path to the root directory that contains the dataset.
+        task (str, optional): Acceptable tasks include `Boundaries` or `Segmentation` (default=`Boundaries`).
+        usage (str, optional): Acceptable usages include `train`, `val`, `train_noval` and `all` (default=`all`).
+        num_samples (int, optional): The number of images to be included in the dataset.
+            (default=None, all images).
+        num_parallel_workers (int, optional): Number of workers to read the data
+            (default=None, number set in the config).
+        shuffle (bool, optional): Whether to perform shuffle on the dataset (default=None, expected
+            order behavior shown in the table).
+        sampler (Sampler, optional): Object used to choose samples from the
+            dataset (default=None, expected order behavior shown in the table).
+        num_shards (int, optional): Number of shards that the dataset will be divided
+            into (default=None). When this argument is specified, `num_samples` reflects
+            the max sample number of per shard.
+        shard_id (int, optional): The shard ID within num_shards (default=None). This
+            argument can only be specified when num_shards is also specified.
+
+    Raises:
+        RuntimeError: If dataset_dir is not valid or does not contain data files.
+        RuntimeError: If num_parallel_workers exceeds the max thread numbers.
+        RuntimeError: If sampler and shuffle are specified at the same time.
+        RuntimeError: If sampler and sharding are specified at the same time.
+        RuntimeError: If num_shards is specified but shard_id is None.
+        RuntimeError: If shard_id is specified but num_shards is None.
+        ValueError: If dataset_dir is not exist.
+        ValueError: If task is not in [`Boundaries`, `Segmentation`].
+        ValueError: If usage is not in [`train`, `val`, `train_noval`, `all`].
+        ValueError: If shard_id is invalid (< 0 or >= num_shards).
+
+    Note:
+        - This dataset can take in a sampler. `sampler` and `shuffle` are mutually exclusive.
+          The table below shows what input arguments are allowed and their expected behavior.
+
+    .. list-table:: Expected Order Behavior of Using `sampler` and `shuffle`
+       :widths: 25 25 50
+       :header-rows: 1
+
+       * - Parameter `sampler`
+         - Parameter `shuffle`
+         - Expected Order Behavior
+       * - None
+         - None
+         - random order
+       * - None
+         - True
+         - random order
+       * - None
+         - False
+         - sequential order
+       * - Sampler object
+         - None
+         - order defined by sampler
+       * - Sampler object
+         - True
+         - not allowed
+       * - Sampler object
+         - False
+         - not allowed
+
+    Examples:
+        >>> sb_dataset_dir = "/path/to/sb_dataset_directory"
+        >>>
+        >>> # 1) Get all samples from Semantic Boundaries Dataset in sequence
+        >>> dataset = ds.SBDataset(dataset_dir=sb_dataset_dir, shuffle=False)
+        >>>
+        >>> # 2) Randomly select 350 samples from Semantic Boundaries Dataset
+        >>> dataset = ds.SBDataset(dataset_dir=sb_dataset_dir, num_samples=350, shuffle=True)
+        >>>
+        >>> # 3) Get samples from Semantic Boundaries Dataset for shard 0 in a 2-way distributed training
+        >>> dataset = ds.SBDataset(dataset_dir=sb_dataset_dir, num_shards=2, shard_id=0)
+        >>>
+        >>> # In Semantic Boundaries Dataset, each dictionary has keys "image" and "task"
+
+    About Semantic Boundaries Dataset:
+
+    The Semantic Boundaries Dataset consists of 11355 colour images. There are 8498 images' name in the train.txt,
+    2857 images' name in the val.txt and 5623 images' name in the train_noval.txt. The category cls/
+    contains the Segmentation and Boundaries results of category-level, the category inst/ catains the
+    Segmentation and Boundaries results of instance-level.
+
+    You can unzip the dataset files into the following structure and read by MindSpore's API:
+
+    .. code-block::
+
+         .
+         └── benchmark_RELEASE
+              ├── dataset
+              ├── img
+              │    ├── 2008_000002.jpg
+              │    ├── 2008_000003.jpg
+              │    ├── ...
+              ├── cls
+              │    ├── 2008_000002.mat
+              │    ├── 2008_000003.mat
+              │    ├── ...
+              ├── inst
+              │    ├── 2008_000002.mat
+              │    ├── 2008_000003.mat
+              │    ├── ...
+              ├── train.txt
+              └── val.txt
+
+    .. code-block::
+
+        @InProceedings{BharathICCV2011,
+            author       = "Bharath Hariharan and Pablo Arbelaez and Lubomir Bourdev and
+                            Subhransu Maji and Jitendra Malik",
+            title        = "Semantic Contours from Inverse Detectors",
+            booktitle    = "International Conference on Computer Vision (ICCV)",
+            year         = "2011",
+    """
+
+    @check_sb_dataset
+    def __init__(self, dataset_dir, task='Boundaries', usage='all', num_samples=None, num_parallel_workers=1,
+                 shuffle=None, decode=None, sampler=None, num_shards=None, shard_id=None):
+        dataset = _SBDataset(dataset_dir, task, usage, decode)
+        super().__init__(dataset, column_names=dataset.column_list, num_samples=num_samples,
+                         num_parallel_workers=num_parallel_workers, shuffle=shuffle, sampler=sampler,
+                         num_shards=num_shards, shard_id=shard_id)
+
+
+class _SBDataset:
+    """
+    Dealing with the data file with .mat extension, and return one row in tuple (image, task) each time.
+    """
+
+    def __init__(self, dataset_dir, task, usage, decode):
+        self.column_list = ['image', 'task']
+        self.task = task
+        self.images_path = os.path.join(dataset_dir, 'img')
+        self.cls_path = os.path.join(dataset_dir, 'cls')
+        self._loadmat = loadmat
+        self.categories = 20
+        self.decode = replace_none(decode, False)
+
+        if usage == "all":
+            image_names = []
+            for item in ["train", "val"]:
+                usage_path = os.path.join(dataset_dir, item + '.txt')
+                if not os.path.exists(usage_path):
+                    raise FileNotFoundError("SBDataset: {0} not found".format(usage_path))
+                with open(usage_path, 'r') as f:
+                    image_names += [x.strip() for x in f.readlines()]
+        else:
+            usage_path = os.path.join(dataset_dir, usage + '.txt')
+            if not os.path.exists(usage_path):
+                raise FileNotFoundError("SBDataset: {0} not found".format(usage_path))
+            with open(usage_path, 'r') as f:
+                image_names = [x.strip() for x in f.readlines()]
+
+        self.images = [os.path.join(self.images_path, i + ".jpg") for i in image_names]
+        self.clss = [os.path.join(self.cls_path, i + ".mat") for i in image_names]
+
+        if len(self.images) != len(self.clss):
+            raise ValueError("SBDataset: images count not equal to cls count")
+
+        self._get_data = self._get_boundaries_data if self.task == "Boundaries" else self._get_segmentation_data
+        self._get_item = self._get_decode_item if self.decode else self._get_undecode_item
+
+    def _get_boundaries_data(self, mat_path):
+        mat_data = self._loadmat(mat_path)
+        return np.concatenate([np.expand_dims(mat_data['GTcls'][0][self.task][0][i][0].toarray(), axis=0)
+                               for i in range(self.categories)], axis=0)
+
+    def _get_segmentation_data(self, mat_path):
+        mat_data = self._loadmat(mat_path)
+        return Image.fromarray(mat_data['GTcls'][0][self.task][0])
+
+    def _get_decode_item(self, idx):
+        return Image.open(self.images[idx]).convert('RGB'), self._get_data(self.clss[idx])
+
+    def _get_undecode_item(self, idx):
+        return np.fromfile(self.images[idx], dtype=np.uint8), self._get_data(self.clss[idx])
+
+    def __len__(self):
+        return len(self.images)
+
+    def __getitem__(self, idx):
+        return self._get_item(idx)
+
+
+class DeserializedDataset(Dataset):
+    def __init__(self, input_obj):
+        super().__init__()
+        self.input_obj = input_obj
+
+    def parse(self, children=None):
+        if isinstance(self.input_obj, dict):
+            json_str = json.dumps(self.input_obj)
+            return cde.Dataset.from_json_string(json_str)
+        return cde.Dataset.from_json_file(self.input_obj)
diff --git a/mindspore/dataset/engine/queue.py b/mindspore/dataset/engine/queue.py
index fd9d163489a..c3a2b6858bc 100644
--- a/mindspore/dataset/engine/queue.py
+++ b/mindspore/dataset/engine/queue.py
@@ -102,7 +102,7 @@ class _SharedQueue(multiprocessing.queues.Queue):
                                 "Using shared memory queue, but rowsize is larger than allocated memory "
                                 + "max_rowsize "
                                 + str(self.seg_size)
-                                + " current rowwize "
+                                + " current rowsize "
                                 + str(start_bytes + r.nbytes)
                             )
                             self.print_error = False
diff --git a/mindspore/dataset/engine/validators.py b/mindspore/dataset/engine/validators.py
index 2db4b32ef16..1daebde81e9 100644
--- a/mindspore/dataset/engine/validators.py
+++ b/mindspore/dataset/engine/validators.py
@@ -92,36 +92,6 @@ def check_mnist_cifar_dataset(method):
     return new_method
 
 
-def check_cmu_arctic_dataset(method):
-    """A wrapper that wraps a parameter checker around the original CmuArcticDataset."""
-
-    @wraps(method)
-    def new_method(self, *args, **kwargs):
-        _, param_dict = parse_user_args(method, *args, **kwargs)
-
-        nreq_param_int = ['num_samples', 'num_parallel_workers', 'num_shards', 'shard_id']
-        nreq_param_bool = ['shuffle']
-
-        dataset_dir = param_dict.get('dataset_dir')
-        check_dir(dataset_dir)
-
-        usage = param_dict.get('usage')
-        if usage is not None:
-            check_valid_str(usage, ['aew', 'ahw', 'aup', 'awb', 'axb', 'bdl', 'clb', 'eey', 'fem', 'gka', 'jmk', 'ksp', 'ljm', 'lnh', 'rms', 'rxr', 'slp' , 'slt'], "usage")
-
-        validate_dataset_param_value(nreq_param_int, param_dict, int)
-        validate_dataset_param_value(nreq_param_bool, param_dict, bool)
-
-        check_sampler_shuffle_shard_options(param_dict)
-
-        cache = param_dict.get('cache')
-        check_cache_option(cache)
-
-        return method(self, *args, **kwargs)
-
-    return new_method
-
-
 def check_manifestdataset(method):
     """A wrapper that wraps a parameter checker around the original Dataset(ManifestDataset)."""
 
@@ -314,7 +284,7 @@ def check_save(method):
         nreq_param_str = ['file_name', 'file_type']
         validate_dataset_param_value(nreq_param_int, param_dict, int)
         if (param_dict.get('num_files') <= 0 or param_dict.get('num_files') > 1000):
-            raise ValueError("num_files should between {} and {}.".format(1, 1000))
+            raise ValueError("num_files should between 0 and 1000.")
         validate_dataset_param_value(nreq_param_str, param_dict, str)
         if param_dict.get('file_type') != 'mindrecord':
             raise ValueError("{} dataset format is not supported.".format(param_dict.get('file_type')))
@@ -405,7 +375,9 @@ def check_generatordataset(method):
             try:
                 iter(source)
             except TypeError:
-                raise TypeError("source should be callable, iterable or random accessible.")
+                raise TypeError("Input `source` function of GeneratorDataset should be callable, iterable or random"
+                                " accessible, commonly it should implement one of the method like yield, __getitem__ or"
+                                " __next__(__iter__).")
 
         column_names = param_dict.get('column_names')
         if column_names is not None:
@@ -419,7 +391,7 @@ def check_generatordataset(method):
                 raise ValueError("schema should be a path to schema file or a schema object.")
 
         # check optional argument
-        nreq_param_int = ["num_samples", "num_parallel_workers", "num_shards", "shard_id"]
+        nreq_param_int = ["max_rowsize", "num_samples", "num_parallel_workers", "num_shards", "shard_id"]
         validate_dataset_param_value(nreq_param_int, param_dict, int)
         nreq_param_list = ["column_types"]
         validate_dataset_param_value(nreq_param_list, param_dict, list)
@@ -491,11 +463,11 @@ def check_pad_info(key, val):
         type_check(val, (tuple,), "value in pad_info")
 
         if val[0] is not None:
-            type_check(val[0], (list,), "pad_shape")
+            type_check(val[0], (list,), "shape in pad_info")
 
             for dim in val[0]:
                 if dim is not None:
-                    check_pos_int32(dim, "dim in pad_shape")
+                    check_pos_int32(dim, "dim of shape in pad_info")
         if val[1] is not None:
             type_check(val[1], (int, float, str, bytes), "pad_value")
 
@@ -710,7 +682,7 @@ def check_repeat(method):
         type_check(count, (int, type(None)), "repeat")
         if isinstance(count, int):
             if (count <= 0 and count != -1) or count > INT32_MAX:
-                raise ValueError("count should be either -1 or positive integer.")
+                raise ValueError("count should be either -1 or positive integer, range[1, INT32_MAX].")
         return method(self, *args, **kwargs)
 
     return new_method
@@ -724,7 +696,7 @@ def check_skip(method):
         [count], _ = parse_user_args(method, *args, **kwargs)
 
         type_check(count, (int,), "count")
-        check_value(count, (-1, INT32_MAX), "count")
+        check_value(count, (0, INT32_MAX), "count")
 
         return method(self, *args, **kwargs)
 
@@ -739,7 +711,8 @@ def check_take(method):
         [count], _ = parse_user_args(method, *args, **kwargs)
         type_check(count, (int,), "count")
         if (count <= 0 and count != -1) or count > INT32_MAX:
-            raise ValueError("count should be either -1 or positive integer.")
+            raise ValueError("count should be either -1 or within the required interval of ({}, {}], got {}."
+                             .format(0, INT32_MAX, count))
 
         return method(self, *args, **kwargs)
 
@@ -770,14 +743,9 @@ def check_device_send(method):
 
     @wraps(method)
     def new_method(self, *args, **kwargs):
-        param, param_dict = parse_user_args(method, *args, **kwargs)
-        para_list = list(param_dict.keys())
-        if "prefetch_size" in para_list:
-            if param[0] is not None:
-                check_pos_int32(param[0], "prefetch_size")
-            type_check(param[1], (bool,), "send_epoch_end")
-        else:
-            type_check(param[0], (bool,), "send_epoch_end")
+        [send_epoch_end, create_data_info_queue], _ = parse_user_args(method, *args, **kwargs)
+        type_check(send_epoch_end, (bool,), "send_epoch_end")
+        type_check(create_data_info_queue, (bool,), "create_data_info_queue")
 
         return method(self, *args, **kwargs)
 
@@ -870,7 +838,6 @@ def check_schema(method):
         [schema_file], _ = parse_user_args(method, *args, **kwargs)
 
         if schema_file is not None:
-            type_check(schema_file, (str,), "schema_file")
             check_file(schema_file)
 
         return method(self, *args, **kwargs)
@@ -985,6 +952,44 @@ def check_csvdataset(method):
     return new_method
 
 
+def check_flowers102dataset(method):
+    """A wrapper that wraps a parameter checker around the original Dataset(Flowers102Dataset)."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        _, param_dict = parse_user_args(method, *args, **kwargs)
+
+        nreq_param_int = ['num_samples', 'num_parallel_workers', 'num_shards', 'shard_id']
+        nreq_param_bool = ['shuffle', 'decode']
+
+        dataset_dir = param_dict.get('dataset_dir')
+        check_dir(dataset_dir)
+
+        check_dir(os.path.join(dataset_dir, "jpg"))
+
+        check_file(os.path.join(dataset_dir, "imagelabels.mat"))
+        check_file(os.path.join(dataset_dir, "setid.mat"))
+
+        usage = param_dict.get('usage')
+        if usage is not None:
+            check_valid_str(usage, ["train", "valid", "test", "all"], "usage")
+
+        task = param_dict.get('task')
+        if task is not None:
+            check_valid_str(task, ["Classification", "Segmentation"], "task")
+        if task == "Segmentation":
+            check_dir(os.path.join(dataset_dir, "segmim"))
+
+        validate_dataset_param_value(nreq_param_int, param_dict, int)
+        validate_dataset_param_value(nreq_param_bool, param_dict, bool)
+
+        check_sampler_shuffle_shard_options(param_dict)
+
+        return method(self, *args, **kwargs)
+
+    return new_method
+
+
 def check_textfiledataset(method):
     """A wrapper that wraps a parameter checker around the original Dataset(TextFileDataset)."""
 
@@ -1138,7 +1143,7 @@ def check_gnn_get_all_neighbors(method):
 
     @wraps(method)
     def new_method(self, *args, **kwargs):
-        [node_list, neighbour_type], _ = parse_user_args(method, *args, **kwargs)
+        [node_list, neighbour_type, _], _ = parse_user_args(method, *args, **kwargs)
 
         check_gnn_list_or_ndarray(node_list, 'node_list')
         type_check(neighbour_type, (int,), "neighbour_type")
@@ -1293,7 +1298,7 @@ def check_numpyslicesdataset(method):
 
         data = param_dict.get("data")
         column_names = param_dict.get("column_names")
-        if not data:
+        if data is None or len(data) == 0:  # pylint: disable=len-as-condition
             raise ValueError("Argument data cannot be empty")
         type_check(data, (list, tuple, dict, np.ndarray), "data")
         if isinstance(data, tuple):
@@ -1362,3 +1367,62 @@ def check_to_device_send(method):
         return method(self, *args, **kwargs)
 
     return new_method
+
+
+def check_flickr_dataset(method):
+    """A wrapper that wraps a parameter checker around the original Dataset(Flickr8k, Flickr30k)."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        _, param_dict = parse_user_args(method, *args, **kwargs)
+
+        nreq_param_int = ['num_samples', 'num_parallel_workers', 'num_shards', 'shard_id']
+        nreq_param_bool = ['shuffle', 'decode']
+
+        dataset_dir = param_dict.get('dataset_dir')
+        annotation_file = param_dict.get('annotation_file')
+        check_dir(dataset_dir)
+        check_file(annotation_file)
+
+        validate_dataset_param_value(nreq_param_int, param_dict, int)
+        validate_dataset_param_value(nreq_param_bool, param_dict, bool)
+
+        check_sampler_shuffle_shard_options(param_dict)
+
+        cache = param_dict.get('cache')
+        check_cache_option(cache)
+
+        return method(self, *args, **kwargs)
+
+    return new_method
+
+
+def check_sb_dataset(method):
+    """A wrapper that wraps a parameter checker around the original Semantic Boundaries Dataset."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        _, param_dict = parse_user_args(method, *args, **kwargs)
+
+        nreq_param_int = ['num_samples', 'num_parallel_workers', 'num_shards', 'shard_id']
+        nreq_param_bool = ['shuffle', 'decode']
+
+        dataset_dir = param_dict.get('dataset_dir')
+        check_dir(dataset_dir)
+
+        usage = param_dict.get('usage')
+        if usage is not None:
+            check_valid_str(usage, ["train", "val", "train_noval", "all"], "usage")
+
+        task = param_dict.get('task')
+        if task is not None:
+            check_valid_str(task, ["Boundaries", "Segmentation"], "task")
+
+        validate_dataset_param_value(nreq_param_int, param_dict, int)
+        validate_dataset_param_value(nreq_param_bool, param_dict, bool)
+
+        check_sampler_shuffle_shard_options(param_dict)
+
+        return method(self, *args, **kwargs)
+
+    return new_method
diff --git a/mindspore/dataset/vision/c_transforms.py b/mindspore/dataset/vision/c_transforms.py
index 1f7fb720e82..103aafc8a64 100644
--- a/mindspore/dataset/vision/c_transforms.py
+++ b/mindspore/dataset/vision/c_transforms.py
@@ -109,7 +109,7 @@ def parse_padding(padding):
 
 class AdjustGamma(ImageTensorOperation):
     r"""
-    Apply gamma correction on input image. Input image is expected to be in [..., H, W, C] or [H, W, C] format.
+    Apply gamma correction on input image. Input image is expected to be in [..., H, W, C] or [H, W] format.
     .. math::
         I_{\text{out}} = 255 \times \text{gain} \times \left(\frac{I_{\text{in}}}{255}\right)^{\gamma}
 
@@ -1511,6 +1511,7 @@ class RgbToBgr(ImageTensorOperation):
 
     Examples:
         >>> from mindspore.dataset.vision import Inter
+        >>>
         >>> decode_op = c_vision.Decode()
         >>> rgb2bgr_op = c_vision.RgbToBgr()
         >>> transforms_list = [decode_op, rgb2bgr_op]
diff --git a/mindspore/dataset/vision/utils.py b/mindspore/dataset/vision/utils.py
index 2843e519dea..75ed8549707 100644
--- a/mindspore/dataset/vision/utils.py
+++ b/mindspore/dataset/vision/utils.py
@@ -43,6 +43,7 @@ class ImageBatchFormat(IntEnum):
     NHWC = 0
     NCHW = 1
 
+
 class SliceMode(IntEnum):
     PAD = 0
     DROP = 1
diff --git a/mindspore/dataset/vision/validators.py b/mindspore/dataset/vision/validators.py
index d8d7b84385b..546db4a4362 100644
--- a/mindspore/dataset/vision/validators.py
+++ b/mindspore/dataset/vision/validators.py
@@ -22,7 +22,7 @@ from mindspore._c_dataengine import TensorOp, TensorOperation
 from mindspore.dataset.core.validator_helpers import check_value, check_uint8, FLOAT_MIN_INTEGER, FLOAT_MAX_INTEGER, \
     check_pos_float32, check_float32, check_2tuple, check_range, check_positive, INT32_MAX, INT32_MIN, \
     parse_user_args, type_check, type_check_list, check_c_tensor_op, UINT8_MAX, check_value_normalize_std, \
-    check_value_cutoff, check_value_ratio, check_odd
+    check_value_cutoff, check_value_ratio, check_odd, check_non_negative_float32
 from .utils import Inter, Border, ImageBatchFormat, SliceMode
 
 
@@ -143,7 +143,7 @@ def check_degrees(degrees):
     """Check if the degrees is legal."""
     type_check(degrees, (int, float, list, tuple), "degrees")
     if isinstance(degrees, (int, float)):
-        check_pos_float32(degrees, "degrees")
+        check_non_negative_float32(degrees, "degrees")
     elif isinstance(degrees, (list, tuple)):
         if len(degrees) == 2:
             type_check_list(degrees, (int, float), "degrees")
diff --git a/mindspore/lite/CMakeLists.txt b/mindspore/lite/CMakeLists.txt
index 17d306a98de..e314839a839 100644
--- a/mindspore/lite/CMakeLists.txt
+++ b/mindspore/lite/CMakeLists.txt
@@ -2,6 +2,10 @@ cmake_minimum_required(VERSION 3.12)
 project(Lite)
 
 set(BUILD_LITE "on")
+
+include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/secure_option.cmake)
+include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_link_option.cmake)
+
 if(TOOLCHAIN_NAME STREQUAL "himix200")
   set(TARGET_HIMIX200 on)
   add_compile_definitions(SUPPORT_NNIE)
@@ -20,7 +24,7 @@ if(PLATFORM_ARM32 AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND CMAKE_CXX_COMPI
     your Clang version:[${CMAKE_CXX_COMPILER_VERSION}] must not be less than 9.0 and please use android nkd r21e!")
 endif()
 
-# Options that can be configured through environment variables or manually
+#Options that can be configured through environment variables or manually
 set(MSLITE_GPU_BACKEND "" CACHE STRING "enable gpu backend, \
     only arm64 support opencl, only x86_64 support tensorrt, opencl/cuda/tensorrt/off")
 option(MSLITE_ENABLE_NPU "enable npu, only arm64 or arm32 support" off)
@@ -31,18 +35,21 @@ option(MSLITE_ENABLE_CONVERTER "enable converter, only x86_64 support" on)
 option(MSLITE_ENABLE_TOOLS "enable tools" on)
 option(MSLITE_ENABLE_TESTCASES "enable testcase" off)
 option(MSLITE_ENABLE_NNIE "enable NNIE" off)
+option(MSLITE_ENABLE_RUNTIME_PASS "enable runtime pass" on)
 option(MSLITE_COMPILE_NNIE "compile NNIE" off)
 option(MSLITE_ENABLE_HIGH_PERFORMANCE "enable high performance" on)
 option(MSLITE_STRING_KERNEL "enable string kernel" on)
-option(MSLITE_CONTROL_TENSORLIST "enable control and tensorlist" on)
+option(MSLITE_CONTROLFLOW_TENSORLIST "enable control and tensorlist" on)
 option(MSLITE_AUTO_PARALLEL "enable automatic parallelism" on)
-option(MSLITE_HUFFMAN_DECODE "enable huffman decode" on)
+option(MSLITE_WEIGHT_DECODE "enable weight decode" on)
+option(MSLITE_CUSTOM_KERNEL_REGISTRY "enable extend kernel registry" on)
+option(MSLITE_ENABLE_MINDRT "enable mindrt use" on)
+option(MSLITE_DELEGATE_USE "enable delegate use" on)
+option(MSLITE_ENABLE_V0 "support v0 schema" on)
 
-# Option that can be configured through manually
+#Option that can be configured through manually
 option(ENABLE_VERBOSE "" off)
-option(ENABLE_MINDRT "if support mindrt" on)
 option(ENABLE_MODEL_OBF "if support model obfuscation" off)
-option(ENABLE_V0 "support v0 schema" on)
 set(BUILD_MINDDATA "lite_cv" CACHE STRING "off, lite, lite_cv, wrapper or full")
 
 if(APPLE)
@@ -83,34 +90,32 @@ endif()
 if(DEFINED ENV{MSLITE_COMPILE_NNIE})
     set(MSLITE_COMPILE_NNIE $ENV{MSLITE_COMPILE_NNIE})
 endif()
+if(DEFINED ENV{MSLITE_ENABLE_RUNTIME_PASS})
+    set(MSLITE_ENABLE_RUNTIME_PASS $ENV{MSLITE_ENABLE_RUNTIME_PASS})
+endif()
 if(DEFINED ENV{MSLITE_ENABLE_HIGH_PERFORMANCE})
     set(MSLITE_ENABLE_HIGH_PERFORMANCE $ENV{MSLITE_ENABLE_HIGH_PERFORMANCE})
 endif()
 if(DEFINED ENV{MSLITE_STRING_KERNEL})
     set(MSLITE_STRING_KERNEL $ENV{MSLITE_STRING_KERNEL})
 endif()
-if(DEFINED ENV{MSLITE_CONTROL_TENSORLIST})
-    set(MSLITE_CONTROL_TENSORLIST $ENV{MSLITE_CONTROL_TENSORLIST})
+if(DEFINED ENV{MSLITE_CONTROLFLOW_TENSORLIST})
+    set(MSLITE_CONTROLFLOW_TENSORLIST $ENV{MSLITE_CONTROLFLOW_TENSORLIST})
 endif()
 if(DEFINED ENV{MSLITE_AUTO_PARALLEL})
     set(MSLITE_AUTO_PARALLEL $ENV{MSLITE_AUTO_PARALLEL})
 endif()
-if(DEFINED ENV{MSLITE_HUFFMAN_DECODE})
-    set(MSLITE_HUFFMAN_DECODE $ENV{MSLITE_HUFFMAN_DECODE})
+if(DEFINED ENV{MSLITE_WEIGHT_DECODE})
+    set(MSLITE_WEIGHT_DECODE $ENV{MSLITE_WEIGHT_DECODE})
 endif()
-
-
-if(MSLITE_STRING_KERNEL)
-    add_compile_definitions(ENABLE_STRING_KERNEL)
+if(DEFINED ENV{MSLITE_CUSTOM_KERNEL_REGISTRY})
+    set(MSLITE_CUSTOM_KERNEL_REGISTRY $ENV{MSLITE_CUSTOM_KERNEL_REGISTRY})
 endif()
-if(MSLITE_CONTROL_TENSORLIST)
-    add_compile_definitions(ENABLE_CONTROL_TENSORLIST)
+if(DEFINED ENV{MSLITE_ENABLE_MINDRT})
+    set(MSLITE_ENABLE_MINDRT $ENV{MSLITE_ENABLE_MINDRT})
 endif()
-if(MSLITE_AUTO_PARALLEL)
-    add_compile_definitions(ENABLE_AUTO_PARALLEL)
-endif()
-if(MSLITE_HUFFMAN_DECODE)
-    add_compile_definitions(ENABLE_HUFFMAN_DECODE)
+if(DEFINED ENV{MSLITE_DELEGATE_USE})
+    set(MSLITE_DELEGATE_USE $ENV{MSLITE_DELEGATE_USE})
 endif()
 
 if(PLATFORM_ARM64)
@@ -144,7 +149,7 @@ if(PLATFORM_ARM64 OR PLATFORM_ARM32)
     set(MSLITE_ENABLE_SSE off)
     set(MSLITE_ENABLE_AVX off)
     set(MSLITE_ENABLE_CONVERTER off)
-    #set for cross-compiling toolchain
+#set for cross - compiling toolchain
     set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH)
     set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH)
     set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE BOTH)
@@ -170,7 +175,22 @@ if(MSLITE_ENABLE_NPU)
 endif()
 
 if(TARGET_HIMIX200 OR TARGET_OHOS_LITE)
-  set(ENABLE_MINDRT off)
+  set(MSLITE_ENABLE_MINDRT off)
+endif()
+
+if(MSVC)
+  set(MSLITE_ENABLE_CONVERTER off)
+endif()
+
+if(MSLITE_ENABLE_CONVERTER AND (
+        NOT MSLITE_ENABLE_MINDRT
+        OR NOT MSLITE_STRING_KERNEL
+        OR NOT MSLITE_CONTROLFLOW_TENSORLIST
+        OR NOT MSLITE_WEIGHT_DECODE
+        OR NOT MSLITE_CUSTOM_KERNEL_REGISTRY))
+    message(FATAL_ERROR "If one of 'MSLITE_ENABLE_MINDRT MSLITE_STRING_KERNEL "
+            "MSLITE_CONTROLFLOW_TENSORLIST MSLITE_WEIGHT_DECODE MSLITE_CUSTOM_KERNEL_REGISTRY'"
+            "is configured as off, MSLITE_ENABLE_CONVERTER must also be configured as off")
 endif()
 
 message(STATUS "************MindSpore Lite Build Option:************")
@@ -183,6 +203,17 @@ message(STATUS "\tMSLITE_ENABLE_CONVERTER = \t${MSLITE_ENABLE_CONVERTER}")
 message(STATUS "\tMSLITE_ENABLE_TOOLS     = \t${MSLITE_ENABLE_TOOLS}")
 message(STATUS "\tMSLITE_ENABLE_TESTCASES = \t${MSLITE_ENABLE_TESTCASES}")
 message(STATUS "\tMSLITE_ENABLE_HIGH_PERFORMANCE = \t${MSLITE_ENABLE_HIGH_PERFORMANCE}")
+message(STATUS "\tMSLITE_ENABLE_RUNTIME_PASS = \t${MSLITE_ENABLE_RUNTIME_PASS}")
+message(STATUS "\tMSLITE_STRING_KERNEL = \t${MSLITE_STRING_KERNEL}")
+message(STATUS "\tMSLITE_CONTROLFLOW_TENSORLIST = \t${MSLITE_CONTROLFLOW_TENSORLIST}")
+message(STATUS "\tMSLITE_AUTO_PARALLEL = \t${MSLITE_AUTO_PARALLEL}")
+message(STATUS "\tMSLITE_WEIGHT_DECODE = \t${MSLITE_WEIGHT_DECODE}")
+message(STATUS "\tMSLITE_CUSTOM_KERNEL_REGISTRY = \t${MSLITE_CUSTOM_KERNEL_REGISTRY}")
+message(STATUS "\tMSLITE_ENABLE_MINDRT = \t${MSLITE_ENABLE_MINDRT}")
+message(STATUS "\tMSLITE_ENABLE_V0 = \t${MSLITE_ENABLE_V0}")
+message(STATUS "\tBUILD_MINDDATA = \t${BUILD_MINDDATA}")
+message(STATUS "\tMSLITE_DELEGATE_USE = \t${MSLITE_DELEGATE_USE}")
+
 
 if(MSLITE_ENABLE_HIGH_PERFORMANCE)
     add_compile_definitions(ENABLE_HIGH_PERFORMANCE)
@@ -197,51 +228,6 @@ if(ENABLE_ASAN)
 endif()
 
 set(PKG_NAME_PREFIX mindspore-lite-${MS_VERSION_MAJOR}.${MS_VERSION_MINOR}.${MS_VERSION_REVISION})
-set(CMAKE_SKIP_RPATH TURE)
-
-if(MSVC)
-    add_compile_definitions(SUPPORT_MSVC)
-    add_compile_definitions(_ENABLE_ATOMIC_ALIGNMENT_FIX)
-    set(CMAKE_C_FLAGS "/O2 /EHsc /GS /Zi /utf-8")
-    set(CMAKE_CXX_FLAGS "/O2 /EHsc /GS /Zi /utf-8 /std:c++17")
-    if(CMAKE_SIZEOF_VOID_P EQUAL 4)
-        set(CMAKE_SHARED_LINKER_FLAGS "/SAFESEH ${CMAKE_SHARED_LINKER_FLAGS}")
-        set(CMAKE_EXE_LINKER_FLAGS "/SAFESEH ${CMAKE_EXE_LINKER_FLAGS}")
-    endif()
-    set(CMAKE_SHARED_LINKER_FLAGS "/NXCOMPAT /DYNAMICBASE /DEBUG ${CMAKE_SHARED_LINKER_FLAGS}")
-    set(CMAKE_EXE_LINKER_FLAGS "/NXCOMPAT /DYNAMICBASE /DEBUG ${CMAKE_EXE_LINKER_FLAGS}")
-else()
-    string(REPLACE "-g" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
-    string(REPLACE "-g" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-    set(LITE_COMPILE_FLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O2 -Wall -Werror -fstack-protector-strong -Wno-attributes \
-                            -Wno-deprecated-declarations -Wno-missing-braces")
-    set(CMAKE_C_FLAGS "${LITE_COMPILE_FLAGS} ${CMAKE_C_FLAGS}")
-    set(CMAKE_C_FLAGS_DEBUG "-DDebug -g -fvisibility=default")
-
-    set(CMAKE_CXX_FLAGS "${LITE_COMPILE_FLAGS} -Wno-overloaded-virtual ${CMAKE_CXX_FLAGS} -std=c++17")
-    set(CMAKE_CXX_FLAGS_DEBUG "-DDebug -g -fvisibility=default")
-
-    if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
-        string(REPLACE "-O2" "-O0" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
-        string(REPLACE "-O2" "-O0" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-    endif()
-
-    if(WIN32)
-        if(CMAKE_SIZEOF_VOID_P EQUAL 4)
-            set(CMAKE_SHARED_LINKER_FLAGS "-Wl,--no-seh ${CMAKE_SHARED_LINKER_FLAGS}")
-            set(CMAKE_EXE_LINKER_FLAGS "-Wl,--no-seh ${CMAKE_EXE_LINKER_FLAGS}")
-        endif()
-        set(CMAKE_SHARED_LINKER_FLAGS "-Wl,--nxcompat -Wl,--dynamicbase ${CMAKE_SHARED_LINKER_FLAGS}")
-        set(CMAKE_EXE_LINKER_FLAGS "-Wl,--nxcompat -Wl,--dynamicbase ${CMAKE_EXE_LINKER_FLAGS}")
-    else()
-        set(CMAKE_SHARED_LINKER_FLAGS "-Wl,-z,relro,-z,now -Wl,-z,noexecstack -s ${CMAKE_SHARED_LINKER_FLAGS}")
-        set(CMAKE_EXE_LINKER_FLAGS "-Wl,-z,relro,-z,now -Wl,-z,noexecstack -s -pie ${CMAKE_EXE_LINKER_FLAGS}")
-        if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
-            string(REPLACE "-s " "" CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS}")
-            string(REPLACE "-s " "" CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}")
-        endif()
-    endif()
-endif()
 
 if(SUPPORT_NPU)
     set(DDK_PATH "$ENV{HWHIAI_DDK}/ddk/ai_ddk_lib")
@@ -374,10 +360,8 @@ if(WIN32)
     add_compile_definitions(BUILDING_DLL)
 endif()
 
-if(ENABLE_MINDRT OR TARGET_HIMIX200 OR TARGET_OHOS_LITE)
-    include_directories(${CORE_DIR}/mindrt/include)
-    include_directories(${CORE_DIR}/mindrt/src)
-endif()
+include_directories(${CORE_DIR}/mindrt/include)
+include_directories(${CORE_DIR}/mindrt/src)
 
 if(NOT WIN32 AND NOT APPLE)
     if(ENABLE_MODEL_OBF)
@@ -403,7 +387,7 @@ if(MSLITE_ENABLE_CONVERTER)
     add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/tools/converter)
 endif()
 
-if(ENABLE_MINDRT)
+if(MSLITE_ENABLE_MINDRT)
     add_compile_definitions(ENABLE_MINDRT)
 endif()
 
diff --git a/mindspore/lite/build_lite.sh b/mindspore/lite/build_lite.sh
index 65cec694837..95d0b4b8cc0 100755
--- a/mindspore/lite/build_lite.sh
+++ b/mindspore/lite/build_lite.sh
@@ -159,7 +159,7 @@ build_lite() {
         pkg_name=mindspore-lite-${VERSION_STR}-ios-aarch64
         cmake -DCMAKE_TOOLCHAIN_FILE=${BASEPATH}/cmake/lite_ios.cmake -DARCHS="arm64" -DENABLE_BITCODE=0                   \
               -DCMAKE_BUILD_TYPE="Release" -DBUILD_MINDDATA="" -DPLATFORM_ARM64="on" -DENABLE_NEON="on" -DENABLE_FP16="on" \
-              -DMSLITE_ENABLE_TRAIN="off" -DENABLE_MINDRT="on" -DMSLITE_GPU_BACKEND="off" -DMSLITE_ENABLE_NPU="off"        \
+              -DMSLITE_ENABLE_TRAIN="off" -DMSLITE_GPU_BACKEND="off" -DMSLITE_ENABLE_NPU="off"        \
               -DENABLE_ASAN=${ENABLE_ASAN} -DCMAKE_INSTALL_PREFIX=${BUILD_PATH}/output/tmp -G Xcode ..
       else
         checkndk
@@ -176,7 +176,7 @@ build_lite() {
         pkg_name=mindspore-lite-${VERSION_STR}-ios-aarch32
         cmake -DCMAKE_TOOLCHAIN_FILE=${BASEPATH}/cmake/lite_ios.cmake -DARCHS="armv7;armv7s" -DENABLE_BITCODE=0     \
               -DCMAKE_BUILD_TYPE="Release" -DBUILD_MINDDATA="" -DPLATFORM_ARM32="on" -DENABLE_NEON="on"             \
-              -DMSLITE_ENABLE_TRAIN="off" -DENABLE_MINDRT="on" -DMSLITE_GPU_BACKEND="off" -DMSLITE_ENABLE_NPU="off" \
+              -DMSLITE_ENABLE_TRAIN="off" -DMSLITE_GPU_BACKEND="off" -DMSLITE_ENABLE_NPU="off" \
               -DENABLE_ASAN=${ENABLE_ASAN} -DCMAKE_INSTALL_PREFIX=${BUILD_PATH}/output/tmp -G Xcode ..
       else
         checkndk
diff --git a/mindspore/lite/examples/converter_extend/src/custom_add_infer.cc b/mindspore/lite/examples/converter_extend/src/custom_add_infer.cc
index 2470f66eb68..f1034c2188b 100644
--- a/mindspore/lite/examples/converter_extend/src/custom_add_infer.cc
+++ b/mindspore/lite/examples/converter_extend/src/custom_add_infer.cc
@@ -16,7 +16,7 @@
 
 #include "src/custom_common.h"
 #include "include/errorcode.h"
-#include "include/registry/kernel_interface.h"
+#include "include/registry/register_kernel_interface.h"
 
 namespace mindspore {
 /**
@@ -28,17 +28,19 @@ class CustomAddInfer : public kernel::KernelInterface {
   CustomAddInfer() = default;
   ~CustomAddInfer() = default;
 
-  int Infer(std::vector<mindspore::MSTensor> *inputs, std::vector<mindspore::MSTensor> *outputs,
-            const schema::Primitive *primitive) override {
+  Status Infer(std::vector<mindspore::MSTensor> *inputs, std::vector<mindspore::MSTensor> *outputs,
+               const schema::Primitive *primitive) override {
     (*outputs)[0].SetFormat((*inputs)[0].format());
     (*outputs)[0].SetDataType((*inputs)[0].DataType());
     auto ret = common::CheckInputs(*inputs);
-    if (ret != lite::RET_OK) {
+    if (ret == lite::RET_INFER_INVALID) {
       (*outputs)[0].SetShape({-1});  // shape{-1} shows that shape need to be inferred when running.
-      return ret;
+      return kLiteInferInvalid;
+    } else if (ret != lite::RET_OK) {
+      return kLiteError;
     }
     (*outputs)[0].SetShape((*inputs)[0].Shape());
-    return lite::RET_OK;
+    return kSuccess;
   }
 };
 std::shared_ptr<kernel::KernelInterface> CustomAddInferCreator() { return std::make_shared<CustomAddInfer>(); }
diff --git a/mindspore/lite/examples/converter_extend/src/pass_registry_tutorial.cc b/mindspore/lite/examples/converter_extend/src/pass_registry_tutorial.cc
index b3d66a31e53..ffc3256e69f 100644
--- a/mindspore/lite/examples/converter_extend/src/pass_registry_tutorial.cc
+++ b/mindspore/lite/examples/converter_extend/src/pass_registry_tutorial.cc
@@ -94,9 +94,12 @@ bool PassTutorial::Run(const FuncGraphPtr &func_graph) {
   }
   return true;
 }
-
-// register customed Pass
-REG_PASS(PassTutorial, PassTutorial)
-REG_SCHEDULED_PASS(POSITION_BEGIN, {"PassTutorial"})
 }  // namespace opt
+
+namespace lite {
+// register customed Pass
+using mindspore::registry::POSITION_BEGIN;
+REG_PASS(PassTutorial, opt::PassTutorial)
+REG_SCHEDULED_PASS(POSITION_BEGIN, {"PassTutorial"})
+}  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/examples/runtime_extend/src/custom_add_infer.cc b/mindspore/lite/examples/runtime_extend/src/custom_add_infer.cc
index 40eff5d4c01..3b11d737e50 100644
--- a/mindspore/lite/examples/runtime_extend/src/custom_add_infer.cc
+++ b/mindspore/lite/examples/runtime_extend/src/custom_add_infer.cc
@@ -16,7 +16,7 @@
 
 #include "src/custom_common.h"
 #include "include/errorcode.h"
-#include "include/registry/kernel_interface.h"
+#include "include/registry/register_kernel_interface.h"
 
 namespace mindspore {
 /**
@@ -28,17 +28,19 @@ class CustomAddInfer : public kernel::KernelInterface {
   CustomAddInfer() = default;
   ~CustomAddInfer() = default;
 
-  int Infer(std::vector<mindspore::MSTensor> *inputs, std::vector<mindspore::MSTensor> *outputs,
-            const schema::Primitive *primitive) override {
+  Status Infer(std::vector<mindspore::MSTensor> *inputs, std::vector<mindspore::MSTensor> *outputs,
+               const schema::Primitive *primitive) override {
     (*outputs)[0].SetFormat((*inputs)[0].format());
     (*outputs)[0].SetDataType((*inputs)[0].DataType());
     auto ret = common::CheckInputs(*inputs);
-    if (ret != lite::RET_OK) {
+    if (ret == lite::RET_INFER_INVALID) {
       (*outputs)[0].SetShape({-1});  // shape{-1} shows that shape need to be inferred when running.
-      return ret;
+      return kLiteInferInvalid;
+    } else if (ret != lite::RET_OK) {
+      return kLiteError;
     }
     (*outputs)[0].SetShape((*inputs)[0].Shape());
-    return lite::RET_OK;
+    return kSuccess;
   }
 };
 std::shared_ptr<kernel::KernelInterface> CustomAddInferCreator() { return std::make_shared<CustomAddInfer>(); }
diff --git a/mindspore/lite/examples/runtime_extend/src/custom_add_kernel.cc b/mindspore/lite/examples/runtime_extend/src/custom_add_kernel.cc
index b9fd71b7fe0..045ed2a3301 100644
--- a/mindspore/lite/examples/runtime_extend/src/custom_add_kernel.cc
+++ b/mindspore/lite/examples/runtime_extend/src/custom_add_kernel.cc
@@ -20,16 +20,21 @@
 #include <vector>
 #include "src/custom_common.h"
 #include "include/errorcode.h"
-#include "include/registry/kernel_interface.h"
+#include "include/registry/register_kernel_interface.h"
 #include "include/registry/register_kernel.h"
 
 namespace mindspore {
 namespace kernel {
+namespace {
+const auto kFloat32 = DataType::kNumberTypeFloat32;
+}
 class CustomAddKernel : public Kernel {
  public:
   CustomAddKernel(const std::vector<MSTensor> &inputs, const std::vector<MSTensor> &outputs,
                   const schema::Primitive *primitive, const mindspore::Context *ctx)
       : Kernel(inputs, outputs, primitive, ctx) {}
+  ~CustomAddKernel() = default;
+
   // Prepare will be called during graph compilation
   int Prepare() override { return lite::RET_OK; }
 
@@ -57,12 +62,13 @@ class CustomAddKernel : public Kernel {
   // if output shape exists value -1, need to be inferred before applying memory for output tensor.
   int PreProcess() {
     if (common::CheckOutputs(outputs_) != lite::RET_OK) {
-      auto ret = RegisterKernelInterface::GetKernelInterface({}, primitive_)->Infer(&inputs_, &outputs_, primitive_);
-      if (ret != lite::RET_OK) {
+      auto status =
+        registry::RegisterKernelInterface::GetKernelInterface({}, primitive_)->Infer(&inputs_, &outputs_, primitive_);
+      if (status != kSuccess) {
         std::cerr << "infer failed." << std::endl;
         return lite::RET_ERROR;
       }
-      ret = ReSize();
+      auto ret = ReSize();
       if (ret != lite::RET_OK) {
         std::cerr << "resize failed." << std::endl;
         return ret;
@@ -105,6 +111,6 @@ std::shared_ptr<Kernel> CustomAddCreator(const std::vector<MSTensor> &inputs, co
                                          const schema::Primitive *primitive, const mindspore::Context *ctx) {
   return std::make_shared<CustomAddKernel>(inputs, outputs, primitive, ctx);
 }
-REGISTER_CUSTOM_KERNEL(CPU, Tutorial, kNumberTypeFloat32, Custom_Add, CustomAddCreator)
+REGISTER_CUSTOM_KERNEL(CPU, Tutorial, kFloat32, Custom_Add, CustomAddCreator)
 }  // namespace kernel
 }  // namespace mindspore
diff --git a/mindspore/lite/examples/unified_api/src/inference.cc b/mindspore/lite/examples/unified_api/src/inference.cc
index 355871e4125..23d133c79b4 100644
--- a/mindspore/lite/examples/unified_api/src/inference.cc
+++ b/mindspore/lite/examples/unified_api/src/inference.cc
@@ -54,7 +54,7 @@ int main(int argc, char **argv) {
   context->MutableDeviceInfo().push_back(cpu_context);
 
   mindspore::Graph graph;
-  auto status = mindspore::Serialization::Load(infer_model_fn, mindspore::kFlatBuffer, &graph);
+  auto status = mindspore::Serialization::Load(infer_model_fn, mindspore::kMindIR, &graph);
   if (status != mindspore::kSuccess) {
     std::cout << "Error " << status << " during serialization of graph " << infer_model_fn;
     MS_ASSERT(status != mindspore::kSuccess);
diff --git a/mindspore/lite/examples/unified_api/src/net_runner.cc b/mindspore/lite/examples/unified_api/src/net_runner.cc
index 238dfa8e9ac..94dbb02b8c2 100644
--- a/mindspore/lite/examples/unified_api/src/net_runner.cc
+++ b/mindspore/lite/examples/unified_api/src/net_runner.cc
@@ -148,7 +148,7 @@ void NetRunner::InitAndFigureInputs() {
   context->MutableDeviceInfo().push_back(cpu_context);
 
   graph_ = new mindspore::Graph();
-  auto status = mindspore::Serialization::Load(ms_file_, mindspore::kFlatBuffer, graph_);
+  auto status = mindspore::Serialization::Load(ms_file_, mindspore::kMindIR, graph_);
   if (status != mindspore::kSuccess) {
     std::cout << "Error " << status << " during serialization of graph " << ms_file_;
     MS_ASSERT(status != mindspore::kSuccess);
diff --git a/mindspore/lite/include/errorcode.h b/mindspore/lite/include/errorcode.h
index 9ff4e093795..796aeea9481 100644
--- a/mindspore/lite/include/errorcode.h
+++ b/mindspore/lite/include/errorcode.h
@@ -27,7 +27,7 @@ using STATUS = int;
 /* Success */
 constexpr int RET_OK = 0; /**< No error occurs. */
 
-/* Common error code, range: [-1, -100）*/
+/* Common error code, range: [-1, -100) */
 constexpr int RET_ERROR = -1;             /**< Common error code. */
 constexpr int RET_NULL_PTR = -2;          /**< NULL pointer returned.*/
 constexpr int RET_PARAM_INVALID = -3;     /**< Invalid parameter.*/
@@ -58,7 +58,7 @@ constexpr int RET_FORMAT_ERR = -400; /**< Failed to checking tensor format. */
 constexpr int RET_INFER_ERR = -500;     /**< Failed to infer shape. */
 constexpr int RET_INFER_INVALID = -501; /**< Invalid infer shape before runtime. */
 
-/* User input param error code, range: [-600, 700)*/
+/* User input param error code, range: [-600, 700) */
 constexpr int RET_INPUT_PARAM_INVALID = -600; /**< Invalid input param by user. */
 
 /// \brief Print description of errorcode.
diff --git a/mindspore/lite/include/lite_utils.h b/mindspore/lite/include/lite_utils.h
index 92aa7856cf0..68499e79a98 100644
--- a/mindspore/lite/include/lite_utils.h
+++ b/mindspore/lite/include/lite_utils.h
@@ -34,12 +34,16 @@
 
 #ifndef MS_API
 #ifdef _WIN32
+#ifdef _MSC_VER
 #ifdef BUILDING_DLL
 #define MS_API __declspec(dllexport)
 #else
 #define MS_API __declspec(dllimport)
 #endif
 #else
+#define MS_API __declspec(dllexport)
+#endif
+#else
 #define MS_API __attribute__((visibility("default")))
 #endif
 #endif
diff --git a/mindspore/lite/include/registry/framework.h b/mindspore/lite/include/registry/framework.h
deleted file mode 100644
index 223606e69ca..00000000000
--- a/mindspore/lite/include/registry/framework.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_LITE_INCLUDE_REGISTRY_FRAMEWORK_H_
-#define MINDSPORE_LITE_INCLUDE_REGISTRY_FRAMEWORK_H_
-
-#include "include/lite_utils.h"
-
-namespace mindspore {
-namespace lite {
-namespace converter {
-/// \brief FmkType defined frameworks which converter tool supports.
-enum MS_API FmkType : int {
-  FmkType_TF = 0,
-  FmkType_CAFFE = 1,
-  FmkType_ONNX = 2,
-  FmkType_MS = 3,
-  FmkType_TFLITE = 4,
-};
-}  // namespace converter
-}  // namespace lite
-}  // namespace mindspore
-#endif  // MINDSPORE_LITE_INCLUDE_REGISTRY_FRAMEWORK_H_
diff --git a/mindspore/lite/include/registry/kernel_interface.h b/mindspore/lite/include/registry/kernel_interface.h
deleted file mode 100644
index 4ca4d05cb74..00000000000
--- a/mindspore/lite/include/registry/kernel_interface.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_LITE_INCLUDE_REGISTRY_KERNEL_INTERFACE_H_
-#define MINDSPORE_LITE_INCLUDE_REGISTRY_KERNEL_INTERFACE_H_
-
-#include <set>
-#include <string>
-#include <vector>
-#include <memory>
-#include "include/model.h"
-#include "include/api/types.h"
-#include "schema/model_generated.h"
-
-namespace mindspore {
-namespace kernel {
-/// \brief KernelInterface defined customized op's interface, such as infershape, and so on.
-class MS_API KernelInterface {
- public:
-  /// \brief Destructor of KernelInterface.
-  virtual ~KernelInterface() = default;
-
-  /// \brief Method to infer customized op's output shape.
-  ///
-  /// \param[in] inputs Define the input tensors of op.
-  /// \param[in] outputs Define the output tensors of op.
-  /// \param[in] primitive Define the attributes of op.
-  ///
-  /// \return  STATUS as an error code of inferring, STATUS is defined in errorcode.h..
-  virtual int Infer(std::vector<mindspore::MSTensor> *inputs, std::vector<mindspore::MSTensor> *outputs,
-                    const schema::Primitive *primitive) {
-    return 0;
-  }
-};
-
-/// \brief KernelInterfaceCreator defined a functor to create KernelInterface.
-using KernelInterfaceCreator = std::function<std::shared_ptr<KernelInterface>()>;
-
-/// \brief RegisterKernelInterface defined registration and acquisition of KernelInterface.
-class MS_API RegisterKernelInterface {
- public:
-  /// \brief Static method to register op whose primitive type is custom.
-  ///
-  /// \param[in] provider Define the identification of user.
-  /// \param[in] op_type Define the concrete type of a custom op.
-  /// \param[in] creator Define the KernelInterface create function.
-  ///
-  /// \return STATUS as an error code of registering, STATUS is defined in errorcode.h.
-  static int CustomReg(const std::string &provider, const std::string &op_type, KernelInterfaceCreator creator);
-
-  /// \brief Static method to register op whose primitive type is ordinary.
-  ///
-  /// \param[in] provider Define the identification of user.
-  /// \param[in] op_type Define the ordinary op type.
-  /// \param[in] creator Define the KernelInterface create function.
-  ///
-  /// \return STATUS as an error code of registering, STATUS is defined in errorcode.h.
-  static int Reg(const std::string &provider, int op_type, KernelInterfaceCreator creator);
-
-  /// \brief Static method to get registration of a certain op.
-  ///
-  /// \param[in] provider Define the identification of user.
-  /// \param[in] primitive Define the attributes of a certain op.
-  ///
-  /// \return Boolean value to represent registration of a certain op is existing or not.
-  static std::shared_ptr<kernel::KernelInterface> GetKernelInterface(const std::string &provider,
-                                                                     const schema::Primitive *primitive);
-};
-
-/// \brief KernelInterfaceReg defined registration class of KernelInterface.
-class MS_API KernelInterfaceReg {
- public:
-  /// \brief Constructor of KernelInterfaceReg to register an ordinary op.
-  ///
-  /// \param[in] provider Define the identification of user.
-  /// \param[in] op_type Define the ordinary op type.
-  /// \param[in] creator Define the KernelInterface create function.
-  KernelInterfaceReg(const std::string &provider, int op_type, KernelInterfaceCreator creator) {
-    RegisterKernelInterface::Reg(provider, op_type, creator);
-  }
-
-  /// \brief Constructor of KernelInterfaceReg to register custom op.
-  ///
-  /// \param[in] provider Define the identification of user.
-  /// \param[in] op_type Define the concrete type of a custom op.
-  /// \param[in] creator Define the KernelInterface create function.
-  KernelInterfaceReg(const std::string &provider, const std::string &op_type, KernelInterfaceCreator creator) {
-    RegisterKernelInterface::CustomReg(provider, op_type, creator);
-  }
-};
-
-/// \brief Defined registering macro to register ordinary op, which called by user directly.
-///
-/// \param[in] provider Define the identification of user.
-/// \param[in] op_type Define the ordinary op type.
-/// \param[in] creator Define the KernelInterface create function.
-#define REGISTER_KERNEL_INTERFACE(provider, op_type, creator)                                                  \
-  namespace {                                                                                                  \
-  static mindspore::kernel::KernelInterfaceReg g_##provider##op_type##_inter_reg(#provider, op_type, creator); \
-  }  // namespace
-
-/// \brief Defined registering macro to register custom op, which called by user directly.
-///
-/// \param[in] provider Define the identification of user.
-/// \param[in] op_type Define the concrete type of a custom op.
-/// \param[in] creator Define the KernelInterface create function.
-#define REGISTER_CUSTOM_KERNEL_INTERFACE(provider, op_type, creator)                                                   \
-  namespace {                                                                                                          \
-  static mindspore::kernel::KernelInterfaceReg g_##provider##op_type##_custom_inter_reg(#provider, #op_type, creator); \
-  }  // namespace
-}  // namespace kernel
-}  // namespace mindspore
-
-#endif  // MINDSPORE_LITE_INCLUDE_REGISTRY_KERNEL_INTERFACE_H_
diff --git a/mindspore/lite/include/registry/model_parser_registry.h b/mindspore/lite/include/registry/model_parser_registry.h
index ea9e081dc44..5b6a0b5899a 100644
--- a/mindspore/lite/include/registry/model_parser_registry.h
+++ b/mindspore/lite/include/registry/model_parser_registry.h
@@ -17,82 +17,43 @@
 #ifndef MINDSPORE_LITE_INCLUDE_REGISTRY_MODEL_PARSER_REGISTRY_H
 #define MINDSPORE_LITE_INCLUDE_REGISTRY_MODEL_PARSER_REGISTRY_H
 
-#include <map>
 #include <memory>
-#include <string>
 #include "include/lite_utils.h"
-#include "include/registry/framework.h"
-#include "schema/inner/model_generated.h"
-
-using mindspore::lite::converter::FmkType;
-namespace mindspore::lite {
-namespace converter {
-/// \brief ConverterParameters defined read-only converter parameters used by users in ModelParser.
-struct MS_API ConverterParameters {
-  FmkType fmk_;
-  schema::QuantType quant_type_;
-  std::string model_file_;
-  std::string weight_file_;
-  std::map<std::string, std::string> attrs_;
-};
-}  // namespace converter
-
-/// \brief ModelParser defined a model parser
-class MS_API ModelParser;
+#include "include/registry/parser_context.h"
 
+using mindspore::converter::FmkType;
+namespace mindspore {
+namespace registry {
 /// \brief ModelParserCreator defined function pointer to get a ModelParser class.
-typedef ModelParser *(*ModelParserCreator)();
+typedef converter::ModelParser *(*ModelParserCreator)();
 
 /// \brief ModelParserRegistry defined registration and storage of ModelParser.
 class MS_API ModelParserRegistry {
  public:
   /// \brief Constructor of ModelParserRegistry.
-  ModelParserRegistry() = default;
+  ///
+  /// \param[in] fmk Define identification of a certain framework.
+  /// \param[in] creator Define function pointer of creating ModelParser.
+  ModelParserRegistry(FmkType fmk, ModelParserCreator creator);
 
   /// \brief Destructor of ModelParserRegistry.
   ~ModelParserRegistry() = default;
 
-  /// \brief Static method to get a single instance.
-  ///
-  /// \return Pointer of ModelParserRegistry.
-  static ModelParserRegistry *GetInstance();
-
-  /// \brief Method to get a model parser.
+  /// \brief Static Method to get a model parser.
   ///
   /// \param[in] fmk Define identification of a certain framework.
   ///
   /// \return Pointer of ModelParser.
-  ModelParser *GetModelParser(const FmkType fmk);
-
-  /// \brief Method to register model parser.
-  ///
-  /// \param[in] fmk Define identification of a certain framework.
-  /// \param[in] creator Define function pointer of creating ModelParser.
-  int RegParser(const FmkType fmk, ModelParserCreator creator);
-
-  std::map<FmkType, ModelParserCreator> parsers_;
-};
-
-/// \brief ModelRegistrar defined registration class of ModelParser.
-class MS_API ModelRegistrar {
- public:
-  /// \brief Constructor of ModelRegistrar to register ModelParser.
-  ///
-  /// \param[in] fmk Define identification of a certain framework.
-  /// \param[in] creator Define function pointer of creating ModelParser.
-  ModelRegistrar(const FmkType fmk, ModelParserCreator creator) {
-    ModelParserRegistry::GetInstance()->RegParser(fmk, creator);
-  }
-
-  /// \brief Destructor of ModelRegistrar.
-  ~ModelRegistrar() = default;
+  static converter::ModelParser *GetModelParser(FmkType fmk);
 };
 
 /// \brief Defined registering macro to register ModelParser, which called by user directly.
 ///
 /// \param[in] fmk Define identification of a certain framework.
 /// \param[in] parserCreator Define function pointer of creating ModelParser.
-#define REG_MODEL_PARSER(fmk, parserCreator) static ModelRegistrar g_##type##fmk##ModelParserReg(fmk, parserCreator);
-}  // namespace mindspore::lite
+#define REG_MODEL_PARSER(fmk, parserCreator) \
+  static mindspore::registry::ModelParserRegistry g_##type##fmk##ModelParserReg(fmk, parserCreator);
+}  // namespace registry
+}  // namespace mindspore
 
 #endif  // MINDSPORE_LITE_INCLUDE_REGISTRY_MODEL_PARSER_REGISTRY_H
diff --git a/mindspore/lite/include/registry/pass_registry.h b/mindspore/lite/include/registry/pass_registry.h
index dcd1f8e2bc1..3ed83e95e02 100644
--- a/mindspore/lite/include/registry/pass_registry.h
+++ b/mindspore/lite/include/registry/pass_registry.h
@@ -25,46 +25,63 @@
 
 namespace mindspore {
 namespace opt {
-/// \brief PassPosition defined where to plae user's pass.
-enum MS_API PassPosition { POSITION_BEGIN = 0, POSITION_END = 1 };
-
 /// \brief P defined a basic interface.
 ///
 /// \note List public class and interface for reference.
 class MS_API Pass;
 using PassPtr = std::shared_ptr<Pass>;
+}  // namespace opt
+
+namespace registry {
+/// \brief PassPosition defined where to plae user's pass.
+enum MS_API PassPosition { POSITION_BEGIN = 0, POSITION_END = 1 };
 
 /// \brief PassRegistry defined registration of Pass.
 class MS_API PassRegistry {
  public:
   /// \brief Constructor of PassRegistry to register pass.
   ///
-  /// \param[in] pos Define where to replace the pass.
-  /// \param[in] pass Define user's defined pass.
-  PassRegistry(const std::string &pass_name, const PassPtr &pass);
+  /// \param[in] pass_name Define the name of the pass, a string which should guarantee uniqueness.
+  /// \param[in] pass Define pass instance.
+  PassRegistry(const std::string &pass_name, const opt::PassPtr &pass);
 
   /// \brief Constructor of PassRegistry to assign which passes are required for external extension.
   ///
-  /// \param[in position Define the place where assigned passes will run.
-  /// \param[in] assigned Define the name of passes assigned by user.
-  PassRegistry(PassPosition position, const std::vector<std::string> &assigned);
+  /// \param[in] position Define the place where assigned passes will run.
+  /// \param[in] names Define the names of the passes.
+  PassRegistry(PassPosition position, const std::vector<std::string> &names);
 
   /// \brief Destructor of PassRegistrar.
   ~PassRegistry() = default;
+
+  /// \brief Static method to obtain external scheduling task assigned by user.
+  ///
+  /// \param[in] position Define the place where assigned passes will run.
+  ///
+  /// \return Passes' Name Vector.
+  static std::vector<std::string> GetOuterScheduleTask(PassPosition position);
+
+  /// \brief Static method to obtain pass instance according to passes' name.
+  ///
+  /// \param[in] pass_names Define the name of passes.
+  ///
+  /// \return Pass Instance Vector.
+  static std::vector<opt::PassPtr> GetPassFromStoreRoom(const std::vector<std::string> &pass_names);
 };
 
 /// \brief Defined registering macro to register Pass, which called by user directly.
 ///
-/// \param[in] name Define name of user's pass, which is a string.
-/// \param[in] pass Define user's defined pass.
-#define REG_PASS(name, pass) static PassRegistry g_##name##PassReg(#name, std::make_shared<pass>());
+/// \param[in] name Define the name of the pass, a string which should guarantee uniqueness.
+/// \param[in] pass Define pass instance.
+#define REG_PASS(name, pass) \
+  static mindspore::registry::PassRegistry g_##name##PassReg(#name, std::make_shared<pass>());
 
 /// \brief Defined assigning macro to assign Passes, which called by user directly.
 ///
 /// \param[in] position Define the place where assigned passes will run.
-/// \param[in] assigned Define the name of passes assigned by user.
-#define REG_SCHEDULED_PASS(position, assigned) static PassRegistry g_##position(position, assigned);
-}  // namespace opt
+/// \param[in] names Define the names of the passes.
+#define REG_SCHEDULED_PASS(position, names) static mindspore::registry::PassRegistry g_##position(position, names);
+}  // namespace registry
 }  // namespace mindspore
 
 #endif  // MINDSPORE_LITE_INCLUDE_REGISTRY_PASS_REGISTRY_H_
diff --git a/mindspore/lite/include/registry/register_kernel.h b/mindspore/lite/include/registry/register_kernel.h
index 21289bfd77f..753d0381590 100644
--- a/mindspore/lite/include/registry/register_kernel.h
+++ b/mindspore/lite/include/registry/register_kernel.h
@@ -25,10 +25,19 @@
 #include "include/api/context.h"
 #include "include/api/types.h"
 #include "include/api/kernel.h"
-#include "ir/dtype/type_id.h"
+#include "include/api/data_type.h"
+#include "include/api/status.h"
 
 namespace mindspore {
-namespace kernel {
+namespace registry {
+/// \brief KernelDesc defined kernel's basic attribute.
+struct KernelDesc {
+  DataType data_type;   /**< kernel data type argument */
+  int type;             /**< op type argument */
+  std::string arch;     /**< deviceType argument */
+  std::string provider; /**< user identification argument */
+};
+
 /// \brief CreateKernel Defined a functor to create a kernel.
 ///
 /// \param[in] inputs Define input tensors of kernel.
@@ -52,9 +61,9 @@ class MS_API RegisterKernel {
   /// \param[in] type Define the ordinary op type.
   /// \param[in] creator Define a function pointer to create a kernel.
   ///
-  /// \return STATUS as an error code of registering, STATUS is defined in errorcode.h.
-  static int RegKernel(const std::string &arch, const std::string &provider, TypeId data_type, int type,
-                       CreateKernel creator);
+  /// \return Status as a status identification of registering.
+  static Status RegKernel(const std::string &arch, const std::string &provider, DataType data_type, int type,
+                          CreateKernel creator);
 
   /// \brief Static method to register kernel which is corresponding to custom op.
   ///
@@ -64,9 +73,17 @@ class MS_API RegisterKernel {
   /// \param[in] type Define the concrete type of a custom op.
   /// \param[in] creator Define a function pointer to create a kernel.
   ///
-  /// \return STATUS as an error code of registering, STATUS is defined in errorcode.h.
-  static int RegCustomKernel(const std::string &arch, const std::string &provider, TypeId data_type,
-                             const std::string &type, CreateKernel creator);
+  /// \return Status as a status identification of registering.
+  static Status RegCustomKernel(const std::string &arch, const std::string &provider, DataType data_type,
+                                const std::string &type, CreateKernel creator);
+
+  /// \brief Static methon to get a kernel's create function.
+  ///
+  /// \param[in] desc Define kernel's basic attribute.
+  /// \param[in] primitive Define the primitive of kernel generated by flatbuffers.
+  ///
+  /// \return Function pointer to create a kernel.
+  static CreateKernel GetCreator(const schema::Primitive *primitive, KernelDesc *desc);
 };
 
 /// \brief KernelReg Defined registration class of kernel.
@@ -82,7 +99,8 @@ class MS_API KernelReg {
   /// \param[in] data_type Define kernel's input data type.
   /// \param[in] op_type Define the ordinary op type.
   /// \param[in] creator Define a function pointer to create a kernel.
-  KernelReg(const std::string &arch, const std::string &provider, TypeId data_type, int op_type, CreateKernel creator) {
+  KernelReg(const std::string &arch, const std::string &provider, DataType data_type, int op_type,
+            CreateKernel creator) {
     RegisterKernel::RegKernel(arch, provider, data_type, op_type, creator);
   }
 
@@ -93,7 +111,7 @@ class MS_API KernelReg {
   /// \param[in] data_type Define kernel's input data type.
   /// \param[in] op_type Define the concrete type of a custom op.
   /// \param[in] creator Define a function pointer to create a kernel.
-  KernelReg(const std::string &arch, const std::string &provider, TypeId data_type, const std::string &op_type,
+  KernelReg(const std::string &arch, const std::string &provider, DataType data_type, const std::string &op_type,
             CreateKernel creator) {
     RegisterKernel::RegCustomKernel(arch, provider, data_type, op_type, creator);
   }
@@ -106,10 +124,10 @@ class MS_API KernelReg {
 /// \param[in] data_type Define kernel's input data type.
 /// \param[in] op_type Define the ordinary op type.
 /// \param[in] creator Define a function pointer to create a kernel.
-#define REGISTER_KERNEL(arch, provider, data_type, op_type, creator)                                                 \
-  namespace {                                                                                                        \
-  static mindspore::kernel::KernelReg g_##arch##provider##data_type##op_type##kernelReg(#arch, #provider, data_type, \
-                                                                                        op_type, creator);           \
+#define REGISTER_KERNEL(arch, provider, data_type, op_type, creator)                                                   \
+  namespace {                                                                                                          \
+  static mindspore::registry::KernelReg g_##arch##provider##data_type##op_type##kernelReg(#arch, #provider, data_type, \
+                                                                                          op_type, creator);           \
   }  // namespace
 
 /// \brief Defined registering macro to register custom op kernel, which called by user directly.
@@ -119,12 +137,12 @@ class MS_API KernelReg {
 /// \param[in] data_type Define kernel's input data type.
 /// \param[in] op_type Define the concrete type of a custom op.
 /// \param[in] creator Define a function pointer to create a kernel.
-#define REGISTER_CUSTOM_KERNEL(arch, provider, data_type, op_type, creator)                                          \
-  namespace {                                                                                                        \
-  static mindspore::kernel::KernelReg g_##arch##provider##data_type##op_type##kernelReg(#arch, #provider, data_type, \
-                                                                                        #op_type, creator);          \
+#define REGISTER_CUSTOM_KERNEL(arch, provider, data_type, op_type, creator)                                            \
+  namespace {                                                                                                          \
+  static mindspore::registry::KernelReg g_##arch##provider##data_type##op_type##kernelReg(#arch, #provider, data_type, \
+                                                                                          #op_type, creator);          \
   }  // namespace
-}  // namespace kernel
+}  // namespace registry
 }  // namespace mindspore
 
 #endif  // MINDSPORE_LITE_INCLUDE_REGISTRY_REGISTER_KERNEL_H_
diff --git a/mindspore/lite/java/native/CMakeLists.txt b/mindspore/lite/java/native/CMakeLists.txt
index 1b7a4e38e57..8aedbace710 100644
--- a/mindspore/lite/java/native/CMakeLists.txt
+++ b/mindspore/lite/java/native/CMakeLists.txt
@@ -2,33 +2,19 @@ cmake_minimum_required(VERSION 3.10)
 project(Lite-java)
 
 set(BUILD_LITE "on")
-set(CMAKE_SKIP_RPATH TURE)
+
+include(${CMAKE_CURRENT_SOURCE_DIR}/../../cmake/secure_option.cmake)
+include(${CMAKE_CURRENT_SOURCE_DIR}/../../cmake/compile_link_option.cmake)
+
+if(TARGET_HIMIX200)
+    set(CMAKE_CXX_FLAGS "-Wno-error=maybe-uninitialized ${CMAKE_CXX_FLAGS}")
+endif()
 
 if(PLATFORM_ARM64 OR PLATFORM_ARM32)
     set(PLATFORM_ARM "on")
     add_compile_definitions(PLATFORM_ARM)
 endif()
 
-if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DDebug -g")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDebug -g")
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=default")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=default")
-else()
-    ## enable for binscope for release
-    set(CMAKE_C_FLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O2 -Wall -Werror -fstack-protector-strong -Wno-attributes \
-    -Wno-deprecated-declarations -Wno-missing-braces ${CMAKE_C_FLAGS}")
-    set(CMAKE_CXX_FLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O2 -Wall -Werror -fstack-protector-strong -Wno-attributes \
-    -Wno-deprecated-declarations -Wno-missing-braces -Wno-overloaded-virtual ${CMAKE_CXX_FLAGS}")
-    if(TARGET_HIMIX200)
-        set(CMAKE_CXX_FLAGS "-Wno-error=maybe-uninitialized ${CMAKE_CXX_FLAGS}")
-    endif()
-    if(NOT WIN32)
-        set(CMAKE_SHARED_LINKER_FLAGS "-Wl,-z,relro,-z,now -Wl,-z,noexecstack ${CMAKE_SHARED_LINKER_FLAGS}")
-        set(CMAKE_EXE_LINKER_FLAGS "-Wl,-z,relro,-z,now -Wl,-z,noexecstack ${CMAKE_EXE_LINKER_FLAGS}")
-    endif()
-endif()
-
 if(PLATFORM_ARM32 OR PLATFORM_ARM64)
     #for performance
     if("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
diff --git a/mindspore/lite/micro/cmake/file_list.cmake b/mindspore/lite/micro/cmake/file_list.cmake
index 86543f44cdc..2123deace5d 100644
--- a/mindspore/lite/micro/cmake/file_list.cmake
+++ b/mindspore/lite/micro/cmake/file_list.cmake
@@ -134,12 +134,12 @@ set(LITE_SRC
         ${LITE_DIR}/src/common/graph_util.cc
         ${LITE_DIR}/src/common/prim_util.cc
         ${LITE_DIR}/src/common/string_util.cc
+        ${LITE_DIR}/src/common/lite_utils.cc
         ${LITE_DIR}/src/common/tensor_util.cc
         ${LITE_DIR}/src/runtime/infer_manager.cc
-        ${LITE_DIR}/src/registry/kernel_interface.cc
+        ${LITE_DIR}/src/registry/register_kernel_interface.cc
         ${LITE_DIR}/src/registry/kernel_interface_registry.cc
         ${LITE_DIR}/src/registry/register_kernel.cc
-        ${LITE_DIR}/src/registry/register_utils.cc
         ${LITE_DIR}/src/registry/register_kernel_impl.cc
         ${LITE_DIR}/src/lite_model.cc
         ${LITE_DIR}/src/ms_tensor.cc
diff --git a/mindspore/lite/micro/coder/graph.cc b/mindspore/lite/micro/coder/graph.cc
index 4044fc4eb3b..54f820568aa 100644
--- a/mindspore/lite/micro/coder/graph.cc
+++ b/mindspore/lite/micro/coder/graph.cc
@@ -27,6 +27,7 @@
 #include "schema/inner/model_generated.h"
 #include "securec/include/securec.h"
 #include "src/common/prim_util.h"
+#include "src/lite_model.h"
 
 namespace mindspore::lite::micro {
 CoderGraph::~CoderGraph() {
@@ -249,7 +250,7 @@ void CoderGraph::DumpUnSupportLayer(Target target) {
     uint32_t input_idx = node->input_indices_.at(0);
     Tensor *t = all_tensors_.at(input_idx);
     TypeId dtype = t->data_type();
-    int pt = GetPrimitiveType(node->primitive_);
+    int pt = GetPrimitiveType(node->primitive_, reinterpret_cast<lite::LiteModel *>(model_)->GetSchemaVersion());
     CoderKey key(target, dtype, pt);
     // search from the opcoder registry
     if (OpCoderFactory::GetInstance()->FindOpCoder(key) == nullptr) {
diff --git a/mindspore/lite/micro/coder/opcoders/cmsis-nn/int8/conv2d_int8_coder.cc b/mindspore/lite/micro/coder/opcoders/cmsis-nn/int8/conv2d_int8_coder.cc
index 1aed5bb1e21..aa5d1aa86e6 100644
--- a/mindspore/lite/micro/coder/opcoders/cmsis-nn/int8/conv2d_int8_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/cmsis-nn/int8/conv2d_int8_coder.cc
@@ -174,14 +174,9 @@ int Conv2DInt8Coder::InitTmpBuffer() {
 
 std::unique_ptr<OperatorCoder> CmsisConv2DInt8OpCoderCreator(const std::vector<Tensor *> &in_tensors,
                                                              const std::vector<Tensor *> &out_tensors,
-                                                             const Model::Node *node, size_t node_index,
-                                                             Target target) {
+                                                             const Model::Node *node, size_t node_index, Target target,
+                                                             int schema_version) {
   MS_CHECK_PTR_RET_NULL(node);
-  int pt = GetPrimitiveType(node->primitive_);
-  if (pt != schema::PrimitiveType::PrimitiveType_Conv2DFusion) {
-    MS_LOG(ERROR) << "unmatched primitive type " << PrimitiveTypeName(pt);
-    return nullptr;
-  }
   std::unique_ptr<Conv2DInt8Coder> coder =
     std::make_unique<Conv2DInt8Coder>(in_tensors, out_tensors, node, node_index, target);
   return coder;
diff --git a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/conv2d_delegate_fp32_coder.cc b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/conv2d_delegate_fp32_coder.cc
index 742224a9688..3205a5b1435 100644
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/conv2d_delegate_fp32_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/conv2d_delegate_fp32_coder.cc
@@ -18,6 +18,7 @@
 #include "src/common/version_manager.h"
 #include "src/ops/populate/populate_register.h"
 #include "nnacl/fp32/winograd_utils.h"
+#include "nnacl/base/conv_common_base.h"
 #include "coder/opcoders/nnacl/fp32/convolution_fp32_coder.h"
 #include "coder/opcoders/nnacl/fp32/convolution_depthwise_fp32_coder.h"
 #include "coder/opcoders/nnacl/fp32/convolution_winograd_fp32_coder.h"
@@ -28,14 +29,14 @@ int ConvDelegateCoder::Prepare(CoderContext *const context) {
   SetInputOutputShapeInfo(reinterpret_cast<ConvParameter *>(parameter_), input_tensor_, output_tensor_);
   if (conv_coder_ == nullptr) {
     // need to select actual execute coder here
-    conv_coder_ = CPUConvolutionFP32CoderSelect(input_tensors_, output_tensors_, node_, node_index(), target_);
+    conv_coder_ =
+      CPUConvolutionFP32CoderSelect(input_tensors_, output_tensors_, node_, node_index(), target_, schema_version_);
     MS_CHECK_PTR(conv_coder_);
     const void *primitive = node_->primitive_;
     MS_CHECK_PTR(primitive);
-    int primitive_type = GetPrimitiveType(node_->primitive_);
-    int schema_version = VersionManager::GetInstance()->GetSchemaVersion();
-    ParameterGen parameter_gen =
-      PopulateRegistry::GetInstance()->GetParameterCreator(GetPrimitiveType(node_->primitive_), schema_version);
+    int primitive_type = GetPrimitiveType(node_->primitive_, schema_version_);
+    ParameterGen parameter_gen = PopulateRegistry::GetInstance()->GetParameterCreator(
+      GetPrimitiveType(node_->primitive_, schema_version_), schema_version_);
     MS_CHECK_PTR(parameter_gen);
     OpParameter *op_parameter = parameter_gen(node_->primitive_);
     MS_CHECK_PTR(op_parameter);
@@ -62,15 +63,14 @@ void SetInputOutputShapeInfo(ConvParameter *conv_param, const lite::Tensor *inpu
 
 std::unique_ptr<OperatorCoder> CPUConvolutionFP32CoderSelect(const std::vector<Tensor *> &in_tensors,
                                                              const std::vector<Tensor *> &out_tensors,
-                                                             const Model::Node *node, size_t node_index,
-                                                             Target target) {
+                                                             const Model::Node *node, size_t node_index, Target target,
+                                                             int schema_version) {
   const void *primitive = node->primitive_;
   if (primitive == nullptr) {
     return nullptr;
   }
-  int schema_version = VersionManager::GetInstance()->GetSchemaVersion();
-  ParameterGen paramGen =
-    PopulateRegistry::GetInstance()->GetParameterCreator(GetPrimitiveType(node->primitive_), schema_version);
+  ParameterGen paramGen = PopulateRegistry::GetInstance()->GetParameterCreator(
+    GetPrimitiveType(node->primitive_, schema_version), schema_version);
   MS_CHECK_PTR_RET_NULL(paramGen);
   auto conv_param = reinterpret_cast<ConvParameter *>(paramGen(node->primitive_));
   MS_CHECK_PTR_RET_NULL(conv_param);
@@ -89,40 +89,41 @@ std::unique_ptr<OperatorCoder> CPUConvolutionFP32CoderSelect(const std::vector<T
   std::unique_ptr<OperatorCoder> coder;
   if (kernel_h == 1 && kernel_w == 1) {
     MS_LOG(DEBUG) << "create ConvolutionFP32Coder";
-    coder = CPUOpCoderCreator<ConvolutionFP32Coder>(in_tensors, out_tensors, node, node_index, target);
+    coder = CPUOpCoderCreator<ConvolutionFP32Coder>(in_tensors, out_tensors, node, node_index, target, schema_version);
   } else if (use_winograd) {
     MS_LOG(DEBUG) << "create Conv2DWinogradFP32Coder";
     coder = std::make_unique<ConvolutionWinogradFP32Coder>(in_tensors, out_tensors, node, node_index, target, out_unit);
   } else {
     MS_LOG(DEBUG) << "create ConvolutionFP32Coder";
-    coder = CPUOpCoderCreator<ConvolutionFP32Coder>(in_tensors, out_tensors, node, node_index, target);
+    coder = CPUOpCoderCreator<ConvolutionFP32Coder>(in_tensors, out_tensors, node, node_index, target, schema_version);
   }
   return coder;
 }
 
 std::unique_ptr<OperatorCoder> CreateDelegateConv(const std::vector<Tensor *> &in_tensors,
                                                   const std::vector<Tensor *> &out_tensors, const Model::Node *node,
-                                                  size_t node_index, Target target) {
-  return CPUOpCoderCreator<ConvDelegateCoder>(in_tensors, out_tensors, node, node_index, target);
+                                                  size_t node_index, Target target, int schema_version) {
+  return CPUOpCoderCreator<ConvDelegateCoder>(in_tensors, out_tensors, node, node_index, target, schema_version);
 }
 
 std::unique_ptr<OperatorCoder> CPUConvDwFp32CoderCreator(const std::vector<Tensor *> &in_tensors,
                                                          const std::vector<Tensor *> &out_tensors,
-                                                         const Model::Node *node, size_t node_index, Target target) {
-  return CPUOpCoderCreator<ConvolutionDepthwiseFP32Coder>(in_tensors, out_tensors, node, node_index, target);
+                                                         const Model::Node *node, size_t node_index, Target target,
+                                                         int schema_version) {
+  return CPUOpCoderCreator<ConvolutionDepthwiseFP32Coder>(in_tensors, out_tensors, node, node_index, target,
+                                                          schema_version);
 }
 
 std::unique_ptr<OperatorCoder> CPUConv2DFusionFP32CoderCreator(const std::vector<Tensor *> &in_tensors,
                                                                const std::vector<Tensor *> &out_tensors,
                                                                const Model::Node *node, size_t node_index,
-                                                               Target target) {
+                                                               Target target, int schema_version) {
   const void *primitive = node->primitive_;
   if (primitive == nullptr) {
     return nullptr;
   }
-  int schema_version = VersionManager::GetInstance()->GetSchemaVersion();
-  ParameterGen paramGen =
-    PopulateRegistry::GetInstance()->GetParameterCreator(GetPrimitiveType(node->primitive_), schema_version);
+  ParameterGen paramGen = PopulateRegistry::GetInstance()->GetParameterCreator(
+    GetPrimitiveType(node->primitive_, schema_version), schema_version);
   if (paramGen == nullptr) {
     MS_LOG(ERROR) << "parameter generator is null";
     return nullptr;
@@ -130,9 +131,9 @@ std::unique_ptr<OperatorCoder> CPUConv2DFusionFP32CoderCreator(const std::vector
   auto conv_param = reinterpret_cast<ConvParameter *>(paramGen(node->primitive_));
   std::unique_ptr<OperatorCoder> coder;
   if (conv_param->group_ == 1) {
-    coder = CreateDelegateConv(in_tensors, out_tensors, node, node_index, target);
+    coder = CreateDelegateConv(in_tensors, out_tensors, node, node_index, target, schema_version);
   } else if (conv_param->group_ == conv_param->input_channel_ && conv_param->group_ == conv_param->output_channel_) {
-    coder = CPUConvDwFp32CoderCreator(in_tensors, out_tensors, node, node_index, target);
+    coder = CPUConvDwFp32CoderCreator(in_tensors, out_tensors, node, node_index, target, schema_version);
   } else {
     // GroupConv
     return nullptr;
diff --git a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/conv2d_delegate_fp32_coder.h b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/conv2d_delegate_fp32_coder.h
index bca09218a79..de80050c8b7 100644
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/conv2d_delegate_fp32_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/conv2d_delegate_fp32_coder.h
@@ -38,18 +38,22 @@ class ConvDelegateCoder : public OperatorCoder {
 void SetInputOutputShapeInfo(ConvParameter *conv_param, const lite::Tensor *input, const lite::Tensor *output);
 std::unique_ptr<OperatorCoder> CPUConvolutionFP32CoderSelect(const std::vector<Tensor *> &in_tensors,
                                                              const std::vector<Tensor *> &out_tensors,
-                                                             const Model::Node *node, size_t node_index, Target target);
+                                                             const Model::Node *node, size_t node_index, Target target,
+                                                             int schema_version);
+
 std::unique_ptr<OperatorCoder> CreateDelegateConv(const std::vector<Tensor *> &in_tensors,
                                                   const std::vector<Tensor *> &out_tensors, const Model::Node *node,
-                                                  size_t node_index, Target target);
+                                                  size_t node_index, Target target, int schema_version);
+
 std::unique_ptr<OperatorCoder> CPUConvDwFp32CoderCreator(const std::vector<Tensor *> &in_tensors,
                                                          const std::vector<Tensor *> &out_tensors,
-                                                         const Model::Node *node, size_t node_index, Target target);
+                                                         const Model::Node *node, size_t node_index, Target target,
+                                                         int schema_version);
 
 std::unique_ptr<OperatorCoder> CPUConv2DFusionFP32CoderCreator(const std::vector<Tensor *> &in_tensors,
                                                                const std::vector<Tensor *> &out_tensors,
                                                                const Model::Node *node, size_t node_index,
-                                                               Target target);
+                                                               Target target, int schema_version);
 
 }  // namespace mindspore::lite::micro::nnacl
 
diff --git a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/convolution_winograd_fp32_coder.cc b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/convolution_winograd_fp32_coder.cc
index c1c223515db..fad186922cf 100644
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/convolution_winograd_fp32_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/convolution_winograd_fp32_coder.cc
@@ -228,6 +228,7 @@ int ConvolutionWinogradFP32Coder::DoCode(CoderContext *const context) {
             "common_func_fp32.c",
             "fixed_point.c",
             "winograd_utils.c",
+            "conv_common_base.c",
             "minimal_filtering_generator.c",
           });
   if (target_ == kARM32A) {
diff --git a/mindspore/lite/micro/coder/opcoders/nnacl/int8/activation_int8_coder.cc b/mindspore/lite/micro/coder/opcoders/nnacl/int8/activation_int8_coder.cc
index 9b17986d7c8..ef83ef02861 100644
--- a/mindspore/lite/micro/coder/opcoders/nnacl/int8/activation_int8_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/int8/activation_int8_coder.cc
@@ -26,15 +26,14 @@ using mindspore::schema::PrimitiveType_Activation;
 namespace mindspore::lite::micro::nnacl {
 std::unique_ptr<OperatorCoder> CPUActivationINT8CoderCreator(const std::vector<Tensor *> &in_tensors,
                                                              const std::vector<Tensor *> &out_tensors,
-                                                             const Model::Node *node, size_t node_index,
-                                                             Target target) {
+                                                             const Model::Node *node, size_t node_index, Target target,
+                                                             int schema_version) {
   const void *primitive_c = node->primitive_;
   if (primitive_c == nullptr) {
     return nullptr;
   }
-  int schema_version = VersionManager::GetInstance()->GetSchemaVersion();
-  ParameterGen parameter_gen =
-    PopulateRegistry::GetInstance()->GetParameterCreator(GetPrimitiveType(node->primitive_), schema_version);
+  ParameterGen parameter_gen = PopulateRegistry::GetInstance()->GetParameterCreator(
+    GetPrimitiveType(node->primitive_, schema_version), schema_version);
   if (parameter_gen == nullptr) {
     MS_LOG(ERROR) << "parameter generator is nullptr";
     return nullptr;
@@ -42,7 +41,8 @@ std::unique_ptr<OperatorCoder> CPUActivationINT8CoderCreator(const std::vector<T
   OpParameter *parameter = parameter_gen(node->primitive_);
   if (parameter == nullptr) {
     MS_LOG(ERROR) << "PopulateParameter return nullptr, type: "
-                  << schema::EnumNamePrimitiveType((schema::PrimitiveType)GetPrimitiveType(node->primitive_));
+                  << schema::EnumNamePrimitiveType(
+                       (schema::PrimitiveType)GetPrimitiveType(node->primitive_, schema_version));
     return nullptr;
   }
   auto type = (reinterpret_cast<ActivationParameter *>(parameter))->type_;
@@ -50,13 +50,13 @@ std::unique_ptr<OperatorCoder> CPUActivationINT8CoderCreator(const std::vector<T
   std::unique_ptr<OperatorCoder> coder;
   switch (static_cast<schema::ActivationType>(type)) {
     case schema::ActivationType_SIGMOID:
-      coder = CPUOpCoderCreator<SigmodInt8Coder>(in_tensors, out_tensors, node, node_index, target);
+      coder = CPUOpCoderCreator<SigmodInt8Coder>(in_tensors, out_tensors, node, node_index, target, schema_version);
       break;
     case schema::ActivationType_RELU:
-      coder = CPUOpCoderCreator<ReluInt8Coder>(in_tensors, out_tensors, node, node_index, target);
+      coder = CPUOpCoderCreator<ReluInt8Coder>(in_tensors, out_tensors, node, node_index, target, schema_version);
       break;
     case schema::ActivationType_RELU6:
-      coder = CPUOpCoderCreator<Relu6Int8Coder>(in_tensors, out_tensors, node, node_index, target);
+      coder = CPUOpCoderCreator<Relu6Int8Coder>(in_tensors, out_tensors, node, node_index, target, schema_version);
       break;
     default:
       break;
diff --git a/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.cc b/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.cc
index 4df09b2b9d5..8bdd00e6b36 100644
--- a/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.cc
@@ -247,14 +247,14 @@ int Conv2DINT8Coder::DoCode(CoderContext *const context) {
 
 std::unique_ptr<OperatorCoder> CPUConv2DINT8CoderCreator(const std::vector<Tensor *> &in_tensors,
                                                          const std::vector<Tensor *> &out_tensors,
-                                                         const Model::Node *node, size_t node_index, Target target) {
+                                                         const Model::Node *node, size_t node_index, Target target,
+                                                         int schema_version) {
   const void *primitive = node->primitive_;
   if (primitive == nullptr) {
     return nullptr;
   }
-  int schema_version = VersionManager::GetInstance()->GetSchemaVersion();
-  ParameterGen paramGen =
-    PopulateRegistry::GetInstance()->GetParameterCreator(GetPrimitiveType(node->primitive_), schema_version);
+  ParameterGen paramGen = PopulateRegistry::GetInstance()->GetParameterCreator(
+    GetPrimitiveType(node->primitive_, schema_version), schema_version);
   if (paramGen == nullptr) {
     MS_LOG(ERROR) << "parameter generator is null";
     return nullptr;
@@ -269,11 +269,11 @@ std::unique_ptr<OperatorCoder> CPUConv2DINT8CoderCreator(const std::vector<Tenso
   free(conv_param);
   std::unique_ptr<OperatorCoder> coder;
   if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) {
-    coder = CPUOpCoderCreator<Conv2D3x3Int8Coder>(in_tensors, out_tensors, node, node_index, target);
+    coder = CPUOpCoderCreator<Conv2D3x3Int8Coder>(in_tensors, out_tensors, node, node_index, target, schema_version);
   } else if (kernel_h == 1 && kernel_w == 1) {
-    coder = CPUOpCoderCreator<Conv2D1x1Int8Coder>(in_tensors, out_tensors, node, node_index, target);
+    coder = CPUOpCoderCreator<Conv2D1x1Int8Coder>(in_tensors, out_tensors, node, node_index, target, schema_version);
   } else {
-    coder = CPUOpCoderCreator<Conv2DINT8Coder>(in_tensors, out_tensors, node, node_index, target);
+    coder = CPUOpCoderCreator<Conv2DINT8Coder>(in_tensors, out_tensors, node, node_index, target, schema_version);
   }
   if (coder == nullptr) {
     MS_LOG(ERROR) << "create conv2d int8 coder failed";
@@ -285,14 +285,13 @@ std::unique_ptr<OperatorCoder> CPUConv2DINT8CoderCreator(const std::vector<Tenso
 std::unique_ptr<OperatorCoder> CPUConv2DFusionINT8CoderCreator(const std::vector<Tensor *> &in_tensors,
                                                                const std::vector<Tensor *> &out_tensors,
                                                                const Model::Node *node, size_t node_index,
-                                                               Target target) {
+                                                               Target target, int schema_version) {
   const void *primitive = node->primitive_;
   if (primitive == nullptr) {
     return nullptr;
   }
-  int schema_version = VersionManager::GetInstance()->GetSchemaVersion();
-  ParameterGen paramGen =
-    PopulateRegistry::GetInstance()->GetParameterCreator(GetPrimitiveType(node->primitive_), schema_version);
+  ParameterGen paramGen = PopulateRegistry::GetInstance()->GetParameterCreator(
+    GetPrimitiveType(node->primitive_, schema_version), schema_version);
   if (paramGen == nullptr) {
     MS_LOG(ERROR) << "parameter generator is null";
     return nullptr;
@@ -300,9 +299,10 @@ std::unique_ptr<OperatorCoder> CPUConv2DFusionINT8CoderCreator(const std::vector
   auto conv_param = reinterpret_cast<ConvParameter *>(paramGen(node->primitive_));
   std::unique_ptr<OperatorCoder> coder;
   if (conv_param->group_ == 1) {
-    coder = CPUConv2DINT8CoderCreator(in_tensors, out_tensors, node, node_index, target);
+    coder = CPUConv2DINT8CoderCreator(in_tensors, out_tensors, node, node_index, target, schema_version);
   } else if (conv_param->group_ == conv_param->input_channel_ && conv_param->group_ == conv_param->output_channel_) {
-    coder = CPUOpCoderCreator<ConvolutionDepthwiseINT8Coder>(in_tensors, out_tensors, node, node_index, target);
+    coder = CPUOpCoderCreator<ConvolutionDepthwiseINT8Coder>(in_tensors, out_tensors, node, node_index, target,
+                                                             schema_version);
   } else {
     // group conv
   }
@@ -311,6 +311,7 @@ std::unique_ptr<OperatorCoder> CPUConv2DFusionINT8CoderCreator(const std::vector
     MS_LOG(ERROR) << "create conv2d int8 coder failed";
     return nullptr;
   }
+  coder->SetSchemaVersion(schema_version);
   return coder;
 }
 
diff --git a/mindspore/lite/micro/coder/opcoders/op_coder.h b/mindspore/lite/micro/coder/opcoders/op_coder.h
index 2c036c4f78a..a3044c7d475 100644
--- a/mindspore/lite/micro/coder/opcoders/op_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/op_coder.h
@@ -25,6 +25,7 @@
 #include "coder/allocator/allocator.h"
 #include "include/errorcode.h"
 #include "src/lite_kernel.h"
+#include "src/common/version_manager.h"
 #include "securec/include/securec.h"
 #include "coder/opcoders/op_coder_register.h"
 #include "coder/log.h"
@@ -75,6 +76,8 @@ class OperatorCoder {
 
   const std::vector<Tensor *> initial_parameters() const { return initial_parameters_; }
 
+  void SetSchemaVersion(int schema_version) { schema_version_ = schema_version; }
+
   // context
   virtual int Prepare(CoderContext *const context) = 0;
 
@@ -98,6 +101,7 @@ class OperatorCoder {
 
   bool support_parallel_{false};
   int thread_num_{1};
+  int schema_version_ = lite::SCHEMA_VERSION::SCHEMA_CUR;
 
  private:
   size_t node_index_{0};
@@ -114,12 +118,16 @@ class OperatorCoder {
 template <typename T>
 std::unique_ptr<OperatorCoder> CPUOpCoderCreator(const std::vector<Tensor *> &in_tensors,
                                                  const std::vector<Tensor *> &out_tensors, const Model::Node *node,
-                                                 size_t node_index, Target target) {
+                                                 size_t node_index, Target target, int schema_version) {
   if (node == nullptr) {
     MS_LOG(ERROR) << "node is null";
     return nullptr;
   }
   std::unique_ptr<T> coder = std::make_unique<T>(in_tensors, out_tensors, node, node_index, target);
+  if (coder == nullptr) {
+    return nullptr;
+  }
+  coder->SetSchemaVersion(schema_version);
   return coder;
 }
 }  // namespace mindspore::lite::micro
diff --git a/mindspore/lite/micro/coder/opcoders/op_coder_builder.cc b/mindspore/lite/micro/coder/opcoders/op_coder_builder.cc
index cdc15f33698..ba0caeac8e1 100644
--- a/mindspore/lite/micro/coder/opcoders/op_coder_builder.cc
+++ b/mindspore/lite/micro/coder/opcoders/op_coder_builder.cc
@@ -23,9 +23,9 @@
 #include "coder/opcoders/parallel.h"
 
 namespace mindspore::lite::micro {
-std::unique_ptr<OperatorCoder> OpCoderBuilder::build() {
+std::unique_ptr<OperatorCoder> OpCoderBuilder::build(int schema_version) {
   MS_CHECK_PTR_RET_NULL(node_->primitive_);
-  int primitive_type = GetPrimitiveType(node_->primitive_);
+  int primitive_type = GetPrimitiveType(node_->primitive_, schema_version);
   CoderKey coder_key(target_, data_type_, primitive_type);
   CoderCreatorFunc creator_func = OpCoderFactory::GetInstance()->FindOpCoder(coder_key);
   if (creator_func == nullptr) {
@@ -39,7 +39,8 @@ std::unique_ptr<OperatorCoder> OpCoderBuilder::build() {
     MS_CHECK_PTR_RET_NULL(inputs_.at(kInputIndex));
     MS_CHECK_PTR_RET_NULL(outputs_.at(kOutputIndex));
   }
-  std::unique_ptr<OperatorCoder> op_coder = creator_func(inputs_, outputs_, node_, node_index_++, target_);
+  std::unique_ptr<OperatorCoder> op_coder =
+    creator_func(inputs_, outputs_, node_, node_index_++, target_, schema_version);
   if (op_coder == nullptr) {
     MS_LOG(ERROR) << "create op_coder failed: " << node_->name_ << " primitive type: "
                   << mindspore::schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(primitive_type))
diff --git a/mindspore/lite/micro/coder/opcoders/op_coder_builder.h b/mindspore/lite/micro/coder/opcoders/op_coder_builder.h
index cf0da43a938..2da028a2f78 100644
--- a/mindspore/lite/micro/coder/opcoders/op_coder_builder.h
+++ b/mindspore/lite/micro/coder/opcoders/op_coder_builder.h
@@ -25,7 +25,7 @@ namespace mindspore::lite::micro {
 
 class OpCoderBuilder {
  public:
-  std::unique_ptr<OperatorCoder> build();
+  std::unique_ptr<OperatorCoder> build(int schema_version);
 
   OpCoderBuilder &inputs(const std::vector<Tensor *> &inputs);
 
diff --git a/mindspore/lite/micro/coder/opcoders/op_coder_register.h b/mindspore/lite/micro/coder/opcoders/op_coder_register.h
index 982c7dc0b95..19d0c5fd392 100644
--- a/mindspore/lite/micro/coder/opcoders/op_coder_register.h
+++ b/mindspore/lite/micro/coder/opcoders/op_coder_register.h
@@ -28,7 +28,7 @@ namespace mindspore::lite::micro {
 class OperatorCoder;
 using CoderCreatorFunc = std::function<std::unique_ptr<OperatorCoder>(
   const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors, const Model::Node *node,
-  size_t node_index, Target target)>;
+  size_t node_index, Target target, int schema_version)>;
 
 class CoderKey {
  public:
diff --git a/mindspore/lite/micro/coder/session.cc b/mindspore/lite/micro/coder/session.cc
index fbb0cccc0d0..d68f29753a9 100644
--- a/mindspore/lite/micro/coder/session.cc
+++ b/mindspore/lite/micro/coder/session.cc
@@ -31,6 +31,7 @@
 #include "src/common/version_manager.h"
 #include "src/runtime/infer_manager.h"
 #include "src/scheduler.h"
+#include "src/lite_model.h"
 #include "include/errorcode.h"
 #include "include/model.h"
 #include "src/common/file_utils.h"
@@ -56,7 +57,7 @@ void CoderSession::EndCode() {
     context_->set_code_blocks(blocks);
   }
   if (config->code_mode() == Train) {
-    Train::TransformGraphForTrain(context_.get(), op_coders_);
+    Train::TransformGraphForTrain(context_.get(), op_coders_, schema_version_);
   }
 }
 
@@ -203,18 +204,18 @@ OpParameter *CoderSession::GenParameterAndInfer(const Model::Node *node, const s
                                                 std::vector<lite::Tensor *> *outputs) const {
   auto primitive = node->primitive_;
   MS_CHECK_PTR_RET_NULL(primitive);
-  int schema_version = VersionManager::GetInstance()->GetSchemaVersion();
-  auto parame_gen = PopulateRegistry::GetInstance()->GetParameterCreator(GetPrimitiveType(primitive), schema_version);
+  auto parame_gen =
+    PopulateRegistry::GetInstance()->GetParameterCreator(GetPrimitiveType(primitive, schema_version_), schema_version_);
   MS_CHECK_PTR_RET_NULL(parame_gen);
   auto parameter = parame_gen(primitive);
   MS_CHECK_PTR_RET_NULL(parameter);
   auto ret = KernelInferShape(inputs, *outputs, parameter);
   if (ret == RET_INFER_INVALID) {
     MS_LOG(INFO) << "InferShape shouldn't be done before runtime, name: " << node->name_
-                 << ", type: " << PrimitiveTypeName(GetPrimitiveType(primitive)) << "flag set to false.";
+                 << ", type: " << GetPrimitiveTypeName(primitive, schema_version_) << "flag set to false.";
   } else if (ret != RET_OK) {
     MS_LOG(ERROR) << "InferShape failed, name: " << node->name_
-                  << ", type: " << PrimitiveTypeName(GetPrimitiveType(primitive));
+                  << ", type: " << GetPrimitiveTypeName(primitive, schema_version_);
     return nullptr;
   }
   return parameter;
@@ -226,6 +227,7 @@ int CoderSession::CreateOpCoders() {
     MS_LOG(ERROR) << "Graph model is nullptr";
     return RET_ERROR;
   }
+  schema_version_ = reinterpret_cast<const lite::LiteModel *>(model)->GetSchemaVersion();
   Configurator *config = Configurator::GetInstance();
   Target code_target = config->target();
   CodeMode code_mode = config->code_mode();
@@ -290,7 +292,7 @@ int CoderSession::CreateOpCoders() {
                                                 .mode(code_mode)
                                                 .input_indices(input_indices)
                                                 .output_indices(output_indices)
-                                                .build();
+                                                .build(schema_version_);
     if (op_coder == nullptr) {
       coder_graph_->DumpUnSupportLayer(code_target);
       return RET_ERROR;
diff --git a/mindspore/lite/micro/coder/session.h b/mindspore/lite/micro/coder/session.h
index 2f09757562d..2dd8c3b39ba 100644
--- a/mindspore/lite/micro/coder/session.h
+++ b/mindspore/lite/micro/coder/session.h
@@ -56,6 +56,7 @@ class CoderSession {
   std::unique_ptr<CoderContext> context_{nullptr};
   MemoryAllocator *allocator_{nullptr};
   std::vector<std::unique_ptr<OperatorCoder>> op_coders_;
+  int schema_version_ = SCHEMA_VERSION::SCHEMA_CUR;
 };
 
 std::shared_ptr<CoderSession> CreateCoderSession();
diff --git a/mindspore/lite/micro/coder/train.cc b/mindspore/lite/micro/coder/train.cc
index 320efe1b66e..6532285d2e0 100644
--- a/mindspore/lite/micro/coder/train.cc
+++ b/mindspore/lite/micro/coder/train.cc
@@ -54,7 +54,8 @@ std::set<OperatorCoder *> FindInferenceOpcoders(OperatorCoder *edge) {
   return subgraph;
 }
 
-int Train::TransformGraphForTrain(CoderContext *context, const std::vector<std::unique_ptr<OperatorCoder>> &op_coders) {
+int Train::TransformGraphForTrain(CoderContext *context, const std::vector<std::unique_ptr<OperatorCoder>> &op_coders,
+                                  int schema_version) {
   if (context == nullptr) {
     MS_LOG(INFO) << "input context invalid";
     return RET_ERROR;
@@ -68,7 +69,7 @@ int Train::TransformGraphForTrain(CoderContext *context, const std::vector<std::
   OperatorCoder *loss_op = nullptr;
   for (const auto &opcoder : op_coders) {
     const Model::Node *node = opcoder->node();
-    int primitive_type = GetPrimitiveType(node->primitive_);
+    int primitive_type = GetPrimitiveType(node->primitive_, schema_version);
     auto item = std::find(loss_types.begin(), loss_types.end(), primitive_type);
     if (item != loss_types.end()) {
       loss_op = opcoder.get();
diff --git a/mindspore/lite/micro/coder/train.h b/mindspore/lite/micro/coder/train.h
index fe335e6dd16..f39697c43ac 100644
--- a/mindspore/lite/micro/coder/train.h
+++ b/mindspore/lite/micro/coder/train.h
@@ -25,8 +25,8 @@
 namespace mindspore::lite::micro {
 class Train {
  public:
-  static int TransformGraphForTrain(CoderContext *context,
-                                    const std::vector<std::unique_ptr<OperatorCoder>> &op_coders);
+  static int TransformGraphForTrain(CoderContext *context, const std::vector<std::unique_ptr<OperatorCoder>> &op_coders,
+                                    int schema_version);
 };
 
 }  // namespace mindspore::lite::micro
diff --git a/mindspore/lite/minddata/CMakeLists.txt b/mindspore/lite/minddata/CMakeLists.txt
index 74eee1946f4..e8d88d97335 100644
--- a/mindspore/lite/minddata/CMakeLists.txt
+++ b/mindspore/lite/minddata/CMakeLists.txt
@@ -114,6 +114,7 @@ if(BUILD_MINDDATA STREQUAL "full")
         ${TOP_DIR}/mindspore/lite/src/tensor.cc
         ${TOP_DIR}/mindspore/lite/src/ms_tensor.cc
         ${TOP_DIR}/mindspore/lite/src/common/string_util.cc
+        ${TOP_DIR}/mindspore/lite/src/common/lite_utils.cc
         ${CORE_DIR}/utils/status.cc
         ${MINDDATA_DIR}/api/datasets.cc
         ${MINDDATA_DIR}/kernels/data/data_utils.cc
diff --git a/mindspore/lite/minddata/wrapper/album_op_android.cc b/mindspore/lite/minddata/wrapper/album_op_android.cc
index 472ce0a1305..103316aa555 100644
--- a/mindspore/lite/minddata/wrapper/album_op_android.cc
+++ b/mindspore/lite/minddata/wrapper/album_op_android.cc
@@ -277,7 +277,7 @@ Status AlbumOp::LoadStringTensor(const nlohmann::json &json_obj, int32_t col_num
 
 Status AlbumOp::LoadIntArrayTensor(const nlohmann::json &json_obj, int32_t col_num, TensorPtr *tensor) {
   // consider templating this function to handle all ints
-  if (data_schema_->column(col_num).type() == DataType::DE_INT64) {
+  if (data_schema_->Column(col_num).Type() == DataType::DE_INT64) {
     std::vector<int64_t> data;
 
     // Iterate over the integer list and add those values to the output shape tensor
@@ -286,7 +286,7 @@ Status AlbumOp::LoadIntArrayTensor(const nlohmann::json &json_obj, int32_t col_n
     (void)std::transform(items.begin(), items.end(), std::back_inserter(data), [](it_type j) { return j.value(); });
 
     RETURN_IF_NOT_OK(Tensor::CreateFromVector(data, tensor));
-  } else if (data_schema_->column(col_num).type() == DataType::DE_INT32) {
+  } else if (data_schema_->Column(col_num).Type() == DataType::DE_INT32) {
     std::vector<int32_t> data;
 
     // Iterate over the integer list and add those values to the output shape tensor
@@ -297,14 +297,14 @@ Status AlbumOp::LoadIntArrayTensor(const nlohmann::json &json_obj, int32_t col_n
     RETURN_IF_NOT_OK(Tensor::CreateFromVector(data, tensor));
   } else {
     RETURN_STATUS_UNEXPECTED("Invalid data, column type is neither int32 nor int64, it is " +
-                             data_schema_->column(col_num).type().ToString());
+                             data_schema_->Column(col_num).Type().ToString());
   }
   return Status::OK();
 }
 
 Status AlbumOp::LoadFloatArrayTensor(const nlohmann::json &json_obj, int32_t col_num, TensorPtr *tensor) {
   // consider templating this function to handle all ints
-  if (data_schema_->column(col_num).type() == DataType::DE_FLOAT64) {
+  if (data_schema_->Column(col_num).Type() == DataType::DE_FLOAT64) {
     std::vector<double> data;
 
     // Iterate over the integer list and add those values to the output shape tensor
@@ -313,7 +313,7 @@ Status AlbumOp::LoadFloatArrayTensor(const nlohmann::json &json_obj, int32_t col
     (void)std::transform(items.begin(), items.end(), std::back_inserter(data), [](it_type j) { return j.value(); });
 
     RETURN_IF_NOT_OK(Tensor::CreateFromVector(data, tensor));
-  } else if (data_schema_->column(col_num).type() == DataType::DE_FLOAT32) {
+  } else if (data_schema_->Column(col_num).Type() == DataType::DE_FLOAT32) {
     std::vector<float> data;
 
     // Iterate over the integer list and add those values to the output shape tensor
@@ -324,13 +324,13 @@ Status AlbumOp::LoadFloatArrayTensor(const nlohmann::json &json_obj, int32_t col
     RETURN_IF_NOT_OK(Tensor::CreateFromVector(data, tensor));
   } else {
     RETURN_STATUS_UNEXPECTED("Invalid data, column type is neither float32 nor float64, it is " +
-                             data_schema_->column(col_num).type().ToString());
+                             data_schema_->Column(col_num).Type().ToString());
   }
   return Status::OK();
 }
 
 Status AlbumOp::LoadIDTensor(const std::string &file, int32_t col_num, TensorPtr *tensor) {
-  if (data_schema_->column(col_num).type() == DataType::DE_STRING) {
+  if (data_schema_->Column(col_num).Type() == DataType::DE_STRING) {
     RETURN_IF_NOT_OK(Tensor::CreateScalar<std::string>(file, tensor));
     return Status::OK();
   }
@@ -343,7 +343,7 @@ Status AlbumOp::LoadIDTensor(const std::string &file, int32_t col_num, TensorPtr
 
 Status AlbumOp::LoadEmptyTensor(int32_t col_num, TensorPtr *tensor) {
   // hack to get the file name without extension, the 1 is to get rid of the backslash character
-  RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({0}), data_schema_->column(col_num).type(), tensor));
+  RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({0}), data_schema_->Column(col_num).Type(), tensor));
   return Status::OK();
 }
 
@@ -352,11 +352,11 @@ Status AlbumOp::LoadEmptyTensor(int32_t col_num, TensorPtr *tensor) {
 // Float64 doesn't work with reinterpret cast here. Otherwise we limit the float in the schema to
 // only be float32, seems like a weird limitation to impose
 Status AlbumOp::LoadFloatTensor(const nlohmann::json &json_obj, int32_t col_num, TensorPtr *tensor) {
-  if (data_schema_->column(col_num).type() == DataType::DE_FLOAT64) {
+  if (data_schema_->Column(col_num).Type() == DataType::DE_FLOAT64) {
     double data = json_obj;
     MS_LOG(INFO) << "double found: " << json_obj << ".";
     RETURN_IF_NOT_OK(Tensor::CreateScalar<double>(data, tensor));
-  } else if (data_schema_->column(col_num).type() == DataType::DE_FLOAT32) {
+  } else if (data_schema_->Column(col_num).Type() == DataType::DE_FLOAT32) {
     float data = json_obj;
     RETURN_IF_NOT_OK(Tensor::CreateScalar<float>(data, tensor));
     MS_LOG(INFO) << "float found: " << json_obj << ".";
@@ -366,11 +366,11 @@ Status AlbumOp::LoadFloatTensor(const nlohmann::json &json_obj, int32_t col_num,
 
 // Loads a tensor with int value, we have to cast the value to type specified in the schema.
 Status AlbumOp::LoadIntTensor(const nlohmann::json &json_obj, int32_t col_num, TensorPtr *tensor) {
-  if (data_schema_->column(col_num).type() == DataType::DE_INT64) {
+  if (data_schema_->Column(col_num).Type() == DataType::DE_INT64) {
     int64_t data = json_obj;
     MS_LOG(INFO) << "int64 found: " << json_obj << ".";
     RETURN_IF_NOT_OK(Tensor::CreateScalar<int64_t>(data, tensor));
-  } else if (data_schema_->column(col_num).type() == DataType::DE_INT32) {
+  } else if (data_schema_->Column(col_num).Type() == DataType::DE_INT32) {
     int32_t data = json_obj;
     RETURN_IF_NOT_OK(Tensor::CreateScalar<int32_t>(data, tensor));
     MS_LOG(INFO) << "int32 found: " << json_obj << ".";
@@ -383,17 +383,17 @@ Status AlbumOp::LoadIntTensorRowByIndex(int index, bool is_array, const nlohmann
   int i = index;
   // int value
   if (!is_array &&
-      (data_schema_->column(i).type() == DataType::DE_INT64 || data_schema_->column(i).type() == DataType::DE_INT32)) {
+      (data_schema_->Column(i).Type() == DataType::DE_INT64 || data_schema_->Column(i).Type() == DataType::DE_INT32)) {
     TensorPtr tensor;
     RETURN_IF_NOT_OK(LoadIntTensor(column_value, i, &tensor));
-    (*map_row)[data_schema_->column(i).name()] = tensor;
+    (*map_row)[data_schema_->Column(i).Name()] = tensor;
   }
   // int array
   if (is_array &&
-      (data_schema_->column(i).type() == DataType::DE_INT64 || data_schema_->column(i).type() == DataType::DE_INT32)) {
+      (data_schema_->Column(i).Type() == DataType::DE_INT64 || data_schema_->Column(i).Type() == DataType::DE_INT32)) {
     TensorPtr tensor;
     RETURN_IF_NOT_OK(LoadIntArrayTensor(column_value, i, &tensor));
-    (*map_row)[data_schema_->column(i).name()] = tensor;
+    (*map_row)[data_schema_->Column(i).Name()] = tensor;
   }
   return Status::OK();
 }
@@ -402,59 +402,59 @@ Status AlbumOp::LoadTensorRowByIndex(int index, const std::string &file, const n
                                      std::unordered_map<std::string, std::shared_ptr<Tensor>> *map_row) {
   int i = index;
   // special case to handle
-  if (data_schema_->column(i).name() == "id") {
+  if (data_schema_->Column(i).name() == "id") {
     // id is internal, special case to load from file
     TensorPtr tensor;
     RETURN_IF_NOT_OK(LoadIDTensor(file, i, &tensor));
-    (*map_row)[data_schema_->column(i).name()] = tensor;
+    (*map_row)[data_schema_->Column(i).Name()] = tensor;
   }
   // find if key does not exist, insert placeholder nullptr if not found
-  if (js.find(data_schema_->column(i).name()) == js.end()) {
+  if (js.find(data_schema_->Column(i).Name()) == js.end()) {
     // iterator not found, push nullptr as placeholder
-    MS_LOG(INFO) << "Pushing empty tensor for column: " << data_schema_->column(i).name() << ".";
+    MS_LOG(INFO) << "Pushing empty tensor for column: " << data_schema_->Column(i).Name() << ".";
     TensorPtr tensor;
     RETURN_IF_NOT_OK(LoadEmptyTensor(i, &tensor));
-    (*map_row)[data_schema_->column(i).name()] = tensor;
+    (*map_row)[data_schema_->Column(i).Name()] = tensor;
   }
-  nlohmann::json column_value = js.at(data_schema_->column(i).name());
-  MS_LOG(INFO) << "This column is: " << data_schema_->column(i).name() << ".";
+  nlohmann::json column_value = js.at(data_schema_->Column(i).Name());
+  MS_LOG(INFO) << "This column is: " << data_schema_->Column(i).Name() << ".";
   bool is_array = column_value.is_array();
   // load single string
-  if (column_value.is_string() && data_schema_->column(i).type() == DataType::DE_STRING) {
+  if (column_value.is_string() && data_schema_->Column(i).Type() == DataType::DE_STRING) {
     TensorPtr tensor;
     RETURN_IF_NOT_OK(LoadStringTensor(column_value, i, &tensor));
-    (*map_row)[data_schema_->column(i).name()] = tensor;
+    (*map_row)[data_schema_->Column(i).Name()] = tensor;
   }
   // load string array
-  if (is_array && data_schema_->column(i).type() == DataType::DE_STRING) {
+  if (is_array && data_schema_->Column(i).Type() == DataType::DE_STRING) {
     TensorPtr tensor;
     RETURN_IF_NOT_OK(LoadStringArrayTensor(column_value, i, &tensor));
-    (*map_row)[data_schema_->column(i).name()] = tensor;
+    (*map_row)[data_schema_->Column(i).Name()] = tensor;
   }
   // load image file
-  if (column_value.is_string() && data_schema_->column(i).type() != DataType::DE_STRING) {
+  if (column_value.is_string() && data_schema_->Column(i).Type() != DataType::DE_STRING) {
     std::string image_file_path = column_value;
     TensorPtr tensor;
     RETURN_IF_NOT_OK(LoadImageTensor(image_file_path, i, &tensor));
-    (*map_row)[data_schema_->column(i).name()] = tensor;
+    (*map_row)[data_schema_->Column(i).Name()] = tensor;
     uint32_t orientation = GetOrientation(image_file_path);
     TensorPtr scalar_tensor;
     RETURN_IF_NOT_OK(Tensor::CreateScalar<uint32_t>(orientation, &scalar_tensor));
     (*map_row)["orientation"] = scalar_tensor;
   }
   // load float value
-  if (!is_array && (data_schema_->column(i).type() == DataType::DE_FLOAT32 ||
-                    data_schema_->column(i).type() == DataType::DE_FLOAT64)) {
+  if (!is_array && (data_schema_->Column(i).Type() == DataType::DE_FLOAT32 ||
+                    data_schema_->Column(i).Type() == DataType::DE_FLOAT64)) {
     TensorPtr tensor;
     RETURN_IF_NOT_OK(LoadFloatTensor(column_value, i, &tensor));
-    (*map_row)[data_schema_->column(i).name()] = tensor;
+    (*map_row)[data_schema_->Column(i).Name()] = tensor;
   }
   // load float array
-  if (is_array && (data_schema_->column(i).type() == DataType::DE_FLOAT32 ||
-                   data_schema_->column(i).type() == DataType::DE_FLOAT64)) {
+  if (is_array && (data_schema_->Column(i).Type() == DataType::DE_FLOAT32 ||
+                   data_schema_->Column(i).Type() == DataType::DE_FLOAT64)) {
     TensorPtr tensor;
     RETURN_IF_NOT_OK(LoadFloatArrayTensor(column_value, i, &tensor));
-    (*map_row)[data_schema_->column(i).name()] = tensor;
+    (*map_row)[data_schema_->Column(i).Name()] = tensor;
   }
 
   RETURN_IF_NOT_OK(LoadIntTensorRowByIndex(i, is_array, column_value, map_row));
@@ -487,7 +487,7 @@ Status AlbumOp::LoadTensorRow(row_id_type row_id, const std::string &file,
 
       // loop over each column descriptor, this can optimized by switch cases
       for (int32_t i = 0; i < columns; i++) {
-        if (!IsReadColumn(data_schema_->column(i).name())) {
+        if (!IsReadColumn(data_schema_->Column(i).Name())) {
           continue;
         }
         RETURN_IF_NOT_OK(LoadTensorRowByIndex(i, file, js, map_row));
diff --git a/mindspore/lite/schema/model.fbs b/mindspore/lite/schema/model.fbs
index 63ce23fbfa0..6c58e4fa129 100644
--- a/mindspore/lite/schema/model.fbs
+++ b/mindspore/lite/schema/model.fbs
@@ -41,7 +41,8 @@ table QuantParam {
 enum WeightQunatCompressType: int {
     NONE,
     INDEXING,
-    SPARSE
+    SPARSE,
+    FSE
 }
 
 table Tensor {
diff --git a/mindspore/lite/src/CMakeLists.txt b/mindspore/lite/src/CMakeLists.txt
index 49e33521311..8ad8114dcce 100644
--- a/mindspore/lite/src/CMakeLists.txt
+++ b/mindspore/lite/src/CMakeLists.txt
@@ -1,11 +1,30 @@
 add_compile_definitions(USE_ANDROID_LOG)
-if(ENABLE_V0)
+if(MSLITE_ENABLE_V0)
     add_definitions(-DENABLE_V0)
 endif()
 include_directories(${CCSRC_DIR}/backend/kernel_compiler/cpu)
-set(LITE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/..)
-include_directories(${LITE_DIR}/nnacl/)
-include_directories(${LITE_DIR}/nnacl/optimize)
+
+if(NOT MSLITE_STRING_KERNEL)
+    add_compile_definitions(STRING_KERNEL_CLIP)
+endif()
+if(NOT MSLITE_CONTROLFLOW_TENSORLIST)
+    add_compile_definitions(CONTROLFLOW_TENSORLIST_CLIP)
+endif()
+if(NOT MSLITE_AUTO_PARALLEL)
+    add_compile_definitions(AUTO_PARALLEL_CLIP)
+endif()
+if(NOT MSLITE_WEIGHT_DECODE)
+    add_compile_definitions(WEIGHT_DECODE_CLIP)
+endif()
+if(NOT MSLITE_CUSTOM_KERNEL_REGISTRY)
+    add_compile_definitions(CUSTOM_KERNEL_REGISTRY_CLIP)
+endif()
+if(NOT MSLITE_ENABLE_RUNTIME_PASS)
+    add_compile_definitions(RUNTIME_PASS_CLIP)
+endif()
+if(NOT MSLITE_DELEGATE_USE)
+    add_compile_definitions(DELEGATE_CLIP)
+endif()
 
 if(PLATFORM_ARM32 OR PLATFORM_ARM64)
     #for performance
@@ -67,16 +86,12 @@ set(LITE_SRC
         ${CMAKE_CURRENT_SOURCE_DIR}/common/file_utils.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/common/utils.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/common/graph_util.cc
-        ${CMAKE_CURRENT_SOURCE_DIR}/common/dynamic_library_loader.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/common/log_adapter.cc
-        ${CMAKE_CURRENT_SOURCE_DIR}/common/string_util.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/common/lite_utils.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/common/prim_util.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/common/tensor_util.cc
-        ${CMAKE_CURRENT_SOURCE_DIR}/common/dynamic_library_loader.cc
-        ${CMAKE_CURRENT_SOURCE_DIR}/delegate/delegate.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/runtime/inner_allocator.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/runtime/infer_manager.cc
-        ${CMAKE_CURRENT_SOURCE_DIR}/runtime/runtime_pass.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/tensor.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/ms_tensor.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/executor.cc
@@ -90,21 +105,39 @@ set(LITE_SRC
         ${CMAKE_CURRENT_SOURCE_DIR}/scheduler.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/lite_session.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/errorcode.cc
-        ${CMAKE_CURRENT_SOURCE_DIR}/weight_decoder.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/cpu_info.cc
         )
-if(MSLITE_CONTROL_TENSORLIST)
+
+if(MSLITE_STRING_KERNEL)
+    set(LITE_SRC
+        ${LITE_SRC}
+        ${CMAKE_CURRENT_SOURCE_DIR}/common/string_util.cc
+        )
+endif()
+if(MSLITE_ENABLE_RUNTIME_PASS)
     set(LITE_SRC
             ${LITE_SRC}
-            ${CMAKE_CURRENT_SOURCE_DIR}/tensorlist.cc
+            ${CMAKE_CURRENT_SOURCE_DIR}/runtime/runtime_pass.cc
             )
 endif()
-if(MSLITE_HUFFMAN_DECODE)
+
+if(MSLITE_CONTROLFLOW_TENSORLIST)
+    set(LITE_SRC
+        ${LITE_SRC}
+        ${CMAKE_CURRENT_SOURCE_DIR}/tensorlist.cc
+        )
+endif()
+
+if(MSLITE_WEIGHT_DECODE)
     set(LITE_SRC
         ${LITE_SRC}
         ${CMAKE_CURRENT_SOURCE_DIR}/huffman_decode.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/weight_decoder.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/../tools/converter/quantizer/fse_decoder.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/../tools/converter/quantizer/fse_bit_stream.cc
         )
 endif()
+
 if(MSLITE_AUTO_PARALLEL)
     set(LITE_SRC
             ${LITE_SRC}
@@ -112,8 +145,26 @@ if(MSLITE_AUTO_PARALLEL)
             )
 endif()
 
-file(GLOB KERNEL_REG_SRC ${CMAKE_CURRENT_SOURCE_DIR}/registry/*.cc)
-set(LITE_SRC ${LITE_SRC} ${KERNEL_REG_SRC})
+if(MSLITE_CUSTOM_KERNEL_REGISTRY)
+    file(GLOB KERNEL_REG_SRC ${CMAKE_CURRENT_SOURCE_DIR}/registry/*.cc)
+else()
+    set(KERNEL_REG_SRC
+            ${CMAKE_CURRENT_SOURCE_DIR}/registry/register_kernel_interface.cc
+            ${CMAKE_CURRENT_SOURCE_DIR}/registry/register_kernel.cc
+            )
+endif()
+
+set(LITE_SRC
+    ${LITE_SRC}
+    ${KERNEL_REG_SRC}
+    )
+
+if(MSLITE_DELEGATE_USE)
+    set(LITE_SRC
+            ${LITE_SRC}
+            ${CMAKE_CURRENT_SOURCE_DIR}/delegate/delegate.cc
+            )
+endif()
 
 if(MSLITE_GPU_BACKEND STREQUAL opencl)
     file(GLOB_RECURSE OPENCL_RUNTIME_SRC
@@ -152,21 +203,21 @@ set(TRAIN_SRC
         ${CMAKE_CURRENT_SOURCE_DIR}/train/opt_allocator.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/../tools/common/storage.cc
         )
-if(ENABLE_V0)
+if(MSLITE_ENABLE_V0)
     set(TRAIN_SRC
             ${TRAIN_SRC}
             ${CMAKE_CURRENT_SOURCE_DIR}/train/train_populate_parameter_v0.cc
             )
 endif()
 
-if(ENABLE_MINDRT)
+if(MSLITE_ENABLE_MINDRT)
     add_subdirectory(${CORE_DIR}/mindrt mindspore_mindrt)
     set(LITE_SRC
         ${LITE_SRC}
         ${CMAKE_CURRENT_SOURCE_DIR}/lite_mindrt.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/mindrt_executor.cc
         )
-elseif(TARGET_HIMIX200 OR TARGET_OHOS_LITE)
+else()
     file(GLOB MINDRT_ACTOR ${CORE_DIR}/mindrt/src/actor/*.cc)
     set(LITE_SRC
         ${LITE_SRC}
@@ -223,7 +274,7 @@ if(MSVC)
     set_target_properties(mindspore-lite_static PROPERTIES PREFIX lib)
 endif()
 
-if(ENABLE_MINDRT)
+if(MSLITE_ENABLE_MINDRT)
     target_link_libraries(mindspore-lite mindrt_mid)
     target_link_libraries(mindspore-lite_static mindrt_mid)
 endif()
diff --git a/mindspore/lite/src/common/context_util.cc b/mindspore/lite/src/common/context_util.cc
index cdf8fc290ad..8b3aa1fb4f7 100644
--- a/mindspore/lite/src/common/context_util.cc
+++ b/mindspore/lite/src/common/context_util.cc
@@ -103,17 +103,5 @@ mindspore::Context *MSContextFromContext(const lite::Context *context) {
   }
   return ms_context;
 }
-
-std::set<std::string> ProvidersFromMSContext(const mindspore::Context *context) {
-  std::set<std::string> providers;
-  if (context == nullptr) {
-    return providers;
-  }
-  auto &device_infos = const_cast<mindspore::Context *>(context)->MutableDeviceInfo();
-  for (auto &device_info : device_infos) {
-    providers.emplace(device_info->GetProvider());
-  }
-  return providers;
-}
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/src/common/context_util.h b/mindspore/lite/src/common/context_util.h
index 2b33e2b860b..f452fb32075 100644
--- a/mindspore/lite/src/common/context_util.h
+++ b/mindspore/lite/src/common/context_util.h
@@ -25,7 +25,6 @@
 namespace mindspore {
 namespace lite {
 mindspore::Context *MSContextFromContext(const lite::Context *context);
-std::set<std::string> ProvidersFromMSContext(const mindspore::Context *context);
 }  // namespace lite
 }  // namespace mindspore
 #endif  // MINDSPORE_LITE_SRC_COMMON_CONTEXT_UTIL_H_
diff --git a/mindspore/lite/src/common/log_adapter.h b/mindspore/lite/src/common/log_adapter.h
index 39c6b9fbefb..f899103c2f4 100644
--- a/mindspore/lite/src/common/log_adapter.h
+++ b/mindspore/lite/src/common/log_adapter.h
@@ -20,14 +20,22 @@ namespace mindspore {
 const char *const unsupport_string_tensor_log =
   "This mindspore-lite library does not support string tensors. Set environment variable MSLITE_STRING_KERNEL to on to "
   "recompile it.";
-const char *const unsupport_control_tensorlist_log =
-  "This mindspore-lite library does not support control and tensorlist op. Set environment variable "
-  "MSLITE_CONTROL_TENSORLIST to on to recompile it.";
+const char *const unsupport_controlflow_tensorlist_log =
+  "This mindspore-lite library does not support controlflow and tensorlist op. Set environment variable "
+  "MSLITE_CONTROLFLOW_TENSORLIST to on to recompile it.";
 const char *const unsupport_auto_parallel_log =
   "The mindspore-lite library does not support auto parallel. Set environment variable MSLITE_AUTO_PARALLEL to on to "
   "recompile it.";
-const char *const unsupport_huffman_decode_log =
-  "The mindspore-lite library does not support huffman decode. Set environment variable MSLITE_HUFFMAN_DECODE to on to "
+const char *const unsupport_weight_decode_log =
+  "The mindspore-lite library does not support weight decode. Set environment variable MSLITE_WEIGHT_DECODE to on to "
+  "recompile it.";
+const char *const unsupport_custom_kernel_register_log =
+  "The mindspore-lite library does not support custom kernel register. Set environment variable "
+  "MSLITE_CUSTOM_KERNEL_REGISTRY to on to "
+  "recompile it.";
+const char *const unsupport_delegate_log =
+  "The mindspore-lite library does not support delegate. Set environment variable "
+  "MSLITE_DELEGATE_USE to on to "
   "recompile it.";
 }  // namespace mindspore
 #ifdef USE_GLOG
diff --git a/mindspore/lite/src/common/prim_util.cc b/mindspore/lite/src/common/prim_util.cc
index b8f620a842c..f27f57b0ee5 100644
--- a/mindspore/lite/src/common/prim_util.cc
+++ b/mindspore/lite/src/common/prim_util.cc
@@ -24,25 +24,28 @@
 
 namespace mindspore {
 namespace lite {
-int GetPrimitiveType(const void *primitive) {
+int GetPrimitiveType(const void *primitive, int schema_version) {
   if (primitive == nullptr) {
     return -1;
   }
 #ifdef ENABLE_V0
-  if (VersionManager::GetInstance()->GetSchemaVersion() == SCHEMA_V0) {
+  if (schema_version == SCHEMA_V0) {
     return static_cast<const schema::v0::Primitive *>(primitive)->value_type();
   }
 #endif
   return static_cast<const schema::Primitive *>(primitive)->value_type();
 }
 
-const char *PrimitiveTypeName(int type) {
+const char *GetPrimitiveTypeName(const void *primitive, int schema_version) {
+  if (primitive == nullptr) {
+    return "NONE";
+  }
 #ifdef ENABLE_V0
-  if (VersionManager::GetInstance()->GetSchemaVersion() == SCHEMA_V0) {
-    return schema::v0::EnumNamePrimitiveType(static_cast<schema::v0::PrimitiveType>(type));
+  if (schema_version == SCHEMA_V0) {
+    return schema::v0::EnumNamePrimitiveType(static_cast<const schema::v0::Primitive *>(primitive)->value_type());
   }
 #endif
-  return schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(type));
+  return schema::EnumNamePrimitiveType(static_cast<const schema::Primitive *>(primitive)->value_type());
 }
 
 const char *PrimitiveCurVersionTypeName(int type) {
@@ -51,9 +54,8 @@ const char *PrimitiveCurVersionTypeName(int type) {
 
 int GenPrimVersionKey(int primitive_type, int schema_version) { return primitive_type * 1000 + schema_version; }
 
-bool IsPartialNode(const void *primitive) {
+bool IsPartialNode(const void *primitive, int schema_version) {
   MS_ASSERT(primitive != nullptr);
-  int schema_version = VersionManager::GetInstance()->GetSchemaVersion();
   if (schema_version == SCHEMA_CUR) {
     return reinterpret_cast<const schema::Primitive *>(primitive)->value_type() == schema::PrimitiveType_PartialFusion;
   }
@@ -66,27 +68,31 @@ bool IsPartialNode(const void *primitive) {
   return false;
 }
 
-bool IsCallNode(const void *primitive) {
+bool IsCallNode(const void *primitive, int schema_version) {
   MS_ASSERT(primitive != nullptr);
-  int schema_version = VersionManager::GetInstance()->GetSchemaVersion();
   if (schema_version == SCHEMA_CUR) {
     return reinterpret_cast<const schema::Primitive *>(primitive)->value_type() == schema::PrimitiveType_Call;
   }
   return false;
 }
 
-bool IsSwitchNode(const void *primitive) {
-  int schema_version = VersionManager::GetInstance()->GetSchemaVersion();
+bool IsSwitchNode(const void *primitive, int schema_version) {
   if (schema_version == SCHEMA_CUR) {
     return reinterpret_cast<const schema::Primitive *>(primitive)->value_type() == schema::PrimitiveType_Switch;
   }
   return false;
 }
 
-int GetPartialGraphIndex(const void *primitive) {
+bool IsCustomNode(const void *primitive, int schema_version) {
+  if (schema_version == SCHEMA_CUR) {
+    return reinterpret_cast<const schema::Primitive *>(primitive)->value_type() == schema::PrimitiveType_Custom;
+  }
+  return false;
+}
+
+int GetPartialGraphIndex(const void *primitive, int schema_version) {
   MS_ASSERT(primitive != nullptr);
   int index = -1;
-  int schema_version = VersionManager::GetInstance()->GetSchemaVersion();
   if (schema_version == SCHEMA_CUR) {
     auto partial_fusion = reinterpret_cast<const schema::Primitive *>(primitive)->value_as_PartialFusion();
     if (partial_fusion == nullptr) {
@@ -105,65 +111,5 @@ int GetPartialGraphIndex(const void *primitive) {
 #endif
   return index;
 }
-
-bool IsWhileNode(const void *primitive) {
-  MS_ASSERT(primitive != nullptr);
-  int schema_version = VersionManager::GetInstance()->GetSchemaVersion();
-  if (schema_version == SCHEMA_CUR) {
-    return reinterpret_cast<const schema::Primitive *>(primitive)->value_type() == schema::PrimitiveType_While;
-  }
-#ifdef ENABLE_V0
-  if (schema_version == SCHEMA_V0) {
-    return reinterpret_cast<const schema::v0::Primitive *>(primitive)->value_type() == schema::v0::PrimitiveType_While;
-  }
-#endif
-  return false;
-}
-
-int GetWhileBodySubgraphIndex(const void *primitive) {
-  MS_ASSERT(primitive != nullptr);
-  int index = -1;
-  int schema_version = VersionManager::GetInstance()->GetSchemaVersion();
-  if (schema_version == SCHEMA_CUR) {
-    auto while_value = reinterpret_cast<const schema::Primitive *>(primitive)->value_as_While();
-    if (while_value == nullptr) {
-      return -1;
-    }
-    index = while_value->body_subgraph_index();
-  }
-#ifdef ENABLE_V0
-  if (schema_version == SCHEMA_V0) {
-    auto while_value = reinterpret_cast<const schema::v0::Primitive *>(primitive)->value_as_While();
-    if (while_value == nullptr) {
-      return -1;
-    }
-    index = while_value->bodySubgraphIndex();
-  }
-#endif
-  return index;
-}
-
-int GetWhileCondSubgraphIndex(const void *primitive) {
-  MS_ASSERT(primitive != nullptr);
-  int index = -1;
-  int schema_version = VersionManager::GetInstance()->GetSchemaVersion();
-  if (schema_version == SCHEMA_CUR) {
-    auto while_value = reinterpret_cast<const schema::Primitive *>(primitive)->value_as_While();
-    if (while_value == nullptr) {
-      return -1;
-    }
-    index = while_value->cond_subgraph_index();
-  }
-#ifdef ENABLE_V0
-  if (schema_version == SCHEMA_V0) {
-    auto while_value = reinterpret_cast<const schema::v0::Primitive *>(primitive)->value_as_While();
-    if (while_value == nullptr) {
-      return -1;
-    }
-    index = while_value->condSubgraphIndex();
-  }
-#endif
-  return index;
-}
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/src/common/prim_util.h b/mindspore/lite/src/common/prim_util.h
index fadb8e601d2..11918f5d7d2 100644
--- a/mindspore/lite/src/common/prim_util.h
+++ b/mindspore/lite/src/common/prim_util.h
@@ -19,17 +19,16 @@
 
 namespace mindspore {
 namespace lite {
-int GetPrimitiveType(const void *prim);
-const char *PrimitiveTypeName(int type);
+int GetPrimitiveType(const void *prim, int schema_version);
+const char *GetPrimitiveTypeName(const void *primitive, int schema_version);
 const char *PrimitiveCurVersionTypeName(int type);
 int GenPrimVersionKey(int primitive_type, int schema_version);
-bool IsPartialNode(const void *primitive);
-bool IsCallNode(const void *node);
-bool IsSwitchNode(const void *node);
-int GetPartialGraphIndex(const void *primitive);
-bool IsWhileNode(const void *primitive);
-int GetWhileBodySubgraphIndex(const void *primitive);
-int GetWhileCondSubgraphIndex(const void *primitive);
+bool IsPartialNode(const void *primitive, int schema_version);
+bool IsCallNode(const void *node, int schema_version);
+bool IsSwitchNode(const void *node, int schema_version);
+bool IsCustomNode(const void *primitive, int schema_version);
+bool IsCastNode(const void *primitive, int schema_version);
+int GetPartialGraphIndex(const void *primitive, int schema_version);
 }  // namespace lite
 }  // namespace mindspore
 
diff --git a/mindspore/lite/src/common/string_util.cc b/mindspore/lite/src/common/string_util.cc
index a890c7fd506..e529d64a37a 100644
--- a/mindspore/lite/src/common/string_util.cc
+++ b/mindspore/lite/src/common/string_util.cc
@@ -14,13 +14,12 @@
  * limitations under the License.
  */
 
-#include <algorithm>
 #include "src/common/string_util.h"
+#include <algorithm>
 #include "include/ms_tensor.h"
 
 namespace mindspore {
 namespace lite {
-#ifdef ENABLE_STRING_KERNEL
 std::vector<StringPack> ParseTensorBuffer(Tensor *tensor) {
   if (tensor == nullptr) {
     MS_LOG(ERROR) << "tensor is nullptr.";
@@ -277,41 +276,5 @@ uint64_t StringHash64(const char *s, size_t len) {
   return HashLen16(HashLen16(v.first, w.first, mul) + ShiftMix(y) * k0 + z, HashLen16(v.second, w.second, mul) + x,
                    mul);
 }
-#endif
-int StringsToMSTensor(const std::vector<std::string> &inputs, tensor::MSTensor *tensor) {
-#ifdef ENABLE_STRING_KERNEL
-  if (tensor == nullptr) {
-    return RET_PARAM_INVALID;
-  }
-  std::vector<StringPack> all_pack;
-  for (auto &input : inputs) {
-    StringPack pack = {static_cast<int>(input.length()), input.data()};
-    all_pack.push_back(pack);
-  }
-  return WriteStringsToTensor(static_cast<Tensor *>(tensor), all_pack);
-#else
-  MS_LOG(ERROR) << unsupport_string_tensor_log;
-  return RET_ERROR;
-#endif
-}
-
-std::vector<std::string> MSTensorToStrings(const tensor::MSTensor *tensor) {
-#ifdef ENABLE_STRING_KERNEL
-  if (tensor == nullptr) {
-    return {""};
-  }
-  const void *ptr = static_cast<const Tensor *>(tensor)->data_c();
-  std::vector<StringPack> all_pack = ParseStringBuffer(ptr);
-  std::vector<std::string> result(all_pack.size());
-  std::transform(all_pack.begin(), all_pack.end(), result.begin(), [](StringPack &pack) {
-    std::string str(pack.data, pack.len);
-    return str;
-  });
-  return result;
-#else
-  MS_LOG(ERROR) << unsupport_string_tensor_log;
-  return {""};
-#endif
-}
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/src/common/string_util.h b/mindspore/lite/src/common/string_util.h
index 52ea90ca23f..be3388209ad 100644
--- a/mindspore/lite/src/common/string_util.h
+++ b/mindspore/lite/src/common/string_util.h
@@ -25,7 +25,6 @@
 #include "include/errorcode.h"
 #include "include/lite_utils.h"
 
-#ifdef ENABLE_STRING_KERNEL
 namespace mindspore {
 namespace lite {
 typedef struct StringPack {
@@ -50,5 +49,4 @@ int GetStringCount(Tensor *tensor);
 uint64_t StringHash64(const char *s, size_t len);
 }  // namespace lite
 }  // namespace mindspore
-#endif
 #endif  // MINDSPORE_LITE_SRC_COMMON_STRING_UTIL_H_
diff --git a/mindspore/lite/src/common/tensor_util.cc b/mindspore/lite/src/common/tensor_util.cc
index 627a6385fed..f0aadb9933f 100644
--- a/mindspore/lite/src/common/tensor_util.cc
+++ b/mindspore/lite/src/common/tensor_util.cc
@@ -22,32 +22,6 @@
 
 namespace mindspore {
 namespace lite {
-int InputTensor2TensorC(const std::vector<lite::Tensor *> &tensors_in, std::vector<TensorC *> *tensors_out) {
-  MS_ASSERT(tensors_out != nullptr);
-  for (size_t i = 0; i < tensors_in.size(); ++i) {
-    size_t shape_size = tensors_in[i]->shape().size();
-    if (shape_size >= MAX_SHAPE_SIZE) {
-      MS_LOG(ERROR) << "shape size " << shape_size << " unsupported!";
-      return RET_ERROR;
-    }
-    auto *tensor_c = static_cast<TensorC *>(malloc(sizeof(TensorC)));
-    if (tensor_c == nullptr) {
-      MS_LOG(ERROR) << "malloc tensor fail!";
-      return RET_ERROR;
-    }
-    memset(tensor_c, 0, sizeof(TensorC));
-    tensor_c->format_ = tensors_in[i]->format();
-    tensor_c->data_type_ = tensors_in[i]->data_type();
-    tensor_c->shape_size_ = shape_size;
-    tensor_c->data_ = tensors_in[i]->data_c();
-    for (size_t j = 0; j < shape_size; ++j) {
-      tensor_c->shape_[j] = tensors_in[i]->shape()[j];
-    }
-    tensors_out->push_back(tensor_c);
-  }
-  return RET_OK;
-}
-
 int OutputTensor2TensorC(const std::vector<lite::Tensor *> &tensors, std::vector<TensorC *> *tensors_c) {
   MS_ASSERT(tensors_c != nullptr);
   for (size_t i = 0; i < tensors.size(); ++i) {
@@ -70,7 +44,7 @@ void FreeAllTensorC(std::vector<TensorC *> *tensors_in) {
     if (i == nullptr) {
       continue;
     }
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
     if (i->data_type_ == kObjectTypeTensorType) {
       TensorListC *tensorListC = reinterpret_cast<TensorListC *>(i);
       FreeTensorListC(tensorListC);
@@ -79,7 +53,7 @@ void FreeAllTensorC(std::vector<TensorC *> *tensors_in) {
 #endif
       free(i);
       i = nullptr;
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
     }
 #endif
   }
@@ -110,7 +84,7 @@ void TensorC2Tensor(const TensorC *src, Tensor *dst) {
   dst->set_shape(std::vector<int>(src->shape_, src->shape_ + src->shape_size_));
 }
 
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
 void FreeTensorListC(TensorListC *tensorlist_c) {
   MS_ASSERT(tensorlist_c != nullptr);
   if (tensorlist_c->tensors_ != nullptr) {
@@ -192,11 +166,10 @@ int GenerateOutTensorC(const OpParameter *const parameter, const std::vector<lit
                        const std::vector<lite::Tensor *> &outputs, std::vector<TensorC *> *out_tensor_c) {
   MS_ASSERT(out_tensor_c != nullptr);
   MS_ASSERT(parameter != nullptr);
-  int ret = RET_OK;
-#ifdef ENABLE_CONTROL_TENSORLIST
   if (parameter->type_ == mindspore::schema::PrimitiveType_TensorListFromTensor ||
       parameter->type_ == mindspore::schema::PrimitiveType_TensorListReserve ||
       parameter->type_ == mindspore::schema::PrimitiveType_TensorListSetItem) {
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
     // TensorListC ->TensorC
     auto *tensor_list_c = reinterpret_cast<TensorListC *>(malloc(sizeof(TensorListC)));
     if (tensor_list_c == nullptr) {
@@ -204,25 +177,14 @@ int GenerateOutTensorC(const OpParameter *const parameter, const std::vector<lit
     }
     memset(tensor_list_c, 0, sizeof(TensorListC));
     out_tensor_c->push_back(reinterpret_cast<TensorC *const>(tensor_list_c));
-  } else if (parameter->type_ == mindspore::schema::PrimitiveType_Merge ||
-             parameter->type_ == mindspore::schema::PrimitiveType_Switch) {
-    ret = GenerateMergeSwitchOutTensorC(inputs, static_cast<int>(outputs.size()), out_tensor_c);
-  } else {
-    ret = OutputTensor2TensorC(outputs, out_tensor_c);
-  }
+    return RET_OK;
 #else
-  if (parameter->type_ == mindspore::schema::PrimitiveType_TensorListFromTensor ||
-      parameter->type_ == mindspore::schema::PrimitiveType_TensorListReserve ||
-      parameter->type_ == mindspore::schema::PrimitiveType_TensorListSetItem ||
-      parameter->type_ == mindspore::schema::PrimitiveType_Merge ||
-      parameter->type_ == mindspore::schema::PrimitiveType_Switch) {
-    MS_LOG(ERROR) << unsupport_control_tensorlist_log;
+    MS_LOG(ERROR) << unsupport_controlflow_tensorlist_log;
     return RET_ERROR;
-  } else {
-    ret = OutputTensor2TensorC(outputs, out_tensor_c);
-  }
 #endif
-  return ret;
+  } else {
+    return OutputTensor2TensorC(outputs, out_tensor_c);
+  }
 }
 
 int GenerateInTensorC(const OpParameter *const parameter, const std::vector<lite::Tensor *> &inputs,
@@ -231,7 +193,7 @@ int GenerateInTensorC(const OpParameter *const parameter, const std::vector<lite
   int ret = RET_OK;
   for (auto input : inputs) {
     if (input->data_type() == kObjectTypeTensorType) {
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
       // Tensor ->TensorList -> TensorListC -> TensorC
       auto *tensor_list = reinterpret_cast<TensorList *>(input);
       auto *tensor_list_c = reinterpret_cast<TensorListC *>(malloc(sizeof(TensorListC)));
@@ -248,7 +210,7 @@ int GenerateInTensorC(const OpParameter *const parameter, const std::vector<lite
       }
       in_tensor_c->push_back(reinterpret_cast<TensorC *>(tensor_list_c));
 #else
-      MS_LOG(ERROR) << unsupport_control_tensorlist_log;
+      MS_LOG(ERROR) << unsupport_controlflow_tensorlist_log;
       return RET_NOT_SUPPORT;
 #endif
     } else {
@@ -286,8 +248,8 @@ int CheckTensorsInvalid(const std::vector<Tensor *> &tensors) {
                     << "check the model and assign the input shape with method Resize().";
       return RET_ERROR;
     }
-    if (tensor->format() != mindspore::NHWC) {
-      MS_LOG(ERROR) << "model input's format may be changed, which should keep default value NHWC";
+    if (tensor->format() != mindspore::NHWC && tensor->format() != mindspore::NCHW) {
+      MS_LOG(ERROR) << "model input's format may be changed, which should be NHWC or NCHW";
       return RET_FORMAT_ERR;
     }
     if (tensor->data_c() == nullptr) {
diff --git a/mindspore/lite/src/common/tensor_util.h b/mindspore/lite/src/common/tensor_util.h
index 07c3996b693..d77d6b2de95 100644
--- a/mindspore/lite/src/common/tensor_util.h
+++ b/mindspore/lite/src/common/tensor_util.h
@@ -20,19 +20,20 @@
 
 #include <memory>
 #include "src/tensor.h"
-#include "src/tensorlist.h"
 #include "nnacl/tensor_c.h"
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
+#include "src/tensorlist.h"
 #include "nnacl/infer/common_infer.h"
+#endif
 #include "src/cxx_api/tensor/tensor_impl.h"
 
 namespace mindspore {
 namespace lite {
-int InputTensor2TensorC(const std::vector<lite::Tensor *> &tensors_in, std::vector<TensorC *> *tensors_out);
 int OutputTensor2TensorC(const std::vector<lite::Tensor *> &tensors_in, std::vector<TensorC *> *tensors_out);
 void FreeAllTensorC(std::vector<TensorC *> *tensors_in);
 int Tensor2TensorC(const Tensor *src, TensorC *dst);
 void TensorC2Tensor(const TensorC *src, Tensor *dst);
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
 void FreeTensorListC(TensorListC *tensorListC);
 int TensorList2TensorListC(TensorList *src, TensorListC *dst);
 int TensorListC2TensorList(const TensorListC *src, TensorList *dst);
diff --git a/mindspore/lite/src/common/utils.h b/mindspore/lite/src/common/utils.h
index aae4ce65eb3..7e105aea615 100644
--- a/mindspore/lite/src/common/utils.h
+++ b/mindspore/lite/src/common/utils.h
@@ -69,6 +69,17 @@ bool VectorErase(std::vector<T> *vec, T element) {
   return ret;
 }
 
+template <typename T>
+bool VectorSetNull(std::vector<T> *vec, T element) {
+  bool ret = false;
+  for (size_t i = 0; i < vec->size(); i++) {
+    if (vec->at(i) == element) {
+      vec->at(i) = nullptr;
+    }
+  }
+  return ret;
+}
+
 template <typename T>
 bool VectorReplace(std::vector<T> *vec, T srcElement, T dstElement) {
   bool ret = false;
diff --git a/mindspore/lite/src/common/version_manager.h b/mindspore/lite/src/common/version_manager.h
index 0ba0158b245..1eb2c811ebd 100644
--- a/mindspore/lite/src/common/version_manager.h
+++ b/mindspore/lite/src/common/version_manager.h
@@ -22,24 +22,6 @@
 namespace mindspore {
 namespace lite {
 enum SCHEMA_VERSION : int { SCHEMA_INVALID = -1, SCHEMA_CUR = 0, SCHEMA_V0 = 1 };
-class VersionManager {
- public:
-  static VersionManager *GetInstance() {
-    static VersionManager instance;
-    return &instance;
-  }
-  virtual ~VersionManager() = default;
-
-  void SetSchemaVersion(const int schema_version) { schema_version_ = schema_version; }
-  int GetSchemaVersion() const { return schema_version_; }
-  bool CheckV0Schema() const { return schema_version_ == SCHEMA_VERSION::SCHEMA_V0; }
-
- private:
-  VersionManager() = default;
-
- private:
-  int schema_version_ = SCHEMA_VERSION::SCHEMA_CUR;
-};
 }  // namespace lite
 }  // namespace mindspore
 #endif  // MINDSPORE_LITE_SRC_COMMON_VERSION_MANAGER_H_
diff --git a/mindspore/lite/src/cxx_api/converters.cc b/mindspore/lite/src/cxx_api/converters.cc
index b72bd82dc55..b5b7a748975 100644
--- a/mindspore/lite/src/cxx_api/converters.cc
+++ b/mindspore/lite/src/cxx_api/converters.cc
@@ -24,33 +24,10 @@
 #include "src/common/log_adapter.h"
 
 namespace mindspore {
-constexpr static int kMaxNumOfDevices = 2;
+constexpr static int kMaxNumOfDevices = 3;
 
-Status A2L_ConvertContext(Context *a_context, lite::Context *l_context) {
-  if ((a_context == nullptr) || (l_context == nullptr)) {
-    MS_LOG(ERROR) << "Invalid context pointers.";
-    return kLiteNullptr;
-  }
-
-  auto device_list = a_context->MutableDeviceInfo();
-  if (device_list.size() == 0) {
-    MS_LOG(ERROR) << "Invalid device list.";
-    return kLiteInputParamInvalid;
-  }
-  if (device_list.size() > kMaxNumOfDevices) {
-    MS_LOG(ERROR) << "Only CPU/CPU & GPU/CPU & NPU mode is supported.";
-    return kLiteInputParamInvalid;
-  }
-  l_context->thread_num_ = a_context->GetThreadNum();
-  l_context->enable_parallel_ = a_context->GetEnableParallel();
-  l_context->affinity_core_list_ = a_context->GetThreadAffinityCoreList();
-  l_context->device_list_.clear();
-  if (device_list[0]->GetDeviceType() != kCPU) {
-    MS_LOG(ERROR) << "CPU context must be enabled and in the first place of device list.";
-    return kLiteInputParamInvalid;
-  }
-
-  auto cpu_context = device_list[0]->Cast<CPUDeviceInfo>();
+Status AddCpuDevice(Context *a_context, lite::InnerContext *l_context, DeviceInfoContext *device) {
+  auto cpu_context = device->Cast<CPUDeviceInfo>();
   l_context->allocator = cpu_context->GetAllocator();
   if (l_context->allocator == nullptr) {
     l_context->allocator = Allocator::Create();
@@ -73,22 +50,65 @@ Status A2L_ConvertContext(Context *a_context, lite::Context *l_context) {
   cpu_info.cpu_device_info_ = {cpu_context->GetEnableFP16(), mode};
   l_context->device_list_.push_back({lite::DT_CPU, cpu_info, cpu_context->GetProvider(),
                                      cpu_context->GetProviderDevice(), cpu_context->GetAllocator()});
-  if (device_list.size() == kMaxNumOfDevices) {
-    lite::DeviceInfo device_info = {0};
-    if (device_list[1]->GetDeviceType() == kGPU) {
-      auto gpu_context = device_list[1]->Cast<GPUDeviceInfo>();
-      device_info.gpu_device_info_ = {gpu_context->GetEnableFP16()};
-      l_context->device_list_.push_back({lite::DT_GPU, device_info, gpu_context->GetProvider(),
-                                         gpu_context->GetProviderDevice(), gpu_context->GetAllocator()});
-    } else if (device_list[1]->GetDeviceType() == kKirinNPU) {
-      auto npu_context = device_list[1]->Cast<KirinNPUDeviceInfo>();
-      device_info.npu_device_info_ = {npu_context->GetFrequency()};
-      l_context->device_list_.push_back({lite::DT_NPU, device_info});
+  return kSuccess;
+}
+
+Status AddGpuDevice(Context *a_context, lite::InnerContext *l_context, DeviceInfoContext *device) {
+  lite::DeviceInfo device_info = {0};
+  auto gpu_context = device->Cast<GPUDeviceInfo>();
+  device_info.gpu_device_info_ = {gpu_context->GetEnableFP16()};
+  l_context->device_list_.push_back({lite::DT_GPU, device_info, gpu_context->GetProvider(),
+                                     gpu_context->GetProviderDevice(), gpu_context->GetAllocator()});
+  return kSuccess;
+}
+
+Status AddNpuDevice(Context *a_context, lite::InnerContext *l_context, DeviceInfoContext *device) {
+  lite::DeviceInfo device_info = {0};
+  auto npu_context = device->Cast<KirinNPUDeviceInfo>();
+  device_info.npu_device_info_ = {npu_context->GetFrequency()};
+  l_context->device_list_.push_back({lite::DT_NPU, device_info});
+  return kSuccess;
+}
+
+Status A2L_ConvertContext(Context *a_context, lite::InnerContext *l_context) {
+  if ((a_context == nullptr) || (l_context == nullptr)) {
+    MS_LOG(ERROR) << "Invalid context pointers.";
+    return kLiteNullptr;
+  }
+
+  auto device_list = a_context->MutableDeviceInfo();
+  if (device_list.size() == 0) {
+    MS_LOG(ERROR) << "Invalid device list.";
+    return kLiteInputParamInvalid;
+  }
+  if (device_list.size() > kMaxNumOfDevices) {
+    MS_LOG(ERROR) << "Device support Max: " << kMaxNumOfDevices;
+    return kLiteInputParamInvalid;
+  }
+  l_context->thread_num_ = a_context->GetThreadNum();
+  l_context->enable_parallel_ = a_context->GetEnableParallel();
+  l_context->affinity_core_list_ = a_context->GetThreadAffinityCoreList();
+  l_context->device_list_.clear();
+
+  Status error_code;
+  for (auto device : device_list) {
+    if (device->GetDeviceType() == kCPU) {
+      error_code = AddCpuDevice(a_context, l_context, device.get());
+    } else if (device->GetDeviceType() == kGPU) {
+      error_code = AddGpuDevice(a_context, l_context, device.get());
+    } else if (device->GetDeviceType() == kKirinNPU) {
+      error_code = AddNpuDevice(a_context, l_context, device.get());
     } else {
       MS_LOG(ERROR) << "Invalid device.";
       return kLiteInputParamInvalid;
     }
+
+    if (error_code != kSuccess) {
+      MS_LOG(ERROR) << "Add device failed!";
+      return error_code;
+    }
   }
+
   l_context->delegate = a_context->GetDelegate();
   return kSuccess;
 }
diff --git a/mindspore/lite/src/cxx_api/converters.h b/mindspore/lite/src/cxx_api/converters.h
index 8fd984a79cf..9a907c6be43 100644
--- a/mindspore/lite/src/cxx_api/converters.h
+++ b/mindspore/lite/src/cxx_api/converters.h
@@ -21,6 +21,7 @@
 #include "include/api/status.h"
 #include "include/api/types.h"
 #include "include/lite_types.h"
+#include "src/inner_context.h"
 
 namespace mindspore {
 
@@ -59,7 +60,7 @@ inline bool IsAffinityModeValid(int affinity_mode) {
   return affinity_mode >= lite::NO_BIND && affinity_mode <= lite::MID_CPU;
 }
 
-Status A2L_ConvertContext(Context *a_context, lite::Context *l_context);
+Status A2L_ConvertContext(Context *a_context, lite::InnerContext *l_context);
 
 Status A2L_ConvertConfig(const TrainCfg *a_train_cfg, lite::TrainCfg *l_train_cfg);
 }  // namespace mindspore
diff --git a/mindspore/lite/src/cxx_api/model/model_impl.cc b/mindspore/lite/src/cxx_api/model/model_impl.cc
index 0d69f65649a..6d1a9eba39d 100644
--- a/mindspore/lite/src/cxx_api/model/model_impl.cc
+++ b/mindspore/lite/src/cxx_api/model/model_impl.cc
@@ -45,19 +45,25 @@ CreateTrainSessionProto *CreateTrainSessionCallbackHolder(CreateTrainSessionProt
 Status ModelImpl::Build(const void *model_data, size_t data_size, ModelType model_type,
                         const std::shared_ptr<Context> &ms_context) {
   context_ = ms_context;
-  lite::Context lite_context;
-  auto status = A2L_ConvertContext(ms_context.get(), &lite_context);
+
+  lite::InnerContext *lite_context = new lite::InnerContext();
+  auto status = A2L_ConvertContext(ms_context.get(), lite_context);
   if (status != kSuccess) {
     return status;
   }
 
-  auto session = std::shared_ptr<session::LiteSession>(
-    session::LiteSession::CreateSession(static_cast<const char *>(model_data), data_size, &lite_context));
+  auto session = std::shared_ptr<session::LiteSession>(CreateLiteSession(lite_context));
   if (session == nullptr) {
     MS_LOG(ERROR) << "Allocate session failed.";
     return kLiteNullptr;
   }
 
+  auto ret = lite::LiteSession::CreateSessionByBuf(static_cast<const char *>(model_data), data_size, session.get());
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init session failed";
+    return kLiteError;
+  }
+
   session_.swap(session);
   MS_LOG(DEBUG) << "Build model success.";
   return kSuccess;
@@ -65,15 +71,21 @@ Status ModelImpl::Build(const void *model_data, size_t data_size, ModelType mode
 
 Status ModelImpl::Build(const std::string &model_path, ModelType model_type,
                         const std::shared_ptr<Context> &ms_context) {
-  lite::Context lite_context;
-  auto status = A2L_ConvertContext(ms_context.get(), &lite_context);
+  lite::InnerContext *lite_context = new lite::InnerContext();
+  auto status = A2L_ConvertContext(ms_context.get(), lite_context);
   if (status != kSuccess) {
     return status;
   }
 
-  auto session = std::shared_ptr<session::LiteSession>(lite::LiteSession::CreateSession(model_path, &lite_context));
+  auto session = std::shared_ptr<session::LiteSession>(CreateLiteSession(lite_context));
   if (session == nullptr) {
     MS_LOG(ERROR) << "Allocate session failed.";
+    return kLiteNullptr;
+  }
+
+  auto ret = lite::LiteSession::CreateSessionByPath(model_path, session.get());
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init session failed";
     return kLiteError;
   }
 
@@ -94,8 +106,8 @@ Status ModelImpl::Build() {
     return kLiteNullptr;
   }
 
-  lite::Context model_context;
-  auto status = A2L_ConvertContext(context_.get(), &model_context);
+  lite::InnerContext *lite_context = new lite::InnerContext();
+  auto status = A2L_ConvertContext(context_.get(), lite_context);
   if (status != kSuccess) {
     MS_LOG(ERROR) << "Failed to convert Context to Lite Context";
     return status;
@@ -103,7 +115,7 @@ Status ModelImpl::Build() {
 
   auto create_callback = CreateTrainSessionCallbackHolder();
   if (create_callback != nullptr) {
-    auto session = create_callback(graph_->graph_data_, cfg_, &model_context);
+    auto session = create_callback(graph_->graph_data_, cfg_, lite_context);
     if (session != nullptr) {
       session_ = session;
       MS_LOG(DEBUG) << "Build model success.";
@@ -116,7 +128,8 @@ Status ModelImpl::Build() {
     MS_LOG(ERROR) << "Lite model has been freed.";
     return kLiteError;
   }
-  auto session = std::shared_ptr<session::LiteSession>(session::LiteSession::CreateSession(&model_context));
+
+  auto session = std::shared_ptr<session::LiteSession>(CreateLiteSession(lite_context));
   if (session == nullptr) {
     MS_LOG(ERROR) << "Allocate session failed.";
     return kLiteNullptr;
@@ -210,7 +223,7 @@ Status ModelImpl::Predict(const std::vector<MSTensor> &inputs, std::vector<MSTen
     }
     old_data.push_back(input->data());
     if (input->data_type() == kObjectTypeString) {
-#ifdef ENABLE_STRING_KERNEL
+#ifndef STRING_KERNEL_CLIP
       std::vector<int32_t> shape = TruncateShape(user_input.Shape(), input->data_type(), user_input.DataSize(), false);
       if (shape.empty() && !(user_input.Shape().empty())) {
         ResetTensorData(old_data, input_tensors);
@@ -441,4 +454,21 @@ Status ModelImpl::Resize(const std::vector<MSTensor> &inputs, const std::vector<
   auto ret = session_->Resize(inner_input, truncated_shape);
   return static_cast<StatusCode>(ret);
 }
+
+session::LiteSession *ModelImpl::CreateLiteSession(lite::InnerContext *context) {
+  auto session = new (std::nothrow) lite::LiteSession();
+  if (session == nullptr) {
+    MS_LOG(ERROR) << "create session failed";
+    return nullptr;
+  }
+
+  auto ret = session->Init(context);
+  if (ret != mindspore::lite::RET_OK) {
+    MS_LOG(ERROR) << "init session failed";
+    delete session;
+    return nullptr;
+  }
+  return session;
+}
+
 }  // namespace mindspore
diff --git a/mindspore/lite/src/cxx_api/model/model_impl.h b/mindspore/lite/src/cxx_api/model/model_impl.h
index 0f1422d3e38..e0d55ff71e7 100644
--- a/mindspore/lite/src/cxx_api/model/model_impl.h
+++ b/mindspore/lite/src/cxx_api/model/model_impl.h
@@ -29,6 +29,7 @@
 #include "include/api/cell.h"
 #include "include/lite_session.h"
 #include "src/cxx_api/graph/graph_data.h"
+#include "src/inner_context.h"
 
 template <class T>
 void clearVectorOfPointers(std::vector<T> *v) {
@@ -44,7 +45,7 @@ namespace mindspore {
 
 typedef std::shared_ptr<session::LiteSession>(CreateTrainSessionProto)(std::shared_ptr<Graph::GraphData> graph_data,
                                                                        std::shared_ptr<TrainCfg> cfg,
-                                                                       lite::Context *context);
+                                                                       lite::InnerContext *context);
 CreateTrainSessionProto *CreateTrainSessionCallbackHolder(CreateTrainSessionProto *proto = nullptr);
 
 namespace session {
@@ -66,6 +67,8 @@ class ModelImpl {
   Status Predict(const std::vector<MSTensor> &inputs, std::vector<MSTensor> *outputs, const MSKernelCallBack &before,
                  const MSKernelCallBack &after);
 
+  static session::LiteSession *CreateLiteSession(lite::InnerContext *context);
+
   std::vector<MSTensor> GetInputs();
   std::vector<MSTensor> GetOutputs();
   MSTensor GetInputByTensorName(const std::string &name);
diff --git a/mindspore/lite/src/cxx_api/serialization.cc b/mindspore/lite/src/cxx_api/serialization.cc
index 13dd822c12f..d601c713af9 100644
--- a/mindspore/lite/src/cxx_api/serialization.cc
+++ b/mindspore/lite/src/cxx_api/serialization.cc
@@ -82,7 +82,7 @@ Status Serialization::Load(const std::vector<char> &file, ModelType model_type,
     MS_LOG(ERROR) << "graph is nullptr.";
     return kLiteNullptr;
   }
-  if (model_type != kFlatBuffer) {
+  if (model_type != kMindIR) {
     MS_LOG(ERROR) << "Unsupported IR.";
     return kLiteInputParamInvalid;
   }
diff --git a/mindspore/lite/src/cxx_api/tensor/tensor_impl.cc b/mindspore/lite/src/cxx_api/tensor/tensor_impl.cc
index d12ebd02722..4d444a4aa51 100644
--- a/mindspore/lite/src/cxx_api/tensor/tensor_impl.cc
+++ b/mindspore/lite/src/cxx_api/tensor/tensor_impl.cc
@@ -24,6 +24,7 @@
 #include <functional>
 #include "src/cxx_api/tensor_utils.h"
 #include "src/tensor.h"
+#include "include/lite_utils.h"
 
 namespace mindspore {
 using mindspore::lite::RET_OK;
@@ -55,9 +56,9 @@ std::shared_ptr<MSTensor::Impl> MSTensor::Impl::CreateTensorImpl(const std::stri
   return impl;
 }
 
+#ifndef STRING_KERNEL_CLIP
 std::shared_ptr<MSTensor::Impl> MSTensor::Impl::StringsToTensorImpl(const std::string &name,
                                                                     const std::vector<std::string> &str) {
-#ifdef ENABLE_STRING_KERNEL
   auto lite_tensor = new (std::nothrow) lite::Tensor();
   if (lite_tensor == nullptr) {
     MS_LOG(ERROR) << "Failed to allocate lite tensor.";
@@ -79,24 +80,16 @@ std::shared_ptr<MSTensor::Impl> MSTensor::Impl::StringsToTensorImpl(const std::s
   impl->set_own_data(true);
   impl->set_from_session(false);
   return impl;
-#else
-  MS_LOG(ERROR) << unsupport_string_tensor_log;
-  return nullptr;
-#endif
 }
 
 std::vector<std::string> MSTensor::Impl::TensorImplToStrings(const std::shared_ptr<Impl> &impl) {
   std::vector<std::string> empty;
-#ifdef ENABLE_STRING_KERNEL
   auto lite_tensor = impl->lite_tensor();
   if (lite_tensor == nullptr) {
     MS_LOG(ERROR) << "Invalid tensor impl.";
     return empty;
   }
   return lite::MSTensorToStrings(lite_tensor);
-#else
-  MS_LOG(ERROR) << unsupport_string_tensor_log;
-  return empty;
-#endif
 }
+#endif
 }  // namespace mindspore
diff --git a/mindspore/lite/src/cxx_api/tensor/tensor_impl.h b/mindspore/lite/src/cxx_api/tensor/tensor_impl.h
index f2f197b41a3..39de87c31d2 100644
--- a/mindspore/lite/src/cxx_api/tensor/tensor_impl.h
+++ b/mindspore/lite/src/cxx_api/tensor/tensor_impl.h
@@ -62,9 +62,11 @@ class MSTensor::Impl {
                                                        const std::vector<int64_t> &shape, const void *data,
                                                        size_t data_len);
 
+#ifndef STRING_KERNEL_CLIP
   static std::shared_ptr<Impl> MS_API StringsToTensorImpl(const std::string &name, const std::vector<std::string> &str);
 
   static std::vector<std::string> MS_API TensorImplToStrings(const std::shared_ptr<Impl> &impl);
+#endif
 
   virtual const std::string &Name() const {
     static std::string empty = "";
diff --git a/mindspore/lite/src/cxx_api/train/model_impl.cc b/mindspore/lite/src/cxx_api/train/model_impl.cc
index a40300b3248..abdb76c9ff7 100644
--- a/mindspore/lite/src/cxx_api/train/model_impl.cc
+++ b/mindspore/lite/src/cxx_api/train/model_impl.cc
@@ -27,7 +27,6 @@
 #include "include/api/metrics/metrics.h"
 #include "src/lite_model.h"
 #include "src/runtime/inner_allocator.h"
-#include "src/common/string_util.h"
 #include "src/cxx_api/converters.h"
 #include "src/cxx_api/graph/graph_data.h"
 #include "src/cxx_api/tensor/tensor_impl.h"
diff --git a/mindspore/lite/src/cxx_api/train/train_support.cc b/mindspore/lite/src/cxx_api/train/train_support.cc
index afbe9adc32c..fbb66e64870 100644
--- a/mindspore/lite/src/cxx_api/train/train_support.cc
+++ b/mindspore/lite/src/cxx_api/train/train_support.cc
@@ -25,8 +25,8 @@
 #include "include/api/callback/callback.h"
 #include "include/api/metrics/metrics.h"
 #include "src/lite_model.h"
+#include "src/inner_context.h"
 #include "src/runtime/inner_allocator.h"
-#include "src/common/string_util.h"
 #include "src/cxx_api/model/model_impl.h"
 #include "src/cxx_api/converters.h"
 #include "src/cxx_api/graph/graph_data.h"
@@ -41,7 +41,7 @@
 
 namespace mindspore {
 std::shared_ptr<session::LiteSession> CreateTrainSession(std::shared_ptr<Graph::GraphData> graph_data,
-                                                         std::shared_ptr<TrainCfg> cfg, lite::Context *context) {
+                                                         std::shared_ptr<TrainCfg> cfg, lite::InnerContext *context) {
   bool is_train_session = graph_data->IsTrainModel();
   if (is_train_session) {
     auto model = graph_data->lite_model();
diff --git a/mindspore/lite/src/cxx_api/types.cc b/mindspore/lite/src/cxx_api/types.cc
index aac33f13c7f..d90978777de 100644
--- a/mindspore/lite/src/cxx_api/types.cc
+++ b/mindspore/lite/src/cxx_api/types.cc
@@ -129,6 +129,7 @@ MSTensor *MSTensor::CreateDevTensor(const std::vector<char> &name, enum DataType
 }
 
 MSTensor *MSTensor::CharStringsToTensor(const std::vector<char> &name, const std::vector<std::vector<char>> &inputs) {
+#ifndef STRING_KERNEL_CLIP
   auto impl = Impl::StringsToTensorImpl(CharToString(name), VectorCharToString(inputs));
   if (impl == nullptr) {
     MS_LOG(ERROR) << "Allocate tensor impl failed.";
@@ -140,15 +141,25 @@ MSTensor *MSTensor::CharStringsToTensor(const std::vector<char> &name, const std
     return nullptr;
   }
   return ms_tensor;
+#else
+  MS_LOG(ERROR) << unsupport_string_tensor_log;
+  return nullptr;
+#endif
 }
 
 std::vector<std::vector<char>> MSTensor::TensorToStringChars(const MSTensor &tensor) {
+#ifndef STRING_KERNEL_CLIP
   if (tensor.impl_ == nullptr) {
     MS_LOG(ERROR) << "Invalid tensor.";
     std::vector<std::vector<char>> empty;
     return empty;
   }
   return VectorStringToChar(Impl::TensorImplToStrings(tensor.impl_));
+#else
+  std::vector<std::vector<char>> empty;
+  MS_LOG(ERROR) << unsupport_string_tensor_log;
+  return empty;
+#endif
 }
 
 MSTensor *MSTensor::Clone() const {
diff --git a/mindspore/lite/src/delegate/npu/npu_graph.cc b/mindspore/lite/src/delegate/npu/npu_graph.cc
index 4a924fbaf9a..656cf232696 100644
--- a/mindspore/lite/src/delegate/npu/npu_graph.cc
+++ b/mindspore/lite/src/delegate/npu/npu_graph.cc
@@ -191,10 +191,10 @@ std::vector<NPUOp *> NPUGraph::FindReadySubgraphOps(std::queue<NPUOp *> op_queue
       }
       auto input_ready = std::all_of(out_op->in_ops().begin(), out_op->in_ops().end(),
                                      [&](NPUOp *in_op) { return (*is_visited)[in_op] == true; });
-      if (input_ready && out_op->type() != schema::PrimitiveType_Transpose) {
-        op_queue.push(out_op);
-      } else {
+      if (out_op->type() == schema::PrimitiveType_Transpose) {
         next_candidate_ops->push(out_op);
+      } else if (input_ready) {
+        op_queue.push(out_op);
       }
     }
   }
diff --git a/mindspore/lite/src/delegate/npu/npu_subgraph.cc b/mindspore/lite/src/delegate/npu/npu_subgraph.cc
index e474c80a80e..92b6eb12e74 100644
--- a/mindspore/lite/src/delegate/npu/npu_subgraph.cc
+++ b/mindspore/lite/src/delegate/npu/npu_subgraph.cc
@@ -30,10 +30,9 @@
 #include "src/delegate/npu/npu_graph_utils.h"
 namespace mindspore {
 static std::set<mindspore::schema::PrimitiveType> npu_specific_weight_nodes = {
-  schema::PrimitiveType_Conv2DFusion,   schema::PrimitiveType_Conv2dTransposeFusion,
-  schema::PrimitiveType_ScaleFusion,    schema::PrimitiveType_BatchNorm,
-  schema::PrimitiveType_FullConnection, schema::PrimitiveType_InstanceNorm,
-  schema::PrimitiveType_TileFusion,     schema::PrimitiveType_PadFusion};
+  schema::PrimitiveType_Conv2DFusion, schema::PrimitiveType_Conv2dTransposeFusion, schema::PrimitiveType_PadFusion,
+  schema::PrimitiveType_BatchNorm,    schema::PrimitiveType_FullConnection,        schema::PrimitiveType_InstanceNorm,
+  schema::PrimitiveType_TileFusion};
 
 NPUSubGraph::~NPUSubGraph() {
   subgraph_input_ops_.clear();
diff --git a/mindspore/lite/src/delegate/npu/op/argmax_npu.cc b/mindspore/lite/src/delegate/npu/op/argmax_npu.cc
index 164cce84464..ad5684cab5c 100644
--- a/mindspore/lite/src/delegate/npu/op/argmax_npu.cc
+++ b/mindspore/lite/src/delegate/npu/op/argmax_npu.cc
@@ -31,7 +31,7 @@ int ArgmaxNPUOp::Init(const schema::Primitive *primitive, const std::vector<mind
     return RET_ERROR;
   }
 
-  auto axis_const_ = new (std::nothrow) hiai::op::Const(name_ + "_axis");
+  axis_const_ = new (std::nothrow) hiai::op::Const(name_ + "_axis");
   if (axis_const_ == nullptr) {
     MS_LOG(ERROR) << "New weight const failed.";
     return RET_ERROR;
diff --git a/mindspore/lite/src/delegate/npu/op/arithmetic_npu.cc b/mindspore/lite/src/delegate/npu/op/arithmetic_npu.cc
index 3d51b8dcbe6..f05339435ed 100644
--- a/mindspore/lite/src/delegate/npu/op/arithmetic_npu.cc
+++ b/mindspore/lite/src/delegate/npu/op/arithmetic_npu.cc
@@ -20,9 +20,9 @@ namespace mindspore {
 constexpr int ARITHMETIC_INPUT_NUM = 2;
 int ArithmeticNPUOp::IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
                                const std::vector<mindspore::MSTensor> &out_tensors) {
-  if (in_tensors[0].Shape() != in_tensors[1].Shape()) {
-    MS_LOG(WARNING) << name_ << " for the two inputs, the corresponding dimensions must have the same value."
-                    << " shape 1 is:" << in_tensors[0].Shape() << " shape 2 is:" << in_tensors[1].Shape();
+  if (in_tensors[0].Shape().size() != in_tensors[1].Shape().size()) {
+    MS_LOG(WARNING) << name_ << " for the two inputs, the dimension size must be same."
+                    << " size 1 is:" << in_tensors[0].Shape().size() << " size 2 is:" << in_tensors[1].Shape().size();
     return RET_NOT_SUPPORT;
   }
   auto type = primitive->value_type();
diff --git a/mindspore/lite/src/delegate/npu/op/batchnorm_npu.cc b/mindspore/lite/src/delegate/npu/op/batchnorm_npu.cc
index 52e6d62398b..1481149bd47 100644
--- a/mindspore/lite/src/delegate/npu/op/batchnorm_npu.cc
+++ b/mindspore/lite/src/delegate/npu/op/batchnorm_npu.cc
@@ -46,41 +46,41 @@ int BatchnormNPUOp::SetNPUInputs(const std::vector<mindspore::MSTensor> &in_tens
                                  const std::vector<mindspore::MSTensor> &out_tensors,
                                  const std::vector<ge::Operator *> &npu_inputs) {
   batchnorm_->set_input_x(*npu_inputs[0]);
-  auto scale = new (std::nothrow) hiai::op::Const(name_ + "_scale");
-  if (scale == nullptr) {
+  scale_ = new (std::nothrow) hiai::op::Const(name_ + "_scale");
+  if (scale_ == nullptr) {
     MS_LOG(ERROR) << "New scale const failed.";
     return RET_ERROR;
   }
   auto scale_tensor = ConverterToNPUTensor(in_tensors[SCALE_INDEX]);
-  scale->set_attr_value(scale_tensor);
-  batchnorm_->set_input_scale(*scale);
+  scale_->set_attr_value(scale_tensor);
+  batchnorm_->set_input_scale(*scale_);
 
-  auto offset = new (std::nothrow) hiai::op::Const(name_ + "_offset");
-  if (offset == nullptr) {
+  offset_ = new (std::nothrow) hiai::op::Const(name_ + "_offset");
+  if (offset_ == nullptr) {
     MS_LOG(ERROR) << "New offset const failed.";
     return RET_ERROR;
   }
   auto offset_tensor = ConverterToNPUTensor(in_tensors[OFFSET_INDEX]);
-  offset->set_attr_value(offset_tensor);
-  batchnorm_->set_input_offset(*offset);
+  offset_->set_attr_value(offset_tensor);
+  batchnorm_->set_input_offset(*offset_);
 
-  auto mean = new (std::nothrow) hiai::op::Const(name_ + "_mean");
-  if (mean == nullptr) {
+  mean_ = new (std::nothrow) hiai::op::Const(name_ + "_mean");
+  if (mean_ == nullptr) {
     MS_LOG(ERROR) << "New mean const failed.";
     return RET_ERROR;
   }
   auto mean_tensor = ConverterToNPUTensor(in_tensors[MEAN_INDEX]);
-  mean->set_attr_value(mean_tensor);
-  batchnorm_->set_input_mean(*mean);
+  mean_->set_attr_value(mean_tensor);
+  batchnorm_->set_input_mean(*mean_);
 
-  auto variance = new (std::nothrow) hiai::op::Const(name_ + "_variance");
-  if (variance == nullptr) {
+  variance_ = new (std::nothrow) hiai::op::Const(name_ + "_variance");
+  if (variance_ == nullptr) {
     MS_LOG(ERROR) << "New variance const failed.";
     return RET_ERROR;
   }
   auto variance_tensor = ConverterToNPUTensor(in_tensors[VARIANCE_INDEX]);
-  variance->set_attr_value(variance_tensor);
-  batchnorm_->set_input_variance(*variance);
+  variance_->set_attr_value(variance_tensor);
+  batchnorm_->set_input_variance(*variance_);
   return RET_OK;
 }
 
@@ -91,5 +91,21 @@ BatchnormNPUOp::~BatchnormNPUOp() {
     delete batchnorm_;
     batchnorm_ = nullptr;
   }
+  if (scale_ != nullptr) {
+    delete scale_;
+    scale_ = nullptr;
+  }
+  if (offset_ != nullptr) {
+    delete offset_;
+    offset_ = nullptr;
+  }
+  if (mean_ != nullptr) {
+    delete mean_;
+    mean_ = nullptr;
+  }
+  if (variance_ != nullptr) {
+    delete variance_;
+    variance_ = nullptr;
+  }
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/src/delegate/npu/op/batchnorm_npu.h b/mindspore/lite/src/delegate/npu/op/batchnorm_npu.h
index c88ac042525..ffb06cc005f 100644
--- a/mindspore/lite/src/delegate/npu/op/batchnorm_npu.h
+++ b/mindspore/lite/src/delegate/npu/op/batchnorm_npu.h
@@ -18,6 +18,7 @@
 
 #include <vector>
 #include <string>
+#include "include/graph/op/all_ops.h"
 #include "include/graph/compatible/all_ops.h"
 #include "src/delegate/npu/op/npu_op.h"
 
@@ -46,6 +47,10 @@ class BatchnormNPUOp : public NPUOp {
 
  private:
   ge::op::BatchNormExt2 *batchnorm_ = nullptr;
+  hiai::op::Const *scale_ = nullptr;
+  hiai::op::Const *offset_ = nullptr;
+  hiai::op::Const *mean_ = nullptr;
+  hiai::op::Const *variance_ = nullptr;
 };
 }  // namespace mindspore
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_BATCHNORM_NPU_H_
diff --git a/mindspore/lite/src/delegate/npu/op/deconvolution_npu.cc b/mindspore/lite/src/delegate/npu/op/deconvolution_npu.cc
index a07fe461955..32beb1aa91c 100644
--- a/mindspore/lite/src/delegate/npu/op/deconvolution_npu.cc
+++ b/mindspore/lite/src/delegate/npu/op/deconvolution_npu.cc
@@ -108,14 +108,6 @@ int DeconvolutionNPUOp::SetNPUInputs(const std::vector<mindspore::MSTensor> &in_
     deconv_->set_input_bias(*bias_);
   }
   deconv_->set_input_x(*npu_inputs[0]);
-
-  if (act_type_ != schema::ActivationType_NO_ACTIVATION) {
-    ret = SetActivation(deconv_, act_type_);
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "New activation npu operator for op " << name_ << " failed.";
-      return RET_ERROR;
-    }
-  }
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/delegate/npu/op/fullconnection_npu.cc b/mindspore/lite/src/delegate/npu/op/fullconnection_npu.cc
index 94dc7d544de..3c9533edc79 100644
--- a/mindspore/lite/src/delegate/npu/op/fullconnection_npu.cc
+++ b/mindspore/lite/src/delegate/npu/op/fullconnection_npu.cc
@@ -43,7 +43,7 @@ int FullconnectionNPUOp::Init(const schema::Primitive *primitive, const std::vec
   }
   reshape_op_ = new (std::nothrow) hiai::op::Const(name_ + "_reshape_data");
   vector<int> reshape_data = {static_cast<int>(input_shape[0]), col};
-  ge::TensorDesc reshape_tensor_desc(ge::Shape({FC_INPUT_DIM}), ge::FORMAT_NCHW, ge::DT_FLOAT);
+  ge::TensorDesc reshape_tensor_desc(ge::Shape({FC_INPUT_DIM}), ge::FORMAT_NCHW, ge::DT_INT32);
   ge::TensorPtr reshape_tensor = std::make_shared<hiai::Tensor>(reshape_tensor_desc);
   reshape_tensor->SetData(reinterpret_cast<uint8_t *>(reshape_data.data()), FC_INPUT_DIM * sizeof(int32_t));
   reshape_op_->set_attr_value(reshape_tensor);
diff --git a/mindspore/lite/src/delegate/npu/op/reduce_npu.cc b/mindspore/lite/src/delegate/npu/op/reduce_npu.cc
index 85419f1ddab..a04adf116bd 100644
--- a/mindspore/lite/src/delegate/npu/op/reduce_npu.cc
+++ b/mindspore/lite/src/delegate/npu/op/reduce_npu.cc
@@ -45,7 +45,7 @@ int ReduceNPUOp::Init(const schema::Primitive *primitive, const std::vector<mind
   }
   if (reduce_mode_ == schema::ReduceMode_ReduceMean) {
     auto reduce_mean = new (std::nothrow) hiai::op::ReduceMean(name_);
-    if (reduce_ == nullptr) {
+    if (reduce_mean == nullptr) {
       MS_LOG(ERROR) << "New reduce operator for op " << name_ << " failed.";
       return RET_ERROR;
     }
diff --git a/mindspore/lite/src/delegate/npu/op/scale_npu.cc b/mindspore/lite/src/delegate/npu/op/scale_npu.cc
index 74ccb1f549c..92430e0b2cf 100644
--- a/mindspore/lite/src/delegate/npu/op/scale_npu.cc
+++ b/mindspore/lite/src/delegate/npu/op/scale_npu.cc
@@ -15,9 +15,11 @@
  */
 
 #include "src/delegate/npu/op/scale_npu.h"
+#include <memory>
 #include "src/delegate/npu/npu_converter_utils.h"
 
 namespace mindspore {
+constexpr int INPUT_INDEX = 0;
 constexpr int SCALE_INDEX = 1;
 constexpr int BIAS_INDEX = 2;
 
@@ -25,28 +27,37 @@ int ScaleNPUOp::IsSupport(const schema::Primitive *primitive, const std::vector<
                           const std::vector<mindspore::MSTensor> &out_tensors) {
   auto scale_prim = primitive->value_as_ScaleFusion();
   if (scale_prim == nullptr) {
-    MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
+    MS_LOG(ERROR) << "Get null primitive value for op: " << name_;
     return RET_ERROR;
   }
   axis_ = scale_prim->axis();
   if (axis_ < 0) {
-    axis_ = axis_ + in_tensors[0].Shape().size();
+    axis_ = axis_ + in_tensors[INPUT_INDEX].Shape().size();
   }
   if (axis_ != NHWC_C && axis_ != NCHW_C) {
-    MS_LOG(WARNING) << "Npu scale axis attr only support 1 or channel, now is " << axis_;
-    return RET_NOT_SUPPORT;
+    if (in_tensors.size() <= BIAS_INDEX) {
+      MS_LOG(INFO) << "Npu Scale op does not support axis: " << axis_ << ", try to convert to Mul op.";
+      use_mul_ = true;
+    } else {
+      MS_LOG(WARNING) << "Npu Scale axis attr only support 1 or channel, now is " << axis_;
+      return RET_NOT_SUPPORT;
+    }
   }
   return RET_OK;
 }
 
 int ScaleNPUOp::Init(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
                      const std::vector<mindspore::MSTensor> &out_tensors) {
-  op_ = new (std::nothrow) hiai::op::Scale(name_);
+  if (!use_mul_) {
+    // note that Scale only support the default axis(i.e., 1), setting axis is meaningless.
+    op_ = new (std::nothrow) hiai::op::Scale(name_);
+  } else {
+    op_ = new (std::nothrow) hiai::op::Mul(name_);
+  }
   if (op_ == nullptr) {
     MS_LOG(ERROR) << name_ << " op is nullptr";
     return RET_ERROR;
   }
-  op_->set_attr_axis(1);  // only support axis 1 now
 
   auto scale_prim = primitive->value_as_ScaleFusion();
   if (scale_prim == nullptr) {
@@ -67,40 +78,20 @@ int ScaleNPUOp::Init(const schema::Primitive *primitive, const std::vector<minds
 int ScaleNPUOp::SetNPUInputs(const std::vector<mindspore::MSTensor> &in_tensors,
                              const std::vector<mindspore::MSTensor> &out_tensors,
                              const std::vector<ge::Operator *> &npu_inputs) {
-  op_->set_input_x(*npu_inputs.at(0));
   MS_ASSERT(in_tensors.size() > SCALE_INDEX);
-  auto scale_shape = in_tensors[SCALE_INDEX].Shape();
-  auto scale_tensor = ConverterToNPUTensor(in_tensors[SCALE_INDEX]);
-  if (scale_tensor == nullptr) {
-    MS_LOG(ERROR) << "Get scale_tensor failed.";
-    return RET_ERROR;
+  if (use_mul_) {
+    auto ret = ConvertScaleToMul(npu_inputs, op_, in_tensors);
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "Convert Scale to Mul failed, op name: " << name_;
+    }
+    return ret;
   }
-  scale_tensor->SetTensorDesc(ge::TensorDesc(ConverterToNPUShape({1, scale_shape[0], 1, 1})));
-
-  scale_ = new (std::nothrow) hiai::op::Const(name_ + "_scale");
-  if (scale_ == nullptr) {
-    MS_LOG(ERROR) << "New scale_ const failed.";
-    return RET_ERROR;
-  }
-  scale_->set_attr_value(scale_tensor);
-  op_->set_input_scale(*scale_);
 
+  auto scale_op = reinterpret_cast<hiai::op::Scale *>(op_);
+  scale_op->set_input_x(*npu_inputs.at(INPUT_INDEX));
+  scale_op->set_input_scale(*npu_inputs.at(SCALE_INDEX));
   if (in_tensors.size() > BIAS_INDEX && in_tensors[BIAS_INDEX] != nullptr) {
-    auto bias_shape = in_tensors[BIAS_INDEX].Shape();
-    auto bias_tensor = ConverterToNPUTensor(in_tensors[BIAS_INDEX]);
-    if (bias_tensor == nullptr) {
-      MS_LOG(ERROR) << "Get bias_tensor failed.";
-      return RET_ERROR;
-    }
-    scale_tensor->SetTensorDesc(ge::TensorDesc(ConverterToNPUShape({1, bias_shape[0], 1, 1})));
-
-    bias_ = new (std::nothrow) hiai::op::Const(name_ + "_beta");
-    if (bias_ == nullptr) {
-      MS_LOG(ERROR) << "New beta_ const failed.";
-      return RET_ERROR;
-    }
-    bias_->set_attr_value(bias_tensor);
-    op_->set_input_bias(*bias_);
+    scale_op->set_input_bias(*npu_inputs.at(BIAS_INDEX));
   }
   return RET_OK;
 }
@@ -130,6 +121,45 @@ int ScaleNPUOp::SetActivation(const ge::Operator *input) {
   return RET_OK;
 }
 
+int ScaleNPUOp::ConvertScaleToMul(const std::vector<ge::Operator *> &npu_inputs, ge::Operator *cur_op,
+                                  const std::vector<mindspore::MSTensor> &in_tensors) {
+  auto input_shape = in_tensors[INPUT_INDEX].Shape();
+  auto scale_shape = in_tensors[SCALE_INDEX].Shape();
+  auto mul_op = reinterpret_cast<hiai::op::Mul *>(cur_op);
+  mul_op->set_input_x1(*npu_inputs.at(INPUT_INDEX));
+  if (input_shape.size() == scale_shape.size()) {
+    mul_op->set_input_x2(*npu_inputs.at(SCALE_INDEX));
+  } else {
+    int valid_shape[4] = {1, 1, 1, 1};
+    for (size_t i = 0; i < scale_shape.size(); i++) {
+      valid_shape[axis_ + i] = static_cast<int>(scale_shape[i]);
+    }
+    reshape_ = new (std::nothrow) hiai::op::Reshape(name_ + "_reshape");
+    if (reshape_ == nullptr) {
+      MS_LOG(ERROR) << "New Reshape npu operator for op " << name_ << " failed.";
+      return RET_ERROR;
+    }
+    std::shared_ptr<ge::Tensor> shape_tensor = std::make_shared<ge::Tensor>();
+    if (shape_tensor == nullptr) {
+      MS_LOG(ERROR) << "new shape_tensor failed.";
+      return RET_ERROR;
+    }
+    ge::TensorDesc tensor_desc(ge::Shape({NPU_SHAPE_SIZE}), ge::FORMAT_ND, ge::DT_INT32);
+    shape_tensor->SetTensorDesc(tensor_desc);
+    shape_tensor->SetData(reinterpret_cast<const uint8_t *>(valid_shape), NPU_SHAPE_SIZE * sizeof(int));
+    shape_ = new (std::nothrow) hiai::op::Const(name_ + "_reshape_1");
+    if (shape_ == nullptr) {
+      MS_LOG(ERROR) << "New shape const for op " << name_ << " failed.";
+      return RET_ERROR;
+    }
+    shape_->set_attr_value(shape_tensor);
+    reshape_->set_input_x(*npu_inputs.at(SCALE_INDEX));
+    reshape_->set_input_shape(*shape_);
+    mul_op->set_input_x2(*reshape_);
+  }
+  return RET_OK;
+}
+
 ScaleNPUOp::~ScaleNPUOp() {
   if (op_ != nullptr) {
     delete op_;
@@ -147,5 +177,13 @@ ScaleNPUOp::~ScaleNPUOp() {
     delete act_;
     act_ = nullptr;
   }
+  if (reshape_ != nullptr) {
+    delete reshape_;
+    reshape_ = nullptr;
+  }
+  if (shape_ != nullptr) {
+    delete shape_;
+    shape_ = nullptr;
+  }
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/src/delegate/npu/op/scale_npu.h b/mindspore/lite/src/delegate/npu/op/scale_npu.h
index 6bb0df009e9..04b75d868dd 100644
--- a/mindspore/lite/src/delegate/npu/op/scale_npu.h
+++ b/mindspore/lite/src/delegate/npu/op/scale_npu.h
@@ -48,11 +48,17 @@ class ScaleNPUOp : public NPUOp {
  private:
   int SetActivation(const ge::Operator *input);
 
+  int ConvertScaleToMul(const std::vector<ge::Operator *> &npu_inputs, ge::Operator *cur_op,
+                        const std::vector<mindspore::MSTensor> &in_tensors);
+
   int axis_ = 0;
+  bool use_mul_ = false;
   schema::ActivationType act_type_ = schema::ActivationType_NO_ACTIVATION;
-  hiai::op::Scale *op_ = nullptr;
+  ge::Operator *op_ = nullptr;
+  hiai::op::Reshape *reshape_ = nullptr;
   hiai::op::Const *scale_ = nullptr;
   hiai::op::Const *bias_ = nullptr;
+  hiai::op::Const *shape_ = nullptr;
   hiai::op::Activation *act_ = nullptr;
 };
 }  // namespace mindspore
diff --git a/mindspore/lite/src/delegate/npu/pass/npu_fusion_pass.cc b/mindspore/lite/src/delegate/npu/pass/npu_fusion_pass.cc
index f4edec93f63..7fbb72addfe 100644
--- a/mindspore/lite/src/delegate/npu/pass/npu_fusion_pass.cc
+++ b/mindspore/lite/src/delegate/npu/pass/npu_fusion_pass.cc
@@ -431,6 +431,7 @@ int NPUFusionPass::Run(NPUGraph *subgraph) {
           ret = StridedSliceFusion(cur_op);
           continue;
         case schema::PrimitiveType_AddFusion:
+        case schema::PrimitiveType_MulFusion:
         case schema::PrimitiveType_Activation:
         case schema::PrimitiveType_Eltwise:
           i -= cur_op->in_ops().size();
diff --git a/mindspore/lite/src/delegate/npu/pass/npu_insert_transform_pass.cc b/mindspore/lite/src/delegate/npu/pass/npu_insert_transform_pass.cc
index 9322c5ccb14..118e5dc0838 100644
--- a/mindspore/lite/src/delegate/npu/pass/npu_insert_transform_pass.cc
+++ b/mindspore/lite/src/delegate/npu/pass/npu_insert_transform_pass.cc
@@ -25,9 +25,9 @@ using mindspore::lite::RET_OK;
 namespace mindspore {
 enum InsertState { InsertNone, PreInsert, PostInsert, BothInsert };
 std::set<mindspore::schema::PrimitiveType> insert_nodes = {
-  schema::PrimitiveType_Concat,      schema::PrimitiveType_AddFusion, schema::PrimitiveType_Eltwise,
-  schema::PrimitiveType_Activation,  schema::PrimitiveType_Split,     schema::PrimitiveType_PadFusion,
-  schema::PrimitiveType_StridedSlice};
+  schema::PrimitiveType_Concat,       schema::PrimitiveType_AddFusion, schema::PrimitiveType_Eltwise,
+  schema::PrimitiveType_Activation,   schema::PrimitiveType_Split,     schema::PrimitiveType_PadFusion,
+  schema::PrimitiveType_StridedSlice, schema::PrimitiveType_MulFusion};
 
 // this pass goal is to minimize subgraphs generated
 // by inserting nchw2nhwc or nhwc2nchw before or after the operator (e.g. concat, add, etc..) together with
@@ -167,8 +167,7 @@ int NPUInsertTransformPass::InsertNode(NPUOp *op, NPUOp *post_op, size_t post_in
   } else {
     // post_op nullptr mean output, we remain graph output tensor name unchanged
     auto graph_output_name = in_tensor.Name();
-    in_tensor.SetTensorName(graph_output_name + "_before_" + name_);
-    nc2nh_tensor->SetTensorName(graph_output_name);
+    nc2nh_tensor->SetTensorName(graph_output_name + "_after_" + name_);
   }
   return RET_OK;
 }
diff --git a/mindspore/lite/src/delegate/tensorrt/CMakeLists.txt b/mindspore/lite/src/delegate/tensorrt/CMakeLists.txt
index ccf4b2b9a3f..4f6e39adec9 100644
--- a/mindspore/lite/src/delegate/tensorrt/CMakeLists.txt
+++ b/mindspore/lite/src/delegate/tensorrt/CMakeLists.txt
@@ -6,17 +6,31 @@ file(GLOB_RECURSE TENSORRT_RUNTIME_SRC
         ${CMAKE_CURRENT_SOURCE_DIR}/../delegate_utils.cc
         )
 add_library(libcudart SHARED IMPORTED)
-set_target_properties(libcudart PROPERTIES IMPORTED_LOCATION
-        ${CUDA_LIB_PATH}/libcudart.so)
+set_target_properties(libcudart PROPERTIES IMPORTED_LOCATION ${CUDA_LIB_PATH}/libcudart.so)
+
+add_library(libcudnn SHARED IMPORTED)
+set_target_properties(libcudnn PROPERTIES IMPORTED_LOCATION ${CUDA_LIB_PATH}/libcudnn.so)
+
+add_library(libnvrtc SHARED IMPORTED)
+set_target_properties(libnvrtc PROPERTIES IMPORTED_LOCATION ${CUDA_LIB_PATH}/libnvrtc.so)
+
+add_library(libcublas SHARED IMPORTED)
+set_target_properties(libcublas PROPERTIES IMPORTED_LOCATION ${CUDA_LIB_PATH}/libcublas.so)
+
+add_library(libcublasLt SHARED IMPORTED)
+set_target_properties(libcublasLt PROPERTIES IMPORTED_LOCATION ${CUDA_LIB_PATH}/libcublasLt.so)
 
 add_library(libnvinfer SHARED IMPORTED)
-set_target_properties(libnvinfer PROPERTIES IMPORTED_LOCATION
-        ${TENSORRT_LIB_PATH}/libnvinfer.so)
+set_target_properties(libnvinfer PROPERTIES IMPORTED_LOCATION ${TENSORRT_LIB_PATH}/libnvinfer.so)
 
 add_library(tensorrt_kernel_mid OBJECT ${TENSORRT_RUNTIME_SRC})
 add_dependencies(tensorrt_kernel_mid fbs_src)
 target_link_libraries(
         tensorrt_kernel_mid
         libcudart
+        libcudnn
+        libnvrtc
+        libcublas
+        libcublasLt
         libnvinfer
 )
diff --git a/mindspore/lite/src/inner_context.cc b/mindspore/lite/src/inner_context.cc
index b225d6b2970..5044c1dc7af 100644
--- a/mindspore/lite/src/inner_context.cc
+++ b/mindspore/lite/src/inner_context.cc
@@ -17,7 +17,7 @@
 #include <algorithm>
 #include "include/errorcode.h"
 #include "src/common/log_adapter.h"
-#include "src/common/utils.h"
+#include "src/common/log_util.h"
 #ifdef SUPPORT_NPU
 #include "include/HiAiModelManagerType.h"
 #endif
@@ -28,6 +28,8 @@
 namespace mindspore::lite {
 namespace {
 constexpr int kDefaultParallelNum = 2;
+const constexpr int kMaxLiteContextDeviceNums = 2;
+const constexpr int kMaxInnerContextDeviceNums = 3;
 }  // namespace
 
 InnerContext::InnerContext(const Context *context) {
@@ -45,24 +47,49 @@ InnerContext::InnerContext(const Context *context) {
 }
 
 void InnerContext::SetContextDevice(const Context *context) {
+  MS_ASSERT(context->device_list_.size() <= kMaxLiteContextDeviceNums);
+
+  this->device_list_.clear();
+
+  /* user set order for different device */
+  if (context->device_list_.size() < kMaxLiteContextDeviceNums) {
+    this->device_list_.push_back(context->device_list_.front());
+    return;
+  }
+
+  /* keep compatibility :
+   * if user set CPU & NPU/GPU
+   * NPU/GPU higher priority */
   bool isUserSetNPU = context->device_list_.end() !=
-                      std::find_if(context->device_list_.begin(), context->device_list_.end(),
+                      std::find_if(this->device_list_.begin(), this->device_list_.end(),
                                    [](const DeviceContext &device) { return device.device_type_ == DT_NPU; });
   bool isUserSetGPU = context->device_list_.end() !=
-                      std::find_if(context->device_list_.begin(), context->device_list_.end(),
+                      std::find_if(this->device_list_.begin(), this->device_list_.end(),
                                    [](const DeviceContext &device) { return device.device_type_ == DT_GPU; });
-  this->device_list_.clear();
+  if (isUserSetGPU == false && isUserSetNPU == false) {
+    return;
+  }
+
+  /* add GPU/NPU first */
   for (auto &device_ctx : context->device_list_) {
-    // npu/gpu server would use one core so we don't bind core to avoid competition.
-    // If user does not set npu/gpu device, we still bind core.
-    if (device_ctx.device_type_ == DT_CPU && (isUserSetNPU || (isUserSetGPU && !enable_parallel_))) {
-      auto cpu_ctx = device_ctx;
-      cpu_ctx.device_info_.cpu_device_info_.cpu_bind_mode_ = NO_BIND;
-      this->device_list_.push_back(cpu_ctx);
-    } else {
+    if (device_ctx.device_type_ != DT_CPU) {
       this->device_list_.push_back(device_ctx);
     }
   }
+
+  /* add CPU */
+  for (auto &device_ctx : context->device_list_) {
+    if (device_ctx.device_type_ == DT_CPU) {
+      if (isUserSetNPU || (isUserSetGPU && enable_parallel_ == false)) {
+        auto cpu_ctx = device_ctx;
+        cpu_ctx.device_info_.cpu_device_info_.cpu_bind_mode_ = NO_BIND;
+        this->device_list_.push_back(cpu_ctx);
+      } else {
+        this->device_list_.push_back(device_ctx);
+      }
+    }
+  }
+  return;
 }
 
 int InnerContext::Init() {
@@ -130,7 +157,7 @@ int InnerContext::IsValid() const {
     MS_LOG(ERROR) << "Device list is empty.";
     return RET_NOT_SUPPORT;
   }
-  if (this->device_list_.size() > kMaxDeviceNums) {
+  if (this->device_list_.size() > kMaxInnerContextDeviceNums) {
     MS_LOG(ERROR) << "Not support device list more than 2.";
     return RET_NOT_SUPPORT;
   }
@@ -205,7 +232,6 @@ bool InnerContext::IsGpuEnabled() const {
 
 bool InnerContext::IsNpuEnabled() const {
 #ifdef SUPPORT_NPU
-  //  return IsUserSetNpu() && npu_manager_->IsSupportNPU();
   return IsUserSetNpu();
 #else
   return false;
diff --git a/mindspore/lite/src/inner_context.h b/mindspore/lite/src/inner_context.h
index d3ed51e16d6..bd5e36211f4 100644
--- a/mindspore/lite/src/inner_context.h
+++ b/mindspore/lite/src/inner_context.h
@@ -26,7 +26,6 @@
 #endif
 
 namespace mindspore::lite {
-const constexpr int kMaxDeviceNums = 2;
 struct InnerContext : public Context {
  public:
   InnerContext() = default;
@@ -82,7 +81,6 @@ struct InnerContext : public Context {
 };
 
 int ParallelLaunch(const Context *context, const Func &func, Content content, int task_num);
-
 }  // namespace mindspore::lite
 
 #endif  // MINDSPORE_LITE_SRC_INNER_CONTEXT_H
diff --git a/mindspore/lite/src/inner_kernel.h b/mindspore/lite/src/inner_kernel.h
index 8f41a07b260..08a6b94ecb1 100644
--- a/mindspore/lite/src/inner_kernel.h
+++ b/mindspore/lite/src/inner_kernel.h
@@ -164,7 +164,7 @@ class InnerKernel : public Kernel {
   void set_registry_data_type(TypeId data_type) { registry_data_type_ = data_type; }
 
   void set_workspace_size(size_t value) { workspace_size_ = value; }
-  size_t workspace_size() { return workspace_size_; }
+  virtual size_t workspace_size() { return workspace_size_; }
   void AllocWorkspace();
   void FreeWorkspace();
   void *workspace() { return workspace_; }
diff --git a/mindspore/lite/src/kernel_registry.cc b/mindspore/lite/src/kernel_registry.cc
index 5da2327ceff..43c2b477d8e 100644
--- a/mindspore/lite/src/kernel_registry.cc
+++ b/mindspore/lite/src/kernel_registry.cc
@@ -17,8 +17,9 @@
 #include <utility>
 #include <memory>
 #include "include/errorcode.h"
+#ifndef CUSTOM_KERNEL_REGISTRY_CLIP
 #include "include/registry/register_kernel.h"
-#include "src/registry/register_utils.h"
+#endif
 #include "src/ops/populate/populate_register.h"
 #include "src/common/version_manager.h"
 #include "nnacl/pooling_parameter.h"
@@ -32,24 +33,29 @@
 #endif
 #include "src/common/tensor_util.h"
 
-using mindspore::kernel::CreateKernel;
 using mindspore::kernel::kBuiltin;
 using mindspore::kernel::kCPU;
 using mindspore::kernel::KERNEL_ARCH;
 using mindspore::kernel::KernelCreator;
 using mindspore::kernel::KernelKey;
+#ifndef CUSTOM_KERNEL_REGISTRY_CLIP
+using mindspore::registry::CreateKernel;
+using mindspore::registry::KernelDesc;
+#endif
 
 namespace mindspore::lite {
+#ifndef CUSTOM_KERNEL_REGISTRY_CLIP
 namespace {
 const char *const kArchCPU = "CPU";
-void KernelKeyToKernelDesc(const KernelKey &key, kernel::KernelDesc *desc) {
+void KernelKeyToKernelDesc(const KernelKey &key, KernelDesc *desc) {
   MS_ASSERT(desc != nullptr);
-  desc->data_type = key.data_type;
+  desc->data_type = static_cast<DataType>(key.data_type);
   desc->type = key.type;
   desc->arch = key.kernel_arch;
   desc->provider = key.provider;
 }
 }  // namespace
+#endif
 
 void KernelRegistry::CreatorArraysInit() {
   std::unique_lock<std::mutex> malloc_creator_array(lock_);
@@ -132,14 +138,15 @@ bool KernelRegistry::SupportKernel(const KernelKey &key) {
   return kernel_creator != nullptr;
 }
 
+#ifndef CUSTOM_KERNEL_REGISTRY_CLIP
 int KernelRegistry::GetCustomKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
                                     const mindspore::Context *ms_ctx, const kernel::KernelKey &key,
                                     kernel::LiteKernel **kernel, const void *primitive) {
   MS_ASSERT(ms_ctx != nullptr);
   MS_ASSERT(kernel != nullptr);
-  kernel::KernelDesc desc;
+  KernelDesc desc;
   KernelKeyToKernelDesc(key, &desc);
-  CreateKernel creator = kernel::RegisterUtils::GetCreator(static_cast<const schema::Primitive *>(primitive), &desc);
+  auto creator = registry::RegisterKernel::GetCreator(static_cast<const schema::Primitive *>(primitive), &desc);
   if (creator == nullptr) {
     return RET_NOT_SUPPORT;
   }
@@ -162,13 +169,16 @@ int KernelRegistry::GetCustomKernel(const std::vector<Tensor *> &in_tensors, con
   }
   return RET_ERROR;
 }
+#endif
 
 int KernelRegistry::GetKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
                               const InnerContext *ctx, const mindspore::Context *ms_ctx, const kernel::KernelKey &key,
                               OpParameter *parameter, kernel::LiteKernel **kernel, const void *primitive) {
   MS_ASSERT(ctx != nullptr);
   MS_ASSERT(kernel != nullptr);
+#ifndef CUSTOM_KERNEL_REGISTRY_CLIP
   if (key.provider == kBuiltin) {
+#endif
     auto creator = GetCreator(key);
     if (creator != nullptr) {
       auto inner_kernel = creator(in_tensors, out_tensors, parameter, ctx, key);
@@ -185,6 +195,7 @@ int KernelRegistry::GetKernel(const std::vector<Tensor *> &in_tensors, const std
       }
       return RET_ERROR;
     }
+#ifndef CUSTOM_KERNEL_REGISTRY_CLIP
   } else {
     auto ret = GetCustomKernel(in_tensors, out_tensors, ms_ctx, key, kernel, primitive);
     if (ret == RET_OK) {
@@ -192,6 +203,7 @@ int KernelRegistry::GetKernel(const std::vector<Tensor *> &in_tensors, const std
     }
     return ret;
   }
+#endif
   return RET_NOT_SUPPORT;
 }
 }  // namespace mindspore::lite
diff --git a/mindspore/lite/src/kernel_registry.h b/mindspore/lite/src/kernel_registry.h
index af480d3b844..293c10f64a0 100644
--- a/mindspore/lite/src/kernel_registry.h
+++ b/mindspore/lite/src/kernel_registry.h
@@ -47,9 +47,11 @@ class KernelRegistry {
                 OpParameter *op_parameter, kernel::LiteKernel **kernel, const void *primitive = nullptr);
 
  protected:
+#ifndef CUSTOM_KERNEL_REGISTRY_CLIP
   int GetCustomKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
                       const mindspore::Context *ctx, const kernel::KernelKey &key, kernel::LiteKernel **kernel,
                       const void *primitive = nullptr);
+#endif
   static const int device_type_length_{kKernelArch_MAX - kKernelArch_MIN + 1};
   static const int data_type_length_{kNumberTypeEnd - kNumberTypeBegin + 1};
   static const int op_type_length_{PrimitiveType_MAX - PrimitiveType_MIN + 1};
diff --git a/mindspore/lite/src/lite_kernel.h b/mindspore/lite/src/lite_kernel.h
index b539849d81f..88e177d949c 100644
--- a/mindspore/lite/src/lite_kernel.h
+++ b/mindspore/lite/src/lite_kernel.h
@@ -35,7 +35,9 @@
 #include "include/api/kernel.h"
 #include "src/cxx_api/tensor/tensor_impl.h"
 #include "src/inner_kernel.h"
+#ifndef DELEGATE_CLIP
 #include "include/api/delegate.h"
+#endif
 
 namespace mindspore::kernel {
 enum KERNEL_ARCH { kCPU, kGPU, kAPU, kNPU, kCustom, kDelegate, kKernelArch_MIN = kCPU, kKernelArch_MAX = kAPU };
@@ -47,8 +49,9 @@ struct KernelKey {
   int type = 0;
   std::string kernel_arch;
   std::string provider{kBuiltin};
+#ifndef DELEGATE_CLIP
   std::shared_ptr<Delegate> delegate = nullptr;
-
+#endif
   bool operator<(const KernelKey &dst) const {
     if (provider != dst.provider) {
       return provider < dst.provider;
diff --git a/mindspore/lite/src/lite_kernel_util.cc b/mindspore/lite/src/lite_kernel_util.cc
index 0fac1ba5903..a9d7e3c2c10 100644
--- a/mindspore/lite/src/lite_kernel_util.cc
+++ b/mindspore/lite/src/lite_kernel_util.cc
@@ -196,11 +196,13 @@ void LiteKernelUtil::InitTensorInitRefCount(const std::vector<kernel::LiteKernel
 
 int LiteKernelUtil::SetInput(const LiteKernel &kernelMod, const std::vector<lite::Tensor *> &inputs) { return -1; }
 
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
 bool LiteKernelUtil::IsSwitchCall(kernel::LiteKernel *kernel) {
+#ifndef DELEGATE_CLIP
   if (kernel->desc().delegate != nullptr) {
     return false;
   }
+#endif
   auto *subgraph_kernel = reinterpret_cast<kernel::SubGraphKernel *>(kernel);
   if (subgraph_kernel == nullptr) {
     return false;
diff --git a/mindspore/lite/src/lite_kernel_util.h b/mindspore/lite/src/lite_kernel_util.h
index 0a8bc2ddde4..08263043240 100644
--- a/mindspore/lite/src/lite_kernel_util.h
+++ b/mindspore/lite/src/lite_kernel_util.h
@@ -37,7 +37,7 @@ class LiteKernelUtil {
 
   static int SetInput(const LiteKernel &kernelMod, const std::vector<lite::Tensor *> &inputs);
 
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
   static bool IsSwitchCall(kernel::LiteKernel *kernel);
 #endif
 
diff --git a/mindspore/lite/src/lite_mindrt.cc b/mindspore/lite/src/lite_mindrt.cc
index 6c7bfffad4c..2d171a1951b 100644
--- a/mindspore/lite/src/lite_mindrt.cc
+++ b/mindspore/lite/src/lite_mindrt.cc
@@ -74,6 +74,28 @@ bool OfflineIsolated(const std::vector<kernel::LiteKernel *> &kernels, const ker
   return true;
 }
 
+void LiteOpActor::ReplaceNodeInTensor(kernel::LiteKernel *kernel, Tensor *old_tensor, Tensor *new_tensor) {
+  int ref_count = 0;
+#ifndef DELEGATE_CLIP
+  /* set op input for calculate */
+  if (kernel->desc().delegate != nullptr) {
+    ref_count++;
+  } else {
+#endif
+    for (auto in_node : reinterpret_cast<kernel::SubGraphKernel *>(kernel)->in_nodes()) {
+      for (size_t node_in_index = 0; node_in_index < in_node->in_tensors().size(); node_in_index++) {
+        if (old_tensor == in_node->in_tensors()[node_in_index]) {
+          in_node->set_in_tensor(new_tensor, node_in_index);
+          ref_count++;
+        }
+      }
+    }
+#ifndef DELEGATE_CLIP
+  }
+#endif
+  new_tensor->set_init_ref_count(ref_count);
+}
+
 void LiteOpActor::IsolateInputData(std::vector<std::shared_ptr<LiteOpActor>> *actors) {
   std::vector<kernel::LiteKernel *> kernels{};
   std::transform(actors->begin(), actors->end(), std::back_inserter(kernels),
@@ -86,7 +108,7 @@ void LiteOpActor::IsolateInputData(std::vector<std::shared_ptr<LiteOpActor>> *ac
       if (old_tensor->data_type() == kNumberTypeFloat16 || old_tensor->data_type() == kNumberTypeFloat32) {
         old_tensor->set_data_type(kernel_->desc().data_type);
       }
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
       if (old_tensor->data_type() == kObjectTypeTensorType) {
         auto old_tensorlist = reinterpret_cast<TensorList *>(old_tensor);
         if (old_tensorlist->tensors_data_type() == kNumberTypeFloat16 ||
@@ -116,22 +138,7 @@ void LiteOpActor::IsolateInputData(std::vector<std::shared_ptr<LiteOpActor>> *ac
       new_tensor->AddQuantParam(quant);
     }
     isolate_input_map_.insert(std::make_pair(new_tensor, old_tensor));
-
-    int ref_count = 0;
-    /* set op input for calculate */
-    if (kernel_->desc().delegate != nullptr) {
-      ref_count++;
-    } else {
-      for (auto in_node : reinterpret_cast<kernel::SubGraphKernel *>(kernel_)->in_nodes()) {
-        for (size_t node_in_index = 0; node_in_index < in_node->in_tensors().size(); node_in_index++) {
-          if (old_tensor == in_node->in_tensors()[node_in_index]) {
-            in_node->set_in_tensor(new_tensor, node_in_index);
-            ref_count++;
-          }
-        }
-      }
-    }
-    new_tensor->set_init_ref_count(ref_count);
+    ReplaceNodeInTensor(kernel_, old_tensor, new_tensor);
     /* set subgraph input for copy data */
     kernel_->set_in_tensor(new_tensor, i);
   }
@@ -192,12 +199,14 @@ int LiteOpActor::CompileArrowThroughOutputKernels() {
   return RET_OK;
 }
 
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
 int LiteOpActor::CompileArrowThroughPartialCall() {
+#ifndef DELEGATE_CLIP
   if (kernel_->desc().delegate != nullptr) {
     MS_LOG(INFO) << "kernel is delegate subgraph kernel.";
     return RET_OK;
   }
+#endif
   auto *subgraph_kernel = reinterpret_cast<kernel::SubGraphKernel *>(kernel_);
   if (subgraph_kernel == nullptr) {
     MS_LOG(INFO) << "kernel is not subgraph kernel, no partial call.";
@@ -236,7 +245,7 @@ int LiteOpActor::CompileArrowThroughPartialCall() {
 int LiteOpActor::CompileArrow() {
   int ret;
   output_data_arrows_.clear();
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
   ret = CompileArrowThroughPartialCall();
   if (ret != RET_OK) {
     output_data_arrows_.clear();
@@ -279,7 +288,7 @@ void LiteOpActor::MoveInputData(Tensor *dst_tensor, Tensor *src_tensor) {
     return;
   }
   MS_ASSERT(src_tensor->allocator() != nullptr);
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
   if (src_tensor->data_type() == kObjectTypeTensorType) {
     MoveTensorListInputData(reinterpret_cast<TensorList *>(dst_tensor), reinterpret_cast<TensorList *>(src_tensor));
   } else {
@@ -298,7 +307,7 @@ void LiteOpActor::SetInputData(Tensor *dst_tensor, Tensor *src_tensor) {
 
 int LiteOpActor::CastInputData(Tensor *dst, Tensor *src) {
   int ret = RET_OK;
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
   if (src->data_type() != kObjectTypeTensorType) {
     ret = CastTensorInputData(dst, src);
   } else {
@@ -316,7 +325,7 @@ bool LiteOpActor::NeedCastData(Tensor *dst_tensor, Tensor *src_tensor) {
       dst_tensor->data_type() != src_tensor->data_type()) {
     return true;
   }
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
   if (dst_tensor->data_type() == kObjectTypeTensorType && src_tensor->data_type() == kObjectTypeTensorType &&
       reinterpret_cast<TensorList *>(dst_tensor)->tensors_data_type() !=
         reinterpret_cast<TensorList *>(src_tensor)->tensors_data_type()) {
@@ -353,7 +362,7 @@ int LiteOpActor::CastTensorInputData(Tensor *dst, Tensor *src) {
   return RET_ERROR;
 }
 
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
 void LiteOpActor::MoveTensorListInputData(TensorList *dst_tensorlist, TensorList *src_tensorlist) {
   MS_ASSERT(src_tensorlist != nullptr);
   MS_ASSERT(dst_tensorlist != nullptr);
@@ -671,7 +680,7 @@ void LiteOpActor::SetInputShape() {
     MS_LOG(DEBUG) << "this->kernel_->name(): " << this->kernel_->name();
 
     if (input_tensor->data_type() == kObjectTypeTensorType) {
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
       auto input_tensorlist = reinterpret_cast<TensorList *>(input_tensor);
       auto input_data_tensorlist = reinterpret_cast<TensorList *>(inputs_data_[i]);
       input_tensorlist->FreeTensorListData();
@@ -755,7 +764,7 @@ std::vector<std::shared_ptr<LiteOpActor>> CreateOpActor(const std::vector<kernel
   for (auto &kernel : kernels) {
     /* make subgraph name (actor name) unique */
     kernel->set_name(kernel->name() + "_" + to_string(actor_count++));
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
     if ((kernel::LiteKernelUtil::IsSwitchCall(kernel))) {
       auto switch_actor = std::make_shared<LiteSwitchOpActor>(kernel);
       if (switch_actor == nullptr) {
@@ -777,7 +786,7 @@ std::vector<std::shared_ptr<LiteOpActor>> CreateOpActor(const std::vector<kernel
       actor->set_thread_pool(thread_pool);
       subgraph_name_AID_map[kernel] = actor->GetAID();
       actors.push_back(actor);
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
     }
 #endif
   }
diff --git a/mindspore/lite/src/lite_mindrt.h b/mindspore/lite/src/lite_mindrt.h
index 3111015153f..256f2ffe7c0 100644
--- a/mindspore/lite/src/lite_mindrt.h
+++ b/mindspore/lite/src/lite_mindrt.h
@@ -27,7 +27,9 @@
 #include "async/future.h"
 #include "src/sub_graph_kernel.h"
 #include "src/cpu_info.h"
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
 #include "src/tensorlist.h"
+#endif
 
 namespace mindspore::lite {
 
@@ -93,6 +95,7 @@ class LiteOpActor : public OpActor<lite::Tensor> {
   std::unordered_map<Tensor *, Tensor *> isolate_input_map_{}; /* <calculate-tensor,  src-input-tensor> */
 
  private:
+  void ReplaceNodeInTensor(kernel::LiteKernel *kernel, Tensor *old_tensor, Tensor *new_tensor);
   void IsolateInputData(std::vector<std::shared_ptr<LiteOpActor>> *actors);
   void MoveTensorInputData(Tensor *dst_tensor, Tensor *src_tensor);
   void MoveInputData(Tensor *dst_tensor, Tensor *src_tensor);
@@ -100,7 +103,7 @@ class LiteOpActor : public OpActor<lite::Tensor> {
   int CastInputData(Tensor *dst_tensor, Tensor *src_tensor);
   bool NeedCastData(Tensor *dst_tensor, Tensor *src_tensor);
   int CastTensorInputData(Tensor *dst_tensor, Tensor *src_tensor);
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
   void MoveTensorListInputData(TensorList *dst_tensor, TensorList *src_tensor);
   int CastTensorListInputData(TensorList *dst_tensor, TensorList *src_tensor);
 #endif
@@ -113,7 +116,7 @@ class LiteOpActor : public OpActor<lite::Tensor> {
 #endif
 };
 
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
 class LiteSwitchOpActor : public LiteOpActor {
  public:
   explicit LiteSwitchOpActor(kernel::LiteKernel *kernel) : LiteOpActor(kernel) {}
diff --git a/mindspore/lite/src/lite_model.cc b/mindspore/lite/src/lite_model.cc
index 3f28ebf6186..b418036bb09 100644
--- a/mindspore/lite/src/lite_model.cc
+++ b/mindspore/lite/src/lite_model.cc
@@ -37,7 +37,10 @@ int LiteModel::ConvertAttrs(Model::Node *node, std::vector<schema::Tensor *> *ds
     return RET_ERROR;
   }
   auto primitive = node->primitive_;
-  MS_ASSERT(primitive != nullptr);
+  if (primitive == nullptr) {
+    MS_LOG(ERROR) << "primitive is nullptr.";
+    return RET_ERROR;
+  }
   auto prim = reinterpret_cast<const schema::v0::Primitive *>(primitive);
   int primitive_type = prim->value_type();
   auto creator = CompatRegistry::GetInstance()->GetTransferAttrFunc(SCHEMA_VERSION::SCHEMA_V0, primitive_type);
@@ -54,8 +57,7 @@ int LiteModel::ConvertAttrs(Model::Node *node, std::vector<schema::Tensor *> *ds
 }
 
 int LiteModel::ConvertAttrToTensors() {
-  int schema_version = VersionManager::GetInstance()->GetSchemaVersion();
-  if (schema_version != SCHEMA_VERSION::SCHEMA_V0) {
+  if (schema_version_ != SCHEMA_VERSION::SCHEMA_V0) {
     MS_LOG(DEBUG) << "no need to convert attr to tensor.";
     return RET_OK;
   }
@@ -143,7 +145,7 @@ void LiteModel::Destroy() {
 
 int LiteModel::ConvertSubGraph(const schema::SubGraph &sub_graph) {
   if (sub_graph.name() == nullptr || sub_graph.inputIndices() == nullptr || sub_graph.outputIndices() == nullptr ||
-      sub_graph.nodeIndices() == nullptr || sub_graph.tensorIndices() == nullptr) {
+      sub_graph.tensorIndices() == nullptr) {
     MS_LOG(ERROR) << "sub_graph is invalid";
     return RET_ERROR;
   }
@@ -163,9 +165,11 @@ int LiteModel::ConvertSubGraph(const schema::SubGraph &sub_graph) {
   for (uint32_t i = 0; i < out_count; ++i) {
     subgraph->output_indices_.push_back(sub_graph.outputIndices()->Get(i));
   }
-  auto node_count = sub_graph.nodeIndices()->size();
-  for (uint32_t i = 0; i < node_count; ++i) {
-    subgraph->node_indices_.push_back(sub_graph.nodeIndices()->Get(i));
+  if (sub_graph.nodeIndices() != nullptr) {
+    auto node_count = sub_graph.nodeIndices()->size();
+    for (uint32_t i = 0; i < node_count; ++i) {
+      subgraph->node_indices_.push_back(sub_graph.nodeIndices()->Get(i));
+    }
   }
   auto tensor_count = sub_graph.tensorIndices()->size();
   for (uint32_t i = 0; i < tensor_count; ++i) {
@@ -211,8 +215,8 @@ int LiteModel::NodeVerify() const {
       return RET_ERROR;
     }
 
-    if (IsPartialNode(node->primitive_)) {
-      auto subgraph_index = GetPartialGraphIndex(node->primitive_);
+    if (IsPartialNode(node->primitive_, schema_version_)) {
+      auto subgraph_index = GetPartialGraphIndex(node->primitive_, schema_version_);
       if (static_cast<uint32_t>(subgraph_index) >= subgraph_size) {
         MS_LOG(ERROR) << "subgraph index：" << subgraph_index << " is beyond subgraph_size: " << subgraph_size;
         return RET_ERROR;
@@ -226,8 +230,7 @@ int LiteModel::SubGraphVerify() const {
   auto tensor_size = this->all_tensors_.size();
   auto node_size = this->all_nodes_.size();
 
-  if (sub_graphs_[0]->input_indices_.size() == 0 || GetGraphInputNodes(this).size() == 0 ||
-      sub_graphs_[0]->output_indices_.size() == 0 || GetGraphOutputNodes(this).size() == 0) {
+  if (sub_graphs_[0]->input_indices_.size() == 0 || sub_graphs_[0]->output_indices_.size() == 0) {
     MS_LOG(ERROR) << "The model has invalid input and output, please check";
     return RET_ERROR;
   }
@@ -290,12 +293,11 @@ bool LiteModel::ModelVerify() const {
 
 const void *LiteModel::GetMetaGraphByVerison() {
   MS_ASSERT(this->buf != nullptr);
-  auto schema_version = VersionManager::GetInstance()->GetSchemaVersion();
-  if (schema_version == SCHEMA_VERSION::SCHEMA_CUR) {
+  if (schema_version_ == SCHEMA_VERSION::SCHEMA_CUR) {
     return reinterpret_cast<const void *>(schema::GetMetaGraph(this->buf));
   }
 #ifdef ENABLE_V0
-  if (schema_version == SCHEMA_VERSION::SCHEMA_V0) {
+  if (schema_version_ == SCHEMA_VERSION::SCHEMA_V0) {
     return reinterpret_cast<const void *>(schema::v0::GetMetaGraph(buf));
   }
 #endif
@@ -304,12 +306,11 @@ const void *LiteModel::GetMetaGraphByVerison() {
 
 int LiteModel::GenerateModelByVersion(const void *meta_graph) {
   MS_ASSERT(meta_graph != nullptr);
-  auto schema_version = VersionManager::GetInstance()->GetSchemaVersion();
   int status = RET_ERROR;
 #ifdef ENABLE_MODEL_OBF
   DeObfuscator *model_deobf = nullptr;
 #endif
-  if (schema_version == SCHEMA_VERSION::SCHEMA_CUR) {
+  if (schema_version_ == SCHEMA_VERSION::SCHEMA_CUR) {
 #ifdef ENABLE_MODEL_OBF
     if (IsMetaGraphObfuscated<schema::MetaGraph>(*reinterpret_cast<const schema::MetaGraph *>(meta_graph))) {
       model_deobf =
@@ -323,7 +324,7 @@ int LiteModel::GenerateModelByVersion(const void *meta_graph) {
     status = GenerateModel<schema::MetaGraph, schema::CNode>(*reinterpret_cast<const schema::MetaGraph *>(meta_graph));
   }
 #ifdef ENABLE_V0
-  if (schema_version == SCHEMA_VERSION::SCHEMA_V0) {
+  if (schema_version_ == SCHEMA_VERSION::SCHEMA_V0) {
     status = GenerateModel<schema::v0::MetaGraph, schema::v0::CNode>(
       *reinterpret_cast<const schema::v0::MetaGraph *>(meta_graph));
   }
@@ -348,12 +349,11 @@ int LiteModel::ConstructModel() {
     return RET_NULL_PTR;
   }
   flatbuffers::Verifier verify((const uint8_t *)this->buf, this->buf_size_);
-  int schema_version = VersionVerify(&verify);
-  if (schema_version == SCHEMA_INVALID) {
+  schema_version_ = VersionVerify(&verify);
+  if (schema_version_ == SCHEMA_INVALID) {
     MS_LOG(ERROR) << "The buffer is invalid and fail to create graph.";
     return RET_ERROR;
   }
-  VersionManager::GetInstance()->SetSchemaVersion(schema_version);
   const void *meta_graph = GetMetaGraphByVerison();
   if (meta_graph == nullptr) {
     MS_LOG(ERROR) << "meta_graph is nullptr!";
diff --git a/mindspore/lite/src/lite_model.h b/mindspore/lite/src/lite_model.h
index b1a9497ca33..b0b44550f5d 100644
--- a/mindspore/lite/src/lite_model.h
+++ b/mindspore/lite/src/lite_model.h
@@ -51,6 +51,8 @@ class LiteModel : public Model {
 
   void set_keep_model_buf(bool keep) { this->keep_model_buf_ = keep; }
 
+  int GetSchemaVersion() const { return schema_version_; }
+
  private:
 #ifdef ENABLE_V0
   int ConvertAttrs(Model::Node *node, std::vector<schema::Tensor *> *dst_tensor);
@@ -100,12 +102,11 @@ class LiteModel : public Model {
       node->primitive_ = c_node->primitive();
 #endif
       node->quant_type_ = c_node->quantType();
-      auto schema_version = VersionManager::GetInstance()->GetSchemaVersion();
-      if (schema_version == SCHEMA_VERSION::SCHEMA_CUR) {
+      if (schema_version_ == SCHEMA_VERSION::SCHEMA_CUR) {
         SetNodeDeviceType(node, *c_node);
       }
 #ifdef ENABLE_V0
-      if (schema_version == SCHEMA_VERSION::SCHEMA_V0) {
+      if (schema_version_ == SCHEMA_VERSION::SCHEMA_V0) {
         SetNodeDeviceType(node, *c_node);
       }
 #endif
@@ -206,6 +207,12 @@ class LiteModel : public Model {
       return RET_ERROR;
     }
 
+    if (meta_graph.inputIndex() == nullptr || meta_graph.outputIndex() == nullptr ||
+        meta_graph.allTensors() == nullptr) {
+      MS_LOG(ERROR) << "meta_graph is invalid, please check your model file.";
+      return RET_ERROR;
+    }
+
     // converterInputOutput
     auto in_count = meta_graph.inputIndex()->size();
     for (uint32_t i = 0; i < in_count; ++i) {
@@ -269,6 +276,7 @@ class LiteModel : public Model {
  protected:
   std::vector<char *> attr_tensor_bufs_;
   bool keep_model_buf_ = false;
+  int schema_version_ = SCHEMA_VERSION::SCHEMA_CUR;
 };
 
 Model *ImportFromBuffer(const char *model_buf, size_t size, bool take_buf);
diff --git a/mindspore/lite/src/lite_session.cc b/mindspore/lite/src/lite_session.cc
index f026ffedd20..9e2c3298d11 100644
--- a/mindspore/lite/src/lite_session.cc
+++ b/mindspore/lite/src/lite_session.cc
@@ -43,51 +43,52 @@
 #if GPU_TENSORRT
 #include "src/delegate/tensorrt/tensorrt_delegate.h"
 #endif
-
+#ifndef WEIGHT_DECODE_CLIP
+#include "tools/converter/quantizer/fse_decoder.h"
+#endif
 namespace mindspore {
 namespace lite {
 namespace {
-int DecompressTensor(const schema::Tensor &src_tensor, Tensor *dst_tensor) {
-  MS_ASSERT(dst_tensor != nullptr);
-  if (src_tensor.weightQunatCompressType() == schema::WeightQunatCompressType_INDEXING) {
-    return IndexingDecompress(src_tensor, dst_tensor);
-  } else if (src_tensor.weightQunatCompressType() == schema::WeightQunatCompressType_SPARSE) {
-    return SparseDecompress(src_tensor, dst_tensor);
+bool NeedBitUppackCheck(const schema::Tensor &src_tensor) {
+  if (src_tensor.enableHuffmanCode()) {
+    return true;
   }
-
   bool need_bit_unpack = src_tensor.quantParams() != nullptr && src_tensor.quantParams()->size() > 0 &&
                          src_tensor.quantParams()->Get(0) != nullptr && src_tensor.quantParams()->Get(0)->inited();
   if (need_bit_unpack) {
     auto num_bits = src_tensor.quantParams()->Get(0)->numBits();
-    need_bit_unpack = ((num_bits >= WeightDecoder::kBitNum1 && num_bits < WeightDecoder::kBitNum8) ||
-                       (num_bits > WeightDecoder::kBitNum8 && num_bits < WeightDecoder::kBitNum16));
+    need_bit_unpack = ((num_bits >= kBitNum1 && num_bits < kBitNum8) || (num_bits > kBitNum8 && num_bits < kBitNum16));
   }
-  if (!src_tensor.enableHuffmanCode() && !need_bit_unpack) {
-    return RET_NO_CHANGE;
+
+  return need_bit_unpack;
+}
+
+int DecompressTensor(const schema::Tensor &src_tensor, Tensor *dst_tensor) {
+  MS_ASSERT(dst_tensor != nullptr);
+#ifndef WEIGHT_DECODE_CLIP
+  if (src_tensor.weightQunatCompressType() == schema::WeightQunatCompressType_FSE) {
+    return quant::FSEDecoder::DeCompress(src_tensor, dst_tensor);
+  } else if (src_tensor.weightQunatCompressType() == schema::WeightQunatCompressType_INDEXING) {
+    return IndexingDecompress(src_tensor, dst_tensor);
+  } else if (src_tensor.weightQunatCompressType() == schema::WeightQunatCompressType_SPARSE) {
+    return SparseDecompress(src_tensor, dst_tensor);
   }
-  // huffman code and bit pack are not assumed to be performed at same time
-  STATUS ret = RET_ERROR;
-  if (src_tensor.enableHuffmanCode()) {
-#ifdef ENABLE_HUFFMAN_DECODE
-    ret = WeightDecoder::DecodeHuffmanCode(src_tensor, dst_tensor);
-    if (ret != RET_OK && ret != RET_NO_CHANGE) {
-      MS_LOG(ERROR) << "Decode huffman code failed: " << ret;
-      return ret;
-    }
 #else
-    MS_LOG(ERROR) << unsupport_huffman_decode_log;
+  if (src_tensor.weightQunatCompressType() != schema::WeightQunatCompressType_NONE) {
+    MS_LOG(ERROR) << unsupport_weight_decode_log;
+    return RET_ERROR;
+  }
+#endif
+  if (!NeedBitUppackCheck(src_tensor)) {
+    return RET_NO_CHANGE;
+  } else {
+#ifndef WEIGHT_DECODE_CLIP
+    return WeightDecoder::UnPack(src_tensor, dst_tensor);
+#else
+    MS_LOG(ERROR) << unsupport_weight_decode_log;
     return RET_ERROR;
 #endif
-  } else if (need_bit_unpack) {
-    ret = WeightDecoder::UnPackToInt(src_tensor, dst_tensor);
-    if (ret != RET_OK && ret != RET_NO_CHANGE) {
-      MS_LOG(ERROR) << "Unpack to int8 failed: " << ret;
-      return ret;
-    }
-  } else {
-    ret = RET_OK;
   }
-  return ret;
 }
 }  // namespace
 
@@ -128,14 +129,14 @@ int LiteSession::ConvertTensorsData(const lite::Model *model, size_t tensor_inde
   MS_ASSERT(dst_tensor != nullptr);
   if (src_tensor->data() != nullptr && src_tensor->data()->size() > 0) {
     if (dst_tensor->data_type() == kObjectTypeTensorType) {
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
       auto tensor_list = reinterpret_cast<TensorList *>(dst_tensor);
       if (tensor_list->Decode(reinterpret_cast<const int *>(src_tensor->data()->data())) != RET_OK) {
         MS_LOG(ERROR) << "Decode tensorlist data failed";
         return RET_ERROR;
       }
 #else
-      MS_LOG(ERROR) << unsupport_control_tensorlist_log;
+      MS_LOG(ERROR) << unsupport_controlflow_tensorlist_log;
       return RET_NOT_SUPPORT;
 #endif
     } else {
@@ -169,7 +170,7 @@ lite::Tensor *LiteSession::ConvertTensor(const schema::Tensor &src_tensor) {
   }
   lite::Tensor *dst_tensor = nullptr;
   if (TypeId(src_tensor.dataType()) == kObjectTypeTensorType) {
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
     dst_tensor = new (std::nothrow) TensorList(shape, std::vector<int>(), src_category);
     // set tensor list datatype
     auto tensor_list = reinterpret_cast<TensorList *>(dst_tensor);
@@ -178,7 +179,7 @@ lite::Tensor *LiteSession::ConvertTensor(const schema::Tensor &src_tensor) {
       tensor_list->set_tensors_data_type(tensor_data_type);
     }
 #else
-    MS_LOG(ERROR) << unsupport_control_tensorlist_log;
+    MS_LOG(ERROR) << unsupport_controlflow_tensorlist_log;
 #endif
   } else {
     dst_tensor = new (std::nothrow)
@@ -418,10 +419,11 @@ void LiteSession::IsolateOutputTensor() {
           subgraph->set_out_tensor(new_tensor, i);
         }
       }
-
+#ifndef DELEGATE_CLIP
       if (subgraph->desc().delegate != nullptr) {
         continue;
       }
+#endif
       /* node input and output */
       auto nodes = reinterpret_cast<kernel::SubGraphKernel *>(subgraph)->nodes();
       for (size_t i = 0; i < nodes.size(); i++) {
@@ -578,14 +580,18 @@ int LiteSession::PrepareKernels(Model *model, bool use_mindrt_run) {
   // find in_kernels and out_kernels for subgraphs
   for (auto kernel : this->kernels_) {
     kernel->FindInoutKernels(this->kernels_);
+#ifndef DELEGATE_CLIP
     if (kernel->desc().delegate != nullptr) {
       all_kernels.push_back(kernel);
     } else {
+#endif
       auto sub_graph = reinterpret_cast<kernel::SubGraphKernel *>(kernel);
       MS_ASSERT(sub_graph != nullptr);
       auto kernel_in_subgraph = sub_graph->nodes();
       all_kernels.insert(all_kernels.end(), kernel_in_subgraph.begin(), kernel_in_subgraph.end());
+#ifndef DELEGATE_CLIP
     }
+#endif
   }
 
   if (!use_mindrt_run) {
@@ -597,9 +603,11 @@ int LiteSession::PrepareKernels(Model *model, bool use_mindrt_run) {
 
   // init init_ref_count for subgraphs and kernels
   for (auto *kernel : this->kernels_) {
+#ifndef DELEGATE_CLIP
     if (kernel->desc().delegate != nullptr) {
       continue;
     }
+#endif
     if (IsIsolatedSubGraph(kernel)) {
       static_cast<kernel::SubGraphKernel *>(kernel)->InitInputTensorInitRefCount();
     }
@@ -642,7 +650,7 @@ int LiteSession::RunGraph(const KernelCallBack &before, const KernelCallBack &af
   return ret;
 }
 
-int LiteSession::Init(const Context *context) {
+int LiteSession::Init(InnerContext *context) {
   bool expected = false;
   if (!is_running_.compare_exchange_strong(expected, true)) {
     MS_LOG(ERROR) << "Not support multi-threading";
@@ -653,12 +661,8 @@ int LiteSession::Init(const Context *context) {
     is_running_.store(false);
     return RET_NULL_PTR;
   }
-  this->context_ = new (std::nothrow) InnerContext(context);
-  if (this->context_ == nullptr) {
-    MS_LOG(ERROR) << "New Context failed";
-    is_running_.store(false);
-    return RET_MEMORY_FAILED;
-  }
+  this->context_ = context;
+
   auto ret = this->context_->Init();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Init Context failed";
@@ -692,6 +696,7 @@ int LiteSession::Init(const Context *context) {
     }
   }
 #endif
+#ifndef DELEGATE_CLIP
   if (delegate_ != nullptr) {
     auto delegate_ret = delegate_->Init();
     if (delegate_ret == RET_NOT_SUPPORT) {
@@ -703,6 +708,7 @@ int LiteSession::Init(const Context *context) {
       return RET_ERROR;
     }
   }
+#endif
   ret = InitGPURuntime();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Init GPU runtime failed.";
@@ -731,7 +737,9 @@ LiteSession::~LiteSession() {
     kernel = nullptr;
   }
   for (auto tensor : tensors_) {
-    MS_ASSERT(tensor != nullptr);
+    if (tensor == nullptr) {
+      continue;
+    }
     // Data of const tensor which doesn't own data will not freed.
     // Such as const data from meta_graph which will be freed when freeing meta_graph.
     if (tensor->IsConst() && !tensor->own_data()) {
@@ -846,9 +854,11 @@ int LiteSession::ReSizeKernels(const std::vector<kernel::LiteKernel *> &kernels)
       return RET_ERROR;
     }
     auto ret = RET_OK;
+#ifndef DELEGATE_CLIP
     if (kernel->desc().delegate != nullptr) {
       ret = kernel->ReSize();
     } else {
+#endif
       if (kernel->subgraph_type() == kernel::kGpuSubGraph) {
 #if GPU_OPENCL
         auto sub_graph = reinterpret_cast<kernel::OpenCLSubGraph *>(kernel);
@@ -858,7 +868,9 @@ int LiteSession::ReSizeKernels(const std::vector<kernel::LiteKernel *> &kernels)
         auto sub_graph = reinterpret_cast<kernel::SubGraphKernel *>(kernel);
         ret = sub_graph->ReSize();
       }
+#ifndef DELEGATE_CLIP
     }
+#endif
     if (ret == RET_INFER_INVALID) {
       MS_LOG(INFO) << "InferShape is interrupted";
       continue;
@@ -948,7 +960,10 @@ session::LiteSession *session::LiteSession::CreateSession(const lite::Context *c
     MS_LOG(ERROR) << "create session failed";
     return nullptr;
   }
-  auto ret = session->Init(context);
+
+  mindspore::lite::InnerContext *inner_context = new (std::nothrow) mindspore::lite::InnerContext(context);
+
+  auto ret = session->Init(inner_context);
   if (ret != mindspore::lite::RET_OK) {
     MS_LOG(ERROR) << "init session failed";
     delete session;
@@ -964,48 +979,67 @@ session::LiteSession *session::LiteSession::CreateSession(const char *model_buf,
     MS_LOG(ERROR) << "Create session failed";
     return nullptr;
   }
-  auto *model = lite::ImportFromBuffer(model_buf, size, true);
-  if (model == nullptr) {
-    MS_LOG(ERROR) << "Import model failed";
+  auto ret = lite::LiteSession::CreateSessionByBuf(model_buf, size, session);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init session failed";
     delete session;
     return nullptr;
   }
-  auto ret = session->CompileGraph(model);
-  if (ret != lite::RET_OK) {
-    MS_LOG(ERROR) << "Compile model failed";
-    delete model;
-    delete session;
-    return nullptr;
-  }
-  model->buf = nullptr;
-  (reinterpret_cast<lite::LiteSession *>(session))->set_model(model);
   return session;
 }
 
 session::LiteSession *lite::LiteSession::CreateSession(const std::string &model_path, const lite::Context *context) {
-  size_t model_size;
-  auto model_buf = lite::ReadFile(model_path.c_str(), &model_size);
-  if (model_buf == nullptr) {
-    MS_LOG(ERROR) << "Read model file failed";
-    return nullptr;
-  }
   auto *session = session::LiteSession::CreateSession(context);
   if (session == nullptr) {
     MS_LOG(ERROR) << "Create session failed";
     return nullptr;
   }
+  auto ret = lite::LiteSession::CreateSessionByPath(model_path, session);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init session failed";
+    delete session;
+    return nullptr;
+  }
+  return session;
+}
+
+int lite::LiteSession::CreateSessionByBuf(const char *model_buf, size_t size, session::LiteSession *session) {
+  auto *model = lite::ImportFromBuffer(model_buf, size, true);
+  if (model == nullptr) {
+    MS_LOG(ERROR) << "Import model failed";
+    return RET_ERROR;
+  }
+  auto ret = session->CompileGraph(model);
+  if (ret != lite::RET_OK) {
+    MS_LOG(ERROR) << "Compile model failed";
+    delete model;
+    return RET_ERROR;
+  }
+  model->buf = nullptr;
+  (reinterpret_cast<lite::LiteSession *>(session))->set_model(model);
+  return RET_OK;
+}
+
+int lite::LiteSession::CreateSessionByPath(const std::string &model_path, session::LiteSession *session) {
+  size_t model_size;
+  auto model_buf = lite::ReadFile(model_path.c_str(), &model_size);
+  if (model_buf == nullptr) {
+    MS_LOG(ERROR) << "Read model file failed";
+    return RET_ERROR;
+  }
   auto *model = lite::ImportFromBuffer(model_buf, model_size, true);
   if (model == nullptr) {
     MS_LOG(ERROR) << "Import model failed";
-    return nullptr;
+    return RET_ERROR;
   }
   (reinterpret_cast<lite::LiteModel *>(model))->set_keep_model_buf(true);
   auto ret = session->CompileGraph(model);
   if (ret != lite::RET_OK) {
     MS_LOG(ERROR) << "Compile model failed";
-    return nullptr;
+    return RET_ERROR;
   }
   (reinterpret_cast<lite::LiteSession *>(session))->set_model(model);
-  return session;
+  return RET_OK;
 }
+
 }  // namespace mindspore
diff --git a/mindspore/lite/src/lite_session.h b/mindspore/lite/src/lite_session.h
index 63f93b58a15..55892200954 100644
--- a/mindspore/lite/src/lite_session.h
+++ b/mindspore/lite/src/lite_session.h
@@ -30,8 +30,12 @@
 #include "schema/model_generated.h"
 #include "src/executor.h"
 #include "src/tensor.h"
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
 #include "src/tensorlist.h"
+#endif
+#ifndef DELEGATE_CLIP
 #include "include/api/delegate.h"
+#endif
 #if GPU_OPENCL
 #include "src/runtime/gpu/opencl/opencl_runtime.h"
 #endif
@@ -47,7 +51,10 @@ class LiteSession : public session::LiteSession {
 
   static session::LiteSession *CreateSession(const std::string &model_path, const lite::Context *context);
 
-  virtual int Init(const Context *context);
+  static int CreateSessionByBuf(const char *model_buf, size_t size, session::LiteSession *session);
+  static int CreateSessionByPath(const std::string &model_path, session::LiteSession *session);
+
+  virtual int Init(InnerContext *context);
 
   void BindThread(bool if_bind) override;
 
diff --git a/mindspore/lite/src/ops/CMakeLists.txt b/mindspore/lite/src/ops/CMakeLists.txt
index 05b1a731ac9..c288bd67564 100644
--- a/mindspore/lite/src/ops/CMakeLists.txt
+++ b/mindspore/lite/src/ops/CMakeLists.txt
@@ -13,7 +13,7 @@ if(MSLITE_STRING_KERNEL)
             ${OPS_SRC_STRING}
             )
 endif()
-if(MSLITE_CONTROL_TENSORLIST)
+if(MSLITE_CONTROLFLOW_TENSORLIST)
     file(GLOB OPS_SRC_CONTROL_TENSORLIST
             ${CMAKE_CURRENT_SOURCE_DIR}/populate/control/*.cc
             )
@@ -22,7 +22,7 @@ if(MSLITE_CONTROL_TENSORLIST)
             ${OPS_SRC_CONTROL_TENSORLIST}
             )
 endif()
-if(ENABLE_V0)
+if(MSLITE_ENABLE_V0)
     file(GLOB_RECURSE COMPAT_SRC ${CMAKE_CURRENT_SOURCE_DIR}/compat/*.cc)
     file(GLOB OPS_SRC_V0 ${CMAKE_CURRENT_SOURCE_DIR}/populate/v0/*.cc)
     if(MSLITE_STRING_KERNEL)
@@ -34,7 +34,7 @@ if(ENABLE_V0)
                 ${OPS_SRC_STRING_V0}
                 )
     endif()
-    if(MSLITE_CONTROL_TENSORLIST)
+    if(MSLITE_CONTROLFLOW_TENSORLIST)
         file(GLOB OPS_SRC_CONTROL_TENSORLIST_V0
                 ${CMAKE_CURRENT_SOURCE_DIR}/populate/v0/control/*.cc
                 )
diff --git a/mindspore/lite/src/ops/populate/adder_populate.cc b/mindspore/lite/src/ops/populate/adder_populate.cc
index 5b41e4f5ae7..894434ef590 100644
--- a/mindspore/lite/src/ops/populate/adder_populate.cc
+++ b/mindspore/lite/src/ops/populate/adder_populate.cc
@@ -42,7 +42,13 @@ OpParameter *PopulateAdderParameter(const void *prim) {
   auto pad_list = value->pad_list();
   auto dilation = value->dilation();
   if (kernel_size == nullptr || stride == nullptr || pad_list == nullptr || dilation == nullptr) {
-    MS_LOG(ERROR) << "nullptr";
+    MS_LOG(ERROR) << "exist attr is nullptr";
+    free(param);
+    return nullptr;
+  }
+  if (kernel_size->size() < kMinShapeSizeTwo || stride->size() < kMinShapeSizeTwo ||
+      pad_list->size() < kMinShapeSizeFour || dilation->size() < kMinShapeSizeTwo) {
+    MS_LOG(ERROR) << "exist attr size is invalid.";
     free(param);
     return nullptr;
   }
diff --git a/mindspore/lite/src/ops/populate/constant_of_shape_populate.cc b/mindspore/lite/src/ops/populate/constant_of_shape_populate.cc
index 25e721dee69..097b0780395 100644
--- a/mindspore/lite/src/ops/populate/constant_of_shape_populate.cc
+++ b/mindspore/lite/src/ops/populate/constant_of_shape_populate.cc
@@ -47,6 +47,8 @@ OpParameter *PopulateConstantOfShapeParameter(const void *prim) {
   param->data_type_ = static_cast<int>(value->data_type());
   if (val.empty() || val.size() > 1) {
     MS_LOG(ERROR) << "The value of constant of shape is empty or more than 1.";
+    free(param);
+    return nullptr;
   } else {
     switch (param->data_type_) {
       case kNumberTypeFloat32:
diff --git a/mindspore/lite/src/ops/populate/custom_extract_features_populate.cc b/mindspore/lite/src/ops/populate/custom_extract_features_populate.cc
deleted file mode 100644
index b6ac687730a..00000000000
--- a/mindspore/lite/src/ops/populate/custom_extract_features_populate.cc
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "src/ops/populate/populate_register.h"
-using mindspore::schema::PrimitiveType_CustomExtractFeatures;
-
-namespace mindspore {
-namespace lite {
-OpParameter *PopulateExtractFeaturesParameter(const void *prim) {
-  auto primitive = static_cast<const schema::Primitive *>(prim);
-  MS_ASSERT(primitive != nullptr);
-
-  auto *param = reinterpret_cast<OpParameter *>(malloc(sizeof(OpParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "new OpParameter failed.";
-    return nullptr;
-  }
-  memset(param, 0, sizeof(OpParameter));
-
-  param->type_ = primitive->value_type();
-  return reinterpret_cast<OpParameter *>(param);
-}
-
-REG_POPULATE(PrimitiveType_CustomExtractFeatures, PopulateExtractFeaturesParameter, SCHEMA_CUR);
-}  // namespace lite
-}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/populate/custom_normalize_populate.cc b/mindspore/lite/src/ops/populate/custom_normalize_populate.cc
deleted file mode 100644
index 4e24d8a6dfe..00000000000
--- a/mindspore/lite/src/ops/populate/custom_normalize_populate.cc
+++ /dev/null
@@ -1,37 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "src/ops/populate/populate_register.h"
-using mindspore::schema::PrimitiveType_CustomNormalize;
-
-namespace mindspore {
-namespace lite {
-OpParameter *PopulateCustomNormalizeParameter(const void *prim) {
-  auto primitive = static_cast<const schema::Primitive *>(prim);
-  MS_ASSERT(primitive != nullptr);
-
-  auto *param = reinterpret_cast<OpParameter *>(malloc(sizeof(OpParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "new OpParameter failed.";
-    return nullptr;
-  }
-  memset(param, 0, sizeof(OpParameter));
-
-  param->type_ = primitive->value_type();
-  return reinterpret_cast<OpParameter *>(param);
-}
-REG_POPULATE(PrimitiveType_CustomNormalize, PopulateCustomNormalizeParameter, SCHEMA_CUR);
-}  // namespace lite
-}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/populate/custom_predict_populate.cc b/mindspore/lite/src/ops/populate/custom_predict_populate.cc
deleted file mode 100644
index 5065dbabe57..00000000000
--- a/mindspore/lite/src/ops/populate/custom_predict_populate.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "src/ops/populate/populate_register.h"
-#include "nnacl/predict_parameter.h"
-using mindspore::schema::PrimitiveType_CustomPredict;
-
-namespace mindspore {
-namespace lite {
-OpParameter *PopulateCustomPredictParameter(const void *prim) {
-  auto primitive = static_cast<const schema::Primitive *>(prim);
-  MS_ASSERT(primitive != nullptr);
-  auto value = primitive->value_as_CustomPredict();
-  if (value == nullptr) {
-    MS_LOG(ERROR) << "value is nullptr";
-    return nullptr;
-  }
-
-  auto *param = reinterpret_cast<PredictParameter *>(malloc(sizeof(PredictParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "malloc param failed.";
-    return nullptr;
-  }
-  memset(param, 0, sizeof(PredictParameter));
-
-  param->op_parameter_.type_ = primitive->value_type();
-  param->output_num = value->output_num();
-  param->weight_threshold = value->weight_threshold();
-  return reinterpret_cast<OpParameter *>(param);
-}
-REG_POPULATE(PrimitiveType_CustomPredict, PopulateCustomPredictParameter, SCHEMA_CUR);
-}  // namespace lite
-}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/populate/hashtable_lookup_populate.cc b/mindspore/lite/src/ops/populate/hashtable_lookup_populate.cc
deleted file mode 100644
index 006a0825091..00000000000
--- a/mindspore/lite/src/ops/populate/hashtable_lookup_populate.cc
+++ /dev/null
@@ -1,37 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "src/ops/populate/populate_register.h"
-using mindspore::schema::PrimitiveType_HashtableLookup;
-
-namespace mindspore {
-namespace lite {
-OpParameter *PopulateHashtableLookupParameter(const void *prim) {
-  auto primitive = static_cast<const schema::Primitive *>(prim);
-  MS_ASSERT(primitive != nullptr);
-
-  auto *param = reinterpret_cast<OpParameter *>(malloc(sizeof(OpParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "new OpParameter failed.";
-    return nullptr;
-  }
-  memset(param, 0, sizeof(OpParameter));
-
-  param->type_ = primitive->value_type();
-  return param;
-}
-REG_POPULATE(PrimitiveType_HashtableLookup, PopulateHashtableLookupParameter, SCHEMA_CUR);
-}  // namespace lite
-}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/populate/lsh_projection_populate.cc b/mindspore/lite/src/ops/populate/lsh_projection_populate.cc
deleted file mode 100644
index 4b465a4e695..00000000000
--- a/mindspore/lite/src/ops/populate/lsh_projection_populate.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "nnacl/lsh_projection_parameter.h"
-#include "src/ops/populate/populate_register.h"
-using mindspore::schema::PrimitiveType_LshProjection;
-
-namespace mindspore {
-namespace lite {
-OpParameter *PopulateLshProjectionParameter(const void *prim) {
-  auto primitive = static_cast<const schema::Primitive *>(prim);
-  MS_ASSERT(primitive != nullptr);
-  auto value = primitive->value_as_LshProjection();
-  if (value == nullptr) {
-    MS_LOG(ERROR) << "value is nullptr";
-    return nullptr;
-  }
-
-  auto *param = reinterpret_cast<LshProjectionParameter *>(malloc(sizeof(LshProjectionParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "malloc LshProjectionParameter failed.";
-    return nullptr;
-  }
-  memset(param, 0, sizeof(LshProjectionParameter));
-
-  param->op_parameter_.type_ = primitive->value_type();
-  param->lsh_type_ = value->type();
-  return reinterpret_cast<OpParameter *>(param);
-}
-REG_POPULATE(PrimitiveType_LshProjection, PopulateLshProjectionParameter, SCHEMA_CUR);
-}  // namespace lite
-}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/populate/skip_gram_populate.cc b/mindspore/lite/src/ops/populate/skip_gram_populate.cc
deleted file mode 100644
index f911592cf63..00000000000
--- a/mindspore/lite/src/ops/populate/skip_gram_populate.cc
+++ /dev/null
@@ -1,46 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "src/ops/populate/populate_register.h"
-#include "nnacl/skip_gram_parameter.h"
-using mindspore::schema::PrimitiveType_SkipGram;
-
-namespace mindspore {
-namespace lite {
-OpParameter *PopulateSkipGramParameter(const void *prim) {
-  auto primitive = static_cast<const schema::Primitive *>(prim);
-  MS_ASSERT(primitive != nullptr);
-  auto value = primitive->value_as_SkipGram();
-  if (value == nullptr) {
-    MS_LOG(ERROR) << "value is nullptr";
-    return nullptr;
-  }
-
-  auto *param = reinterpret_cast<SkipGramParameter *>(malloc(sizeof(SkipGramParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "malloc SkipGramParameter failed.";
-    return nullptr;
-  }
-  memset(param, 0, sizeof(SkipGramParameter));
-
-  param->op_parameter_.type_ = primitive->value_type();
-  param->ngram_size = value->ngram_size();
-  param->max_skip_size = value->max_skip_size();
-  param->include_all_ngrams = value->include_all_grams();
-  return reinterpret_cast<OpParameter *>(param);
-}
-REG_POPULATE(PrimitiveType_SkipGram, PopulateSkipGramParameter, SCHEMA_CUR)
-}  // namespace lite
-}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/populate/splice_populate.cc b/mindspore/lite/src/ops/populate/splice_populate.cc
index 2e3a8ef1efa..5ebb09c4b85 100644
--- a/mindspore/lite/src/ops/populate/splice_populate.cc
+++ b/mindspore/lite/src/ops/populate/splice_populate.cc
@@ -44,7 +44,7 @@ OpParameter *PopulateSpliceParameter(const void *prim) {
     return nullptr;
   }
   std::vector<int> primitive_context(context->begin(), context->end());
-  if (primitive_context.size() > std::numeric_limits<int>::max()) {
+  if (static_cast<int>(primitive_context.size()) > std::numeric_limits<int>::max()) {
     MS_LOG(ERROR) << "size is too big.";
     free(param);
     return nullptr;
@@ -74,7 +74,7 @@ OpParameter *PopulateSpliceParameter(const void *prim) {
     return nullptr;
   }
   std::vector<int> primitive_forward_indexes(forward_indexes->begin(), forward_indexes->end());
-  if (primitive_forward_indexes.size() > std::numeric_limits<int>::max()) {
+  if (static_cast<int>(primitive_forward_indexes.size()) > std::numeric_limits<int>::max()) {
     MS_LOG(ERROR) << "size is too big.";
     free(param->context_);
     free(param);
diff --git a/mindspore/lite/src/ops/populate/switch_populate.cc b/mindspore/lite/src/ops/populate/switch_populate.cc
deleted file mode 100644
index 4a8673287ff..00000000000
--- a/mindspore/lite/src/ops/populate/switch_populate.cc
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "src/ops/populate/populate_register.h"
-using mindspore::schema::PrimitiveType_Switch;
-
-namespace mindspore {
-namespace lite {
-OpParameter *PopulateSwitchParameter(const void *prim) {
-  auto primitive = static_cast<const schema::Primitive *>(prim);
-  MS_ASSERT(primitive != nullptr);
-
-  auto *param = reinterpret_cast<OpParameter *>(malloc(sizeof(OpParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "malloc OpParameter failed.";
-    return nullptr;
-  }
-  memset(param, 0, sizeof(OpParameter));
-
-  param->type_ = primitive->value_type();
-  return reinterpret_cast<OpParameter *>(param);
-}
-
-REG_POPULATE(PrimitiveType_Switch, PopulateSwitchParameter, SCHEMA_CUR)
-}  // namespace lite
-}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/populate/tensor_array_populate.cc b/mindspore/lite/src/ops/populate/tensor_array_populate.cc
deleted file mode 100644
index 16e39a1dcc1..00000000000
--- a/mindspore/lite/src/ops/populate/tensor_array_populate.cc
+++ /dev/null
@@ -1,81 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "src/ops/populate/populate_register.h"
-#include "nnacl/op_base.h"
-#include "nnacl/tensor_array_parameter.h"
-
-using mindspore::schema::PrimitiveType_TensorArray;
-using mindspore::schema::PrimitiveType_TensorArrayRead;
-using mindspore::schema::PrimitiveType_TensorArrayWrite;
-
-namespace mindspore {
-namespace lite {
-OpParameter *PopulateTensorArrayParameter(const void *prim) {
-  auto primitive = static_cast<const schema::Primitive *>(prim);
-  MS_ASSERT(primitive != nullptr);
-  auto value = primitive->value_as_TensorArray();
-  if (value == nullptr) {
-    MS_LOG(ERROR) << "cast to tensor array primitive failed!";
-    return nullptr;
-  }
-
-  auto param = reinterpret_cast<TensorArrayParameter *>(malloc(sizeof(TensorArrayParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "malloc TensorArray nnacl Parameter failed.";
-    return nullptr;
-  }
-  memset(param, 0, sizeof(TensorArrayParameter));
-
-  param->op_parameter_.type_ = primitive->value_type();
-  bool dynamic_size = value->dynamic_size();
-  param->dynamic_size_ = dynamic_size;
-  bool identical_element_shapes = value->identical_element_shapes();
-  param->identical_element_shapes_ = identical_element_shapes;
-  std::vector<int> primitive_element_shape(value->element_shape()->begin(), value->element_shape()->end());
-  param->element_shape_size_ = primitive_element_shape.size();
-  int size = sizeof(int) * param->element_shape_size_;
-  param->element_shape_ = static_cast<int *>(malloc(size));
-  if (param->element_shape_ == nullptr) {
-    MS_LOG(ERROR) << "malloc element_shape failed!";
-    free(param);
-    return nullptr;
-  }
-  memset(param->element_shape_, 0, size);
-  memcpy(param->element_shape_, primitive_element_shape.data(), size);
-  param->data_type_ = value->data_type();
-  return reinterpret_cast<OpParameter *>(param);
-}
-
-OpParameter *PopulateTACommonParameter(const void *prim) {
-  auto primitive = static_cast<const schema::Primitive *>(prim);
-  MS_ASSERT(primitive != nullptr);
-
-  auto *param = reinterpret_cast<OpParameter *>(malloc(sizeof(OpParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "malloc OpParameter failed.";
-    return nullptr;
-  }
-  memset(param, 0, sizeof(OpParameter));
-
-  param->type_ = primitive->value_type();
-  return reinterpret_cast<OpParameter *>(param);
-}
-
-REG_POPULATE(PrimitiveType_TensorArray, PopulateTensorArrayParameter, SCHEMA_CUR)
-REG_POPULATE(PrimitiveType_TensorArrayRead, PopulateTACommonParameter, SCHEMA_CUR)
-REG_POPULATE(PrimitiveType_TensorArrayWrite, PopulateTACommonParameter, SCHEMA_CUR)
-}  // namespace lite
-}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/populate/tensorlistfromtensor_populate.cc b/mindspore/lite/src/ops/populate/tensorlistfromtensor_populate.cc
deleted file mode 100644
index 0999c11ad4a..00000000000
--- a/mindspore/lite/src/ops/populate/tensorlistfromtensor_populate.cc
+++ /dev/null
@@ -1,46 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "nnacl/tensorlist_parameter.h"
-#include "src/ops/populate/populate_register.h"
-using mindspore::schema::PrimitiveType_TensorListFromTensor;
-
-namespace mindspore {
-namespace lite {
-OpParameter *PopulateTensorListFromTensorParameter(const void *prim) {
-  auto primitive = static_cast<const schema::Primitive *>(prim);
-  MS_ASSERT(primitive != nullptr);
-  auto value = primitive->value_as_TensorListFromTensor();
-  if (value == nullptr) {
-    MS_LOG(ERROR) << "value is nullptr";
-    return nullptr;
-  }
-
-  auto *param = reinterpret_cast<TensorListParameter *>(malloc(sizeof(TensorListParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "malloc TensorListParameter failed.";
-    return nullptr;
-  }
-  memset(param, 0, sizeof(TensorListParameter));
-
-  param->op_parameter_.type_ = primitive->value_type();
-  param->shape_type_ = value->shape_type();
-  param->element_dtype_ = value->element_dtype();
-  return reinterpret_cast<OpParameter *>(param);
-}
-REG_POPULATE(PrimitiveType_TensorListFromTensor, PopulateTensorListFromTensorParameter, SCHEMA_CUR);
-}  // namespace lite
-}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/populate/tensorlistgetitem_populate.cc b/mindspore/lite/src/ops/populate/tensorlistgetitem_populate.cc
deleted file mode 100644
index f96fc475936..00000000000
--- a/mindspore/lite/src/ops/populate/tensorlistgetitem_populate.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "src/ops/populate/populate_register.h"
-#include "nnacl/tensorlist_parameter.h"
-using mindspore::schema::PrimitiveType_TensorListGetItem;
-
-namespace mindspore {
-namespace lite {
-OpParameter *PopulateTensorListGetItemParameter(const void *prim) {
-  auto primitive = static_cast<const schema::Primitive *>(prim);
-  MS_ASSERT(primitive != nullptr);
-  auto value = primitive->value_as_TensorListGetItem();
-  if (value == nullptr) {
-    MS_LOG(ERROR) << "value is nullptr";
-    return nullptr;
-  }
-
-  auto *param = reinterpret_cast<TensorListParameter *>(malloc(sizeof(TensorListParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "malloc TensorListParameter failed.";
-    return nullptr;
-  }
-  memset(param, 0, sizeof(TensorListParameter));
-
-  param->op_parameter_.type_ = primitive->value_type();
-  param->element_dtype_ = value->element_dtype();
-  return reinterpret_cast<OpParameter *>(param);
-}
-REG_POPULATE(PrimitiveType_TensorListGetItem, PopulateTensorListGetItemParameter, SCHEMA_CUR);
-}  // namespace lite
-}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/populate/tensorlistreserve_populate.cc b/mindspore/lite/src/ops/populate/tensorlistreserve_populate.cc
deleted file mode 100644
index 37d1ea5f787..00000000000
--- a/mindspore/lite/src/ops/populate/tensorlistreserve_populate.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "src/ops/populate/populate_register.h"
-#include "nnacl/tensorlist_parameter.h"
-using mindspore::schema::PrimitiveType_TensorListReserve;
-
-namespace mindspore {
-namespace lite {
-OpParameter *PopulateTensorListReserveParameter(const void *prim) {
-  auto primitive = static_cast<const schema::Primitive *>(prim);
-  MS_ASSERT(primitive != nullptr);
-  auto value = primitive->value_as_TensorListReserve();
-  if (value == nullptr) {
-    MS_LOG(ERROR) << "value is nullptr";
-    return nullptr;
-  }
-
-  auto *param = reinterpret_cast<TensorListParameter *>(malloc(sizeof(TensorListParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "malloc TensorListParameter failed.";
-    return nullptr;
-  }
-  memset(param, 0, sizeof(TensorListParameter));
-
-  param->op_parameter_.type_ = primitive->value_type();
-  param->element_dtype_ = value->element_dtype();
-  return reinterpret_cast<OpParameter *>(param);
-}
-REG_POPULATE(PrimitiveType_TensorListReserve, PopulateTensorListReserveParameter, SCHEMA_CUR);
-}  // namespace lite
-}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/populate/tensorlistsetlitem_populate.cc b/mindspore/lite/src/ops/populate/tensorlistsetlitem_populate.cc
deleted file mode 100644
index 2a03483988f..00000000000
--- a/mindspore/lite/src/ops/populate/tensorlistsetlitem_populate.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "src/ops/populate/populate_register.h"
-#include "nnacl/tensorlist_parameter.h"
-using mindspore::schema::PrimitiveType_TensorListSetItem;
-
-namespace mindspore {
-namespace lite {
-OpParameter *PopulateTensorListSetItemParameter(const void *prim) {
-  auto primitive = static_cast<const schema::Primitive *>(prim);
-  MS_ASSERT(primitive != nullptr);
-  auto value = primitive->value_as_TensorListSetItem();
-  if (value == nullptr) {
-    MS_LOG(ERROR) << "value is nullptr";
-    return nullptr;
-  }
-
-  auto *param = reinterpret_cast<TensorListParameter *>(malloc(sizeof(TensorListParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "malloc TensorListParameter failed.";
-    return nullptr;
-  }
-  memset(param, 0, sizeof(TensorListParameter));
-
-  param->op_parameter_.type_ = primitive->value_type();
-  param->element_dtype_ = value->element_dtype();
-  return reinterpret_cast<OpParameter *>(param);
-}
-REG_POPULATE(PrimitiveType_TensorListSetItem, PopulateTensorListSetItemParameter, SCHEMA_CUR);
-}  // namespace lite
-}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/populate/tensorliststack_populate.cc b/mindspore/lite/src/ops/populate/tensorliststack_populate.cc
deleted file mode 100644
index 10f5a3b8d7b..00000000000
--- a/mindspore/lite/src/ops/populate/tensorliststack_populate.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "src/ops/populate/populate_register.h"
-#include "nnacl/tensorlist_parameter.h"
-using mindspore::schema::PrimitiveType_TensorListStack;
-
-namespace mindspore {
-namespace lite {
-OpParameter *PopulateTensorListStackParameter(const void *prim) {
-  auto primitive = static_cast<const schema::Primitive *>(prim);
-  MS_ASSERT(primitive != nullptr);
-  auto value = primitive->value_as_TensorListStack();
-  if (value == nullptr) {
-    MS_LOG(ERROR) << "value is nullptr";
-    return nullptr;
-  }
-
-  auto *param = reinterpret_cast<TensorListParameter *>(malloc(sizeof(TensorListParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "malloc TensorListParameter failed.";
-    return nullptr;
-  }
-  memset(param, 0, sizeof(TensorListParameter));
-
-  param->op_parameter_.type_ = primitive->value_type();
-  param->element_dtype_ = value->element_dtype();
-  param->num_element_ = value->num_elements();
-  return reinterpret_cast<OpParameter *>(param);
-}
-REG_POPULATE(PrimitiveType_TensorListStack, PopulateTensorListStackParameter, SCHEMA_CUR);
-}  // namespace lite
-}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/populate/v0/custom_extract_features_populate_v0.cc b/mindspore/lite/src/ops/populate/v0/custom_extract_features_populate_v0.cc
deleted file mode 100644
index 684f489ee4c..00000000000
--- a/mindspore/lite/src/ops/populate/v0/custom_extract_features_populate_v0.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "schema/model_v0_generated.h"
-#include "src/ops/populate/populate_register.h"
-
-namespace mindspore {
-namespace lite {
-namespace {
-OpParameter *PopulateExtractFeaturesParameter(const void *prim) {
-  auto *param = reinterpret_cast<OpParameter *>(malloc(sizeof(OpParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "new OpParameter failed.";
-    return nullptr;
-  }
-  memset(param, 0, sizeof(OpParameter));
-  auto *primitive = reinterpret_cast<const schema::v0::Primitive *>(prim);
-  MS_ASSERT(primitive != nullptr);
-  auto type = primitive->value_type();
-  if (type == schema::v0::PrimitiveType_CustomExtractFeatures) {
-    param->type_ = schema::PrimitiveType_CustomExtractFeatures;
-  } else {
-    param->type_ = type;
-  }
-  return param;
-}
-}  // namespace
-
-Registry g_customExtractFeaturesV0ParameterRegistry(schema::v0::PrimitiveType_CustomExtractFeatures,
-                                                    PopulateExtractFeaturesParameter, SCHEMA_V0);
-}  // namespace lite
-}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/populate/v0/custom_normalize_populate_v0.cc b/mindspore/lite/src/ops/populate/v0/custom_normalize_populate_v0.cc
deleted file mode 100644
index c8e39edbe45..00000000000
--- a/mindspore/lite/src/ops/populate/v0/custom_normalize_populate_v0.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "schema/model_v0_generated.h"
-#include "src/ops/populate/populate_register.h"
-
-namespace mindspore {
-namespace lite {
-namespace {
-OpParameter *PopulateCustomNormalizeParameter(const void *prim) {
-  auto *param = reinterpret_cast<OpParameter *>(malloc(sizeof(OpParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "new OpParameter failed.";
-    return nullptr;
-  }
-  memset(param, 0, sizeof(OpParameter));
-  auto *primitive = reinterpret_cast<const schema::v0::Primitive *>(prim);
-  MS_ASSERT(primitive != nullptr);
-  auto type = primitive->value_type();
-  if (type == schema::v0::PrimitiveType_CustomNormalize) {
-    param->type_ = schema::PrimitiveType_CustomNormalize;
-  } else {
-    param->type_ = type;
-  }
-  return param;
-}
-}  // namespace
-
-Registry g_customNormalizeV0ParameterRegistry(schema::v0::PrimitiveType_CustomNormalize,
-                                              PopulateCustomNormalizeParameter, SCHEMA_V0);
-}  // namespace lite
-}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/populate/v0/custom_predict_populate_v0.cc b/mindspore/lite/src/ops/populate/v0/custom_predict_populate_v0.cc
deleted file mode 100644
index c18ae05f634..00000000000
--- a/mindspore/lite/src/ops/populate/v0/custom_predict_populate_v0.cc
+++ /dev/null
@@ -1,49 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "schema/model_v0_generated.h"
-#include "src/ops/populate/populate_register.h"
-#include "nnacl/predict_parameter.h"
-
-namespace mindspore {
-namespace lite {
-namespace {
-OpParameter *PopulateCustomPredictParameter(const void *prim) {
-  auto *primitive = static_cast<const schema::v0::Primitive *>(prim);
-  MS_ASSERT(primitive != nullptr);
-  auto custom_predict_prim = primitive->value_as_CustomPredict();
-  if (custom_predict_prim == nullptr) {
-    MS_LOG(ERROR) << "custom_predict_prim is nullptr";
-    return nullptr;
-  }
-  auto *param = reinterpret_cast<PredictParameter *>(malloc(sizeof(PredictParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "malloc param failed.";
-    return nullptr;
-  }
-  memset(param, 0, sizeof(PredictParameter));
-  param->op_parameter_.type_ = schema::PrimitiveType_CustomPredict;
-
-  param->output_num = custom_predict_prim->outputNum();
-  param->weight_threshold = custom_predict_prim->weightThreshold();
-  return reinterpret_cast<OpParameter *>(param);
-}
-}  // namespace
-
-Registry g_customPredictV0ParameterRegistry(schema::v0::PrimitiveType_CustomPredict, PopulateCustomPredictParameter,
-                                            SCHEMA_V0);
-}  // namespace lite
-}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/populate/v0/hashtable_lookup_populate_v0.cc b/mindspore/lite/src/ops/populate/v0/hashtable_lookup_populate_v0.cc
deleted file mode 100644
index 2d1b3029858..00000000000
--- a/mindspore/lite/src/ops/populate/v0/hashtable_lookup_populate_v0.cc
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "schema/model_v0_generated.h"
-#include "src/ops/populate/populate_register.h"
-
-namespace mindspore {
-namespace lite {
-namespace {
-OpParameter *PopulateHashtableLookupParameter(const void *prim) {
-  OpParameter *param = reinterpret_cast<OpParameter *>(malloc(sizeof(OpParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "new OpParameter failed.";
-    return nullptr;
-  }
-  memset(param, 0, sizeof(OpParameter));
-  param->type_ = schema::PrimitiveType_HashtableLookup;
-  return param;
-}
-}  // namespace
-
-Registry g_hashtableLookupV0ParameterRegistry(schema::v0::PrimitiveType_HashtableLookup,
-                                              PopulateHashtableLookupParameter, SCHEMA_V0);
-}  // namespace lite
-}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/populate/v0/lsh_projection_populate_v0.cc b/mindspore/lite/src/ops/populate/v0/lsh_projection_populate_v0.cc
deleted file mode 100644
index 2511a56618f..00000000000
--- a/mindspore/lite/src/ops/populate/v0/lsh_projection_populate_v0.cc
+++ /dev/null
@@ -1,48 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "schema/model_v0_generated.h"
-#include "src/ops/populate/populate_register.h"
-#include "nnacl/lsh_projection_parameter.h"
-
-namespace mindspore {
-namespace lite {
-namespace {
-OpParameter *PopulateLshProjectionParameter(const void *prim) {
-  auto *primitive = static_cast<const schema::v0::Primitive *>(prim);
-  MS_ASSERT(primitive != nullptr);
-  auto lsh_projection_prim = primitive->value_as_LshProjection();
-  if (lsh_projection_prim == nullptr) {
-    MS_LOG(ERROR) << "lsh_projection_prim is nullptr";
-    return nullptr;
-  }
-  auto *lsh_project_param = reinterpret_cast<LshProjectionParameter *>(malloc(sizeof(LshProjectionParameter)));
-  if (lsh_project_param == nullptr) {
-    MS_LOG(ERROR) << "malloc LshProjectionParameter failed.";
-    return nullptr;
-  }
-  memset(lsh_project_param, 0, sizeof(LshProjectionParameter));
-  lsh_project_param->op_parameter_.type_ = schema::PrimitiveType_LshProjection;
-
-  lsh_project_param->lsh_type_ = lsh_projection_prim->type();
-  return reinterpret_cast<OpParameter *>(lsh_project_param);
-}
-}  // namespace
-
-Registry g_lshProjectionV0ParameterRegistry(schema::v0::PrimitiveType_LshProjection, PopulateLshProjectionParameter,
-                                            SCHEMA_V0);
-}  // namespace lite
-}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/populate/v0/skip_gram_populate_v0.cc b/mindspore/lite/src/ops/populate/v0/skip_gram_populate_v0.cc
deleted file mode 100644
index 00d1ff787ac..00000000000
--- a/mindspore/lite/src/ops/populate/v0/skip_gram_populate_v0.cc
+++ /dev/null
@@ -1,49 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "schema/model_v0_generated.h"
-#include "src/ops/populate/populate_register.h"
-#include "nnacl/skip_gram_parameter.h"
-
-namespace mindspore {
-namespace lite {
-namespace {
-OpParameter *PopulateSkipGramParameter(const void *prim) {
-  auto *primitive = static_cast<const schema::v0::Primitive *>(prim);
-  MS_ASSERT(primitive != nullptr);
-  auto skip_gram_prim = primitive->value_as_SkipGram();
-  if (skip_gram_prim == nullptr) {
-    MS_LOG(ERROR) << "skip_gram_prim is nullptr";
-    return nullptr;
-  }
-  auto *skipGramParameter = reinterpret_cast<SkipGramParameter *>(malloc(sizeof(SkipGramParameter)));
-  if (skipGramParameter == nullptr) {
-    MS_LOG(ERROR) << "malloc SkipGramParameter failed.";
-    return nullptr;
-  }
-  memset(skipGramParameter, 0, sizeof(SkipGramParameter));
-  skipGramParameter->op_parameter_.type_ = schema::PrimitiveType_SkipGram;
-
-  skipGramParameter->ngram_size = skip_gram_prim->ngramSize();
-  skipGramParameter->max_skip_size = skip_gram_prim->maxSkipSize();
-  skipGramParameter->include_all_ngrams = skip_gram_prim->includeAllGrams();
-  return reinterpret_cast<OpParameter *>(skipGramParameter);
-}
-}  // namespace
-
-Registry g_skipGramV0ParameterRegistry(schema::v0::PrimitiveType_SkipGram, PopulateSkipGramParameter, SCHEMA_V0);
-}  // namespace lite
-}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/populate/v0/switch_populate_v0.cc b/mindspore/lite/src/ops/populate/v0/switch_populate_v0.cc
deleted file mode 100644
index 3cda18f0918..00000000000
--- a/mindspore/lite/src/ops/populate/v0/switch_populate_v0.cc
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "schema/model_v0_generated.h"
-#include "src/ops/populate/populate_register.h"
-
-namespace mindspore {
-namespace lite {
-namespace {
-OpParameter *PopulateSwitchParameter(const void *prim) {
-  OpParameter *switch_parameter = reinterpret_cast<OpParameter *>(malloc(sizeof(OpParameter)));
-  if (switch_parameter == nullptr) {
-    MS_LOG(ERROR) << "malloc SwitchParameter failed.";
-    return nullptr;
-  }
-  memset(switch_parameter, 0, sizeof(OpParameter));
-  switch_parameter->type_ = schema::PrimitiveType_Switch;
-
-  return reinterpret_cast<OpParameter *>(switch_parameter);
-}
-}  // namespace
-
-Registry g_switchv0ParameterRegistry(schema::v0::PrimitiveType_Switch, PopulateSwitchParameter, SCHEMA_V0);
-}  // namespace lite
-}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/populate/v0/tensorlistfromtensor_populate_v0.cc b/mindspore/lite/src/ops/populate/v0/tensorlistfromtensor_populate_v0.cc
deleted file mode 100644
index 0f1bf51132c..00000000000
--- a/mindspore/lite/src/ops/populate/v0/tensorlistfromtensor_populate_v0.cc
+++ /dev/null
@@ -1,46 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "schema/model_v0_generated.h"
-#include "nnacl/tensorlist_parameter.h"
-#include "src/ops/populate/populate_register.h"
-
-namespace mindspore {
-namespace lite {
-namespace {
-OpParameter *PopulateTensorListFromTensorParameter(const void *prim) {
-  auto *primitive = static_cast<const schema::v0::Primitive *>(prim);
-  MS_ASSERT(primitive != nullptr);
-  auto tensorList = primitive->value_as_TensorListFromTensor();
-  if (tensorList == nullptr) {
-    MS_LOG(ERROR) << "tensorList is nullptr";
-    return nullptr;
-  }
-  auto *TensorList_param = reinterpret_cast<TensorListParameter *>(malloc(sizeof(TensorListParameter)));
-  if (TensorList_param == nullptr) {
-    MS_LOG(ERROR) << "malloc TensorListParameter failed.";
-    return nullptr;
-  }
-  memset(TensorList_param, 0, sizeof(TensorListParameter));
-  TensorList_param->op_parameter_.type_ = schema::PrimitiveType_TensorListFromTensor;
-  TensorList_param->shape_type_ = tensorList->shapeType();
-  TensorList_param->element_dtype_ = tensorList->elementDType();
-  return reinterpret_cast<OpParameter *>(TensorList_param);
-}
-}  // namespace
-Registry g_tensorListFromTensorV0ParameterRegistry(schema::v0::PrimitiveType_TensorListFromTensor,
-                                                   PopulateTensorListFromTensorParameter, SCHEMA_V0);
-}  // namespace lite
-}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/populate/v0/tensorlistgetitem_populate_v0.cc b/mindspore/lite/src/ops/populate/v0/tensorlistgetitem_populate_v0.cc
deleted file mode 100644
index bdaf9c4a0d4..00000000000
--- a/mindspore/lite/src/ops/populate/v0/tensorlistgetitem_populate_v0.cc
+++ /dev/null
@@ -1,46 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "schema/model_v0_generated.h"
-#include "src/ops/populate/populate_register.h"
-#include "nnacl/tensorlist_parameter.h"
-
-namespace mindspore {
-namespace lite {
-namespace {
-OpParameter *PopulateTensorListGetItemParameter(const void *prim) {
-  auto *primitive = static_cast<const schema::v0::Primitive *>(prim);
-  MS_ASSERT(primitive != nullptr);
-  auto tensorList_prim = primitive->value_as_TensorListGetItem();
-  if (tensorList_prim == nullptr) {
-    MS_LOG(ERROR) << "tensorList_prim is nullptr";
-    return nullptr;
-  }
-  auto *getItem_param = reinterpret_cast<TensorListParameter *>(malloc(sizeof(TensorListParameter)));
-  if (getItem_param == nullptr) {
-    MS_LOG(ERROR) << "malloc TensorListParameter failed.";
-    return nullptr;
-  }
-  memset(getItem_param, 0, sizeof(TensorListParameter));
-  getItem_param->op_parameter_.type_ = schema::PrimitiveType_TensorListGetItem;
-  getItem_param->element_dtype_ = tensorList_prim->elementDType();
-  return reinterpret_cast<OpParameter *>(getItem_param);
-}
-}  // namespace
-
-Registry g_tensorListGetItemV0ParameterRegistry(schema::v0::PrimitiveType_TensorListGetItem,
-                                                PopulateTensorListGetItemParameter, SCHEMA_V0);
-}  // namespace lite
-}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/populate/v0/tensorlistreserve_populate_v0.cc b/mindspore/lite/src/ops/populate/v0/tensorlistreserve_populate_v0.cc
deleted file mode 100644
index 0863ef4b30d..00000000000
--- a/mindspore/lite/src/ops/populate/v0/tensorlistreserve_populate_v0.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "schema/model_v0_generated.h"
-#include "src/ops/populate/populate_register.h"
-#include "nnacl/tensorlist_parameter.h"
-
-namespace mindspore {
-namespace lite {
-namespace {
-OpParameter *PopulateTensorListReserveParameter(const void *prim) {
-  auto *primitive = static_cast<const schema::v0::Primitive *>(prim);
-  MS_ASSERT(primitive != nullptr);
-  auto tensorList_prim = primitive->value_as_TensorListReserve();
-  if (tensorList_prim == nullptr) {
-    MS_LOG(ERROR) << "tensorList_prim is nullptr";
-    return nullptr;
-  }
-  auto *reserve_param = reinterpret_cast<TensorListParameter *>(malloc(sizeof(TensorListParameter)));
-  if (reserve_param == nullptr) {
-    MS_LOG(ERROR) << "malloc TensorListParameter failed.";
-    return nullptr;
-  }
-  memset(reserve_param, 0, sizeof(TensorListParameter));
-  reserve_param->op_parameter_.type_ = schema::PrimitiveType_TensorListReserve;
-  reserve_param->element_dtype_ = tensorList_prim->elementDType();
-  return reinterpret_cast<OpParameter *>(reserve_param);
-}
-}  // namespace
-Registry g_tensorListReserveV0ParameterRegistry(schema::v0::PrimitiveType_TensorListReserve,
-                                                PopulateTensorListReserveParameter, SCHEMA_V0);
-}  // namespace lite
-}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/populate/v0/tensorlistsetlitem_populate_v0.cc b/mindspore/lite/src/ops/populate/v0/tensorlistsetlitem_populate_v0.cc
deleted file mode 100644
index e0091787e9b..00000000000
--- a/mindspore/lite/src/ops/populate/v0/tensorlistsetlitem_populate_v0.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "schema/model_v0_generated.h"
-#include "src/ops/populate/populate_register.h"
-#include "nnacl/tensorlist_parameter.h"
-
-namespace mindspore {
-namespace lite {
-namespace {
-OpParameter *PopulateTensorListSetItemParameter(const void *prim) {
-  auto *primitive = static_cast<const schema::v0::Primitive *>(prim);
-  MS_ASSERT(primitive != nullptr);
-  auto tensorList_prim = primitive->value_as_TensorListSetItem();
-  if (tensorList_prim == nullptr) {
-    MS_LOG(ERROR) << "tensorList_prim is nullptr";
-    return nullptr;
-  }
-  auto *setItem_param = reinterpret_cast<TensorListParameter *>(malloc(sizeof(TensorListParameter)));
-  if (setItem_param == nullptr) {
-    MS_LOG(ERROR) << "malloc TensorListParameter failed.";
-    return nullptr;
-  }
-  memset(setItem_param, 0, sizeof(TensorListParameter));
-  setItem_param->op_parameter_.type_ = schema::PrimitiveType_TensorListSetItem;
-  setItem_param->element_dtype_ = tensorList_prim->elementDType();
-  return reinterpret_cast<OpParameter *>(setItem_param);
-}
-}  // namespace
-Registry g_tensorListSetItemV0ParameterRegistry(schema::v0::PrimitiveType_TensorListSetItem,
-                                                PopulateTensorListSetItemParameter, SCHEMA_V0);
-}  // namespace lite
-}  // namespace mindspore
diff --git a/mindspore/lite/src/ops/populate/v0/tensorliststack_populate_v0.cc b/mindspore/lite/src/ops/populate/v0/tensorliststack_populate_v0.cc
deleted file mode 100644
index 9dba8f930d8..00000000000
--- a/mindspore/lite/src/ops/populate/v0/tensorliststack_populate_v0.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "schema/model_v0_generated.h"
-#include "src/ops/populate/populate_register.h"
-#include "nnacl/tensorlist_parameter.h"
-
-namespace mindspore {
-namespace lite {
-namespace {
-OpParameter *PopulateTensorListStackParameter(const void *prim) {
-  auto *primitive = static_cast<const schema::v0::Primitive *>(prim);
-  MS_ASSERT(primitive != nullptr);
-  auto tensorList_prim = primitive->value_as_TensorListStack();
-  if (tensorList_prim == nullptr) {
-    MS_LOG(ERROR) << "tensorList_prim is nullptr";
-    return nullptr;
-  }
-  auto *stack_param = reinterpret_cast<TensorListParameter *>(malloc(sizeof(TensorListParameter)));
-  if (stack_param == nullptr) {
-    MS_LOG(ERROR) << "malloc TensorListParameter failed.";
-    return nullptr;
-  }
-  memset(stack_param, 0, sizeof(TensorListParameter));
-  stack_param->op_parameter_.type_ = schema::PrimitiveType_TensorListStack;
-  stack_param->element_dtype_ = tensorList_prim->elementDType();
-  stack_param->num_element_ = tensorList_prim->numElements();
-  return reinterpret_cast<OpParameter *>(stack_param);
-}
-}  // namespace
-
-Registry g_tensorListStackV0ParameterRegistry(schema::v0::PrimitiveType_TensorListStack,
-                                              PopulateTensorListStackParameter, SCHEMA_V0);
-}  // namespace lite
-}  // namespace mindspore
diff --git a/mindspore/lite/src/registry/kernel_interface.cc b/mindspore/lite/src/registry/kernel_interface.cc
deleted file mode 100644
index 05adf371e15..00000000000
--- a/mindspore/lite/src/registry/kernel_interface.cc
+++ /dev/null
@@ -1,37 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "include/registry/kernel_interface.h"
-#include <set>
-#include <utility>
-#include "src/registry/kernel_interface_registry.h"
-
-namespace mindspore {
-namespace kernel {
-int RegisterKernelInterface::Reg(const std::string &provider, int op_type, KernelInterfaceCreator creator) {
-  return lite::KernelInterfaceRegistry::Instance()->Reg(provider, op_type, creator);
-}
-
-int RegisterKernelInterface::CustomReg(const std::string &provider, const std::string &op_type,
-                                       KernelInterfaceCreator creator) {
-  return lite::KernelInterfaceRegistry::Instance()->CustomReg(provider, op_type, creator);
-}
-
-std::shared_ptr<kernel::KernelInterface> RegisterKernelInterface::GetKernelInterface(
-  const std::string &provider, const schema::Primitive *primitive) {
-  return lite::KernelInterfaceRegistry::Instance()->GetKernelInterface(provider, primitive);
-}
-}  // namespace kernel
-}  // namespace mindspore
diff --git a/mindspore/lite/src/registry/kernel_interface_registry.cc b/mindspore/lite/src/registry/kernel_interface_registry.cc
index df5fe5ba437..32078e6f54c 100644
--- a/mindspore/lite/src/registry/kernel_interface_registry.cc
+++ b/mindspore/lite/src/registry/kernel_interface_registry.cc
@@ -15,17 +15,17 @@
  */
 #include "src/registry/kernel_interface_registry.h"
 #include <memory>
-#include "include/registry/kernel_interface.h"
+#include "include/registry/register_kernel_interface.h"
 #include "include/errorcode.h"
 #include "src/common/log_adapter.h"
 #include "src/common/version_manager.h"
 #include "schema/model_generated.h"
 
-using mindspore::kernel::KernelInterfaceCreator;
+using mindspore::registry::KernelInterfaceCreator;
 using mindspore::schema::PrimitiveType_MAX;
 using mindspore::schema::PrimitiveType_MIN;
 namespace mindspore {
-namespace lite {
+namespace registry {
 namespace {
 static const auto kMaxKernelNum = PrimitiveType_MAX - PrimitiveType_MIN;
 std::string GetCustomType(const schema::Primitive *primitive) {
@@ -35,10 +35,10 @@ std::string GetCustomType(const schema::Primitive *primitive) {
 }
 }  // namespace
 
-int KernelInterfaceRegistry::CustomReg(const std::string &provider, const std::string &type,
-                                       KernelInterfaceCreator creator) {
+Status KernelInterfaceRegistry::CustomReg(const std::string &provider, const std::string &type,
+                                          KernelInterfaceCreator creator) {
   custom_creators_[provider][type] = creator;
-  return RET_OK;
+  return kSuccess;
 }
 
 std::shared_ptr<kernel::KernelInterface> KernelInterfaceRegistry::GetCacheInterface(const std::string &provider,
@@ -124,10 +124,10 @@ std::shared_ptr<kernel::KernelInterface> KernelInterfaceRegistry::GetKernelInter
   return nullptr;
 }
 
-int KernelInterfaceRegistry::Reg(const std::string &provider, int op_type, KernelInterfaceCreator creator) {
+Status KernelInterfaceRegistry::Reg(const std::string &provider, int op_type, KernelInterfaceCreator creator) {
   if (op_type < PrimitiveType_MIN || op_type > kMaxKernelNum) {
     MS_LOG(ERROR) << "reg op_type invalid!op_type: " << op_type << ", max value: " << kMaxKernelNum;
-    return RET_ERROR;
+    return kLiteError;
   }
 
   std::unique_lock<std::mutex> lock(mutex_);
@@ -137,12 +137,12 @@ int KernelInterfaceRegistry::Reg(const std::string &provider, int op_type, Kerne
       reinterpret_cast<KernelInterfaceCreator *>(calloc(kMaxKernelNum, sizeof(KernelInterfaceCreator)));
     if (kernel_creators_[provider] == nullptr) {
       MS_LOG(ERROR) << "malloc kernel dev delegate creator fail!";
-      return RET_ERROR;
+      return kLiteError;
     }
   }
 
   kernel_creators_[provider][op_type] = creator;
-  return RET_OK;
+  return kSuccess;
 }
 
 KernelInterfaceRegistry::~KernelInterfaceRegistry() {
@@ -151,5 +151,5 @@ KernelInterfaceRegistry::~KernelInterfaceRegistry() {
     item.second = nullptr;
   }
 }
-}  // namespace lite
+}  // namespace registry
 }  // namespace mindspore
diff --git a/mindspore/lite/src/registry/kernel_interface_registry.h b/mindspore/lite/src/registry/kernel_interface_registry.h
index 18849dd211b..0739eb64ab7 100644
--- a/mindspore/lite/src/registry/kernel_interface_registry.h
+++ b/mindspore/lite/src/registry/kernel_interface_registry.h
@@ -22,11 +22,11 @@
 #include <memory>
 #include <mutex>
 #include <set>
-#include "include/registry/kernel_interface.h"
+#include "include/registry/register_kernel_interface.h"
 #include "include/model.h"
 
 namespace mindspore {
-namespace lite {
+namespace registry {
 class KernelInterfaceRegistry {
  public:
   static KernelInterfaceRegistry *Instance() {
@@ -36,8 +36,8 @@ class KernelInterfaceRegistry {
 
   std::shared_ptr<kernel::KernelInterface> GetKernelInterface(const std::string &provider,
                                                               const schema::Primitive *primitive);
-  int CustomReg(const std::string &provider, const std::string &op_type, kernel::KernelInterfaceCreator creator);
-  int Reg(const std::string &provider, int op_type, kernel::KernelInterfaceCreator creator);
+  Status CustomReg(const std::string &provider, const std::string &op_type, registry::KernelInterfaceCreator creator);
+  Status Reg(const std::string &provider, int op_type, registry::KernelInterfaceCreator creator);
   virtual ~KernelInterfaceRegistry();
 
  private:
@@ -49,13 +49,13 @@ class KernelInterfaceRegistry {
 
   std::mutex mutex_;
   // key: provider
-  std::map<std::string, kernel::KernelInterfaceCreator *> kernel_creators_;
+  std::map<std::string, registry::KernelInterfaceCreator *> kernel_creators_;
   std::map<std::string, std::map<int, std::shared_ptr<kernel::KernelInterface>>> kernel_interfaces_;
   // key: provider        key: custom type
-  std::map<std::string, std::map<std::string, kernel::KernelInterfaceCreator>> custom_creators_;
+  std::map<std::string, std::map<std::string, registry::KernelInterfaceCreator>> custom_creators_;
   std::map<std::string, std::map<std::string, std::shared_ptr<kernel::KernelInterface>>> custom_kernels_;
 };
-}  // namespace lite
+}  // namespace registry
 }  // namespace mindspore
 
 #endif  // MINDSPORE_LITE_SRC_REGISTRY_KERNEL_INTERFACE_REGISTRY_H_
diff --git a/mindspore/lite/src/registry/register_kernel.cc b/mindspore/lite/src/registry/register_kernel.cc
index 07743fa677e..0bc68e83f67 100644
--- a/mindspore/lite/src/registry/register_kernel.cc
+++ b/mindspore/lite/src/registry/register_kernel.cc
@@ -16,18 +16,39 @@
 
 #include "include/registry/register_kernel.h"
 #include <set>
+#include "include/errorcode.h"
+#include "src/common/log_adapter.h"
 #include "src/registry/register_kernel_impl.h"
 
 namespace mindspore {
-namespace kernel {
-int RegisterKernel::RegCustomKernel(const std::string &arch, const std::string &provider, TypeId data_type,
-                                    const std::string &type, CreateKernel creator) {
-  return lite::RegistryKernelImpl::GetInstance()->RegCustomKernel(arch, provider, data_type, type, creator);
+namespace registry {
+Status RegisterKernel::RegCustomKernel(const std::string &arch, const std::string &provider, DataType data_type,
+                                       const std::string &type, CreateKernel creator) {
+#ifndef CUSTOM_KERNEL_REGISTRY_CLIP
+  return RegistryKernelImpl::GetInstance()->RegCustomKernel(arch, provider, data_type, type, creator);
+#else
+  MS_LOG(ERROR) << unsupport_custom_kernel_register_log;
+  return lite::RET_NOT_SUPPORT;
+#endif
 }
 
-int RegisterKernel::RegKernel(const std::string &arch, const std::string &provider, TypeId data_type, int op_type,
-                              CreateKernel creator) {
-  return lite::RegistryKernelImpl::GetInstance()->RegKernel(arch, provider, data_type, op_type, creator);
+Status RegisterKernel::RegKernel(const std::string &arch, const std::string &provider, DataType data_type, int op_type,
+                                 CreateKernel creator) {
+#ifndef CUSTOM_KERNEL_REGISTRY_CLIP
+  return RegistryKernelImpl::GetInstance()->RegKernel(arch, provider, data_type, op_type, creator);
+#else
+  MS_LOG(ERROR) << unsupport_custom_kernel_register_log;
+  return lite::RET_NOT_SUPPORT;
+#endif
 }
-}  // namespace kernel
+
+CreateKernel RegisterKernel::GetCreator(const schema::Primitive *primitive, KernelDesc *desc) {
+#ifndef CUSTOM_KERNEL_REGISTRY_CLIP
+  return RegistryKernelImpl::GetInstance()->GetProviderCreator(primitive, desc);
+#else
+  MS_LOG(ERROR) << unsupport_custom_kernel_register_log;
+  return nullptr;
+#endif
+}
+}  // namespace registry
 }  // namespace mindspore
diff --git a/mindspore/lite/src/registry/register_kernel_impl.cc b/mindspore/lite/src/registry/register_kernel_impl.cc
index b6885eee601..46dd64542be 100644
--- a/mindspore/lite/src/registry/register_kernel_impl.cc
+++ b/mindspore/lite/src/registry/register_kernel_impl.cc
@@ -19,59 +19,65 @@
 #include "src/common/version_manager.h"
 #include "src/common/log_adapter.h"
 
-using mindspore::kernel::CreateKernel;
-using mindspore::kernel::KernelDesc;
-
-namespace mindspore::lite {
+using mindspore::registry::CreateKernel;
+using mindspore::registry::KernelDesc;
+using mindspore::schema::PrimitiveType_MAX;
+using mindspore::schema::PrimitiveType_MIN;
+namespace mindspore::registry {
 namespace {
-static const int kKernelMaxNum = (kNumberTypeEnd - kNumberTypeBegin - 1) * (PrimitiveType_MAX - PrimitiveType_MIN);
+static const auto kKernelMaxNum =
+  (static_cast<int>(DataType::kNumberTypeEnd) - static_cast<int>(DataType::kNumberTypeBegin) - 1) *
+  (PrimitiveType_MAX - PrimitiveType_MIN);
+static const auto kDataTypeLen =
+  static_cast<int>(DataType::kNumberTypeEnd) - static_cast<int>(DataType::kNumberTypeBegin) - 1;
+static const auto kOpTypeLen = PrimitiveType_MAX - PrimitiveType_MIN;
 }  // namespace
 
-int RegistryKernelImpl::GetFuncIndex(const kernel::KernelDesc &desc) {
-  if (desc.data_type >= kNumberTypeEnd) {
+int RegistryKernelImpl::GetFuncIndex(const KernelDesc &desc) {
+  if (desc.data_type >= DataType::kNumberTypeEnd) {
     return -1;
   }
-  int data_type_index = static_cast<int>(desc.data_type) - kNumberTypeBegin - 1;
+  int data_type_index = static_cast<int>(desc.data_type) - static_cast<int>(DataType::kNumberTypeBegin) - 1;
   if (data_type_index < 0) {
     return -1;
   }
-  return data_type_index * op_type_length_ + desc.type;
+  return data_type_index * kOpTypeLen + desc.type;
 }
 
-int RegistryKernelImpl::RegCustomKernel(const std::string &arch, const std::string &provider, TypeId data_type,
-                                        const std::string &type, CreateKernel creator) {
-  if (data_type >= kNumberTypeEnd) {
-    MS_LOG(ERROR) << "invalid data_type: " << data_type << "!provider: " << provider;
-    return RET_ERROR;
+Status RegistryKernelImpl::RegCustomKernel(const std::string &arch, const std::string &provider, DataType data_type,
+                                           const std::string &type, CreateKernel creator) {
+  if (data_type >= DataType::kNumberTypeEnd) {
+    MS_LOG(ERROR) << "invalid data_type: " << static_cast<int>(data_type) << "!provider: " << provider;
+    return kLiteError;
   }
   std::unique_lock<std::mutex> lock(lock_);
   if (custom_kernel_creators_[provider][arch][type] == nullptr) {
     custom_kernel_creators_[provider][arch][type] =
-      reinterpret_cast<CreateKernel *>(calloc(data_type_length_, sizeof(CreateKernel)));
+      reinterpret_cast<CreateKernel *>(calloc(kDataTypeLen, sizeof(CreateKernel)));
     if (custom_kernel_creators_[provider][arch][type] == nullptr) {
       MS_LOG(ERROR) << "malloc custom kernel creator fail!provider: " << provider << ", arch: " << arch;
-      return RET_ERROR;
+      return kLiteError;
     }
   }
 
-  int data_type_index = data_type - kNumberTypeBegin - 1;
-  if (data_type_index < 0 || data_type_index >= data_type_length_) {
-    MS_LOG(ERROR) << "invalid data_type: " << data_type << "!provider: " << provider;
-    return RET_ERROR;
+  int data_type_index = static_cast<int>(data_type) - static_cast<int>(DataType::kNumberTypeBegin) - 1;
+  if (data_type_index < 0 || data_type_index >= kDataTypeLen) {
+    MS_LOG(ERROR) << "invalid data_type: " << static_cast<int>(data_type) << "!provider: " << provider;
+    return kLiteError;
   }
   custom_kernel_creators_[provider][arch][type][data_type_index] = creator;
-  return RET_OK;
+  return kSuccess;
 }
 
-int RegistryKernelImpl::RegKernel(const std::string &arch, const std::string &provider, TypeId data_type, int type,
-                                  kernel::CreateKernel creator) {
+Status RegistryKernelImpl::RegKernel(const std::string &arch, const std::string &provider, DataType data_type, int type,
+                                     registry::CreateKernel creator) {
   std::unique_lock<std::mutex> lock(lock_);
   auto iter = kernel_creators_.find(provider);
   if (iter == kernel_creators_.end()) {
     kernel_creators_[provider][arch] = reinterpret_cast<CreateKernel *>(calloc(kKernelMaxNum, sizeof(CreateKernel)));
     if (kernel_creators_[provider][arch] == nullptr) {
       MS_LOG(ERROR) << "malloc kernel creator buffer fail! provider: " << provider << ",arch:" << arch;
-      return RET_ERROR;
+      return kLiteError;
     }
   } else {
     auto iter_arch = iter->second.find(arch);
@@ -79,7 +85,7 @@ int RegistryKernelImpl::RegKernel(const std::string &arch, const std::string &pr
       iter->second[arch] = reinterpret_cast<CreateKernel *>(calloc(kKernelMaxNum, sizeof(CreateKernel)));
       if (iter->second[arch] == nullptr) {
         MS_LOG(ERROR) << "malloc kernel creator buffer fail! provider: " << provider << ",arch:" << arch;
-        return RET_ERROR;
+        return kLiteError;
       }
     }
   }
@@ -87,17 +93,18 @@ int RegistryKernelImpl::RegKernel(const std::string &arch, const std::string &pr
   KernelDesc desc = {data_type, type, arch, provider};
   int index = GetFuncIndex(desc);
   if (index >= kKernelMaxNum || index < 0) {
-    MS_LOG(ERROR) << "invalid kernel key, arch " << arch << ", data_type" << data_type << ",op type " << type;
-    return RET_ERROR;
+    MS_LOG(ERROR) << "invalid kernel key, arch " << arch << ", data_type" << static_cast<int>(data_type) << ",op type "
+                  << type;
+    return kLiteError;
   }
 
   kernel_creators_[provider][arch][index] = creator;
-  return RET_OK;
+  return kSuccess;
 }
 
-kernel::CreateKernel RegistryKernelImpl::GetCustomKernelCreator(const schema::Primitive *primitive,
-                                                                kernel::KernelDesc *desc) {
-  int data_type_index = static_cast<int>(desc->data_type) - kNumberTypeBegin - 1;
+registry::CreateKernel RegistryKernelImpl::GetCustomKernelCreator(const schema::Primitive *primitive,
+                                                                  KernelDesc *desc) {
+  int data_type_index = static_cast<int>(desc->data_type) - static_cast<int>(DataType::kNumberTypeBegin) - 1;
   if (data_type_index < 0) {
     return nullptr;
   }
@@ -125,9 +132,8 @@ kernel::CreateKernel RegistryKernelImpl::GetCustomKernelCreator(const schema::Pr
   return nullptr;
 }
 
-kernel::CreateKernel RegistryKernelImpl::GetProviderCreator(const schema::Primitive *primitive,
-                                                            kernel::KernelDesc *desc) {
-  kernel::CreateKernel creator = nullptr;
+registry::CreateKernel RegistryKernelImpl::GetProviderCreator(const schema::Primitive *primitive, KernelDesc *desc) {
+  registry::CreateKernel creator = nullptr;
   std::unique_lock<std::mutex> lock(lock_);
   if (desc->type == schema::PrimitiveType_Custom) {
     return GetCustomKernelCreator(primitive, desc);
@@ -173,4 +179,4 @@ RegistryKernelImpl::~RegistryKernelImpl() {
     }
   }
 }
-}  // namespace mindspore::lite
+}  // namespace mindspore::registry
diff --git a/mindspore/lite/src/registry/register_kernel_impl.h b/mindspore/lite/src/registry/register_kernel_impl.h
index 508ccd6fb6b..37edb6f7421 100644
--- a/mindspore/lite/src/registry/register_kernel_impl.h
+++ b/mindspore/lite/src/registry/register_kernel_impl.h
@@ -24,12 +24,8 @@
 #include <vector>
 #include <set>
 #include "include/registry/register_kernel.h"
-#include "src/registry/register_utils.h"
 
-using mindspore::schema::PrimitiveType_MAX;
-using mindspore::schema::PrimitiveType_MIN;
-
-namespace mindspore::lite {
+namespace mindspore::registry {
 class RegistryKernelImpl {
  public:
   RegistryKernelImpl() = default;
@@ -40,33 +36,30 @@ class RegistryKernelImpl {
     return &instance;
   }
 
-  int GetFuncIndex(const kernel::KernelDesc &desc);
+  Status RegCustomKernel(const std::string &arch, const std::string &provider, DataType data_type,
+                         const std::string &type, registry::CreateKernel creator);
 
-  int RegCustomKernel(const std::string &arch, const std::string &provider, TypeId data_type, const std::string &type,
-                      kernel::CreateKernel creator);
+  Status RegKernel(const std::string &arch, const std::string &provider, DataType data_type, int type,
+                   registry::CreateKernel creator);
 
-  int RegKernel(const std::string &arch, const std::string &provider, TypeId data_type, int type,
-                kernel::CreateKernel creator);
+  virtual registry::CreateKernel GetProviderCreator(const schema::Primitive *primitive, registry::KernelDesc *desc);
 
-  virtual kernel::CreateKernel GetProviderCreator(const schema::Primitive *primitive, kernel::KernelDesc *desc);
-
-  const std::map<std::string, std::unordered_map<std::string, kernel::CreateKernel *>> &kernel_creators() {
+  const std::map<std::string, std::unordered_map<std::string, registry::CreateKernel *>> &kernel_creators() {
     return kernel_creators_;
   }
 
  protected:
-  static const int data_type_length_{kNumberTypeEnd - kNumberTypeBegin + 1};
-  static const int op_type_length_{PrimitiveType_MAX - PrimitiveType_MIN + 1};
-  std::map<std::string, std::unordered_map<std::string, kernel::CreateKernel *>> kernel_creators_;
+  std::map<std::string, std::unordered_map<std::string, registry::CreateKernel *>> kernel_creators_;
   // keys:provider, arch, type
-  std::map<std::string, std::map<std::string, std::unordered_map<std::string, kernel::CreateKernel *>>>
+  std::map<std::string, std::map<std::string, std::unordered_map<std::string, registry::CreateKernel *>>>
     custom_kernel_creators_;
 
  private:
   std::mutex lock_;
 
-  kernel::CreateKernel GetCustomKernelCreator(const schema::Primitive *primitive, kernel::KernelDesc *desc);
+  registry::CreateKernel GetCustomKernelCreator(const schema::Primitive *primitive, registry::KernelDesc *desc);
+  int GetFuncIndex(const registry::KernelDesc &desc);
 };
-}  // namespace mindspore::lite
+}  // namespace mindspore::registry
 
 #endif  // MINDSPORE_LITE_SRC_REGISTRY_REGISTER_KERNEL_IMPL_H_
diff --git a/mindspore/lite/src/registry/register_utils.cc b/mindspore/lite/src/registry/register_utils.cc
deleted file mode 100644
index b6b0231927a..00000000000
--- a/mindspore/lite/src/registry/register_utils.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "src/registry/register_utils.h"
-#include "src/registry/register_kernel_impl.h"
-
-namespace mindspore {
-namespace kernel {
-CreateKernel RegisterUtils::GetCreator(const schema::Primitive *primitive, kernel::KernelDesc *desc) {
-  return lite::RegistryKernelImpl::GetInstance()->GetProviderCreator(primitive, desc);
-}
-}  // namespace kernel
-}  // namespace mindspore
diff --git a/mindspore/lite/src/registry/register_utils.h b/mindspore/lite/src/registry/register_utils.h
deleted file mode 100644
index 2a0a9746eca..00000000000
--- a/mindspore/lite/src/registry/register_utils.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_LITE_SRC_REGISTRY_REGISTER_UTILS_H_
-#define MINDSPORE_LITE_SRC_REGISTRY_REGISTER_UTILS_H_
-#include <string>
-#include "include/registry/register_kernel.h"
-#include "schema/model_generated.h"
-#include "ir/dtype/type_id.h"
-
-namespace mindspore {
-namespace kernel {
-/// \brief KernelDesc defined kernel's basic attribute.
-struct KernelDesc {
-  TypeId data_type;     /**< kernel data type argument */
-  int type;             /**< op type argument */
-  std::string arch;     /**< deviceType argument */
-  std::string provider; /**< user identification argument */
-
-  bool operator<(const KernelDesc &dst) const {
-    if (provider != dst.provider) {
-      return provider < dst.provider;
-    } else if (arch != dst.arch) {
-      return arch < dst.arch;
-    } else if (data_type != dst.data_type) {
-      return data_type < dst.data_type;
-    } else {
-      return type < dst.type;
-    }
-  }
-};
-
-/// \brief RegisterKernel Defined registration of kernel.
-class RegisterUtils {
- public:
-  /// \brief Static methon to get a kernel's create function.
-  ///
-  /// \param[in] desc Define kernel's basic attribute.
-  /// \param[in] primitive Define the attributes of op.
-  ///
-  /// \return Function pointer to create a kernel.
-  static CreateKernel GetCreator(const schema::Primitive *primitive, kernel::KernelDesc *desc);
-};
-}  // namespace kernel
-}  // namespace mindspore
-#endif  // MINDSPORE_LITE_SRC_REGISTRY_REGISTER_UTILS_H_
diff --git a/mindspore/lite/src/runtime/gpu/opencl/opencl_allocator.cc b/mindspore/lite/src/runtime/gpu/opencl/opencl_allocator.cc
index dbc917a4d40..18cfbd73011 100644
--- a/mindspore/lite/src/runtime/gpu/opencl/opencl_allocator.cc
+++ b/mindspore/lite/src/runtime/gpu/opencl/opencl_allocator.cc
@@ -28,6 +28,9 @@ OpenCLAllocator::~OpenCLAllocator() { Clear(); }
 
 void OpenCLAllocator::SetContext(const AllocatorContext &ctx) {
   lock_flag_ = ctx.lockFlag;
+  if (ctx.shiftFactor < 0) {
+    MS_LOG(ERROR) << "shiftFactor from AllocatorContext is invalid negative.";
+  }
   shift_factor_ = ctx.shiftFactor;
 }
 
@@ -78,7 +81,8 @@ void *OpenCLAllocator::CreateBuffer(size_t size, void *data, size_t flags, cl::B
   MS_ASSERT(host_ptr);
   if (host_ptr == nullptr) {
     delete *buffer;
-    MS_LOG(ERROR) << "Map buffer failed, can not found buffer :" << *buffer << ", host_ptr=" << host_ptr;
+    buffer = nullptr;
+    MS_LOG(ERROR) << "Map buffer failed, can not found buffer.";
     return nullptr;
   }
   cl::Memory *mem = *buffer;
diff --git a/mindspore/lite/src/runtime/infer_manager.cc b/mindspore/lite/src/runtime/infer_manager.cc
index bb2720ee651..7acd3d74604 100644
--- a/mindspore/lite/src/runtime/infer_manager.cc
+++ b/mindspore/lite/src/runtime/infer_manager.cc
@@ -17,31 +17,34 @@
 #include <algorithm>
 #include <set>
 #include <string>
+#include <memory>
 #include "src/common/prim_util.h"
 #include "src/common/tensor_util.h"
 #include "src/cxx_api/tensor/tensor_impl.h"
 #include "schema/model_generated.h"
 #include "include/errorcode.h"
 #include "nnacl/errorcode.h"
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
 #include "src/tensorlist.h"
-#include "include/registry/kernel_interface.h"
+#endif
+#include "include/registry/register_kernel_interface.h"
 #include "src/kernel_registry.h"
 
 namespace mindspore {
 namespace lite {
+#ifndef CUSTOM_KERNEL_REGISTRY_CLIP
 int KernelInferShape(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
-                     const void *primitive, std::set<std::string> &&providers) {
+                     const void *primitive, std::set<std::string> &&providers, int schema_version) {
   if (primitive == nullptr) {
     return RET_NOT_SUPPORT;
   }
-  auto prim_type = GetPrimitiveType(primitive);
   std::shared_ptr<kernel::KernelInterface> kernel_interface = nullptr;
-  if (prim_type == schema::PrimitiveType_Custom) {
+  if (IsCustomNode(primitive, schema_version)) {
     kernel_interface =
-      kernel::RegisterKernelInterface::GetKernelInterface("", static_cast<const schema::Primitive *>(primitive));
+      registry::RegisterKernelInterface::GetKernelInterface("", static_cast<const schema::Primitive *>(primitive));
   } else {
     for (auto &&provider : providers) {
-      kernel_interface = kernel::RegisterKernelInterface::GetKernelInterface(
+      kernel_interface = registry::RegisterKernelInterface::GetKernelInterface(
         provider, static_cast<const schema::Primitive *>(primitive));
       if (kernel_interface != nullptr) {
         break;
@@ -58,12 +61,16 @@ int KernelInferShape(const std::vector<lite::Tensor *> &inputs, const std::vecto
   std::transform(outputs.begin(), outputs.end(), std::back_inserter(out_tensors),
                  [](lite::Tensor *tensor) { return mindspore::MSTensor(std::make_shared<MSTensor::Impl>(tensor)); });
   auto ret = kernel_interface->Infer(&in_tensors, &out_tensors, static_cast<const schema::Primitive *>(primitive));
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "op_type: " << PrimitiveTypeName(prim_type) << " infer fail!ret: " << ret;
-    return ret;
+  if (ret == kLiteInferInvalid) {
+    return RET_INFER_INVALID;
+  }
+  if (ret != kSuccess) {
+    MS_LOG(ERROR) << "op_type: " << GetPrimitiveTypeName(primitive, schema_version) << " infer fail!ret: " << ret;
+    return RET_ERROR;
   }
   return RET_OK;
 }
+#endif
 
 int KernelInferShape(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
                      OpParameter *parameter) {
@@ -71,9 +78,9 @@ int KernelInferShape(const std::vector<lite::Tensor *> &inputs, const std::vecto
     MS_LOG(ERROR) << "No input!";
     return RET_ERROR;
   }
-#ifndef ENABLE_CONTROL_TENSORLIST
+#ifdef CONTROLFLOW_TENSORLIST_CLIP
   if (parameter->type_ == schema::PrimitiveType_Switch) {
-    MS_LOG(ERROR) << unsupport_control_tensorlist_log;
+    MS_LOG(ERROR) << unsupport_controlflow_tensorlist_log;
     return RET_ERROR;
   }
 #endif
@@ -107,7 +114,7 @@ int KernelInferShape(const std::vector<lite::Tensor *> &inputs, const std::vecto
     if (out_tensors.at(i) == nullptr) {
       continue;
     }
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
     if (reinterpret_cast<TensorListC *>(out_tensors.at(i))->data_type_ == TypeIdC::kObjectTypeTensorType) {
       auto *tensor_list_c = reinterpret_cast<TensorListC *>(out_tensors.at(i));
       auto *tensor_list = reinterpret_cast<TensorList *>(outputs.at(i));
@@ -121,7 +128,7 @@ int KernelInferShape(const std::vector<lite::Tensor *> &inputs, const std::vecto
     } else {
 #endif
       TensorC2Tensor(out_tensors.at(i), outputs.at(i));
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
     }
 #endif
     if (ret == NNACL_INFER_INVALID) {
diff --git a/mindspore/lite/src/runtime/infer_manager.h b/mindspore/lite/src/runtime/infer_manager.h
index ee7bdbf84ca..e5eb98a68b8 100644
--- a/mindspore/lite/src/runtime/infer_manager.h
+++ b/mindspore/lite/src/runtime/infer_manager.h
@@ -30,8 +30,10 @@
 namespace mindspore::lite {
 int KernelInferShape(const std::vector<lite::Tensor *> &tensors_in, const std::vector<lite::Tensor *> &outputs,
                      OpParameter *parameter);
+#ifndef CUSTOM_KERNEL_REGISTRY_CLIP
 int KernelInferShape(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
-                     const void *primitive, std::set<std::string> &&providers);
+                     const void *primitive, std::set<std::string> &&providers, int schema_version);
+#endif
 class InferManager {
  public:
   static InferManager *GetInstance() {
diff --git a/mindspore/lite/src/runtime/kernel/arm/CMakeLists.txt b/mindspore/lite/src/runtime/kernel/arm/CMakeLists.txt
index be4c29cf375..3d603efdded 100644
--- a/mindspore/lite/src/runtime/kernel/arm/CMakeLists.txt
+++ b/mindspore/lite/src/runtime/kernel/arm/CMakeLists.txt
@@ -14,7 +14,7 @@ if(MSLITE_STRING_KERNEL)
         ${KERNEL_STRING_SRC}
         )
 endif()
-if(MSLITE_CONTROL_TENSORLIST)
+if(MSLITE_CONTROLFLOW_TENSORLIST)
     file(GLOB KERNEL_CONTROL_TENSORLIST
             ${CMAKE_CURRENT_SOURCE_DIR}/control/*.cc
             )
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/argminmax_base.cc b/mindspore/lite/src/runtime/kernel/arm/base/argminmax_base.cc
index 99e8c75fd1b..6695d0823dc 100644
--- a/mindspore/lite/src/runtime/kernel/arm/base/argminmax_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/argminmax_base.cc
@@ -89,6 +89,9 @@ int ArgMinMaxCPUKernel::Run() {
 #endif
   } else {
     MS_LOG(ERROR) << "unsupported data type!";
+    ms_context_->allocator->Free(arg_param_->arg_elements_);
+    arg_param_->arg_elements_ = nullptr;
+    return RET_ERROR;
   }
 
   ms_context_->allocator->Free(arg_param_->arg_elements_);
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/call.cc b/mindspore/lite/src/runtime/kernel/arm/base/call.cc
index ebacb7b214a..8450d52ce7c 100644
--- a/mindspore/lite/src/runtime/kernel/arm/base/call.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/call.cc
@@ -17,7 +17,9 @@
 #include "src/runtime/kernel/arm/base/call.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
 #include "src/tensorlist.h"
+#endif
 #include "src/common/utils.h"
 
 using mindspore::lite::KernelRegistrar;
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/call.h b/mindspore/lite/src/runtime/kernel/arm/base/call.h
index 1a511c65834..0233e1bf038 100644
--- a/mindspore/lite/src/runtime/kernel/arm/base/call.h
+++ b/mindspore/lite/src/runtime/kernel/arm/base/call.h
@@ -19,7 +19,9 @@
 #include <vector>
 #include "src/runtime/kernel/arm/base/carry_data.h"
 #include "src/tensor.h"
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
 #include "src/tensorlist.h"
+#endif
 
 // this file is useless when move create actor before schedule.
 namespace mindspore::kernel {
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/carry_data.cc b/mindspore/lite/src/runtime/kernel/arm/base/carry_data.cc
index fef89f2c486..1c899f5cba6 100644
--- a/mindspore/lite/src/runtime/kernel/arm/base/carry_data.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/carry_data.cc
@@ -16,7 +16,6 @@
 
 #include "src/runtime/kernel/arm/base/carry_data.h"
 #include "include/errorcode.h"
-#include "src/tensorlist.h"
 
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_NOT_SUPPORT;
@@ -45,12 +44,12 @@ int CarryDataKernel::MoveData(const std::vector<lite::Tensor *>::iterator &dst_b
       MS_LOG(ERROR) << "Carry const data and graph inputs.";
     } else {
       if (src_tensor->data_type() == kObjectTypeTensorType && dst_tensor->data_type() == kObjectTypeTensorType) {
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
         MS_LOG(ERROR) << "Carry MoveTensorListData";
         ret = MoveTensorListData(reinterpret_cast<lite::TensorList *>(dst_tensor),
                                  reinterpret_cast<lite::TensorList *>(src_tensor));
 #else
-        MS_LOG(ERROR) << unsupport_control_tensorlist_log;
+        MS_LOG(ERROR) << unsupport_controlflow_tensorlist_log;
         return RET_NOT_SUPPORT;
 #endif
       } else {
@@ -87,7 +86,7 @@ int CarryDataKernel::MoveTensorData(lite::Tensor *dst_tensor, lite::Tensor *src_
   memcpy(dst_tensor->data(), src_tensor->data(), src_tensor->Size());
   return RET_OK;
 }
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
 int CarryDataKernel::MoveTensorListData(lite::TensorList *dst_tensorlist, lite::TensorList *src_tensorlist) {
   // shape may change, because tensors.size() can be change in RunGraph
   if (dst_tensorlist->data_type() != src_tensorlist->data_type() ||
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/carry_data.h b/mindspore/lite/src/runtime/kernel/arm/base/carry_data.h
index 1a5f47fa30e..638d340fee9 100644
--- a/mindspore/lite/src/runtime/kernel/arm/base/carry_data.h
+++ b/mindspore/lite/src/runtime/kernel/arm/base/carry_data.h
@@ -19,7 +19,9 @@
 #include <vector>
 #include "src/inner_kernel.h"
 #include "src/tensor.h"
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
 #include "src/tensorlist.h"
+#endif
 
 namespace mindspore::kernel {
 class CarryDataKernel : public InnerKernel {
@@ -35,7 +37,7 @@ class CarryDataKernel : public InnerKernel {
                const std::vector<lite::Tensor *>::iterator &src_begin,
                const std::vector<lite::Tensor *>::iterator &src_limit);
   int MoveTensorData(lite::Tensor *dst_tensor, lite::Tensor *src_tensor);
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
   int MoveTensorListData(lite::TensorList *dst_tensorlist, lite::TensorList *src_tensorlist);
 #endif
 };
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc b/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc
index 3448f500547..35063d7f471 100644
--- a/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc
@@ -48,9 +48,11 @@ void ConvolutionBaseCPUKernel::FreeAlignedData(void **ptr) {
 ConvolutionBaseCPUKernel::~ConvolutionBaseCPUKernel() {
   if (addr_map.find(reinterpret_cast<uintptr_t>(packed_weight_)) != addr_map.end()) {
     FreeAlignedData(reinterpret_cast<void **>(&packed_weight_));
-  } else if (packed_weight_ != nullptr) {
-    free(packed_weight_);
-    packed_weight_ = nullptr;
+  } else if (!op_parameter_->is_train_session_) {
+    if (packed_weight_ != nullptr) {
+      free(packed_weight_);
+      packed_weight_ = nullptr;
+    }
   }
   if (addr_map.find(reinterpret_cast<uintptr_t>(bias_data_)) != addr_map.end()) {
     FreeAlignedData(reinterpret_cast<void **>(&bias_data_));
@@ -134,11 +136,13 @@ int ConvolutionBaseCPUKernel::InitConvWeightBias() {
   } else {
     MS_ASSERT(in_tensors_.size() == kInputSize1);
   }
-  if (origin_weight_ != nullptr) {
-    PackWeight();
-  } else {
-    is_repack_ = true;
-    MS_LOG(WARNING) << "The weight is nullptr, will pack in runtime.";
+  if (!op_parameter_->is_train_session_) {
+    if (origin_weight_ != nullptr) {
+      PackWeight();
+    } else {
+      is_repack_ = true;
+      MS_LOG(WARNING) << "The weight is nullptr, will pack in runtime.";
+    }
   }
   return lite::RET_OK;
 }
@@ -149,8 +153,13 @@ int ConvolutionBaseCPUKernel::RepackWeight() {
     MS_LOG(ERROR) << "Malloc data for bias and weight failed.";
     return lite::RET_ERROR;
   }
-  if (IsRepack() || (IsTrain() && IsTrainable())) {
-    is_repack_ = (IsTrain() && IsTrainable()) ? IsRepack() : false;
+  if (IsRepack() || (op_parameter_->is_train_session_)) {
+    if (op_parameter_->is_train_session_) {
+      packed_weight_ = reinterpret_cast<float *>(workspace());
+      memset(packed_weight_, 0, workspace_size());
+    } else {
+      is_repack_ = false;
+    }
     PackWeight();
   }
   return RET_OK;
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/crop_base.cc b/mindspore/lite/src/runtime/kernel/arm/base/crop_base.cc
index 2ce5e246451..aaf957f5fb1 100644
--- a/mindspore/lite/src/runtime/kernel/arm/base/crop_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/crop_base.cc
@@ -56,7 +56,9 @@ void CropBaseCPUKernel::PadOffset(int input_dim, CropParameter *crop_para) const
       if (offsets_size == 1) {
         crop_offset = crop_para->offset_[0];
       } else if (offsets_size > 1) {
-        crop_offset = crop_para->offset_[i - axis];
+        if (i - axis < CROP_OFFSET_MAX_SIZE) {
+          crop_offset = crop_para->offset_[i - axis];
+        }
       }
     }
     crop_para->in_offset_[i] = crop_offset;
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/group_convolution_base.cc b/mindspore/lite/src/runtime/kernel/arm/base/group_convolution_base.cc
index 35b1f97596d..4cdfcc2bf11 100644
--- a/mindspore/lite/src/runtime/kernel/arm/base/group_convolution_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/group_convolution_base.cc
@@ -74,6 +74,10 @@ void GroupConvolutionBaseCPUKernel::FreeSubKernel() {
     sub_conv = nullptr;
   }
   group_convs_.clear();
+  if (group_conv_creator_ != nullptr) {
+    delete group_conv_creator_;
+    group_conv_creator_ = nullptr;
+  }
 }
 
 int GroupConvolutionBaseCPUKernel::PreProcess() {
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/partial_fusion.cc b/mindspore/lite/src/runtime/kernel/arm/base/partial_fusion.cc
index 0a7949c30bb..b9188914e5e 100644
--- a/mindspore/lite/src/runtime/kernel/arm/base/partial_fusion.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/partial_fusion.cc
@@ -17,7 +17,9 @@
 #include "src/runtime/kernel/arm/base/partial_fusion.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
 #include "src/tensorlist.h"
+#endif
 #include "src/common/utils.h"
 
 // this file is going to be removed when move create actor before schedule.
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/partial_fusion.h b/mindspore/lite/src/runtime/kernel/arm/base/partial_fusion.h
index b74bc1c0b32..1f5b73bd330 100644
--- a/mindspore/lite/src/runtime/kernel/arm/base/partial_fusion.h
+++ b/mindspore/lite/src/runtime/kernel/arm/base/partial_fusion.h
@@ -17,7 +17,6 @@
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_PARTIAL_FUSION_H_
 
 #include <vector>
-#include "src/runtime/kernel/arm/base/carry_data.h"
 #include "src/tensor.h"
 #include "src/lite_kernel.h"
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/random_standard_normal.cc b/mindspore/lite/src/runtime/kernel/arm/base/random_standard_normal.cc
index 1a71396fd97..dd0ebff4a83 100644
--- a/mindspore/lite/src/runtime/kernel/arm/base/random_standard_normal.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/random_standard_normal.cc
@@ -18,7 +18,9 @@
 #include <random>
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
 #include "src/tensorlist.h"
+#endif
 
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/select.cc b/mindspore/lite/src/runtime/kernel/arm/base/select.cc
index 07bf7ce3371..648b321d62a 100644
--- a/mindspore/lite/src/runtime/kernel/arm/base/select.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/select.cc
@@ -16,7 +16,9 @@
 #include "src/runtime/kernel/arm/base/select.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
 #include "src/tensorlist.h"
+#endif
 
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/select.h b/mindspore/lite/src/runtime/kernel/arm/base/select.h
index 8e5944d4baf..57d3302e5e3 100644
--- a/mindspore/lite/src/runtime/kernel/arm/base/select.h
+++ b/mindspore/lite/src/runtime/kernel/arm/base/select.h
@@ -19,7 +19,9 @@
 #include <vector>
 #include "src/runtime/kernel/arm/base/carry_data.h"
 #include "src/inner_kernel.h"
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
 #include "src/tensorlist.h"
+#endif
 
 namespace mindspore::kernel {
 class SelectCPUKernel : public CarryDataKernel {
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/switch.cc b/mindspore/lite/src/runtime/kernel/arm/base/switch.cc
deleted file mode 100644
index 51302648f9f..00000000000
--- a/mindspore/lite/src/runtime/kernel/arm/base/switch.cc
+++ /dev/null
@@ -1,36 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "src/runtime/kernel/arm/base/switch.h"
-#include "src/kernel_registry.h"
-#include "include/errorcode.h"
-#include "src/tensorlist.h"
-
-using mindspore::lite::KernelRegistrar;
-using mindspore::lite::RET_ERROR;
-using mindspore::lite::RET_OK;
-using mindspore::schema::PrimitiveType_Switch;
-
-namespace mindspore::kernel {
-int SwitchCPUKernel::Init() { return RET_OK; }
-int SwitchCPUKernel::ReSize() { return RET_OK; }
-int SwitchCPUKernel::Run() { return RET_OK; }
-
-REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Switch, LiteKernelCreator<SwitchCPUKernel>)
-REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Switch, LiteKernelCreator<SwitchCPUKernel>)
-REG_KERNEL(kCPU, kNumberTypeBool, PrimitiveType_Switch, LiteKernelCreator<SwitchCPUKernel>)
-REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_Switch, LiteKernelCreator<SwitchCPUKernel>)
-}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/switch.h b/mindspore/lite/src/runtime/kernel/arm/base/switch.h
deleted file mode 100644
index 8f9439c0d92..00000000000
--- a/mindspore/lite/src/runtime/kernel/arm/base/switch.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_SWITCH_H_
-#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_SWITCH_H_
-
-#include <vector>
-#include "src/runtime/kernel/arm/base/carry_data.h"
-#include "src/inner_kernel.h"
-#include "src/tensorlist.h"
-
-namespace mindspore::kernel {
-class SwitchCPUKernel : public InnerKernel {
- public:
-  SwitchCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                  const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
-      : InnerKernel(parameter, inputs, outputs, ctx) {}
-  ~SwitchCPUKernel() override = default;
-  int Init() override;
-  int ReSize() override;
-  int Run() override;
-};
-}  // namespace mindspore::kernel
-
-#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_SWITCH_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_fromtensor.cc b/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_fromtensor.cc
deleted file mode 100644
index 87acef4f18a..00000000000
--- a/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_fromtensor.cc
+++ /dev/null
@@ -1,115 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "include/errorcode.h"
-#include "src/kernel_registry.h"
-#include "src/runtime/kernel/arm/base/tensorlist_fromtensor.h"
-
-using mindspore::kernel::KERNEL_ARCH;
-using mindspore::lite::KernelRegistrar;
-using mindspore::lite::RET_ERROR;
-using mindspore::lite::RET_NULL_PTR;
-using mindspore::lite::RET_OK;
-using mindspore::schema::PrimitiveType_TensorListFromTensor;
-
-namespace mindspore::kernel {
-int TensorListFromTensorCPUKernel::IsCompatibleShape() {
-  if (input1_->data_type() != kNumberTypeInt && input1_->data_type() != kNumberTypeInt32) {  // element_shape
-    MS_LOG(ERROR) << "in_tensors_[1] data type is must be int";
-    return RET_ERROR;
-  }
-  int in1_ele_num = input1_->ElementsNum();
-  std::vector<int> tensor_shape = input0_->shape();
-  if (static_cast<int>(tensor_shape.size() - 1) != in1_ele_num) {
-    MS_LOG(ERROR) << "in_tensors_[0].shape().size() - 1:" << (tensor_shape.size() - 1)
-                  << " must be equal in_tensors_[1].ElementsNum():" << in1_ele_num;
-    return RET_ERROR;
-  }
-  int *elements_shape = reinterpret_cast<int *>(input1_->data_c());  // element shape in tensor data
-  if (elements_shape == nullptr) {
-    return RET_NULL_PTR;
-  }
-  for (int i = 0; i < in1_ele_num; ++i) {
-    int dim0 = tensor_shape[i + 1];
-    int dim1 = elements_shape[i];
-    if (dim0 >= 0 && dim1 >= 0 && dim0 != dim1) {
-      MS_LOG(ERROR) << "input0_->shape()[" << (i + 1) << "]:" << dim0 << " is not equal input1_->data_c()[" << i
-                    << "]:" << dim1;
-      return RET_ERROR;
-    }
-  }
-  return RET_OK;
-}
-
-int TensorListFromTensorCPUKernel::Init() { return RET_OK; }
-
-int TensorListFromTensorCPUKernel::ReSize() { return RET_OK; }
-
-int TensorListFromTensorCPUKernel::Run() {
-  input0_ = in_tensors_[0];  // row tensor
-  input1_ = in_tensors_[1];  // element_shape tensor
-  output0_ = out_tensors_[0];
-  if (IsCompatibleShape() != RET_OK) {
-    MS_LOG(ERROR) << "IsNotCompatibleShape!";
-    return RET_ERROR;
-  }
-  dtype_ = in_tensors_[0]->data_type();
-  if (input0_->shape().size() == 0) {
-    MS_LOG(ERROR) << "input0_->shape().size():" << input0_->shape().size() << " must be greater than 0";
-  }
-  int dim0 = input0_->shape()[0];
-  if (dim0 <= 0) {
-    MS_LOG(ERROR) << "input0_->shape()[0]:" << dim0 << " must be greater than 0!";
-    return RET_ERROR;
-  }
-  auto output0 = reinterpret_cast<lite::TensorList *>(output0_);
-  if (dim0 != output0->ElementsNum()) {
-    MS_LOG(ERROR) << "output0_->ElementsNum():" << output0->ElementsNum() << " must be equal to dim0:" << dim0;
-    return RET_ERROR;
-  }
-  if (dim0 == 0) {
-    MS_LOG(ERROR) << "div zero";
-    return RET_ERROR;
-  }
-  int devision_dim0 = input0_->ElementsNum() / dim0;
-  auto data_offset = devision_dim0 * lite::DataTypeSize(dtype_);
-  auto in_data = reinterpret_cast<char *>(input0_->data_c());
-  MS_ASSERT(in_data != nullptr);
-  // copy data from input0(tensor) to output(tensorlist) vector<*tensor>
-  for (int i = 0; i < dim0; ++i) {
-    auto out_ptr = output0->GetTensor(i);
-    MS_ASSERT(out_ptr != nullptr);
-    if (out_ptr->ElementsNum() != devision_dim0) {
-      MS_LOG(ERROR) << "tensors_[" << i << "].ElementsNum():" << out_ptr->ElementsNum()
-                    << " must be euqal to devision_dim0:" << devision_dim0;
-      return RET_ERROR;
-    }
-    auto out_data = out_ptr->data_c();
-    MS_ASSERT(out_data != nullptr);
-    memcpy(out_data, in_data, data_offset);
-    out_ptr->set_data_type(dtype_);
-    in_data += data_offset;
-  }
-  output0->set_own_data(true);
-  output0->set_tensors_data_type(dtype_);
-  return RET_OK;
-}
-
-REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_TensorListFromTensor,
-           LiteKernelCreator<TensorListFromTensorCPUKernel>)
-REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_TensorListFromTensor, LiteKernelCreator<TensorListFromTensorCPUKernel>)
-REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_TensorListFromTensor,
-           LiteKernelCreator<TensorListFromTensorCPUKernel>)
-}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_fromtensor.h b/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_fromtensor.h
deleted file mode 100644
index bf7329563c1..00000000000
--- a/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_fromtensor.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_TENSORLISTFROMTENSOR_H_
-#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_TENSORLISTFROMTENSOR_H_
-
-#include <vector>
-#include "src/inner_kernel.h"
-#include "src/tensorlist.h"
-#include "schema/model_generated.h"
-#include "nnacl/tensorlist_parameter.h"
-
-namespace mindspore::kernel {
-class TensorListFromTensorCPUKernel : public InnerKernel {
- public:
-  TensorListFromTensorCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                                const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
-      : InnerKernel(parameter, inputs, outputs, ctx),
-        dtype_(static_cast<TypeId>(reinterpret_cast<TensorListParameter *>(parameter)->element_dtype_)) {}
-  ~TensorListFromTensorCPUKernel() = default;
-
-  int Init() override;
-  int ReSize() override;
-  int Run() override;
-  int IsCompatibleShape();
-
- private:
-  std::vector<int> output_shape_;
-  lite::Tensor *output0_ = nullptr;
-  lite::Tensor *input0_ = nullptr;
-  lite::Tensor *input1_ = nullptr;
-  TypeId dtype_ = kTypeUnknown;
-};
-}  // namespace mindspore::kernel
-
-#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_TENSORLISTFROMTENSOR_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_getitem.cc b/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_getitem.cc
deleted file mode 100644
index ea54d8a9b01..00000000000
--- a/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_getitem.cc
+++ /dev/null
@@ -1,75 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "include/errorcode.h"
-#include "include/ms_tensor.h"
-#include "src/kernel_registry.h"
-#include "src/runtime/kernel/arm/base/tensorlist_getitem.h"
-
-using mindspore::kernel::KERNEL_ARCH;
-using mindspore::lite::KernelRegistrar;
-using mindspore::lite::RET_ERROR;
-using mindspore::lite::RET_NULL_PTR;
-using mindspore::lite::RET_OK;
-using mindspore::schema::PrimitiveType_TensorListGetItem;
-
-namespace mindspore::kernel {
-int TensorListGetItemCPUKernel::Init() { return RET_OK; }
-
-int TensorListGetItemCPUKernel::Run() {
-  MS_ASSERT(in_tensors_.size() >= 2);
-  MS_ASSERT(in_tensors_.at(0) != nullptr);
-  MS_ASSERT(in_tensors_.at(1) != nullptr);
-  MS_ASSERT(out_tensors_.at(0) != nullptr);
-  auto input0 = reinterpret_cast<lite::TensorList *>(in_tensors_.at(0));
-  dtype_ = input0->tensors_data_type();
-  MS_ASSERT(in_tensors_.at(1)->data_c() != nullptr);
-  index_ = reinterpret_cast<int *>(in_tensors_.at(1)->data_c())[0];
-  int dim0 = input0->ElementsNum();
-  if (index_ < 0 || index_ >= dim0) {
-    MS_LOG(ERROR) << "index tensor:[" << index_ << "] must be in [0, " << dim0 << ")!";
-    return RET_ERROR;
-  }
-  auto src_ptr = input0->GetTensor(index_);
-  MS_ASSERT(src_ptr != nullptr);
-  if (src_ptr->data_type() != kTypeUnknown) {
-    if (src_ptr->ElementsNum() != out_tensors_.at(0)->ElementsNum()) {
-      MS_LOG(ERROR) << "src_ptr->ElementsNum():" << src_ptr->ElementsNum()
-                    << " must be equal to out_tensors_[0]->ElementsNum():" << out_tensors_.at(0)->ElementsNum();
-      return RET_ERROR;
-    }
-    auto status = lite::Tensor::CopyTensorData(*src_ptr, out_tensors_.at(0));
-    if (status == RET_ERROR) {
-      MS_LOG(ERROR) << "copy tensor data failed!";
-      return RET_ERROR;
-    }
-  } else {
-    // reset data buffer is zero
-    auto out_data = out_tensors_[0]->data_c();
-    if (out_data == nullptr) {
-      MS_LOG(ERROR) << "data of out_tensors_[0] is nullptr";
-      return RET_ERROR;
-    }
-    memset(out_data, 0, out_tensors_[0]->Size());
-  }
-  return RET_OK;
-}
-
-int TensorListGetItemCPUKernel::ReSize() { return RET_OK; }
-
-REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_TensorListGetItem, LiteKernelCreator<TensorListGetItemCPUKernel>)
-REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_TensorListGetItem, LiteKernelCreator<TensorListGetItemCPUKernel>)
-REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_TensorListGetItem, LiteKernelCreator<TensorListGetItemCPUKernel>)
-}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_getitem.h b/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_getitem.h
deleted file mode 100644
index abb3a088613..00000000000
--- a/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_getitem.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_TENSORLISTGETITEM_H_
-#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_TENSORLISTGETITEM_H_
-
-#include <vector>
-#include "src/inner_kernel.h"
-#include "src/tensorlist.h"
-#include "schema/model_generated.h"
-#include "nnacl/tensorlist_parameter.h"
-
-namespace mindspore::kernel {
-class TensorListGetItemCPUKernel : public InnerKernel {
- public:
-  TensorListGetItemCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                             const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
-      : InnerKernel(parameter, inputs, outputs, ctx),
-        dtype_(reinterpret_cast<TensorListParameter *>(parameter)->element_dtype_) {}
-  ~TensorListGetItemCPUKernel() = default;
-
-  int Init() override;
-  int ReSize() override;
-  int Run() override;
-
- private:
-  int index_ = 0;
-  int dtype_ = kTypeUnknown;
-};
-}  // namespace mindspore::kernel
-
-#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_TENSORLISTGETITEM_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_reserve.cc b/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_reserve.cc
deleted file mode 100644
index 3deba11c758..00000000000
--- a/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_reserve.cc
+++ /dev/null
@@ -1,56 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include <vector>
-#include "include/errorcode.h"
-#include "src/kernel_registry.h"
-#include "src/runtime/kernel/arm/base/tensorlist_reserve.h"
-
-using mindspore::kernel::KERNEL_ARCH;
-using mindspore::lite::KernelRegistrar;
-using mindspore::lite::RET_ERROR;
-using mindspore::lite::RET_NULL_PTR;
-using mindspore::lite::RET_OK;
-using mindspore::schema::PrimitiveType_TensorListReserve;
-
-namespace mindspore::kernel {
-int TensorListReserveCPUKernel::Init() { return RET_OK; }
-
-int TensorListReserveCPUKernel::Run() {
-  auto input0 = in_tensors_.at(0);
-  auto input1 = in_tensors_.at(1);
-  int num_elements = reinterpret_cast<int *>(input1->data_c())[0];
-  MS_ASSERT(input1->data_c() != nullptr);
-  auto output = reinterpret_cast<lite::TensorList *>(out_tensors_[0]);
-  if (output->tensors().size() < static_cast<uint32_t>(num_elements)) {
-    auto ele_shape_ptr = reinterpret_cast<int *>(input0->data_c());
-    if (ele_shape_ptr == nullptr) {
-      return RET_NULL_PTR;
-    }
-    std::vector<std::vector<int> > tmp_shape(num_elements, std::vector<int>());
-    output->set_element_shape(std::vector<int>(ele_shape_ptr, ele_shape_ptr + input0->ElementsNum()));
-    output->set_shape(std::vector<int>(1, num_elements));
-    output->MallocTensorListData(kTypeUnknown, tmp_shape);
-  }
-  output->set_tensors_data_type(element_dtype_);
-  return RET_OK;
-}
-
-int TensorListReserveCPUKernel::ReSize() { return RET_OK; }
-
-REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_TensorListReserve, LiteKernelCreator<TensorListReserveCPUKernel>)
-REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_TensorListReserve, LiteKernelCreator<TensorListReserveCPUKernel>)
-REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_TensorListReserve, LiteKernelCreator<TensorListReserveCPUKernel>)
-}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_reserve.h b/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_reserve.h
deleted file mode 100644
index c623642b8b1..00000000000
--- a/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_reserve.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_TENSORLISTRESERVE_H_
-#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_TENSORLISTRESERVE_H_
-
-#include <vector>
-#include "src/inner_kernel.h"
-#include "src/tensorlist.h"
-#include "schema/model_generated.h"
-#include "nnacl/tensorlist_parameter.h"
-
-namespace mindspore::kernel {
-class TensorListReserveCPUKernel : public InnerKernel {
- public:
-  TensorListReserveCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                             const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
-      : InnerKernel(parameter, inputs, outputs, ctx),
-        element_dtype_(static_cast<TypeId>(reinterpret_cast<TensorListParameter *>(parameter)->element_dtype_)) {}
-  ~TensorListReserveCPUKernel() = default;
-
-  int Init() override;
-  int ReSize() override;
-  int Run() override;
-
- private:
-  TypeId element_dtype_ = kTypeUnknown;
-};
-}  // namespace mindspore::kernel
-
-#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_TENSORLISTRESERVE_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_setitem.cc b/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_setitem.cc
deleted file mode 100644
index 7dcaffaaaca..00000000000
--- a/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_setitem.cc
+++ /dev/null
@@ -1,144 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "include/errorcode.h"
-#include "include/ms_tensor.h"
-#include "src/kernel_registry.h"
-#include "src/runtime/kernel/arm/base/tensorlist_setitem.h"
-
-using mindspore::kernel::KERNEL_ARCH;
-using mindspore::lite::KernelRegistrar;
-using mindspore::lite::RET_ERROR;
-using mindspore::lite::RET_NULL_PTR;
-using mindspore::lite::RET_OK;
-using mindspore::schema::PrimitiveType_TensorListSetItem;
-
-namespace mindspore::kernel {
-int TensorListSetItemCPUKernel::Init() { return RET_OK; }
-
-int TensorListSetItemCPUKernel::CheckParam() {
-  if (in_tensors_[1]->data_type() != kNumberTypeInt && in_tensors_[1]->data_type() != kNumberTypeInt32) {
-    MS_LOG(ERROR) << "in_tensors_[1]->data_type():" << in_tensors_[1]->data_type() << " must be int";
-    return RET_ERROR;
-  }
-  if (in_tensors_[1]->ElementsNum() != 1) {
-    MS_LOG(ERROR) << "in_tensors_[1]->ElementsNum():" << in_tensors_[1]->ElementsNum() << " must be equal to 1!";
-    return RET_ERROR;
-  }
-  return RET_OK;
-}
-
-int TensorListSetItemCPUKernel::IncrementOutputSize(int origin_size) {
-  int new_tensors_size = origin_size + 1;
-  output0_->set_shape({new_tensors_size});
-  std::vector<std::vector<int>> out_shape;
-  out_shape.resize(new_tensors_size, in_tensors_[2]->shape());
-  auto ret = output0_->MallocTensorListData(in_tensors_[2]->data_type(), out_shape);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "increment output size malloc tensorlist data error";
-    return ret;
-  }
-  return RET_OK;
-}
-
-int TensorListSetItemCPUKernel::Run() {
-  input0_ = reinterpret_cast<lite::TensorList *>(in_tensors_[0]);
-  output0_ = reinterpret_cast<lite::TensorList *>(out_tensors_[0]);
-  if (CheckParam() != RET_OK) {
-    MS_LOG(ERROR) << "check param failed.";
-    return RET_ERROR;
-  }
-
-  int dim0 = output0_->ElementsNum() - 1;
-  index_ = reinterpret_cast<int *>(in_tensors_[1]->data_c())[0];
-  if (index_ < 0 || index_ > dim0) {
-    if (IncrementOutputSize(output0_->tensors().size()) != RET_OK) {
-      MS_LOG(ERROR) << "Resizeoutput Error ,index tensor:[" << index_ << "] must be in [0, " << dim0 << "]!";
-      return RET_ERROR;
-    }
-  }
-  input2_ = in_tensors_[2];
-  MS_ASSERT(input2_ != nullptr);
-  if (!input0_->IsCompatibleShape(input2_->shape())) {
-    return RET_ERROR;
-  }
-  output0_ = reinterpret_cast<lite::TensorList *>(out_tensors_[0]);
-  MS_ASSERT(output0_ != nullptr);
-  output0_->set_allocator(ms_context_->allocator);
-  // new loop count
-  if (output0_->tensors().empty() && input0_->tensors().empty()) {
-    if (IncrementOutputSize(0) != RET_OK) {
-      MS_LOG(ERROR) << "Resizeoutput Error!";
-      return RET_ERROR;
-    }
-  }
-  // copy each tensor in tensors_
-  if (input0_->tensors().empty() && index_ == 0) {
-    input0_->set_element_shape(input2_->shape());
-    output0_->set_element_shape(input2_->shape());
-  }
-  if (output0_->allocator() == nullptr) {
-    output0_->set_allocator(ms_context_->allocator);
-  }
-  for (int i = 0; i < output0_->ElementsNum(); ++i) {
-    if (i == index_) {
-      auto dst = output0_->GetTensor(i);
-      if (dst == nullptr) {
-        dst = lite::Tensor::CopyTensor(*input2_, true, ms_context_->allocator);
-        auto &tensors = output0_->tensors();
-        tensors.emplace_back(dst);
-      } else {
-        dst->set_data_type(input2_->data_type());
-        dst->set_shape(input2_->shape());
-        dst->set_format(input2_->format());
-        dst->set_category(input2_->category());
-        dst->set_quant_clusters(input2_->quant_clusters());
-        auto ret = lite::Tensor::CopyTensorData(*input2_, dst);
-        if (ret != RET_OK) {
-          MS_LOG(ERROR) << "CopyTensorData[" << i << "] is failed!";
-          return RET_ERROR;
-        }
-      }
-    } else {
-      auto src = input0_->GetTensor(i);
-      auto dst = output0_->GetTensor(i);
-      MS_ASSERT(src != nullptr);
-      // merge move data will delete tensors
-      if (dst == nullptr) {
-        dst = lite::Tensor::CopyTensor(*src, src->data_c() != nullptr, ms_context_->allocator);
-        auto &tensors = output0_->tensors();
-        tensors.emplace_back(dst);
-        continue;
-      }
-
-      if (src->data_type() != kTypeUnknown) {
-        auto ret = lite::Tensor::CopyTensorData(*src, dst);
-        if (ret != RET_OK) {
-          MS_LOG(ERROR) << "CopyTensorData[" << i << "] is failed!";
-          return RET_ERROR;
-        }
-      }
-    }
-  }
-  return RET_OK;
-}
-
-int TensorListSetItemCPUKernel::ReSize() { return RET_OK; }
-
-REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_TensorListSetItem, LiteKernelCreator<TensorListSetItemCPUKernel>)
-REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_TensorListSetItem, LiteKernelCreator<TensorListSetItemCPUKernel>)
-REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_TensorListSetItem, LiteKernelCreator<TensorListSetItemCPUKernel>)
-}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_setitem.h b/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_setitem.h
deleted file mode 100644
index d978d373132..00000000000
--- a/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_setitem.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_TENSORLISTSETITEM_H_
-#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_TENSORLISTSETITEM_H_
-
-#include <vector>
-#include "src/inner_kernel.h"
-#include "src/tensorlist.h"
-#include "schema/model_generated.h"
-#include "nnacl/tensorlist_parameter.h"
-
-namespace mindspore::kernel {
-class TensorListSetItemCPUKernel : public InnerKernel {
- public:
-  TensorListSetItemCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                             const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
-      : InnerKernel(parameter, inputs, outputs, ctx) {}
-  ~TensorListSetItemCPUKernel() = default;
-
-  int Init() override;
-  int ReSize() override;
-  int Run() override;
-  int IncrementOutputSize(int origin_size);
-
- private:
-  int CheckParam();
-  lite::TensorList *input0_ = nullptr;
-  lite::Tensor *input2_ = nullptr;
-  lite::TensorList *output0_ = nullptr;
-  int index_ = 0;
-};
-}  // namespace mindspore::kernel
-
-#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_TENSORLISTSETITEM_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_stack.cc b/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_stack.cc
deleted file mode 100644
index b05be63e1db..00000000000
--- a/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_stack.cc
+++ /dev/null
@@ -1,186 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <functional>
-#include <vector>
-#include "include/errorcode.h"
-#include "ir/dtype/type_id.h"
-#include "src/kernel_registry.h"
-#include "src/runtime/kernel/arm/base/tensorlist_stack.h"
-
-using mindspore::kernel::KERNEL_ARCH;
-using mindspore::lite::KernelRegistrar;
-using mindspore::lite::RET_ERROR;
-using mindspore::lite::RET_NULL_PTR;
-using mindspore::lite::RET_OK;
-using mindspore::schema::PrimitiveType_TensorListStack;
-
-namespace mindspore::kernel {
-int TensorListStackCPUKernel::CheckParam() {
-  if (num_element_ != -1 && input0_->ElementsNum() != num_element_) {
-    MS_LOG(ERROR) << "in_tensors_[0].ElementsNum():[" << input0_->ElementsNum() << "] must be equal "
-                  << "param.elements_num:[" << num_element_ << "]";
-    return RET_ERROR;
-  }
-  num_element_ = input0_->ElementsNum();
-  if (output0_->shape().size() < 1) {
-    MS_LOG(ERROR) << "out_tensors_[0].shape().size():" << output0_->shape().size()
-                  << " must be greater than or equal to 1!";
-    return RET_ERROR;
-  }
-  int dim0 = output0_->shape()[0];
-  if (dim0 != num_element_) {
-    MS_LOG(ERROR) << "out_tensors_[0].shape()[0] must be:" << num_element_ << ", but now is:" << dim0;
-    return RET_ERROR;
-  }
-  return RET_OK;
-}
-
-int TensorListStackCPUKernel::Init() {
-  input0_ = reinterpret_cast<lite::TensorList *>(in_tensors_[0]);
-  MS_ASSERT(input0_ != nullptr);
-  output0_ = out_tensors_[0];
-  MS_ASSERT(output0_ != nullptr);
-  return RET_OK;
-}
-
-bool TensorListStackCPUKernel::IsFullyDefined(const std::vector<int> &shape) const {
-  for (size_t i = 0; i < shape.size(); ++i) {
-    if (shape[i] < 0) {
-      return false;
-    }
-  }
-  return true;
-}
-
-int TensorListStackCPUKernel::MergeElementShape() {
-  MS_ASSERT(in_tensors_[1]);
-  if (in_tensors_[1]->data_type() != kNumberTypeInt && in_tensors_[1]->data_type() != kNumberTypeInt32) {
-    MS_LOG(ERROR) << "in_tensors_[1]->data_type():" << in_tensors_[1]->data_type() << " must be int";
-    return RET_ERROR;
-  }
-  auto ele_shape_data = reinterpret_cast<int *>(in_tensors_[1]->data_c());
-  output_shape_.clear();
-  for (int i = 0; i < in_tensors_[1]->ElementsNum(); ++i) {
-    output_shape_.push_back(ele_shape_data[i]);
-  }
-  auto status = MergeSubShape(input0_->element_shape());
-  if (status == RET_ERROR) {
-    MS_LOG(ERROR) << "Merge element_shape is error!";
-    return RET_ERROR;
-  }
-
-  if (!IsFullyDefined(output_shape_)) {
-    MS_LOG(ERROR) << "output_shape_ Is Not FullyDefined!";
-    return RET_ERROR;
-  }
-  if (!IsFullyDefined(input0_->element_shape())) {
-    for (int i = 0; i < input0_->ElementsNum(); ++i) {  // get tensorlist every tensor
-      auto tensor_ele = input0_->GetTensor(i);
-      MS_ASSERT(tensor_ele != nullptr);
-      if (tensor_ele->data_type() != kTypeUnknown) {
-        status = MergeSubShape(tensor_ele->shape());
-        if (status == RET_ERROR) {
-          MS_LOG(ERROR) << "Merge tensors_[" << i << "] is error!";
-          return RET_ERROR;
-        }
-      }
-    }
-  }
-  TypeUnknownSize = std::accumulate(output_shape_.begin(), output_shape_.end(), 1LL, std::multiplies<int>());
-  return RET_OK;
-}
-
-int TensorListStackCPUKernel::MergeSubShape(const std::vector<int> &shape) {
-  size_t dim0 = shape.size();
-  size_t dim1 = output_shape_.size();
-  // unknown shape use input element shape
-  if (dim1 != 0 && output_shape_[0] == -1) {
-    if (dim0 == 0) {
-      output_shape_.clear();
-      output_shape_.emplace_back(1);
-    } else {
-      output_shape_ = shape;
-    }
-    return RET_OK;
-  }
-  if (dim1 != dim0) {
-    MS_LOG(ERROR) << "shape.size():" << dim1 << " must be equal output_shape_.size():" << dim0;
-    return RET_ERROR;
-  }
-  for (size_t i = 0; i < dim0; ++i) {
-    int dim0_size = shape[i];
-    int dim1_size = output_shape_[i];
-    if (dim0_size >= 0 && dim1_size >= 0 && dim0_size != dim1_size) {
-      MS_LOG(ERROR) << "shape[" << i << "]:" << dim0_size << " is incompatible with output_shape_[" << i
-                    << "]:" << dim1_size;
-      return RET_ERROR;
-    }
-    output_shape_[i] = dim1_size >= 0 ? dim1_size : dim0_size;
-  }
-  return RET_OK;
-}
-
-int TensorListStackCPUKernel::Run() {
-  output0_ = out_tensors_[0];
-  if (CheckParam() != RET_OK) {
-    MS_LOG(ERROR) << "CheckParam failed!";
-    return RET_ERROR;
-  }
-  dtype_ = input0_->tensors_data_type();
-  if (output0_->ElementsNum() == 0) {
-    return RET_OK;
-  }
-  auto ret = MergeElementShape();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "MergeElementShape failed!";
-    return RET_ERROR;
-  }
-  size_t in_ele_num = num_element_ * TypeUnknownSize;
-  size_t out_ele_num = output0_->ElementsNum();
-  if (in_ele_num != out_ele_num) {
-    MS_LOG(ERROR) << "out_tensors_[0]->ElementsNum():" << out_ele_num << "must be equal to in_ele_num:" << in_ele_num;
-    return RET_ERROR;
-  }
-  auto out_data = reinterpret_cast<char *>(output0_->MutableData());
-  auto unknown_type_offset = TypeUnknownSize * lite::DataTypeSize(dtype_);
-  MS_ASSERT(out_data != nullptr);
-  for (int i = 0; i < num_element_; ++i) {
-    auto in_ptr = input0_->GetTensor(i);
-    if (in_ptr == nullptr) {
-      MS_LOG(DEBUG) << "no need to stack.";
-      continue;
-    }
-    if (in_ptr->data_type() != kTypeUnknown) {
-      int data_size = in_ptr->ElementsNum() * lite::DataTypeSize(dtype_);
-      auto in_data = in_ptr->data_c();
-      MS_ASSERT(in_data != nullptr);
-      memcpy(out_data, in_data, data_size);
-      out_data += data_size;
-    } else {
-      memset(out_data, 0, unknown_type_offset);
-      out_data += unknown_type_offset;
-    }
-  }
-  return RET_OK;
-}
-
-int TensorListStackCPUKernel::ReSize() { return RET_OK; }
-
-REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_TensorListStack, LiteKernelCreator<TensorListStackCPUKernel>)
-REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_TensorListStack, LiteKernelCreator<TensorListStackCPUKernel>)
-REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_TensorListStack, LiteKernelCreator<TensorListStackCPUKernel>)
-}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_stack.h b/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_stack.h
deleted file mode 100644
index 442a01c4408..00000000000
--- a/mindspore/lite/src/runtime/kernel/arm/base/tensorlist_stack.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_TENSORLISTSTACK_H_
-#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_TENSORLISTSTACK_H_
-
-#include <vector>
-
-#include "src/inner_kernel.h"
-#include "src/tensorlist.h"
-#include "schema/model_generated.h"
-#include "nnacl/tensorlist_parameter.h"
-
-namespace mindspore::kernel {
-class TensorListStackCPUKernel : public InnerKernel {
- public:
-  TensorListStackCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                           const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
-      : InnerKernel(parameter, inputs, outputs, ctx),
-        num_element_(reinterpret_cast<TensorListParameter *>(parameter)->num_element_),
-        dtype_(static_cast<TypeId>(reinterpret_cast<TensorListParameter *>(parameter)->element_dtype_)) {}
-  ~TensorListStackCPUKernel() = default;
-
-  int Init() override;
-  int ReSize() override;
-  int Run() override;
-  int CheckParam();
-  int MergeElementShape();
-  int MergeSubShape(const std::vector<int> &shape);
-  bool IsFullyDefined(const std::vector<int> &shape) const;
-
- private:
-  size_t TypeUnknownSize = 0;
-  int num_element_ = -1;
-  TypeId dtype_ = kTypeUnknown;
-  lite::TensorList *input0_ = nullptr;
-  lite::Tensor *output0_ = nullptr;
-  std::vector<int> output_shape_;
-};
-}  // namespace mindspore::kernel
-
-#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_TENSORLISTSTACK_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/control/tensorlist_reserve.cc b/mindspore/lite/src/runtime/kernel/arm/control/tensorlist_reserve.cc
index aba1516c09d..b7a633b45bd 100644
--- a/mindspore/lite/src/runtime/kernel/arm/control/tensorlist_reserve.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/control/tensorlist_reserve.cc
@@ -42,7 +42,11 @@ int TensorListReserveCPUKernel::Run() {
     std::vector<std::vector<int> > tmp_shape(num_elements, std::vector<int>());
     output->set_element_shape(std::vector<int>(ele_shape_ptr, ele_shape_ptr + input0->ElementsNum()));
     output->set_shape(std::vector<int>(1, num_elements));
-    output->MallocTensorListData(kTypeUnknown, tmp_shape);
+    auto ret = output->MallocTensorListData(kTypeUnknown, tmp_shape);
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "Failed to MallocTensorListData";
+      return ret;
+    }
   }
   output->set_tensors_data_type(element_dtype_);
   return RET_OK;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_compare_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_compare_fp16.h
index 06b99dd7fdd..7e86cdf56b8 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_compare_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_compare_fp16.h
@@ -26,7 +26,7 @@ namespace mindspore::kernel {
 typedef int (*ArithmeticCompareFuncFp16)(const float16_t *input0, const float16_t *input1, uint8_t *output,
                                          int element_size);
 typedef int (*ArithmeticCompareOptFuncFp16)(const float16_t *input0, const float16_t *input1, uint8_t *output,
-                                            int element_size, ArithmeticParameter *param);
+                                            int element_size, const ArithmeticParameter *param);
 typedef struct {
   int primitive_type_;
   int activation_type_;
@@ -52,8 +52,8 @@ class ArithmeticCompareFP16CPUKernel : public InnerKernel {
 
  private:
   void FreeTmpBuffer();
-  int outside_;
-  int break_pos_;
+  int outside_ = 0;
+  int break_pos_ = 0;
   bool is_input0_fp32_ = false;
   bool is_input1_fp32_ = false;
   float16_t *input0_fp16_ = nullptr;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_fp16.h
index 85295f246ed..a0c746cce86 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_fp16.h
@@ -24,7 +24,7 @@ namespace mindspore::kernel {
 typedef int (*ArithmeticFuncFp16)(const float16_t *input0, const float16_t *input1, float16_t *output,
                                   int element_size);
 typedef int (*ArithmeticOptFuncFp16)(const float16_t *input0, const float16_t *input1, float16_t *output,
-                                     int element_size, ArithmeticParameter *param);
+                                     int element_size, const ArithmeticParameter *param);
 typedef struct {
   int primitive_type_;
   int activation_type_;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_self_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_self_fp16.h
index 96e0ba04be0..824efe19726 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_self_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_self_fp16.h
@@ -20,7 +20,7 @@
 #include "src/runtime/kernel/arm/fp32/arithmetic_self_fp32.h"
 
 namespace mindspore::kernel {
-typedef int (*ArithmeticSelfFp16Func)(float16_t *input, float16_t *output, int element_size);
+typedef int (*ArithmeticSelfFp16Func)(const float16_t *input, float16_t *output, int element_size);
 class ArithmeticSelfFp16CPUKernel : public ArithmeticSelfCPUKernel {
  public:
   explicit ArithmeticSelfFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/biasadd_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/biasadd_fp16.cc
index 58cb9aaa3f2..80c557c68e5 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/biasadd_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/biasadd_fp16.cc
@@ -72,10 +72,10 @@ int BiasAddCPUFp16Kernel::Run() {
     ms_context_->allocator->Free(tile_bias);
     return RET_NULL_PTR;
   }
-  BroadcastAddFp16(in, bias_data_, tile_in, tile_bias, out, data_size, bias_param_);
+  auto ret = BroadcastAddFp16(in, bias_data_, tile_in, tile_bias, out, data_size, bias_param_);
   ms_context_->allocator->Free(tile_in);
   ms_context_->allocator->Free(tile_bias);
-  return RET_OK;
+  return ret;
 }
 
 BiasAddCPUFp16Kernel::~BiasAddCPUFp16Kernel() {
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/biasadd_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/biasadd_fp16.h
index 964f2cea768..cb715f2a256 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/biasadd_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/biasadd_fp16.h
@@ -42,7 +42,7 @@ class BiasAddCPUFp16Kernel : public InnerKernel {
   ArithmeticParameter *bias_param_ = nullptr;
   float16_t *bias_data_ = nullptr;
   lite::Tensor *bias_tensor_ = nullptr;
-  TypeId bias_data_type_;
+  TypeId bias_data_type_ = kNumberTypeFloat16;
   bool is_repack_ = false;
 };
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
index 84d1018efa3..691906574d2 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
@@ -84,14 +84,16 @@ int Convolution1x1FP16CPUKernel::MallocWeightBiasData() {
   auto output_channel = weight_tensor->Batch();
 
   size_t size = input_channel * UP_ROUND(output_channel, col_tile_) * sizeof(float16_t);
-  if (packed_weight_ == nullptr) {
-    packed_weight_ = malloc(size);
+  if (!op_parameter_->is_train_session_) {
     if (packed_weight_ == nullptr) {
-      MS_LOG(ERROR) << "Conv1x1 Malloc packed_weight_ error!";
-      return RET_ERROR;
+      packed_weight_ = malloc(size);
+      if (packed_weight_ == nullptr) {
+        MS_LOG(ERROR) << "Conv1x1 Malloc packed_weight_ error!";
+        return RET_ERROR;
+      }
     }
+    memset(reinterpret_cast<char *>(packed_weight_), 0, size);
   }
-  memset(reinterpret_cast<char *>(packed_weight_), 0, size);
 
   if (in_tensors_.size() == kInputSize2) {
     size = UP_ROUND(output_channel, col_tile_) * sizeof(float16_t);
@@ -111,7 +113,7 @@ void Convolution1x1FP16CPUKernel::PackWeight() {
   auto weight_tensor = in_tensors_.at(kWeightIndex);
   auto input_channel = weight_tensor->Channel();
   auto output_channel = weight_tensor->Batch();
-  void *weight_origin = IsTrainable() ? weight_tensor->data_c() : origin_weight_;
+  void *weight_origin = (op_parameter_->is_train_session_) ? weight_tensor->data_c() : origin_weight_;
   MS_ASSERT(weight_origin != nullptr);
 #ifdef ENABLE_ARM64
   RowMajor2Col16MajorFp16Opt(static_cast<const float16_t *>(weight_origin),
@@ -132,6 +134,13 @@ int Convolution1x1FP16CPUKernel::Init() {
   row_tile_ = C12NUM;
   col_tile_ = C8NUM;
 #endif
+  if (op_parameter_->is_train_session_) {
+    auto weight_tensor = in_tensors_.at(kWeightIndex);
+    auto input_channel = weight_tensor->Channel();
+    auto output_channel = weight_tensor->Batch();
+    size_t size = input_channel * UP_ROUND(output_channel, col_tile_) * sizeof(float16_t);
+    set_workspace_size(size);
+  }
   matmul_param_ = new (std::nothrow) MatMulParameter();
   if (matmul_param_ == nullptr) {
     MS_LOG(ERROR) << "Init matmul_param_ failed.";
@@ -288,10 +297,4 @@ int Convolution1x1FP16CPUKernel::Run() {
   return RET_OK;
 }
 
-int Convolution1x1FP16CPUKernel::Eval() {
-  if (IsTrainable()) {
-    is_repack_ = true;
-  }
-  return InnerKernel::Eval();
-}
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
index f2420e2fdfa..3f436442e4f 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
@@ -37,7 +37,6 @@ class Convolution1x1FP16CPUKernel : public ConvolutionBaseCPUKernel {
   int Init() override;
   int ReSize() override;
   int Run() override;
-  int Eval() override;
 
  public:
   int RunOc(int task_id);
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.cc
index 71c79f61139..48c1559a451 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.cc
@@ -24,6 +24,7 @@
 #include "src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h"
 #include "src/runtime/kernel/arm/fp16/convolution_depthwise_3x3_fp16.h"
 #include "src/runtime/kernel/arm/base/group_convolution_creator.h"
+#include "nnacl/base/conv_common_base.h"
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
@@ -48,7 +49,7 @@ void ConvolutionDelegateFP16CPUKernel::FreeCopiedData() {
   }
 }
 
-void *ConvolutionDelegateFP16CPUKernel::CopyData(lite::Tensor *tensor) {
+void *ConvolutionDelegateFP16CPUKernel::CopyData(const lite::Tensor *tensor) {
   auto data_type = tensor->data_type();
   if (data_type != kNumberTypeFloat32 && data_type != kNumberTypeFloat16) {
     MS_LOG(ERROR) << "Not supported data type: " << data_type;
@@ -85,7 +86,7 @@ int ConvolutionDelegateFP16CPUKernel::Init() {
   return ReSize();
 }
 
-static void SetInputOutputShapeInfo(ConvParameter *conv_param, lite::Tensor *input, lite::Tensor *output,
+static void SetInputOutputShapeInfo(ConvParameter *conv_param, const lite::Tensor *input, const lite::Tensor *output,
                                     const InnerContext *ctx) {
   conv_param->input_batch_ = input->Batch();
   conv_param->input_h_ = input->Height();
@@ -113,7 +114,9 @@ int ConvolutionDelegateFP16CPUKernel::ReSize() {
   }
   // copied weight and bias are not be used anymore,free them.
   FreeCopiedData();
-  return fp16_conv_kernel_->ReSize();
+  auto ret = fp16_conv_kernel_->ReSize();
+  set_workspace_size(fp16_conv_kernel_->workspace_size());
+  return ret;
 }
 
 kernel::InnerKernel *CpuConvDwFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
@@ -165,6 +168,11 @@ kernel::InnerKernel *CpuConvFp16KernelSelect(const std::vector<lite::Tensor *> &
     kernel = new (std::nothrow)
       kernel::ConvolutionFP16CPUKernel(op_parameter, inputs, outputs, ctx, origin_weight, origin_bias);
   }
+  if (kernel == nullptr) {
+    MS_LOG(ERROR) << "kernel is nullptr";
+    free(op_parameter);
+    return nullptr;
+  }
   // Once kernel is selected, init func will invoke InitWeightAndBias
   auto ret = kernel->Init();
   if (ret != RET_OK) {
@@ -178,9 +186,20 @@ kernel::InnerKernel *CpuConvFp16KernelSelect(const std::vector<lite::Tensor *> &
 kernel::InnerKernel *CpuGroupConvFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
                                                    const std::vector<lite::Tensor *> &outputs,
                                                    OpParameter *op_parameter, const InnerContext *ctx) {
-  auto *group_conv_creator = new GroupConvCreator(inputs, outputs, op_parameter, ctx, false, kNumberTypeFloat16);
-  return new (std::nothrow) GroupConvolutionFP16CPUKernel(op_parameter, inputs, outputs, ctx, group_conv_creator,
-                                                          reinterpret_cast<ConvParameter *>(op_parameter)->group_);
+  auto *group_conv_creator =
+    new (std::nothrow) GroupConvCreator(inputs, outputs, op_parameter, ctx, false, kNumberTypeFloat16);
+  if (group_conv_creator == nullptr) {
+    MS_LOG(ERROR) << "new GroupConvCreator fail";
+    free(op_parameter);
+    return nullptr;
+  }
+  auto kernel = new (std::nothrow) GroupConvolutionFP16CPUKernel(
+    op_parameter, inputs, outputs, ctx, group_conv_creator, reinterpret_cast<ConvParameter *>(op_parameter)->group_);
+  if (kernel == nullptr) {
+    MS_LOG(ERROR) << "new GroupConvolutionFP16CPUKernel fail";
+    free(op_parameter);
+  }
+  return kernel;
 }
 
 /* creator func */
@@ -200,7 +219,7 @@ kernel::InnerKernel *CpuConvFp16KernelCreator(const std::vector<lite::Tensor *>
     kernel = CpuGroupConvFp16KernelCreator(inputs, outputs, opParameter, static_cast<const lite::InnerContext *>(ctx));
   }
 
-  if (kernel == nullptr) {
+  if (conv_param->group_ == 1 && kernel == nullptr) {
     MS_LOG(DEBUG) << "Create conv fp16 kernel failed.";
     free(opParameter);
     return nullptr;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.h
index ed20b68a3d2..63c5316a343 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.h
@@ -39,18 +39,15 @@ class ConvolutionDelegateFP16CPUKernel : public InnerKernel {
       fp16_conv_kernel_ = nullptr;
     }
   }
-  void *CopyData(lite::Tensor *tensor);
+  void *CopyData(const lite::Tensor *tensor);
   void FreeCopiedData();
   int Init() override;
   int ReSize() override;
   int Run() override {
     fp16_conv_kernel_->set_name(name_);
+    fp16_conv_kernel_->set_workspace(workspace());
     return fp16_conv_kernel_->Run();
   }
-  int Eval() override {
-    InnerKernel::Eval();
-    return fp16_conv_kernel_->Eval();
-  }
   int Train() override {
     InnerKernel::Train();
     return fp16_conv_kernel_->Train();
@@ -59,6 +56,10 @@ class ConvolutionDelegateFP16CPUKernel : public InnerKernel {
     InnerKernel::SetTrainable(trainable);
     return fp16_conv_kernel_->SetTrainable(trainable);
   }
+  size_t workspace_size() override {
+    InnerKernel::workspace_size();
+    return fp16_conv_kernel_->workspace_size();
+  }
 
   void set_in_tensor(lite::Tensor *in_tensor, size_t index) override {
     MS_ASSERT(index < in_tensors_.size());
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_3x3_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_3x3_fp16.cc
index b5e54dbcb40..3cc631561a1 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_3x3_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_3x3_fp16.cc
@@ -29,7 +29,7 @@ namespace mindspore::kernel {
 void ConvolutionDepthwise3x3Fp16CPUKernel::PackWeight() {
   auto weight_tensor = in_tensors_.at(kWeightIndex);
   int channel = weight_tensor->Batch();
-  void *origin_weight = IsTrainable() ? weight_tensor->data_c() : origin_weight_;
+  void *origin_weight = (op_parameter_->is_train_session_) ? weight_tensor->data_c() : origin_weight_;
   MS_ASSERT(origin_weight != nullptr);
   PackWeightConvDw3x3Fp16(reinterpret_cast<float16_t *>(origin_weight), reinterpret_cast<float16_t *>(packed_weight_),
                           channel);
@@ -40,11 +40,16 @@ int ConvolutionDepthwise3x3Fp16CPUKernel::MallocWeightBiasData() {
   int channel = weight_tensor->Batch();
   int c8 = UP_ROUND(channel, C8NUM);
   int pack_weight_size = c8 * C12NUM;
-  if (packed_weight_ == nullptr) {
-    packed_weight_ = malloc(pack_weight_size * sizeof(float16_t));
+  if (!op_parameter_->is_train_session_) {
     if (packed_weight_ == nullptr) {
-      MS_LOG(ERROR) << "Malloc buffer failed.";
-      return RET_ERROR;
+      packed_weight_ = malloc(pack_weight_size * sizeof(float16_t));
+      if (packed_weight_ == nullptr) {
+        packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
+        if (packed_weight_ == nullptr) {
+          MS_LOG(ERROR) << "Malloc buffer failed.";
+          return RET_ERROR;
+        }
+      }
     }
   }
   if (bias_data_ == nullptr) {
@@ -59,6 +64,13 @@ int ConvolutionDepthwise3x3Fp16CPUKernel::MallocWeightBiasData() {
 }
 
 int ConvolutionDepthwise3x3Fp16CPUKernel::Init() {
+  if (op_parameter_->is_train_session_) {
+    auto weight_tensor = in_tensors_.at(kWeightIndex);
+    int channel = weight_tensor->Batch();
+    int c8 = UP_ROUND(channel, C8NUM);
+    int pack_weight_size = c8 * C12NUM;
+    set_workspace_size(pack_weight_size * sizeof(float16_t));
+  }
   auto ret = InitConvWeightBias();
   if (ret != 0) {
     MS_LOG(ERROR) << "Convolution depthwise 3x3 fp16 InitConvWeightBias failed.";
@@ -128,11 +140,5 @@ int ConvolutionDepthwise3x3Fp16CPUKernel::Run() {
   return RET_OK;
 }
 
-int ConvolutionDepthwise3x3Fp16CPUKernel::Eval() {
-  if (IsTrainable()) {
-    is_repack_ = true;
-  }
-  return InnerKernel::Eval();
-}
 }  // namespace mindspore::kernel
 #endif
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_3x3_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_3x3_fp16.h
index cc66bb528d0..26d64823ca7 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_3x3_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_3x3_fp16.h
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_DEPTHWISE_3X3_FP16_H_
-#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_DEPTHWISE_3X3_FP16_H_
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_3X3_FP16_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_3X3_FP16_H_
 
 #ifdef ENABLE_ARM
 #include <vector>
@@ -37,7 +37,6 @@ class ConvolutionDepthwise3x3Fp16CPUKernel : public ConvolutionBaseCPUKernel {
   int Run() override;
 
   int Execute(int task_id);
-  int Eval() override;
 
  private:
   void PackWeight() override;
@@ -48,4 +47,4 @@ class ConvolutionDepthwise3x3Fp16CPUKernel : public ConvolutionBaseCPUKernel {
 };
 }  // namespace mindspore::kernel
 #endif
-#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_DEPTHWISE_3X3_FP16_H_
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_3X3_FP16_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
index fae625bc7b9..c6772d4104d 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
@@ -25,7 +25,7 @@ using mindspore::lite::RET_OK;
 namespace mindspore::kernel {
 void ConvolutionDepthwiseFp16CPUKernel::PackWeight() {
   auto weight_tensor = in_tensors_.at(kWeightIndex);
-  void *origin_weight = IsTrainable() ? weight_tensor->data_c() : origin_weight_;
+  void *origin_weight = (op_parameter_->is_train_session_) ? weight_tensor->data_c() : origin_weight_;
   MS_ASSERT(origin_weight != nullptr);
   PackNCHWToNHWCFp16(reinterpret_cast<float16_t *>(origin_weight), reinterpret_cast<float16_t *>(packed_weight_), 1,
                      weight_tensor->Height() * weight_tensor->Width(), weight_tensor->Batch(), 0, 0);
@@ -35,11 +35,13 @@ int ConvolutionDepthwiseFp16CPUKernel::MallocWeightBiasData() {
   auto weight_tensor = in_tensors_.at(kWeightIndex);
   int channel = weight_tensor->Batch();
   int pack_weight_size = channel * weight_tensor->Height() * weight_tensor->Width();
-  if (packed_weight_ == nullptr) {
-    packed_weight_ = malloc(pack_weight_size * sizeof(float16_t));
+  if (!op_parameter_->is_train_session_) {
     if (packed_weight_ == nullptr) {
-      MS_LOG(ERROR) << "Malloc buffer failed.";
-      return RET_ERROR;
+      packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
+      if (packed_weight_ == nullptr) {
+        MS_LOG(ERROR) << "Malloc buffer failed.";
+        return RET_ERROR;
+      }
     }
   }
   if (bias_data_ == nullptr) {
@@ -56,6 +58,12 @@ int ConvolutionDepthwiseFp16CPUKernel::MallocWeightBiasData() {
 int ConvolutionDepthwiseFp16CPUKernel::Init() {
   CHECK_LESS_RETURN(in_tensors_.size(), 2);
   CHECK_LESS_RETURN(out_tensors_.size(), 1);
+  if (op_parameter_->is_train_session_) {
+    auto weight_tensor = in_tensors_.at(kWeightIndex);
+    int channel = weight_tensor->Batch();
+    int pack_weight_size = channel * weight_tensor->Height() * weight_tensor->Width();
+    set_workspace_size(pack_weight_size * sizeof(float16_t));
+  }
   auto ret = InitConvWeightBias();
   if (ret != 0) {
     MS_LOG(ERROR) << "Convolution depthwise fp16 InitConvWeightBias failed.";
@@ -113,10 +121,4 @@ int ConvolutionDepthwiseFp16CPUKernel::Run() {
   return ret;
 }
 
-int ConvolutionDepthwiseFp16CPUKernel::Eval() {
-  if (IsTrainable()) {
-    is_repack_ = true;
-  }
-  return InnerKernel::Eval();
-}
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
index 1b37edc0cd3..3975c1d42ca 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
@@ -43,7 +43,6 @@ class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel {
   int Init() override;
   int ReSize() override;
   int Run() override;
-  int Eval() override;
 
   int Execute(int task_id);
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc
index 294f8a8a404..5efcdde5923 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc
@@ -56,7 +56,7 @@ int ConvolutionDepthwiseSWFp16CPUKernel::InitPackedInputOutput() {
 
 void ConvolutionDepthwiseSWFp16CPUKernel::PackWeight() {
   auto weight_tensor = in_tensors_.at(kWeightIndex);
-  void *origin_weight = IsTrainable() ? weight_tensor->data_c() : origin_weight_;
+  void *origin_weight = (op_parameter_->is_train_session_) ? weight_tensor->data_c() : origin_weight_;
   MS_ASSERT(origin_weight != nullptr);
   PackNCHWFp16ToNC8HW8Fp16(reinterpret_cast<float16_t *>(origin_weight), reinterpret_cast<float16_t *>(packed_weight_),
                            1, weight_tensor->Height() * weight_tensor->Width(), weight_tensor->Batch());
@@ -66,13 +66,19 @@ int ConvolutionDepthwiseSWFp16CPUKernel::MallocWeightBiasData() {
   auto weight_tensor = in_tensors_.at(kWeightIndex);
   int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM);
   int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width();
-  if (packed_weight_ == nullptr) {
-    packed_weight_ = malloc(pack_weight_size * sizeof(float16_t));
+  if (!op_parameter_->is_train_session_) {
     if (packed_weight_ == nullptr) {
-      MS_LOG(ERROR) << "Malloc buffer failed.";
-      return RET_ERROR;
+      packed_weight_ = malloc(pack_weight_size * sizeof(float16_t));
+      if (packed_weight_ == nullptr) {
+        packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
+        if (packed_weight_ == nullptr) {
+          MS_LOG(ERROR) << "Malloc buffer failed.";
+          return RET_ERROR;
+        }
+      }
     }
   }
+
   if (bias_data_ == nullptr) {
     bias_data_ = malloc(C8NUM * OC8 * sizeof(float16_t));
     if (bias_data_ == nullptr) {
@@ -88,6 +94,12 @@ int ConvolutionDepthwiseSWFp16CPUKernel::MallocWeightBiasData() {
 int ConvolutionDepthwiseSWFp16CPUKernel::Init() {
   CHECK_LESS_RETURN(in_tensors_.size(), 2);
   CHECK_LESS_RETURN(out_tensors_.size(), 1);
+  if (op_parameter_->is_train_session_) {
+    auto weight_tensor = in_tensors_.at(kWeightIndex);
+    int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM);
+    int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width();
+    set_workspace_size(pack_weight_size * sizeof(float16_t));
+  }
   sliding_ = new (std::nothrow) SlidingWindowParam;
   if (sliding_ == nullptr) {
     MS_LOG(ERROR) << "new sliding window param failed.";
@@ -182,10 +194,4 @@ void ConvolutionDepthwiseSWFp16CPUKernel::FreePackedInputOutput() {
   }
 }
 
-int ConvolutionDepthwiseSWFp16CPUKernel::Eval() {
-  if (IsTrainable()) {
-    is_repack_ = true;
-  }
-  return InnerKernel::Eval();
-}
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h
index 5219c2c8570..f94f6f0107e 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h
@@ -44,7 +44,6 @@ class ConvolutionDepthwiseSWFp16CPUKernel : public ConvolutionBaseCPUKernel {
   int Init() override;
   int ReSize() override;
   int Run() override;
-  int Eval() override;
 
   int InitPackedInputOutput();
   int Execute(int task_id);
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
index 25ebcebf147..56c1eb57109 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
@@ -32,7 +32,7 @@ void ConvolutionFP16CPUKernel::PackWeight() {
   int in_channel = filter_tensor->Channel();
   int out_channel = filter_tensor->Batch();
   int kernel_plane = filter_tensor->Height() * filter_tensor->Width();
-  void *weight_origin = IsTrainable() ? filter_tensor->data_c() : origin_weight_;
+  void *weight_origin = (op_parameter_->is_train_session_) ? filter_tensor->data_c() : origin_weight_;
   MS_ASSERT(weight_origin != nullptr);
   RowMajor2Col8MajorFp16(weight_origin, reinterpret_cast<float16_t *>(packed_weight_), out_channel,
                          in_channel * kernel_plane, false);
@@ -49,15 +49,19 @@ int ConvolutionFP16CPUKernel::MallocWeightBiasData() {
   int pack_weight_size = oc8 * in_channel * kernel_plane;
 
   // init weight
-  if (packed_weight_ == nullptr) {
-    packed_weight_ = malloc(pack_weight_size * sizeof(float16_t));
+  if (!op_parameter_->is_train_session_) {
     if (packed_weight_ == nullptr) {
-      MS_LOG(ERROR) << "malloc packed_weight_ failed.";
-      return RET_ERROR;
+      packed_weight_ = malloc(pack_weight_size * sizeof(float16_t));
+      if (packed_weight_ == nullptr) {
+        packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
+        if (packed_weight_ == nullptr) {
+          MS_LOG(ERROR) << "malloc packed_weight_ failed.";
+          return RET_ERROR;
+        }
+      }
     }
+    memset(packed_weight_, 0, pack_weight_size * sizeof(float16_t));
   }
-  memset(packed_weight_, 0, pack_weight_size * sizeof(float16_t));
-
   // init bias
   if (bias_data_ == nullptr) {
     bias_data_ = malloc(oc8 * sizeof(float16_t));
@@ -91,6 +95,15 @@ int ConvolutionFP16CPUKernel::InitTmpBuffer() {
 int ConvolutionFP16CPUKernel::Init() {
   CHECK_LESS_RETURN(in_tensors_.size(), 2);
   CHECK_LESS_RETURN(out_tensors_.size(), 1);
+  if (op_parameter_->is_train_session_) {
+    auto filter_tensor = in_tensors_.at(kWeightIndex);
+    int in_channel = filter_tensor->Channel();
+    int out_channel = filter_tensor->Batch();
+    int oc8 = UP_ROUND(out_channel, col_tile_);
+    int kernel_plane = filter_tensor->Height() * filter_tensor->Width();
+    int pack_weight_size = oc8 * in_channel * kernel_plane;
+    set_workspace_size(pack_weight_size * sizeof(float16_t));
+  }
 #ifdef ENABLE_ARM64
   row_tile_ = C16NUM;
 #else
@@ -127,16 +140,21 @@ int ConvolutionFP16CPUKernel::ReSize() {
 }
 
 int ConvolutionFP16CPUKernel::RunImpl(int task_id) {
-  auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
-  auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
-  MS_ASSERT(input_ptr != nullptr);
-  MS_ASSERT(output_ptr != nullptr);
-  if (input_ptr == nullptr || output_ptr == nullptr) {
-    MS_LOG(ERROR) << "Convolution Fp16 get null tensor data!";
-    return RET_ERROR;
+  auto input_tensor = in_tensors_[0];
+  auto output_tensor = out_tensors_[0];
+  MS_ASSERT(input_tensor != nullptr);
+  MS_ASSERT(output_tensor != nullptr);
+  auto input_ptr = reinterpret_cast<float16_t *>(input_tensor->data_c());
+  auto output_ptr = reinterpret_cast<float16_t *>(output_tensor->data_c());
+  CHECK_NULL_RETURN(input_ptr);
+  CHECK_NULL_RETURN(output_ptr);
+  if (output_tensor->format() == NC4HW4) {
+    ConvOutNc8hw8Fp16(input_ptr, packed_input_, reinterpret_cast<float16_t *>(packed_weight_),
+                      reinterpret_cast<float16_t *>(bias_data_), col_major_input_, output_ptr, task_id, conv_param_);
+  } else {
+    ConvFp16(input_ptr, packed_input_, reinterpret_cast<float16_t *>(packed_weight_),
+             reinterpret_cast<float16_t *>(bias_data_), col_major_input_, output_ptr, task_id, conv_param_);
   }
-  ConvFp16(input_ptr, packed_input_, reinterpret_cast<float16_t *>(packed_weight_),
-           reinterpret_cast<float16_t *>(bias_data_), col_major_input_, output_ptr, task_id, conv_param_);
   return RET_OK;
 }
 
@@ -170,10 +188,4 @@ int ConvolutionFP16CPUKernel::Run() {
   return ret;
 }
 
-int ConvolutionFP16CPUKernel::Eval() {
-  if (IsTrainable()) {
-    is_repack_ = true;
-  }
-  return InnerKernel::Eval();
-}
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h
index ef08a5dfa2a..90f8df92e10 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h
@@ -34,7 +34,6 @@ class ConvolutionFP16CPUKernel : public ConvolutionBaseCPUKernel {
   int Init() override;
   int ReSize() override;
   int Run() override;
-  int Eval() override;
   int RunImpl(int task_id);
   int InitTmpBuffer();
   void AdjustNumberOfThread();
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
index 33ad5e4da68..cfbea94a6e2 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
@@ -20,8 +20,8 @@ using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 
 namespace mindspore::kernel {
-int ConvolutionWinogradFP16CPUKernel::WinogradFilterTransformFp16(const float16_t *weight_data, float *matrix_g,
-                                                                  float *matrix_gt, int oc_block) {
+int ConvolutionWinogradFP16CPUKernel::WinogradFilterTransformFp16(const float16_t *weight_data, const float *matrix_g,
+                                                                  const float *matrix_gt, int oc_block) {
   if (oc_block == 0) {
     MS_LOG(ERROR) << "Divide by zero";
     return RET_ERROR;
@@ -41,14 +41,16 @@ int ConvolutionWinogradFP16CPUKernel::MallocWeightBiasData() {
   int oc_block_num = UP_DIV(out_channel, col_tile_);
   // init weight
   auto trans_matrix_data_size = input_unit_ * input_unit_ * in_channel * oc_block_num * col_tile_ * sizeof(float16_t);
-  if (packed_weight_ == nullptr) {
-    packed_weight_ = malloc(trans_matrix_data_size);
+  if (!op_parameter_->is_train_session_) {
     if (packed_weight_ == nullptr) {
-      MS_LOG(ERROR) << "malloc packed_weight_ failed.";
-      return RET_ERROR;
+      packed_weight_ = malloc(trans_matrix_data_size);
+      if (packed_weight_ == nullptr) {
+        MS_LOG(ERROR) << "malloc packed_weight_ failed.";
+        return RET_ERROR;
+      }
     }
+    memset(packed_weight_, 0, trans_matrix_data_size);
   }
-  memset(packed_weight_, 0, trans_matrix_data_size);
 
   float matrix_a[64];
   float matrix_at[64];
@@ -78,7 +80,7 @@ int ConvolutionWinogradFP16CPUKernel::MallocWeightBiasData() {
 
 void ConvolutionWinogradFP16CPUKernel::PackWeight() {
   auto weight_tensor = in_tensors_.at(kWeightIndex);
-  void *weight_origin = IsTrainable() ? weight_tensor->data_c() : origin_weight_;
+  void *weight_origin = (op_parameter_->is_train_session_) ? weight_tensor->data_c() : origin_weight_;
   MS_ASSERT(weight_origin != nullptr);
   WinogradFilterTransformFp16(reinterpret_cast<float16_t *>(weight_origin), matrix_g_, matrix_gt_, col_tile_);
 }
@@ -144,6 +146,14 @@ int ConvolutionWinogradFP16CPUKernel::Init() {
 #else
   row_tile_ = C12NUM;
 #endif
+  if (op_parameter_->is_train_session_) {
+    auto weight_tensor = in_tensors_.at(kWeightIndex);
+    int in_channel = weight_tensor->Channel();
+    int out_channel = weight_tensor->Batch();
+    int oc_block_num = UP_DIV(out_channel, col_tile_);
+    auto trans_matrix_data_size = input_unit_ * input_unit_ * in_channel * oc_block_num * col_tile_ * sizeof(float16_t);
+    set_workspace_size(trans_matrix_data_size);
+  }
   kernel_unit_ = conv_param_->kernel_h_;
   input_unit_ = output_unit_ + kernel_unit_ - 1;
   conv_param_->input_unit_ = input_unit_;
@@ -190,6 +200,7 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() {
     MS_LOG(ERROR) << "AdjustNumberOfThread failed.";
     return ret;
   }
+  conv_param_->out_format_ = out_tensors_[0]->format();
   return RET_OK;
 }
 
@@ -237,10 +248,4 @@ int ConvolutionWinogradFP16CPUKernel::Run() {
   return ret;
 }
 
-int ConvolutionWinogradFP16CPUKernel::Eval() {
-  if (IsTrainable()) {
-    is_repack_ = true;
-  }
-  return InnerKernel::Eval();
-}
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
index e94191966b0..a770b5bca5f 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
@@ -38,11 +38,11 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseCPUKernel {
   int Init() override;
   int ReSize() override;
   int Run() override;
-  int Eval() override;
   int RunImpl(int task_id);
   int InitTmpBuffer();
   int ConfigInputOutput();
-  int WinogradFilterTransformFp16(const float16_t *weight_data, float *matrix_g, float *matrix_gt, int oc_block);
+  int WinogradFilterTransformFp16(const float16_t *weight_data, const float *matrix_g, const float *matrix_gt,
+                                  int oc_block);
   int AdjustNumberOfThread();
 
  private:
@@ -66,6 +66,7 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseCPUKernel {
       col_buffer_ = nullptr;
     }
   }
+  int FilterWeight();
   int kernel_unit_ = 0;
   int input_unit_ = 0;
   int output_unit_;
@@ -75,7 +76,7 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseCPUKernel {
   float16_t *col_buffer_ = nullptr;
   float matrix_g_[64];
   float matrix_gt_[64];
-  TmpBufferAddressFp16 tmp_buffer_address_list_[4];
+  TmpBufferAddressFp16 tmp_buffer_address_list_[4] = {0};
   InputTransFp16Func in_func_ = nullptr;
   OutputTransFp16Func out_func_ = nullptr;
   int col_tile_ = 0;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
index 445003fdf6b..16afef7dee0 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
@@ -70,10 +70,12 @@ int DeconvolutionDepthwiseFp16CPUKernel::MallocWeightBiasData() {
   int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM);
   int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width();
 
-  packed_weight_ = malloc(pack_weight_size * sizeof(float16_t));
-  if (packed_weight_ == nullptr) {
-    MS_LOG(ERROR) << "Malloc buffer failed.";
-    return RET_ERROR;
+  if (!op_parameter_->is_train_session_) {
+    packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
+    if (packed_weight_ == nullptr) {
+      MS_LOG(ERROR) << "Malloc buffer failed.";
+      return RET_ERROR;
+    }
   }
 
   bias_data_ = malloc(C8NUM * OC8 * sizeof(float16_t));
@@ -88,7 +90,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::MallocWeightBiasData() {
 
 void DeconvolutionDepthwiseFp16CPUKernel::PackWeight() {
   auto weight_tensor = in_tensors_.at(kWeightIndex);
-  void *origin_weight = IsTrainable() ? weight_tensor->data_c() : origin_weight_;
+  void *origin_weight = (op_parameter_->is_train_session_) ? weight_tensor->data_c() : origin_weight_;
   MS_ASSERT(origin_weight != nullptr);
   PackNCHWFp16ToNC8HW8Fp16(reinterpret_cast<float16_t *>(origin_weight), reinterpret_cast<float16_t *>(packed_weight_),
                            1, weight_tensor->Height() * weight_tensor->Width(), weight_tensor->Batch());
@@ -97,6 +99,12 @@ void DeconvolutionDepthwiseFp16CPUKernel::PackWeight() {
 int DeconvolutionDepthwiseFp16CPUKernel::Init() {
   CHECK_LESS_RETURN(in_tensors_.size(), 2);
   CHECK_LESS_RETURN(out_tensors_.size(), 1);
+  if (op_parameter_->is_train_session_) {
+    auto weight_tensor = in_tensors_.at(kWeightIndex);
+    int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM);
+    int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width();
+    set_workspace_size(pack_weight_size * sizeof(float16_t));
+  }
   sliding_ = new (std::nothrow) SlidingWindowParam;
   if (sliding_ == nullptr) {
     MS_LOG(ERROR) << "new SlidingWindowParam fail!";
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc
index c80479b2756..183ab983aae 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc
@@ -54,7 +54,7 @@ void DeConvolutionFp16CPUKernel::PackWeight() {
   auto output_channel = weight_tensor->Channel();
   auto kernel_h = weight_tensor->Height();
   auto kernel_w = weight_tensor->Width();
-  void *origin_weight = IsTrainable() ? weight_tensor->data_c() : origin_weight_;
+  void *origin_weight = (op_parameter_->is_train_session_) ? weight_tensor->data_c() : origin_weight_;
   MS_ASSERT(origin_weight != nullptr);
   PackNHWCFp16ToC8HWN8Fp16(reinterpret_cast<float16_t *>(origin_weight), reinterpret_cast<float16_t *>(packed_weight_),
                            input_channel, kernel_w * kernel_h, output_channel);
@@ -67,12 +67,14 @@ int DeConvolutionFp16CPUKernel::MallocWeightBiasData() {
   auto kernel_h = weight_tensor->Height();
   auto kernel_w = weight_tensor->Width();
   size_t weight_pack_size = input_channel * kernel_w * kernel_h * UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);
-  packed_weight_ = malloc(weight_pack_size);
-  if (packed_weight_ == nullptr) {
-    MS_LOG(ERROR) << "deconv malloc packed_weight_ error!";
-    return RET_ERROR;
+  if (!op_parameter_->is_train_session_) {
+    packed_weight_ = malloc(weight_pack_size);
+    if (packed_weight_ == nullptr) {
+      MS_LOG(ERROR) << "deconv malloc packed_weight_ error!";
+      return RET_ERROR;
+    }
+    memset(packed_weight_, 0, weight_pack_size);
   }
-  memset(packed_weight_, 0, weight_pack_size);
   auto bias_size = UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);
   bias_data_ = malloc(bias_size);
   if (bias_data_ == nullptr) {
@@ -174,6 +176,15 @@ int DeConvolutionFp16CPUKernel::DoDeconv(int task_id) {
 int DeConvolutionFp16CPUKernel::Init() {
   CHECK_LESS_RETURN(in_tensors_.size(), 2);
   CHECK_LESS_RETURN(out_tensors_.size(), 1);
+  if (op_parameter_->is_train_session_) {
+    auto weight_tensor = in_tensors_.at(kWeightIndex);
+    auto input_channel = weight_tensor->Batch();
+    auto output_channel = weight_tensor->Channel();
+    auto kernel_h = weight_tensor->Height();
+    auto kernel_w = weight_tensor->Width();
+    size_t weight_pack_size = input_channel * kernel_w * kernel_h * UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);
+    set_workspace_size(weight_pack_size);
+  }
   matmul_param_ = new (std::nothrow) MatMulParameter();
   if (matmul_param_ == nullptr) {
     MS_LOG(ERROR) << "Memory allocation failed";
@@ -203,7 +214,6 @@ int DeConvolutionFp16CPUKernel::Run() {
     MS_LOG(ERROR) << "DeConvolution Fp16 get null tensor data!";
     return RET_ERROR;
   }
-
   int error_code = InitRunBuf();
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "deconv fp16 InitRunBuf error! error_code[" << error_code << "]";
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.h
index 21f286b2998..7af41dfc31b 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.h
@@ -46,12 +46,12 @@ class DeConvolutionFp16CPUKernel : public ConvolutionBaseCPUKernel {
   void PackWeight() override;
 
  private:
-  MatMulParameter *matmul_param_;
-  int input_plane_;
-  int kernel_plane_;
-  int output_plane_;
-  int thread_count_;
-  int thread_stride_;
+  MatMulParameter *matmul_param_ = nullptr;
+  int input_plane_ = 0;
+  int kernel_plane_ = 0;
+  int output_plane_ = 0;
+  int thread_count_ = 0;
+  int thread_stride_ = 0;
   float16_t *pack_input_ = nullptr;
   float16_t *pack_output_ = nullptr;
   float16_t *tmp_buffer_ = nullptr;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc
index d4e1bb73ce0..921a063de3a 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc
@@ -151,6 +151,9 @@ int DeConvWinogradFp16CPUKernel::InitParameter() {
   for (int i = 0; i < deconv_param_->compute_size_; i++) {
     DeConvComputeUnit &unit = deconv_param_->compute_units_[i];
     if (unit.use_winograd_) {
+      if (unit.winograd_.kh_ >= DECONV_WINOGRAD_BUFFER_COUNT) {
+        return RET_ERROR;
+      }
       if (deconv_param_->a_buffer_[unit.winograd_.kh_].buf_init_ == false) {
         deconv_param_->a_buffer_[unit.winograd_.kh_].buf_init_ = true;
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/gather_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/gather_fp16.cc
index f88969604d3..9cd76bdfc00 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/gather_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/gather_fp16.cc
@@ -153,7 +153,7 @@ int GatherFp16CPUKernel::Run() {
   return ret;
 }
 
-int GatherFp16CPUKernel::AssignIndicesData(bool isIndicesInt32, int indices_num, lite::Tensor *indices_tensor) {
+int GatherFp16CPUKernel::AssignIndicesData(bool isIndicesInt32, int indices_num, const lite::Tensor *indices_tensor) {
   MS_ASSERT(indices_tensor->data_c() != nullptr);
   if (!isIndicesInt32) {
     if (indices_num >= std::numeric_limits<int>::max() / static_cast<int>(sizeof(int))) {
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/gather_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/gather_fp16.h
index 39167c747fd..ba2dd21e2b0 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/gather_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/gather_fp16.h
@@ -39,7 +39,7 @@ class GatherFp16CPUKernel : public InnerKernel {
 
  private:
   int *indices_data_ = nullptr;
-  int AssignIndicesData(bool isIndicesInt32, int indices_num, lite::Tensor *indices_tensor);
+  int AssignIndicesData(bool isIndicesInt32, int indices_num, const lite::Tensor *indices_tensor);
   void FreeIndicesData();
   float16_t *input_data_ = nullptr;
   bool const_input_ = false;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/gru_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/gru_fp16.h
index 45c748865fc..ef1e5a11a51 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/gru_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/gru_fp16.h
@@ -47,7 +47,7 @@ class GruFp16CPUKernel : public InnerKernel {
   float16_t *input_bias_ = nullptr;
   float16_t *state_bias_ = nullptr;
 
-  float16_t *buffer_[4];
+  float16_t *buffer_[4] = {0};
   const int gate_num = 3;
   const int packed_input_index = 0;
   const int input_gate_index = 1;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/instance_norm_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/instance_norm_fp16.cc
index ad4bd8870cc..980da8fc53f 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/instance_norm_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/instance_norm_fp16.cc
@@ -85,15 +85,21 @@ int InstanceNormFp16CPUKernel::Init() {
 }
 
 int InstanceNormFp16CPUKernel::ReSize() {
-  auto shape = in_tensors_.front()->shape();
-  param_->batch_ = shape[0];
-  param_->inner_size_ = shape[2] * shape[3];
-  param_->channel_ = shape[1];
+  param_->op_parameter_.thread_num_ = op_parameter_->thread_num_;
+  auto in_tensor = in_tensors_.front();
+  param_->batch_ = in_tensor->Batch();
+  param_->inner_size_ = in_tensor->Height() * in_tensor->Width();
+  param_->channel_ = in_tensor->Channel();
   return RET_OK;
 }
 
 int InstanceNormFp16CPUKernel::DoInstanceNorm(int task_id) {
-  int ret = InstanceNormFp16(src_data_, dst_data_, gamma_data_, beta_data_, param_, task_id);
+  int ret = RET_OK;
+  if (in_tensors_[0]->format() == NC4HW4) {
+    ret = InstanceNormNC8HW8Fp16(src_data_, dst_data_, gamma_data_, beta_data_, param_, task_id);
+  } else {
+    ret = InstanceNormFp16(src_data_, dst_data_, gamma_data_, beta_data_, param_, task_id);
+  }
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "DoInstanceNorm error error_code[" << ret << "]";
     return ret;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/lstm_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/lstm_fp16.h
index 53afa9ab266..1ab190ce082 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/lstm_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/lstm_fp16.h
@@ -49,7 +49,7 @@ class LstmFp16CPUKernel : public InnerKernel {
   float16_t *input_bias_ = nullptr;
   float16_t *state_bias_ = nullptr;
 
-  float16_t *buffer_[6];
+  float16_t *buffer_[6] = {0};
   const int gate_num = 4;
   const int packed_input_index = 0;
   const int input_gate_index = 1;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.cc
index 256c598b0be..1e2b27f42b2 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.cc
@@ -155,21 +155,21 @@ int MatmulBaseFP16CPUKernel::InitBufferB() {
   return RET_OK;
 }
 
-void MatmulBaseFP16CPUKernel::InitMatrixA(void *src_ptr) {
+void MatmulBaseFP16CPUKernel::InitMatrixA(const void *src_ptr) {
   auto src_data_type = in_tensors_[0]->data_type();
 
   if (vec_matmul_) {
     if (src_data_type == kNumberTypeFloat32) {
-      Float32ToFloat16(reinterpret_cast<float *>(src_ptr), a_pack_ptr_, params_->batch * params_->deep_);
+      Float32ToFloat16(reinterpret_cast<const float *>(src_ptr), a_pack_ptr_, params_->batch * params_->deep_);
     } else {
       memcpy(a_pack_ptr_, src_ptr, params_->batch * params_->deep_ * sizeof(float16_t));
     }
     return;
   }
 
-  int8_t *int8_src = reinterpret_cast<int8_t *>(src_ptr);
+  const int8_t *int8_src = reinterpret_cast<const int8_t *>(src_ptr);
   for (int i = 0; i < params_->batch; i++) {
-    int8_t *src = int8_src + i * params_->deep_ * params_->row_ * lite::DataTypeSize(src_data_type);
+    const int8_t *src = int8_src + i * params_->deep_ * params_->row_ * lite::DataTypeSize(src_data_type);
     float16_t *dst = a_pack_ptr_ + i * params_->deep_ * params_->row_align_;
     if (params_->a_transpose_) {
 #ifdef ENABLE_ARM64
@@ -188,13 +188,13 @@ void MatmulBaseFP16CPUKernel::InitMatrixA(void *src_ptr) {
   return;
 }
 
-void MatmulBaseFP16CPUKernel::InitMatrixB(void *src_ptr, TypeId src_data_type) {
-  int8_t *int8_src = reinterpret_cast<int8_t *>(src_ptr);
+void MatmulBaseFP16CPUKernel::InitMatrixB(const void *src_ptr, TypeId src_data_type) {
+  const int8_t *int8_src = reinterpret_cast<const int8_t *>(src_ptr);
 
   if (vec_matmul_) {
     if (params_->b_transpose_) {
       if (src_data_type == kNumberTypeFloat32) {
-        Float32ToFloat16(reinterpret_cast<float *>(src_ptr), b_pack_ptr_,
+        Float32ToFloat16(reinterpret_cast<const float *>(src_ptr), b_pack_ptr_,
                          params_->batch * params_->col_ * params_->deep_);
       } else {
 #ifdef ENABLE_ARM64
@@ -220,7 +220,7 @@ void MatmulBaseFP16CPUKernel::InitMatrixB(void *src_ptr, TypeId src_data_type) {
   }
 
   for (int i = 0; i < params_->batch; i++) {
-    int8_t *src = int8_src + i * params_->deep_ * params_->col_ * lite::DataTypeSize(src_data_type);
+    const int8_t *src = int8_src + i * params_->deep_ * params_->col_ * lite::DataTypeSize(src_data_type);
     float16_t *dst = b_pack_ptr_ + i * params_->deep_ * params_->col_align_;
     if (params_->b_transpose_) {
       RowMajor2Col8MajorFp16(src, dst, params_->col_, params_->deep_, src_data_type == kNumberTypeFloat32);
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.h
index ea2f4e5dec8..78d4f63c4ea 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.h
@@ -50,8 +50,8 @@ class MatmulBaseFP16CPUKernel : public InnerKernel {
   void ResizeParameter();
   int InitBufferA();
   int InitBufferB();
-  void InitMatrixA(void *src_ptr);
-  void InitMatrixB(void *src_ptr, TypeId data_type);
+  void InitMatrixA(const void *src_ptr);
+  void InitMatrixB(const void *src_ptr, TypeId data_type);
   void FreeResizeBufA();
   void FreeResizeBufB();
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.cc
index 0a35595eebb..20c8a4f784a 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.cc
@@ -27,7 +27,6 @@ using mindspore::schema::PrimitiveType_PadFusion;
 namespace mindspore::kernel {
 namespace {
 constexpr size_t kPadCommonInputSize = 2;
-constexpr size_t kPadMaxInputSize = 3;
 }  // namespace
 int PadFp16CPUKernel::RunImpl(int task_id) {
   PadFp16(input_, output_, in_, out_, pad_param_->paddings_, task_id, op_parameter_->thread_num_);
@@ -102,9 +101,6 @@ int PadFp16CPUKernel::Run() {
         return RET_ERROR;
       }
     }
-    if (in_tensors_.size() == kPadMaxInputSize) {
-      pad_param_->constant_value_ = reinterpret_cast<float *>(in_tensors_.at(2)->data_c())[0];
-    }
     if (pad_param_->constant_value_ - 0.0f < 1e-5) {
       memset(output_, 0, output_tensor->ElementsNum() * sizeof(float16_t));
     } else {
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/power_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/power_fp16.h
index cd501f94cd0..22d0c8bf5b4 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/power_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/power_fp16.h
@@ -45,7 +45,7 @@ class PowerFp16CPUKernel : public InnerKernel {
   float shift_;
   float16_t *exp_data_ = nullptr;
   lite::Tensor *exp_tensor_ = nullptr;
-  TypeId exp_data_type_;
+  TypeId exp_data_type_ = kNumberTypeFloat16;
 };
 }  // namespace mindspore::kernel
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/quant_dtype_cast_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/quant_dtype_cast_fp16.cc
index 1df7d4486ac..f89dd891f2b 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/quant_dtype_cast_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/quant_dtype_cast_fp16.cc
@@ -78,7 +78,7 @@ int QuantDTypeCastFp16CPUKernel::Init() {
 int QuantDTypeCastFp16CPUKernel::ReSize() {
   auto in_tensor = in_tensors_.front();
   num_unit_ = static_cast<int>(in_tensor->ElementsNum());
-  thread_n_num_ = MSMIN(thread_num_, num_unit_);
+  thread_n_num_ = MSMIN(ms_context_->thread_num_, num_unit_);
   thread_n_stride_ = UP_DIV(num_unit_, thread_n_num_);
   return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/quant_dtype_cast_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/quant_dtype_cast_fp16.h
index 1ad3b22bbd9..7040c469a2a 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/quant_dtype_cast_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/quant_dtype_cast_fp16.h
@@ -26,7 +26,7 @@ class QuantDTypeCastFp16CPUKernel : public InnerKernel {
  public:
   QuantDTypeCastFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                               const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
-      : InnerKernel(parameter, inputs, outputs, ctx), thread_num_(ctx->thread_num_) {}
+      : InnerKernel(parameter, inputs, outputs, ctx) {}
   ~QuantDTypeCastFp16CPUKernel() override = default;
 
   int Init() override;
@@ -35,15 +35,14 @@ class QuantDTypeCastFp16CPUKernel : public InnerKernel {
   int QuantDTypeCast(int task_id);
 
  private:
-  int thread_num_;
-  int thread_n_num_;
-  int thread_n_stride_;
-  int num_unit_;
-  int8_t *int8_ptr_;
-  uint8_t *uint8_ptr_;
-  float16_t *float16_ptr_;
-  bool int_to_float_;
-  bool is_uint8_;
+  int thread_n_num_ = 0;
+  int thread_n_stride_ = 0;
+  int num_unit_ = 0;
+  int8_t *int8_ptr_ = nullptr;
+  uint8_t *uint8_ptr_ = nullptr;
+  float16_t *float16_ptr_ = nullptr;
+  bool int_to_float_ = false;
+  bool is_uint8_ = false;
 };
 }  // namespace mindspore::kernel
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.h
index 13585a86e30..ced30851852 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.h
@@ -40,7 +40,7 @@ class StackFp16CPUKernel : public StackBaseCPUKernel {
   std::vector<bool> malloc_buffers_;
   std::vector<void *> buffers_;
   float16_t *out_buffer_ = nullptr;
-  bool malloc_out_;
+  bool malloc_out_ = false;
 };
 }  // namespace mindspore::kernel
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc
index 7f129a758ed..7fff594c1ab 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc
@@ -117,6 +117,13 @@ int Convolution1x1CPUKernel::Init() {
     MS_LOG(ERROR) << "Memory allocation failed";
     return RET_ERROR;
   }
+  if (op_parameter_->is_train_session_) {
+    auto filter_tensor = in_tensors_.at(kWeightIndex);
+    auto input_channel = filter_tensor->Channel();
+    auto output_channel = filter_tensor->Batch();
+    int size = input_channel * UP_ROUND(output_channel, col_tile_) * sizeof(float);
+    set_workspace_size(size);
+  }
   int error_code = InitConvWeightBias();
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "Convolution1x1 init weight and bias failed.";
@@ -142,9 +149,15 @@ int Convolution1x1CPUKernel::DoConv1x1(int task_id) {
     return RET_OK;
   }
   auto bias = (bias_data_ == nullptr) ? nullptr : reinterpret_cast<float *>(bias_data_) + thread_stride_ * task_id;
-  MatMulOpt(pack_input_, reinterpret_cast<float *>(packed_weight_) + task_id * thread_stride_ * matmul_param_->deep_,
-            output_ptr_ + task_id * thread_stride_, bias, matmul_param_->act_type_, matmul_param_->deep_,
-            matmul_param_->row_, cur_oc, matmul_param_->col_, OutType_Nhwc);
+  if (out_tensors()[0]->format() != NC4HW4) {
+    MatMulOpt(pack_input_, reinterpret_cast<float *>(packed_weight_) + task_id * thread_stride_ * matmul_param_->deep_,
+              output_ptr_ + task_id * thread_stride_, bias, matmul_param_->act_type_, matmul_param_->deep_,
+              matmul_param_->row_, cur_oc, matmul_param_->col_, OutType_Nhwc);
+  } else {
+    MatMulOpt(pack_input_, reinterpret_cast<float *>(packed_weight_) + task_id * thread_stride_ * matmul_param_->deep_,
+              output_ptr_ + task_id * thread_stride_ * matmul_param_->row_, bias, matmul_param_->act_type_,
+              matmul_param_->deep_, matmul_param_->row_, cur_oc, matmul_param_->row_, OutType_NC4HW4);
+  }
   return RET_OK;
 }
 
@@ -167,15 +180,26 @@ int Convolution1x1CPUKernel::DoConv1x1Hw(int task_id) {
 
   float *thread_input_ptr = input_ptr_ + task_id * thread_stride_ * matmul_param_->deep_;
   float *thread_pack_input = pack_input_ + task_id * row_tile_ * matmul_param_->deep_;
-  float *thread_output_ptr = output_ptr_ + task_id * thread_stride_ * matmul_param_->col_;
+  float *thread_output_ptr;
+  if (out_tensors()[0]->format() != NC4HW4) {
+    thread_output_ptr = output_ptr_ + task_id * thread_stride_ * matmul_param_->col_;
+  } else {
+    thread_output_ptr = output_ptr_ + task_id * thread_stride_ * MSMIN(matmul_param_->col_, C4NUM);
+  }
   float *cur_intput = thread_input_ptr;
   float *cur_output = thread_output_ptr;
   for (int i = 0; i < cur_hw_; i += row_tile_) {
     int cur_rows = (cur_hw_ - i >= row_tile_) ? row_tile_ : (cur_hw_ - i);
     PackMatmulInput(cur_intput, thread_pack_input, cur_rows, matmul_param_->deep_);
-    MatMulOpt(thread_pack_input, reinterpret_cast<float *>(packed_weight_), cur_output,
-              reinterpret_cast<float *>(bias_data_), matmul_param_->act_type_, matmul_param_->deep_, cur_rows,
-              matmul_param_->col_, matmul_param_->col_, OutType_Nhwc);
+    if (out_tensors()[0]->format() != NC4HW4) {
+      MatMulOpt(thread_pack_input, reinterpret_cast<float *>(packed_weight_), cur_output,
+                reinterpret_cast<float *>(bias_data_), matmul_param_->act_type_, matmul_param_->deep_, cur_rows,
+                matmul_param_->col_, matmul_param_->col_, OutType_Nhwc);
+    } else {
+      MatMulOpt(thread_pack_input, reinterpret_cast<float *>(packed_weight_), cur_output,
+                reinterpret_cast<float *>(bias_data_), matmul_param_->act_type_, matmul_param_->deep_, cur_rows,
+                matmul_param_->col_, matmul_param_->row_, OutType_NC4HW4);
+    }
     cur_intput += row_tile_ * matmul_param_->deep_;
     cur_output += row_tile_ * matmul_param_->col_;
   }
@@ -253,7 +277,7 @@ void Convolution1x1CPUKernel::PackWeight() {
     return;
   }
 
-  void *origin_weight = IsTrainable() ? filter_tensor->data_c() : origin_weight_;
+  void *origin_weight = (op_parameter_->is_train_session_) ? filter_tensor->data_c() : origin_weight_;
   MS_ASSERT(origin_weight != nullptr);
 #ifdef ENABLE_AVX
   RowMajor2Col16Major(reinterpret_cast<float *>(origin_weight), reinterpret_cast<float *>(packed_weight_),
@@ -272,12 +296,14 @@ int Convolution1x1CPUKernel::MallocWeightBiasData() {
   auto input_channel = filter_tensor->Channel();
   auto output_channel = filter_tensor->Batch();
   int size = input_channel * UP_ROUND(output_channel, col_tile_) * sizeof(float);
-  packed_weight_ = malloc(size);
-  if (packed_weight_ == nullptr) {
-    MS_LOG(ERROR) << "Conv1x1 Malloc packed_weight_ error!";
-    return RET_ERROR;
+  if (!op_parameter_->is_train_session_) {
+    packed_weight_ = malloc(size);
+    if (packed_weight_ == nullptr) {
+      MS_LOG(ERROR) << "Conv1x1 Malloc packed_weight_ error!";
+      return RET_ERROR;
+    }
+    memset(reinterpret_cast<char *>(packed_weight_), 0, size);
   }
-  memset(reinterpret_cast<char *>(packed_weight_), 0, size);
 
   if (in_tensors_.size() == 3) {
     size = UP_ROUND(output_channel, col_tile_) * sizeof(float);
@@ -290,16 +316,4 @@ int Convolution1x1CPUKernel::MallocWeightBiasData() {
   }
   return RET_OK;
 }
-
-int Convolution1x1CPUKernel::Eval() {
-  auto ret = InnerKernel::Eval();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "eval failed!";
-    return ret;
-  }
-  if (IsTrainable()) {
-    PackWeight();
-  }
-  return RET_OK;
-}
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.h
index 19d3d040ec7..c187449de30 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.h
@@ -40,7 +40,6 @@ class Convolution1x1CPUKernel : public ConvolutionBaseCPUKernel {
   int Init() override;
   int Run() override;
   int ReSize() override;
-  int Eval() override;
 
  public:
   int DoConv1x1(int task_id);
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.cc
index 27411fbb226..cd935cc5c1d 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.cc
@@ -24,6 +24,7 @@
 #include "src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow_x86_fp32.h"
 #include "src/runtime/kernel/arm/base/group_convolution_creator.h"
 #include "src/runtime/kernel/arm/fp32/group_convolution_fp32.h"
+#include "nnacl/base/conv_common_base.h"
 #include "schema/model_generated.h"
 #include "include/errorcode.h"
 #if defined(ENABLE_ARM) || (defined(ENABLE_SSE) && !defined(ENABLE_AVX))
@@ -39,6 +40,7 @@
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_INFER_INVALID;
+using mindspore::lite::RET_NULL_PTR;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Conv2DFusion;
 
@@ -78,14 +80,11 @@ int ConvolutionDelegateCPUKernel::GetWeightData() {
   }
   if (InferShapeDone()) {
     origin_weight_ = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->data_c());
-    MS_ASSERT(origin_weight_ != nullptr);
+    CHECK_NULL_RETURN(origin_weight_);
     return RET_OK;
   }
   origin_weight_ = CopyData(in_tensors_.at(kWeightIndex));
-  if (origin_weight_ == nullptr) {
-    MS_LOG(ERROR) << "Copy weight data failed.";
-    return RET_ERROR;
-  }
+  CHECK_NULL_RETURN(origin_weight_);
   need_free_weight_ = true;
   return RET_OK;
 }
@@ -94,14 +93,11 @@ int ConvolutionDelegateCPUKernel::GetBiasData() {
   if (in_tensors_.size() == 3) {
     if (InferShapeDone()) {
       origin_bias_ = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->data_c());
-      MS_ASSERT(origin_bias_ != nullptr);
+      CHECK_NULL_RETURN(origin_bias_);
       return RET_OK;
     } else {
       origin_bias_ = CopyData(in_tensors_.at(kBiasIndex));
-      if (origin_bias_ == nullptr) {
-        MS_LOG(ERROR) << "Copy bias data failed.";
-        return RET_ERROR;
-      }
+      CHECK_NULL_RETURN(origin_bias_);
       need_free_bias_ = true;
       return RET_OK;
     }
@@ -129,7 +125,7 @@ int ConvolutionDelegateCPUKernel::ReSize() {
   if (conv_kernel_ == nullptr) {
     // need to select actual execute kernel here
     conv_kernel_ = CpuConvFp32KernelSelect();
-    if (!conv_kernel_) {
+    if (conv_kernel_ == nullptr) {
       MS_LOG(ERROR) << "Selecting execute kernel failed for conv_kernel, got a nullptr.";
       return RET_ERROR;
     }
@@ -215,6 +211,7 @@ kernel::InnerKernel *ConvolutionDelegateCPUKernel::CpuConvFp32KernelSelect() {
     if (ret != RET_OK) {
       MS_LOG(ERROR) << "conv kernel init failed.";
       delete kernel;
+      op_parameter_ = nullptr;
       return nullptr;
     }
   }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.h
index b97e2ba0ec4..d01342f1dda 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.h
@@ -38,7 +38,11 @@ class ConvolutionDelegateCPUKernel : public InnerKernel {
   };
   int Init() override;
   int ReSize() override;
-  int Run() override { return conv_kernel_->Run(); }
+  int Run() override {
+    conv_kernel_->set_name(name_);
+    conv_kernel_->set_workspace(workspace());
+    return conv_kernel_->Run();
+  }
 
   void set_in_tensor(lite::Tensor *in_tensor, size_t index) override {
     MS_ASSERT(index < in_tensors_.size());
@@ -81,10 +85,6 @@ class ConvolutionDelegateCPUKernel : public InnerKernel {
     }
   }
   // Train API
-  int Eval() override {
-    InnerKernel::Eval();
-    return conv_kernel_->Eval();
-  }
   int Train() override {
     InnerKernel::Train();
     return conv_kernel_->Train();
@@ -93,6 +93,10 @@ class ConvolutionDelegateCPUKernel : public InnerKernel {
     InnerKernel::SetTrainable(trainable);
     return conv_kernel_->SetTrainable(trainable);
   }
+  size_t workspace_size() override {
+    InnerKernel::workspace_size();
+    return conv_kernel_->workspace_size();
+  }
 
  protected:
   kernel::InnerKernel *conv_kernel_{nullptr};
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3_fp32.cc
index d7e090ba343..baaea60befc 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3_fp32.cc
@@ -27,6 +27,13 @@ namespace mindspore::kernel {
 int ConvolutionDepthwise3x3CPUKernel::Init() {
   CHECK_LESS_RETURN(in_tensors_.size(), C2NUM);
   CHECK_LESS_RETURN(out_tensors_.size(), 1);
+  if (op_parameter_->is_train_session_) {
+    auto weight_tensor = in_tensors_.at(kWeightIndex);
+    int channel = weight_tensor->Batch();
+    int c4 = UP_ROUND(channel, C4NUM);
+    int pack_weight_size = c4 * C12NUM;
+    set_workspace_size(pack_weight_size * sizeof(float));
+  }
   auto ret = InitConvWeightBias();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Convolution depthwise 3x3 fp32 InitConvWeightBias failed.";
@@ -104,25 +111,10 @@ int ConvolutionDepthwise3x3CPUKernel::Run() {
   return RET_OK;
 }
 
-int ConvolutionDepthwise3x3CPUKernel::Eval() {
-  auto ret = InnerKernel::Eval();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "eval failed!";
-    return ret;
-  }
-  if (IsTrainable()) {
-    if (InitConvWeightBias() != RET_OK) {
-      MS_LOG(ERROR) << "Convolution depthwise 3x3 fp32 Eval:InitWeightBias failed.";
-      return RET_ERROR;
-    }
-  }
-  return RET_OK;
-}
-
 void ConvolutionDepthwise3x3CPUKernel::PackWeight() {
   auto weight_tensor = in_tensors_.at(kWeightIndex);
   int channel = weight_tensor->Batch();
-  void *origin_weight = IsTrainable() ? weight_tensor->data_c() : origin_weight_;
+  void *origin_weight = (op_parameter_->is_train_session_) ? weight_tensor->data_c() : origin_weight_;
   MS_ASSERT(origin_weight != nullptr);
   PackWeightConvDw3x3Fp32(reinterpret_cast<float *>(origin_weight), reinterpret_cast<float *>(packed_weight_), channel);
 }
@@ -131,12 +123,14 @@ int ConvolutionDepthwise3x3CPUKernel::MallocWeightBiasData() {
   auto weight_tensor = in_tensors_.at(kWeightIndex);
   int channel = weight_tensor->Batch();
   int c4 = UP_ROUND(channel, C4NUM);
-  if (packed_weight_ == nullptr) {
-    int pack_weight_size = c4 * C12NUM;
-    packed_weight_ = malloc(pack_weight_size * sizeof(float));
+  int pack_weight_size = c4 * C12NUM;
+  if (!op_parameter_->is_train_session_) {
     if (packed_weight_ == nullptr) {
-      MS_LOG(ERROR) << "Malloc buffer failed.";
-      return RET_ERROR;
+      packed_weight_ = malloc(pack_weight_size * sizeof(float));
+      if (packed_weight_ == nullptr) {
+        MS_LOG(ERROR) << "Malloc buffer failed.";
+        return RET_ERROR;
+      }
     }
   }
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3_fp32.h
index bbed4403552..82785f1fbb2 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3_fp32.h
@@ -37,7 +37,6 @@ class ConvolutionDepthwise3x3CPUKernel : public ConvolutionBaseCPUKernel {
   int Run() override;
 
   int Execute(int task_id);
-  int Eval() override;
 
  private:
   int MallocWeightBiasData() override;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32.cc
index 5f3d171a311..48a5c2f4e86 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32.cc
@@ -22,9 +22,19 @@ using mindspore::lite::RET_INFER_INVALID;
 using mindspore::lite::RET_OK;
 
 namespace mindspore::kernel {
+
 int ConvolutionDepthwiseCPUKernel::Init() {
   CHECK_LESS_RETURN(in_tensors_.size(), C2NUM);
   CHECK_LESS_RETURN(out_tensors_.size(), 1);
+  if (op_parameter_->is_train_session_) {
+    auto weight_tensor = in_tensors_.at(kWeightIndex);
+    int pack_weight_size = weight_tensor->Batch() * weight_tensor->Height() * weight_tensor->Width();
+    if (pack_weight_size >= std::numeric_limits<int>::max() / static_cast<int>(sizeof(float))) {
+      MS_LOG(ERROR) << "pack_weight_size is invalid, pack_weight_size: " << pack_weight_size;
+      return RET_ERROR;
+    }
+    set_workspace_size(pack_weight_size * sizeof(float));
+  }
   auto ret = InitConvWeightBias();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Convolution depthwise fp32 InitConvWeightBias failed.";
@@ -89,7 +99,7 @@ int ConvolutionDepthwiseCPUKernel::Run() {
 
 void ConvolutionDepthwiseCPUKernel::PackWeight() {
   auto weight_tensor = in_tensors_.at(kWeightIndex);
-  void *origin_weight = IsTrainable() ? weight_tensor->data_c() : origin_weight_;
+  void *origin_weight = (op_parameter_->is_train_session_) ? weight_tensor->data_c() : origin_weight_;
   MS_ASSERT(origin_weight != nullptr);
   PackWeightKHWToHWKFp32(reinterpret_cast<float *>(origin_weight), reinterpret_cast<float *>(packed_weight_),
                          weight_tensor->Height() * weight_tensor->Width(), weight_tensor->Batch());
@@ -103,10 +113,12 @@ int ConvolutionDepthwiseCPUKernel::MallocWeightBiasData() {
     MS_LOG(ERROR) << "pack_weight_size is invalid, pack_weight_size: " << pack_weight_size;
     return RET_ERROR;
   }
-  packed_weight_ = malloc(pack_weight_size * sizeof(float));
-  if (packed_weight_ == nullptr) {
-    MS_LOG(ERROR) << "Malloc buffer failed.";
-    return RET_ERROR;
+  if (!op_parameter_->is_train_session_) {
+    packed_weight_ = malloc(pack_weight_size * sizeof(float));
+    if (packed_weight_ == nullptr) {
+      MS_LOG(ERROR) << "Malloc buffer failed.";
+      return RET_ERROR;
+    }
   }
 
   bias_data_ = malloc(channel * sizeof(float));
@@ -117,16 +129,4 @@ int ConvolutionDepthwiseCPUKernel::MallocWeightBiasData() {
   memset(bias_data_, 0, channel * sizeof(float));
   return RET_OK;
 }
-
-int ConvolutionDepthwiseCPUKernel::Eval() {
-  auto ret = InnerKernel::Eval();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "eval failed!";
-    return ret;
-  }
-  if (IsTrainable()) {
-    PackWeight();
-  }
-  return RET_OK;
-}
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32.h
index 622fe326136..e4b9a949bcc 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32.h
@@ -37,7 +37,6 @@ class ConvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel {
   int Run() override;
 
   int Execute(int task_id);
-  int Eval() override;
 
  private:
   int MallocWeightBiasData() override;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_indirect_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_indirect_fp32.cc
index 66ef6c781cb..d5d2aa5a3c2 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_indirect_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_indirect_fp32.cc
@@ -36,6 +36,17 @@ ConvolutionDepthwiseIndirectCPUKernel::~ConvolutionDepthwiseIndirectCPUKernel()
 int ConvolutionDepthwiseIndirectCPUKernel::Init() {
   CHECK_LESS_RETURN(in_tensors_.size(), C2NUM);
   CHECK_LESS_RETURN(out_tensors_.size(), 1);
+  if (op_parameter_->is_train_session_) {
+    auto weight_tensor = in_tensors_[kWeightIndex];
+#ifdef ENABLE_AVX
+    int div_flag = C8NUM;
+#else
+    int div_flag = C4NUM;
+#endif
+    int batch_flag = UP_DIV(weight_tensor->Batch(), div_flag);
+    int pack_weight_size = div_flag * batch_flag * weight_tensor->Height() * weight_tensor->Width();
+    set_workspace_size(pack_weight_size * sizeof(float));
+  }
   auto ret = InitConvWeightBias();
   if (ret != 0) {
     MS_LOG(ERROR) << "Convolution depthwise Indirect fp32 InitConvWeightBias failed.";
@@ -163,7 +174,7 @@ int ConvolutionDepthwiseIndirectCPUKernel::Run() {
 
 void ConvolutionDepthwiseIndirectCPUKernel::PackWeight() {
   auto weight_tensor = in_tensors_.at(kWeightIndex);
-  void *origin_weight = IsTrainable() ? weight_tensor->data_c() : origin_weight_;
+  void *origin_weight = (op_parameter_->is_train_session_) ? weight_tensor->data_c() : origin_weight_;
   MS_ASSERT(origin_weight != nullptr);
 #ifdef ENABLE_AVX
   PackDepthwiseIndirectWeightC8Fp32(reinterpret_cast<float *>(origin_weight), reinterpret_cast<float *>(packed_weight_),
@@ -183,10 +194,12 @@ int ConvolutionDepthwiseIndirectCPUKernel::MallocWeightBiasData() {
 #endif
   int batch_flag = UP_DIV(weight_tensor->Batch(), div_flag);
   int pack_weight_size = div_flag * batch_flag * weight_tensor->Height() * weight_tensor->Width();
-  packed_weight_ = malloc(pack_weight_size * sizeof(float));
-  if (packed_weight_ == nullptr) {
-    MS_LOG(ERROR) << "Malloc buffer failed.";
-    return RET_ERROR;
+  if (!op_parameter_->is_train_session_) {
+    packed_weight_ = malloc(pack_weight_size * sizeof(float));
+    if (packed_weight_ == nullptr) {
+      MS_LOG(ERROR) << "Malloc buffer failed.";
+      return RET_ERROR;
+    }
   }
   bias_data_ = malloc(batch_flag * div_flag * sizeof(float));
   if (bias_data_ == nullptr) {
@@ -205,15 +218,4 @@ int ConvolutionDepthwiseIndirectCPUKernel::MallocWeightBiasData() {
   return RET_OK;
 }
 
-int ConvolutionDepthwiseIndirectCPUKernel::Eval() {
-  auto ret = InnerKernel::Eval();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "eval failed!";
-    return ret;
-  }
-  if (IsTrainable()) {
-    PackWeight();
-  }
-  return RET_OK;
-}
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_indirect_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_indirect_fp32.h
index 1f404d5c5c4..f128735a6ba 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_indirect_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_indirect_fp32.h
@@ -36,7 +36,6 @@ class ConvolutionDepthwiseIndirectCPUKernel : public ConvolutionBaseCPUKernel {
   int Run() override;
 
   int Execute(int task_id);
-  int Eval() override;
 
  private:
   int MallocIndirectBuffer();
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow_fp32.cc
index b8f0475f921..169d0e275c2 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow_fp32.cc
@@ -59,7 +59,12 @@ int ConvolutionDepthwiseSWCPUKernel::Init() {
     MS_LOG(ERROR) << "new sliding window param failed.";
     return RET_ERROR;
   }
-
+  if (op_parameter_->is_train_session_) {
+    auto weight_tensor = in_tensors_.at(kWeightIndex);
+    int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM);
+    int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width();
+    set_workspace_size(pack_weight_size * sizeof(float));
+  }
   auto ret = InitConvWeightBias();
   if (ret != 0) {
     MS_LOG(ERROR) << "Convolution depthwise fp32 InitConvWeightBias failed.";
@@ -155,7 +160,7 @@ void ConvolutionDepthwiseSWCPUKernel::FreePackedInputOutput() {
 
 void ConvolutionDepthwiseSWCPUKernel::PackWeight() {
   auto weight_tensor = in_tensors_.at(kWeightIndex);
-  void *origin_weight = IsTrainable() ? weight_tensor->data_c() : origin_weight_;
+  void *origin_weight = (op_parameter_->is_train_session_) ? weight_tensor->data_c() : origin_weight_;
   MS_ASSERT(origin_weight != nullptr);
   PackNCHWToNC4HW4Fp32(reinterpret_cast<float *>(origin_weight), reinterpret_cast<float *>(packed_weight_), 1,
                        weight_tensor->Height() * weight_tensor->Width(), weight_tensor->Batch());
@@ -165,10 +170,12 @@ int ConvolutionDepthwiseSWCPUKernel::MallocWeightBiasData() {
   auto weight_tensor = in_tensors_.at(kWeightIndex);
   int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM);
   int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width();
-  packed_weight_ = malloc(pack_weight_size * sizeof(float));
-  if (packed_weight_ == nullptr) {
-    MS_LOG(ERROR) << "Malloc buffer failed.";
-    return RET_ERROR;
+  if (!op_parameter_->is_train_session_) {
+    packed_weight_ = malloc(pack_weight_size * sizeof(float));
+    if (packed_weight_ == nullptr) {
+      MS_LOG(ERROR) << "Malloc buffer failed.";
+      return RET_ERROR;
+    }
   }
   int malloc_size = MSMAX(conv_param_->output_channel_, C4NUM * OC4);
   if (malloc_size <= 0) {
@@ -185,15 +192,4 @@ int ConvolutionDepthwiseSWCPUKernel::MallocWeightBiasData() {
   return RET_OK;
 }
 
-int ConvolutionDepthwiseSWCPUKernel::Eval() {
-  auto ret = InnerKernel::Eval();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "eval failed!";
-    return ret;
-  }
-  if (IsTrainable()) {
-    PackWeight();
-  }
-  return RET_OK;
-}
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow_fp32.h
index f5294723bef..c82f2a72d96 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow_fp32.h
@@ -36,7 +36,6 @@ class ConvolutionDepthwiseSWCPUKernel : public ConvolutionBaseCPUKernel {
   int Run() override;
 
   int Execute(int task_id);
-  int Eval() override;
 
  private:
   int InitPackedInputOutput();
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow_x86_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow_x86_fp32.cc
index 2ffecf8d98c..8d24a02a019 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow_x86_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow_x86_fp32.cc
@@ -62,6 +62,12 @@ int ConvolutionDepthwiseSWCPUKernelX86::Init() {
 #ifdef ENABLE_AVX
   oc_tile_ = C8NUM;
 #endif
+  if (op_parameter_->is_train_session_) {
+    auto weight_tensor = in_tensors_.at(kWeightIndex);
+    int oc_algin = UP_DIV(weight_tensor->Batch(), oc_tile_);
+    int pack_weight_size = oc_algin * oc_tile_ * weight_tensor->Height() * weight_tensor->Width();
+    set_workspace_size(pack_weight_size * sizeof(float));
+  }
   sliding_ = new (std::nothrow) SlidingWindowParam;
   if (sliding_ == nullptr) {
     MS_LOG(ERROR) << "new sliding window param failed.";
@@ -169,10 +175,12 @@ int ConvolutionDepthwiseSWCPUKernelX86::MallocWeightBiasData() {
   auto weight_tensor = in_tensors_.at(kWeightIndex);
   int oc_algin = UP_DIV(weight_tensor->Batch(), oc_tile_);
   int pack_weight_size = oc_algin * oc_tile_ * weight_tensor->Height() * weight_tensor->Width();
-  packed_weight_ = malloc(pack_weight_size * sizeof(float));
-  if (packed_weight_ == nullptr) {
-    MS_LOG(ERROR) << "Malloc packed_weight_ is failed!";
-    return RET_NULL_PTR;
+  if (!op_parameter_->is_train_session_) {
+    packed_weight_ = malloc(pack_weight_size * sizeof(float));
+    if (packed_weight_ == nullptr) {
+      MS_LOG(ERROR) << "Malloc packed_weight_ is failed!";
+      return RET_NULL_PTR;
+    }
   }
 
   if (in_tensors_.size() == kInputSize2) {
@@ -187,16 +195,5 @@ int ConvolutionDepthwiseSWCPUKernelX86::MallocWeightBiasData() {
   return RET_OK;
 }
 
-int ConvolutionDepthwiseSWCPUKernelX86::Eval() {
-  auto ret = InnerKernel::Eval();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "eval failed!";
-    return ret;
-  }
-  if (IsTrainable()) {
-    PackWeight();
-  }
-  return RET_OK;
-}
 }  // namespace mindspore::kernel
 #endif
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow_x86_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow_x86_fp32.h
index c4bc1ffed67..62a351dbca2 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow_x86_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow_x86_fp32.h
@@ -36,7 +36,6 @@ class ConvolutionDepthwiseSWCPUKernelX86 : public ConvolutionBaseCPUKernel {
   int Run() override;
 
   int Execute(int task_id);
-  int Eval() override;
 
  private:
   void FreePackedInputOutput();
@@ -52,5 +51,5 @@ class ConvolutionDepthwiseSWCPUKernelX86 : public ConvolutionBaseCPUKernel {
 };
 }  // namespace mindspore::kernel
 
-#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_DEPTHWISE_SLIDEWINDOW_X86_FP32_H_
 #endif
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_DEPTHWISE_SLIDEWINDOW_X86_FP32_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.cc
index 54cca8410f4..0c89a76905b 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.cc
@@ -62,6 +62,15 @@ int ConvolutionCPUKernel::InitTmpBuffer() {
 int ConvolutionCPUKernel::Init() {
   CHECK_LESS_RETURN(in_tensors_.size(), C2NUM);
   CHECK_LESS_RETURN(out_tensors_.size(), 1);
+  if (op_parameter_->is_train_session_) {
+    auto filter_tensor = in_tensors_.at(kWeightIndex);
+    size_t in_channel = filter_tensor->Channel();
+    size_t out_channel = filter_tensor->Batch();
+    size_t oc_block_num = UP_ROUND(out_channel, OC_BLOCK);
+    size_t kernel_plane = filter_tensor->Height() * filter_tensor->Width();
+    size_t pack_weight_size = oc_block_num * in_channel * kernel_plane;
+    set_workspace_size(pack_weight_size * sizeof(float));
+  }
   auto ret = InitConvWeightBias();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Init weight bias failed.";
@@ -87,8 +96,18 @@ int ConvolutionCPUKernel::ReSize() {
 int ConvolutionCPUKernel::RunImpl(int task_id) {
   auto ori_input_data = reinterpret_cast<float *>(in_tensors_.at(kInputIndex)->data_c());
   auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->data_c());
-  ConvFp32(ori_input_data, packed_input_, reinterpret_cast<float *>(packed_weight_),
-           reinterpret_cast<float *>(bias_data_), col_major_input_, output_addr, task_id, conv_param_);
+  if (out_tensors()[0]->format() != NC4HW4) {
+    ConvFp32(ori_input_data, packed_input_, reinterpret_cast<float *>(packed_weight_),
+             reinterpret_cast<float *>(bias_data_), col_major_input_, output_addr, task_id, conv_param_);
+  } else {
+#if ENABLE_ARM64
+    ConvFp32OutNC4HW4(ori_input_data, packed_input_, reinterpret_cast<float *>(packed_weight_),
+                      reinterpret_cast<float *>(bias_data_), col_major_input_, output_addr, task_id, conv_param_);
+#else
+    MS_LOG(ERROR) << "ConvFp32OutNC4HW4 not implemented.";
+    return RET_ERROR;
+#endif
+  }
   return RET_OK;
 }
 
@@ -139,7 +158,7 @@ void ConvolutionCPUKernel::PackWeight() {
     MS_LOG(ERROR) << "get height and width from filter_tensor failed.";
     return;
   }
-  void *origin_weight = IsTrainable() ? filter_tensor->data_c() : origin_weight_;
+  void *origin_weight = (op_parameter_->is_train_session_) ? filter_tensor->data_c() : origin_weight_;
   MS_ASSERT(origin_weight != nullptr);
 #ifdef ENABLE_AVX
   RowMajor2Col16Major(reinterpret_cast<float *>(origin_weight), reinterpret_cast<float *>(packed_weight_), out_channel,
@@ -162,12 +181,14 @@ int ConvolutionCPUKernel::MallocWeightBiasData() {
   size_t oc_block_num = UP_ROUND(out_channel, OC_BLOCK);
   size_t kernel_plane = filter_tensor->Height() * filter_tensor->Width();
   size_t pack_weight_size = oc_block_num * in_channel * kernel_plane;
-  packed_weight_ = malloc(pack_weight_size * sizeof(float));
-  if (packed_weight_ == nullptr) {
-    MS_LOG(ERROR) << "malloc packed weight failed.";
-    return RET_ERROR;
+  if (!op_parameter_->is_train_session_) {
+    packed_weight_ = malloc(pack_weight_size * sizeof(float));
+    if (packed_weight_ == nullptr) {
+      MS_LOG(ERROR) << "malloc packed weight failed.";
+      return RET_ERROR;
+    }
+    memset(packed_weight_, 0, pack_weight_size * sizeof(float));
   }
-  memset(packed_weight_, 0, pack_weight_size * sizeof(float));
 
   bias_data_ = malloc(oc_block_num * sizeof(float));
   if (bias_data_ == nullptr) {
@@ -178,11 +199,4 @@ int ConvolutionCPUKernel::MallocWeightBiasData() {
   return RET_OK;
 }
 
-int ConvolutionCPUKernel::Eval() {
-  InnerKernel::Eval();
-  if (IsTrainable()) {
-    PackWeight();
-  }
-  return RET_OK;
-}
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.h
index 64d070f5ef4..5c8417ad7fa 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.h
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_H_
-#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_H_
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_FP32_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_FP32_H_
 
 #include <vector>
 #include "src/inner_kernel.h"
@@ -37,8 +37,6 @@ class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel {
   int Run() override;
   virtual int RunImpl(int task_id);
 
-  int Eval() override;
-
  protected:
   int MallocWeightBiasData() override;
   void PackWeight() override;
@@ -59,4 +57,4 @@ class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel {
 };
 }  // namespace mindspore::kernel
 
-#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_H_
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_FP32_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow_fp32.cc
index 8e2ab33b3a6..127c0160cbc 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow_fp32.cc
@@ -36,6 +36,17 @@ int ConvolutionSWCPUKernel::Init() {
     in_tile_ = C8NUM;
     ic_res_ = conv_param_->input_channel_ % in_tile_;
   }
+  if (op_parameter_->is_train_session_) {
+    auto filter_tensor = in_tensors_.at(kWeightIndex);
+    auto input_channel = filter_tensor->Channel();
+    auto output_channel = filter_tensor->Batch();
+    int kernel_h = filter_tensor->Height();
+    int kernel_w = filter_tensor->Width();
+    int kernel_plane = kernel_h * kernel_w;
+    int oc_block_num = UP_DIV(output_channel, oc_tile_);
+    int pack_weight_size = oc_block_num * oc_tile_ * input_channel * kernel_plane;
+    set_workspace_size(pack_weight_size * sizeof(float));
+  }
   auto ret = InitConvWeightBias();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Init weight bias failed.";
@@ -64,7 +75,6 @@ int ConvolutionSWCPUKernel::ReSize() {
     MS_LOG(ERROR) << "ConvolutionBase init failed.";
     return RET_ERROR;
   }
-
   // init sliding window param
   slidingWindow_param_ = new (std::nothrow) SlidingWindowParam;
   if (slidingWindow_param_ == nullptr) {
@@ -175,7 +185,7 @@ void ConvolutionSWCPUKernel::PackWeight() {
   int kernel_h = filter_tensor->Height();
   int kernel_w = filter_tensor->Width();
   int oc_block_num = UP_DIV(output_channel, oc_tile_);
-  void *origin_weight = IsTrainable() ? filter_tensor->data_c() : origin_weight_;
+  void *origin_weight = (op_parameter_->is_train_session_) ? filter_tensor->data_c() : origin_weight_;
   MS_ASSERT(origin_weight != nullptr);
   PackNHWCToNXHWCXFp32(kernel_h, kernel_w, output_channel, oc_block_num, input_channel,
                        reinterpret_cast<float *>(packed_weight_), reinterpret_cast<float *>(origin_weight));
@@ -192,12 +202,14 @@ int ConvolutionSWCPUKernel::MallocWeightBiasData() {
   int kernel_plane = kernel_h * kernel_w;
   int oc_block_num = UP_DIV(output_channel, oc_tile_);
   int pack_weight_size = oc_block_num * oc_tile_ * input_channel * kernel_plane;
-  packed_weight_ = malloc(pack_weight_size * sizeof(float));
-  if (packed_weight_ == nullptr) {
-    MS_LOG(ERROR) << "malloc packed weight failed.";
-    return RET_NULL_PTR;
+  if (!op_parameter_->is_train_session_) {
+    packed_weight_ = malloc(pack_weight_size * sizeof(float));
+    if (packed_weight_ == nullptr) {
+      MS_LOG(ERROR) << "malloc packed weight failed.";
+      return RET_NULL_PTR;
+    }
+    memset(packed_weight_, 0, pack_weight_size * sizeof(float));
   }
-  memset(packed_weight_, 0, pack_weight_size * sizeof(float));
 
   if (in_tensors_.size() == kInputSize2) {
     bias_data_ = malloc(oc_block_num * oc_tile_ * sizeof(float));
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd_fp32.cc
index 08fb239ff58..21a4786ad05 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd_fp32.cc
@@ -21,6 +21,7 @@
 
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_MEMORY_FAILED;
+using mindspore::lite::RET_NULL_PTR;
 using mindspore::lite::RET_OK;
 
 namespace mindspore::kernel {
@@ -102,7 +103,14 @@ int ConvolutionWinogradCPUKernel::Init() {
   input_unit_ = output_unit_ + kernel_unit_ - 1;
   conv_param_->input_unit_ = input_unit_;
   conv_param_->output_unit_ = output_unit_;
-
+  if (op_parameter_->is_train_session_) {
+    auto filter_tensor = in_tensors_.at(kWeightIndex);
+    int in_channel = filter_tensor->Channel();
+    int out_channel = filter_tensor->Batch();
+    auto trans_matrix_data_size =
+      input_unit_ * input_unit_ * in_channel * UP_ROUND(out_channel, oc_block_) * sizeof(float);
+    set_workspace_size(trans_matrix_data_size);
+  }
   auto ret = InitConvWeightBias();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Init weight bias failed.";
@@ -127,15 +135,16 @@ int ConvolutionWinogradCPUKernel::ReSize() {
     MS_LOG(ERROR) << "ConfigInputOutput failed.";
     return RET_ERROR;
   }
+  conv_param_->out_format_ = out_tensors_[0]->format();
   return RET_OK;
 }
 
 int ConvolutionWinogradCPUKernel::RunImpl(int task_id) {
   auto input_tensor = in_tensors_.at(kInputIndex);
   auto ori_input_data = reinterpret_cast<float *>(input_tensor->data_c());
-  MS_ASSERT(ori_input_data != nullptr);
+  CHECK_NULL_RETURN(ori_input_data);
   auto output_data = reinterpret_cast<float *>(out_tensors_.front()->data_c());
-  MS_ASSERT(output_data != nullptr);
+  CHECK_NULL_RETURN(output_data);
   ConvWinogardFp32(ori_input_data, reinterpret_cast<float *>(packed_weight_),
                    reinterpret_cast<const float *>(bias_data_), output_data, tmp_buffer_address_list_, task_id,
                    conv_param_, in_func_, out_func_);
@@ -191,14 +200,16 @@ int ConvolutionWinogradCPUKernel::MallocWeightBiasData() {
   // set data
   auto trans_matrix_data_size =
     input_unit_ * input_unit_ * in_channel * UP_ROUND(out_channel, oc_block_) * sizeof(float);
-  if (packed_weight_ == nullptr) {
-    packed_weight_ = malloc(trans_matrix_data_size);
+  if (!op_parameter_->is_train_session_) {
     if (packed_weight_ == nullptr) {
-      MS_LOG(ERROR) << "malloc matrix_buffer failed.";
-      return RET_MEMORY_FAILED;
+      packed_weight_ = malloc(trans_matrix_data_size);
+      if (packed_weight_ == nullptr) {
+        MS_LOG(ERROR) << "malloc matrix_buffer failed.";
+        return RET_MEMORY_FAILED;
+      }
     }
+    memset(packed_weight_, 0, trans_matrix_data_size);
   }
-  memset(packed_weight_, 0, trans_matrix_data_size);
 
   float matrix_a[64];
   float matrix_at[64];
@@ -230,24 +241,9 @@ int ConvolutionWinogradCPUKernel::MallocWeightBiasData() {
 
 void ConvolutionWinogradCPUKernel::PackWeight() {
   auto weight_tensor = in_tensors_.at(kWeightIndex);
-  void *origin_weight = IsTrainable() ? weight_tensor->data_c() : origin_weight_;
+  void *origin_weight = (op_parameter_->is_train_session_) ? weight_tensor->data_c() : origin_weight_;
   MS_ASSERT(origin_weight != nullptr);
   WinogradFilterTransform(reinterpret_cast<float *>(origin_weight), matrix_g_, matrix_gt_, oc_block_);
 }
 
-int ConvolutionWinogradCPUKernel::Eval() {
-  auto ret = InnerKernel::Eval();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "eval failed!";
-    return ret;
-  }
-  if (IsTrainable()) {
-    ret = InitConvWeightBias();
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "Init weight bias failed.";
-      return RET_ERROR;
-    }
-  }
-  return RET_OK;
-}
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd_fp32.h
index 9fd402a7bc2..306f851eaea 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd_fp32.h
@@ -36,7 +36,6 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
   int Init() override;
   int ReSize() override;
   int Run() override;
-  int Eval() override;
   int RunImpl(int task_id);
   int InitTmpBuffer();
   int ConfigInputOutput();
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise_fp32.cc
index b798512d1ab..d7d565858f6 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise_fp32.cc
@@ -72,7 +72,12 @@ int DeconvolutionDepthwiseCPUKernel::Init() {
     MS_LOG(ERROR) << "new sliding window param failed.";
     return RET_ERROR;
   }
-
+  if (op_parameter_->is_train_session_) {
+    auto weight_tensor = in_tensors_.at(kWeightIndex);
+    int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM);
+    int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width();
+    set_workspace_size(pack_weight_size * sizeof(float));
+  }
   auto ret = InitConvWeightBias();
   if (ret != 0) {
     MS_LOG(ERROR) << "Deconvolution depthwise fp32 InitConvWeightBias failed.ret: " << ret;
@@ -165,10 +170,12 @@ int DeconvolutionDepthwiseCPUKernel::MallocWeightBiasData() {
   auto weight_tensor = in_tensors_.at(kWeightIndex);
   int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM);
   int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width();
-  packed_weight_ = malloc(pack_weight_size * sizeof(float));
-  if (packed_weight_ == nullptr) {
-    MS_LOG(ERROR) << "Malloc buffer failed.";
-    return RET_ERROR;
+  if (!op_parameter_->is_train_session_) {
+    packed_weight_ = malloc(pack_weight_size * sizeof(float));
+    if (packed_weight_ == nullptr) {
+      MS_LOG(ERROR) << "Malloc buffer failed.";
+      return RET_ERROR;
+    }
   }
 
   bias_data_ = malloc(C4NUM * OC4 * sizeof(float));
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_fp32.cc
index 9b118687cfb..375c43cbeee 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_fp32.cc
@@ -55,13 +55,15 @@ int DeConvolutionCPUKernel::MallocWeightBiasData() {
   auto kernel_h_ = weight_tensor->Height();
   auto kernel_w_ = weight_tensor->Width();
   int output_aligned_size = UP_ROUND(output_channel, C8NUM);
-  size_t weight_pack_size = input_channel * kernel_w_ * kernel_h_ * output_aligned_size * sizeof(float);
-  packed_weight_ = MallocAlignedData(C32NUM, weight_pack_size);
-  if (packed_weight_ == nullptr) {
-    MS_LOG(ERROR) << "deconv malloc packed_weight_ error!";
-    return RET_ERROR;
+  size_t pack_weight_size = input_channel * kernel_w_ * kernel_h_ * output_aligned_size * sizeof(float);
+  if (!op_parameter_->is_train_session_) {
+    packed_weight_ = MallocAlignedData(C32NUM, pack_weight_size);
+    if (packed_weight_ == nullptr) {
+      MS_LOG(ERROR) << "deconv malloc packed_weight_ error!";
+      return RET_ERROR;
+    }
+    memset(packed_weight_, 0, pack_weight_size);
   }
-  memset(packed_weight_, 0, weight_pack_size);
 
   bias_data_ = MallocAlignedData(C32NUM, output_aligned_size * sizeof(float));
   if (bias_data_ == nullptr) {
@@ -161,6 +163,16 @@ int DeConvolutionCPUKernel::Init() {
 #else
   row_tile_ = C12NUM;
 #endif
+  if (op_parameter_->is_train_session_) {
+    auto weight_tensor = in_tensors_.at(kWeightIndex);
+    auto input_channel = weight_tensor->Batch();
+    auto output_channel = weight_tensor->Channel();
+    auto kernel_h_ = weight_tensor->Height();
+    auto kernel_w_ = weight_tensor->Width();
+    int output_aligned_size = UP_ROUND(output_channel, C8NUM);
+    size_t pack_weight_size = input_channel * kernel_w_ * kernel_h_ * output_aligned_size * sizeof(float);
+    set_workspace_size(pack_weight_size);
+  }
   matmul_param_ = new (std::nothrow) MatMulParameter();
   if (matmul_param_ == nullptr) {
     MS_LOG(ERROR) << "Memory allocation failed";
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/instance_norm_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/instance_norm_fp32.cc
index f18162d392b..aeed5900f02 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/instance_norm_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/instance_norm_fp32.cc
@@ -37,15 +37,20 @@ int InstanceNormCPUKernel::Init() {
 
 int InstanceNormCPUKernel::ReSize() {
   param_->op_parameter_.thread_num_ = op_parameter_->thread_num_;
-  auto shape = in_tensors_.front()->shape();
-  param_->batch_ = shape[0];
-  param_->inner_size_ = shape[2] * shape[3];
-  param_->channel_ = shape[1];
+  auto in_tensor = in_tensors_.front();
+  param_->batch_ = in_tensor->Batch();
+  param_->inner_size_ = in_tensor->Height() * in_tensor->Width();
+  param_->channel_ = in_tensor->Channel();
   return RET_OK;
 }
 
 int InstanceNormCPUKernel::DoInstanceNorm(int task_id) {
-  int ret = InstanceNorm(src_data_, dst_data_, gamma_data_, beta_data_, param_, task_id);
+  int ret = 0;
+  if (in_tensors_[0]->format() == NC4HW4) {
+    ret = InstanceNormNC4HW4(src_data_, dst_data_, gamma_data_, beta_data_, param_, task_id);
+  } else {
+    ret = InstanceNorm(src_data_, dst_data_, gamma_data_, beta_data_, param_, task_id);
+  }
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "DoInstanceNorm error error_code[" << ret << "]";
     return ret;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/lsh_projection_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/lsh_projection_fp32.cc
deleted file mode 100644
index f63e01d85a1..00000000000
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/lsh_projection_fp32.cc
+++ /dev/null
@@ -1,161 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "src/runtime/kernel/arm/fp32/lsh_projection_fp32.h"
-
-#include "include/errorcode.h"
-#include "src/common/string_util.h"
-#include "src/kernel_registry.h"
-
-using mindspore::kernel::KERNEL_ARCH;
-using mindspore::lite::KernelRegistrar;
-using mindspore::lite::RET_ERROR;
-using mindspore::lite::RET_OK;
-using mindspore::schema::PrimitiveType_LshProjection;
-
-namespace mindspore::kernel {
-int LshProjectionCPUKernel::Init() {
-  if (!InferShapeDone()) {
-    return RET_OK;
-  }
-  return ReSize();
-}
-
-int LshProjectionCPUKernel::ReSize() { return RET_OK; }
-
-int LshProjectionRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
-  auto kernel = reinterpret_cast<LshProjectionCPUKernel *>(cdata);
-  return kernel->DoExecute(task_id);
-}
-
-int LshProjectionCPUKernel::Run() {
-  auto input0_tensor = in_tensors_.at(0);
-  auto input1_tensor = in_tensors_.at(1);
-  auto out_tensor = out_tensors_.at(0);
-
-  hash_seed_ = reinterpret_cast<float *>(input0_tensor->MutableData());
-  feature_ = reinterpret_cast<int32_t *>(input1_tensor->MutableData());
-  weight_ = in_tensors_.size() == 2 ? nullptr : reinterpret_cast<float *>(in_tensors_.at(2)->MutableData());
-  output_ = reinterpret_cast<int32_t *>(out_tensor->MutableData());
-
-  param_->hash_buff_size_ = sizeof(float) + sizeof(int32_t);
-  param_->feature_num_ = input1_tensor->ElementsNum();
-  param_->hash_shape_[0] = input0_tensor->DimensionSize(0);
-  param_->hash_shape_[1] = input0_tensor->DimensionSize(1);
-  param_->thread_stride_ = op_parameter_->thread_num_ > 1 ? UP_DIV(param_->hash_shape_[0], op_parameter_->thread_num_)
-                                                          : param_->hash_shape_[0];
-  auto ret = MallocKeys();
-  if (ret != RET_OK) {
-    return ret;
-  }
-  ret = ParallelLaunch(this->ms_context_, LshProjectionRun, this, op_parameter_->thread_num_);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "LshProjection kernel parallel launch failed";
-  }
-  FreeKeys();
-  return ret;
-}
-
-int LshProjectionCPUKernel::MallocKeys() {
-  param_->hash_buffs_ =
-    static_cast<char **>(ms_context_->allocator->Malloc(op_parameter_->thread_num_ * sizeof(char *)));
-  if (param_->hash_buffs_ == nullptr) {
-    MS_LOG(ERROR) << "Memory allocation failed";
-    return RET_ERROR;
-  }
-  for (int i = 0; i < op_parameter_->thread_num_; i++) {
-    param_->hash_buffs_[i] = static_cast<char *>(ms_context_->allocator->Malloc(param_->hash_buff_size_));
-    if (param_->hash_buffs_[i] == nullptr) {
-      FreeKeys();
-      MS_LOG(ERROR) << "Memory allocation failed";
-      return RET_ERROR;
-    }
-  }
-  return RET_OK;
-}
-
-void LshProjectionCPUKernel::FreeKeys() {
-  if (param_->hash_buffs_ != nullptr) {
-    for (int i = 0; i < op_parameter_->thread_num_; i++) {
-      ms_context_->allocator->Free(param_->hash_buffs_[i]);
-      param_->hash_buffs_[i] = nullptr;
-    }
-    ms_context_->allocator->Free(param_->hash_buffs_);
-    param_->hash_buffs_ = nullptr;
-  }
-}
-
-int LshProjectionCPUKernel::DoExecute(int task_id) {
-  int cur_group_num = MSMIN(param_->hash_shape_[0] - task_id * param_->thread_stride_, param_->thread_stride_);
-  int start = task_id * param_->thread_stride_;
-  int end = start + cur_group_num;
-  char *hash_buff = param_->hash_buffs_[task_id];
-
-  switch (param_->lsh_type_) {
-    case schema::LshProjectionType_SPARSE:
-      LshProjectionSparse(hash_seed_, feature_, weight_, output_, param_, start, end, hash_buff);
-      break;
-    case schema::LshProjectionType_DENSE:
-      LshProjectionDense(hash_seed_, feature_, weight_, output_, param_, start, end, hash_buff);
-      break;
-    default:
-      return RET_ERROR;
-  }
-  return RET_OK;
-}
-
-int LshProjectionCPUKernel::GetSignBit(int32_t *feature, float *weight, float seed, LshProjectionParameter *para,
-                                       char *hash_buff) {
-  double score = 0.0;
-  for (int i = 0; i < para->feature_num_; i++) {
-    memcpy(hash_buff, &seed, sizeof(float));
-    memcpy(hash_buff + sizeof(float), &(feature[i]), sizeof(int32_t));
-    int64_t hash_i = static_cast<int64_t>(lite::StringHash64(hash_buff, para->hash_buff_size_));
-    double hash_d = static_cast<double>(hash_i);
-    if (weight == nullptr) {
-      score += hash_d;
-    } else {
-      score += weight[i] * hash_d;
-    }
-  }
-  return (score > 0) ? 1 : 0;
-}
-
-void LshProjectionCPUKernel::LshProjectionSparse(float *hashSeed, int32_t *feature, float *weight, int32_t *output,
-                                                 LshProjectionParameter *para, int32_t start, int32_t end,
-                                                 char *hash_buff) {
-  for (int i = start; i < end; i++) {
-    int32_t hash_sign = 0;
-    for (int j = 0; j < para->hash_shape_[1]; j++) {
-      int bit = GetSignBit(feature, weight, hashSeed[i * para->hash_shape_[1] + j], para, hash_buff);
-      hash_sign = (hash_sign << 1) | bit;
-    }
-    output[i] = hash_sign + i * (1 << para->hash_shape_[1]);
-  }
-}
-
-void LshProjectionCPUKernel::LshProjectionDense(float *hashSeed, int32_t *feature, float *weight, int32_t *output,
-                                                LshProjectionParameter *para, int32_t start, int32_t end,
-                                                char *hash_buff) {
-  for (int i = start; i < end; i++) {
-    for (int j = 0; j < para->hash_shape_[1]; j++) {
-      output[i * para->hash_shape_[1] + j] =
-        GetSignBit(feature, weight, hashSeed[i * para->hash_shape_[1] + j], para, hash_buff);
-    }
-  }
-}
-
-REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_LshProjection, LiteKernelCreator<LshProjectionCPUKernel>)
-}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/lsh_projection_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/lsh_projection_fp32.h
deleted file mode 100644
index b0aebb58a10..00000000000
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/lsh_projection_fp32.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_LSH_PROJECTION_H_
-#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_LSH_PROJECTION_H_
-
-#include <vector>
-
-#include "nnacl/lsh_projection_parameter.h"
-#include "src/inner_kernel.h"
-
-namespace mindspore::kernel {
-class LshProjectionCPUKernel : public InnerKernel {
- public:
-  LshProjectionCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                         const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
-      : InnerKernel(parameter, inputs, outputs, ctx) {
-    param_ = reinterpret_cast<LshProjectionParameter *>(op_parameter_);
-  }
-  ~LshProjectionCPUKernel() = default;
-
-  int Init() override;
-  int ReSize() override;
-  int Run() override;
-  int DoExecute(int task_id);
-
- private:
-  int MallocKeys();
-  void FreeKeys();
-  int GetSignBit(int32_t *feature, float *weight, float seed, LshProjectionParameter *para, char *hash_buff);
-  void LshProjectionSparse(float *hashSeed, int32_t *feature, float *weight, int32_t *output,
-                           LshProjectionParameter *param, int32_t start, int32_t end, char *hash_buff);
-  void LshProjectionDense(float *hashSeed, int32_t *feature, float *weight, int32_t *output,
-                          LshProjectionParameter *param, int32_t start, int32_t end, char *hash_buff);
-  LshProjectionParameter *param_ = nullptr;
-  float *hash_seed_ = nullptr;
-  int32_t *feature_ = nullptr;
-  float *weight_ = nullptr;
-  int32_t *output_ = nullptr;
-};
-}  // namespace mindspore::kernel
-
-#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_LSH_PROJECTION_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc
index f1e6da7ac25..bd8cf92c7a2 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc
@@ -78,8 +78,8 @@ int MatmulFp32BaseCPUKernel::InitBufferA() {
   if (op_parameter_->is_train_session_) {
     a_pack_ptr_ = reinterpret_cast<float *>(workspace());
   } else {
-    a_pack_ptr_ =
-      reinterpret_cast<float *>(ms_context_->allocator->Malloc(matrix_a_pack_size_ * static_cast<int>(sizeof(float))));
+    a_pack_ptr_ = reinterpret_cast<float *>(
+      ms_context_->allocator->Malloc(static_cast<size_t>(matrix_a_pack_size_) * sizeof(float)));
   }
   if (a_pack_ptr_ == nullptr) {
     MS_LOG(ERROR) << "malloc a_pack_ptr_ failed";
@@ -95,8 +95,8 @@ int MatmulFp32BaseCPUKernel::InitBufferB() {
   if (op_parameter_->is_train_session_) {
     b_pack_ptr_ = reinterpret_cast<float *>(workspace()) + matrix_a_pack_size_;
   } else {
-    b_pack_ptr_ =
-      reinterpret_cast<float *>(ms_context_->allocator->Malloc(matrix_b_pack_size_ * static_cast<int>(sizeof(float))));
+    b_pack_ptr_ = reinterpret_cast<float *>(
+      ms_context_->allocator->Malloc(static_cast<size_t>(matrix_b_pack_size_) * sizeof(float)));
   }
   if (b_pack_ptr_ == nullptr) {
     MS_LOG(ERROR) << "malloc b_pack_ptr_ failed";
@@ -128,7 +128,7 @@ int MatmulFp32BaseCPUKernel::CalBroadCastBiasDataElements() {
 int MatmulFp32BaseCPUKernel::InitBiasData() {
   if (in_tensors_.size() == 3) {
     auto bias_tensor = in_tensors_[2];
-    int max_bias_data = UP_ROUND(bias_tensor->ElementsNum(), col_tile_);
+    size_t max_bias_data = UP_ROUND(bias_tensor->ElementsNum(), col_tile_);
     // malloc addr need to aligned to 32 bytes
     bias_ptr_ = reinterpret_cast<float *>(malloc(max_bias_data * static_cast<int>(sizeof(float))));
     if (bias_ptr_ == nullptr) {
@@ -140,7 +140,7 @@ int MatmulFp32BaseCPUKernel::InitBiasData() {
       max_bias_data = CalBroadCastBiasDataElements();
       float broadcast_data = (reinterpret_cast<float *>(bias_tensor->data_c()))[0];
       // broadcast bias data
-      for (int i = 0; i < max_bias_data; ++i) {
+      for (size_t i = 0; i < max_bias_data; ++i) {
         bias_ptr_[i] = broadcast_data;
       }
     } else {
@@ -404,6 +404,7 @@ int MatmulFp32BaseCPUKernel::InitTmpOutBuffer() {
 int MatmulFp32BaseCPUKernel::Run() {
   if (!params_->a_const_) {
     auto a_ptr = reinterpret_cast<float *>(in_tensors_.at(0)->data_c());
+    CHECK_NULL_RETURN(a_ptr);
     if (RET_OK != InitBufferA()) {
       return RET_ERROR;
     }
@@ -415,6 +416,7 @@ int MatmulFp32BaseCPUKernel::Run() {
   }
   if (!params_->b_const_) {
     auto b_ptr = reinterpret_cast<float *>(in_tensors_.at(1)->data_c());
+    CHECK_NULL_RETURN(b_ptr);
     if (RET_OK != InitBufferB()) {
       FreeResizeBufA();
       return RET_ERROR;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/pad_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/pad_fp32.cc
index f3b6c1d0295..5c8d9460dac 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/pad_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/pad_fp32.cc
@@ -29,7 +29,6 @@ namespace mindspore::kernel {
 namespace {
 constexpr size_t kMirrorPadInputSize = 2;
 constexpr size_t kPadCommonInputSize = 2;
-constexpr size_t kPadMaxInputSize = 3;
 }  // namespace
 int PadCPUKernel::Init() {
   CHECK_LESS_RETURN(in_tensors_.size(), 1);
@@ -397,9 +396,6 @@ int PadCPUKernel::Run() {
         return RET_ERROR;
       }
     }
-    if (in_tensors_.size() == kPadMaxInputSize) {
-      pad_param_->constant_value_ = reinterpret_cast<float *>(in_tensors_.at(2)->data_c())[0];
-    }
     auto output = out_tensors_.at(0);
     int output_size = output->ElementsNum();
     auto output_data = reinterpret_cast<float *>(output->data_c());
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/skip_gram_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/skip_gram_fp32.cc
deleted file mode 100644
index cdb35fb8910..00000000000
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/skip_gram_fp32.cc
+++ /dev/null
@@ -1,107 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "src/runtime/kernel/arm/fp32/skip_gram_fp32.h"
-
-#include "include/errorcode.h"
-#include "src/kernel_registry.h"
-
-using mindspore::lite::KernelRegistrar;
-using mindspore::lite::RET_ERROR;
-using mindspore::lite::RET_OK;
-using mindspore::lite::StringPack;
-using mindspore::schema::PrimitiveType_SkipGram;
-
-namespace mindspore::kernel {
-int SkipGramCPUKernel::Init() {
-  if (!InferShapeDone()) {
-    return RET_OK;
-  }
-  return ReSize();
-}
-
-int SkipGramCPUKernel::ReSize() { return RET_OK; }
-
-void ParseSentenceToWords(const StringPack &sentence, std::vector<StringPack> *words) {
-  int pre = 0;
-  int i;
-  for (i = 0; i < sentence.len; i++) {
-    if (sentence.data[i] != ' ') {
-      pre = i;
-      break;
-    }
-  }
-  for (; i < sentence.len; i++) {
-    if (sentence.data[i] == ' ') {
-      if (sentence.data[pre] != ' ') {
-        words->push_back({i - pre, sentence.data + pre});
-      }
-      pre = i + 1;
-    }
-  }
-  if (sentence.data[sentence.len - 1] != ' ') {
-    words->push_back({sentence.len - pre, sentence.data + pre});
-  }
-}
-
-int SkipGramCPUKernel::Run() {
-  skip_gram_parameter_ = reinterpret_cast<SkipGramParameter *>(op_parameter_);
-  MS_ASSERT(skip_gram_parameter_);
-  if (skip_gram_parameter_->ngram_size < 1) {
-    MS_LOG(ERROR) << "Skip Gram Parameter Error, NgramSize should be at least 1, get "
-                  << skip_gram_parameter_->ngram_size;
-    return RET_ERROR;
-  }
-
-  StringPack sentence = mindspore::lite::ParseTensorBuffer(in_tensors_.at(0)).at(0);
-  std::vector<StringPack> words;
-  ParseSentenceToWords(sentence, &words);
-
-  std::vector<std::vector<StringPack>> result;
-  std::vector<int> stack(skip_gram_parameter_->ngram_size, 0);
-
-  int index = 1;
-  int size = words.size();
-  while (index >= 0) {
-    if (index < skip_gram_parameter_->ngram_size && stack.at(index) + 1 < size &&
-        (index == 0 || stack.at(index) - stack.at(index - 1) <= skip_gram_parameter_->max_skip_size)) {
-      stack.at(index)++;
-      index++;
-      if (index < skip_gram_parameter_->ngram_size) {
-        stack.at(index) = stack.at(index - 1);
-      }
-    } else {
-      if (index > 0 && ((skip_gram_parameter_->include_all_ngrams && index <= skip_gram_parameter_->ngram_size) ||
-                        (!skip_gram_parameter_->include_all_ngrams && index == skip_gram_parameter_->ngram_size))) {
-        std::vector<StringPack> gram(2 * index - 1);
-        char blank[1] = {' '};
-        StringPack blank_str = {1, blank};
-        for (int i = 0; i < 2 * index - 2; i += 2) {
-          gram.at(i) = words.at(stack.at(i / 2));
-          gram.at(i + 1) = blank_str;
-        }
-        gram.at(2 * index - 2) = words.at(stack.at(index - 1));
-        result.push_back(gram);
-      }
-      index--;
-    }
-  }
-  auto ret = mindspore::lite::WriteSeperatedStringsToTensor(out_tensors_.at(0), result);
-  return ret;
-}
-
-REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_SkipGram, LiteKernelCreator<SkipGramCPUKernel>)
-}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/skip_gram_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/skip_gram_fp32.h
deleted file mode 100644
index ae1682da1df..00000000000
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/skip_gram_fp32.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_SKIP_GRAM_H_
-#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_SKIP_GRAM_H_
-
-#include <vector>
-#include "src/inner_kernel.h"
-#include "nnacl/skip_gram_parameter.h"
-#include "src/common/string_util.h"
-
-namespace mindspore::kernel {
-
-class SkipGramCPUKernel : public InnerKernel {
- public:
-  explicit SkipGramCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                             const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
-      : InnerKernel(parameter, inputs, outputs, ctx), ctx_(ctx), thread_count_(ctx->thread_num_) {}
-  ~SkipGramCPUKernel() override = default;
-
-  int Init() override;
-  int ReSize() override;
-  int Run() override;
-  int DoExcute(int task_id);
-
- protected:
-  const lite::InnerContext *ctx_ = nullptr;
-  int thread_count_ = 1;
-  SkipGramParameter *skip_gram_parameter_ = nullptr;
-};
-
-}  // namespace mindspore::kernel
-
-#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_SKIP_GRAM_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/tensor_array_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/tensor_array_fp32.cc
deleted file mode 100644
index 5d01f1389bf..00000000000
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/tensor_array_fp32.cc
+++ /dev/null
@@ -1,148 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <vector>
-#include <memory>
-#include "include/errorcode.h"
-#include "src/runtime/kernel/arm/fp32/tensor_array_fp32.h"
-#include "schema/model_generated.h"
-#include "src/kernel_registry.h"
-#include "src/tensorlist.h"
-#include "src/common/log_util.h"
-
-using mindspore::kernel::KERNEL_ARCH;
-using mindspore::lite::KernelRegistrar;
-using mindspore::lite::RET_ERROR;
-using mindspore::lite::RET_OK;
-using mindspore::schema::PrimitiveType_TensorArray;
-using mindspore::schema::PrimitiveType_TensorArrayRead;
-using mindspore::schema::PrimitiveType_TensorArrayWrite;
-
-namespace mindspore::kernel {
-constexpr int kTensorArrayReadInSize = 3;
-constexpr int kTensorArrayWriteInSize = 4;
-constexpr int kHandleIndex = 0;
-// input index for tensor arrya write/read
-constexpr int kIndexInputIdx = 1;
-constexpr int kValueIndex = 2;
-
-int TensorArrayCPUKernel::Init() {
-  MSLITE_CHECK_PTR(this->ta_param_);
-  int *element_shape = this->ta_param_->element_shape_;
-  MSLITE_CHECK_PTR(element_shape);
-  int element_shape_size = this->ta_param_->element_shape_size_;
-  // element shape to vector
-  std::vector<int> element_shape_v(element_shape, element_shape + element_shape_size);
-  // check inputs' size
-  if (this->in_tensors_.size() != 1) {
-    MS_LOG(ERROR) << "invalid number of tensor array!";
-    return RET_ERROR;
-  }
-  // get size from input
-  lite::Tensor *input = InnerKernel::in_tensors_.at(kInputIndex);
-  // check input tensor's datatype is int or not
-  if (input->data_type() != TypeId::kNumberTypeInt32 || input->ElementsNum() != 1) {
-    MS_LOG(ERROR) << "checked invalid tensor array's input!";
-    return RET_ERROR;
-  }
-  std::vector<int> shape = {*(static_cast<int *>(input->data()))};
-  this->tensor_list_ = std::make_unique<lite::TensorList>(shape, element_shape_v);
-  std::vector<std::vector<int>> tensor_shape(shape.front(), element_shape_v);
-  this->tensor_list_->MallocTensorListData(TypeId::kNumberTypeFloat32, tensor_shape);
-  this->tensor_list_->MallocData();
-  return RET_OK;
-}
-
-inline int TensorArrayCPUKernel::Run() {
-  // set handle to outputs, fake malloc, call set_data
-  lite::Tensor *output = out_tensors_.at(kOutputIndex);
-  void *tensor_list = static_cast<void *>(this->tensor_list_.get());
-  void *delta = InnerKernel::ms_context_->allocator->Malloc(sizeof(tensor_list));
-  MSLITE_CHECK_PTR(delta);
-  memcpy(delta, &tensor_list, sizeof(tensor_list));
-  output->set_data(delta);
-  return RET_OK;
-}
-
-/**
- * read operate just copy handle(tensor buffer) to output,
- * on the contrary, write just copy output to buffer.
- */
-int TensorArrayBaseCPUKernel::Init() {
-  // check index_tensor
-  lite::Tensor *input_y = in_tensors_.at(kIndexInputIdx);
-  if (input_y->category() != lite::Tensor::Category::CONST_TENSOR) {
-    MS_LOG(ERROR) << "invalid category of index input";
-    return RET_ERROR;
-  }
-  MSLITE_CHECK_PTR(input_y->data());
-  index_ = *(static_cast<int *>(input_y->data()));
-  return RET_OK;
-}
-
-int TensorArrayBaseCPUKernel::Run() {
-  lite::Tensor *input_x = in_tensors_.at(kHandleIndex);
-  // check output shape is same as handle
-  lite::TensorList **delta = static_cast<lite::TensorList **>(input_x->data());
-  lite::TensorList *tensor_list = *delta;
-  if (tensor_list == nullptr) {
-    MS_LOG(ERROR) << "get tensor list failed!";
-    return RET_ERROR;
-  }
-  this->handle_ = tensor_list->GetTensor(index_);
-  MSLITE_CHECK_PTR(this->handle_);
-  return RET_OK;
-}
-
-int TensorArrayReadCPUKernel::Init() {
-  // just check
-  if (in_tensors_.size() != kTensorArrayReadInSize) {
-    MS_LOG(ERROR) << "invalid input numbers of TensorArrayReadCPUKernel";
-    return RET_ERROR;
-  }
-  // check index_tensor
-  TensorArrayBaseCPUKernel::Init();
-  return RET_OK;
-}
-
-int TensorArrayReadCPUKernel::Run() {
-  TensorArrayBaseCPUKernel::Run();
-  lite::Tensor *output = out_tensors_.at(kOutputIndex);
-  lite::Tensor::CopyTensorData(*(TensorArrayBaseCPUKernel::handle_), output);
-  return RET_OK;
-}
-
-int TensorArrayWriteCPUKernel::Init() {
-  // just check
-  if (in_tensors_.size() != kTensorArrayWriteInSize) {
-    MS_LOG(ERROR) << "invalid input numbers of TensorArrayWriteCPUKernel";
-    return RET_ERROR;
-  }
-  TensorArrayBaseCPUKernel::Init();
-  return RET_OK;
-}
-
-int TensorArrayWriteCPUKernel::Run() {
-  TensorArrayBaseCPUKernel::Run();
-  lite::Tensor *value = in_tensors_.at(kValueIndex);
-  lite::Tensor::CopyTensorData(*value, TensorArrayBaseCPUKernel::handle_);
-  return RET_OK;
-}
-
-REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_TensorArray, LiteKernelCreator<TensorArrayCPUKernel>)
-REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_TensorArrayRead, LiteKernelCreator<TensorArrayReadCPUKernel>)
-REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_TensorArrayWrite, LiteKernelCreator<TensorArrayWriteCPUKernel>)
-}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/tensor_array_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/tensor_array_fp32.h
deleted file mode 100644
index e151147918a..00000000000
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/tensor_array_fp32.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_TENSORARRAY_H_
-#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_TENSORARRAY_H_
-
-#include <vector>
-#include <memory>
-#include "nnacl/tensor_array_parameter.h"
-#include "src/inner_kernel.h"
-#include "src/tensorlist.h"
-
-namespace mindspore::kernel {
-class TensorArrayCPUKernel : public InnerKernel {
- public:
-  TensorArrayCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                       const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
-      : InnerKernel(parameter, inputs, outputs, ctx) {
-    ta_param_ = reinterpret_cast<TensorArrayParameter *>(parameter);
-  }
-
-  ~TensorArrayCPUKernel() = default;
-
-  int Init() override;
-  int ReSize() override { return 0; }
-  int Run() override;
-
- private:
-  TensorArrayParameter *ta_param_{nullptr};
-  std::unique_ptr<lite::TensorList> tensor_list_;
-};
-
-class TensorArrayBaseCPUKernel : public InnerKernel {
- public:
-  TensorArrayBaseCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                           const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
-      : InnerKernel(parameter, inputs, outputs, ctx) {}
-  ~TensorArrayBaseCPUKernel() = default;
-
-  int Init() override;
-  int ReSize() override { return 0; }
-  inline int Run() override;
-
- protected:
-  lite::Tensor *handle_{nullptr};
-  int index_{0};
-};
-
-class TensorArrayReadCPUKernel : public TensorArrayBaseCPUKernel {
- public:
-  TensorArrayReadCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                           const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
-      : TensorArrayBaseCPUKernel(parameter, inputs, outputs, ctx) {}
-  ~TensorArrayReadCPUKernel() = default;
-
-  int Init() override;
-  int ReSize() override { return 0; }
-  int Run() override;
-};
-
-class TensorArrayWriteCPUKernel : public TensorArrayBaseCPUKernel {
- public:
-  TensorArrayWriteCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                            const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
-      : TensorArrayBaseCPUKernel(parameter, inputs, outputs, ctx) {}
-  ~TensorArrayWriteCPUKernel() = default;
-
-  int Init() override;
-  int ReSize() override { return 0; }
-  int Run() override;
-};
-
-}  // namespace mindspore::kernel
-
-#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_TENSORARRAY_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/string/normalize.cc b/mindspore/lite/src/runtime/kernel/arm/string/normalize.cc
index dce272309ad..5f583b72b41 100644
--- a/mindspore/lite/src/runtime/kernel/arm/string/normalize.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/string/normalize.cc
@@ -29,14 +29,10 @@ using mindspore::schema::PrimitiveType_CustomNormalize;
 namespace mindspore::kernel {
 namespace {
 const char kPunctuationsRegex[] = "[.*()\"]";
-const std::map<std::string, std::string> *kRegexTransforms = new (std::nothrow) std::map<std::string, std::string>({
-  {"([\\S]+)n't", "$1 not"},
-  {"([\\S]+)'nt", "$1 not"},
-  {"([\\S]+)'ll", "$1 will"},
-  {"([\\S]+)'re", "$1 are"},
-  {"([\\S]+)'ve", "$1 have"},
-  {"i'm", "i am"},
-});
+const std::map<std::string, std::string> kRegexTransforms = {
+  {"([\\S]+)n't", "$1 not"}, {"([\\S]+)'nt", "$1 not"},  {"([\\S]+)'ll", "$1 will"},
+  {"([\\S]+)'re", "$1 are"}, {"([\\S]+)'ve", "$1 have"}, {"i'm", "i am"},
+};
 const int32_t kMaxStringLength = 300;
 }  // namespace
 
@@ -74,8 +70,7 @@ std::string NormalizeCPUKernel::Normalize(const std::string &str) {
   result = GlobalReplace(result, "\\s('t|'nt|n't|'d|'ll|'s|'m|'ve|'re)([\\s,;:/])", "$1$2");
   result = GlobalReplace(result, "\\s('t|'nt|n't|'d|'ll|'s|'m|'ve|'re)$", "$1");
   // transform shortening to full
-  MS_ASSERT(kRegexTransforms != nullptr);
-  for (auto iter = kRegexTransforms->begin(); iter != kRegexTransforms->end(); ++iter) {
+  for (auto iter = kRegexTransforms.begin(); iter != kRegexTransforms.end(); ++iter) {
     result = GlobalReplace(result, iter->first, iter->second);
   }
   result = GlobalReplace(result, "([?])+", "$1");
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/gather.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/gather.cl
index 85213e92231..d36de906ea3 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/gather.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/gather.cl
@@ -9,7 +9,7 @@ __kernel void gather(__write_only image2d_t dst_data, __read_only image2d_t src_
   if (X >= dst_size.x || Y >= dst_size.y * dst_size.w || Z >= dst_size.z || dst_size.y == 0) {
     return;
   }
-  DTYPE4 res_data = (DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
+  DTYPE4 res_data = (DTYPE4)(0, 0, 0, 0);
   int batch = Y / dst_size.y;
   int height = Y % dst_size.y;
   if (axis == 0) {
@@ -23,7 +23,7 @@ __kernel void gather(__write_only image2d_t dst_data, __read_only image2d_t src_
     DTYPE tmp[4];
     DTYPE res_tmp[4];
     for (int i = 0; i < indices_num; ++i) {
-      DTYPE4 rd_data = (DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
+      DTYPE4 rd_data = (DTYPE4)(0, 0, 0, 0);
       rd_data = READ_IMAGE(src_data, smp_zero, (int2)(X * src_size.z + offset[i], batch * src_size.y + height));
       if (i >= 1 && offset[i] != offset[i - 1]) {
         rd_data = READ_IMAGE(src_data, smp_zero, (int2)(X * src_size.z + offset[i], batch * src_size.y + height));
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/int8/arithmetic.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/int8/arithmetic.cl
index 1abe4a9004f..5e712c9ddc7 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/int8/arithmetic.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/int8/arithmetic.cl
@@ -14,6 +14,6 @@ __kernel void ElementAddInt8(__read_only image2d_t input_a, __read_only image2d_
   float4 real_a = convert_float4(a - zero_point.x) * scale.x;
   float4 real_b = convert_float4(b - zero_point.y) * scale.y;
   int4 result = convert_int4(round((real_a + real_b) / scale.z)) + zero_point.z;
-  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
+  result = clamp(result, (int)(act_min), (int)(act_max));
   write_imagei(output, (int2)(X, Y), result);
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h
index 7031a9a8f9e..b43e57231e7 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h
@@ -40,8 +40,8 @@ class ActivationOpenCLKernel : public OpenCLKernel {
 
  private:
   static std::string GetActTypeString(int act_type);
-  int type_;
-  float alpha_;
+  int type_ = 0;
+  float alpha_ = 0.0f;
   GpuTensorInfo outShape;
 };
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc
index 8d7118776a5..07922e1ef13 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc
@@ -191,7 +191,11 @@ int ArgMinMaxOpenCLKernel::Prepare() {
   param->keep_dims_ =
     param->keep_dims_ || param->topk_ > 1 || in_tensors_[0]->shape().size() == out_tensors_[0]->shape().size();
 
-  InitWeights();
+  ret = InitWeights();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "InitWeights failed.";
+    return ret;
+  }
   SetGlobalLocal();
   if (SetConstArgs() != RET_OK) {
     MS_LOG(ERROR) << "SeConstArgs failed.";
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
index 05a986da862..92bcac2ce63 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
@@ -34,6 +34,7 @@ int ConcatOpenCLKernel::RunAxis0() {
   auto allocator_ = ocl_runtime_->GetAllocator();
   ImageSize img_size;
   auto dst_data = out_tensors_[0]->data_c();
+  MS_ASSERT(dst_data);
   auto dst_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
   auto *out_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(dst_data));
   for (int i = 0; i < in_tensors_.size(); i++) {
@@ -45,7 +46,10 @@ int ConcatOpenCLKernel::RunAxis0() {
     auto src_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
     auto region = cl::array<cl::size_type, 3U>{img_size.width, img_size.height, 1};
     auto *input_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data));
-    ocl_runtime_->GetDefaultCommandQueue()->enqueueCopyImage(*input_image, *out_image, src_origin, dst_origin, region);
+    if (ocl_runtime_->GetDefaultCommandQueue()->enqueueCopyImage(*input_image, *out_image, src_origin, dst_origin,
+                                                                 region) != CL_SUCCESS) {
+      MS_LOG(WARNING) << "enqueueCopyImage failed.";
+    }
     dst_origin[1] += region[1];
   }
   return RET_OK;
@@ -219,7 +223,11 @@ int ConcatOpenCLKernel::ConvertWeightToTensor() {
 }
 
 int ConcatOpenCLKernel::Prepare() {
-  ConvertWeightToTensor();
+  int ret = ConvertWeightToTensor();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "ConvertWeightToTensor failed.";
+    return ret;
+  }
   if (axis_ == 0) {
     if (std::any_of(in_tensors_.begin(), in_tensors_.end(), [](lite::Tensor *t) { return t->shape().size() != 1; })) {
       return RET_OK;
@@ -248,7 +256,7 @@ int ConcatOpenCLKernel::Prepare() {
     return RET_ERROR;
   }
   auto build_options_ext = CreateBuildOptionsExtByDType(this->registry_data_type_);
-  auto ret = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
+  ret = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Build kernel failed.";
     return ret;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
index bfed62a5129..f0b430572d0 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
@@ -367,6 +367,7 @@ int Conv2DOpenCLKernel::InitBias() {
   // align bias from C to C4
   auto bias_tensor = in_tensors_.at(2);
   void *src_data = stored_bias_ == nullptr ? bias_tensor->data_c() : stored_bias_;
+  MS_ASSERT(src_data);
   size_t packed_bias_size = UP_ROUND(CO_SLICES_, block_size_.C) * CO_TILE * sizeof_FLT_;
   packed_bias_ = allocator->Malloc(packed_bias_size, lite::opencl::MemType::BUF);
   if (packed_bias_ == nullptr) {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
index 16bd63384c5..33a7339030e 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
@@ -259,6 +259,7 @@ int Conv2dTransposeOpenCLKernel::InitBias() {
   memset(bias_, 0x00, div_co * C4NUM * data_size);
   if (in_tensors_.size() == INPUT_TENSOR_SIZE_3) {
     void *src_data = stored_bias_ == nullptr ? in_tensors_.at(kBiasIndex)->data_c() : stored_bias_;
+    MS_ASSERT(src_data);
     auto bias_dtype = in_tensors_[2]->data_type();
     if (bias_dtype == kNumberTypeFloat32 && enable_fp16_) {
       for (int i = 0; i < co; i++) {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
index 73733bafd20..9cbea18808f 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
@@ -111,6 +111,7 @@ int DepthwiseConv2dOpenCLKernel::InitWeights() {
   auto out_info = GpuTensorInfo(out_tensors_[0]);
   // weight: o, h, w, i; o == group, i == 1
   void *origin_weight = stored_weight_ == nullptr ? in_tensors_.at(kWeightIndex)->data_c() : stored_weight_;
+  MS_ASSERT(origin_weight);
   int CO4 = UP_DIV(out_info.C, C4NUM);
   int pack_weight_size = C4NUM * CO4 * parameter->kernel_h_ * parameter->kernel_w_;
 
@@ -200,6 +201,7 @@ int DepthwiseConv2dOpenCLKernel::InitBias() {
     dst_type = is_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32;
     auto element_size = in_tensors_.at(kBiasIndex)->ElementsNum();
     void *src_data = stored_bias_ == nullptr ? in_tensors_.at(kBiasIndex)->data_c() : stored_bias_;
+    MS_ASSERT(src_data);
     ConvertBias(src_data, temp_bias.data(), element_size, dtype_size, src_type, dst_type);
   }
   bias_data_ = allocator->Malloc(bias_size, temp_bias.data());
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fill.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/fill.cc
index dac1c248bcf..a27408f37e3 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fill.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fill.cc
@@ -35,6 +35,7 @@ int FillOpenCLKernel::RunFill() {
   cl_int4 fill_value = {};
   fill_value.s[0] = fill_value.s[1] = fill_value.s[2] = fill_value.s[3] = default_;
   auto src_data = out_tensors_[0]->data_c();
+  MS_ASSERT(src_data);
   if (allocator_->GetImageSize(src_data, &img_size) != RET_OK) {
     MS_LOG(ERROR) << "GetImageSize failed.";
     return RET_ERROR;
@@ -42,23 +43,33 @@ int FillOpenCLKernel::RunFill() {
   auto src_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
   auto region = cl::array<cl::size_type, 3U>{img_size.width, img_size.height, 1};
   cl::Image2D *out_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data));
-  ocl_runtime_->GetDefaultCommandQueue()->enqueueFillImage(*out_image, fill_value, src_origin, region);
+  if (ocl_runtime_->GetDefaultCommandQueue()->enqueueFillImage(*out_image, fill_value, src_origin, region) !=
+      CL_SUCCESS) {
+    MS_LOG(ERROR) << "enqueueFillImage failed.";
+    return RET_ERROR;
+  }
   return RET_OK;
 }
 
 int FillOpenCLKernel::RunShape() {
   auto allocator_ = ocl_runtime_->GetAllocator();
   auto src_data = out_tensors_[0]->data_c();
+  MS_ASSERT(src_data);
   cl_int4 fill_value = {default_, default_, default_, default_};
   auto tensor_shape = in_tensors_[0]->shape();
   void *tensor_shape_data = tensor_shape.data();
+  MS_ASSERT(tensor_shape_data);
   for (int i = 0; i < tensor_shape.size(); ++i) {
     fill_value.s[i] = reinterpret_cast<int *>(tensor_shape_data)[i];
   }
   auto src_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
   auto region = cl::array<cl::size_type, 3U>{1, 1, 1};
   cl::Image2D *out_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data));
-  ocl_runtime_->GetDefaultCommandQueue()->enqueueFillImage(*out_image, fill_value, src_origin, region);
+  if (ocl_runtime_->GetDefaultCommandQueue()->enqueueFillImage(*out_image, fill_value, src_origin, region) !=
+      CL_SUCCESS) {
+    MS_LOG(ERROR) << "enqueueFillImage failed.";
+    return RET_ERROR;
+  }
   return RET_OK;
 }
 
@@ -90,9 +101,9 @@ int FillOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running! ";
   auto param = this->op_parameter_;
   if (param->type_ == PrimitiveType_Fill) {
-    RunFill();
+    return RunFill();
   } else {
-    RunShape();
+    return RunShape();
   }
 
   return RET_OK;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
index f86b979bf9c..8bb4deebb73 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
@@ -153,6 +153,7 @@ int FullConnectionOpenCLKernel::InitFilter() {
   auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_);
   memset(padWeight_, 0x00, nhw_remainder * intensor_shape.Slice * co4 * C4NUM * C4NUM * dtype_size);
   void *src_data = stored_weight_ == nullptr ? in_tensors_.at(kWeightIndex)->data_c() : stored_weight_;
+  MS_ASSERT(src_data);
   auto originWeightFp32 = reinterpret_cast<float *>(src_data);
   auto originWeightFp16 = reinterpret_cast<float16_t *>(src_data);
   bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16;
@@ -228,6 +229,7 @@ int FullConnectionOpenCLKernel::InitBias() {
   memset(bias_, 0x00, co4 * C4NUM * dtype_size);
   if (in_tensors_.size() == INPUT_TENSOR_SIZE_3) {
     void *src_data = stored_bias_ == nullptr ? in_tensors_.at(kBiasIndex)->data_c() : stored_bias_;
+    MS_ASSERT(src_data);
     if (in_tensors_[kBiasIndex]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
       for (int i = 0; i < CO_; i++) {
         reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(src_data)[i];
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.cc
index faaa7e81a00..4d17eba5093 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.cc
@@ -181,7 +181,11 @@ int FusionEltwiseOpenCLKernel::Prepare() {
     MS_LOG(ERROR) << "Build kernel failed.";
     return ret;
   }
-  InitWeights();
+  ret = InitWeights();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "InitWeights failed.";
+    return ret;
+  }
   SetGlobalLocal();
   if (SetConstArgs() != RET_OK) {
     MS_LOG(ERROR) << "SeConstArgs failed.";
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc
index 68dbaf98b4b..3f1bf1d76e7 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc
@@ -172,6 +172,7 @@ int GatherOpenCLKernel::ConvertTensorToweight() {
   }
   auto data_type = indices_tensor->data_type();
   auto data = indices_tensor->data_c();
+  MS_ASSERT(data);
   if (data_type == kNumberTypeInt32) {
     for (int i = 0; i < indices_num; i++) {
       indices_data_[i] = reinterpret_cast<int32_t *>(data)[i];
@@ -205,6 +206,7 @@ int GatherOpenCLKernel::InitWeights() {
 
   auto data_type = indices_tensor->data_type();
   auto data = indices_tensor->data_c();
+  MS_ASSERT(data);
   if (data_type == kNumberTypeInt32) {
     for (int i = 0; i < indices_num; i++) {
       indices_data_[i] = reinterpret_cast<int32_t *>(data)[i];
@@ -242,7 +244,11 @@ int GatherOpenCLKernel::PreProcess() {
 int GatherOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running! ";
   if (intensor1_is_tensor) {
-    ConvertTensorToweight();
+    int ret = ConvertTensorToweight();
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "ConvertTensorToweight failed.";
+      return ret;
+    }
   }
   if (ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_.front()->data_c()) != CL_SUCCESS) {
     MS_LOG(ERROR) << "SetKernelArg failed.";
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc
index ea3599de657..ca2f45602be 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc
@@ -141,6 +141,8 @@ int LayerNormOpenCLKernel::Initweight() {
   }
   memset(gamma_, 0x01, weight_size);
   memset(beta_, 0x00, weight_size);
+  MS_ASSERT(in_tensors_.at(1)->data_c());
+  MS_ASSERT(in_tensors_.at(INPUT_TENSOR_SIZE_2)->data_c());
 
   if (weight_tensor->data_type() == kNumberTypeFloat16) {
     if (use_fp16_enable_) {
@@ -187,9 +189,9 @@ int LayerNormOpenCLKernel::Initweight() {
 int LayerNormOpenCLKernel::Prepare() {
   use_fp16_enable_ = ocl_runtime_->GetFp16Enable();
   int ret = Initweight();
-  if (ret) {
+  if (ret != RET_OK) {
     MS_LOG(ERROR) << "Initweight failed ";
-    return RET_ERROR;
+    return ret;
   }
   normalized_shape_size_ = in_tensors_.at(0)->shape().at(normalized_axis_);
   auto allocator = ocl_runtime_->GetAllocator();
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.cc
index fe128cf5c49..df8009ef717 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.cc
@@ -76,6 +76,7 @@ int OneHotOpenCLKernel::Prepare() {
 
 int OneHotOpenCLKernel::InitWeights() {
   depth_ = static_cast<int32_t *>(in_tensors_[1]->data_c())[0];
+  MS_ASSERT(depth_);
   // inputs num is 3 or 4.
   if (in_tensors_.size() == INPUT_TENSOR_SIZE_3) {  // onnx
     off_value_ = static_cast<float *>(in_tensors_[2]->data_c())[0];
@@ -87,6 +88,8 @@ int OneHotOpenCLKernel::InitWeights() {
     off_value_ = static_cast<float *>(in_tensors_[3]->data_c())[0];
     param_->support_neg_index_ = false;
   }
+  MS_ASSERT(off_value_);
+  MS_ASSERT(on_value_);
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc
index 2784f06b708..218b71ddffe 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc
@@ -41,6 +41,7 @@ int PReluOpenCLKernel::InitWeights() {
     } else {
       weight_scalar_ = *reinterpret_cast<float *>(weight_tensor->data_c());
     }
+    MS_ASSERT(weight_scalar_);
   } else {
     int C_ = weight_tensor->ElementsNum();
     auto sizeof_FLT = enable_fp16_ ? sizeof(float16_t) : sizeof(float);
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
index b343ecc5ed2..085200dc473 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
@@ -127,7 +127,10 @@ int ReshapeOpenCLKernel::PreProcess() {
         MS_LOG(ERROR) << "SyncCommandQueue failed.";
         return RET_ERROR;
       }
-      shape_tensor->MutableData();
+      if (shape_tensor->MutableData() == nullptr) {
+        MS_LOG(ERROR) << "MutableData failed.";
+        return RET_ERROR;
+      }
     }
   }
   return OpenCLKernel::PreProcess();
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
index 14c83e0a780..7a4d2b81482 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
@@ -93,6 +93,8 @@ int ScaleOpenCLKernel::InitWeights() {
   ImageSize img_size;
   GetImageSize(0, &img_size);
   img_size.dtype = scale_dtype == kNumberTypeFloat16 ? CL_HALF_FLOAT : CL_FLOAT;
+  MS_ASSERT(scale_tensor->data_c());
+  MS_ASSERT(offset_tensor->data_c());
 
   if (broadcast_flag_) {
     img_size.height = 1;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.cc
index dc532bbbb92..ccb3b4bc566 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.cc
@@ -37,6 +37,7 @@ int SparseToDenseOpenCLKernel::InitOutputToDefault() {
   cl_float4 fill_value = {};
   fill_value.s[0] = fill_value.s[1] = fill_value.s[2] = fill_value.s[3] = default_;
   auto src_data = out_tensors_[0]->data_c();
+  MS_ASSERT(src_data);
   if (allocator_->GetImageSize(src_data, &img_size) != RET_OK) {
     MS_LOG(ERROR) << "GetImageSize failed.";
     return RET_ERROR;
@@ -44,7 +45,11 @@ int SparseToDenseOpenCLKernel::InitOutputToDefault() {
   auto src_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
   auto region = cl::array<cl::size_type, 3U>{img_size.width, img_size.height, 1};
   cl::Image2D *out_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data));
-  ocl_runtime_->GetDefaultCommandQueue()->enqueueFillImage(*out_image, fill_value, src_origin, region);
+  if (ocl_runtime_->GetDefaultCommandQueue()->enqueueFillImage(*out_image, fill_value, src_origin, region) !=
+      CL_SUCCESS) {
+    MS_LOG(ERROR) << "enqueueFillImage failed.";
+    return RET_ERROR;
+  }
   return RET_OK;
 }
 
@@ -55,6 +60,7 @@ int SparseToDenseOpenCLKernel::InitWeights() {
   for (int i = 0; i < weight_tensor->shape().size(); ++i) {
     size *= weight_tensor->shape()[i];
   }
+  MS_ASSERT(weight_tensor->data_c());
   if (weight_scalar_) {
     if (weight_tensor->data_type() == kNumberTypeFloat16) {
       weight_scalar_ = static_cast<float>(*reinterpret_cast<float16_t *>(weight_tensor->data_c()));
@@ -199,9 +205,18 @@ int SparseToDenseOpenCLKernel::Prepare() {
     } else {
       default_ = *reinterpret_cast<float *>(input_tensor3->data_c());
     }
+    MS_ASSERT(default_);
+  }
+  ret = InitWeights();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "InitWeights failed.";
+    return ret;
+  }
+  ret = InferShapeTo4D();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "InferShapeTo4D failed.";
+    return ret;
   }
-  InitWeights();
-  InferShapeTo4D();
   SetGlobalLocal();
   if (SetConstArgs() != RET_OK) {
     MS_LOG(ERROR) << "SeConstArgs failed.";
@@ -242,7 +257,11 @@ int SparseToDenseOpenCLKernel::InferShapeTo4D() {
 
 int SparseToDenseOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running! ";
-  InitOutputToDefault();
+  int ret = InitOutputToDefault();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "InitOutputToDefault failed.";
+    return ret;
+  }
   int arg_cn = 0;
   if (ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()) != CL_SUCCESS) {
     MS_LOG(ERROR) << "SetKernelArg failed.";
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/split.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/split.cc
index 206bbffbf33..17a6204af5f 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/split.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/split.cc
@@ -32,6 +32,7 @@ namespace mindspore::kernel {
 int SplitOpenCLKernel::RunAxis0() {
   auto allocator_ = ocl_runtime_->GetAllocator();
   auto src_data = in_tensors_[0]->data_c();
+  MS_ASSERT(src_data);
   cl::Image2D *in_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data));
   if (in_image == nullptr) {
     MS_LOG(ERROR) << "RunAxis0 in_image can not be nullptr";
@@ -40,6 +41,7 @@ int SplitOpenCLKernel::RunAxis0() {
   auto src_area = cl::array<cl::size_type, 3U>{0, 0, 0};
   for (int i = 0; i < out_tensors_.size(); i++) {
     auto dst_data = out_tensors_[i]->data_c();
+    MS_ASSERT(dst_data);
     ImageSize img_size;
     if (allocator_->GetImageSize(dst_data, &img_size) != RET_OK) {
       MS_LOG(ERROR) << "GetImageSize failed.";
@@ -52,7 +54,10 @@ int SplitOpenCLKernel::RunAxis0() {
       MS_LOG(ERROR) << "RunAxis0 out_image can not be nullptr";
       return RET_ERROR;
     }
-    ocl_runtime_->GetDefaultCommandQueue()->enqueueCopyImage(*in_image, *out_image, src_area, dst_area, region);
+    if (ocl_runtime_->GetDefaultCommandQueue()->enqueueCopyImage(*in_image, *out_image, src_area, dst_area, region) !=
+        CL_SUCCESS) {
+      MS_LOG(WARNING) << "enqueueCopyImage failed.";
+    }
     src_area[1] += region[1];
   }
   return RET_OK;
@@ -229,7 +234,11 @@ void SplitOpenCLKernel::SetGlobalLocal() {
 
 int SplitOpenCLKernel::Run() {
   if (split_dim_ == 0) {
-    RunAxis0();
+    int ret = RunAxis0();
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "RunAxis0 failed.";
+      return ret;
+    }
     return RET_OK;
   }
   int arg_cn = 0;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc
index 5b08fbb3245..2302c2f4156 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc
@@ -32,10 +32,12 @@ int StackOpenCLKernel::RunAxis0() {
   auto allocator_ = ocl_runtime_->GetAllocator();
   ImageSize img_size;
   auto dst_data = out_tensors_[0]->data_c();
+  MS_ASSERT(dst_data);
   auto dst_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
   cl::Image2D *out_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(dst_data));
   for (int i = 0; i < in_tensors_.size(); i++) {
     auto src_data = in_tensors_[i]->data_c();
+    MS_ASSERT(src_data);
     if (allocator_->GetImageSize(src_data, &img_size) != RET_OK) {
       MS_LOG(ERROR) << "GetImageSize failed.";
       return RET_ERROR;
@@ -43,7 +45,10 @@ int StackOpenCLKernel::RunAxis0() {
     auto src_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
     auto region = cl::array<cl::size_type, 3U>{img_size.width, img_size.height, 1};
     cl::Image2D *input_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data));
-    ocl_runtime_->GetDefaultCommandQueue()->enqueueCopyImage(*input_image, *out_image, src_origin, dst_origin, region);
+    if (ocl_runtime_->GetDefaultCommandQueue()->enqueueCopyImage(*input_image, *out_image, src_origin, dst_origin,
+                                                                 region) != CL_SUCCESS) {
+      MS_LOG(WARNING) << "enqueueCopyImage failed.";
+    }
     dst_origin[1] += region[1];
   }
   return RET_OK;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.cc
index bd21ab17886..be61ca7b6f3 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.cc
@@ -134,8 +134,10 @@ int StrassenOpenCLKernel::InitWeights() {
   auto padWeightFp32 = reinterpret_cast<float *>(padWeight_);
   auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_);
   memset(padWeight_, 0x00, NumA * NumB * dtype_size);
-  auto originWeightFp32 = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->data_c());
-  auto originWeightFp16 = reinterpret_cast<float16_t *>(in_tensors_.at(kWeightIndex)->data_c());
+  auto weight_tensor_data = in_tensors_.at(kWeightIndex)->data_c();
+  MS_ASSERT(weight_tensor_data);
+  auto originWeightFp32 = reinterpret_cast<float *>(weight_tensor_data);
+  auto originWeightFp16 = reinterpret_cast<float16_t *>(weight_tensor_data);
   bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16;
   if (AllocatorMemoryForStrassen(NumA / 2, NumB / 2) != RET_OK) {
     MS_LOG(ERROR) << "AllocatorMemoryForStrassen failed.";
@@ -192,9 +194,21 @@ void StrassenOpenCLKernel::SetGlobalLocal() {
   local_size_ = {32, 4, 1};
   global_size_ = {1, 1, 1};
   size_t strassen_size = outShape[3] / 2;
-  StrassenSetGlobalLocal(strassen_size, 0);  // set global_ and local
-  StrassenSetGlobalLocal(strassen_size, 1);  // set global_size_add_sub
-  StrassenSetGlobalLocal(strassen_size, 2);  // set global_size_weights
+  int ret = StrassenSetGlobalLocal(strassen_size, 0);  // set global_ and local
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "StrassenSetGlobalLocal 0 failed.";
+    return;
+  }
+  ret = StrassenSetGlobalLocal(strassen_size, 1);  // set global_size_add_sub
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "StrassenSetGlobalLocal 1 failed.";
+    return;
+  }
+  ret = StrassenSetGlobalLocal(strassen_size, 2);  // set global_size_weights
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "StrassenSetGlobalLocal 2 failed.";
+    return;
+  }
 }
 
 int StrassenOpenCLKernel::StrassenSetConstArgs(cl::Kernel *kernel, int index, int strassen_size,
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.cc
index 9d00ac7a4dd..a1c7a921fd4 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.cc
@@ -115,7 +115,9 @@ int StridedSliceOpenCLKernel::InitConstArgs() {
 
   if (type() == PrimitiveType_SliceFusion) {
     auto *begin = reinterpret_cast<int32_t *>(in_tensors_.at(1)->data_c());
+    MS_ASSERT(begin);
     auto *size = reinterpret_cast<int32_t *>(in_tensors_.at(2)->data_c());
+    MS_ASSERT(size);
     Broadcast2GpuShape(begin_.s, begin, input_info.NDim, 0);
     Broadcast2GpuShape(size_.s, size, input_info.NDim, -1);
     for (int i = 0; i < 4; ++i) {
@@ -137,8 +139,11 @@ int StridedSliceOpenCLKernel::InitConstArgs() {
     }
   } else {
     auto *begin = reinterpret_cast<int32_t *>(in_tensors_.at(1)->data_c());
+    MS_ASSERT(begin);
     auto *end = reinterpret_cast<int32_t *>(in_tensors_.at(2)->data_c());
+    MS_ASSERT(end);
     auto *stride = reinterpret_cast<int32_t *>(in_tensors_.at(3)->data_c());
+    MS_ASSERT(stride);
     cl_int4 end_ = input_shape_;
     Broadcast2GpuShape(begin_.s, begin, input_info.NDim, 0);
     Broadcast2GpuShape(end_.s, end, input_info.NDim);
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc
index 8e51bcaaaed..b189213693e 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc
@@ -18,6 +18,7 @@
 #include <memory>
 #include "src/runtime/kernel/opencl/cl/winograd.cl.inc"
 #include "nnacl/base/minimal_filtering_generator.h"
+#include "nnacl/errorcode.h"
 
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
@@ -131,6 +132,7 @@ int WinogradOpenCLKernel::InitFilter() {
   // rearrange filter
   auto filter_tensor = in_tensors_.at(1);
   void *src_filter_data = stored_filter_ == nullptr ? filter_tensor->data_c() : stored_filter_;
+  MS_ASSERT(src_filter_data);
 #ifndef ENABLE_ARM64
   auto winograd_filter = GenerateWinogradFilter(src_filter_data, filter_tensor->data_type(), CO_, CI_);
   void *src_data = winograd_filter.data();
@@ -140,9 +142,13 @@ int WinogradOpenCLKernel::InitFilter() {
     MS_LOG(ERROR) << "new winograd_filter failed.";
     return RET_ERROR;
   }
-  WinogradWeightTransform(reinterpret_cast<const float *>(src_filter_data),
-                          reinterpret_cast<float *>(winograd_filter.get()), nullptr, Gt, 1, 6, 3, CI_, CO_, false);
-
+  int trans_ret =
+    WinogradWeightTransform(reinterpret_cast<const float *>(src_filter_data),
+                            reinterpret_cast<float *>(winograd_filter.get()), nullptr, Gt, 1, 6, 3, CI_, CO_, false);
+  if (trans_ret != NNACL_OK) {
+    MS_LOG(ERROR) << "WinogradWeightTransform failed.";
+    return RET_ERROR;
+  }
   void *src_data = winograd_filter.get();
 #endif
 
@@ -196,7 +202,11 @@ int WinogradOpenCLKernel::AllocateMemory() {
 }
 
 int WinogradOpenCLKernel::SetConstArgs() {
-  AllocateMemory();
+  int ret = AllocateMemory();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "AllocateMemory failed.";
+    return ret;
+  }
 
   int arg_cn = 1;
   cl_int4 input_shape = {batch_size_, OH_, OW_, CI_SLICES_};  // maybe pad=0, so use OH/OW
@@ -316,16 +326,28 @@ double WinogradOpenCLKernel::GetProfilingTimeMs() {
   if (!ocl_runtime_->isProfiling()) {
     return MAX_PROFILING_TIME_MILLI_SECOND;
   }
-  cl_ulong time_start;
-  cl_ulong time_end;
-  event_.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start);
-  event_.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end);
+  cl_ulong time_start = 0;
+  cl_ulong time_end = 0;
+  if (event_.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start) != CL_SUCCESS) {
+    MS_LOG(ERROR) << "event_ getProfilingInfo CL_PROFILING_COMMAND_START failed, time_start is untrustable.";
+  }
+  if (event_.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end) != CL_SUCCESS) {
+    MS_LOG(ERROR) << "event_ getProfilingInfo CL_PROFILING_COMMAND_END failed, time_end is untrustable.";
+  }
   cl_ulong time_ns = time_end - time_start;
-  kernel2_event_.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start);
-  kernel2_event_.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end);
+  if (kernel2_event_.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start) != CL_SUCCESS) {
+    MS_LOG(ERROR) << "kernel2_event_ getProfilingInfo CL_PROFILING_COMMAND_START failed, time_start is untrustable.";
+  }
+  if (kernel2_event_.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end) != CL_SUCCESS) {
+    MS_LOG(ERROR) << "kernel2_event_ getProfilingInfo CL_PROFILING_COMMAND_END failed, time_end is untrustable.";
+  }
   time_ns += time_end - time_start;
-  kernel3_event_.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start);
-  kernel3_event_.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end);
+  if (kernel3_event_.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start) != CL_SUCCESS) {
+    MS_LOG(ERROR) << "kernel3_event_ getProfilingInfo CL_PROFILING_COMMAND_START failed, time_start is untrustable.";
+  }
+  if (kernel3_event_.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end) != CL_SUCCESS) {
+    MS_LOG(ERROR) << "evekernel3_event_nt_ getProfilingInfo CL_PROFILING_COMMAND_END failed, time_end is untrustable.";
+  }
   time_ns += time_end - time_start;
   return static_cast<double>(time_ns) * 1e-6;
 }
diff --git a/mindspore/lite/src/runtime/runtime_pass.cc b/mindspore/lite/src/runtime/runtime_pass.cc
index 8bb988e3338..0954c178d9f 100644
--- a/mindspore/lite/src/runtime/runtime_pass.cc
+++ b/mindspore/lite/src/runtime/runtime_pass.cc
@@ -20,30 +20,60 @@
 namespace mindspore::lite {
 void Nc4hw4PassReplace(std::vector<kernel::LiteKernel *> *kernels, std::vector<Tensor *> *tensors, size_t index) {
   kernel::LiteKernel *conv_kernel = kernels->at(index);
-  kernel::LiteKernel *traspose_kernel = conv_kernel->out_kernels().front();
-  kernel::LiteKernel *c4_kernel = traspose_kernel->out_kernels().front();
+  kernel::LiteKernel *transpose_kernel = conv_kernel->out_kernels().front();
+  kernel::LiteKernel *c4_kernel = transpose_kernel->out_kernels().front();
+  kernel::LiteKernel *transpose2_kernel = c4_kernel->out_kernels().front();
+  std::vector<kernel::LiteKernel *> end_kernels = transpose2_kernel->out_kernels();
 
   /* tensor */
-  Tensor *transpose_param_tensor = traspose_kernel->in_tensors().at(1);
-  VectorErase(tensors, transpose_param_tensor);
-  delete transpose_param_tensor;
-  transpose_param_tensor = nullptr;
+  {
+    /* transpose_kernel */
+    Tensor *transpose_param_tensor = transpose_kernel->in_tensors().at(1);
+    VectorSetNull(tensors, transpose_param_tensor);
+    delete transpose_param_tensor;
+    transpose_param_tensor = nullptr;
 
-  Tensor *conv_out_tensor = conv_kernel->out_tensors().front();
-  conv_out_tensor->set_format(NC4HW4);
-  Tensor *c4_input_tensor = c4_kernel->in_tensors().front();
-  c4_kernel->set_in_tensor(conv_out_tensor, 0);
-  VectorErase(tensors, c4_input_tensor);
-  delete c4_input_tensor;
-  c4_input_tensor = nullptr;
+    Tensor *conv_out_tensor = conv_kernel->out_tensors().front();
+    conv_out_tensor->set_format(NC4HW4);
+    Tensor *c4_input_tensor = c4_kernel->in_tensors().front();
+    c4_kernel->set_in_tensor(conv_out_tensor, 0);
+    VectorSetNull(tensors, c4_input_tensor);
+    delete c4_input_tensor;
+    c4_input_tensor = nullptr;
+  }
+  {
+    /* transpose2_kernel */
+    Tensor *transpose_param_tensor = transpose2_kernel->in_tensors().at(1);
+    VectorSetNull(tensors, transpose_param_tensor);
+    delete transpose_param_tensor;
+    transpose_param_tensor = nullptr;
+
+    Tensor *nwhc_tensor = c4_kernel->out_tensors().front();
+    nwhc_tensor->set_format(NHWC);
+    for (auto end : end_kernels) {
+      end->set_in_tensor(nwhc_tensor, 0);
+    }
+    Tensor *trans_out = transpose2_kernel->out_tensors().front();
+    VectorSetNull(tensors, trans_out);
+    delete trans_out;
+    trans_out = nullptr;
+  }
 
   /* kernel */
-  VectorErase(kernels, traspose_kernel);
-  delete traspose_kernel;
-  traspose_kernel = nullptr;
+  VectorErase(kernels, transpose_kernel);
+  delete transpose_kernel;
+  transpose_kernel = nullptr;
   conv_kernel->set_out_kernels({c4_kernel});
   c4_kernel->set_in_kernels({conv_kernel});
 
+  c4_kernel->set_out_kernels(transpose2_kernel->out_kernels());
+  for (auto end : end_kernels) {
+    end->set_in_kernels({c4_kernel});
+  }
+  VectorErase(kernels, transpose2_kernel);
+  delete transpose2_kernel;
+  transpose2_kernel = nullptr;
+
   return;
 }
 
@@ -60,27 +90,38 @@ bool Nc4hw4PassMatch(std::vector<kernel::LiteKernel *> *kernels, size_t index) {
     return false;
   }
 
-  kernel::LiteKernel *traspose_kernel = start_kernel->out_kernels().front();
-  if (start_kernel->type() != Nc4hw4FormatTransposeOp) {
+  kernel::LiteKernel *traspose_nhwc2nchw_kernel = start_kernel->out_kernels().front();
+  if (traspose_nhwc2nchw_kernel->type() != Nc4hw4FormatTransposeOp) {
     return false;
   }
-  if (traspose_kernel->out_kernels().size() != 1) {
+  if (traspose_nhwc2nchw_kernel->out_kernels().size() != 1) {
     return false;
   }
 
-  kernel::LiteKernel *end_kernel = traspose_kernel->out_kernels().front();
+  kernel::LiteKernel *end_kernel = traspose_nhwc2nchw_kernel->out_kernels().front();
   if (IsContain(Nc4hw4FormatInOpList, end_kernel->type()) == false) {
     return false;
   }
+  if (end_kernel->out_kernels().size() != 1) {
+    return false;
+  }
+
+  kernel::LiteKernel *transpose_nchw2nhwc_kernel = end_kernel->out_kernels().front();
+  if (transpose_nchw2nhwc_kernel->type() != Nc4hw4FormatTransposeOp) {
+    return false;
+  }
 
   /* double check ops topological sorted in kernel-list */
   auto start_iter = find(kernels->begin(), kernels->end(), start_kernel);
   auto start_index = std::distance(kernels->begin(), start_iter);
-  auto transpose_iter = find(kernels->begin(), kernels->end(), traspose_kernel);
-  auto transpose_index = std::distance(kernels->begin(), transpose_iter);
+  auto traspose_nhwc2nchw_iter = find(kernels->begin(), kernels->end(), traspose_nhwc2nchw_kernel);
+  auto traspose_nhwc2nchw_index = std::distance(kernels->begin(), traspose_nhwc2nchw_iter);
   auto end_iter = find(kernels->begin(), kernels->end(), end_kernel);
   auto end_index = std::distance(kernels->begin(), end_iter);
-  if (start_index > transpose_index || transpose_index > end_index) {
+  auto transpose_nchw2nhwc_iter = find(kernels->begin(), kernels->end(), transpose_nchw2nhwc_kernel);
+  auto transpose_nchw2nhwc_index = std::distance(kernels->begin(), transpose_nchw2nhwc_iter);
+  if (start_index > traspose_nhwc2nchw_index || traspose_nhwc2nchw_index > end_index ||
+      end_index > transpose_nchw2nhwc_index) {
     return false;
   }
 
@@ -88,31 +129,31 @@ bool Nc4hw4PassMatch(std::vector<kernel::LiteKernel *> *kernels, size_t index) {
 }
 
 bool Nc4hw4PassValid(const InnerContext *context, std::vector<kernel::LiteKernel *> *kernels) {
-  return false;
-
   if (context->IsGpuEnabled() || context->IsNpuEnabled()) {
     return false;
   }
 
   for (auto kernel : *kernels) {
-    if (kernel->op_parameter()->quant_type_ == schema::QuantType_AwareTraining ||
-        kernel->op_parameter()->quant_type_ == schema::QuantType_PostTraining) {
-      return false;
+    if (kernel->op_parameter() != nullptr) {
+      if (kernel->op_parameter()->quant_type_ == schema::QuantType_AwareTraining ||
+          kernel->op_parameter()->quant_type_ == schema::QuantType_PostTraining) {
+        return false;
+      }
     }
   }
-  return true;
+  return false;
 }
 
-void Nc4hw4Pass(std::vector<kernel::LiteKernel *> *kernels, std::vector<Tensor *> *tensors) {
+void Nc4hw4PassAct(std::vector<kernel::LiteKernel *> *kernels, std::vector<Tensor *> *tensors) {
   size_t kernel_size = kernels->size();
   size_t index = 0;
-  for (; index < kernel_size - 2; index++) {
+  for (; index + 3 < kernel_size; index++) {
     kernel::LiteKernel *kernel = kernels->at(index);
 
     if (kernel->subgraph_type() != kernel::kNotSubGraph) {
       kernel::SubGraphKernel *subgraph = reinterpret_cast<kernel::SubGraphKernel *>(kernel);
       std::vector<kernel::LiteKernel *> &particial_nodes = subgraph->nodes();
-      Nc4hw4Pass(&particial_nodes, tensors);
+      Nc4hw4PassAct(&particial_nodes, tensors);
     }
 
     if (Nc4hw4PassMatch(kernels, index)) {
@@ -123,4 +164,11 @@ void Nc4hw4Pass(std::vector<kernel::LiteKernel *> *kernels, std::vector<Tensor *
   }
   return;
 }
+
+void Nc4hw4Pass(const InnerContext *context, std::vector<kernel::LiteKernel *> *kernels,
+                std::vector<Tensor *> *tensors) {
+  if (Nc4hw4PassValid(context, kernels)) {
+    Nc4hw4PassAct(kernels, tensors);
+  }
+}
 }  // namespace mindspore::lite
diff --git a/mindspore/lite/src/runtime/runtime_pass.h b/mindspore/lite/src/runtime/runtime_pass.h
index 141c7d8e3c4..a12d050461c 100644
--- a/mindspore/lite/src/runtime/runtime_pass.h
+++ b/mindspore/lite/src/runtime/runtime_pass.h
@@ -17,6 +17,7 @@
 #ifndef MINDSPORE_LITE_SRC_RUNTIME_RUNTIME_PASS_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_RUNTIME_PASS_H_
 
+#ifndef RUNTIME_PASS_CLIP
 #include <vector>
 #include "src/lite_kernel.h"
 #include "src/sub_graph_kernel.h"
@@ -26,16 +27,15 @@
 namespace mindspore::lite {
 
 /* Nc4hw4 PASS
- * before  :  CONV --(nhwc)-- TRANSPOSE --(nhwc)-- OP
- * after   :  CONV --(nc4hw4)-- OP
+ * before  : --(nhwc)-- CONV --(nhwc)-- TRANSPOSE --(nchw)-- IN --(nchw)-- TRANSPOSE --(nhwc)--
+ * after   : --(nhwc)-- CONV --(nc4hw4)-- IN --(nhwc)--
  * */
 static const schema::PrimitiveType Nc4hw4FormatTransposeOp = schema::PrimitiveType_Transpose;
 static const std::vector<schema::PrimitiveType> Nc4hw4FormatOutOpList = {schema::PrimitiveType_Conv2DFusion};
-static const std::vector<schema::PrimitiveType> Nc4hw4FormatInOpList = {schema::PrimitiveType_InstanceNorm,
-                                                                        schema::PrimitiveType_PadFusion};
-bool Nc4hw4PassValid(const InnerContext *context, std::vector<kernel::LiteKernel *> *kernels);
-void Nc4hw4Pass(std::vector<kernel::LiteKernel *> *kernels, std::vector<Tensor *> *tensors);
+static const std::vector<schema::PrimitiveType> Nc4hw4FormatInOpList = {schema::PrimitiveType_InstanceNorm};
+void Nc4hw4Pass(const InnerContext *context, std::vector<kernel::LiteKernel *> *kernels,
+                std::vector<Tensor *> *tensors);
 
 }  // namespace mindspore::lite
-
+#endif
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_RUNTIME_PASS_H_
diff --git a/mindspore/lite/src/scheduler.cc b/mindspore/lite/src/scheduler.cc
index 6b6793d2d0c..1614c986dce 100644
--- a/mindspore/lite/src/scheduler.cc
+++ b/mindspore/lite/src/scheduler.cc
@@ -21,7 +21,9 @@
 #include <string>
 #include <vector>
 #include <algorithm>
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
 #include "src/tensorlist.h"
+#endif
 #include "include/errorcode.h"
 #include "src/common/graph_util.h"
 #include "src/common/utils.h"
@@ -32,19 +34,28 @@
 #include "src/ops/populate/populate_register.h"
 #include "src/common/version_manager.h"
 #include "src/common/prim_util.h"
+#include "src/lite_model.h"
 #include "src/common/tensor_util.h"
 #include "src/runtime/infer_manager.h"
+#ifndef RUNTIME_PASS_CLIP
 #include "src/runtime/runtime_pass.h"
+#endif
+#ifndef AUTO_PARALLEL_CLIP
 #include "src/sub_graph_split.h"
+#endif
+#ifndef WEIGHT_DECODE_CLIP
 #include "src/weight_decoder.h"
+#endif
 #include "src/runtime/kernel/arm/fp16/fp16_op_handler.h"
 #include "nnacl/nnacl_common.h"
 #if GPU_OPENCL
 #include "src/runtime/kernel/opencl/opencl_subgraph.h"
 #include "src/runtime/gpu/opencl/opencl_runtime.h"
 #endif
-#include "include/registry/kernel_interface.h"
+#include "include/registry/register_kernel_interface.h"
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
 #include "src/runtime/kernel/arm/base/partial_fusion.h"
+#endif
 
 namespace mindspore::lite {
 namespace {
@@ -67,10 +78,12 @@ int Scheduler::InitKernels(std::vector<kernel::LiteKernel *> dst_kernels) {
     return RET_OK;
   }
   for (auto kernel : dst_kernels) {
+#ifndef DELEGATE_CLIP
     // delegate graph kernel
     if (kernel->desc().delegate != nullptr) {
       continue;
     }
+#endif
     if (kernel->subgraph_type() == kernel::kNotSubGraph) {
       MS_LOG(ERROR) << "construct subgraph failed.";
       return RET_ERROR;
@@ -87,19 +100,7 @@ int Scheduler::InitKernels(std::vector<kernel::LiteKernel *> dst_kernels) {
   return RET_OK;
 }
 
-int Scheduler::Schedule(std::vector<kernel::LiteKernel *> *dst_kernels) {
-  if (dst_kernels == nullptr) {
-    return RET_ERROR;
-  }
-  if (src_model_ == nullptr) {
-    MS_LOG(ERROR) << "Input model is nullptr";
-    return RET_PARAM_INVALID;
-  }
-  if (src_model_->sub_graphs_.empty()) {
-    MS_LOG(ERROR) << "Model should have a subgraph at least";
-    return RET_PARAM_INVALID;
-  }
-
+int Scheduler::SchedulePreProcess() {
   this->graph_output_node_indexes_ = GetGraphOutputNodes(src_model_);
 
   int infershape_ret = InferSubGraphShape(kMainSubGraphIndex);
@@ -109,7 +110,7 @@ int Scheduler::Schedule(std::vector<kernel::LiteKernel *> *dst_kernels) {
   }
 
   if (context_->enable_parallel_ && infershape_ret != RET_INFER_INVALID) {
-#ifdef ENABLE_AUTO_PARALLEL
+#ifndef AUTO_PARALLEL_CLIP
     auto search_sub_graph =
       SearchSubGraph(context_, src_model_, src_tensors_, &op_parameters_, &graph_output_node_indexes_);
     search_sub_graph.SubGraphSplit();
@@ -118,31 +119,55 @@ int Scheduler::Schedule(std::vector<kernel::LiteKernel *> *dst_kernels) {
     return RET_NOT_SUPPORT;
 #endif
   }
+  return RET_OK;
+}
 
-  int ret = ScheduleGraphToKernels(dst_kernels);
+int Scheduler::Schedule(std::vector<kernel::LiteKernel *> *dst_kernels) {
+  int check_input_ret = CheckInputParam(dst_kernels);
+  if (check_input_ret != RET_OK) {
+    MS_LOG(ERROR) << "CheckInputParam failed! ret: " << check_input_ret;
+    return check_input_ret;
+  }
+
+  schema_version_ = reinterpret_cast<LiteModel *>(src_model_)->GetSchemaVersion();
+
+  int ret = SchedulePreProcess();
+  if (ret != RET_OK) {
+    return ret;
+  }
+
+  ret = ScheduleGraphToKernels(dst_kernels);
+  FreeOpParameters();
   op_parameters_.clear();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Schedule graph to kernels failed.";
     return ret;
   }
 
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
   SetSubgraphForPartialNode();
 #endif
+
   if (delegate_ != nullptr) {
+#ifndef DELEGATE_CLIP
     ret = ReplaceDelegateKernels(dst_kernels);
     if (ret != RET_OK) {
       MS_LOG(ERROR) << "Repalce delegate kernels failed.";
       return ret;
     }
-  }
-
-  if (Nc4hw4PassValid(context_, dst_kernels)) {
-    Nc4hw4Pass(dst_kernels, src_tensors_);
+#else
+    MS_LOG(ERROR) << unsupport_delegate_log;
+    return RET_ERROR;
+#endif
   }
 
   FindAllInoutKernels(*dst_kernels);
-#ifdef ENABLE_CONTROL_TENSORLIST
+
+#ifndef RUNTIME_PASS_CLIP
+  Nc4hw4Pass(context_, dst_kernels, src_tensors_);
+#endif
+
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
   if (IsControlFlowParttern(*dst_kernels)) {
     ret = ConstructControlFlowMainGraph(dst_kernels);
     if (ret != RET_OK) {
@@ -159,7 +184,7 @@ int Scheduler::Schedule(std::vector<kernel::LiteKernel *> *dst_kernels) {
       MS_LOG(ERROR) << "ConstructSubGraphs failed.";
       return ret;
     }
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
   }
 #endif
 
@@ -173,6 +198,22 @@ int Scheduler::Schedule(std::vector<kernel::LiteKernel *> *dst_kernels) {
   return RET_OK;
 }
 
+int Scheduler::CheckInputParam(std::vector<kernel::LiteKernel *> *dst_kernels) {
+  if (dst_kernels == nullptr) {
+    return RET_ERROR;
+  }
+  if (src_model_ == nullptr) {
+    MS_LOG(ERROR) << "Input model is nullptr";
+    return RET_PARAM_INVALID;
+  }
+  if (src_model_->sub_graphs_.empty()) {
+    MS_LOG(ERROR) << "Model should have a subgraph at least";
+    return RET_PARAM_INVALID;
+  }
+  return RET_OK;
+}
+
+#ifndef DELEGATE_CLIP
 int Scheduler::ReplaceDelegateKernels(std::vector<kernel::LiteKernel *> *dst_kernels) {
   std::vector<kernel::Kernel *> kernels;
   for (size_t i = 0; i < dst_kernels->size(); i++) {
@@ -181,7 +222,7 @@ int Scheduler::ReplaceDelegateKernels(std::vector<kernel::LiteKernel *> *dst_ker
 
   ms_inputs_ = LiteTensorsToMSTensors(inputs_);
   ms_outputs_ = LiteTensorsToMSTensors(outputs_);
-  auto schema_version = static_cast<SchemaVersion>(VersionManager::GetInstance()->GetSchemaVersion());
+  auto schema_version = static_cast<SchemaVersion>(schema_version_);
   DelegateModel *model =
     new (std::nothrow) DelegateModel(&kernels, ms_inputs_, ms_outputs_, primitives_, schema_version);
   if (model == nullptr) {
@@ -241,6 +282,7 @@ int Scheduler::ReplaceDelegateKernels(std::vector<kernel::LiteKernel *> *dst_ker
   delete model;
   return RET_OK;
 }
+#endif
 
 void Scheduler::FindNodeInoutTensors(const lite::Model::Node &node, std::vector<Tensor *> *inputs,
                                      std::vector<Tensor *> *outputs) {
@@ -265,21 +307,25 @@ int Scheduler::InferNodeShape(const lite::Model::Node *node) {
   std::vector<Tensor *> inputs;
   std::vector<Tensor *> outputs;
   FindNodeInoutTensors(*node, &inputs, &outputs);
-  auto ret = KernelInferShape(inputs, outputs, node->primitive_, context_->GetProviders());
+  int ret;
+#ifndef CUSTOM_KERNEL_REGISTRY_CLIP
+  ret = KernelInferShape(inputs, outputs, node->primitive_, context_->GetProviders(), schema_version_);
   if (ret != RET_NOT_SUPPORT) {
     return ret;
   }
+#endif
 
-  int schema_version = VersionManager::GetInstance()->GetSchemaVersion();
-  auto parame_gen =
-    PopulateRegistry::GetInstance()->GetParameterCreator(GetPrimitiveType(node->primitive_), schema_version);
+  auto parame_gen = PopulateRegistry::GetInstance()->GetParameterCreator(
+    GetPrimitiveType(node->primitive_, schema_version_), schema_version_);
   if (parame_gen == nullptr) {
     MS_LOG(ERROR) << "parameter generator is nullptr.";
+    FreeOpParameters();
     return RET_NULL_PTR;
   }
   auto parameter = parame_gen(primitive);
   if (parameter == nullptr) {
-    MS_LOG(ERROR) << "PopulateParameter return nullptr, type: " << PrimitiveTypeName(GetPrimitiveType(primitive));
+    MS_LOG(ERROR) << "PopulateParameter return nullptr, type: " << GetPrimitiveTypeName(primitive, schema_version_);
+    FreeOpParameters();
     return RET_ERROR;
   }
   parameter->quant_type_ = node->quant_type_;
@@ -292,7 +338,7 @@ int Scheduler::InferNodeShape(const lite::Model::Node *node) {
     op_parameters_[node->output_indices_.at(0)] = parameter;
   }
 
-  if (IsCallNode(primitive)) {
+  if (IsCallNode(primitive, schema_version_)) {
     return InferCallShape(node);
   }
   ret = KernelInferShape(inputs, outputs, parameter);
@@ -316,21 +362,28 @@ int Scheduler::InferNodeShape(const lite::Model::Node *node) {
     for (auto &output : outputs) {
       if (output->ElementsNum() >= MAX_MALLOC_SIZE / static_cast<int>(sizeof(int64_t))) {
         MS_LOG(ERROR) << "The size of output tensor is too big";
+        FreeOpParameters();
         return RET_ERROR;
       }
     }
   } else if (ret != RET_INFER_INVALID) {
-    for (auto &param : op_parameters_) {
-      free(param.second);
-      param.second = nullptr;
-      return RET_ERROR;
-    }
+    FreeOpParameters();
+    return RET_ERROR;
   }
   return ret;
 }
 
+void Scheduler::FreeOpParameters() {
+  for (auto &param : op_parameters_) {
+    if (param.second != nullptr) {
+      free(param.second);
+      param.second = nullptr;
+    }
+  }
+}
+
 int Scheduler::RestoreSubGraphInput(const lite::Model::Node *partial_node) {
-  auto subgraph_index = GetPartialGraphIndex(partial_node->primitive_);
+  auto subgraph_index = GetPartialGraphIndex(partial_node->primitive_, schema_version_);
   auto subgraph = src_model_->sub_graphs_.at(subgraph_index);
   for (size_t i = 0; i < subgraph->input_indices_.size(); ++i) {
     auto &subgraph_input = src_tensors_->at(subgraph->input_indices_[i]);
@@ -347,7 +400,7 @@ void CopyCommonTensor(Tensor *dst_tensor, Tensor *src_tensor) {
 }
 
 int Scheduler::CopyPartialShapeToSubGraph(const lite::Model::Node *partial_node) {
-  auto subgraph_index = GetPartialGraphIndex(partial_node->primitive_);
+  auto subgraph_index = GetPartialGraphIndex(partial_node->primitive_, schema_version_);
   auto subgraph = src_model_->sub_graphs_.at(subgraph_index);
   if (subgraph->input_indices_.size() != partial_node->input_indices_.size()) {
     MS_LOG(ERROR) << "partial node " << partial_node->name_ << " inputs size: " << partial_node->input_indices_.size()
@@ -376,12 +429,12 @@ int Scheduler::CopyPartialShapeToSubGraph(const lite::Model::Node *partial_node)
 int Scheduler::InferPartialShape(const lite::Model::Node *node) {
   MS_ASSERT(src_model_ != nullptr);
   MS_ASSERT(node != nullptr);
-  if (!IsPartialNode(node->primitive_)) {
+  if (!IsPartialNode(node->primitive_, schema_version_)) {
     MS_LOG(ERROR) << "Node is not a partial";
     return RET_PARAM_INVALID;
   }
   CopyPartialShapeToSubGraph(node);
-  int subgraph_index = GetPartialGraphIndex(node->primitive_);
+  int subgraph_index = GetPartialGraphIndex(node->primitive_, schema_version_);
   auto ret = InferSubGraphShape(subgraph_index);
   if (ret != RET_OK) {
     MS_LOG(WARNING) << "infer subgraph: " << subgraph_index << " failed, ret:" << ret;
@@ -395,7 +448,7 @@ Model::Node *Scheduler::NodeInputIsPartial(const lite::Model::Node *node) {
   MS_ASSERT(node != nullptr);
   for (auto &iter : src_model_->all_nodes_) {
     if (iter->output_indices_ == node->input_indices_) {
-      if (IsPartialNode(iter->primitive_)) {
+      if (IsPartialNode(iter->primitive_, schema_version_)) {
         return iter;
       } else {
         return nullptr;
@@ -408,7 +461,7 @@ Model::Node *Scheduler::NodeInputIsPartial(const lite::Model::Node *node) {
 int Scheduler::InferCallShape(const lite::Model::Node *node) {
   MS_ASSERT(src_model_ != nullptr);
   MS_ASSERT(node != nullptr);
-  if (!IsCallNode(node->primitive_)) {
+  if (!IsCallNode(node->primitive_, schema_version_)) {
     MS_LOG(ERROR) << "Node is not a call cnode";
     return RET_PARAM_INVALID;
   }
@@ -417,7 +470,7 @@ int Scheduler::InferCallShape(const lite::Model::Node *node) {
   if (partial_input) {
     return InferPartialShape(partial_input);
   }
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
   auto switch_input = NodeInputIsSwitch(node);
   if (switch_input) {
     return InferSwitchShape(switch_input);
@@ -442,14 +495,14 @@ int Scheduler::InferSubGraphShape(size_t subgraph_index) {
       MS_LOG(ERROR) << "Op " << node->name_ << " should exist in model!";
       return RET_ERROR;
     }
-    auto type = GetPrimitiveType(primitive);
     auto ret = InferNodeShape(node);
     if (ret == RET_INFER_INVALID) {
-      MS_LOG(INFO) << "InferShape interrupted, name: " << node->name_ << ", type: " << PrimitiveTypeName(type)
-                   << ", set infer flag to false.";
+      MS_LOG(INFO) << "InferShape interrupted, name: " << node->name_
+                   << ", type: " << GetPrimitiveTypeName(primitive, schema_version_) << ", set infer flag to false.";
       subgraph_infershape_ret = RET_INFER_INVALID;
     } else if (ret != RET_OK) {
-      MS_LOG(ERROR) << "InferShape failed, name: " << node->name_ << ", type: " << PrimitiveTypeName(type);
+      MS_LOG(ERROR) << "InferShape failed, name: " << node->name_
+                    << ", type: " << GetPrimitiveTypeName(primitive, schema_version_);
       return RET_INFER_ERR;
     }
   }
@@ -603,11 +656,14 @@ int Scheduler::FindCpuKernel(const std::vector<Tensor *> &in_tensors, const std:
     }
     cpu_desc.data_type = kNumberTypeFloat16;
   }
-  auto ret = WeightDecoder::DequantNode(op_parameter, in_tensors, kernel_data_type);
+  int ret;
+#ifndef WEIGHT_DECODE_CLIP
+  ret = WeightDecoder::DequantNode(op_parameter, in_tensors, kernel_data_type);
   if (ret != RET_OK) {
     MS_LOG(DEBUG) << "Dequant input tensors failed: " << ret;
     return RET_NOT_SUPPORT;
   }
+#endif
   std::map<Tensor *, Tensor *> restored_origin_tensors;
 
   ret = CastConstTensorsData(in_tensors, &restored_origin_tensors, kernel_data_type,
@@ -640,6 +696,7 @@ int Scheduler::FindCpuKernel(const std::vector<Tensor *> &in_tensors, const std:
   return ret;
 }
 
+#ifdef GPU_OPENCL
 int Scheduler::FindGpuKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
                              OpParameter *op_parameter, const kernel::KernelKey &desc, kernel::LiteKernel **kernel) {
   MS_ASSERT(op_parameter != nullptr);
@@ -650,13 +707,15 @@ int Scheduler::FindGpuKernel(const std::vector<Tensor *> &in_tensors, const std:
     if (desc.data_type == kNumberTypeFloat32 && context_->IsGpuFloat16Enabled()) {
       gpu_desc.data_type = kNumberTypeFloat16;
     }
-
+    int ret;
+#ifndef WEIGHT_DECODE_CLIP
     // weight dequant
-    auto ret = WeightDecoder::DequantNode(op_parameter, in_tensors, kNumberTypeFloat32);
+    ret = WeightDecoder::DequantNode(op_parameter, in_tensors, kNumberTypeFloat32);
     if (ret != RET_OK) {
       MS_LOG(DEBUG) << "Dequant input tensors failed: " << ret;
       return RET_NOT_SUPPORT;
     }
+#endif
     // we don't need to restore tensor for copy data
     ret = CopyConstTensorData(in_tensors, op_parameter->type_);
     if (ret != RET_OK) {
@@ -674,12 +733,14 @@ int Scheduler::FindGpuKernel(const std::vector<Tensor *> &in_tensors, const std:
   }
   return RET_NOT_SUPPORT;
 }
+#endif
 
+#ifndef CUSTOM_KERNEL_REGISTRY_CLIP
 int Scheduler::FindProviderKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
                                   const Model::Node *node, TypeId data_type, kernel::LiteKernel **kernel) {
   MS_ASSERT(kernel != nullptr);
   int ret = RET_NOT_SUPPORT;
-  auto prim_type = GetPrimitiveType(node->primitive_);
+  auto prim_type = GetPrimitiveType(node->primitive_, schema_version_);
   if (prim_type == schema::PrimitiveType_Custom) {
     for (auto &&device : context_->device_list_) {
       if (!device.provider_.empty() && !device.provider_device_.empty()) {
@@ -704,7 +765,7 @@ int Scheduler::FindProviderKernel(const std::vector<Tensor *> &in_tensors, const
   if (!context_->IsProviderEnabled()) {
     return ret;
   }
-  if (VersionManager::GetInstance()->GetSchemaVersion() == SCHEMA_V0) {
+  if (schema_version_ == SCHEMA_V0) {
     return ret;
   }
   for (auto &&device : context_->device_list_) {
@@ -721,6 +782,7 @@ int Scheduler::FindProviderKernel(const std::vector<Tensor *> &in_tensors, const
 
   return RET_NOT_SUPPORT;
 }
+#endif
 
 kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in_tensors,
                                                  const std::vector<Tensor *> &out_tensors, const Model::Node *node,
@@ -730,14 +792,17 @@ kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in
   TypeId data_type =
     (node->quant_type_ == schema::QuantType_QUANT_WEIGHT) ? kNumberTypeFloat32 : GetFirstFp32Fp16OrInt8Type(in_tensors);
   kernel::LiteKernel *kernel = nullptr;
-  int status = FindProviderKernel(in_tensors, out_tensors, node, data_type, &kernel);
+  int status;
+#ifndef CUSTOM_KERNEL_REGISTRY_CLIP
+  status = FindProviderKernel(in_tensors, out_tensors, node, data_type, &kernel);
   if (status == RET_OK && kernel != nullptr) {
     return kernel;
   }
+#endif
   MS_ASSERT(!node->output_indices_.empty());
   OpParameter *op_parameter = op_parameters_[node->output_indices_.at(0)];
   if (op_parameter == nullptr) {
-    MS_LOG(ERROR) << "Can not find OpParameter!type: " << PrimitiveTypeName(GetPrimitiveType(node->primitive_));
+    MS_LOG(ERROR) << "Can not find OpParameter!type: " << GetPrimitiveTypeName(node->primitive_, schema_version_);
     return nullptr;
   }
   int kernel_thread_count = op_parameter->thread_num_;
@@ -800,6 +865,8 @@ kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in
       if (!(ret == RET_INFER_INVALID || ret == RET_OK)) {
         MS_LOG(ERROR) << "Try repeat infer fail: " << node->name_;
       }
+    } else if (status == RET_NOT_SUPPORT) {
+      free(op_parameter);
     }
   }
   return nullptr;
@@ -809,7 +876,7 @@ namespace {
 kernel::SubGraphKernel *CreateSubGraphKernel(const std::vector<kernel::LiteKernel *> &kernels,
                                              const std::vector<lite::Tensor *> *in_tensors,
                                              const std::vector<lite::Tensor *> *out_tensors, kernel::SubGraphType type,
-                                             const InnerContext &context) {
+                                             const InnerContext &context, int schema_version) {
   if (type == kernel::kApuSubGraph) {
     return nullptr;
   }
@@ -880,6 +947,7 @@ kernel::SubGraphKernel *CreateSubGraphKernel(const std::vector<kernel::LiteKerne
     return nullptr;
   }
   sub_graph->set_context(&context);
+  sub_graph->SetSchemaVersion(schema_version);
   return sub_graph;
 }
 
@@ -923,10 +991,10 @@ kernel::LiteKernel *Scheduler::SchedulePartialToKernel(const lite::Model::Node *
   MS_ASSERT(src_node != nullptr);
   auto *primitive = src_node->primitive_;
   MS_ASSERT(primitive != nullptr);
-  if (!IsPartialNode(primitive)) {
+  if (!IsPartialNode(primitive, schema_version_)) {
     return nullptr;
   }
-  auto subgraph_index = GetPartialGraphIndex(src_node->primitive_);
+  auto subgraph_index = GetPartialGraphIndex(src_node->primitive_, schema_version_);
   auto subgraph_kernel = SchedulePartialToSubGraphKernel(subgraph_index);
   subgraph_kernel->set_name("subgraph_" + std::to_string(subgraph_index));
   return subgraph_kernel;
@@ -945,7 +1013,7 @@ int Scheduler::SubGraphPreferDataType(const int &subgraph_index, TypeId *prefer_
     MS_ASSERT(!node->output_indices_.empty());
     OpParameter *op_parameter = op_parameters_[node->output_indices_.at(0)];
     if (op_parameter == nullptr) {
-      MS_LOG(ERROR) << "Can not find OpParameter!type: " << PrimitiveTypeName(GetPrimitiveType(node->primitive_));
+      MS_LOG(ERROR) << "Can not find OpParameter!type: " << GetPrimitiveTypeName(node->primitive_, schema_version_);
       return RET_ERROR;
     }
     kernel::KernelKey desc{kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat16,
@@ -1001,9 +1069,13 @@ kernel::LiteKernel *Scheduler::SchedulePartialToSubGraphKernel(const int &subgra
     return {};
   }
   FindAllInoutKernels(kernels);
-  auto cur_sub_graph_type = GetKernelSubGraphType(kernels.front(), *context_, true);
+  kernel::SubGraphType cur_sub_graph_type = kernel::kCpuFP32SubGraph;
+  if (!kernels.empty()) {
+    cur_sub_graph_type = GetKernelSubGraphType(kernels.front(), *context_, true);
+  }
   MS_LOG(INFO) << "cur_sub_graph_type: " << cur_sub_graph_type;
-  auto subgraph_kernel = CreateSubGraphKernel(kernels, &in_tensors, &out_tensors, cur_sub_graph_type, *context_);
+  auto subgraph_kernel =
+    CreateSubGraphKernel(kernels, &in_tensors, &out_tensors, cur_sub_graph_type, *context_, schema_version_);
   if (subgraph_kernel == nullptr) {
     MS_LOG(ERROR) << "CreateSubGraphKernel failed, cur_sub_graph_type: " << cur_sub_graph_type;
     return nullptr;
@@ -1028,11 +1100,13 @@ std::vector<kernel::LiteKernel *> Scheduler::ScheduleSubGraphToSubGraphKernels(c
 kernel::LiteKernel *Scheduler::ScheduleNodeToKernel(const lite::Model::Node *src_node, TypeId prefer_data_type) {
   std::vector<Tensor *> inputs;
   std::vector<Tensor *> outputs;
+  MS_ASSERT(src_node != nullptr);
   FindNodeInoutTensors(*src_node, &inputs, &outputs);
   auto *kernel = this->FindBackendKernel(inputs, outputs, src_node, prefer_data_type);
+  op_parameters_[src_node->output_indices_.at(0)] = nullptr;
   if (kernel == nullptr) {
     MS_LOG(ERROR) << "FindBackendKernel return nullptr, name: " << src_node->name_
-                  << ", type: " << PrimitiveTypeName(GetPrimitiveType(src_node->primitive_));
+                  << ", type: " << GetPrimitiveTypeName(src_node->primitive_, schema_version_);
     return nullptr;
   }
   SetKernelTensorDataType(kernel);
@@ -1051,9 +1125,9 @@ bool Scheduler::IsControlFlowPattern(const lite::Model::Node &partial_node) {
     }
   }
 
-  return partial_node_output == nullptr
-           ? false
-           : (IsCallNode(partial_node_output->primitive_) || IsSwitchNode(partial_node_output->primitive_));
+  return partial_node_output == nullptr ? false
+                                        : (IsCallNode(partial_node_output->primitive_, schema_version_) ||
+                                           IsSwitchNode(partial_node_output->primitive_, schema_version_));
 }
 
 int Scheduler::ScheduleGraphToKernels(std::vector<kernel::LiteKernel *> *dst_kernels, TypeId prefer_data_type) {
@@ -1087,13 +1161,12 @@ int Scheduler::ScheduleSubGraphToKernels(size_t subgraph_index, std::vector<kern
     auto *primitive = node->primitive_;
     MS_ASSERT(primitive != nullptr);
     kernel::LiteKernel *kernel = nullptr;
-    auto prim_type = GetPrimitiveType(primitive);
 
-    if (IsPartialNode(primitive)) {
+    if (IsPartialNode(primitive, schema_version_)) {
       if (IsControlFlowPattern(*node)) {
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
         kernel = ScheduleNodeToKernel(node, prefer_data_type);
-        auto partial_subgraph_index = GetPartialGraphIndex(primitive);
+        auto partial_subgraph_index = GetPartialGraphIndex(primitive, schema_version_);
         if (SubGraphHasScheduled(partial_subgraph_index)) {
           partial_kernel_subgraph_index_map_[kernel] = partial_subgraph_index;
           MS_LOG(INFO) << "subgraph has scheduled. ";
@@ -1103,7 +1176,7 @@ int Scheduler::ScheduleSubGraphToKernels(size_t subgraph_index, std::vector<kern
           subgraphs_to_schedule_.push_back(partial_subgraph_index);
         }
 #else
-        MS_LOG(ERROR) << unsupport_control_tensorlist_log;
+        MS_LOG(ERROR) << unsupport_controlflow_tensorlist_log;
         return RET_ERROR;
 #endif
       } else {
@@ -1114,7 +1187,7 @@ int Scheduler::ScheduleSubGraphToKernels(size_t subgraph_index, std::vector<kern
     }
     if (kernel == nullptr || ret != RET_OK) {
       MS_LOG(ERROR) << "FindBackendKernel return nullptr, name: " << node->name_
-                    << ", type: " << PrimitiveTypeName(prim_type);
+                    << ", type: " << GetPrimitiveTypeName(primitive, schema_version_);
       return RET_ERROR;
     }
     kernel->set_is_model_output(IsContain(graph_output_node_indexes_, size_t(node_index)));
@@ -1166,7 +1239,7 @@ bool KernelFitCurrentSubGraph(const kernel::SubGraphType subgraph_type, const ke
 }
 
 kernel::LiteKernel *FindAllSubGraphKernels(const std::vector<kernel::LiteKernel *> &sorted_kernels,
-                                           const InnerContext &context, size_t *cur_index) {
+                                           const InnerContext &context, size_t *cur_index, int schema_version) {
   std::vector<kernel::LiteKernel *> sub_kernels;
   sub_kernels.emplace_back(sorted_kernels[*cur_index]);
   auto cur_sub_graph_type = GetKernelSubGraphType(sorted_kernels[*cur_index], context);
@@ -1174,17 +1247,20 @@ kernel::LiteKernel *FindAllSubGraphKernels(const std::vector<kernel::LiteKernel
     auto cur_kernel = sorted_kernels[*cur_index];
     MS_ASSERT(GetKernelSubGraphType(cur_kernel, context) != kernel::kApuSubGraph);
     // already a subgraph or a delegate
-    if (cur_kernel->subgraph_type() != kernel::kNotSubGraph || cur_kernel->desc().delegate != nullptr) {
+#ifndef DELEGATE_CLIP
+    if (cur_kernel->desc().delegate != nullptr) {
       --(*cur_index);
       break;
     }
-    if (!KernelFitCurrentSubGraph(cur_sub_graph_type, *cur_kernel)) {
+#endif
+    if (cur_kernel->subgraph_type() != kernel::kNotSubGraph ||
+        !KernelFitCurrentSubGraph(cur_sub_graph_type, *cur_kernel)) {
       --(*cur_index);
       break;
     }
     sub_kernels.emplace_back(cur_kernel);
   }
-  return CreateSubGraphKernel(sub_kernels, nullptr, nullptr, cur_sub_graph_type, context);
+  return CreateSubGraphKernel(sub_kernels, nullptr, nullptr, cur_sub_graph_type, context, schema_version);
 }
 }  // namespace
 
@@ -1201,12 +1277,18 @@ int Scheduler::ConstructSubGraphs(std::vector<kernel::LiteKernel *> src_kernel,
     MS_ASSERT(cur_kernel != nullptr);
     // Not support APU now
     MS_ASSERT(GetKernelSubGraphType(cur_kernel, *context_) != kernel::kApuSubGraph);
-    // already a subgraph or a delegate
-    if (cur_kernel->subgraph_type() != kernel::kNotSubGraph || cur_kernel->desc().delegate != nullptr) {
+#ifndef DELEGATE_CLIP
+    if (cur_kernel->desc().delegate != nullptr) {
       dst_kernel->emplace_back(cur_kernel);
       continue;
     }
-    auto subgraph = FindAllSubGraphKernels(src_kernel, *context_, &index);
+#endif
+    // already a subgraph or a delegate
+    if (cur_kernel->subgraph_type() != kernel::kNotSubGraph) {
+      dst_kernel->emplace_back(cur_kernel);
+      continue;
+    }
+    auto subgraph = FindAllSubGraphKernels(src_kernel, *context_, &index, schema_version_);
     if (subgraph == nullptr) {
       MS_LOG(ERROR) << "Create SubGraphKernel failed";
       return RET_ERROR;
@@ -1214,14 +1296,18 @@ int Scheduler::ConstructSubGraphs(std::vector<kernel::LiteKernel *> src_kernel,
     dst_kernel->emplace_back(subgraph);
   }
   for (auto *subgraph : *dst_kernel) {
+#ifndef DELEGATE_CLIP
     auto subgraph_delegate = subgraph->desc().delegate;
     if (subgraph_delegate == nullptr) {
+#endif
       auto ret = subgraph->Init();
       if (ret != RET_OK) {
         MS_LOG(ERROR) << "Init SubGraph failed: " << ret;
         return ret;
       }
+#ifndef DELEGATE_CLIP
     }
+#endif
   }
   return RET_OK;
 }
@@ -1232,7 +1318,7 @@ TypeId Scheduler::GetFirstFp32Fp16OrInt8Type(const std::vector<Tensor *> &in_ten
     if (dtype == kObjectTypeString) {
       return kNumberTypeFloat32;
     }
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
     if (dtype == kObjectTypeTensorType) {
       auto tensor_list = reinterpret_cast<TensorList *>(tensor);
       auto tensor_list_dtype = tensor_list->tensors_data_type();
@@ -1317,11 +1403,11 @@ kernel::SubGraphType Scheduler::PartialSubGraphType(const std::vector<kernel::Li
   return kernel::kCpuFP32SubGraph;
 }
 
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
 int Scheduler::InferSwitchShape(const lite::Model::Node *switch_node) {
   MS_ASSERT(src_model_ != nullptr);
   MS_ASSERT(switch_node != nullptr);
-  if (!IsSwitchNode(switch_node->primitive_)) {
+  if (!IsSwitchNode(switch_node->primitive_, schema_version_)) {
     MS_LOG(ERROR) << "Node is not a switch";
     return RET_PARAM_INVALID;
   }
@@ -1331,7 +1417,8 @@ int Scheduler::InferSwitchShape(const lite::Model::Node *switch_node) {
   for (auto &node : src_model_->all_nodes_) {
     if ((IsContain(node->output_indices_, true_branch_output_index) ||
          IsContain(node->output_indices_, false_branch_output_index)) &&
-        IsPartialNode(node->primitive_) && partial_cnode_inferred_.find(node) == partial_cnode_inferred_.end()) {
+        IsPartialNode(node->primitive_, schema_version_) &&
+        partial_cnode_inferred_.find(node) == partial_cnode_inferred_.end()) {
       partial_cnode_inferred_.insert(node);
       partial_cnode_to_infer.push_back(node);
     }
@@ -1353,7 +1440,7 @@ Model::Node *Scheduler::NodeInputIsSwitch(const lite::Model::Node *node) {
   MS_ASSERT(node != nullptr);
   for (auto &iter : src_model_->all_nodes_) {
     if (iter->output_indices_ == node->input_indices_) {
-      if (IsSwitchNode(iter->primitive_)) {
+      if (IsSwitchNode(iter->primitive_, schema_version_)) {
         return iter;
       } else {
         return nullptr;
@@ -1369,6 +1456,7 @@ bool Scheduler::SubGraphHasScheduled(const int &index) {
 
 void Scheduler::SubGraphMarkScheduled(const int &index) { scheduled_subgraph_index_.insert(index); }
 
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
 void Scheduler::SetSubgraphForPartialNode() {
   for (auto &pair : partial_kernel_subgraph_index_map_) {
     auto &partial_kernel = pair.first;
@@ -1377,6 +1465,7 @@ void Scheduler::SetSubgraphForPartialNode() {
       ->set_subgraph_kernel(subgraph_index_subgraph_kernel_map_.at(subgraph_index));
   }
 }
+#endif
 
 void CopyTensorList(TensorList *dst_tensor, TensorList *src_tensor) {
   dst_tensor->set_data_type(src_tensor->data_type());
@@ -1415,7 +1504,8 @@ int Scheduler::ConstructControlFlowMainGraph(std::vector<kernel::LiteKernel *> *
     }
   }
   auto cur_subgraph_type = PartialSubGraphType(main_graph_kernels);
-  auto subgraph_kernel = CreateSubGraphKernel(main_graph_kernels, nullptr, nullptr, cur_subgraph_type, *context_);
+  auto subgraph_kernel =
+    CreateSubGraphKernel(main_graph_kernels, nullptr, nullptr, cur_subgraph_type, *context_, schema_version_);
   if (subgraph_kernel == nullptr) {
     MS_LOG(ERROR) << "create main graph for control flow model failed.";
     return RET_ERROR;
diff --git a/mindspore/lite/src/scheduler.h b/mindspore/lite/src/scheduler.h
index 077e1d65836..637fcba2c69 100644
--- a/mindspore/lite/src/scheduler.h
+++ b/mindspore/lite/src/scheduler.h
@@ -28,10 +28,12 @@
 #include "src/inner_context.h"
 #include "include/model.h"
 #include "src/scheduler_cb.h"
-
+#ifndef DELEGATE_CLIP
 #include "include/api/delegate.h"
+#endif
 
 namespace mindspore::lite {
+constexpr int kDefaultDeviceType = -1;
 const constexpr int kSwitchTrueBranch = 1;
 const constexpr int kSwitchFalseBranch = 2;
 class Scheduler {
@@ -53,11 +55,14 @@ class Scheduler {
   void SetupSchedulerCb(std::unique_ptr<SchedulerCb> cb) { sched_cb_ = std::move(cb); }
 
  private:
+  int SchedulePreProcess();
+  int CheckInputParam(std::vector<kernel::LiteKernel *> *dst_kernels);
   void FindNodeInoutTensors(const Model::Node &node, std::vector<Tensor *> *inputs, std::vector<Tensor *> *outputs);
   Model::Node *NodeInputIsPartial(const Model::Node *node);
   int InferPartialShape(const Model::Node *node);
   int InferCallShape(const Model::Node *node);
   int InferNodeShape(const Model::Node *node);
+  void FreeOpParameters();
   int InferSubGraphShape(size_t subgraph_index);
   // schedule a node to kernel according to context and kernels registered
   kernel::LiteKernel *FindBackendKernel(const std::vector<Tensor *> &in_tensors,
@@ -66,8 +71,10 @@ class Scheduler {
   int FindCpuKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
                     OpParameter *op_parameter, const kernel::KernelKey &desc, TypeId kernel_data_type,
                     kernel::LiteKernel **kernel);
+#ifdef GPU_OPENCL
   int FindGpuKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
                     OpParameter *op_parameter, const kernel::KernelKey &desc, kernel::LiteKernel **kernel);
+#endif
   int FindProviderKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
                          const Model::Node *node, TypeId data_type, kernel::LiteKernel **kernel);
 
@@ -102,7 +109,7 @@ class Scheduler {
 
   bool IsControlFlowPattern(const lite::Model::Node &partial_node);
   int SubGraphPreferDataType(const int &subgraph_index, TypeId *prefer_data_type);
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
   int InferSwitchShape(const Model::Node *node);
   Model::Node *NodeInputIsSwitch(const Model::Node *node);
   bool SubGraphHasScheduled(const int &index);
@@ -129,11 +136,12 @@ class Scheduler {
   std::shared_ptr<Delegate> delegate_ = nullptr;
   std::deque<int> subgraphs_to_schedule_{};
   std::unordered_map<size_t, kernel::LiteKernel *> subgraph_index_subgraph_kernel_map_{};
-#ifdef ENABLE_CONTROL_TENSORLIST
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
   std::set<int> scheduled_subgraph_index_{};
   std::unordered_map<kernel::LiteKernel *, size_t> partial_kernel_subgraph_index_map_{};
   std::set<lite::Model::Node *> partial_cnode_inferred_{};
 #endif
+  int schema_version_ = SCHEMA_VERSION::SCHEMA_CUR;
 };
 }  // namespace mindspore::lite
 
diff --git a/mindspore/lite/src/sub_graph_kernel.cc b/mindspore/lite/src/sub_graph_kernel.cc
index 4e8b7637238..b473b3359f0 100644
--- a/mindspore/lite/src/sub_graph_kernel.cc
+++ b/mindspore/lite/src/sub_graph_kernel.cc
@@ -16,7 +16,9 @@
 
 #include "src/sub_graph_kernel.h"
 #include "src/tensor.h"
+#ifndef CONTROLFLOW_TENSORLIST_CLIP
 #include "src/tensorlist.h"
+#endif
 #ifdef ENABLE_FP16
 #include "src/runtime/kernel/arm/fp16/fp16_op_handler.h"
 #endif
@@ -102,17 +104,21 @@ int SubGraphKernel::ReSize() {
     for (auto &output : outputs) {
       output->FreeData();
     }
-    auto ret =
-      lite::KernelInferShape(inputs, outputs, kernel->kernel()->primitive(), kernel->Context()->GetProviders());
+    int ret;
+#ifndef CUSTOM_KERNEL_REGISTRY_CLIP
+    ret = lite::KernelInferShape(inputs, outputs, kernel->kernel()->primitive(), kernel->Context()->GetProviders(),
+                                 schema_version_);
     if (ret == lite::RET_NOT_SUPPORT) {
+#endif
       auto parameter = kernel->op_parameter();
       if (parameter == nullptr) {
         MS_LOG(ERROR) << "kernel(" << kernel->name() << ")'s op_parameter is nullptr!";
         return RET_ERROR;
       }
       ret = lite::KernelInferShape(inputs, outputs, parameter);
+#ifndef CUSTOM_KERNEL_REGISTRY_CLIP
     }
-
+#endif
     if (ret == RET_INFER_INVALID) {
       MS_LOG(INFO) << "InferShape shouldn't be done before runtime, type:"
                    << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(kernel->type()))
diff --git a/mindspore/lite/src/sub_graph_kernel.h b/mindspore/lite/src/sub_graph_kernel.h
index 647c1a075ef..59cbae41a28 100644
--- a/mindspore/lite/src/sub_graph_kernel.h
+++ b/mindspore/lite/src/sub_graph_kernel.h
@@ -26,6 +26,7 @@
 #include "src/lite_kernel.h"
 #include "src/executor.h"
 #include "src/common/log_adapter.h"
+#include "src/common/version_manager.h"
 #include "src/cpu_info.h"
 #ifdef ENABLE_ARM64
 #include "src/common/utils.h"
@@ -117,6 +118,8 @@ class SubGraphKernel : public LiteKernel {
 
   std::vector<LiteKernel *> out_nodes() { return this->out_nodes_; }
 
+  void SetSchemaVersion(int schema_version) { schema_version_ = schema_version; }
+
  protected:
   std::vector<LiteKernel *> nodes_{};
   // entry nodes in nodes
@@ -124,6 +127,7 @@ class SubGraphKernel : public LiteKernel {
   // exit nodes in nodes
   std::vector<LiteKernel *> out_nodes_{};
   mindspore::lite::Executor *executor_ = nullptr;
+  int schema_version_ = lite::SCHEMA_VERSION::SCHEMA_CUR;
 };
 
 class CpuSubGraph : public SubGraphKernel {
diff --git a/mindspore/lite/src/sub_graph_split.cc b/mindspore/lite/src/sub_graph_split.cc
index 5ec49eddb9b..07968cca756 100644
--- a/mindspore/lite/src/sub_graph_split.cc
+++ b/mindspore/lite/src/sub_graph_split.cc
@@ -24,12 +24,10 @@
 #include "schema/ops_generated.h"
 #include "schema/model_generated.h"
 #include "src/ops/populate/populate_register.h"
-#include "nnacl/fp32/winograd_utils.h"
+#include "src/scheduler.h"
 #include "nnacl/pooling_parameter.h"
 #include "include/model.h"
-#if defined(ENABLE_ARM) || (defined(ENABLE_SSE) && !defined(ENABLE_AVX))
-#include "nnacl/fp32/conv_depthwise_fp32.h"
-#endif
+#include "nnacl/base/conv_common_base.h"
 
 namespace mindspore::lite {
 size_t CommConvMul(std::vector<int> weight_shape, std::vector<int> output_shape) {
@@ -58,7 +56,7 @@ bool IsOfflineParallelNode(const void *node_primitive, int node_device_type) {
   if (node_primitive == nullptr) {
     return false;
   }
-  return (GetPrimitiveType(node_primitive) == schema::PrimitiveType_Conv2DFusion) &&
+  return (GetPrimitiveType(node_primitive, SCHEMA_VERSION::SCHEMA_CUR) == schema::PrimitiveType_Conv2DFusion) &&
          (node_device_type != kDefaultDeviceType);
 }
 
@@ -97,7 +95,7 @@ bool SearchSubGraph::CheckIsParallelSubGraph(const std::vector<Subgraph> &subgra
         continue;
       }
       auto input_node_index = tensors_.at(input).out_nodes_.front();
-      if (GetPrimitiveType(model_->all_nodes_.at(input_node_index)->primitive_) !=
+      if (GetPrimitiveType(model_->all_nodes_.at(input_node_index)->primitive_, SCHEMA_VERSION::SCHEMA_CUR) !=
           schema::PrimitiveType_SplitWithOverlap) {
         return false;
       }
@@ -109,7 +107,8 @@ bool SearchSubGraph::CheckIsParallelSubGraph(const std::vector<Subgraph> &subgra
         continue;
       }
       auto output_node_index = tensors_.at(output).in_nodes_.front();
-      if (GetPrimitiveType(model_->all_nodes_.at(output_node_index)->primitive_) != schema::PrimitiveType_Concat) {
+      if (GetPrimitiveType(model_->all_nodes_.at(output_node_index)->primitive_, SCHEMA_VERSION::SCHEMA_CUR) !=
+          schema::PrimitiveType_Concat) {
         return false;
       }
     }
@@ -348,7 +347,7 @@ void SearchSubGraph::SearchMultyInNodes(std::vector<uint32_t> *multy_in_nodes) {
     uint32_t node_index = all_main_sub_nodes[i];
     Model::Node *node = node_list_[node_index];
 
-    if (IsPartialNode(node->primitive_)) {
+    if (IsPartialNode(node->primitive_, model_->GetSchemaVersion())) {
       continue;
     }
     int input_count = std::count_if(node->input_indices_.begin(), node->input_indices_.end(),
@@ -774,7 +773,7 @@ void SearchSubGraph::CalculateCostModel(std::vector<Subgraph> *sub_graphs) {
       cost.mul_cost_ = 1;
 
       Model::Node *node = model_->all_nodes_[node_index];
-      if (GetPrimitiveType(node->primitive_) == schema::PrimitiveType_Conv2DFusion) {
+      if (GetPrimitiveType(node->primitive_, SCHEMA_VERSION::SCHEMA_CUR) == schema::PrimitiveType_Conv2DFusion) {
         cost = CalculateConv2DFusion(node);
       }
 
@@ -853,7 +852,7 @@ void SearchSubGraph::SubGraphSplitByOffLineParallel() {
 
   for (uint32_t node_index : multy_in_nodes) {
     Model::Node *node = node_list_[node_index];
-    if (GetPrimitiveType(node->primitive_) != schema::PrimitiveType_Concat) {
+    if (GetPrimitiveType(node->primitive_, SCHEMA_VERSION::SCHEMA_CUR) != schema::PrimitiveType_Concat) {
       continue;
     }
     std::vector<Subgraph> node_subs;
@@ -1041,6 +1040,9 @@ bool SearchSubGraph::ValidInParallel() {
   if (model_->sub_graphs_.size() > 1) {
     return false;
   }
+  if (model_->GetSchemaVersion() != SCHEMA_VERSION::SCHEMA_CUR) {
+    return false;
+  }
   return true;
 }
 
diff --git a/mindspore/lite/src/sub_graph_split.h b/mindspore/lite/src/sub_graph_split.h
index 34ba3ca594f..baacf604607 100644
--- a/mindspore/lite/src/sub_graph_split.h
+++ b/mindspore/lite/src/sub_graph_split.h
@@ -30,7 +30,6 @@
 #include "nnacl/conv_parameter.h"
 
 namespace mindspore::lite {
-constexpr int kDefaultDeviceType = -1;
 constexpr int kDefaultSubGraphSize = 2;
 constexpr int kDefaultFirstSubgraph = 0;
 constexpr int kDefaultSecondSubgraph = 1;
diff --git a/mindspore/lite/src/tensorlist.h b/mindspore/lite/src/tensorlist.h
index e2474eb4d76..85da3bfab52 100644
--- a/mindspore/lite/src/tensorlist.h
+++ b/mindspore/lite/src/tensorlist.h
@@ -24,7 +24,7 @@
 #include "src/common/log_adapter.h"
 #include "schema/model_generated.h"
 #include "src/tensor.h"
-#ifdef ENABLE_CONTROL_TENSORLIST
+
 namespace mindspore::lite {
 /**
  * Tensorlist is a container of vector, in which each element is a tensor object.
@@ -177,5 +177,4 @@ class TensorList : public Tensor {
   int max_elements_num_ = -1;
 };
 }  // namespace mindspore::lite
-#endif
 #endif  // MINDSPORE_LITE_SRC_TENSORLIST_H_
diff --git a/mindspore/lite/src/train/train_session.cc b/mindspore/lite/src/train/train_session.cc
index d6601eafcde..8a47ca6ab4f 100644
--- a/mindspore/lite/src/train/train_session.cc
+++ b/mindspore/lite/src/train/train_session.cc
@@ -52,17 +52,9 @@ const char *kOptimizerName = "optimizer";
 TrainSession::TrainSession() {
   is_train_session_ = true;
   InitCallBack();
-#ifdef ENABLE_V0
-  if (VersionManager::GetInstance()->CheckV0Schema()) {
-    kernel::PopulateTrainV0Parameters();
-  }
-#endif
-  if (!VersionManager::GetInstance()->CheckV0Schema()) {
-    kernel::PopulateTrainParameters();
-  }
 }
 
-int TrainSession::Init(const Context *context, const TrainCfg *train_cfg) {
+int TrainSession::Init(InnerContext *context, const TrainCfg *train_cfg) {
   if (train_cfg != nullptr) {
     if (train_cfg->mix_precision_cfg_.loss_scale_ <= 0) {
       MS_LOG(ERROR) << "illegal loss scale configuration";
@@ -114,7 +106,10 @@ int TrainSession::AllocWorkSpace() {
 }
 
 void TrainSession::FreeWorkSpace() {
-  free(workspace_);
+  if (workspace_ != nullptr) {
+    free(workspace_);
+    workspace_ = nullptr;
+  }
   for (auto kernel : this->train_kernels_) {
     static_cast<kernel::InnerKernel *>(kernel->kernel())->FreeWorkspace();
   }
@@ -125,7 +120,7 @@ int TrainSession::InitCallBack() {
     if (!context_->IsCpuFloat16Enabled()) {
       return false;
     }
-    auto node_type = GetPrimitiveType(node->primitive_);
+    auto node_type = GetPrimitiveType(node->primitive_, SCHEMA_VERSION::SCHEMA_CUR);
     if (node_type == schema::PrimitiveType_Cast) {
       return false;
     }
@@ -218,6 +213,15 @@ int TrainSession::CompileTrainGraph(std::shared_ptr<Model> model) {
     return RET_ERROR;
   }
 
+#ifdef ENABLE_V0
+  if (reinterpret_cast<LiteModel *>(model_.get())->GetSchemaVersion() == SCHEMA_VERSION::SCHEMA_V0) {
+    kernel::PopulateTrainV0Parameters();
+  }
+#endif
+  if (reinterpret_cast<LiteModel *>(model_.get())->GetSchemaVersion() == SCHEMA_VERSION::SCHEMA_CUR) {
+    kernel::PopulateTrainParameters();
+  }
+
   auto ret = lite::LiteSession::CompileGraph(model_.get());
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "failed to compile train model";
@@ -765,6 +769,30 @@ bool TrainSession::IsBN(kernel::LiteKernel *kernel) const {
           (kernel->type() == schema::PrimitiveType_FusedBatchNorm));
 }
 
+int TrainSession::Resize(const std::vector<tensor::MSTensor *> &inputs, const std::vector<std::vector<int>> &dims) {
+  FreeWorkSpace();
+  if (tensors_data_ != nullptr) {
+    free(tensors_data_);
+    tensors_data_ = nullptr;
+  }
+  auto ret = lite::LiteSession::Resize(inputs, dims);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "train resize input failed.";
+    return RET_ERROR;
+  }
+  ret = AllocWorkSpace();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "failed to allocate space";
+    return RET_ERROR;
+  }
+  ret = AllocTensors(train_kernels_);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "train alloc failed after resize.";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
 int TrainSession::Export(const std::string &file_name, ModelType model_type, QuantizationType quant_type,
                          FormatType format) {
   if (file_name.empty()) {
@@ -857,7 +885,9 @@ session::LiteSession *session::TrainSession::CreateTrainSession(const std::strin
       MS_LOG(ERROR) << " cannot convert to static allocation";
     }
   }
-  auto ret = session->Init(context, cfg);
+
+  mindspore::lite::InnerContext *inner_context = new (std::nothrow) mindspore::lite::InnerContext(context);
+  auto ret = session->Init(inner_context, cfg);
   if (ret != mindspore::lite::RET_OK) {
     MS_LOG(ERROR) << "init session failed";
     return nullptr;
diff --git a/mindspore/lite/src/train/train_session.h b/mindspore/lite/src/train/train_session.h
index 257d29180cc..ee7f2863ef5 100644
--- a/mindspore/lite/src/train/train_session.h
+++ b/mindspore/lite/src/train/train_session.h
@@ -54,7 +54,7 @@ class TrainSession : virtual public lite::LiteSession {
   int CompileGraph(lite::Model *model) override;
   virtual int CompileTrainGraph(std::shared_ptr<Model> model);
 
-  virtual int Init(const Context *context, const TrainCfg *train_cfg);
+  virtual int Init(InnerContext *context, const TrainCfg *train_cfg);
 
   int Train() override;
   int Eval() override;
@@ -80,9 +80,7 @@ class TrainSession : virtual public lite::LiteSession {
   mindspore::tensor::MSTensor *GetOutputByTensorName(const std::string &tensor_name) const override {
     return lite::LiteSession::GetOutputByTensorName(tensor_name);
   }
-  int Resize(const std::vector<tensor::MSTensor *> &inputs, const std::vector<std::vector<int>> &dims) override {
-    return lite::LiteSession::Resize(inputs, dims);
-  }
+  int Resize(const std::vector<tensor::MSTensor *> &inputs, const std::vector<std::vector<int>> &dims) override;
 
   std::vector<tensor::MSTensor *> GetPredictions() const override {
     std::vector<tensor::MSTensor *> outputs;
diff --git a/mindspore/lite/src/train/transfer_session.cc b/mindspore/lite/src/train/transfer_session.cc
index 8eb1d1d2b4f..4234d7a6fae 100644
--- a/mindspore/lite/src/train/transfer_session.cc
+++ b/mindspore/lite/src/train/transfer_session.cc
@@ -248,7 +248,8 @@ static session::LiteSession *CreateTransferSessionInt(const char *model_buf_back
     return nullptr;
   }
 
-  auto ret = session->Init(context, cfg);
+  mindspore::lite::InnerContext *inner_context = new (std::nothrow) mindspore::lite::InnerContext(context);
+  auto ret = session->Init(inner_context, cfg);
   if (ret != lite::RET_OK) {
     MS_LOG(ERROR) << "init transfer session failed";
     delete session;
diff --git a/mindspore/lite/src/weight_decoder.cc b/mindspore/lite/src/weight_decoder.cc
index 589d2284454..7decd5a69f3 100644
--- a/mindspore/lite/src/weight_decoder.cc
+++ b/mindspore/lite/src/weight_decoder.cc
@@ -235,7 +235,6 @@ int WeightDecoder::DequantWeight(lite::Tensor *input_tensor, bool channel_first,
   return RET_OK;
 }
 
-#ifdef ENABLE_HUFFMAN_DECODE
 int WeightDecoder::DecodeHuffmanCode(const schema::Tensor &src_tensor, lite::Tensor *dst_tensor) {
   MS_ASSERT(dst_tensor != nullptr);
   if (!dst_tensor->IsConst() || !src_tensor.enableHuffmanCode()) {
@@ -265,7 +264,6 @@ int WeightDecoder::DecodeHuffmanCode(const schema::Tensor &src_tensor, lite::Ten
   }
   return RET_OK;
 }
-#endif
 
 int WeightDecoder::UnPackToInt(const schema::Tensor &src_tensor, lite::Tensor *dst_tensor) {
   MS_ASSERT(dst_tensor != nullptr);
@@ -301,6 +299,22 @@ int WeightDecoder::UnPackToInt(const schema::Tensor &src_tensor, lite::Tensor *d
   }
 }
 
+int WeightDecoder::UnPack(const schema::Tensor &src_tensor, lite::Tensor *dst_tensor) {
+  STATUS ret = RET_OK;
+  if (src_tensor.enableHuffmanCode()) {
+    ret = WeightDecoder::DecodeHuffmanCode(src_tensor, dst_tensor);
+    if (ret != RET_OK && ret != RET_NO_CHANGE) {
+      MS_LOG(ERROR) << "Decode huffman code failed: " << ret;
+    }
+  } else {
+    ret = WeightDecoder::UnPackToInt(src_tensor, dst_tensor);
+    if (ret != RET_OK && ret != RET_NO_CHANGE) {
+      MS_LOG(ERROR) << "Unpack to int8 failed: " << ret;
+    }
+  }
+  return ret;
+}
+
 int WeightDecoder::DequantNode(OpParameter *op_parameter, const std::vector<Tensor *> &in_tensors,
                                TypeId dst_data_type) {
   if (op_parameter->quant_type_ != schema::QuantType_QUANT_WEIGHT) {
diff --git a/mindspore/lite/src/weight_decoder.h b/mindspore/lite/src/weight_decoder.h
index 0d4097f62a9..79ead9e8631 100644
--- a/mindspore/lite/src/weight_decoder.h
+++ b/mindspore/lite/src/weight_decoder.h
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_DEQUANT_H_
-#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_DEQUANT_H_
+#ifndef MINDSPORE_LITE_SRC_WEIGHT_DECODER_H_
+#define MINDSPORE_LITE_SRC_WEIGHT_DECODER_H_
 
 #include <map>
 #include <utility>
@@ -30,7 +30,11 @@
 #include "src/tensor.h"
 
 static constexpr int kPerTensor = 1;
+static constexpr int kBitNum1 = 1;
+static constexpr int kBitNum8 = 8;
+static constexpr int kBitNum16 = 16;
 
+#ifndef WEIGHT_DECODE_CLIP
 namespace mindspore::lite {
 
 template <typename T>
@@ -122,21 +126,17 @@ STATUS IndexingDecompress(const schema::Tensor &src_tensor, Tensor *dst_tensor);
 
 class WeightDecoder {
  public:
-  static constexpr int kBitNum1 = 1;
-  static constexpr int kBitNum8 = 8;
-  static constexpr int kBitNum16 = 16;
-
-  static int UnPackToInt(const schema::Tensor &src_tensor, lite::Tensor *dst_tensor);
-
-#ifdef ENABLE_HUFFMAN_DECODE
-  static int DecodeHuffmanCode(const schema::Tensor &src_tensor, lite::Tensor *dst_tensor);
-#endif
-
   static int DequantNode(OpParameter *op_parameter, const std::vector<Tensor *> &in_tensors, TypeId dst_data_type);
 
+  static int UnPack(const schema::Tensor &src_tensor, lite::Tensor *dst_tensor);
+
  private:
   static int DequantTensor(Tensor *tensor, bool channel_first = true, TypeId dst_data_type = kNumberTypeFloat32);
 
+  static int UnPackToInt(const schema::Tensor &src_tensor, lite::Tensor *dst_tensor);
+
+  static int DecodeHuffmanCode(const schema::Tensor &src_tensor, lite::Tensor *dst_tensor);
+
   template <typename ST, typename DT = float>
   static DT *DequantData(lite::Tensor *input_tensor, bool channel_first = true) {
     const auto *quant_datas = static_cast<const ST *>(input_tensor->data_c());
@@ -287,5 +287,5 @@ class WeightDecoder {
   }
 };
 }  // namespace mindspore::lite
-
-#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_DEQUANT_H_
+#endif
+#endif  // MINDSPORE_LITE_SRC_WEIGHT_DECODER_H_
diff --git a/mindspore/lite/test/CMakeLists.txt b/mindspore/lite/test/CMakeLists.txt
index ef6612e9e97..79fd4269c90 100644
--- a/mindspore/lite/test/CMakeLists.txt
+++ b/mindspore/lite/test/CMakeLists.txt
@@ -125,6 +125,7 @@ set(TEST_LITE_SRC
         ${LITE_DIR}/src/common/utils.cc
         ${LITE_DIR}/src/common/dynamic_library_loader.cc
         ${LITE_DIR}/src/common/string_util.cc
+        ${LITE_DIR}/src/common/lite_utils.cc
         ${LITE_DIR}/src/common/quant_utils.cc
         ${LITE_DIR}/src/delegate/delegate.cc
         ${LITE_DIR}/src/errorcode.cc
@@ -134,6 +135,15 @@ set(TEST_LITE_SRC
         ${LITE_DIR}/src/train/train_populate_parameter_v0.cc
         )
 
+# Avoid multiple definitions
+if(MSLITE_ENABLE_CONVERTER STREQUAL "off")
+    set(TEST_LITE_SRC
+            ${TEST_LITE_SRC}
+            ${LITE_DIR}/tools/converter/quantizer/fse_decoder.cc
+            ${LITE_DIR}/tools/converter/quantizer/fse_bit_stream.cc
+            )
+endif()
+
 file(GLOB KERNEL_REG_SRC ${LITE_DIR}/src/registry/*.cc)
 set(TEST_LITE_SRC ${TEST_LITE_SRC} ${KERNEL_REG_SRC})
 
@@ -161,7 +171,7 @@ if(MSLITE_GPU_BACKEND STREQUAL opencl)
             )
 endif()
 
-if(ENABLE_MINDRT)
+if(MSLITE_ENABLE_MINDRT)
     include_directories(${CORE_DIR}/mindrt/)
     include_directories(${CORE_DIR}/mindrt/src/)
     set(TEST_LITE_SRC ${TEST_LITE_SRC}
@@ -201,7 +211,6 @@ if(MSLITE_ENABLE_CONVERTER)
             ${LITE_DIR}/tools/converter/converter_flags.cc
             ${LITE_DIR}/tools/converter/converter.cc
             ${LITE_DIR}/tools/converter/export_model.cc
-            ${LITE_DIR}/tools/converter/dump_graph.cc
             ${LITE_DIR}/tools/converter/optimizer_manager.cc
             ${LITE_DIR}/tools/converter/parser/parser_utils.cc
             ${LITE_DIR}/tools/optimizer/common/node_pass_extends.cc
@@ -224,11 +233,11 @@ if(MSLITE_ENABLE_CONVERTER)
             ${LITE_DIR}/tools/optimizer/fusion/multi_head_attention_fusion.cc
             ${LITE_DIR}/tools/optimizer/fusion/reshape_reshape_fusion.cc
             ${LITE_DIR}/tools/optimizer/fusion/constant_folding_fusion.cc
-            ${LITE_DIR}/tools/optimizer/fusion/quant_dtype_cast_fusion.cc
             ${LITE_DIR}/tools/optimizer/fusion/norm_fusion.cc
             ${LITE_DIR}/tools/optimizer/fusion/batchmatmul_fusion.cc
             ${LITE_DIR}/tools/optimizer/fusion/sigmoid_mul_fusion.cc
             ${LITE_DIR}/tools/optimizer/fusion/conv_conv_fusion.cc
+            ${LITE_DIR}/tools/optimizer/fusion/conv_pad_fusion.cc
             ${LITE_DIR}/tools/optimizer/fusion/conv_tuplegetitem_fusion.cc
             ${LITE_DIR}/tools/optimizer/fusion/tflite_lstm_cell_fusion.cc
             ${LITE_DIR}/tools/optimizer/fusion/tf_lstm_cell_fusion.cc
@@ -258,6 +267,7 @@ if(MSLITE_ENABLE_CONVERTER)
             ${LITE_DIR}/tools/optimizer/graph/transpose_strategy.cc
             ${LITE_DIR}/tools/optimizer/graph/reduce_same_act_pass.cc
             ${LITE_DIR}/tools/optimizer/graph/split_one_pass.cc
+            ${LITE_DIR}/tools/optimizer/graph/specify_graph_input_format.cc
             ${LITE_DIR}/tools/optimizer/fisson/eliminate_concat_split.cc
             ${LITE_DIR}/tools/optimizer/fisson/fisson_util.cc
             ${LITE_DIR}/tools/optimizer/fisson/iter_node_outputs.cc
@@ -278,12 +288,14 @@ if(MSLITE_ENABLE_CONVERTER)
             ${LITE_DIR}/tools/common/storage.cc
             ${LITE_DIR}/tools/converter/parser/inputs_adjust.cc
             ${LITE_DIR}/tools/converter/parser/unify_format.cc
+            ${LITE_DIR}/tools/converter/parser/lstm_adjust_pass.cc
             ${LITE_DIR}/tools/converter/parser/unused_node_remove_pass.cc
             ${LITE_DIR}/tools/converter/parser/conv1d_inout_adjust.cc
             ${LITE_DIR}/tools/converter/parser/tf_bidirection_gru_cf_fusion.cc
             ${LITE_DIR}/tools/converter/import/mindspore_importer.cc
             ${LITE_DIR}/tools/converter/import/primitive_adjust.cc
             ${LITE_DIR}/tools/converter/import/mindir_adjust.cc
+            ${LITE_DIR}/tools/converter/import/mindir_control_flow_adjust.cc
             )
 endif()
 ### train
diff --git a/mindspore/lite/test/config/models_caffe.cfg b/mindspore/lite/test/config/models_caffe.cfg
index 05a29d658cc..c920f846041 100644
--- a/mindspore/lite/test/config/models_caffe.cfg
+++ b/mindspore/lite/test/config/models_caffe.cfg
@@ -122,6 +122,6 @@ ml_face_emotion
 hdc_ocr_recog_horizontal
 ml_Heatmap_depth_240180;2
 ml_Heatmap_depth_180240;2
-ml_video_edit_person_divison_video;2
+ml_video_edit_person_divison_video;2:2,1
 ml_video_edit_hair_dyeing_segmodel_v2
 ml_video_edit_hairline_segmentation;3
diff --git a/mindspore/lite/test/config/models_caffe_posttraining.cfg b/mindspore/lite/test/config/models_caffe_posttraining.cfg
deleted file mode 100644
index a0cb52ba50b..00000000000
--- a/mindspore/lite/test/config/models_caffe_posttraining.cfg
+++ /dev/null
@@ -1,2 +0,0 @@
-ml_face_mnet 105
-ml_face_landmark_2 2
diff --git a/mindspore/lite/test/config/models_for_process_only.cfg b/mindspore/lite/test/config/models_for_process_only.cfg
deleted file mode 100644
index dde8b698e9a..00000000000
--- a/mindspore/lite/test/config/models_for_process_only.cfg
+++ /dev/null
@@ -1,31 +0,0 @@
-lite-model_arbitrary-image-stylization-inceptionv3_dr_transfer_1.tflite
-lite-model_arbitrary-image-stylization-inceptionv3_int8_transfer_1.tflite
-lite-model_arbitrary-image-stylization-inceptionv3-dynamic-shapes_dr_transfer_1.tflite;2;1,1,1,100:1,64,64,3
-lite-model_cartoongan_dr_1.tflite
-lite-model_deeplabv3-mobilenetv2_1_default_1.tflite
-lite-model_deeplabv3-mobilenetv2_dm05_1_default_1.tflite
-lite-model_deeplabv3-mobilenetv2-int8_1_default_1.tflite
-lite-model_deeplabv3-mobilenetv2-ade20k_1_default_1.tflite
-lite-model_deeplabv3-mobilenetv2_dm05-int8_1_default_1.tflite
-lite-model_deeplabv3-mobilenetv3-cityscapes_1_default_1.tflite
-lite-model_east-text-detector_dr_1.tflite
-magenta_arbitrary-image-stylization-v1-256_int8_transfer_1.tflite
-magenta_arbitrary-image-stylization-v1-256_int8_prediction_1.tflite
-efficientnet_lite0_int8_2.tflite
-efficientnet_lite1_int8_2.tflite
-efficientnet_lite2_int8_2.tflite
-efficientnet_lite3_int8_2.tflite
-efficientnet_lite4_int8_2.tflite
-mtk_transformer_encoder.tflite
-mtk_transformer_decoder_joint.tflite
-quant_aware_bank_card_detection_inception.onnx
-quant_aware_bank_card_recognition_fcny.onnx
-quant_aware_identify_card_detect.onnx
-# cur acc for ml_video_edit_art_transfer is 2+%
-ml_video_edit_art_transfer.onnx;3
-#ml_table_detection.onnx: onnx quantized model
-ml_table_detection.onnx
-intent_detect_hi_v2.tflite
-raise_watch.tflite
-ml_pic_shopping.pb
-hdc_orc_recog_202106.onnx
diff --git a/mindspore/lite/test/config/models_npu.cfg b/mindspore/lite/test/config/models_npu.cfg
index fff1d7aaf65..6a07224c333 100644
--- a/mindspore/lite/test/config/models_npu.cfg
+++ b/mindspore/lite/test/config/models_npu.cfg
@@ -31,16 +31,16 @@ ml_video_edit_style_transfer_autoportrait.onnx 9
 ml_video_edit_style_transfer_candy.onnx 11
 ml_video_edit_style_transfer_gongnongbing.onnx 10
 ml_video_edit_style_transfer_starry.onnx 11
-porseg_tmp.onnx;2 1
+porseg_tmp.onnx;2:2,1 1
 ml_video_edit_Mnet 1.5
 ml_video_edit_hairSeg_have_imageProcessLayer_interpTo145 0.5
 ml_video_edit_img_segment 1
 ml_video_edit_video_segment_gauss_adaptis_part1 2
 ml_video_edit_generate_filter.pb 1
-ml_video_edit_img_segment_adaptise.pb;2 0.5
-ml_video_edit_video_segment_gauss_adaptis_part2.pb;2 10
+ml_video_edit_img_segment_adaptise.pb;2:2,1 0.5
+ml_video_edit_video_segment_gauss_adaptis_part2.pb;2:2,1 10
 ml_video_edit_person_divison_pic 0.5
-ml_video_edit_person_divison_video;2 13
+ml_video_edit_person_divison_video;2:2,1 13
 ml_video_edit_judge.onnx 5
 ml_video_edit_vignet.onnx 0.5
 hdc_Face_Aesthetic_MTI_Aesthetic 0.5
@@ -67,12 +67,12 @@ hdc_ocr_attention.onnx 0.5 #too many subgraphs
 # hdc_ocr_detect.onnx 30 #too many subgraphs
 ml_edu_kit_hand_detection.onnx 1
 ml_edu_kit_hand_key_position.onnx 2
-ml_video_edit_oneclick_adaptis.pb;3 2.4
+ml_video_edit_oneclick_adaptis.pb;3:2,1,3 2.4
 densenet.tflite 3
 resnet_v2_101_299.tflite 1
 ml_video_edit_enhance.pb 2
-ml_video_edit_video_segment_gauss_adaptis_part2_pb2tflite.tflite;2 10
-ml_video_edit_img_segment_adaptise_pb2tflite.tflite;2 0.5
+ml_video_edit_video_segment_gauss_adaptis_part2_pb2tflite.tflite;2:2,1 10
+ml_video_edit_img_segment_adaptise_pb2tflite.tflite;2:2,1 0.5
 #the fifth value of the ml_video_edit_imitate_filter.onnx's output is very small (10-5).
 ml_video_edit_imitate_filter.onnx 200
 hdc_mobilenet_1w_class.onnx 20
@@ -83,12 +83,12 @@ ml_video_edit_art_generate.onnx 0.5
 ml_video_edit_art_transfer.onnx;3 3
 ml_video_edit_enhance_update_tmp.onnx 0.5
 ml_video_edit_art_generate_20210513.onnx 0.5
-ml_video_edit_art_transfer_20210513.onnx;3 0.5
+ml_video_edit_art_transfer_20210513.onnx;3:1,3,2 0.5
 ml_video_edit_hair_dyeing_segmodel_v2 0.5
 ml_video_edit_makeup_mobilenetv203.onnx 2
 ml_video_edit_hairline_segmentation;3 0.5
-ml_video_edit_hair_dyeing_migrate_v2.onnx;4 0.5
-ml_audio_kit_encoder_v5.pb;6;1,32:1,32:1,32:1,32:1:1
+ml_video_edit_hair_dyeing_migrate_v2.onnx;4:3,4,1,2 0.5
+ml_audio_kit_encoder_v5.pb;6:5,2,1,4,6,3;1:1,32:1,32:1,32:1:1,32
 fsr_270_mindspore.pb 1
 fsr_360_mindspore.pb 1
 fsr_720_mindspore.pb 1
diff --git a/mindspore/lite/test/config/models_npu_fp16.cfg b/mindspore/lite/test/config/models_npu_fp16.cfg
index 0a8a2a0fcdf..2e656c427eb 100644
--- a/mindspore/lite/test/config/models_npu_fp16.cfg
+++ b/mindspore/lite/test/config/models_npu_fp16.cfg
@@ -16,16 +16,16 @@ ml_video_edit_style_transfer_autoportrait.onnx 9
 ml_video_edit_style_transfer_candy.onnx 11
 ml_video_edit_style_transfer_gongnongbing.onnx 11
 ml_video_edit_style_transfer_starry.onnx 11
-porseg_tmp.onnx;2 1
+porseg_tmp.onnx;2:2,1 1
 ml_video_edit_Mnet 1.5
 ml_video_edit_hairSeg_have_imageProcessLayer_interpTo145 0.5
 ml_video_edit_img_segment 1
 ml_video_edit_video_segment_gauss_adaptis_part1 2
 ml_video_edit_generate_filter.pb 1
-ml_video_edit_img_segment_adaptise.pb;2 0.5
-ml_video_edit_video_segment_gauss_adaptis_part2.pb;2 10
+ml_video_edit_img_segment_adaptise.pb;2:2,1 0.5
+ml_video_edit_video_segment_gauss_adaptis_part2.pb;2:2,1 10
 ml_video_edit_person_divison_pic 0.5
-ml_video_edit_person_divison_video;2 13
+ml_video_edit_person_divison_video;2:2,1 13
 ml_video_edit_judge.onnx 5
 ml_video_edit_vignet.onnx 0.5
 hdc_Face_Aesthetic_MTI_Aesthetic 0.5
@@ -52,12 +52,12 @@ ml_video_edit_v10_best_model_nomean_20200723 8
 # hdc_ocr_detect.onnx 30 #too many subgraphs
 ml_edu_kit_hand_detection.onnx 1
 ml_edu_kit_hand_key_position.onnx 2
-ml_video_edit_oneclick_adaptis.pb;3 2.4
+ml_video_edit_oneclick_adaptis.pb;3:2,1,3 2.4
 densenet.tflite 3
 resnet_v2_101_299.tflite 1
 ml_video_edit_enhance.pb 2
-ml_video_edit_video_segment_gauss_adaptis_part2_pb2tflite.tflite;2 10
-ml_video_edit_img_segment_adaptise_pb2tflite.tflite;2 0.5
+ml_video_edit_video_segment_gauss_adaptis_part2_pb2tflite.tflite;2:2,1 10
+ml_video_edit_img_segment_adaptise_pb2tflite.tflite;2:2,1 0.5
 #the fifth value of the ml_video_edit_imitate_filter.onnx's output is very small (10-5).
 ml_video_edit_imitate_filter.onnx 200
 hdc_mobilenet_1w_class.onnx 20
@@ -69,6 +69,6 @@ ml_video_edit_art_transfer.onnx;3 3
 ml_video_edit_enhance_update_tmp.onnx 0.5
 #ml_video_edit_art_generate_20210513.onnx, output is out of range
 # ConstructSubgraph change, adjust threshold(3->29) for nlu temporary
-ml_video_edit_art_transfer_20210513.onnx;3 29
+ml_video_edit_art_transfer_20210513.onnx;3:1,3,2 29
 ml_video_edit_hair_dyeing_segmodel_v2 0.5
 ml_video_edit_makeup_mobilenetv203.onnx 2
diff --git a/mindspore/lite/test/config/models_onnx.cfg b/mindspore/lite/test/config/models_onnx.cfg
index bd303c24687..28ee4e3bac8 100644
--- a/mindspore/lite/test/config/models_onnx.cfg
+++ b/mindspore/lite/test/config/models_onnx.cfg
@@ -85,14 +85,14 @@ ml_asr_encoder_int8_202103.onnx
 rpnt_pdr_conv2d_16_fixed_last.onnx
 hdc_efficientnet_b3_1w_class.onnx
 yolov5s.onnx
-porseg_tmp.onnx;2
-hiai_nlu_onnx_model_v1_0.onnx;3
-hiai_nlu_onnx_model_v1_1.onnx;3
-ml_video_edit_art_transfer_20210513.onnx;3
+porseg_tmp.onnx;2:2,1
+hiai_nlu_onnx_model_v1_0.onnx;3:3,1,2
+hiai_nlu_onnx_model_v1_1.onnx;3:2,1,3
+ml_video_edit_art_transfer_20210513.onnx;3:1,3,2
 ml_asr_decoder_202103.onnx;2;1,64,512:1,64
 decoder.onnx;2;1,7,512:1,7
 ml_video_edit_makeup_mobilenetv203.onnx
-ml_video_edit_hair_dyeing_migrate_v2.onnx;4
+ml_video_edit_hair_dyeing_migrate_v2.onnx;4:3,4,1,2
 # cur acc for ml_audio_kit_vocals_test is 1.7% because the softmax's output of the last op has very small numbers.
 ml_audio_kit_vocals_test.onnx;1;1,512,1024,2 2
 gender_lstm_scd.onnx
diff --git a/mindspore/lite/test/config/models_onnx_fp16.cfg b/mindspore/lite/test/config/models_onnx_fp16.cfg
index d9ec6e16a45..93afe6b3215 100644
--- a/mindspore/lite/test/config/models_onnx_fp16.cfg
+++ b/mindspore/lite/test/config/models_onnx_fp16.cfg
@@ -97,9 +97,9 @@ hdc_efficientnet_b3_1w_class.onnx 18
 yolov5s.onnx 2
 ml_video_edit_art_transfer.onnx;3 3
 decoder.onnx;2;1,7,512:1,7 113
-ml_video_edit_art_transfer_20210513.onnx;3 1
+ml_video_edit_art_transfer_20210513.onnx;3:1,3,2 1
 ml_asr_decoder_202103.onnx;2;1,64,512:1,64 0.5
 ml_video_edit_makeup_mobilenetv203.onnx 4
 # The input of ml_video_edit_hair_dyeing_migrate_v2.onnx should be between [0, 1]
-ml_video_edit_hair_dyeing_migrate_v2.onnx;4 2.5
+ml_video_edit_hair_dyeing_migrate_v2.onnx;4:3,4,1,2 2.5
 Q888_CV_face_recognition_self.onnx 3.5
diff --git a/mindspore/lite/test/config/models_posttraining.cfg b/mindspore/lite/test/config/models_posttraining.cfg
index f684576a709..5c997283099 100644
--- a/mindspore/lite/test/config/models_posttraining.cfg
+++ b/mindspore/lite/test/config/models_posttraining.cfg
@@ -1,5 +1,5 @@
 ml_face_mnet 105
 ml_face_landmark_2 2
 mobilenet.tflite 0.5
-transformer_20200831_encoder_fp32.tflite;36 70
-transformer_20200831_decoder_fp32.tflite;11 35
+#transformer_20200831_encoder_fp32.tflite;36 70
+#transformer_20200831_decoder_fp32.tflite;11 35
diff --git a/mindspore/lite/test/config/models_tf.cfg b/mindspore/lite/test/config/models_tf.cfg
index 999fb519b56..515f83b0de0 100644
--- a/mindspore/lite/test/config/models_tf.cfg
+++ b/mindspore/lite/test/config/models_tf.cfg
@@ -72,11 +72,11 @@ siteAI_trans_nonlinear40g.pb;1;1,271
 siteAI_trans_nonlinear134g.pb;1;1,137
 siteAI_trans_nonlinear134g_nrz.pb;1;1,182
 ml_vision_guide_detection2.pb;1;1,320,320,1
-ml_tts_encoder.pb;4;1:1,44:1:1;;input_dependent
+ml_tts_encoder.pb;4:2,4,3,1;1,44:1:1:1;;input_dependent
 # encoder_0111_control_flow.pb is same as ml_tts_encoder_control_flow.pb
 #encoder_0111_control_flow.pb;4;1:1,44:1:1;;input_dependent
-ml_video_edit_img_segment_adaptise.pb;2
-ml_video_edit_video_segment_gauss_adaptis_part2.pb;2
+ml_video_edit_img_segment_adaptise.pb;2:2,1
+ml_video_edit_video_segment_gauss_adaptis_part2.pb;2:2,1
 #fasterrcnn_crop.pb is the same model as gts_object_detect_Ics.pb.
 #fasterrcnn_crop.pb;1;420,630,3
 #decoder_step_201217.pb is the same model as ml_tts_decoder.pb.
@@ -85,25 +85,25 @@ ml_video_edit_video_segment_gauss_adaptis_part2.pb;2
 #decoder_step_201217_modified.pb;5
 #encoder_0111.pb is the same model as ml_tts_encoder.pb.
 #encoder_0111.pb;4;1:1,44:1:1
-encoder_201228.pb;3;1:1,22:1;;input_dependent
-ml_video_edit_oneclick_adaptis.pb;3
-tacotron_encoder_stf.pb;5;1:1,62:1,62:1,62:1,62;;input_dependent
-female_model_step2_int16_noiseout.pb;66
-ml_female_model_step6_noiseout.pb;66
-ml_male_model_step6_noiseout.pb;66
-ml_tts_decoder_control_flow.pb;5
-ml_tts_decoder.pb;5
-ml_tts_encoder_control_flow.pb;4;1:1,22:1:1;;input_dependent
-ml_tts_vocoder.pb;66
+encoder_201228.pb;3:2,3,1;1,22:1:1;;input_dependent
+ml_video_edit_oneclick_adaptis.pb;3:2,1,3
+tacotron_encoder_stf.pb;5:2,3,5,4,1;1,62:1,62:1,62:1,62:1;;input_dependent
+female_model_step2_int16_noiseout.pb;66:2,7,6,1,3,4,5,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,66,65,64,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38
+ml_female_model_step6_noiseout.pb;66:2,7,6,1,3,4,5,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,66,65,64,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38
+ml_male_model_step6_noiseout.pb;66:2,7,6,1,3,4,5,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,66,65,64,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38
+ml_tts_decoder_control_flow.pb;5:5,4,3,1,2
+ml_tts_decoder.pb;5:4,5,2,1,3
+ml_tts_encoder_control_flow.pb;4:2,4,3,1;1,22:1:1:1;;input_dependent
+ml_tts_vocoder.pb;66:2,7,6,1,3,4,5,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,66,65,64,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38
 hiai_nlu_model.pb;3;1,16:1,16:1,16
 gts_object_detect_Ics.pb;1;420,630,3;;input_dependent
-hiai_transformer_encoder.pb;15
-decoder_step_nocumsum_v5.pb;13;1:1,512:1,1429,2:1,127:1,127:1,127:1,127,320:1,80:1,512:1,512:1,512:1,512:1,512
-ml_audio_kit_encoder_v5.pb;6;1,32:1,32:1,32:1,32:1:1
-hiai_nlu_model_v1.pb;3;1,16:1,16:1,16 2.0
-hiai_nlu_model_v2.pb;7;1,5:1,6:1,174:1,98:1,5:1,5:1,5
-hiai_nlu_model_multi.pb;6;1,32:1,32:1,6:1,11:1,74:1,32
-hiai_nlu_model_single.pb;3;1,32:1,32:1,32
+hiai_transformer_encoder.pb;15:1,3,4,5,6,7,8,9,10,11,12,13,14,15,2
+decoder_step_nocumsum_v5.pb;13:11,2,13,12,10,7,3,5,1,4,9,8,6;1,512:1,512:1,512:1,512:1,512:1,127,320:1,1429,2:1,127:1:1,127:1,512:1,80:1,127
+ml_audio_kit_encoder_v5.pb;6:5,2,1,4,6,3;1:1,32:1,32:1,32:1:1,32
+hiai_nlu_model_v1.pb;3:1,3,2;1,16:1,16:1,16 2.0
+hiai_nlu_model_v2.pb;7:5,7,6,4,3,2,1;1,5:1,5:1,5:1,98:1,174:1,6:1,5
+# hiai_nlu_model_multi.pb;6:1,6,2,5,4,3;1,32:1,32:1,32:1,74:1,11:1,6
+hiai_nlu_model_single.pb;3:1,3,2;1,32:1,32:1,32
 fsr_270_mindspore.pb
 fsr_360_mindspore.pb
 fsr_720_mindspore.pb
diff --git a/mindspore/lite/test/config/models_tf_fp16.cfg b/mindspore/lite/test/config/models_tf_fp16.cfg
index 3196d0697e3..18d12e2c02e 100644
--- a/mindspore/lite/test/config/models_tf_fp16.cfg
+++ b/mindspore/lite/test/config/models_tf_fp16.cfg
@@ -65,29 +65,29 @@ siteAI_trans_nonlinear134g.pb;1;1,137 0.5
 siteAI_trans_nonlinear134g_nrz.pb;1;1,182 0.6
 ml_vision_guide_detection2.pb;1;1,320,320,1 1
 # ml_tts_encoder.pb has a round op, which will cause round-off error when the decimal of input value is near 0.5
-ml_tts_encoder.pb;4;1:1,44:1:1 9
+ml_tts_encoder.pb;4:2,4,3,1;1,44:1:1:1 9
 # encoder_0111_control_flow.pb is same as ml_tts_encoder_control_flow.pb
 #encoder_0111_control_flow.pb;4;1:1,44:1:1 10
-ml_video_edit_video_segment_gauss_adaptis_part2.pb;2 11
-ml_video_edit_img_segment_adaptise.pb;2 40
-ml_video_edit_person_divison_video;2 38
-ml_video_edit_oneclick_adaptis.pb;3 6
+ml_video_edit_video_segment_gauss_adaptis_part2.pb;2:2,1 11
+ml_video_edit_img_segment_adaptise.pb;2:2,1 40
+ml_video_edit_person_divison_video;2:2,1 38
+ml_video_edit_oneclick_adaptis.pb;3:2,1,3 6
 #decoder_step_201217.pb is the same model as ml_tts_decoder.pb.
 #decoder_step_201217.pb;5 187
 #decoder_step_201217_modified.pb is the same model as ml_tts_decoder_control_flow.pb.
 #decoder_step_201217_modified.pb;5 0.5
 #encoder_0111.pb is the same model as ml_tts_encoder.pb.
 #encoder_0111.pb;4;1:1,44:1:1
-ml_female_model_step6_noiseout.pb;66 2
-ml_male_model_step6_noiseout.pb;66 2.5
-ml_tts_encoder_control_flow.pb;4;1:1,22:1:1 1.5
-ml_tts_decoder_control_flow.pb;5 1
-ml_tts_decoder.pb;5 2.5
-ml_tts_vocoder.pb;66 53
-hiai_transformer_encoder.pb;15 4
-decoder_step_nocumsum_v5.pb;13;1:1,512:1,1429,2:1,127:1,127:1,127:1,127,320:1,80:1,512:1,512:1,512:1,512:1,512 1.2
-hiai_nlu_model_multi.pb;6;1,32:1,32:1,6:1,11:1,74:1,32 25
-hiai_nlu_model_single.pb;3;1,32:1,32:1,32 2470
+ml_female_model_step6_noiseout.pb;66:2,7,6,1,3,4,5,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,66,65,64,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38 2
+ml_male_model_step6_noiseout.pb;66:2,7,6,1,3,4,5,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,66,65,64,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38 2.5
+ml_tts_encoder_control_flow.pb;4:2,4,3,1;1,22:1:1:1 1.5
+ml_tts_decoder_control_flow.pb;5:5,4,3,1,2 1
+ml_tts_decoder.pb;5:4,5,2,1,3 2.5
+ml_tts_vocoder.pb;66:2,7,6,1,3,4,5,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,66,65,64,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38 53
+hiai_transformer_encoder.pb;15:1,3,4,5,6,7,8,9,10,11,12,13,14,15,2 4
+decoder_step_nocumsum_v5.pb;13:11,2,13,12,10,7,3,5,1,4,9,8,6;1,512:1,512:1,512:1,512:1,512:1,127,320:1,1429,2:1,127:1:1,127:1,512:1,80:1,127 1.2
+# hiai_nlu_model_multi.pb;6:1,6,2,5,4,3;1,32:1,32:1,32:1,74:1,11:1,6 25
+hiai_nlu_model_single.pb;3:1,3,2;1,32:1,32:1,32 2470
 fsr_270_mindspore.pb 6.0
 fsr_360_mindspore.pb 6.5
 fsr_720_mindspore.pb 2.0
diff --git a/mindspore/lite/test/config/models_tflite.cfg b/mindspore/lite/test/config/models_tflite.cfg
index b8a38de085e..fa645dc5b9b 100644
--- a/mindspore/lite/test/config/models_tflite.cfg
+++ b/mindspore/lite/test/config/models_tflite.cfg
@@ -185,18 +185,18 @@ bloom_isface.tflite
 hiai_object_detect_814.tflite
 hiai_object_tflite_graph_8bit.tflite
 lma_tsec_shallow_channels16_ds2.1.1_model-best-f1.tflite
-lite-model_arbitrary-image-stylization-inceptionv3_fp16_transfer_1.tflite;2
-magenta_arbitrary-image-stylization-v1-256_fp16_transfer_1.tflite;2
-albert_lite_base_squadv1_1.tflite;3
-mobilebert_1_default_1.tflite;3
-ml_video_edit_img_segment_adaptise_pb2tflite.tflite;2
-ml_video_edit_video_segment_gauss_adaptis_part2_pb2tflite.tflite;2
-hdc_tb_cn_neg.tflite;3
-hiai_cv_labelDetectorModel_v3.tflite;2
+lite-model_arbitrary-image-stylization-inceptionv3_fp16_transfer_1.tflite;2:2,1
+magenta_arbitrary-image-stylization-v1-256_fp16_transfer_1.tflite;2:2,1
+albert_lite_base_squadv1_1.tflite;3:2,3,1
+mobilebert_1_default_1.tflite;3:2,3,1
+ml_video_edit_img_segment_adaptise_pb2tflite.tflite;2:2,1
+ml_video_edit_video_segment_gauss_adaptis_part2_pb2tflite.tflite;2:2,1
+hdc_tb_cn_neg.tflite;3:3,1,2 0.5
+hiai_cv_labelDetectorModel_v3.tflite;2:2,1
 ml_tacotron_decoder_step_stf.tflite;9;1,80:1,256:1,1024:1,1024:1,1024:1,1024:1,8:1,1,256:1
-ml_headpose_pb2tflite.tflite;3;16:1,64,64,3:16
-ml_ei_headpose_pb2tflite.tflite;3;16:1,64,64,3:16
-lite-model_albert_lite_base_squadv1_metadata_1.tflite;3
+ml_headpose_pb2tflite.tflite;3:2,3,1;1,64,64,3:16:16
+ml_ei_headpose_pb2tflite.tflite;3:2,3,1;1,64,64,3:16:16
+lite-model_albert_lite_base_squadv1_metadata_1.tflite;3:2,3,1
 lite-model_mobilebert_1_metadata_1.tflite;3
 hiai_vad.tflite;2
 add_uint8.tflite;2
diff --git a/mindspore/lite/test/config/models_tflite_fp16.cfg b/mindspore/lite/test/config/models_tflite_fp16.cfg
index 24d175ad9ee..2dc7b7ae526 100644
--- a/mindspore/lite/test/config/models_tflite_fp16.cfg
+++ b/mindspore/lite/test/config/models_tflite_fp16.cfg
@@ -10,7 +10,7 @@ hiai_model_normalize_object_scene_ps_20200519.tflite 20
 #hiai_detectmodel_06_23_960_480_1180700.tflite 20
 #hiai_detect_curve_model_float32.tflite 20
 #hiai_detectmodel_desnet_256_128_64_32.tflite 20
-mtk_AADB_HADB_MBV2_model_fp32.tflite 5
+mtk_AADB_HADB_MBV2_model_fp32.tflite 2
 mtk_AADB_HADB_MBV3_model_fp32.tflite 6
 mobilenet_v1_0.25_128.tflite 5
 mobilenet_v1_0.25_160.tflite 5
@@ -213,10 +213,10 @@ bloom_isface.tflite 0.5
 # The output values of conv layers range from -e±5 to e±5, which almost reaches the representation limit of fp16. In
 # this range, the fp16 data will has big bias. And the accumulation of this bias lowers the final precision.
 hiai_object_detect_814.tflite 14
-ml_video_edit_video_segment_gauss_adaptis_part2_pb2tflite.tflite;2 11
-ml_video_edit_img_segment_adaptise_pb2tflite.tflite;2 0.5
-hdc_tb_cn_neg.tflite;3 295
+ml_video_edit_video_segment_gauss_adaptis_part2_pb2tflite.tflite;2:2,1 11
+ml_video_edit_img_segment_adaptise_pb2tflite.tflite;2:2,1 0.5
+hdc_tb_cn_neg.tflite;3:3,1,2 295
 # The input of hiai_cv_labelDetectorModel_v3.tflite is between 0-255.
-hiai_cv_labelDetectorModel_v3.tflite;2 2
-ml_headpose_pb2tflite.tflite;3;16:1,64,64,3:16 1
-ml_ei_headpose_pb2tflite.tflite;3;16:1,64,64,3:16 0.6
+hiai_cv_labelDetectorModel_v3.tflite;2:2,1 2
+ml_headpose_pb2tflite.tflite;3:2,3,1;1,64,64,3:16:16 1
+ml_ei_headpose_pb2tflite.tflite;3:2,3,1;1,64,64,3:16:16 0.6
diff --git a/mindspore/lite/test/config/models_tflite_posttraining.cfg b/mindspore/lite/test/config/models_tflite_posttraining.cfg
deleted file mode 100644
index 59eba326b97..00000000000
--- a/mindspore/lite/test/config/models_tflite_posttraining.cfg
+++ /dev/null
@@ -1,3 +0,0 @@
-mobilenet.tflite 0.5
-transformer_20200831_encoder_fp32.tflite;36 70
-transformer_20200831_decoder_fp32.tflite;11 35
diff --git a/mindspore/lite/test/runtest.sh b/mindspore/lite/test/runtest.sh
index 91a33f61883..73311bf7a67 100644
--- a/mindspore/lite/test/runtest.sh
+++ b/mindspore/lite/test/runtest.sh
@@ -12,6 +12,13 @@ mkdir -pv ${CUR_DIR}/do_test
 # prepare data for ut
 cd ${CUR_DIR}/do_test
 cp ${BUILD_DIR}/test/lite-test ./
+cp ${BUILD_DIR}/googletest/googlemock/gtest/libgtest.so ./
+tar -xzf ../../../../output/mindspore-lite-*.tar.gz --strip-components=3 --wildcards *runtime/lib/*.so* || true
+tar -xzf ../../../../output/mindspore-lite-*.tar.gz --strip-components=4 --wildcards *converter/lib/*.so* || true
+tar -xzf ../../../../output/mindspore-lite-*.tar.gz --strip-components=5 --wildcards *libjpeg-turbo/lib/*.so* || true
+ls -l *.so*
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:./
+
 cp -r ${CUR_DIR}/ut/src/runtime/kernel/arm/test_data/* ./
 cp -r ${CUR_DIR}/ut/tools/converter/parser/tflite/test_data/* ./
 # prepare data for dataset
diff --git a/mindspore/lite/test/st/graph_test.cc b/mindspore/lite/test/st/graph_test.cc
index 261d8926046..87669f01d91 100644
--- a/mindspore/lite/test/st/graph_test.cc
+++ b/mindspore/lite/test/st/graph_test.cc
@@ -82,7 +82,7 @@ TEST_F(GraphTest, UserSetGraphOutput1) {
     string name = out_data.first;
     void *data = out_data.second;
     float *fp32_data = reinterpret_cast<float *>(data);
-    if (name == "Stack-8") {
+    if (name == "output") {
       output_count++;
       ASSERT_LE(fabs(fp32_data[0] - (0.115831)), 0.01);
       ASSERT_LE(fabs(fp32_data[1] - (0.113074)), 0.01);
@@ -90,7 +90,7 @@ TEST_F(GraphTest, UserSetGraphOutput1) {
       ASSERT_LE(fabs(fp32_data[3] - (0.346307)), 0.01);
       ASSERT_LE(fabs(fp32_data[4] - (-0.15687)), 0.01);
     }
-    if (name == "Stack-10") {
+    if (name == "output2") {
       output_count++;
       ASSERT_LE(fabs(fp32_data[0] - (0.06387864)), 0.01);
       ASSERT_LE(fabs(fp32_data[1] - (0.22883008)), 0.01);
@@ -98,7 +98,7 @@ TEST_F(GraphTest, UserSetGraphOutput1) {
       ASSERT_LE(fabs(fp32_data[3] - (0.04586578)), 0.01);
       ASSERT_LE(fabs(fp32_data[4] - (0.06820235)), 0.01);
     }
-    if (name == "Stack-13") {
+    if (name == "output3") {
       output_count++;
       ASSERT_LE(fabs(fp32_data[0] - (-0.1617176)), 0.01);
       ASSERT_LE(fabs(fp32_data[1] - (-0.3828573)), 0.01);
diff --git a/mindspore/lite/test/st/run_benchmark_nets.sh b/mindspore/lite/test/st/run_benchmark_nets.sh
index 7a6db1a6e40..01911824bc7 100644
--- a/mindspore/lite/test/st/run_benchmark_nets.sh
+++ b/mindspore/lite/test/st/run_benchmark_nets.sh
@@ -28,52 +28,22 @@ done
 cur_path=$(pwd)
 echo "cur_path is "$cur_path
 
-if [[ $backend == "all" || $backend == "arm64_cpu" || $backend == "arm64_fp32" || $backend == "arm64_fp16" || \
-        $backend == "arm64_codegen" ]]; then
+if [[ $backend == "all" || $backend == "arm64_cpu" || $backend == "arm64_fp32" || $backend == "arm64_fp16" ]]; then
     sh $cur_path/scripts/run_benchmark_arm64.sh -r $release_path -m $models_path -d $device_id -e $backend
     arm64_status=$?
     if [[ $arm64_status -ne 0 ]]; then
       echo "Run arm64 failed"
       exit 1
     fi
-    # run codegen
-    sh $cur_path/scripts/run_benchmark_codegen.sh -r $release_path -m $models_path -d $device_id -e "arm64_codegen"
-    arm64_status=$?
-    if [[ $arm64_status -ne 0 ]]; then
-      echo "Run arm64 codegen failed"
-      exit 1
-    fi
-    # run train
-    sh $cur_path/scripts/run_net_train.sh -r $release_path -m ${models_path}/../../models_train -d $device_id -e "arm64_train"
-    arm64_status=$?
-    if [[ $arm64_status -ne 0 ]]; then
-      echo "Run arm64 train failed"
-      exit 1
-    fi
 fi
 
-if [[ $backend == "all" || $backend == "arm32_cpu" || $backend == "arm32_fp32" || $backend == "arm32_fp16" || \
-      $backend == "arm32_codegen" ]]; then
+if [[ $backend == "all" || $backend == "arm32_cpu" || $backend == "arm32_fp32" || $backend == "arm32_fp16" ]]; then
     sh $cur_path/scripts/run_benchmark_arm32.sh -r $release_path -m $models_path -d $device_id -e $backend
     arm32_status=$?
     if [[ $arm32_status -ne 0 ]]; then
       echo "Run arm32 failed"
       exit 1
     fi
-    # run codegen
-    sh $cur_path/scripts/run_benchmark_codegen.sh -r $release_path -m $models_path -d $device_id -e "arm32_codegen"
-    arm32_status=$?
-    if [[ $arm32_status -ne 0 ]]; then
-      echo "Run arm32 codegen failed"
-      exit 1
-    fi
-    # run train
-    sh $cur_path/scripts/run_net_train.sh -r $release_path -m ${models_path}/../../models_train -d $device_id -e "arm32_train"
-    arm32_status=$?
-    if [[ $arm32_status -ne 0 ]]; then
-      echo "Run arm32 train failed"
-      exit 1
-    fi
 fi
 
 if [[ $backend == "all" || $backend == "gpu" ]]; then
@@ -95,25 +65,37 @@ if [[ $backend == "all" || $backend == "npu" ]]; then
 fi
 
 if [[ $backend == "all" || $backend == "x86-all" || $backend == "x86" || $backend == "x86-sse" || \
-      $backend == "x86-avx" || $backend == "x86-java" || $backend == "x86_codegen" ]]; then
+      $backend == "x86-avx" || $backend == "x86-java" ]]; then
     sh $cur_path/scripts/run_benchmark_x86.sh -r $release_path -m $models_path -e $backend
     x86_status=$?
     if [[ $x86_status -ne 0 ]]; then
       echo "Run x86 failed"
       exit 1
     fi
+fi
+
+if [[ $backend == "all" || $backend == "codegen_and_train" ]]; then
     # run codegen
-    sh $cur_path/scripts/run_benchmark_codegen.sh -r $release_path -m $models_path -e "x86_codegen"
+    sh $cur_path/scripts/run_benchmark_codegen.sh -r $release_path -m $models_path -d $device_id -e $backend
     x86_status=$?
     if [[ $x86_status -ne 0 ]]; then
-      echo "Run x86 codegen failed"
+      echo "Run codegen failed"
       exit 1
     fi
     # run train
-    sh $cur_path/scripts/run_net_train.sh -r $release_path -m ${models_path}/../../models_train -e "x86_train"
+    sh $cur_path/scripts/run_net_train.sh -r $release_path -m ${models_path}/../../models_train -d $device_id -e $backend
     x86_status=$?
     if [[ $x86_status -ne 0 ]]; then
-      echo "Run x86 train failed"
+      echo "Run train failed"
+      exit 1
+    fi
+fi
+
+if [[ $backend == "all" || $backend == "x86_asan" ]]; then
+    sh $cur_path/scripts/run_benchmark_asan.sh -r $release_path -m $models_path -e $backend
+    x86_asan_status=$?
+    if [[ $x86_asan_status -ne 0 ]]; then
+      echo "Run x86 asan failed"
       exit 1
     fi
 fi
diff --git a/mindspore/lite/test/st/scripts/base_functions.sh b/mindspore/lite/test/st/scripts/base_functions.sh
index 480009512fe..55c9caa4d73 100644
--- a/mindspore/lite/test/st/scripts/base_functions.sh
+++ b/mindspore/lite/test/st/scripts/base_functions.sh
@@ -133,7 +133,7 @@ function Run_Benchmark() {
       model_info=`echo ${line_info}|awk -F ' ' '{print $1}'`
       spec_acc_limit=`echo ${line_info}|awk -F ' ' '{print $2}'`
       model_name=`echo ${model_info}|awk -F ';' '{print $1}'`
-      input_num=`echo ${model_info} | awk -F ';' '{print $2}'`
+      input_config=`echo ${model_info} | awk -F ';' '{print $2}'`
       input_shapes=`echo ${model_info} | awk -F ';' '{print $3}'`
       spec_threads=`echo ${model_info} | awk -F ';' '{print $4}'`
       extra_info=`echo ${model_info} | awk -F ';' '{print $5}'`
@@ -172,13 +172,24 @@ function Run_Benchmark() {
       input_files=""
       output_file=""
       data_path=$3"/input_output/"
-      if [[ ${input_num} == "" || ${input_num} == 1 ]]; then
+      if [[ ${input_config} == "" || ${input_config} == 1 ]]; then
         input_files=${data_path}'input/'${model_name}'.ms.bin'
       else
-        for i in $(seq 1 $input_num)
-        do
-          input_files=${input_files}${data_path}'input/'${model_name}'.ms.bin_'$i','
-        done
+        input_num=`echo ${input_config} | awk -F ':' '{print $1}'`
+        input_seq=`echo ${input_config} | awk -F ':' '{print $2}'`
+        if [[ ${input_seq} == "" ]]; then
+          for i in $(seq 1 $input_num)
+          do
+            input_files=${input_files}${data_path}'input/'${model_name}'.ms.bin_'$i','
+          done
+        else
+          for i in $(seq 1 $input_num)
+          do
+            cur_input_num=${input_seq%%,*}
+            input_seq=${input_seq#*,}
+            input_files=${input_files}${data_path}'input/'${model_name}'.ms.bin_'$cur_input_num','
+          done
+        fi
       fi
       output_file=${data_path}'output/'${model_name}'.ms.out'
       # adjust threads
diff --git a/mindspore/lite/test/st/scripts/run_benchmark_codegen.sh b/mindspore/lite/test/st/scripts/run_benchmark_codegen.sh
index 0ef8f43c2f9..9388811cc9e 100644
--- a/mindspore/lite/test/st/scripts/run_benchmark_codegen.sh
+++ b/mindspore/lite/test/st/scripts/run_benchmark_codegen.sh
@@ -23,6 +23,7 @@ function Run_Converter() {
 # Run on x86 codegen benchmark
 function Run_x86_codegen() {
     # $1:buildPath $2:modelPath $3:cfgFile $4:logFile $5:resultFile
+    local support_parallel bind_mode thread_num suffix run_result
     local CODEGEN_PATH=${x86_path}/mindspore-lite-${version}-linux-x64/tools/codegen
     rm -rf $1
     mkdir -p $1
@@ -64,6 +65,7 @@ function Run_x86_codegen() {
 
 function Run_arm_codegen() {
     # $1:buildPath $2:modelPath $3:cfgFile $4:logFile $5:resultFile $6:deviceID $7:processor
+    local package_path package_suffix target platform android_abi toolchain_name package_path run_result
     echo "ANDROID_NDK: ${ANDROID_NDK}" >> $4
     package_path=${arm64_path}
     package_suffix="aarch64"
@@ -205,14 +207,16 @@ version=${file_name_array[2]}
 models_codegen_config=${basepath}/../config/models_codegen.cfg
 models_codegen_parallel_config=${basepath}/../config/models_codegen_parallel.cfg
 
+# Set models and build path
 ms_models_path=${basepath}/ms_models
-build_path=${basepath}/codegen_build
-build_parallal_path=${basepath}/codegen_parallel_build
+build_path_x86=${basepath}/codegen_build_x86
+build_path_parallel=${basepath}/codegen_build_parallel
+build_path_arm64=${basepath}/codegen_build_arm64
+build_path_arm32=${basepath}/codegen_build_arm32
 
 # Write converter result to temp file
 run_converter_log_file=${basepath}/run_converter_log.txt
 echo ' ' > ${run_converter_log_file}
-
 run_converter_result_file=${basepath}/run_converter_result.txt
 echo ' ' > ${run_converter_result_file}
 
@@ -255,38 +259,38 @@ echo "input backend is ${backend}"
 backend=${backend:-"all"}
 isFailed=0
 echo "current backend is ${backend}"
-if [[ $backend == "all" || $backend == "codegen" || $backend == "x86_codegen" || $backend == "codegen&train" ]]; then
+if [[ $backend == "all" || $backend == "codegen" || $backend == "x86_codegen" || $backend == "codegen_and_train" ]]; then
     # Run on x86-codegen
     echo "start Run x86 codegen ..."
-    Run_x86_codegen ${build_path} ${ms_models_path} ${models_codegen_config} ${run_x86_codegen_log_file} ${run_benchmark_result_file} &
+    Run_x86_codegen ${build_path_x86} ${ms_models_path} ${models_codegen_config} ${run_x86_codegen_log_file} ${run_benchmark_result_file} &
     Run_x86_codegen_PID=$!
     sleep 1
 fi
-if [[ $backend == "all" || $backend == "codegen" || $backend == "x86_codegen" || $backend == "x86_codegen_parallel" || $backend == "codegen&train" ]]; then
+if [[ $backend == "all" || $backend == "codegen" || $backend == "x86_codegen" || $backend == "x86_codegen_parallel" || $backend == "codegen_and_train" ]]; then
     # Run on x86-codegen-parallel
     echo "start Run x86 codegen parallel ..."
-    Run_x86_codegen ${build_parallal_path} ${ms_models_path} ${models_codegen_parallel_config} ${run_x86_codegen_parallel_log_file} ${run_benchmark_result_file} &
+    Run_x86_codegen ${build_path_parallel} ${ms_models_path} ${models_codegen_parallel_config} ${run_x86_codegen_parallel_log_file} ${run_benchmark_result_file} &
     Run_x86_codegen_parallel_PID=$!
     sleep 1
 fi
-if [[ $backend == "all" || $backend == "codegen" || $backend == "arm64_codegen" || $backend == "codegen&train" ]]; then
+if [[ $backend == "all" || $backend == "codegen" || $backend == "arm64_codegen" || $backend == "codegen_and_train" ]]; then
     # Run on codegen
     echo "start Run arm64 codegen ..."
-    Run_arm_codegen ${build_path} ${ms_models_path} ${models_codegen_config} ${run_arm64_fp32_codegen_log_file} ${run_benchmark_result_file} ${device_id} "arm64"
+    Run_arm_codegen ${build_path_arm64} ${ms_models_path} ${models_codegen_config} ${run_arm64_fp32_codegen_log_file} ${run_benchmark_result_file} ${device_id} "arm64"
     Run_arm64_codegen_status=$?
 #    Run_arm64_codegen_PID=$!
 #    sleep 1
 fi
-if [[ $backend == "all" || $backend == "codegen" || $backend == "arm32_codegen" || $backend == "codegen&train" ]]; then
+if [[ $backend == "all" || $backend == "codegen" || $backend == "arm32_codegen" || $backend == "codegen_and_train" ]]; then
     # Run on arm32 codegen
     echo "start Run arm32 codegen ..."
-    Run_arm_codegen ${build_path} ${ms_models_path} ${models_codegen_config} ${run_arm32_fp32_codegen_log_file} ${run_benchmark_result_file} ${device_id} "arm32"
+    Run_arm_codegen ${build_path_arm32} ${ms_models_path} ${models_codegen_config} ${run_arm32_fp32_codegen_log_file} ${run_benchmark_result_file} ${device_id} "arm32"
     Run_arm32_codegen_status=$?
 #    Run_arm32_codegen_PID=$!
 #    sleep 1
 fi
 
-if [[ $backend == "all" || $backend == "codegen" || $backend == "x86_codegen" || $backend == "codegen&train" ]]; then
+if [[ $backend == "all" || $backend == "codegen" || $backend == "x86_codegen" || $backend == "codegen_and_train" ]]; then
     wait ${Run_x86_codegen_PID}
     Run_x86_codegen_status=$?
     if [[ ${Run_x86_codegen_status} != 0 ]];then
@@ -295,7 +299,7 @@ if [[ $backend == "all" || $backend == "codegen" || $backend == "x86_codegen" ||
         isFailed=1
     fi
 fi
-if [[ $backend == "all" || $backend == "codegen" || $backend == "x86_codegen" || $backend == "x86_codegen_parallel" || $backend == "codegen&train" ]]; then
+if [[ $backend == "all" || $backend == "codegen" || $backend == "x86_codegen" || $backend == "x86_codegen_parallel" || $backend == "codegen_and_train" ]]; then
     wait ${Run_x86_codegen_parallel_PID}
     Run_x86_codegen_parallel_status=$?
     if [[ ${Run_x86_codegen_parallel_status} != 0 ]];then
@@ -304,7 +308,7 @@ if [[ $backend == "all" || $backend == "codegen" || $backend == "x86_codegen" ||
         isFailed=1
     fi
 fi
-if [[ $backend == "all" || $backend == "codegen" || $backend == "arm64_codegen" || $backend == "codegen&train" ]]; then
+if [[ $backend == "all" || $backend == "codegen" || $backend == "arm64_codegen" || $backend == "codegen_and_train" ]]; then
 #    wait ${Run_arm64_codegen_PID}
 #    Run_arm64_codegen_status=$?
     if [[ ${Run_arm64_codegen_status} != 0 ]];then
@@ -313,7 +317,7 @@ if [[ $backend == "all" || $backend == "codegen" || $backend == "arm64_codegen"
         isFailed=1
     fi
 fi
-if [[ $backend == "all" || $backend == "codegen" || $backend == "arm32_codegen" || $backend == "codegen&train" ]]; then
+if [[ $backend == "all" || $backend == "codegen" || $backend == "arm32_codegen" || $backend == "codegen_and_train" ]]; then
 #    wait ${Run_arm32_codegen_PID}
 #    Run_arm32_codegen_status=$?
     if [[ ${Run_arm32_codegen_status} != 0 ]];then
diff --git a/mindspore/lite/test/st/scripts/run_benchmark_npu.sh b/mindspore/lite/test/st/scripts/run_benchmark_npu.sh
index 02c3fdfe8c3..e2b77c8f773 100644
--- a/mindspore/lite/test/st/scripts/run_benchmark_npu.sh
+++ b/mindspore/lite/test/st/scripts/run_benchmark_npu.sh
@@ -15,7 +15,7 @@ function Run_Converter() {
     mkdir -p ${ms_models_path}
 
     # Prepare the config file list
-    local npu_cfg_file_list=("$models_npu_config")
+    local npu_cfg_file_list=("$models_npu_config" "$models_npu_weightquant_config")
     # Convert models:
     # $1:cfgFileList; $2:inModelPath; $3:outModelPath; $4:logFile; $5:resultFile;
     Convert "${npu_cfg_file_list[*]}" $models_path $ms_models_path $run_converter_log_file $run_converter_result_file
@@ -24,24 +24,14 @@ function Run_Converter() {
 # Run on npu platform:
 function Run_npu() {
     # Prepare the config file list
-    local npu_cfg_file_list=("$models_npu_config")
+    local npu_cfg_file_list=("$models_npu_config" "$models_npu_fp16_config" "$models_npu_weightquant_config")
     # Run converted models:
     # $1:cfgFileList; $2:modelPath; $3:dataPath; $4:logFile; $5:resultFile; $6:platform; $7:processor; $8:phoneId;
     Run_Benchmark "${npu_cfg_file_list[*]}" . '/data/local/tmp' $run_npu_log_file $run_benchmark_result_file 'arm64' 'NPU' $device_id
 }
 
-# Run on npu and fp16 platform:
-function Run_npu_fp16() {
-    # Prepare the config file list
-    local npu_fp16_cfg_file_list=("$models_npu_fp16_config")
-    # Run converted models:
-    # $1:cfgFileList; $2:modelPath; $3:dataPath; $4:logFile; $5:resultFile; $6:platform; $7:processor; $8:phoneId;
-    Run_Benchmark "${npu_fp16_cfg_file_list[*]}" . '/data/local/tmp' $run_npu_fp16_log_file $run_benchmark_result_file 'arm64' 'NPU' $device_id
-}
-
 basepath=$(pwd)
 echo ${basepath}
-#set -e
 
 # Example:sh run_benchmark_npu.sh -r /home/temp_test -m /home/temp_test/models -d "8KE5T19620002408" -e arm_cpu
 while getopts "r:m:d:e:" opt; do
@@ -78,6 +68,7 @@ version=${file_name_array[2]}
 # Set models config filepath
 models_npu_config=${basepath}/../config/models_npu.cfg
 models_npu_fp16_config=${basepath}/../config/models_npu_fp16.cfg
+models_npu_weightquant_config=${basepath}/../config/models_npu_weightquant.cfg
 
 ms_models_path=${basepath}/ms_models
 
@@ -110,9 +101,6 @@ echo ' ' > ${run_benchmark_result_file}
 run_npu_log_file=${basepath}/run_npu_log.txt
 echo 'run npu logs: ' > ${run_npu_log_file}
 
-run_npu_fp16_log_file=${basepath}/run_npu_fp16_log.txt
-echo 'run npu fp16 logs: ' > ${run_npu_fp16_log_file}
-
 # Copy the MindSpore models:
 echo "Push files to the arm and run benchmark"
 benchmark_test_path=${basepath}/benchmark_test
@@ -133,13 +121,6 @@ if [[ $backend == "all" || $backend == "npu" || $backend == "npu_fp32" ]]; then
     # Run_npu_PID=$!
     # sleep 1
 fi
-if [[ $backend == "all" || $backend == "npu" || $backend == "npu_fp16" ]]; then
-    echo "start Run npu fp16 ..."
-    Run_npu_fp16
-    Run_npu_fp16_status=$?
-    # Run_npu_fp16_PID=$!
-    # sleep 1
-fi
 
 if [[ $backend == "all" || $backend == "npu" || $backend == "npu_fp32" ]]; then
     # wait ${Run_npu_PID}
@@ -150,16 +131,7 @@ if [[ $backend == "all" || $backend == "npu" || $backend == "npu_fp32" ]]; then
         isFailed=1
     fi
 fi
-if [[ $backend == "all" || $backend == "npu" || $backend == "npu_fp16" ]]; then
-    # wait ${Run_npu_fp16_PID}
-    # Run_npu_fp16_status=$?
-    if [[ ${Run_npu_fp16_status} != 0 ]];then
-        echo "Run_npu_fp16 failed"
-        cat ${run_npu_fp16_log_file}
-        isFailed=1
-    fi
-fi
 
-echo "Run_npu and Run_npu_fp16 ended"
+echo "Run_npu ended"
 Print_Benchmark_Result $run_benchmark_result_file
 exit ${isFailed}
diff --git a/mindspore/lite/test/st/scripts/run_benchmark_x86.sh b/mindspore/lite/test/st/scripts/run_benchmark_x86.sh
index dd61c255e70..389d3265b61 100644
--- a/mindspore/lite/test/st/scripts/run_benchmark_x86.sh
+++ b/mindspore/lite/test/st/scripts/run_benchmark_x86.sh
@@ -183,7 +183,6 @@ function Run_x86_java() {
 
 basepath=$(pwd)
 echo ${basepath}
-#set -e
 
 # Example:sh run_benchmark_x86.sh -r /home/temp_test -m /home/temp_test/models -e arm_cpu
 while getopts "r:m:e:" opt; do
@@ -206,8 +205,6 @@ while getopts "r:m:e:" opt; do
     esac
 done
 
-# mkdir train
-
 x86_path=${release_path}/ubuntu_x86
 file_name=$(ls ${x86_path}/*-linux-x64.tar.gz)
 IFS="-" read -r -a file_name_array <<< "$file_name"
diff --git a/mindspore/lite/test/st/scripts/run_net_train.sh b/mindspore/lite/test/st/scripts/run_net_train.sh
index cde5ff984d4..6773943c8b1 100755
--- a/mindspore/lite/test/st/scripts/run_net_train.sh
+++ b/mindspore/lite/test/st/scripts/run_net_train.sh
@@ -399,7 +399,7 @@ function Run_CodeExamples() {
       cd -
     fi
 
-    if [[ $backend == "all" || $backend == "train" || $backend == "x86_train" || $backend == "codegen&train" || $backend == "arm64_train" ]]; then
+    if [[ $backend == "all" || $backend == "train" || $backend == "x86_train" || $backend == "codegen_and_train" || $backend == "arm64_train" ]]; then
 
       should_run_example "unified_api"
       should_run=$?
@@ -630,21 +630,21 @@ mkdir -p ${benchmark_train_test_path}
 cp -a ${ms_models_path}/*.ms ${benchmark_train_test_path}
 
 isFailed=0
-if [[ $backend == "all" || $backend == "train" || $backend == "x86_train" || $backend == "codegen&train" ]]; then
+if [[ $backend == "all" || $backend == "train" || $backend == "x86_train" || $backend == "codegen_and_train" ]]; then
     # Run on x86
     echo "Start Run x86 ..."
     Run_x86 &
     Run_x86_PID=$!
     sleep 1
 fi
-if [[ $backend == "all" || $backend == "train" || $backend == "x86_train" || $backend == "x86-java" || $backend == "codegen&train" || $backend == "arm64_train" ]]; then
+if [[ $backend == "all" || $backend == "train" || $backend == "x86_train" || $backend == "x86-java" || $backend == "codegen_and_train" || $backend == "arm64_train" ]]; then
     # Run Code Examples 
     echo "Start Code Examples ..."
     Run_CodeExamples &
     Run_CodeExamples_PID=$!
     sleep 1
 fi
-if [[ $backend == "all" || $backend == "train" || $backend == "arm64_train" || $backend == "codegen&train" ]]; then
+if [[ $backend == "all" || $backend == "train" || $backend == "arm64_train" || $backend == "codegen_and_train" ]]; then
     # Run on arm64
     echo "Start Run arm64 ..."
     Run_arm arm64
@@ -652,7 +652,7 @@ if [[ $backend == "all" || $backend == "train" || $backend == "arm64_train" || $
 #   Run_arm64_PID=$!
 #   sleep 1
 fi
-if [[ $backend == "all" || $backend == "train" || $backend == "arm32_train" || $backend == "codegen&train" ]]; then
+if [[ $backend == "all" || $backend == "train" || $backend == "arm32_train" || $backend == "codegen_and_train" ]]; then
     # Run on arm32
     echo "Start Run arm32 ..."
     Run_arm arm32
@@ -661,7 +661,7 @@ if [[ $backend == "all" || $backend == "train" || $backend == "arm32_train" || $
 #   sleep 1
 fi
 
-if [[ $backend == "all" || $backend == "train" || $backend == "x86_train" || $backend == "codegen&train" ]]; then
+if [[ $backend == "all" || $backend == "train" || $backend == "x86_train" || $backend == "codegen_and_train" ]]; then
     wait ${Run_x86_PID}
     Run_x86_status=$?
     if [[ ${Run_x86_status} != 0 ]];then
@@ -670,7 +670,7 @@ if [[ $backend == "all" || $backend == "train" || $backend == "x86_train" || $ba
         isFailed=1
     fi
 fi
-if [[ $backend == "all" || $backend == "train" || $backend == "x86_train" || $backend == "x86-java" || $backend == "codegen&train" || $backend == "arm64_train" ]]; then
+if [[ $backend == "all" || $backend == "train" || $backend == "x86_train" || $backend == "x86-java" || $backend == "codegen_and_train" || $backend == "arm64_train" ]]; then
     wait ${Run_CodeExamples_PID}
     Run_CodeExamples_status=$?
     if [[ ${Run_CodeExamples_status} != 0 ]];then
@@ -681,7 +681,7 @@ if [[ $backend == "all" || $backend == "train" || $backend == "x86_train" || $ba
 fi
 
 
-if [[ $backend == "all" || $backend == "train" || $backend == "arm64_train" || $backend == "codegen&train" ]]; then
+if [[ $backend == "all" || $backend == "train" || $backend == "arm64_train" || $backend == "codegen_and_train" ]]; then
 #   wait ${Run_arm64_PID}
 #   Run_arm64_status=$?
     if [[ ${Run_arm64_status} != 0 ]];then
@@ -690,7 +690,7 @@ if [[ $backend == "all" || $backend == "train" || $backend == "arm64_train" || $
         isFailed=1
     fi
 fi
-if [[ $backend == "all" || $backend == "train" || $backend == "arm32_train" || $backend == "codegen&train" ]]; then
+if [[ $backend == "all" || $backend == "train" || $backend == "arm32_train" || $backend == "codegen_and_train" ]]; then
 #   wait ${Run_arm32_PID}
 #   Run_arm32_status=$?
     if [[ ${Run_arm32_status} != 0 ]];then
diff --git a/mindspore/lite/test/ut/src/registry/registry_custom_op_test.cc b/mindspore/lite/test/ut/src/registry/registry_custom_op_test.cc
index 9e6211e578a..76df850feab 100644
--- a/mindspore/lite/test/ut/src/registry/registry_custom_op_test.cc
+++ b/mindspore/lite/test/ut/src/registry/registry_custom_op_test.cc
@@ -25,7 +25,7 @@
 #include "include/errorcode.h"
 #include "src/common/log_adapter.h"
 #include "src/lite_session.h"
-#include "include/registry/kernel_interface.h"
+#include "include/registry/register_kernel_interface.h"
 #include "include/registry/register_kernel.h"
 
 using mindspore::kernel::Kernel;
@@ -39,6 +39,7 @@ namespace mindspore {
 namespace {
 const char *const kKeyName = "test_key";
 const char *const kTestData = "test_data";
+constexpr auto kFloat32 = DataType::kNumberTypeFloat32;
 }  // namespace
 
 class TestData {
@@ -110,12 +111,12 @@ class TestCustomOpInfer : public KernelInterface {
  public:
   TestCustomOpInfer() = default;
   ~TestCustomOpInfer() = default;
-  int Infer(std::vector<mindspore::MSTensor> *inputs, std::vector<mindspore::MSTensor> *outputs,
-            const schema::Primitive *primitive) override {
+  Status Infer(std::vector<mindspore::MSTensor> *inputs, std::vector<mindspore::MSTensor> *outputs,
+               const schema::Primitive *primitive) override {
     (*outputs)[0].SetFormat((*inputs)[0].format());
     (*outputs)[0].SetDataType((*inputs)[0].DataType());
     (*outputs)[0].SetShape((*inputs)[0].Shape());
-    return RET_OK;
+    return kSuccess;
   }
 };
 
@@ -128,7 +129,7 @@ std::shared_ptr<Kernel> TestCustomAddCreator(const std::vector<MSTensor> &inputs
 std::shared_ptr<KernelInterface> CustomAddInferCreator() { return std::make_shared<TestCustomOpInfer>(); }
 }  // namespace
 
-REGISTER_CUSTOM_KERNEL(CPU, BuiltInTest, kNumberTypeFloat32, Add, TestCustomAddCreator)
+REGISTER_CUSTOM_KERNEL(CPU, BuiltInTest, kFloat32, Add, TestCustomAddCreator)
 REGISTER_CUSTOM_KERNEL_INTERFACE(BuiltInTest, Add, CustomAddInferCreator)
 
 class TestRegistryCustomOp : public mindspore::CommonTest {
diff --git a/mindspore/lite/test/ut/src/registry/registry_test.cc b/mindspore/lite/test/ut/src/registry/registry_test.cc
index 3bbb525a1fa..e3c4b2bfdc0 100644
--- a/mindspore/lite/test/ut/src/registry/registry_test.cc
+++ b/mindspore/lite/test/ut/src/registry/registry_test.cc
@@ -25,7 +25,7 @@
 #include "src/common/log_adapter.h"
 #include "src/lite_session.h"
 #include "src/runtime/inner_allocator.h"
-#include "include/registry/kernel_interface.h"
+#include "include/registry/register_kernel_interface.h"
 #include "include/registry/register_kernel.h"
 
 using mindspore::kernel::Kernel;
@@ -80,12 +80,12 @@ class TestCustomAddInfer : public KernelInterface {
  public:
   TestCustomAddInfer() = default;
   ~TestCustomAddInfer() = default;
-  int Infer(std::vector<mindspore::MSTensor> *inputs, std::vector<mindspore::MSTensor> *outputs,
-            const schema::Primitive *primitive) override {
+  Status Infer(std::vector<mindspore::MSTensor> *inputs, std::vector<mindspore::MSTensor> *outputs,
+               const schema::Primitive *primitive) override {
     (*outputs)[0].SetFormat((*inputs)[0].format());
     (*outputs)[0].SetDataType((*inputs)[0].DataType());
     (*outputs)[0].SetShape((*inputs)[0].Shape());
-    return RET_OK;
+    return kSuccess;
   }
 };
 
@@ -96,9 +96,10 @@ std::shared_ptr<Kernel> TestCustomAddCreator(const std::vector<MSTensor> &inputs
 }
 
 std::shared_ptr<KernelInterface> CustomAddInferCreator() { return std::make_shared<TestCustomAddInfer>(); }
+const auto kFloat32 = DataType::kNumberTypeFloat32;
 }  // namespace
 
-REGISTER_KERNEL(CPU, BuiltInTest, kNumberTypeFloat32, PrimitiveType_AddFusion, TestCustomAddCreator)
+REGISTER_KERNEL(CPU, BuiltInTest, kFloat32, PrimitiveType_AddFusion, TestCustomAddCreator)
 REGISTER_KERNEL_INTERFACE(BuiltInTest, PrimitiveType_AddFusion, CustomAddInferCreator)
 
 class TestRegistry : public mindspore::CommonTest {
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/cxx_api/model_test.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/cxx_api/model_test.cc
index 115f67393c8..dbad66a2f94 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/cxx_api/model_test.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/cxx_api/model_test.cc
@@ -30,7 +30,7 @@ TEST_F(TestCxxApiLiteModel, test_build_context_uninitialized_FAILED) {
   Model model;
   Graph graph;
 
-  ASSERT_TRUE(Serialization::Load("./nets/conv_train_model.ms", ModelType::kFlatBuffer, &graph) == kSuccess);
+  ASSERT_TRUE(Serialization::Load("./nets/conv_train_model.ms", ModelType::kMindIR, &graph) == kSuccess);
   auto status = model.Build(GraphCell(graph), nullptr, nullptr);
   ASSERT_TRUE(status != kSuccess);
   auto err_mst = status.GetErrDescription();
@@ -53,7 +53,7 @@ TEST_F(TestCxxApiLiteModel, test_build_SUCCES) {
   auto cpu_context = std::make_shared<mindspore::CPUDeviceInfo>();
   context->MutableDeviceInfo().push_back(cpu_context);
 
-  ASSERT_TRUE(Serialization::Load("./nets/conv_train_model.ms", ModelType::kFlatBuffer, &graph) == kSuccess);
+  ASSERT_TRUE(Serialization::Load("./nets/conv_train_model.ms", ModelType::kMindIR, &graph) == kSuccess);
   ASSERT_TRUE(model.Build(GraphCell(graph), context, nullptr) == kSuccess);
 }
 
@@ -69,7 +69,7 @@ TEST_F(TestCxxApiLiteModel, test_train_mode_SUCCES) {
   auto cpu_context = std::make_shared<mindspore::CPUDeviceInfo>();
   context->MutableDeviceInfo().push_back(cpu_context);
 
-  ASSERT_TRUE(Serialization::Load("./nets/conv_train_model.ms", ModelType::kFlatBuffer, &graph) == kSuccess);
+  ASSERT_TRUE(Serialization::Load("./nets/conv_train_model.ms", ModelType::kMindIR, &graph) == kSuccess);
   ASSERT_TRUE(model.Build(GraphCell(graph), context, nullptr) == kSuccess);
   ASSERT_TRUE(model.SetTrainMode(true) == kSuccess);
   ASSERT_TRUE(model.GetTrainMode() == true);
@@ -88,7 +88,7 @@ TEST_F(TestCxxApiLiteModel, test_outputs_SUCCESS) {
   auto cpu_context = std::make_shared<mindspore::CPUDeviceInfo>();
   context->MutableDeviceInfo().push_back(cpu_context);
 
-  ASSERT_TRUE(Serialization::Load("./nets/conv_train_model.ms", ModelType::kFlatBuffer, &graph) == kSuccess);
+  ASSERT_TRUE(Serialization::Load("./nets/conv_train_model.ms", ModelType::kMindIR, &graph) == kSuccess);
   ASSERT_TRUE(model.Build(GraphCell(graph), context, nullptr) == kSuccess);
   auto outputs = model.GetOutputs();
   ASSERT_GT(outputs.size(), 0);
@@ -109,7 +109,7 @@ TEST_F(TestCxxApiLiteModel, test_metrics_SUCCESS) {
   auto cpu_context = std::make_shared<mindspore::CPUDeviceInfo>();
   context->MutableDeviceInfo().push_back(cpu_context);
 
-  ASSERT_TRUE(Serialization::Load("./nets/conv_train_model.ms", ModelType::kFlatBuffer, &graph) == kSuccess);
+  ASSERT_TRUE(Serialization::Load("./nets/conv_train_model.ms", ModelType::kMindIR, &graph) == kSuccess);
   ASSERT_TRUE(model.Build(GraphCell(graph), context, nullptr) == kSuccess);
   AccuracyMetrics ac;
   ASSERT_TRUE(model.InitMetrics({&ac}) == kSuccess);
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/cxx_api/serialization_test.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/cxx_api/serialization_test.cc
index d8be4d487be..f20adcca66a 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/cxx_api/serialization_test.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/cxx_api/serialization_test.cc
@@ -25,7 +25,7 @@ class TestCxxApiLiteSerialization : public mindspore::CommonTest {
 
 TEST_F(TestCxxApiLiteSerialization, test_load_no_encrpty_mindir_SUCCESS) {
   Graph graph;
-  ASSERT_TRUE(Serialization::Load("./nets/retinaface1.ms", ModelType::kFlatBuffer, &graph) == kSuccess);
+  ASSERT_TRUE(Serialization::Load("./nets/retinaface1.ms", ModelType::kMindIR, &graph) == kSuccess);
 }
 
 TEST_F(TestCxxApiLiteSerialization, test_load_file_not_exist_FAILED) {
@@ -37,7 +37,7 @@ TEST_F(TestCxxApiLiteSerialization, test_load_file_not_exist_FAILED) {
 TEST_F(TestCxxApiLiteSerialization, test_load_file_not_exist_x2_FAILED) {
   std::vector<Graph> graphs;
   auto status =
-    Serialization::Load(std::vector<std::string>(2, "./nets/file_not_exist.mindir"), ModelType::kMindIR, &graphs);
+    Serialization::Load(std::vector<std::string>(2, "./nets/file_not_exist.mindir"), ModelType::kFlatBuffer, &graphs);
   ASSERT_TRUE(status != kSuccess);
 }
 
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/activation_fp32_test.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/activation_fp32_test.cc
index d6baa4b45fc..0210de45f97 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/activation_fp32_test.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/activation_fp32_test.cc
@@ -138,6 +138,7 @@ TEST_F(TestActivationFp32, HSwishFp32) {
 
   input0_tensor.set_data(nullptr);
   output0_tensor.set_data(nullptr);
+  delete kernel;
 }
 
 TEST_F(TestActivationFp32, HardTanh1) {
@@ -184,6 +185,7 @@ TEST_F(TestActivationFp32, HardTanh1) {
 
   input0_tensor.set_data(nullptr);
   output0_tensor.set_data(nullptr);
+  delete kernel;
 }
 
 TEST_F(TestActivationFp32, HardTanh2) {
@@ -230,6 +232,7 @@ TEST_F(TestActivationFp32, HardTanh2) {
 
   input0_tensor.set_data(nullptr);
   output0_tensor.set_data(nullptr);
+  delete kernel;
 }
 
 TEST_F(TestActivationFp32, Softplus) {
@@ -275,6 +278,7 @@ TEST_F(TestActivationFp32, Softplus) {
 
   input0_tensor.set_data(nullptr);
   output0_tensor.set_data(nullptr);
+  delete kernel;
 }
 
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/batchnorm_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/batchnorm_fp32_tests.cc
index 06a31c349a2..9edabeafd3f 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/batchnorm_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/batchnorm_fp32_tests.cc
@@ -77,6 +77,7 @@ TEST_F(TestBatchnormFp32, BNTest) {
   input1_tensor.set_data(nullptr);
   input2_tensor.set_data(nullptr);
   output0_tensor.set_data(nullptr);
+  delete kernel;
 }
 
 TEST_F(TestBatchnormFp32, FusedBNTest) {
@@ -137,6 +138,7 @@ TEST_F(TestBatchnormFp32, FusedBNTest) {
   input3.set_data(nullptr);
   input4.set_data(nullptr);
   output0.set_data(nullptr);
+  delete kernel;
 }
 
 TEST_F(TestBatchnormFp32, easyTest) {
@@ -188,6 +190,7 @@ TEST_F(TestBatchnormFp32, easyTest) {
   input1.set_data(nullptr);
   input2.set_data(nullptr);
   output0.set_data(nullptr);
+  delete kernel;
 }
 
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/crop_fp32_test.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/crop_fp32_test.cc
index 728f536d5cd..05e8af3bfae 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/crop_fp32_test.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/crop_fp32_test.cc
@@ -284,5 +284,12 @@ TEST_F(CropTestFp32, CropTest11) {
   std::cout << "\n";
   ASSERT_EQ(0, CompareOutputData(output, expect_out, kOutSize, 0.000001));
   delete ctx;
+  for (unsigned int i = 0; i < inputs.size(); i++) {
+    delete inputs[i];
+  }
+  for (unsigned int i = 0; i < outputs.size(); i++) {
+    delete outputs[i];
+  }
+  delete kernel;
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/elu_fp32_test.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/elu_fp32_test.cc
index 612695f4329..4aa28f4ba6b 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/elu_fp32_test.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/elu_fp32_test.cc
@@ -69,6 +69,13 @@ TEST_F(TestEluFp32, EluTest) {
   }
   std::cout << std::endl;
   delete ctx;
+  for (unsigned int i = 0; i < inputs_.size(); i++) {
+    delete inputs_[i];
+  }
+  for (unsigned int i = 0; i < outputs_.size(); i++) {
+    delete outputs_[i];
+  }
+  delete elu;
 }
 
 };  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/embedding_lookup_fp32_test.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/embedding_lookup_fp32_test.cc
index 25f28deabdd..0db91b4e17a 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/embedding_lookup_fp32_test.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/embedding_lookup_fp32_test.cc
@@ -81,6 +81,14 @@ TEST_F(TestEmbeddingLookupFp32, ElTest) {
     std::cout << out[i] << ' ';
   }
   std::cout << std::endl;
+  for (unsigned int i = 0; i < inputs_.size(); i++) {
+    delete inputs_[i];
+  }
+  for (unsigned int i = 0; i < outputs_.size(); i++) {
+    delete outputs_[i];
+  }
+  delete el;
+  delete ctx;
 }
 
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/fullconnection_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/fullconnection_fp32_tests.cc
index 6c741385daf..cad86db967b 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/fullconnection_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/fullconnection_fp32_tests.cc
@@ -93,6 +93,12 @@ TEST_F(TestFcFp32, FcTest1) {
   ASSERT_EQ(0, CompareOutputData(reinterpret_cast<float *>(outputs_[0]->MutableData()), correct, total_size, 0.0001));
   delete fc;
   delete ctx;
+  for (unsigned int i = 0; i < inputs_.size(); i++) {
+    delete inputs_[i];
+  }
+  for (unsigned int i = 0; i < outputs_.size(); i++) {
+    delete outputs_[i];
+  }
 }
 
 int FcTestInit2(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tensor *> *outputs_,
@@ -155,6 +161,12 @@ TEST_F(TestFcFp32, FcTest2) {
 #endif
   fc->Run();
   ASSERT_EQ(0, CompareOutputData(reinterpret_cast<float *>(outputs_[0]->MutableData()), correct, total_size, 0.0001));
+  for (unsigned int i = 0; i < inputs_.size(); i++) {
+    delete inputs_[i];
+  }
+  for (unsigned int i = 0; i < outputs_.size(); i++) {
+    delete outputs_[i];
+  }
   delete fc;
   delete ctx;
 }
@@ -212,6 +224,12 @@ TEST_F(TestFcFp32, FcTest3) {
   for (int i = 0; i < 100000; ++i) fc->Run();
   gettimeofday(&end, nullptr);
   // printf("## elapsed: %llu\n", 1000000 * (end.tv_sec - start.tv_sec) + end.tv_usec - end.tv_usec);
+  for (unsigned int i = 0; i < inputs_.size(); i++) {
+    delete inputs_[i];
+  }
+  for (unsigned int i = 0; i < outputs_.size(); i++) {
+    delete outputs_[i];
+  }
   delete fc;
   delete ctx;
 }
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/l2norm_fp32_test.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/l2norm_fp32_test.cc
index f2ed0cfacc4..db7731704af 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/l2norm_fp32_test.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/l2norm_fp32_test.cc
@@ -41,6 +41,7 @@ class TestL2NormFp32 : public mindspore::CommonTest {
 };
 
 void TestL2NormFp32::TearDown() {
+  delete kernel_;
   in_tensor_.set_data(nullptr);
   out_tensor_.set_data(nullptr);
 }
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/lsh_projection_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/lsh_projection_fp32_tests.cc
index 0f20496c75f..ed13a78f36d 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/lsh_projection_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/lsh_projection_fp32_tests.cc
@@ -78,6 +78,7 @@ TEST_F(TestLshProjectionFp32, Dense1DInputs) {
   in_tensor0.set_data(nullptr);
   in_tensor1.set_data(nullptr);
   out_tensor.set_data(nullptr);
+  delete kernel;
 }
 
 TEST_F(TestLshProjectionFp32, Sparse1DInputs) {
@@ -120,6 +121,7 @@ TEST_F(TestLshProjectionFp32, Sparse1DInputs) {
   in_tensor0.set_data(nullptr);
   in_tensor1.set_data(nullptr);
   out_tensor.set_data(nullptr);
+  delete kernel;
 }
 
 TEST_F(TestLshProjectionFp32, Sparse3DInputs) {
@@ -166,5 +168,6 @@ TEST_F(TestLshProjectionFp32, Sparse3DInputs) {
   in_tensor0.set_data(nullptr);
   in_tensor1.set_data(nullptr);
   out_tensor.set_data(nullptr);
+  delete kernel;
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/lstm_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/lstm_fp32_tests.cc
index 3f82c3d5450..cff528efc0d 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/lstm_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/lstm_fp32_tests.cc
@@ -180,6 +180,7 @@ TEST_F(LstmFp32, LstmForwardFp32Accuracy) {
     delete output;
   }
   delete kernel;
+  delete ctx;
   MS_LOG(INFO) << "LstmFp32 forward accuracy passed";
 }
 
@@ -332,6 +333,7 @@ TEST_F(LstmFp32, LstmBackwardFp32Accuracy) {
     delete output;
   }
   delete kernel;
+  delete ctx;
   MS_LOG(INFO) << "LstmFp32 backward accuracy passed";
 }
 
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/non_max_suppression_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/non_max_suppression_fp32_tests.cc
index e833e77cc8a..4e2b1fa5984 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/non_max_suppression_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/non_max_suppression_fp32_tests.cc
@@ -50,6 +50,7 @@ class TestNMSFp32 : public mindspore::CommonTest {
 };
 
 void TestNMSFp32::TearDown() {
+  delete kernel_;
   box_tensor_.set_data(nullptr);
   score_tensor_.set_data(nullptr);
   max_output_box_per_class_tensor_.set_data(nullptr);
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/pad_fp32_test.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/pad_fp32_test.cc
index efe4472ea76..4e18f2dc4a0 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/pad_fp32_test.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/pad_fp32_test.cc
@@ -51,6 +51,7 @@ class TestPadFp32 : public mindspore::CommonTest {
 };
 
 void TestPadFp32::TearDown() {
+  delete kernel_;
   paddings_tensor_.set_data(nullptr);
   in_tensor_.set_data(nullptr);
   out_tensor_.set_data(nullptr);
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/ragged_range_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/ragged_range_fp32_tests.cc
index 17998dc3ab7..47f1630a387 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/ragged_range_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/ragged_range_fp32_tests.cc
@@ -76,6 +76,7 @@ TEST_F(TestRaggedRangeFp32, 001) {
   in_tensor2.set_data(nullptr);
   out_tensor0.set_data(nullptr);
   out_tensor1.set_data(nullptr);
+  delete kernel;
 }
 
 TEST_F(TestRaggedRangeFp32, 002) {
@@ -128,5 +129,6 @@ TEST_F(TestRaggedRangeFp32, 002) {
   in_tensor2.set_data(nullptr);
   out_tensor0.set_data(nullptr);
   out_tensor1.set_data(nullptr);
+  delete kernel;
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/reduce_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/reduce_fp32_tests.cc
index ecf39d5b49d..cded35d1946 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/reduce_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/reduce_fp32_tests.cc
@@ -60,6 +60,9 @@ class TestReduceFp32 : public mindspore::CommonTest {
 
 void TestReduceFp32::TearDown() {
   delete ctx_;
+  delete kernel_;
+  ctx_ = nullptr;
+  kernel_ = nullptr;
   in_tensor_.set_data(nullptr);
   out_tensor_.set_data(nullptr);
 }
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/resize_bilinear_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/resize_bilinear_fp32_tests.cc
index ab0817da7d2..b8b71586bdd 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/resize_bilinear_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/resize_bilinear_fp32_tests.cc
@@ -45,6 +45,7 @@ class TestResizeBilinearFp32 : public mindspore::CommonTest {
 };
 
 void TestResizeBilinearFp32::TearDown() {
+  delete kernel_;
   in_tensor_.set_data(nullptr);
   out_tensor_.set_data(nullptr);
 }
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/resize_nearest_neighbor_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/resize_nearest_neighbor_fp32_tests.cc
index 111a5ce0e31..2ff1890a6c4 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/resize_nearest_neighbor_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/resize_nearest_neighbor_fp32_tests.cc
@@ -42,6 +42,7 @@ class TestResizeNearestNeighborFp32 : public mindspore::CommonTest {
 };
 
 void TestResizeNearestNeighborFp32::TearDown() {
+  delete kernel_;
   in_tensor_.set_data(nullptr);
   out_tensor_.set_data(nullptr);
 }
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/reverse_sequence_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/reverse_sequence_fp32_tests.cc
index 554793f7c37..66a553be546 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/reverse_sequence_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/reverse_sequence_fp32_tests.cc
@@ -70,6 +70,7 @@ TEST_F(TestReverseSequenceFp32, BatchLessSeq) {
   in_tensor0.set_data(nullptr);
   in_tensor1.set_data(nullptr);
   out_tensor.set_data(nullptr);
+  delete kernel;
 }
 
 TEST_F(TestReverseSequenceFp32, BatchGreaterSeq) {
@@ -116,6 +117,7 @@ TEST_F(TestReverseSequenceFp32, BatchGreaterSeq) {
   in_tensor0.set_data(nullptr);
   in_tensor1.set_data(nullptr);
   out_tensor.set_data(nullptr);
+  delete kernel;
 }
 
 TEST_F(TestReverseSequenceFp32, BatchSeqNotAdjacent) {
@@ -162,5 +164,6 @@ TEST_F(TestReverseSequenceFp32, BatchSeqNotAdjacent) {
   in_tensor0.set_data(nullptr);
   in_tensor1.set_data(nullptr);
   out_tensor.set_data(nullptr);
+  delete kernel;
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/roi_pooling_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/roi_pooling_fp32_tests.cc
index f6e6d580779..973fa3851b0 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/roi_pooling_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/roi_pooling_fp32_tests.cc
@@ -71,6 +71,7 @@ TEST_F(TestROIPoolingFp32, Simple) {
   printf("\n");
   ASSERT_EQ(0, CompareOutputData(reinterpret_cast<float *>(outputs_[0]->MutableData()), correct, total_size, 0.0001));
   delete op;
+  delete ctx;
   for (auto t : inputs_) delete t;
   for (auto t : outputs_) delete t;
 }
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/scale_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/scale_fp32_tests.cc
index 48909768cc0..7debf18e541 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/scale_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/scale_fp32_tests.cc
@@ -54,6 +54,7 @@ class TestScaleFp32 : public mindspore::CommonTest {
 };
 
 void TestScaleFp32::TearDown() {
+  delete kernel_;
   in_tensor_.set_data(nullptr);
   scale_tensor_.set_data(nullptr);
   offset_tensor_.set_data(nullptr);
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/skip_gram_fp32.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/skip_gram_fp32.cc
index 7bbc852b1b3..68a1c709139 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/skip_gram_fp32.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/skip_gram_fp32.cc
@@ -60,11 +60,11 @@ TEST_F(TestSkipGramFp32, ElTest) {
   lite::InnerContext *ctx = new lite::InnerContext;
   ctx->thread_num_ = 2;
   ASSERT_EQ(lite::RET_OK, ctx->Init());
-  kernel::SkipGramCPUKernel *el =
+  kernel::SkipGramCPUKernel *op =
     new kernel::SkipGramCPUKernel(reinterpret_cast<OpParameter *>(skip_gram_param_), inputs_, outputs_, ctx);
 
-  el->Init();
-  el->Run();
+  op->Init();
+  op->Run();
 
   std::vector<StringPack> output = mindspore::lite::ParseTensorBuffer(outputs_[0]);
   for (unsigned int i = 0; i < output.size(); i++) {
@@ -73,6 +73,13 @@ TEST_F(TestSkipGramFp32, ElTest) {
     }
     printf("\n");
   }
+  for (unsigned int i = 0; i < inputs_.size(); i++) {
+    delete inputs_[i];
+  }
+  for (unsigned int i = 0; i < outputs_.size(); i++) {
+    delete outputs_[i];
+  }
+  delete op;
   delete ctx;
 }
 
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/softmax_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/softmax_tests.cc
index f345e1a482f..0b5758276f8 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/softmax_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/softmax_tests.cc
@@ -57,5 +57,6 @@ TEST_F(TestSoftmaxFp32, 001) {
   }
   in_tensor.set_data(nullptr);
   out_tensor.set_data(nullptr);
+  delete kernel;
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/space_to_depth_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/space_to_depth_fp32_tests.cc
index 62b04a3e4bd..c10e4181318 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/space_to_depth_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/space_to_depth_fp32_tests.cc
@@ -92,6 +92,7 @@ TEST_F(SpaceToDepthTestFp32, SpaceToDepthTest2) {
   }
   std::cout << "\n";
   ASSERT_EQ(0, CompareOutputData(output.data(), expect_out, out_size, 0.000001));
+  delete kernel;
 }
 
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/sparse_to_dense_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/sparse_to_dense_fp32_tests.cc
index aab1a3d139a..b9e09725923 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/sparse_to_dense_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/sparse_to_dense_fp32_tests.cc
@@ -114,6 +114,7 @@ TEST_F(TestSparseToDenseFp32, SparseToDense_test1) {
   delete input_tensor4;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestSparseToDenseFp32, SparseToDense_test2) {
@@ -200,6 +201,7 @@ TEST_F(TestSparseToDenseFp32, SparseToDense_test2) {
   delete input_tensor4;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestSparseToDenseFp32, SparseToDense_test3) {
@@ -284,6 +286,7 @@ TEST_F(TestSparseToDenseFp32, SparseToDense_test3) {
   delete input_tensor4;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestSparseToDenseFp32, SparseToDense_test4) {
@@ -368,6 +371,7 @@ TEST_F(TestSparseToDenseFp32, SparseToDense_test4) {
   delete input_tensor4;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestSparseToDenseFp32, SparseToDense_test5) {
@@ -454,5 +458,6 @@ TEST_F(TestSparseToDenseFp32, SparseToDense_test5) {
   delete input_tensor4;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/strided_slice_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/strided_slice_fp32_tests.cc
index 97bd90661a8..e227000fc1a 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/strided_slice_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/strided_slice_fp32_tests.cc
@@ -164,6 +164,7 @@ TEST_F(TestStridedSliceFp32, StridedSlice3) {
   ret = kernel->Run();
   EXPECT_EQ(0, ret);
   delete ctx;
+  delete kernel;
 
   ASSERT_EQ(0, CompareOutputData(output_data, correct, 2, 0.000001));
   input_tensor.set_data(nullptr);
@@ -217,6 +218,7 @@ TEST_F(TestStridedSliceFp32, StridedSlice4) {
   ret = kernel->Run();
   EXPECT_EQ(0, ret);
   delete ctx;
+  delete kernel;
 
   ASSERT_EQ(0, CompareOutputData(output_data, correct, 4, 0.000001));
   input_tensor.set_data(nullptr);
@@ -277,6 +279,7 @@ TEST_F(TestStridedSliceFp32, StridedSlice5) {
   ret = kernel->Run();
   EXPECT_EQ(0, ret);
   delete ctx;
+  delete kernel;
 
   ASSERT_EQ(0, CompareOutputData(output_data, correct, 12, 0.000001));
   input_tensor.set_data(nullptr);
@@ -337,6 +340,7 @@ TEST_F(TestStridedSliceFp32, StridedSlice6) {
   ret = kernel->Run();
   EXPECT_EQ(0, ret);
   delete ctx;
+  delete kernel;
 
   ASSERT_EQ(0, CompareOutputData(output_data, correct, 8, 0.000001));
   input_tensor.set_data(nullptr);
@@ -389,6 +393,7 @@ TEST_F(TestStridedSliceFp32, StridedSlice7) {
   ret = kernel->Run();
   EXPECT_EQ(0, ret);
   delete ctx;
+  delete kernel;
 
   ASSERT_EQ(0, CompareOutputData(output_data, correct, 1, 0.000001));
   input_tensor.set_data(nullptr);
@@ -449,6 +454,7 @@ TEST_F(TestStridedSliceFp32, StridedSlice8) {
   ret = kernel->Run();
   EXPECT_EQ(0, ret);
   delete ctx;
+  delete kernel;
 
   ASSERT_EQ(0, CompareOutputData(output_data, correct, 5, 0.000001));
   input_tensor.set_data(nullptr);
@@ -602,6 +608,7 @@ TEST_F(TestStridedSliceFp32, StridedSlice9) {
   ret = kernel->Run();
   EXPECT_EQ(0, ret);
   delete ctx;
+  delete kernel;
 
   ASSERT_EQ(0, CompareOutputData(output_data, correct, 490, 0.000001));
   input_tensor.set_data(nullptr);
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/tile_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/tile_fp32_tests.cc
index 317e0d96815..1b64b3a5289 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/tile_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/tile_fp32_tests.cc
@@ -68,6 +68,7 @@ TEST_F(TestTileFp32, Tile) {
 
   in_tensor.set_data(nullptr);
   out_tensor.set_data(nullptr);
+  delete kernel;
 }
 
 TEST_F(TestTileFp32, SimpleTile1) {
@@ -115,6 +116,7 @@ TEST_F(TestTileFp32, SimpleTile1) {
 
   in_tensor.set_data(nullptr);
   out_tensor.set_data(nullptr);
+  delete kernel;
 }
 
 TEST_F(TestTileFp32, SimpleTile2) {
@@ -162,5 +164,6 @@ TEST_F(TestTileFp32, SimpleTile2) {
 
   in_tensor.set_data(nullptr);
   out_tensor.set_data(nullptr);
+  delete kernel;
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/topk_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/topk_fp32_tests.cc
index 041acc41bb9..3fe097e3996 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/topk_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/topk_fp32_tests.cc
@@ -66,5 +66,6 @@ TEST_F(TestTopKFp32, TopK) {
   in_tensor.set_data(nullptr);
   out_tensor0.set_data(nullptr);
   out_tensor1.set_data(nullptr);
+  delete kernel;
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/uniform_real_fp32_test.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/uniform_real_fp32_test.cc
index 8785609f5b4..3b7bab7b540 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/uniform_real_fp32_test.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/uniform_real_fp32_test.cc
@@ -65,5 +65,6 @@ TEST_F(TestUniformRealFp32, UniformReal) {
     std::cout << output_data0[i] << " ";
   }
   out_tensor0.set_data(nullptr);
+  delete kernel;
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/unique_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/unique_fp32_tests.cc
index 0e08c127e46..695a107dcc2 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/unique_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/unique_fp32_tests.cc
@@ -69,5 +69,6 @@ TEST_F(TestUniqueFp32, Unique) {
   in_tensor.set_data(nullptr);
   out_tensor0.set_data(nullptr);
   out_tensor1.set_data(nullptr);
+  delete kernel;
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/unstack_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/unstack_fp32_tests.cc
index b9bf1252360..12c2b8ca4e7 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/unstack_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/unstack_fp32_tests.cc
@@ -77,6 +77,7 @@ TEST_F(TestUnstackFp32, Unstack) {
   out_tensor1.set_data(nullptr);
   out_tensor2.set_data(nullptr);
   out_tensor3.set_data(nullptr);
+  delete kernel;
 }
 
 TEST_F(TestUnstackFp32, Unstack2) {
@@ -124,5 +125,6 @@ TEST_F(TestUnstackFp32, Unstack2) {
   out_tensor0.set_data(nullptr);
   out_tensor1.set_data(nullptr);
   out_tensor2.set_data(nullptr);
+  delete kernel;
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/add_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/add_int8_tests.cc
index 9e703106024..b4e91216020 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/add_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/add_int8_tests.cc
@@ -73,5 +73,6 @@ TEST_F(TestQuantizedAdd, Add) {
   in_tensor0.set_data(nullptr);
   in_tensor1.set_data(nullptr);
   out_tensor.set_data(nullptr);
+  delete kernel;
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/arithmetic_self_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/arithmetic_self_int8_tests.cc
index 13168ab23e7..c36d114c436 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/arithmetic_self_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/arithmetic_self_int8_tests.cc
@@ -89,6 +89,7 @@ TEST_F(TestArithmeticSelfInt8, floor_quant0_thread2) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestArithmeticSelfInt8, floor_quant1_thread2) {
@@ -151,6 +152,7 @@ TEST_F(TestArithmeticSelfInt8, floor_quant1_thread2) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestArithmeticSelfInt8, round_quant0_thread2) {
@@ -213,6 +215,7 @@ TEST_F(TestArithmeticSelfInt8, round_quant0_thread2) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestArithmeticSelfInt8, round_quant1_thread2) {
@@ -275,6 +278,7 @@ TEST_F(TestArithmeticSelfInt8, round_quant1_thread2) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestArithmeticSelfInt8, ceil_quant0_thread2) {
@@ -337,6 +341,7 @@ TEST_F(TestArithmeticSelfInt8, ceil_quant0_thread2) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestArithmeticSelfInt8, ceil_quant1_thread2) {
@@ -399,6 +404,7 @@ TEST_F(TestArithmeticSelfInt8, ceil_quant1_thread2) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestArithmeticSelfInt8, abs_quant0_thread0) {
@@ -461,6 +467,7 @@ TEST_F(TestArithmeticSelfInt8, abs_quant0_thread0) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestArithmeticSelfInt8, abs_quant1_thread2) {
@@ -523,6 +530,7 @@ TEST_F(TestArithmeticSelfInt8, abs_quant1_thread2) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestArithmeticSelfInt8, sin_quant0_thread2) {
@@ -585,6 +593,7 @@ TEST_F(TestArithmeticSelfInt8, sin_quant0_thread2) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestArithmeticSelfInt8, cos_quant0_thread2) {
@@ -647,6 +656,7 @@ TEST_F(TestArithmeticSelfInt8, cos_quant0_thread2) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestArithmeticSelfInt8, log_quant0_thread2) {
@@ -709,6 +719,7 @@ TEST_F(TestArithmeticSelfInt8, log_quant0_thread2) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestArithmeticSelfInt8, sqrt_quant0_thread2) {
@@ -771,6 +782,7 @@ TEST_F(TestArithmeticSelfInt8, sqrt_quant0_thread2) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestArithmeticSelfInt8, rsqrt_quant0_thread2) {
@@ -833,6 +845,7 @@ TEST_F(TestArithmeticSelfInt8, rsqrt_quant0_thread2) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestArithmeticSelfInt8, square_quant0_thread2) {
@@ -895,6 +908,7 @@ TEST_F(TestArithmeticSelfInt8, square_quant0_thread2) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestArithmeticSelfInt8, square_quant1_thread2) {
@@ -957,6 +971,7 @@ TEST_F(TestArithmeticSelfInt8, square_quant1_thread2) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestArithmeticSelfInt8, logical_not_quant0_thread2) {
@@ -1019,6 +1034,7 @@ TEST_F(TestArithmeticSelfInt8, logical_not_quant0_thread2) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/batchnorm_int8_test.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/batchnorm_int8_test.cc
index 2eda590bdcf..70b3202fca4 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/batchnorm_int8_test.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/batchnorm_int8_test.cc
@@ -126,6 +126,7 @@ TEST_F(TestBatchnormInt8, FusedTest) {
   input3_tensor.set_data(nullptr);
   input4_tensor.set_data(nullptr);
   output0_tensor.set_data(nullptr);
+  delete kernel;
   MS_LOG(INFO) << "TestBathNormFp32 accuracy passed";
 }
 
@@ -207,6 +208,7 @@ TEST_F(TestBatchnormInt8, BNTest) {
   input1_tensor.set_data(nullptr);
   input2_tensor.set_data(nullptr);
   output0_tensor.set_data(nullptr);
+  delete kernel;
   MS_LOG(INFO) << "TestBathNormFp32 accuracy passed";
 }
 
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/concat_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/concat_int8_tests.cc
index a52244d548f..de5ff748fbd 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/concat_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/concat_int8_tests.cc
@@ -102,6 +102,7 @@ TEST_F(TestConcatInt8, Concat1_axis0) {
   delete input_tensor2;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestConcatInt8, Concat1_axis1_thread2) {
@@ -177,6 +178,7 @@ TEST_F(TestConcatInt8, Concat1_axis1_thread2) {
   delete input_tensor2;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestConcatInt8, Concat1_axis1_thread2_quant1) {
@@ -252,6 +254,7 @@ TEST_F(TestConcatInt8, Concat1_axis1_thread2_quant1) {
   delete input_tensor2;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/crop_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/crop_int8_tests.cc
index 60275fa3d99..94f59d1fda4 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/crop_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/crop_int8_tests.cc
@@ -94,6 +94,7 @@ TEST_F(TestCropInt8, crop_1d_axis0_offset0_quant0_thread2) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestCropInt8, crop_2d_axis1_offset0_quant0_thread2) {
@@ -160,6 +161,7 @@ TEST_F(TestCropInt8, crop_2d_axis1_offset0_quant0_thread2) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestCropInt8, crop_3d_axis1_offset0_quant0_thread0) {
@@ -226,6 +228,7 @@ TEST_F(TestCropInt8, crop_3d_axis1_offset0_quant0_thread0) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestCropInt8, crop_3d_axis1_offset0_quant0_thread2) {
@@ -293,6 +296,7 @@ TEST_F(TestCropInt8, crop_3d_axis1_offset0_quant0_thread2) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestCropInt8, crop_4d_axis0_offset0_quant0_thread0) {
@@ -359,6 +363,7 @@ TEST_F(TestCropInt8, crop_4d_axis0_offset0_quant0_thread0) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestCropInt8, crop_4d_axis1_offset0_quant0_thread0) {
@@ -425,6 +430,7 @@ TEST_F(TestCropInt8, crop_4d_axis1_offset0_quant0_thread0) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestCropInt8, crop_4d_axis1_offset1_quant0_thread0) {
@@ -494,6 +500,7 @@ TEST_F(TestCropInt8, crop_4d_axis1_offset1_quant0_thread0) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestCropInt8, crop_4d_axis1_offset1_quant1_thread0) {
@@ -563,6 +570,7 @@ TEST_F(TestCropInt8, crop_4d_axis1_offset1_quant1_thread0) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestCropInt8, crop_4d_axis0_offset0_quant0_thread2) {
@@ -631,6 +639,7 @@ TEST_F(TestCropInt8, crop_4d_axis0_offset0_quant0_thread2) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestCropInt8, crop_4d_axis0_offset0_quant0_thread3) {
@@ -699,5 +708,6 @@ TEST_F(TestCropInt8, crop_4d_axis0_offset0_quant0_thread3) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/gatherNd_int8_test.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/gatherNd_int8_test.cc
index 05a812107a4..5d5dc46c29a 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/gatherNd_int8_test.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/gatherNd_int8_test.cc
@@ -98,6 +98,7 @@ TEST_F(TestGatherNdInt8, GatherNdTest) {
   input0_tensor.set_data(nullptr);
   input1_tensor.set_data(nullptr);
   output0_tensor.set_data(nullptr);
+  delete kernel;
   MS_LOG(INFO) << "TestGatherNd accuracy passed";
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/gather_int8_test.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/gather_int8_test.cc
index 9eb204a439c..e5ed6de12eb 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/gather_int8_test.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/gather_int8_test.cc
@@ -96,6 +96,7 @@ TEST_F(TestGatherInt8, GatherTest) {
   input0_tensor.set_data(nullptr);
   input1_tensor.set_data(nullptr);
   output0_tensor.set_data(nullptr);
+  delete kernel;
   MS_LOG(INFO) << "TestGather_int8 accuracy passed";
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/hswish_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/hswish_int8_tests.cc
index 7b37f179771..4d523d7000c 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/hswish_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/hswish_int8_tests.cc
@@ -73,5 +73,6 @@ TEST_F(TestHSwishInt8, HSwish) {
 
   in_tensor.set_data(nullptr);
   out_tensor.set_data(nullptr);
+  delete kernel;
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/l2_norm_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/l2_norm_int8_tests.cc
index 6b601c9e54c..f829982ed78 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/l2_norm_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/l2_norm_int8_tests.cc
@@ -71,6 +71,7 @@ TEST_F(TestL2NormInt8, norm) {
   free(param_.axis_);
   in_tensor.set_data(nullptr);
   out_tensor.set_data(nullptr);
+  delete kernel;
 }
 
 TEST_F(TestL2NormInt8, norm2) {
@@ -116,5 +117,6 @@ TEST_F(TestL2NormInt8, norm2) {
   free(param_.axis_);
   in_tensor.set_data(nullptr);
   out_tensor.set_data(nullptr);
+  delete kernel;
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/mul_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/mul_int8_tests.cc
index 14041629fae..d99675ab28a 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/mul_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/mul_int8_tests.cc
@@ -101,6 +101,7 @@ TEST_F(TestMulInt8, Mul_quant0) {
   delete input_tensor2;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestMulInt8, Mul_quant0_thread0) {
@@ -174,6 +175,7 @@ TEST_F(TestMulInt8, Mul_quant0_thread0) {
   delete input_tensor2;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestMulInt8, Mul_quant1) {
@@ -247,6 +249,7 @@ TEST_F(TestMulInt8, Mul_quant1) {
   delete input_tensor2;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestMulInt8, Mul_quant1_thread1) {
@@ -320,6 +323,7 @@ TEST_F(TestMulInt8, Mul_quant1_thread1) {
   delete input_tensor2;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestMulInt8, test) {
@@ -393,6 +397,7 @@ TEST_F(TestMulInt8, test) {
   delete input_tensor2;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/pad_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/pad_int8_tests.cc
index 1b580cbc4d6..15398797dd3 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/pad_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/pad_int8_tests.cc
@@ -54,7 +54,7 @@ int PadInt8TestInit1(std::vector<Tensor *> *inputs_, std::vector<Tensor *> *outp
   memcpy(*correct, co, out_t->ElementsNum() * sizeof(int8_t));
 
   int padding[] = {0, 0, 0, 0, 0, 0, 2, 2};
-  memcpy(pad_param->paddings_, padding, MAX_PAD_SIZE * sizeof(int));
+  memcpy(pad_param->paddings_, padding, std::min(sizeof(padding), MAX_PAD_SIZE * sizeof(int)));
   pad_param->constant_value_ = 0;
 
   return out_t->ElementsNum();
@@ -107,7 +107,7 @@ int PadInt8TestInit2(std::vector<Tensor *> *inputs_, std::vector<Tensor *> *outp
   memcpy(*correct, co, out_t->ElementsNum() * sizeof(int8_t));
 
   int padding[] = {0, 0, 0, 0, 3, 1, 1, 2};
-  memcpy(pad_param->paddings_, padding, MAX_PAD_SIZE * sizeof(int));
+  memcpy(pad_param->paddings_, padding, std::min(sizeof(padding), MAX_PAD_SIZE * sizeof(int)));
   pad_param->constant_value_ = 0;
 
   return out_t->ElementsNum();
@@ -174,7 +174,7 @@ int PadInt8TestInit4(std::vector<Tensor *> *inputs_, std::vector<Tensor *> *outp
   memcpy(*correct, co, out_t->ElementsNum() * sizeof(int8_t));
 
   int padding[] = {3, 1, 1, 2, 2, 0, 1, 1};
-  memcpy(pad_param->paddings_, padding, MAX_PAD_SIZE * sizeof(int));
+  memcpy(pad_param->paddings_, padding, std::min(sizeof(padding), MAX_PAD_SIZE * sizeof(int)));
   pad_param->constant_value_ = 0;
 
   return out_t->ElementsNum();
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/power_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/power_int8_tests.cc
index d60b5b5f89a..d082a6384aa 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/power_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/power_int8_tests.cc
@@ -85,6 +85,7 @@ TEST_F(TestPowerInt8, PowerInt8) {
 
   input0_tensor.set_data(nullptr);
   output0_tensor.set_data(nullptr);
+  delete kernel;
 }
 
 TEST_F(TestPowerInt8, normal) {
@@ -156,5 +157,6 @@ TEST_F(TestPowerInt8, normal) {
 
   input0_tensor.set_data(nullptr);
   output0_tensor.set_data(nullptr);
+  delete kernel;
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/prelu_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/prelu_int8_tests.cc
index 17a70c1d1b6..779d5e8de7f 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/prelu_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/prelu_int8_tests.cc
@@ -93,6 +93,7 @@ TEST_F(TestPreluInt8, prelu_1) {
   output0_tensor->set_data(nullptr);
   delete input_tensor1;
   delete ctx;
+  delete kernel;
 }
 
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/quant_dtype_cast_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/quant_dtype_cast_tests.cc
index 8943af4654d..2daed471fd4 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/quant_dtype_cast_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/quant_dtype_cast_tests.cc
@@ -80,6 +80,7 @@ TEST_F(QuantDTypeCastTestFp32, QuantDTypeCastTest1) {
   }
   std::cout << "\n";
   ASSERT_EQ(0, CompareOutputData(output.data(), expect_out, out_size, 0.000001));
+  delete kernel;
 }
 
 TEST_F(QuantDTypeCastTestFp32, QuantDTypeCastTest2) {
@@ -129,5 +130,6 @@ TEST_F(QuantDTypeCastTestFp32, QuantDTypeCastTest2) {
   }
   std::cout << "\n";
   ASSERT_EQ(0, CompareOutputData(output.data(), expect_out, out_size, 0.000001));
+  delete kernel;
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/reduce_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/reduce_int8_tests.cc
index d3236c4c835..23058a3f293 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/reduce_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/reduce_int8_tests.cc
@@ -57,6 +57,7 @@ class TestReduceInt8 : public mindspore::CommonTest {
 };
 
 void TestReduceInt8::TearDown() {
+  delete kernel_;
   in_tensor_.set_data(nullptr);
   out_tensor_.set_data(nullptr);
 }
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/relux_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/relux_int8_tests.cc
index d2b9ce6a258..9727399391c 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/relux_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/relux_int8_tests.cc
@@ -71,6 +71,7 @@ TEST_F(TestReluXInt8, Relu) {
 
   in_tensor.set_data(nullptr);
   out_tensor.set_data(nullptr);
+  delete kernel;
 }
 
 TEST_F(TestReluXInt8, Relu6) {
@@ -118,5 +119,6 @@ TEST_F(TestReluXInt8, Relu6) {
 
   in_tensor.set_data(nullptr);
   out_tensor.set_data(nullptr);
+  delete kernel;
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/reshape_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/reshape_int8_tests.cc
index 3f51ea28380..19dc0bd29d8 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/reshape_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/reshape_int8_tests.cc
@@ -90,6 +90,7 @@ TEST_F(TestReshapeInt8, reshape_quant0) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestReshapeInt8, reshape_quant1_thread2) {
@@ -152,6 +153,7 @@ TEST_F(TestReshapeInt8, reshape_quant1_thread2) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/resize_bilinear_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/resize_bilinear_int8_tests.cc
index 7fa7de2c395..1b7ff250a21 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/resize_bilinear_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/resize_bilinear_int8_tests.cc
@@ -47,6 +47,7 @@ class TestResizeBilinearInt8 : public mindspore::CommonTest {
 };
 
 void TestResizeBilinearInt8::TearDown() {
+  delete kernel_;
   in_tensor.set_data(nullptr);
   out_tensor.set_data(nullptr);
 }
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/resize_nearest_neighbor_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/resize_nearest_neighbor_int8_tests.cc
index 7e801a70b09..af0873ac6ab 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/resize_nearest_neighbor_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/resize_nearest_neighbor_int8_tests.cc
@@ -81,6 +81,7 @@ void TestResizeNearestNeighborInt8::Prepare(const std::vector<int> &in_shape, co
 }
 
 void TestResizeNearestNeighborInt8::TearDown() {
+  delete kernel_;
   in_tensor.set_data(nullptr);
   out_tensor.set_data(nullptr);
 }
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/scale_int8.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/scale_int8.cc
index 10f500363eb..e9073b36585 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/scale_int8.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/scale_int8.cc
@@ -55,6 +55,7 @@ class TestScaleInt8 : public mindspore::CommonTest {
 };
 
 void TestScaleInt8::TearDown() {
+  delete kernel_;
   in_tensor_.set_data(nullptr);
   scale_tensor_.set_data(nullptr);
   bias_tensor_.set_data(nullptr);
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/sigmoid_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/sigmoid_int8_tests.cc
index b31101c500d..9a4affaf203 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/sigmoid_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/sigmoid_int8_tests.cc
@@ -70,5 +70,6 @@ TEST_F(TestSigmoidInt8, Sigmoid) {
 
   in_tensor.set_data(nullptr);
   out_tensor.set_data(nullptr);
+  delete kernel;
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/softmax_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/softmax_int8_tests.cc
index 2596984475b..31dc5578c79 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/softmax_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/softmax_int8_tests.cc
@@ -90,6 +90,7 @@ TEST_F(TestSoftmaxInt8, SoftmaxInt8) {
 
   input0_tensor.set_data(nullptr);
   output0_tensor.set_data(nullptr);
+  delete kernel;
 }
 
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/space_to_batch_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/space_to_batch_int8_tests.cc
index 96deb532e4e..be23ef5cd04 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/space_to_batch_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/space_to_batch_int8_tests.cc
@@ -56,5 +56,6 @@ TEST_F(SpaceToBatchTestInt8, test1) {
   }
   in_tensor.set_data(nullptr);
   out_tensor.set_data(nullptr);
+  delete kernel;
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/split_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/split_int8_tests.cc
index ad939b88252..ee3dd3c9b29 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/split_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/split_int8_tests.cc
@@ -112,6 +112,7 @@ TEST_F(TestSplitInt8, Split_quant0_thread2) {
   delete output1_tensor;
   delete output2_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestSplitInt8, Split_quant0_thread2_num) {
@@ -208,6 +209,7 @@ TEST_F(TestSplitInt8, Split_quant0_thread2_num) {
   delete output2_tensor;
   delete output3_tensor;
   delete ctx;
+  delete kernel;
 }
 
 TEST_F(TestSplitInt8, Split_quant1_thread2_num) {
@@ -304,6 +306,7 @@ TEST_F(TestSplitInt8, Split_quant1_thread2_num) {
   delete output2_tensor;
   delete output3_tensor;
   delete ctx;
+  delete kernel;
 }
 
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/squeeze_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/squeeze_int8_tests.cc
index eff490d1c84..c8143c3f99e 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/squeeze_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/squeeze_int8_tests.cc
@@ -90,5 +90,6 @@ TEST_F(TestSqueezeInt8, Squeeze_1d_axis0_offset0_quant0_thread2) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/sub_int_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/sub_int_tests.cc
index 575b40d2590..d48403abee0 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/sub_int_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/sub_int_tests.cc
@@ -75,6 +75,7 @@ TEST_F(TestSubInt8, SubInt8) {
   in_tensor0.set_data(nullptr);
   in_tensor1.set_data(nullptr);
   out_tensor.set_data(nullptr);
+  delete kernel;
 }
 
 TEST_F(TestSubInt8, SubInt8T2) {
@@ -125,5 +126,6 @@ TEST_F(TestSubInt8, SubInt8T2) {
   in_tensor0.set_data(nullptr);
   in_tensor1.set_data(nullptr);
   out_tensor.set_data(nullptr);
+  delete kernel;
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/topk_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/topk_int8_tests.cc
index 246078b8c40..e00a70bd8a5 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/topk_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/topk_int8_tests.cc
@@ -64,5 +64,6 @@ TEST_F(TestTopKInt8, TopK) {
   in_tensor.set_data(nullptr);
   out_tensor0.set_data(nullptr);
   out_tensor1.set_data(nullptr);
+  delete kernel;
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/unsqueeze_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/unsqueeze_int8_tests.cc
index 37b77f2b854..807aa9b07f3 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/unsqueeze_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/unsqueeze_int8_tests.cc
@@ -92,6 +92,7 @@ TEST_F(TestUnsqueezeInt8, Unsqueeze_1) {
   delete input_tensor1;
   delete output0_tensor;
   delete ctx;
+  delete kernel;
 }
 
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/string/normalize.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/string/normalize.cc
index 202ae5a2f87..ef20904eb86 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/string/normalize.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/string/normalize.cc
@@ -32,6 +32,7 @@ class TestNormalize : public mindspore::CommonTest {
  public:
   TestNormalize() {}
   void NormalizeTestInit();
+  void TearDown() override;
 
  public:
   Tensor input_tensor_;
@@ -45,6 +46,12 @@ class TestNormalize : public mindspore::CommonTest {
   kernel::InnerKernel *kernel_ = nullptr;
 };
 
+void TestNormalize::TearDown() {
+  delete kernel_;
+  input_tensor_.set_data(nullptr);
+  output_tensor_.set_data(nullptr);
+}
+
 void TestNormalize::NormalizeTestInit() {
   input_tensor_.set_data_type(kObjectTypeString);
   input_tensor_.set_format(mindspore::NHWC);
@@ -79,9 +86,6 @@ TEST_F(TestNormalize, TestSentence) {
     }
     printf("\n");
   }
-
-  input_tensor_.set_data(nullptr);
-  output_tensor_.set_data(nullptr);
 }
 
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/mindrtParallel/mindrt_parallel_model.out b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/mindrtParallel/mindrt_parallel_model.out
index bbceca4236a..91f28fb697b 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/mindrtParallel/mindrt_parallel_model.out
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/mindrtParallel/mindrt_parallel_model.out
@@ -1,6 +1,6 @@
-Stack-8 5 1 1 8 4 2 
+output 5 1 1 8 4 2 
 0.115831 0.11307496 0.24593274 0.34630755 -0.156871 0.21111916 -0.1046219 0.01590158 0.2745127 0.17317073 0.1787783 0.36557162 -0.13658395 0.2911819 -0.17356569 0.06825469 0.30655888 0.29681587 0.0078597255 0.3846875 -0.09266291 0.26170188 -0.15063931 0.04322962 0.25661856 0.25256 0.023097975 0.32573196 -0.043139715 0.25530565 -0.17270242 0.06442319 0.16240332 0.14648464 0.09654196 0.31037596 -0.0539147 0.23819281 -0.15090092 0.048991375 0.11573871 0.078725 0.19393174 0.26017824 -0.053352155 0.23836473 -0.15971972 0.054956935 0.19800682 0.17823274 0.17631978 0.3600948 -0.057391744 0.30457845 -0.19889072 0.05244953 0.090213075 0.17350613 0.044377614 0.29630166 -0.06999667 0.28462386 -0.17194743 0.093742274 
-Stack-10 5 1 1 8 4 1 
+output2 5 1 1 8 4 1 
 0.06387864 0.22883008 0.23308714 0.045865785 0.06820235 0.26621705 0.29714558 0.112830795 0.1669129 0.33512616 0.25788227 0.08388044 0.14331667 0.27875048 0.23716372 0.10920572 0.07898582 0.24287388 0.22543576 0.08901558 0.03376824 0.16912283 0.225415 0.09693983 0.09598104 0.26216167 0.28474298 0.10668853 0.12471523 0.24643728 0.27107987 0.13469991 
-Stack-13 3 1 8 4 
+output3 3 1 8 4 
 -0.16171767 -0.3828573 0.08357508 0.10217983 -0.34800848 -0.3206381 0.03284559 0.15394436 -0.42709222 -0.15115751 -0.0015709695 0.13956246 -0.35903975 -0.14498001 -0.050358675 0.15447712 -0.22225751 -0.21515054 -0.03286325 0.13769037 -0.1488501 -0.29710612 -0.033508375 0.14458355 -0.27084687 -0.31606156 -0.053954814 0.18598628 -0.15771987 -0.15602258 -0.0335121 0.14279547 
diff --git a/mindspore/lite/test/ut/src/runtime/runtime_pass_tests.cc b/mindspore/lite/test/ut/src/runtime/runtime_pass_tests.cc
index ab961bb8b87..0f05beb6208 100644
--- a/mindspore/lite/test/ut/src/runtime/runtime_pass_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/runtime_pass_tests.cc
@@ -59,23 +59,52 @@ void Nc4hw4PassConstruct(std::vector<kernel::LiteKernel *> *kernels, std::vector
                                                  transpose_param, &transpose_kernel, nullptr);
   kernels->push_back(transpose_kernel);
 
-  lite::Tensor *pad_param_tensor = new lite::Tensor();
-  tensors->push_back(pad_param_tensor);
-  lite::Tensor *pad_out_tensor = new lite::Tensor();
-  tensors->push_back(pad_out_tensor);
-  OpParameter *pad_param = new OpParameter();
-  kernel::KernelKey pad_desc{kernel::kCPU, kNumberTypeFloat32, schema::PrimitiveType_PadFusion};
-  kernel::LiteKernel *pad_kernel = nullptr;
-  std::vector<lite::Tensor *> pad_in = {transpose_out_tensor, pad_param_tensor};
-  std::vector<lite::Tensor *> pad_out = {pad_out_tensor};
-  lite::KernelRegistry::GetInstance()->GetKernel(pad_in, pad_out, ctx, nullptr, pad_desc, pad_param, &pad_kernel,
-                                                 nullptr);
-  kernels->push_back(pad_kernel);
+  lite::Tensor *in_param_tensor = new lite::Tensor();
+  tensors->push_back(in_param_tensor);
+  lite::Tensor *in_out_tensor = new lite::Tensor();
+  tensors->push_back(in_out_tensor);
+  OpParameter *in_param = new OpParameter();
+  kernel::KernelKey in_desc{kernel::kCPU, kNumberTypeFloat32, schema::PrimitiveType_InstanceNorm};
+  kernel::LiteKernel *in_kernel = nullptr;
+  std::vector<lite::Tensor *> in_in = {transpose_out_tensor, in_param_tensor};
+  std::vector<lite::Tensor *> in_out = {in_out_tensor};
+  lite::KernelRegistry::GetInstance()->GetKernel(in_in, in_out, ctx, nullptr, in_desc, in_param, &in_kernel, nullptr);
+  kernels->push_back(in_kernel);
+
+  lite::Tensor *transpose2_param_tensor = new lite::Tensor();
+  tensors->push_back(transpose_param_tensor);
+  lite::Tensor *transpose2_out_tensor = new lite::Tensor();
+  tensors->push_back(transpose_param_tensor);
+  OpParameter *transpose2_param = new OpParameter();
+  kernel::KernelKey transpose2_desc{kernel::kCPU, kNumberTypeFloat32, schema::PrimitiveType_Transpose};
+  kernel::LiteKernel *transpose2_kernel = nullptr;
+  std::vector<lite::Tensor *> transpose2_in = {in_out_tensor, transpose2_param_tensor};
+  std::vector<lite::Tensor *> transpose2_out = {transpose2_out_tensor};
+  lite::KernelRegistry::GetInstance()->GetKernel(transpose2_in, transpose2_out, ctx, nullptr, transpose2_desc,
+                                                 transpose2_param, &transpose2_kernel, nullptr);
+  kernels->push_back(transpose2_kernel);
+
+  lite::Tensor *conv2_weight = new lite::Tensor();
+  tensors->push_back(conv2_weight);
+  lite::Tensor *conv2_out_tensor = new lite::Tensor();
+  tensors->push_back(conv2_out_tensor);
+  std::vector<lite::Tensor *> conv2_in = {transpose2_out_tensor, conv_weight};
+  std::vector<lite::Tensor *> conv2_out = {conv2_out_tensor};
+  OpParameter *conv2_param = new OpParameter();
+  kernel::KernelKey conv2_desc{kernel::kCPU, kNumberTypeFloat32, schema::PrimitiveType_Conv2DFusion};
+  kernel::LiteKernel *conv2_kernel = nullptr;
+  lite::KernelRegistry::GetInstance()->GetKernel(conv2_in, conv2_out, ctx, nullptr, conv2_desc, conv2_param,
+                                                 &conv2_kernel, nullptr);
+  kernels->push_back(conv2_kernel);
 
   conv_kernel->set_out_kernels({transpose_kernel});
   transpose_kernel->set_in_kernels({conv_kernel});
-  transpose_kernel->set_out_kernels({pad_kernel});
-  pad_kernel->set_in_kernels({transpose_kernel});
+  transpose_kernel->set_out_kernels({in_kernel});
+  in_kernel->set_in_kernels({transpose_kernel});
+  in_kernel->set_out_kernels({transpose2_kernel});
+  transpose2_kernel->set_in_kernels({in_kernel});
+  transpose2_kernel->set_out_kernels({conv2_kernel});
+  conv2_kernel->set_in_kernels({transpose2_kernel});
   return;
 }
 
@@ -85,11 +114,12 @@ TEST_F(RuntimePass, Nc4hw4Pass1) {
   std::vector<lite::Tensor *> tensors;
   Nc4hw4PassConstruct(&kernels, &tensors, ctx.get());
 
+  ASSERT_EQ(kernels.size(), 5);
+
   /* runtime pass */
   lite::Nc4hw4PassReplace(&kernels, &tensors, 0);
 
-  ASSERT_EQ(kernels.size(), 2);
-  ASSERT_EQ(tensors.size(), 5);
+  ASSERT_EQ(kernels.size(), 3);
 
   for (auto tensor : tensors) {
     delete tensor;
diff --git a/mindspore/lite/test/ut/tools/converter/registry/model_parser_registry_test.cc b/mindspore/lite/test/ut/tools/converter/registry/model_parser_registry_test.cc
index d823b613af4..e0f4a570698 100644
--- a/mindspore/lite/test/ut/tools/converter/registry/model_parser_registry_test.cc
+++ b/mindspore/lite/test/ut/tools/converter/registry/model_parser_registry_test.cc
@@ -19,9 +19,8 @@
 #include "ut/tools/converter/registry/model_parser_test.h"
 #include "tools/optimizer/common/gllo_utils.h"
 
-using mindspore::lite::ModelRegistrar;
-using mindspore::lite::converter::ConverterParameters;
-using mindspore::lite::converter::FmkType_CAFFE;
+using mindspore::converter::ConverterParameters;
+using mindspore::converter::kFmkTypeCaffe;
 namespace mindspore {
 class ModelParserRegistryTest : public mindspore::CommonTest {
  public:
@@ -34,9 +33,9 @@ TEST_F(ModelParserRegistryTest, TestRegistry) {
   ASSERT_NE(add_parser, nullptr);
   auto proposal_parser = node_parser_reg->GetNodeParser("proposal");
   ASSERT_NE(proposal_parser, nullptr);
-  REG_MODEL_PARSER(FmkType_CAFFE,
+  REG_MODEL_PARSER(kFmkTypeCaffe,
                    TestModelParserCreator);  // register test model parser creator, which will overwrite existing.
-  auto model_parser = lite::ModelParserRegistry::GetInstance()->GetModelParser(FmkType_CAFFE);
+  auto model_parser = registry::ModelParserRegistry::GetModelParser(kFmkTypeCaffe);
   ASSERT_NE(model_parser, nullptr);
   ConverterParameters converter_parameters;
   auto func_graph = model_parser->Parse(converter_parameters);
diff --git a/mindspore/lite/test/ut/tools/converter/registry/model_parser_test.cc b/mindspore/lite/test/ut/tools/converter/registry/model_parser_test.cc
index 31e28cf275d..9961b9e34b9 100644
--- a/mindspore/lite/test/ut/tools/converter/registry/model_parser_test.cc
+++ b/mindspore/lite/test/ut/tools/converter/registry/model_parser_test.cc
@@ -21,7 +21,7 @@
 #include "include/registry/model_parser_registry.h"
 
 namespace mindspore {
-FuncGraphPtr ModelParserTest::Parse(const lite::converter::ConverterParameters &flag) {
+FuncGraphPtr ModelParserTest::Parse(const converter::ConverterParameters &flag) {
   // construct funcgraph
   res_graph_ = std::make_shared<FuncGraph>();
   auto ret = InitOriginModelStructure();
@@ -160,7 +160,7 @@ int ModelParserTest::BuildGraphOutputs() {
   return lite::RET_OK;
 }
 
-lite::ModelParser *TestModelParserCreator() {
+converter::ModelParser *TestModelParserCreator() {
   auto *model_parser = new (std::nothrow) ModelParserTest();
   if (model_parser == nullptr) {
     MS_LOG(ERROR) << "new model parser failed";
diff --git a/mindspore/lite/test/ut/tools/converter/registry/model_parser_test.h b/mindspore/lite/test/ut/tools/converter/registry/model_parser_test.h
index c3804324e62..757f790f20e 100644
--- a/mindspore/lite/test/ut/tools/converter/registry/model_parser_test.h
+++ b/mindspore/lite/test/ut/tools/converter/registry/model_parser_test.h
@@ -25,10 +25,10 @@
 #include "tools/converter/model_parser.h"
 
 namespace mindspore {
-class ModelParserTest : public lite::ModelParser {
+class ModelParserTest : public converter::ModelParser {
  public:
   ModelParserTest() = default;
-  FuncGraphPtr Parse(const lite::converter::ConverterParameters &flag) override;
+  FuncGraphPtr Parse(const converter::ConverterParameters &flag) override;
 
  private:
   int InitOriginModelStructure();
@@ -40,7 +40,7 @@ class ModelParserTest : public lite::ModelParser {
   std::vector<std::string> model_structure_;
 };
 
-lite::ModelParser *TestModelParserCreator();
+converter::ModelParser *TestModelParserCreator();
 }  // namespace mindspore
 
 #endif  // LITE_TEST_UT_TOOLS_CONVERTER_REGISTRY_MODEL_PARSER_TEST_H
diff --git a/mindspore/lite/test/ut/tools/converter/registry/pass_registry_test.cc b/mindspore/lite/test/ut/tools/converter/registry/pass_registry_test.cc
deleted file mode 100644
index f138087c40f..00000000000
--- a/mindspore/lite/test/ut/tools/converter/registry/pass_registry_test.cc
+++ /dev/null
@@ -1,239 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <map>
-#include <string>
-#include <vector>
-#include "common/common_test.h"
-#include "backend/optimizer/common/pass.h"
-#include "include/registry/model_parser_registry.h"
-#include "include/registry/pass_registry.h"
-#include "ops/fusion/add_fusion.h"
-#include "ops/addn.h"
-#include "ops/custom.h"
-#include "tools/converter/model_parser.h"
-#include "tools/converter/registry/pass_content.h"
-#include "tools/optimizer/common/gllo_utils.h"
-#include "ut/tools/converter/registry/model_parser_test.h"
-
-using mindspore::lite::ModelRegistrar;
-using mindspore::lite::converter::ConverterParameters;
-using mindspore::lite::converter::FmkType_CAFFE;
-namespace mindspore {
-class PassRegistryTest : public mindspore::CommonTest {
- public:
-  PassRegistryTest() = default;
-  void SetUp() override {
-    REG_MODEL_PARSER(FmkType_CAFFE, TestModelParserCreator);
-    auto model_parser = lite::ModelParserRegistry::GetInstance()->GetModelParser(FmkType_CAFFE);
-    if (model_parser == nullptr) {
-      return;
-    }
-    ConverterParameters converter_parameters;
-    func_graph_ = model_parser->Parse(converter_parameters);
-  }
-  FuncGraphPtr func_graph_ = nullptr;
-};
-
-namespace opt {
-// fuse add and add to addn.
-class Test1Fusion : public Pass {
- public:
-  Test1Fusion() : Pass("test1_fusion") {}
-  bool CanFusion(const CNodePtr &cnode) {
-    if (cnode == nullptr) {
-      return false;
-    }
-    if (!opt::CheckPrimitiveType(cnode, prim::kPrimAddFusion)) {
-      return false;
-    }
-    auto primc = GetValueNode<std::shared_ptr<ops::AddFusion>>(cnode->input(0));
-    if (primc == nullptr) {
-      return false;
-    }
-    if (primc->GetAttr(ops::kActivationType) != nullptr && primc->get_activation_type() != mindspore::NO_ACTIVATION) {
-      return false;
-    }
-    size_t input_cnode_num = 0;
-    for (size_t i = 1; i < cnode->size(); ++i) {
-      auto input = cnode->input(i);
-      if (!utils::isa<CNodePtr>(input)) {
-        continue;
-      }
-      if (!opt::CheckPrimitiveType(input, prim::kPrimAddFusion)) {
-        return false;
-      }
-      auto input_cnode = input->cast<CNodePtr>();
-      auto add_primc = GetValueNode<std::shared_ptr<ops::AddFusion>>(input_cnode->input(0));
-      if (add_primc == nullptr) {
-        return false;
-      }
-      if (add_primc->GetAttr(ops::kActivationType) != nullptr &&
-          add_primc->get_activation_type() != mindspore::NO_ACTIVATION) {
-        return false;
-      }
-      ++input_cnode_num;
-      continue;
-    }
-    return input_cnode_num > 0;
-  }
-
-  bool Run(const FuncGraphPtr &func_graph) override {
-    if (func_graph == nullptr) {
-      return false;
-    }
-    auto manager = func_graph->manager();
-    if (manager == nullptr) {
-      return false;
-    }
-    auto node_list = TopoSort(func_graph->get_return());
-    for (auto &node : node_list) {
-      if (!utils::isa<CNode>(node)) {
-        continue;
-      }
-      auto cnode = node->cast<CNodePtr>();
-      if (!CanFusion(cnode)) {
-        continue;
-      }
-      std::vector<AnfNodePtr> inputs;
-      for (size_t i = 1; i < cnode->size(); ++i) {
-        auto input_node = cnode->input(i);
-        if (!utils::isa<CNode>(input_node)) {
-          inputs.push_back(input_node);
-          continue;
-        }
-        auto input_cnode = input_node->cast<CNodePtr>();
-        for (size_t j = 1; j < input_cnode->size(); ++j) {
-          inputs.push_back(input_cnode->input(j));
-        }
-      }
-      auto primc = std::make_shared<ops::AddN>();
-      auto new_cnode = func_graph->NewCNode(primc, inputs);
-      new_cnode->set_fullname_with_scope(cnode->fullname_with_scope());
-      new_cnode->set_abstract(cnode->abstract()->Clone());
-      manager->Replace(node, new_cnode);
-    }
-    return true;
-  }
-};
-
-// convert addn to custom op
-class Test2Fusion : public Pass {
- public:
-  Test2Fusion() : Pass("test2_fusion") {}
-  AnfNodePtr CreateCustomOp(const FuncGraphPtr func_graph, const CNodePtr &cnode) {
-    if (cnode == nullptr) {
-      return nullptr;
-    }
-    auto primc = std::make_shared<ops::Custom>();
-    if (primc == nullptr) {
-      return nullptr;
-    }
-    primc->set_type("Custom_AddN");
-    std::map<std::string, std::vector<uint8_t>> custom_attrs;
-    std::string input_num = std::to_string(3);
-    std::vector<uint8_t> input_num_attr(input_num.begin(), input_num.end());
-    custom_attrs["input_num"] = input_num_attr;
-    std::string op_kind = "custom op";
-    std::vector<uint8_t> op_kind_attr(op_kind.begin(), op_kind.end());
-    custom_attrs["op_kind"] = op_kind_attr;
-    primc->set_attr(custom_attrs);
-    auto inputs = cnode->inputs();
-    inputs.erase(inputs.begin());
-    auto custom_cnode = func_graph->NewCNode(primc, inputs);
-    custom_cnode->set_fullname_with_scope(cnode->fullname_with_scope());
-    custom_cnode->set_abstract(cnode->abstract()->Clone());
-    return custom_cnode;
-  }
-
-  bool Run(const FuncGraphPtr &func_graph) override {
-    if (func_graph == nullptr) {
-      return false;
-    }
-    auto manager = func_graph->manager();
-    if (manager == nullptr) {
-      return false;
-    }
-    auto node_list = TopoSort(func_graph->get_return());
-    for (auto &node : node_list) {
-      if (!utils::isa<CNode>(node)) {
-        continue;
-      }
-      if (!opt::CheckPrimitiveType(node, prim::kPrimAddN)) {
-        continue;
-      }
-      auto cnode = node->cast<CNodePtr>();
-      auto custome_cnode = CreateCustomOp(func_graph, cnode);
-      if (custome_cnode == nullptr) {
-        return false;
-      }
-      manager->Replace(node, custome_cnode);
-    }
-    return true;
-  }
-};
-
-class TestFusion : public Pass {
- public:
-  TestFusion() : Pass("test_fusion") {}
-  bool Run(const FuncGraphPtr &func_graph) override {
-    if (func_graph == nullptr) {
-      return false;
-    }
-    auto manager = Manage(func_graph, true);
-    if (manager == nullptr) {
-      return false;
-    }
-    auto test1_fusion = std::make_shared<Test1Fusion>();
-    if (!test1_fusion->Run(func_graph)) {
-      return false;
-    }
-    auto test2_fusion = std::make_shared<Test2Fusion>();
-    if (!test2_fusion->Run(func_graph)) {
-      return false;
-    }
-    return true;
-  }
-};
-REG_PASS(TestFusion, TestFusion)
-REG_SCHEDULED_PASS(POSITION_BEGIN, {"TestFusion"})
-}  // namespace opt
-
-TEST_F(PassRegistryTest, TestRegistry) {
-  auto &passes = opt::PassStoreRoomInfo();
-  auto &assigned_passes = opt::ExternalAssignedPassesInfo();
-  ASSERT_EQ(assigned_passes.size(), 1);
-  auto pass_names = assigned_passes[opt::POSITION_BEGIN];
-  ASSERT_EQ(pass_names.size(), 1);
-  auto begin_pass = passes[pass_names.front()];
-  ASSERT_NE(begin_pass, nullptr);
-  auto begin_pass_test = std::dynamic_pointer_cast<opt::TestFusion>(begin_pass);
-  ASSERT_NE(begin_pass_test, nullptr);
-  ASSERT_NE(func_graph_, nullptr);
-  auto res = begin_pass_test->Run(func_graph_);
-  ASSERT_EQ(res, true);
-  auto cnode_list = func_graph_->GetOrderedCnodes();
-  ASSERT_EQ(cnode_list.size(), 2);
-  bool is_custom = opt::CheckPrimitiveType(cnode_list.front(), prim::kPrimCustom);
-  ASSERT_EQ(is_custom, true);
-  auto custome_prim = GetValueNode<std::shared_ptr<ops::Custom>>(cnode_list.front()->input(0));
-  ASSERT_NE(custome_prim, nullptr);
-  auto type = custome_prim->get_type();
-  ASSERT_EQ(type, std::string("Custom_AddN"));
-  bool is_return = opt::CheckPrimitiveType(cnode_list.back(), prim::kPrimReturn);
-  ASSERT_EQ(is_return, true);
-}
-}  // namespace mindspore
diff --git a/mindspore/lite/tools/anf_exporter/anf_exporter.cc b/mindspore/lite/tools/anf_exporter/anf_exporter.cc
index 8bfd02d7301..ca8172323a1 100644
--- a/mindspore/lite/tools/anf_exporter/anf_exporter.cc
+++ b/mindspore/lite/tools/anf_exporter/anf_exporter.cc
@@ -38,18 +38,19 @@
 #include "src/common/utils.h"
 #include "tools/common/graph_util.h"
 #include "src/ops/ops_utils.h"
+#include "src/weight_decoder.h"
 #include "tools/common/node_util.h"
 #include "tools/converter/converter_context.h"
 #include "tools/converter/quantizer/quantize_util.h"
+#include "tools/converter/quantizer/fix_bit_weight_quantizer.h"
+#include "tools/converter/quantizer/fse_encoder.h"
 
 using mindspore::ops::PrimitiveC;
 
 namespace mindspore::lite {
 namespace {
-constexpr int kBitNum8 = 8;
-constexpr int kBitNum16 = 16;
 constexpr int kIndexOfValueInputOfGetTupleItem = 2;
-
+constexpr int kMaxDepth = 2048;
 std::list<CNodePtr> GetOrderedCNodes(const FuncGraphPtr fg) {
   auto BelongSameGraph = std::bind(IncludeBelongGraph, fg, std::placeholders::_1);
   auto succ_include_fv = [&fg](const AnfNodePtr &node) -> std::vector<AnfNodePtr> {
@@ -117,7 +118,17 @@ static STATUS CompressTensor(schema::TensorT *tensor_input, const std::unique_pt
     auto repetition_packed = false;
     MS_LOG(DEBUG) << dst_node->name;
     if (dst_node->quantType == schema::QuantType_QUANT_WEIGHT) {
-      if (bit_num <= kBitNum8) {
+      if (bit_num == 0) {
+        if (tensor_input->data.empty() || tensor_input->dims.size() <= 1) {
+          return RET_OK;
+        }
+        quant::FSEEncoder fse_encoder;
+        if (dst_node->primitive->value.type == PrimitiveType_GRU) {
+          fse_encoder.Compress(tensor_input);
+        } else {
+          fse_encoder.Compress(tensor_input);
+        }
+      } else if (bit_num <= kBitNum8) {
         repetition_packed = PackRepetition<int8_t>(bit_num, tensor_input);
       } else {
         repetition_packed = PackRepetition<int16_t>(bit_num, tensor_input);
@@ -468,36 +479,13 @@ int AnfExporter::ExportSubgraph(const FuncGraphPtr &func_graph, const std::uniqu
   return RET_OK;
 }
 
-bool AnfExporter::IsCall(const AnfNodePtr node) {
-  if (!utils::isa<CNodePtr>(node)) {
-    return false;
-  }
-  auto cnode = node->cast<CNodePtr>();
-  if (cnode->inputs().empty()) {
-    return false;
-  }
-  auto cnode_first_input = cnode->input(kPrimIndex);
-  if (utils::isa<CNodePtr>(cnode_first_input)) {
-    return true;
-  }
-
-  return false;
-}
-
-bool IsPartialFusion(const AnfNodePtr &node) {
-  if (node == nullptr) {
-    lite::ReturnCode::GetSingleReturnCode()->UpdateReturnCode(lite::RET_NULL_PTR);
-    return false;
-  }
-  if (node->isa<mindspore::CNode>()) {
-    auto cnode = node->cast<CNodePtr>();
-    auto vnode_value = cnode->input(0)->cast<ValueNodePtr>()->value();
-    return GetValue<NamedPtr>(vnode_value)->name() == "PartialFusion";
-  }
-  return false;
-}
-
 FuncGraphPtr GetFinalGraph(const FuncGraphPtr &func_graph) {
+  static int i = 0;
+  if (i > kMaxDepth) {
+    MS_LOG(ERROR) << "exceed max depth 2048, i " << i;
+    return nullptr;
+  }
+  i++;
   // get output
   CNodePtr call_cnode = nullptr;
   auto fg_output = func_graph->output();
@@ -522,6 +510,23 @@ FuncGraphPtr GetFinalGraph(const FuncGraphPtr &func_graph) {
   return nullptr;
 }
 
+int AnfExporter::SetMetaGraphInput(const FuncGraphPtr &func_graph,
+                                   const std::unique_ptr<schema::MetaGraphT> &meta_graphT) {
+  MS_ASSERT(func_graph != nullptr);
+  if (!reorder_input_) {
+    return RET_OK;
+  }
+  meta_graphT->inputIndex.clear();
+  for (const auto &input : func_graph->get_inputs()) {
+    auto iter = graph_inputs_map_.find(input);
+    if (iter == graph_inputs_map_.end()) {
+      return RET_ERROR;
+    }
+    meta_graphT->inputIndex.emplace_back(iter->second);
+  }
+  return RET_OK;
+}
+
 int AnfExporter::SetMetaGraphOutput(const FuncGraphPtr &func_graph,
                                     const std::unique_ptr<schema::MetaGraphT> &meta_graphT) {
   auto final_fg = GetFinalGraph(func_graph);
@@ -544,6 +549,9 @@ int AnfExporter::SetMetaGraphOutput(const FuncGraphPtr &func_graph,
 schema::MetaGraphT *AnfExporter::Export(const FuncGraphPtr &func_graph, bool keep_graph, bool copy_primitive,
                                         bool train_flag) {
   this->train_flag_ = train_flag;
+  // hardcode for nnie and train
+  this->reorder_input_ = !(train_flag) && !(ConverterContext::GetInstance()->GetGraphInputTensorNames().empty());
+  this->graph_inputs_map_.clear();
   auto meta_graphT = std::make_unique<schema::MetaGraphT>();
   auto fmk = func_graph->get_attr("fmk");
   MS_ASSERT(fmk != nullptr);
@@ -558,7 +566,18 @@ schema::MetaGraphT *AnfExporter::Export(const FuncGraphPtr &func_graph, bool kee
     return nullptr;
   }
 
-  SetMetaGraphOutput(func_graph, meta_graphT);
+  ret = SetMetaGraphInput(func_graph, meta_graphT);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "SetMetaGraphInput failed.";
+    ReturnCode::GetSingleReturnCode()->UpdateReturnCode(ret);
+    return nullptr;
+  }
+  ret = SetMetaGraphOutput(func_graph, meta_graphT);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "SetMetaGraphOutput failed.";
+    ReturnCode::GetSingleReturnCode()->UpdateReturnCode(ret);
+    return nullptr;
+  }
 
   return meta_graphT.release();
 }
@@ -739,8 +758,11 @@ int AnfExporter::SetOpInputNode(const CNodePtr &cnode, const std::unique_ptr<sch
       if (IsContain(graph_inputs_, input_node->cast<AnfNodePtr>()) &&
           graph_inputs_has_exported_.find(input_node) == graph_inputs_has_exported_.end()) {
         graph_inputs_has_exported_.insert(input_node);
-        meta_graphT->inputIndex.push_back(meta_graphT->allTensors.size() - 1);
-        meta_graphT->allTensors.back()->format = schema::Format_NHWC;
+        if (reorder_input_) {
+          graph_inputs_map_[input_node] = meta_graphT->allTensors.size() - 1;
+        } else {
+          meta_graphT->inputIndex.push_back(meta_graphT->allTensors.size() - 1);
+        }
       }
     } else if (input_node->isa<ValueNode>()) {
       auto ret = ConvertInputValueNode(cnode, i, primitive_c, meta_graphT, fb_node);
@@ -846,18 +868,6 @@ void AnfExporter::SetOpOutputNode(const CNodePtr &cnode, const std::unique_ptr<s
   }
 }
 
-ValueNodePtr AnfExporter::GetPartialAnfPrim() {
-  auto partial_prim = std::make_shared<mindspore::ops::PartialFusion>();
-  ValueNodePtr partial_anf_prim = NewValueNode(partial_prim);
-  return partial_anf_prim;
-}
-
-ValueNodePtr AnfExporter::GetCallAnfPrim() {
-  auto call_prim = std::make_shared<mindspore::ops::Call>();
-  ValueNodePtr call_anf_prim = NewValueNode(call_prim);
-  return call_anf_prim;
-}
-
 CNodePtr AnfExporter::CreateCallCnode(const FuncGraphPtr &fg, const AnfNodePtr &node) {
   auto call_anf_prim_vnode = GetCallAnfPrim();
   std::vector<AnfNodePtr> inputs{call_anf_prim_vnode, node};
@@ -873,13 +883,13 @@ CNodePtr AnfExporter::CreatePartialCnode(const FuncGraphPtr &fg, const AnfNodePt
     if (primitive_c != nullptr) {
       return cnode;
     }
-    auto partial_anf_prim_vnode = GetPartialAnfPrim();
+    auto partial_anf_prim_vnode = GetPartialFusionPrim();
     auto cnode_input = cnode->inputs();
     cnode_input.insert(cnode_input.begin(), partial_anf_prim_vnode);
     cnode->set_inputs(cnode_input);
     return cnode;
   } else if (utils::isa<ValueNodePtr>(node)) {
-    auto partial_anf_prim_vnode = GetPartialAnfPrim();
+    auto partial_anf_prim_vnode = GetPartialFusionPrim();
     std::vector<AnfNodePtr> inputs{partial_anf_prim_vnode, node};
     auto cnode = fg->NewCNode(inputs);
     return cnode;
diff --git a/mindspore/lite/tools/anf_exporter/anf_exporter.h b/mindspore/lite/tools/anf_exporter/anf_exporter.h
index 8a61e82bef2..ad534e99da2 100644
--- a/mindspore/lite/tools/anf_exporter/anf_exporter.h
+++ b/mindspore/lite/tools/anf_exporter/anf_exporter.h
@@ -31,6 +31,7 @@
 #include "tools/converter/converter_context.h"
 #include "tools/converter/converter_flags.h"
 #include "tools/optimizer/common/gllo_utils.h"
+#include "tools/common/node_util.h"
 
 using mindspore::ops::PrimitiveC;
 
@@ -46,7 +47,6 @@ class AnfExporter {
  public:
   AnfExporter() = default;
   virtual ~AnfExporter() = default;
-  void set_train_flag(bool train_flag) { train_flag_ = train_flag; }
   schema::MetaGraphT *Export(const FuncGraphPtr &func_graph, bool keep_graph = false, bool copy_primitive = false,
                              bool train_flag = false);
   void SetOpOutputNode(const CNodePtr &cnode, const std::unique_ptr<schema::MetaGraphT> &meta_graphT,
@@ -74,8 +74,6 @@ class AnfExporter {
              const size_t &subgraph_index, const bool &keep_graph, const bool &copy_primitive);
   int ExportSubgraph(const FuncGraphPtr &func_graph, const std::unique_ptr<schema::MetaGraphT> &meta_graphT,
                      bool keep_graph, bool copy_primitive, const std::shared_ptr<AnfNode> &partial_anode = nullptr);
-  static ValueNodePtr GetPartialAnfPrim();
-  static ValueNodePtr GetCallAnfPrim();
   static CNodePtr CreateCallCnode(const FuncGraphPtr &fg, const AnfNodePtr &cnode);
   static CNodePtr CreatePartialCnode(const FuncGraphPtr &fg, const AnfNodePtr &node);
   bool HasExported(const FuncGraphPtr &func_graph);
@@ -83,8 +81,8 @@ class AnfExporter {
                         const bool &copy_primitive, const CNodePtr &partial_cnode,
                         const std::unique_ptr<schema::CNodeT> &schema_cnode);
   std::list<CNodePtr> InsertCallNode(const FuncGraphPtr &func_graph);
+  int SetMetaGraphInput(const FuncGraphPtr &func_graph, const std::unique_ptr<schema::MetaGraphT> &meta_graphT);
   int SetMetaGraphOutput(const FuncGraphPtr &func_graph, const std::unique_ptr<schema::MetaGraphT> &meta_graphT);
-  bool IsCall(const AnfNodePtr node);
   int CreateNewTensorForParameter(const std::unique_ptr<schema::MetaGraphT> &meta_graphT, const AnfNodePtr &input);
 
  private:
@@ -94,8 +92,10 @@ class AnfExporter {
   std::map<FuncGraphPtr, size_t> fg_subgraph_map_;
   std::vector<AnfNodePtr> graph_inputs_;
   std::set<AnfNodePtr> graph_inputs_has_exported_;
+  std::map<AnfNodePtr, int> graph_inputs_map_;
   uint32_t node_idx_ = 0;
   bool train_flag_ = false;
+  bool reorder_input_ = false;
 };
 // by default, copy_primitive is false, which means that the MetaGraph and func_graph share the same schema::PrimitiveT.
 // but in PostQuantization, the func_graph need to transfer to MetaGraph first and do MetaGraph pass, which may modify
diff --git a/mindspore/lite/tools/anf_exporter/fetch_content.cc b/mindspore/lite/tools/anf_exporter/fetch_content.cc
index c8a65042dc8..64d3d0e1b5b 100644
--- a/mindspore/lite/tools/anf_exporter/fetch_content.cc
+++ b/mindspore/lite/tools/anf_exporter/fetch_content.cc
@@ -77,12 +77,12 @@ STATUS GetShapeVectorFromStringTensor(const tensor::TensorPtr &tensor_info, Shap
 }
 int GetFormatByFmk(int32_t fmk_type) {
   switch (fmk_type) {
-    case converter::FmkType_ONNX:
-    case lite::converter::FmkType_CAFFE:
-    case lite::converter::FmkType_MS:
+    case converter::kFmkTypeOnnx:
+    case converter::kFmkTypeCaffe:
+    case converter::kFmkTypeMs:
       return mindspore::NCHW;
-    case lite::converter::FmkType_TF:
-    case lite::converter::FmkType_TFLITE:
+    case converter::kFmkTypeTf:
+    case converter::kFmkTypeTflite:
       return mindspore::NHWC;
     default:
       return -1;
@@ -286,15 +286,15 @@ int FetchDataFromParameterNode(const CNodePtr &cnode, size_t index, converter::F
     return RET_ERROR;
   }
   auto prim = GetValueNode<PrimitivePtr>(cnode->input(0));
+  if (prim->GetAttr(ops::kFormat) == nullptr && !param_node->has_default()) {
+    data_info->format_ = mindspore::NHWC;
+  }
   if (prim->GetAttr(ops::kFormat) != nullptr && !opt::CheckPrimitiveType(cnode, prim::kPrimResize)) {
     auto value = prim->GetAttr(ops::kFormat);
     if (value->isa<mindspore::Int64Imm>()) {
       data_info->format_ = GetValue<int64_t>(value);
     }
   }
-  if (!param_node->has_default()) {
-    data_info->format_ = NHWC;
-  }
   // attr weightFormat is only used by conv-like ops' second input
   if ((opt::CheckPrimitiveType(cnode, prim::kPrimConv2DFusion) ||
        opt::CheckPrimitiveType(cnode, opt::kPrimConv2DBackpropInputFusion) ||
diff --git a/mindspore/lite/tools/benchmark/benchmark.cc b/mindspore/lite/tools/benchmark/benchmark.cc
index 22409ebf150..88f11b01a09 100644
--- a/mindspore/lite/tools/benchmark/benchmark.cc
+++ b/mindspore/lite/tools/benchmark/benchmark.cc
@@ -20,6 +20,7 @@
 #undef __STDC_FORMAT_MACROS
 #include <utility>
 #include <functional>
+#include <algorithm>
 #include "include/context.h"
 #include "include/ms_tensor.h"
 #include "include/version.h"
@@ -115,7 +116,7 @@ int Benchmark::ReadTensorData(std::ifstream &in_file_stream, const std::string &
   if (this->benchmark_data_.find(tensor_name) != this->benchmark_data_.end()) {
     return RET_OK;
   }
-  tensor::MSTensor *tensor = GetTensorByNameOrShape(tensor_name, dims);
+  tensor::MSTensor *tensor = session_->GetOutputByTensorName(tensor_name);
   if (tensor == nullptr) {
     MS_LOG(ERROR) << "Get tensor failed, tensor name: " << tensor_name;
     return RET_ERROR;
@@ -175,17 +176,17 @@ int Benchmark::CompareOutput() {
   float total_bias = 0;
   int total_size = 0;
   for (const auto &calib_tensor : benchmark_data_) {
-    std::string node_or_tensor_name = calib_tensor.first;
-    tensor::MSTensor *tensor = GetTensorByNameOrShape(node_or_tensor_name, calib_tensor.second->shape);
+    std::string tensor_name = calib_tensor.first;
+    tensor::MSTensor *tensor = session_->GetOutputByTensorName(tensor_name);
     if (tensor == nullptr) {
-      MS_LOG(ERROR) << "Get tensor failed, tensor name: " << node_or_tensor_name;
+      MS_LOG(ERROR) << "Get tensor failed, tensor name: " << tensor_name;
       return RET_ERROR;
     }
     int ret;
     if (tensor->data_type() == kObjectTypeString) {
-      ret = CompareStringData(node_or_tensor_name, tensor);
+      ret = CompareStringData(tensor_name, tensor);
     } else {
-      ret = CompareDataGetTotalBiasAndSize(node_or_tensor_name, tensor, &total_bias, &total_size);
+      ret = CompareDataGetTotalBiasAndSize(tensor_name, tensor, &total_bias, &total_size);
     }
     if (ret != RET_OK) {
       MS_LOG(ERROR) << "Error in CompareData";
@@ -212,41 +213,6 @@ int Benchmark::CompareOutput() {
   return RET_OK;
 }
 
-tensor::MSTensor *Benchmark::GetTensorByNodeShape(const std::vector<size_t> &node_shape) {
-  std::vector<tensor::MSTensor *> match_tensors;
-  std::vector<int> shape_vector;
-  (void)std::transform(node_shape.begin(), node_shape.end(), std::back_inserter(shape_vector),
-                       [](const size_t &value) { return static_cast<int>(value); });
-  auto tensors = session_->GetOutputs();
-  for (auto &out_tensor_pair : tensors) {
-    if (out_tensor_pair.second->shape() == shape_vector) {
-      match_tensors.emplace_back(out_tensor_pair.second);
-    }
-  }
-  if (match_tensors.empty() || match_tensors.size() != 1) {
-    MS_LOG(ERROR) << "get tensor by node shape failed";
-    return nullptr;
-  }
-  return match_tensors.front();
-}
-
-tensor::MSTensor *Benchmark::GetTensorByNameOrShape(const std::string &node_or_tensor_name,
-                                                    const std::vector<size_t> &dims) {
-  tensor::MSTensor *tensor = nullptr;
-  auto tensors = session_->GetOutputsByNodeName(node_or_tensor_name);
-  if (tensors.empty() || tensors.size() != 1) {
-    MS_LOG(INFO) << "Cannot find output node: " << node_or_tensor_name
-                 << " or node has more than one output tensor, switch to GetOutputByTensorName";
-    tensor = session_->GetOutputByTensorName(node_or_tensor_name);
-    if (tensor == nullptr) {
-      return GetTensorByNodeShape(dims);
-    }
-  } else {
-    tensor = tensors.front();
-  }
-  return tensor;
-}
-
 int Benchmark::CompareDataGetTotalBiasAndSize(const std::string &name, tensor::MSTensor *tensor, float *total_bias,
                                               int *total_size) {
   float bias = 0;
diff --git a/mindspore/lite/tools/benchmark/benchmark.h b/mindspore/lite/tools/benchmark/benchmark.h
index 69124e8db93..fdc0da2a019 100644
--- a/mindspore/lite/tools/benchmark/benchmark.h
+++ b/mindspore/lite/tools/benchmark/benchmark.h
@@ -60,10 +60,6 @@ class MS_API Benchmark : public BenchmarkBase {
 
   int CompareOutput() override;
 
-  tensor::MSTensor *GetTensorByNameOrShape(const std::string &node_or_tensor_name, const std::vector<size_t> &dims);
-
-  tensor::MSTensor *GetTensorByNodeShape(const std::vector<size_t> &node_shape);
-
   int CompareDataGetTotalBiasAndSize(const std::string &name, tensor::MSTensor *tensor, float *total_bias,
                                      int *total_size);
 
diff --git a/mindspore/lite/tools/benchmark/benchmark_unified_api.cc b/mindspore/lite/tools/benchmark/benchmark_unified_api.cc
index 69a35ef02d6..a80ed571d37 100644
--- a/mindspore/lite/tools/benchmark/benchmark_unified_api.cc
+++ b/mindspore/lite/tools/benchmark/benchmark_unified_api.cc
@@ -120,7 +120,7 @@ int BenchmarkUnifiedApi::ReadTensorData(std::ifstream &in_file_stream, const std
   if (this->benchmark_data_.find(tensor_name) != this->benchmark_data_.end()) {
     return RET_OK;
   }
-  mindspore::MSTensor tensor = GetMSTensorByNameOrShape(tensor_name, dims);
+  mindspore::MSTensor tensor = ms_model_.GetOutputByTensorName(tensor_name);
   if (tensor == nullptr) {
     MS_LOG(ERROR) << "Get tensor failed, tensor name: " << tensor_name;
     return RET_ERROR;
@@ -178,10 +178,10 @@ int BenchmarkUnifiedApi::CompareOutput() {
   float total_bias = 0;
   int total_size = 0;
   for (const auto &calib_tensor : benchmark_data_) {
-    std::string node_or_tensor_name = calib_tensor.first;
-    mindspore::MSTensor tensor = GetMSTensorByNameOrShape(node_or_tensor_name, calib_tensor.second->shape);
+    std::string tensor_name = calib_tensor.first;
+    mindspore::MSTensor tensor = ms_model_.GetOutputByTensorName(tensor_name);
     if (tensor == nullptr) {
-      MS_LOG(ERROR) << "Get tensor failed, tensor name: " << node_or_tensor_name;
+      MS_LOG(ERROR) << "Get tensor failed, tensor name: " << tensor_name;
       return RET_ERROR;
     }
     int ret;
@@ -190,7 +190,7 @@ int BenchmarkUnifiedApi::CompareOutput() {
       MS_LOG(ERROR) << "Unsupported  kObjectTypeString:";
       return RET_ERROR;
     } else {
-      ret = CompareDataGetTotalBiasAndSize(node_or_tensor_name, &tensor, &total_bias, &total_size);
+      ret = CompareDataGetTotalBiasAndSize(tensor_name, &tensor, &total_bias, &total_size);
     }
     if (ret != RET_OK) {
       MS_LOG(ERROR) << "Error in CompareData";
@@ -217,36 +217,6 @@ int BenchmarkUnifiedApi::CompareOutput() {
   return RET_OK;
 }
 
-mindspore::MSTensor BenchmarkUnifiedApi::GetMSTensorByNodeShape(const std::vector<size_t> &node_shape) {
-  std::vector<mindspore::MSTensor> match_tensors;
-  std::vector<int64_t> shape_vector = ConverterToInt64Vector<size_t>(node_shape);
-  auto tensors = ms_model_.GetOutputs();
-  for (auto &out_tensor_pair : tensors) {
-    if (out_tensor_pair.Shape() == shape_vector) {
-      match_tensors.emplace_back(out_tensor_pair);
-    }
-  }
-
-  return match_tensors.front();
-}
-
-mindspore::MSTensor BenchmarkUnifiedApi::GetMSTensorByNameOrShape(const std::string &node_or_tensor_name,
-                                                                  const std::vector<size_t> &dims) {
-  mindspore::MSTensor tensor;
-  auto tensors = ms_model_.GetOutputsByNodeName(node_or_tensor_name);
-  if (tensors.empty() || tensors.size() != 1) {
-    MS_LOG(INFO) << "Cannot find output node: " << node_or_tensor_name
-                 << " or node has more than one output tensor, switch to GetOutputByTensorName";
-    tensor = ms_model_.GetOutputByTensorName(node_or_tensor_name);
-    if (tensor == nullptr) {
-      return GetMSTensorByNodeShape(dims);
-    }
-  } else {
-    tensor = tensors.front();
-  }
-  return tensor;
-}
-
 int BenchmarkUnifiedApi::CompareDataGetTotalBiasAndSize(const std::string &name, mindspore::MSTensor *tensor,
                                                         float *total_bias, int *total_size) {
   float bias = 0;
diff --git a/mindspore/lite/tools/benchmark/benchmark_unified_api.h b/mindspore/lite/tools/benchmark/benchmark_unified_api.h
index 6aebb5f6779..503d07f56f3 100644
--- a/mindspore/lite/tools/benchmark/benchmark_unified_api.h
+++ b/mindspore/lite/tools/benchmark/benchmark_unified_api.h
@@ -52,8 +52,6 @@ class MS_API BenchmarkUnifiedApi : public BenchmarkBase {
   int CompareDataGetTotalBiasAndSize(const std::string &name, mindspore::MSTensor *tensor, float *total_bias,
                                      int *total_size);
   void InitContext(const std::shared_ptr<mindspore::Context> &context);
-  mindspore::MSTensor GetMSTensorByNodeShape(const std::vector<size_t> &node_shape);
-  mindspore::MSTensor GetMSTensorByNameOrShape(const std::string &node_or_tensor_name, const std::vector<size_t> &dims);
 
   // call GenerateRandomData to fill inputTensors
   int GenerateInputData() override;
diff --git a/mindspore/lite/tools/benchmark_train/main.cc b/mindspore/lite/tools/benchmark_train/main.cc
index 901897f460f..570849efbff 100644
--- a/mindspore/lite/tools/benchmark_train/main.cc
+++ b/mindspore/lite/tools/benchmark_train/main.cc
@@ -15,13 +15,31 @@
  */
 
 #include <malloc.h>
+#include <unistd.h>
+#include <fstream>
 #include "tools/benchmark_train/net_train.h"
 #include "include/version.h"
 
+void PrintMem() {
+  std::string proc_file = "/proc/" + std::to_string(getpid()) + "/status";
+  std::ifstream infile(proc_file);
+  if (infile.good()) {
+    std::string line;
+    while (std::getline(infile, line)) {
+      if (line.find("VmHWM") != std::string::npos) {
+        std::cout << line << std::endl;
+      }
+    }
+    infile.close();
+    struct mallinfo info = mallinfo();
+    std::cout << "Arena allocation: " << info.arena + info.hblkhd << std::endl;
+    // process pair (a,b)
+  }
+}
+
 int main(int argc, const char **argv) {
   MS_LOG(INFO) << mindspore::lite::Version();
   int res = mindspore::lite::RunNetTrain(argc, argv);
-  struct mallinfo info = mallinfo();
-  std::cout << "Total allocation: " << info.arena + info.hblkhd << std::endl;
+  PrintMem();
   return res;
 }
diff --git a/mindspore/lite/tools/common/func_graph_subgraph.cc b/mindspore/lite/tools/common/func_graph_subgraph.cc
index 79d900fa277..a507353a63b 100644
--- a/mindspore/lite/tools/common/func_graph_subgraph.cc
+++ b/mindspore/lite/tools/common/func_graph_subgraph.cc
@@ -482,7 +482,9 @@ void SubGraph::CreateCNodeForPartialSubGraph(
   // move cnode from belong_graph to subgraph
   for (auto &node : this->GetNodes()) {
     sub_graph->AddNode(node);
-    node->set_func_graph(sub_graph);
+    if (!utils::isa<ValueNodePtr>(node)) {
+      node->set_func_graph(sub_graph);
+    }
     for (size_t i = 0; i < node->inputs().size(); i++) {
       if (node == nullptr || node->inputs().at(i)) {
         continue;
diff --git a/mindspore/lite/tools/common/node_util.cc b/mindspore/lite/tools/common/node_util.cc
index 65d6a8659e9..c7ee6aeec17 100644
--- a/mindspore/lite/tools/common/node_util.cc
+++ b/mindspore/lite/tools/common/node_util.cc
@@ -24,6 +24,9 @@
 #include "tools/common/graph_util.h"
 #include "tools/common/tensor_util.h"
 #include "src/runtime/infer_manager.h"
+#include "mindspore/core/ops/switch.h"
+#include "mindspore/core/ops/call.h"
+#include "mindspore/core/ops/fusion/partial_fusion.h"
 
 namespace mindspore {
 namespace lite {
@@ -335,5 +338,77 @@ size_t GetCNodeOutputsSize(const std::shared_ptr<AnfNode> &anf_node, bool train_
     return 1;
   }
 }
+
+bool IsPartialFusion(const AnfNodePtr &node) {
+  if (node == nullptr) {
+    return false;
+  }
+  if (node->isa<mindspore::CNode>()) {
+    auto cnode = node->cast<CNodePtr>();
+    auto vnode_value = cnode->input(0)->cast<ValueNodePtr>()->value();
+    return GetValue<NamedPtr>(vnode_value)->name() == "PartialFusion";
+  }
+  return false;
+}
+
+bool IsCall(const AnfNodePtr &node) {
+  if (node == nullptr) {
+    return false;
+  }
+  if (!utils::isa<CNodePtr>(node)) {
+    return false;
+  }
+  auto cnode = node->cast<CNodePtr>();
+  if (cnode->inputs().empty()) {
+    return false;
+  }
+  auto cnode_first_input = cnode->input(0);
+  if (utils::isa<CNodePtr>(cnode_first_input)) {
+    return true;
+  }
+  if (utils::isa<ValueNode>(cnode_first_input)) {
+    auto vnode = cnode_first_input->cast<ValueNodePtr>();
+    return GetValueNode<FuncGraphPtr>(vnode) != nullptr;
+  }
+  return false;
+}
+
+bool IsSwitch(const AnfNodePtr &node) {
+  if (node == nullptr) {
+    return false;
+  }
+  if (!utils::isa<CNodePtr>(node)) {
+    return false;
+  }
+  return opt::CheckPrimitiveType(node, prim::kPrimSwitch);
+}
+
+bool IsMakeTuple(const AnfNodePtr &node) {
+  if (node == nullptr) {
+    return false;
+  }
+  if (!utils::isa<CNodePtr>(node)) {
+    return false;
+  }
+  return opt::CheckPrimitiveType(node, prim::kPrimMakeTuple);
+}
+
+ValueNodePtr GetPartialFusionPrim() {
+  auto partial_prim = std::make_shared<mindspore::ops::PartialFusion>();
+  ValueNodePtr partial_anf_prim = NewValueNode(partial_prim);
+  return partial_anf_prim;
+}
+
+ValueNodePtr GetSwitchAnfPrim() {
+  auto switch_prim = std::make_shared<mindspore::ops::Switch>();
+  ValueNodePtr switch_anf_prim = NewValueNode(switch_prim);
+  return switch_anf_prim;
+}
+
+ValueNodePtr GetCallAnfPrim() {
+  auto call_prim = std::make_shared<mindspore::ops::Call>();
+  ValueNodePtr call_anf_prim = NewValueNode(call_prim);
+  return call_anf_prim;
+}
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/tools/common/node_util.h b/mindspore/lite/tools/common/node_util.h
index 6a2f1a560ae..5f0870e78cb 100644
--- a/mindspore/lite/tools/common/node_util.h
+++ b/mindspore/lite/tools/common/node_util.h
@@ -413,6 +413,20 @@ static STATUS TransFilterFormat(schema::TensorT *tensor, kTransFilterType type)
 STATUS TransFilterFormat(schema::TensorT *tensor, schema::Format dstFormat);
 
 size_t GetCNodeOutputsSize(const std::shared_ptr<AnfNode> &anf_node, bool train_flag = false);
+
+bool IsPartialFusion(const AnfNodePtr &node);
+
+bool IsCall(const AnfNodePtr &node);
+
+bool IsSwitch(const AnfNodePtr &node);
+
+bool IsMakeTuple(const AnfNodePtr &node);
+
+ValueNodePtr GetPartialFusionPrim();
+
+ValueNodePtr GetSwitchAnfPrim();
+
+ValueNodePtr GetCallAnfPrim();
 }  // namespace lite
 }  // namespace mindspore
 #endif  // MINDSPORE_LITE_TOOLS_COMMON_NODE_UTIL_H
diff --git a/mindspore/lite/tools/converter/CMakeLists.txt b/mindspore/lite/tools/converter/CMakeLists.txt
index 52e3e50abe3..0440f8c9383 100644
--- a/mindspore/lite/tools/converter/CMakeLists.txt
+++ b/mindspore/lite/tools/converter/CMakeLists.txt
@@ -26,6 +26,7 @@ file(GLOB_RECURSE CONVERTER_SRC RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
         ${CMAKE_CURRENT_SOURCE_DIR}/../common/node_util.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/../common/tensor_util.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/../common/string_util.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/../common/lite_utils.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/../common/protobuf_utils.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/../common/func_graph_subgraph.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/../common/flag_parser.cc
@@ -39,9 +40,11 @@ file(GLOB_RECURSE CONVERTER_SRC RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
         ${CMAKE_CURRENT_SOURCE_DIR}/parser/conv1d_inout_adjust.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/parser/inputs_adjust.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/parser/unify_format.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/parser/lstm_adjust_pass.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/import/mindspore_importer.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/import/primitive_adjust.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/import/mindir_adjust.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/import/mindir_control_flow_adjust.cc
 
         ../optimizer/common/node_pass_extends.cc
         ../optimizer/common/pass_manager_extends.cc
@@ -62,11 +65,11 @@ file(GLOB_RECURSE CONVERTER_SRC RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
         ../optimizer/fusion/conv_bn_fusion.cc
         ../optimizer/fusion/conv_tuplegetitem_fusion.cc
         ../optimizer/fusion/constant_folding_fusion.cc
-        ../optimizer/fusion/quant_dtype_cast_fusion.cc
         ../optimizer/fusion/norm_fusion.cc
         ../optimizer/fusion/batchmatmul_fusion.cc
         ../optimizer/fusion/sigmoid_mul_fusion.cc
         ../optimizer/fusion/conv_conv_fusion.cc
+        ../optimizer/fusion/conv_pad_fusion.cc
         ../optimizer/fusion/tflite_lstm_cell_fusion.cc
         ../optimizer/fusion/tf_lstm_cell_fusion.cc
         ../optimizer/fusion/tf_bidirection_gru_fusion.cc
@@ -114,6 +117,7 @@ file(GLOB_RECURSE CONVERTER_SRC RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
         ../optimizer/graph/reduce_same_act_pass.cc
         ../optimizer/graph/split_one_pass.cc
         ../optimizer/graph/find_const_subgraph_pass.cc
+        ../optimizer/graph/specify_graph_input_format.cc
         )
 
 add_subdirectory(../anf_exporter anf_exporter)
@@ -133,6 +137,7 @@ set(LITE_SRC
         ${SRC_DIR}/common/context_util.cc
         ${SRC_DIR}/common/graph_util.cc
         ${SRC_DIR}/common/string_util.cc
+        ${SRC_DIR}/common/lite_utils.cc
         ${SRC_DIR}/common/prim_util.cc
         ${SRC_DIR}/common/tensor_util.cc
         ${SRC_DIR}/runtime/inner_allocator.cc
@@ -142,10 +147,6 @@ set(LITE_SRC
         ${SRC_DIR}/tensor.cc
         ${SRC_DIR}/ms_tensor.cc
         ${SRC_DIR}/tensorlist.cc
-        ${SRC_DIR}/registry/kernel_interface_registry.cc
-        ${SRC_DIR}/registry/register_utils.cc
-        ${SRC_DIR}/registry/register_kernel_impl.cc
-        ${SRC_DIR}/registry/kernel_interface.cc
         ${SRC_DIR}/kernel_registry.cc
         ${SRC_DIR}/inner_kernel.cc
         ${SRC_DIR}/lite_kernel.cc
diff --git a/mindspore/lite/tools/converter/anf_transform.cc b/mindspore/lite/tools/converter/anf_transform.cc
index 65b50868f20..219676c7f99 100644
--- a/mindspore/lite/tools/converter/anf_transform.cc
+++ b/mindspore/lite/tools/converter/anf_transform.cc
@@ -36,6 +36,7 @@
 #include "tools/optimizer/fusion/batchmatmul_fusion.h"
 #include "tools/optimizer/fusion/sigmoid_mul_fusion.h"
 #include "tools/optimizer/fusion/conv_conv_fusion.h"
+#include "tools/optimizer/fusion/conv_pad_fusion.h"
 #include "tools/optimizer/fusion/tflite_lstm_cell_fusion.h"
 #include "tools/optimizer/fusion/tf_lstm_cell_fusion.h"
 #include "tools/optimizer/fusion/tf_bidirection_gru_fusion.h"
@@ -58,6 +59,8 @@
 #include "tools/optimizer/graph/reduce_same_act_pass.h"
 #include "tools/optimizer/graph/split_one_pass.h"
 #include "tools/optimizer/graph/decrease_transpose_algo.h"
+#include "tools/optimizer/graph/specify_graph_input_format.h"
+#include "tools/optimizer/graph/dump_graph.h"
 #include "tools/converter/quantizer/post_training_quantizer.h"
 #include "tools/converter/quantizer/quant_cast.h"
 #include "tools/converter/quantizer/weight_quantizer.h"
@@ -114,7 +117,7 @@ int AnfTransform::RunFusionPass(const FuncGraphPtr &old_graph, const converter::
     fusion_pm->AddPass(std::make_shared<opt::AffineFusion>());
     fusion_pm->AddPass(std::make_shared<opt::AffineActivationFusion>());
   }
-  if (config->fmk == lite::converter::FmkType_MS) {
+  if (config->fmk == converter::kFmkTypeMs) {
     auto remove_unused_cast_pass = std::make_shared<opt::RemoveUnusedCastOpPass>();
     if (remove_unused_cast_pass == nullptr) {
       MS_LOG(ERROR) << "RemoveUnusedCastOpPass should be specified";
@@ -124,6 +127,7 @@ int AnfTransform::RunFusionPass(const FuncGraphPtr &old_graph, const converter::
     fusion_pm->AddPass(remove_unused_cast_pass);
   }
   fusion_pm->AddPass(std::make_shared<opt::ConvConvFusion>());
+  fusion_pm->AddPass(std::make_shared<opt::ConvPadFusion>());
   if (!config->trainModel) {
     fusion_pm->AddPass(std::make_shared<opt::MatMulAddFusion>());
   }
@@ -194,8 +198,8 @@ int AnfTransform::RunParallelPass(const FuncGraphPtr &old_graph, const converter
 int AnfTransform::RunGraphPass(const FuncGraphPtr &old_graph, const converter::Flags *config) {
   auto optimizer = std::make_shared<opt::GraphOptimizer>();
   auto graph_pm = std::make_shared<opt::PassManager>("anf graph pass manager", true);
-  if (config->fmk == lite::converter::FmkType_TFLITE || config->fmk == lite::converter::FmkType_TF ||
-      config->fmk == lite::converter::FmkType_ONNX) {
+  if (config->fmk == converter::kFmkTypeTflite || config->fmk == converter::kFmkTypeTf ||
+      config->fmk == converter::kFmkTypeOnnx) {
     graph_pm->AddPass(std::make_shared<opt::ControlFlowPass>());
   }
   auto slice_prepose_pass = std::make_shared<opt::SlicePreposePass>();
@@ -286,7 +290,7 @@ int AnfTransform::DoSingleGraphQuantize(const FuncGraphPtr &old_graph, const con
     m_quantizer_->flags = *config;
     auto status = m_quantizer_->DoQuantize(old_graph);
     if (status != RET_OK) {
-      MS_LOG(ERROR) << "Quant failed " << status;
+      MS_LOG(ERROR) << "DoQuantization failed " << status;
       ReturnCode::GetSingleReturnCode()->UpdateReturnCode(status);
       return RET_ERROR;
     }
@@ -325,14 +329,18 @@ FuncGraphPtr AnfTransform::TransformFuncGraph(const FuncGraphPtr &old_graph, con
     return nullptr;
   }
 
-  if (!opt::RunExternalPass(old_graph, opt::POSITION_BEGIN)) {
+  if (!RunExternalPass(old_graph, registry::POSITION_BEGIN)) {
     MS_LOG(ERROR) << "Run external pass failed, place is BEGIN";
     return nullptr;
   }
 
-  if (!opt::RunOptimizerPass(old_graph, {"InferShapePass", "DeleteRedundantTranspose", "DecreaseTransposeAlgo"})) {
-    MS_LOG(ERROR) << "Run transpose opt pass failed.";
-    return nullptr;
+  if (!RunOptimizerPass(old_graph, {"InferShapePass"})) {
+    MS_LOG(WARNING) << "Run infershape opt pass failed.";
+  } else {
+    if (!RunOptimizerPass(old_graph, {"DeleteRedundantTranspose", "DecreaseTransposeAlgo"})) {
+      MS_LOG(ERROR) << "Run transpose opt pass failed.";
+      return nullptr;
+    }
   }
 
   auto reduce_act_pass = std::make_shared<opt::ReduceSameActPass>();
@@ -355,12 +363,16 @@ FuncGraphPtr AnfTransform::TransformFuncGraph(const FuncGraphPtr &old_graph, con
     }
   }
 
-  if (!opt::RunOptimizerPass(old_graph, {"InferShapePass", "DeleteRedundantTranspose", "DecreaseTransposeAlgo"})) {
-    MS_LOG(ERROR) << "Run transpose opt pass failed.";
-    return nullptr;
+  if (!RunOptimizerPass(old_graph, {"InferShapePass"})) {
+    MS_LOG(WARNING) << "Run infershape opt pass failed.";
+  } else {
+    if (!RunOptimizerPass(old_graph, {"DeleteRedundantTranspose", "DecreaseTransposeAlgo"})) {
+      MS_LOG(ERROR) << "Run transpose opt pass failed.";
+      return nullptr;
+    }
   }
 
-  if (!opt::RunExternalPass(old_graph, opt::POSITION_END)) {
+  if (!RunExternalPass(old_graph, registry::POSITION_END)) {
     MS_LOG(ERROR) << "Run external pass failed, place is END";
     return nullptr;
   }
@@ -382,17 +394,25 @@ FuncGraphPtr AnfTransform::TransformFuncGraph(const FuncGraphPtr &old_graph, con
     MS_LOG(ERROR) << "Do Quantize failed.";
     return nullptr;
   }
+
+  if (!RunOptimizerPass(old_graph, {"SpecifyGraphInputFormat"})) {
+    MS_LOG(ERROR) << "Run transpose opt pass failed.";
+    return nullptr;
+  }
   return old_graph;
 }
 
 void AnfTransform::AppendPassToStoreRoom(const converter::Flags *config) {
   auto fmk = config->fmk;
   auto is_train = config->trainModel;
-  opt::PassRegistry("DecreaseTransposeAlgo", std::make_shared<opt::DecreaseTransposeAlgo>(fmk, is_train));
-  opt::PassRegistry("DeleteRedundantTranspose", std::make_shared<opt::DeleteRedundantTranspose>());
-  opt::PassRegistry("InferShapePass", std::make_shared<opt::InferShapePass>(fmk, is_train));
-  opt::PassRegistry("ToNCHWFormat", std::make_shared<opt::ToNCHWFormat>(fmk, is_train));
-  opt::PassRegistry("ToNHWCFormat", std::make_shared<opt::ToNHWCFormat>(fmk, is_train));
+  registry::PassRegistry("DecreaseTransposeAlgo", std::make_shared<opt::DecreaseTransposeAlgo>(fmk, is_train));
+  registry::PassRegistry("DeleteRedundantTranspose", std::make_shared<opt::DeleteRedundantTranspose>());
+  registry::PassRegistry("InferShapePass", std::make_shared<opt::InferShapePass>(fmk, is_train));
+  registry::PassRegistry("ToNCHWFormat", std::make_shared<opt::ToNCHWFormat>(fmk, is_train));
+  registry::PassRegistry("ToNHWCFormat", std::make_shared<opt::ToNHWCFormat>(fmk, is_train));
+  registry::PassRegistry("SpecifyGraphInputFormat",
+                         std::make_shared<opt::SpecifyGraphInputFormat>(config->graphInputFormat));
+  registry::PassRegistry("DumpGraph", std::make_shared<opt::DumpGraph>(config));
 }
 
 FuncGraphPtr AnfTransform::Transform(const FuncGraphPtr &main_graph, const converter::Flags *config) {
diff --git a/mindspore/lite/tools/converter/converter.cc b/mindspore/lite/tools/converter/converter.cc
index feab3a7786a..b804fa61a50 100644
--- a/mindspore/lite/tools/converter/converter.cc
+++ b/mindspore/lite/tools/converter/converter.cc
@@ -26,23 +26,22 @@
 #include "src/train/train_populate_parameter.h"
 #include "include/registry/model_parser_registry.h"
 #include "src/common/dynamic_library_loader.h"
-#include "tools/converter/export_model.h"
 #include "tools/converter/parser/parser_utils.h"
 #include "tools/converter/import/mindspore_importer.h"
 namespace mindspore {
 namespace lite {
 namespace {
 void InitConverterParameters(const converter::Flags &flag, converter::ConverterParameters *converter_parameters) {
-  converter_parameters->fmk_ = flag.fmk;
-  converter_parameters->quant_type_ = flag.quantType;
-  converter_parameters->model_file_ = flag.modelFile;
-  converter_parameters->weight_file_ = flag.weightFile;
+  converter_parameters->fmk = flag.fmk;
+  converter_parameters->quant_type = flag.quantType;
+  converter_parameters->model_file = flag.modelFile;
+  converter_parameters->weight_file = flag.weightFile;
 }
 }  // namespace
 
 FuncGraphPtr Converter::BuildFuncGraph(const converter::Flags &flag) {
   FuncGraphPtr func_graph = nullptr;
-  if (flag.fmk == converter::FmkType::FmkType_MS) {
+  if (flag.fmk == converter::FmkType::kFmkTypeMs) {
     kernel::PopulateTrainParameters();
     MindsporeImporter ms_import;
     func_graph = ms_import.ImportMindIR(flag);
@@ -50,7 +49,7 @@ FuncGraphPtr Converter::BuildFuncGraph(const converter::Flags &flag) {
       return nullptr;
     }
   } else {
-    model_parser_ = ModelParserRegistry::GetInstance()->GetModelParser(flag.fmk);
+    model_parser_ = registry::ModelParserRegistry::GetModelParser(flag.fmk);
     if (model_parser_ == nullptr) {
       return nullptr;
     }
@@ -118,6 +117,14 @@ schema::MetaGraphT *Converter::Convert(const std::unique_ptr<converter::Flags> &
     ReturnCode::GetSingleReturnCode()->UpdateReturnCode(status);
     return nullptr;
   }
+
+  // set output tensor names to the original names, the output_names is null in nnie converter.
+  auto output_names = ConverterContext::GetInstance()->GetGraphOutputTensorNames();
+  MS_ASSERT(output_names.size() == meta_graphT->outputIndex.size());
+  for (size_t idx = 0; idx < output_names.size(); idx++) {
+    auto &tensor = meta_graph->allTensors.at(meta_graph->outputIndex.at(idx));
+    tensor->name = output_names.at(idx);
+  }
   return meta_graph;
 }
 
@@ -141,8 +148,6 @@ int RunConverter(int argc, const char **argv) {
     }
     return status;
   }
-  // Init dump graph func
-  ExportModelInit(flags.get());
   // Load graph
   MS_LOG(DEBUG) << "start reading model file";
   Converter cvt;
diff --git a/mindspore/lite/tools/converter/converter.h b/mindspore/lite/tools/converter/converter.h
index 4d4a054325b..96a0dd74957 100644
--- a/mindspore/lite/tools/converter/converter.h
+++ b/mindspore/lite/tools/converter/converter.h
@@ -39,7 +39,7 @@ class Converter {
   FuncGraphPtr BuildFuncGraph(const converter::Flags &flag);
 
  protected:
-  ModelParser *model_parser_ = nullptr;
+  converter::ModelParser *model_parser_ = nullptr;
   std::unique_ptr<GraphDefTransform> metagraph_transform_ = std::make_unique<GraphDefTransform>();
   std::unique_ptr<AnfTransform> funcgraph_transform_ = std::make_unique<AnfTransform>();
 };
diff --git a/mindspore/lite/tools/converter/converter_context.h b/mindspore/lite/tools/converter/converter_context.h
index 87845b8347b..137c16b801c 100644
--- a/mindspore/lite/tools/converter/converter_context.h
+++ b/mindspore/lite/tools/converter/converter_context.h
@@ -106,6 +106,16 @@ class ConverterContext {
   }
   size_t GetGraphInputTensorShapeMapSize() { return graph_input_tensor_shape_map_.size(); }
 
+  void SetGraphOutputTensorNames(const std::vector<std::string> &output_names) {
+    graph_output_tensor_names_ = output_names;
+  }
+
+  const std::vector<std::string> GetGraphOutputTensorNames() const { return graph_output_tensor_names_; }
+
+  void AddGraphInputTensorNames(const std::string &input_name) { graph_input_tensor_names_.emplace_back(input_name); }
+
+  const std::vector<std::string> GetGraphInputTensorNames() const { return graph_input_tensor_names_; }
+
  private:
   ConverterContext() {}
   virtual ~ConverterContext() = default;
@@ -113,6 +123,8 @@ class ConverterContext {
   std::map<int32_t, int32_t> graph_input_data_type_map_;
   std::map<int32_t, int32_t> graph_output_data_type_map_;
   std::map<std::string, std::vector<int64_t>> graph_input_tensor_shape_map_;
+  std::vector<std::string> graph_input_tensor_names_;
+  std::vector<std::string> graph_output_tensor_names_;
 };
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/tools/converter/converter_flags.cc b/mindspore/lite/tools/converter/converter_flags.cc
index c17fe9a2814..bd6edce3400 100644
--- a/mindspore/lite/tools/converter/converter_flags.cc
+++ b/mindspore/lite/tools/converter/converter_flags.cc
@@ -27,8 +27,9 @@
 #include "tools/converter/converter_context.h"
 
 namespace mindspore {
-namespace lite {
 namespace converter {
+using mindspore::lite::RET_INPUT_PARAM_INVALID;
+using mindspore::lite::RET_OK;
 namespace {
 constexpr int kBase = 10;
 constexpr int kQuantBitNumInt16 = 16;
@@ -75,6 +76,8 @@ Flags::Flags() {
           "set this option. Model input shapes is same with origin model by default."
           "e.g. inTensor1:1,32,32,32;inTensor2:1,1,32,32,4",
           "");
+  AddFlag(&Flags::graphInputFormatStr, "inputFormat",
+          "Assign the format of model inputs. Valid only for 4-dimensional input. NHWC | NCHW", "NHWC");
 }
 
 int Flags::InitInputOutputDataType() {
@@ -111,21 +114,21 @@ int Flags::InitInputOutputDataType() {
 
 int Flags::InitFmk() {
   if (this->fmkIn == "CAFFE") {
-    this->fmk = FmkType_CAFFE;
+    this->fmk = kFmkTypeCaffe;
   } else if (this->fmkIn == "MINDIR") {
-    this->fmk = FmkType_MS;
+    this->fmk = kFmkTypeMs;
   } else if (this->fmkIn == "TFLITE") {
-    this->fmk = FmkType_TFLITE;
+    this->fmk = kFmkTypeTflite;
   } else if (this->fmkIn == "ONNX") {
-    this->fmk = FmkType_ONNX;
+    this->fmk = kFmkTypeOnnx;
   } else if (this->fmkIn == "TF") {
-    this->fmk = FmkType_TF;
+    this->fmk = kFmkTypeTf;
   } else {
     std::cerr << "INPUT ILLEGAL: fmk must be TF|TFLITE|CAFFE|MINDIR|ONNX" << std::endl;
     return RET_INPUT_PARAM_INVALID;
   }
 
-  if (this->fmk != FmkType_CAFFE && !weightFile.empty()) {
+  if (this->fmk != kFmkTypeCaffe && !weightFile.empty()) {
     std::cerr << "INPUT ILLEGAL: weightFile is not a valid flag" << std::endl;
     return RET_INPUT_PARAM_INVALID;
   }
@@ -159,20 +162,28 @@ int Flags::QuantParamInputCheck() {
     std::cerr << "bitNum should be a valid number." << std::endl;
     return RET_INPUT_PARAM_INVALID;
   }
-  if (this->bitNum <= 0 || this->bitNum > kQuantBitNumInt16) {
-    std::cerr << "bitNum should be greater than zero and lesser than 16 currently." << std::endl;
-    return RET_INPUT_PARAM_INVALID;
+  if (this->quantType == schema::QuantType_WeightQuant) {
+    if (this->bitNum < 0 || this->bitNum > kQuantBitNumInt16) {
+      std::cerr << "bitNum should be greater than zero and less than 16 currently." << std::endl;
+      return RET_INPUT_PARAM_INVALID;
+    }
+  } else {
+    if (this->bitNum <= 0 || this->bitNum > kQuantBitNumInt16) {
+      std::cerr << "bitNum should be greater or equal to zero and less than 16 currently." << std::endl;
+      return RET_INPUT_PARAM_INVALID;
+    }
   }
+
   return RET_OK;
 }
 
 int Flags::InitQuantParam() {
   if (this->quantTypeStr == "WeightQuant") {
-    this->quantType = QuantType_WeightQuant;
+    this->quantType = schema::QuantType_WeightQuant;
   } else if (this->quantTypeStr == "PostTraining") {
-    this->quantType = QuantType_PostTraining;
+    this->quantType = schema::QuantType_PostTraining;
   } else if (this->quantTypeStr.empty()) {
-    this->quantType = QuantType_QUANT_NONE;
+    this->quantType = schema::QuantType_QUANT_NONE;
   } else {
     std::cerr << "INPUT ILLEGAL: quantType must be WeightQuant|PostTraining" << std::endl;
     return RET_INPUT_PARAM_INVALID;
@@ -193,7 +204,7 @@ int Flags::InitTrainModel() {
   }
 
   if (this->trainModel) {
-    if (this->fmk != FmkType_MS) {
+    if (this->fmk != kFmkTypeMs) {
       std::cerr << "INPUT ILLEGAL: train model converter supporting only MINDIR format" << std::endl;
       return RET_INPUT_PARAM_INVALID;
     }
@@ -210,12 +221,15 @@ int Flags::InitTrainModel() {
 }
 
 int Flags::InitInTensorShape() {
+  if (this->inTensorShape.empty()) {
+    return RET_OK;
+  }
   std::string content = this->inTensorShape;
   std::vector<int64_t> shape;
-  auto shape_strs = StrSplit(content, std::string(";"));
+  auto shape_strs = lite::StrSplit(content, std::string(";"));
   for (const auto &shape_str : shape_strs) {
     shape.clear();
-    auto string_split = StrSplit(shape_str, std::string(":"));
+    auto string_split = lite::StrSplit(shape_str, std::string(":"));
     auto name = string_split[0];
     if (name.empty()) {
       MS_LOG(ERROR) << "input tensor name is empty";
@@ -224,19 +238,31 @@ int Flags::InitInTensorShape() {
     if (dim_strs.empty()) {
       MS_LOG(ERROR) << "input tensor dim string is empty";
     }
-    auto dims = StrSplit(dim_strs, std::string(","));
+    auto dims = lite::StrSplit(dim_strs, std::string(","));
     if (dims.empty()) {
       MS_LOG(ERROR) << "input tensor dim is empty";
     }
     for (const auto &dim : dims) {
       if (std::stoi(dim) < 0) {
         MS_LOG(ERROR) << "Unsupported dim < 0.";
-        return RET_ERROR;
+        return lite::RET_ERROR;
       } else {
         shape.push_back(std::stoi(dim));
       }
     }
-    ConverterContext::GetInstance()->UpdateGraphInputTensorShape(name, shape);
+    lite::ConverterContext::GetInstance()->UpdateGraphInputTensorShape(name, shape);
+  }
+  return RET_OK;
+}
+
+int Flags::InitGraphInputFormat() {
+  if (this->graphInputFormatStr == "NHWC") {
+    graphInputFormat = mindspore::NHWC;
+  } else if (this->graphInputFormatStr == "NCHW") {
+    graphInputFormat = mindspore::NCHW;
+  } else if (!this->graphInputFormatStr.empty()) {
+    MS_LOG(ERROR) << "graph input format is invalid.";
+    return RET_INPUT_PARAM_INVALID;
   }
   return RET_OK;
 }
@@ -247,7 +273,7 @@ int Flags::InitConfigFile() {
     const char *delimiter = ";";
     auto relative_path = SplitStringToVector(plugins_path_str, *delimiter);
     for (size_t i = 0; i < relative_path.size(); i++) {
-      this->pluginsPath.push_back(RealPath(relative_path[i].c_str()));
+      this->pluginsPath.push_back(lite::RealPath(relative_path[i].c_str()));
     }
   }
 
@@ -271,9 +297,9 @@ int Flags::Init(int argc, const char **argv) {
   int ret;
   if (argc == 1) {
     std::cout << this->Usage() << std::endl;
-    return RET_SUCCESS_EXIT;
+    return lite::RET_SUCCESS_EXIT;
   }
-  Option<std::string> err = this->ParseFlags(argc, argv);
+  lite::Option<std::string> err = this->ParseFlags(argc, argv);
 
   if (err.IsSome()) {
     std::cerr << err.Get() << std::endl;
@@ -283,7 +309,7 @@ int Flags::Init(int argc, const char **argv) {
 
   if (this->help) {
     std::cout << this->Usage() << std::endl;
-    return RET_SUCCESS_EXIT;
+    return lite::RET_SUCCESS_EXIT;
   }
   if (this->modelFile.empty()) {
     std::cerr << "INPUT MISSING: model file path is necessary" << std::endl;
@@ -350,12 +376,16 @@ int Flags::Init(int argc, const char **argv) {
     return RET_INPUT_PARAM_INVALID;
   }
 
-  if (!this->inTensorShape.empty()) {
-    ret = InitInTensorShape();
-    if (ret != RET_OK) {
-      std::cerr << "Init input tensor shape failed." << std::endl;
-      return RET_INPUT_PARAM_INVALID;
-    }
+  ret = InitInTensorShape();
+  if (ret != RET_OK) {
+    std::cerr << "Init input tensor shape failed." << std::endl;
+    return RET_INPUT_PARAM_INVALID;
+  }
+
+  ret = InitGraphInputFormat();
+  if (ret != RET_OK) {
+    std::cerr << "Init graph input format failed." << std::endl;
+    return RET_INPUT_PARAM_INVALID;
   }
   return RET_OK;
 }
@@ -488,5 +518,4 @@ std::vector<std::string> SplitStringToVector(const std::string &raw_str, const c
   return res;
 }
 }  // namespace converter
-}  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/tools/converter/converter_flags.h b/mindspore/lite/tools/converter/converter_flags.h
index 09e8616b8d3..657a64f1a8c 100644
--- a/mindspore/lite/tools/converter/converter_flags.h
+++ b/mindspore/lite/tools/converter/converter_flags.h
@@ -19,19 +19,15 @@
 
 #include <string>
 #include <vector>
-#include "include/registry/framework.h"
+#include "include/api/format.h"
+#include "include/registry/parser_context.h"
 #include "tools/common/flag_parser.h"
 #include "ir/dtype/type_id.h"
 #include "schema/inner/model_generated.h"
 
 namespace mindspore {
-namespace lite {
-using mindspore::schema::QuantType;
-using mindspore::schema::QuantType_AwareTraining;
-using mindspore::schema::QuantType_PostTraining;
-using mindspore::schema::QuantType_QUANT_NONE;
-using mindspore::schema::QuantType_WeightQuant;
 namespace converter {
+using mindspore::schema::QuantType;
 enum ParallelSplitType { SplitNo = 0, SplitByUserRatio = 1, SplitByUserAttr = 2 };
 constexpr auto kMaxSplitRatio = 10;
 constexpr auto kComputeRate = "computeRate";
@@ -65,6 +61,8 @@ class Flags : public virtual mindspore::lite::FlagParser {
 
   int InitInTensorShape();
 
+  int InitGraphInputFormat();
+
   int Init(int argc, const char **argv);
 
  public:
@@ -98,6 +96,8 @@ class Flags : public virtual mindspore::lite::FlagParser {
   std::string inTensorShape;
   std::string dec_key = "";
   std::string dec_mode = "AES-GCM";
+  std::string graphInputFormatStr;
+  mindspore::Format graphInputFormat = mindspore::NHWC;
 };
 
 bool CheckOfflineParallelConfig(const std::string &file, ParallelSplitConfig *parallel_split_config);
@@ -106,7 +106,6 @@ std::string GetStrFromConfigFile(const std::string &file, const std::string &tar
 
 std::vector<std::string> SplitStringToVector(const std::string &raw_str, const char &delimiter);
 }  // namespace converter
-}  // namespace lite
 }  // namespace mindspore
 
 #endif
diff --git a/mindspore/lite/tools/converter/dump_graph.cc b/mindspore/lite/tools/converter/dump_graph.cc
deleted file mode 100644
index 1f71452590b..00000000000
--- a/mindspore/lite/tools/converter/dump_graph.cc
+++ /dev/null
@@ -1,35 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "tools/converter/dump_graph.h"
-#include "tools/converter/dump_graph_init.h"
-#include "include/errorcode.h"
-#include "src/common/log_adapter.h"
-
-namespace mindspore {
-namespace lite {
-static GraphDumpFunc graph_dump_interface = nullptr;
-void InitDumpGraphFunc(const GraphDumpFunc &graph_dump_func) { graph_dump_interface = graph_dump_func; }
-
-int DumpGraph(const FuncGraphPtr &func_graph) {
-  if (graph_dump_interface == nullptr) {
-    MS_LOG(ERROR) << "graph_dump_interface is nullptr, which is not init.";
-    return RET_ERROR;
-  }
-  return graph_dump_interface(func_graph);
-}
-}  // namespace lite
-}  // namespace mindspore
diff --git a/mindspore/lite/tools/converter/dump_graph.h b/mindspore/lite/tools/converter/dump_graph.h
deleted file mode 100644
index 98ee8bdf494..00000000000
--- a/mindspore/lite/tools/converter/dump_graph.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_DUMP_GRAPH_H_
-#define MINDSPORE_LITE_TOOLS_CONVERTER_DUMP_GRAPH_H_
-
-#include <memory>
-#include "include/lite_utils.h"
-
-namespace mindspore {
-class FuncGraph;
-using FuncGraphPtr = std::shared_ptr<FuncGraph>;
-namespace lite {
-using GraphDumpFunc = std::function<int(const FuncGraphPtr &)>;
-int MS_API DumpGraph(const FuncGraphPtr &func_graph);
-}  // namespace lite
-}  // namespace mindspore
-#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_DUMP_GRAPH_H_
diff --git a/mindspore/lite/tools/converter/dump_graph_init.h b/mindspore/lite/tools/converter/dump_graph_init.h
deleted file mode 100644
index 84ac21719ff..00000000000
--- a/mindspore/lite/tools/converter/dump_graph_init.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_DUMP_GRAPH_INIT_H
-#define MINDSPORE_LITE_TOOLS_CONVERTER_DUMP_GRAPH_INIT_H
-
-#include "tools/converter/dump_graph.h"
-
-namespace mindspore {
-namespace lite {
-void MS_API InitDumpGraphFunc(const GraphDumpFunc &graph_dump_func);
-}  // namespace lite
-}  // namespace mindspore
-
-#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_DUMP_GRAPH_INIT_H
diff --git a/mindspore/lite/tools/converter/export_model.cc b/mindspore/lite/tools/converter/export_model.cc
index 6789e67afba..27e481d8588 100644
--- a/mindspore/lite/tools/converter/export_model.cc
+++ b/mindspore/lite/tools/converter/export_model.cc
@@ -26,7 +26,6 @@
 #include "ir/func_graph.h"
 #include "tools/anf_exporter/anf_exporter.h"
 #include "tools/converter/graphdef_transform.h"
-#include "tools/converter/dump_graph_init.h"
 #include "tools/converter/optimizer_manager.h"
 #include "tools/optimizer/graph/control_flow_pass.h"
 
@@ -34,9 +33,6 @@ namespace mindspore {
 namespace lite {
 namespace {
 using NodesMap = std::map<std::string, std::vector<AnfNodePtr>>;
-}
-static converter::Flags *flags = nullptr;
-
 void CloneGraphInputs(const FuncGraphPtr &origin, const FuncGraphPtr &mirror, NodesMap *origin_map,
                       NodesMap *mirror_map) {
   MS_ASSERT(origin != nullptr && mirror != nullptr);
@@ -53,7 +49,8 @@ void CloneGraphInputs(const FuncGraphPtr &origin, const FuncGraphPtr &mirror, No
   }
 }
 
-AnfNodePtr CloneParameterAndValueNode(const CNodePtr &cnode, size_t index, const FuncGraphPtr &mirror_graph) {
+AnfNodePtr CloneParameterAndValueNode(const CNodePtr &cnode, size_t index, const FuncGraphPtr &mirror_graph,
+                                      const converter::Flags *flags) {
   MS_ASSERT(cnode != nullptr && mirror_graph != nullptr);
   if (index >= cnode->size()) {
     MS_LOG(ERROR) << "input index out of range.";
@@ -131,7 +128,7 @@ PrimitivePtr ClonePrimitive(const CNodePtr &cnode) {
   return prim;
 }
 
-FuncGraphPtr CloneFuncGraph(const FuncGraphPtr &graph) {
+FuncGraphPtr CloneFuncGraph(const FuncGraphPtr &graph, const converter::Flags *flags) {
   MS_ASSERT(graph != nullptr);
   auto mirror_graph = std::make_shared<FuncGraph>();
   mirror_graph->set_attrs(graph->attrs());
@@ -157,10 +154,10 @@ FuncGraphPtr CloneFuncGraph(const FuncGraphPtr &graph) {
       if (mirror_input == nullptr) {
         if (IsValueNode<FuncGraph>(origin_input)) {
           auto sub_func_graph = GetValueNode<FuncGraphPtr>(origin_input);
-          auto mirror_sub_graph = CloneFuncGraph(sub_func_graph);
+          auto mirror_sub_graph = CloneFuncGraph(sub_func_graph, flags);
           mirror_input = NewValueNode(mirror_sub_graph);
         } else {
-          mirror_input = CloneParameterAndValueNode(cnode, i, mirror_graph);
+          mirror_input = CloneParameterAndValueNode(cnode, i, mirror_graph, flags);
         }
         if (mirror_input == nullptr) {
           MS_LOG(ERROR) << "node input cannot be found.";
@@ -184,23 +181,24 @@ FuncGraphPtr CloneFuncGraph(const FuncGraphPtr &graph) {
   }
   return mirror_graph;
 }
+}  // namespace
 
-STATUS ExportModel(const FuncGraphPtr &graph) {
+STATUS ExportModel(const FuncGraphPtr &graph, const converter::Flags *flags) {
   MS_ASSERT(graph != nullptr && flags != nullptr);
-  auto mirror_graph = CloneFuncGraph(graph);
+  auto mirror_graph = CloneFuncGraph(graph, flags);
   if (mirror_graph == nullptr) {
     MS_LOG(ERROR) << "Clone funcGraph failed.";
     return RET_ERROR;
   }
   (void)Manage(mirror_graph, true);
-  if (!opt::RunOptimizerPass(mirror_graph, {"InferShapePass", "DeleteRedundantTranspose", "DecreaseTransposeAlgo"})) {
+  if (!RunOptimizerPass(mirror_graph, {"InferShapePass", "DeleteRedundantTranspose", "DecreaseTransposeAlgo"})) {
     MS_LOG(ERROR) << "Run transpose opt pass failed.";
     return RET_ERROR;
   }
   auto optimizer = std::make_shared<opt::GraphOptimizer>();
   auto graph_pm = std::make_shared<opt::PassManager>("anf graph pass manager", true);
-  if (flags->fmk == lite::converter::FmkType_TFLITE || flags->fmk == lite::converter::FmkType_TF ||
-      flags->fmk == lite::converter::FmkType_ONNX) {
+  if (flags->fmk == converter::kFmkTypeTflite || flags->fmk == converter::kFmkTypeTf ||
+      flags->fmk == converter::kFmkTypeOnnx) {
     graph_pm->AddPass(std::make_shared<opt::ControlFlowPass>());
   }
   optimizer->AddPassManager(graph_pm);
@@ -233,11 +231,5 @@ STATUS ExportModel(const FuncGraphPtr &graph) {
   delete meta_graph;
   return status;
 }
-
-void ExportModelInit(converter::Flags *flag) {
-  MS_ASSERT(flag != nullptr);
-  flags = flag;
-  InitDumpGraphFunc(ExportModel);
-}
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/tools/converter/export_model.h b/mindspore/lite/tools/converter/export_model.h
index 46ab469e6b9..03ab259522b 100644
--- a/mindspore/lite/tools/converter/export_model.h
+++ b/mindspore/lite/tools/converter/export_model.h
@@ -18,10 +18,11 @@
 #define MINDSPORE_LITE_TOOLS_CONVERTER_EXPORT_MODEL_H
 
 #include "tools/converter/converter_flags.h"
+#include "ir/func_graph.h"
 
 namespace mindspore {
 namespace lite {
-void ExportModelInit(lite::converter::Flags *flag);
+STATUS ExportModel(const FuncGraphPtr &graph, const converter::Flags *flags);
 }  // namespace lite
 }  // namespace mindspore
 
diff --git a/mindspore/lite/tools/converter/graphdef_transform.cc b/mindspore/lite/tools/converter/graphdef_transform.cc
index dfd9f4a6b58..87685282f4c 100644
--- a/mindspore/lite/tools/converter/graphdef_transform.cc
+++ b/mindspore/lite/tools/converter/graphdef_transform.cc
@@ -74,7 +74,7 @@ int GraphDefTransform::Transform(const converter::Flags &ctx) {
     // init old node indices
     auto old_nodes = GetGraphNodes();
     Optimizer format_trans_optimizer;
-    if (!ctx.trainModel && ctx.fmk != converter::FmkType_ONNX) {
+    if (!ctx.trainModel && ctx.fmk != converter::kFmkTypeOnnx) {
       format_trans_optimizer.AddPass(new (std::nothrow) IsolatedNodeRemovePass());
       format_trans_optimizer.AddPass(new (std::nothrow) SubgraphNodePass(old_nodes));
     }
@@ -117,7 +117,7 @@ int GraphDefTransform::Transform(const converter::Flags &ctx) {
   }
 
   // quantization
-  if (ctx.fmk != converter::FmkType_TF) {
+  if (ctx.fmk != converter::kFmkTypeTf) {
     // init old node indices
     auto old_nodes = GetGraphNodes();
     Optimizer tensor_quant_optimizer;
@@ -134,7 +134,7 @@ int GraphDefTransform::Transform(const converter::Flags &ctx) {
   }
 
   // quantization
-  if (ctx.fmk != converter::FmkType_TF) {
+  if (ctx.fmk != converter::kFmkTypeTf) {
     // init old node indices
     Optimizer quant_node_optimizer;
     quant_node_optimizer.AddPass(new (std::nothrow) TopologicalSortPass());
diff --git a/mindspore/lite/tools/converter/import/mindir_adjust.cc b/mindspore/lite/tools/converter/import/mindir_adjust.cc
index 6b40c9d6188..e9a05790c33 100644
--- a/mindspore/lite/tools/converter/import/mindir_adjust.cc
+++ b/mindspore/lite/tools/converter/import/mindir_adjust.cc
@@ -232,6 +232,10 @@ int MindirAdjust::ComputeQuantParams(std::shared_ptr<AnfNode> anf_node) {
     MS_LOG(ERROR) << "the cnode is invalid.";
     return lite::RET_NULL_PTR;
   }
+  if (utils::isa<CNodePtr>(cnode->input(0))) {
+    MS_LOG(INFO) << "call cnode no need to convert primitive.";
+    return lite::RET_NO_CHANGE;
+  }
   auto value_node = cnode->input(0)->cast<ValueNodePtr>();
   if (value_node == nullptr || value_node->value() == nullptr) {
     MS_LOG(ERROR) << "value node is invalid.";
@@ -239,8 +243,13 @@ int MindirAdjust::ComputeQuantParams(std::shared_ptr<AnfNode> anf_node) {
   }
   auto primitive = value_node->value()->cast<PrimitivePtr>();
   if (primitive == nullptr) {
-    MS_LOG(ERROR) << "the value is not primitive.";
-    return lite::RET_ERROR;
+    if (utils::isa<FuncGraphPtr>(value_node->value())) {
+      MS_LOG(INFO) << "is a funcgraph.";
+      return lite::RET_NO_CHANGE;
+    } else {
+      MS_LOG(ERROR) << "the value is not primitive.";
+      return lite::RET_ERROR;
+    }
   }
   auto inputs = cnode->inputs();
   inputs.erase(inputs.begin());
@@ -257,7 +266,7 @@ int MindirAdjust::ComputeQuantParams(std::shared_ptr<AnfNode> anf_node) {
 }
 
 bool MindirAdjust::Run(const FuncGraphPtr &func_graph) {
-  if (this->fmk_type_ != lite::converter::FmkType_MS) {
+  if (this->fmk_type_ != converter::kFmkTypeMs) {
     MS_LOG(INFO) << "The framework type of model should be mindir.";
     return lite::RET_OK;
   }
diff --git a/mindspore/lite/tools/converter/import/mindir_adjust.h b/mindspore/lite/tools/converter/import/mindir_adjust.h
index e38918fda79..0c8b3df16ee 100644
--- a/mindspore/lite/tools/converter/import/mindir_adjust.h
+++ b/mindspore/lite/tools/converter/import/mindir_adjust.h
@@ -21,7 +21,7 @@
 #include "tools/converter/converter_flags.h"
 #include "tools/optimizer/common/gllo_utils.h"
 
-using mindspore::lite::converter::FmkType;
+using mindspore::converter::FmkType;
 using mindspore::schema::QuantType;
 namespace mindspore::lite {
 class MindirAdjust {
@@ -38,7 +38,7 @@ class MindirAdjust {
   int ComputeQuantParams(AnfNodePtr anf_node);
 
   QuantType quant_type_ = QuantType::QuantType_QUANT_NONE;
-  FmkType fmk_type_ = FmkType::FmkType_MS;
+  FmkType fmk_type_ = FmkType::kFmkTypeMs;
   bool train_flag_ = false;
 };
 }  // namespace mindspore::lite
diff --git a/mindspore/lite/tools/converter/import/mindspore_importer.cc b/mindspore/lite/tools/converter/import/mindspore_importer.cc
index 7dbacf58af4..fd39ac62185 100644
--- a/mindspore/lite/tools/converter/import/mindspore_importer.cc
+++ b/mindspore/lite/tools/converter/import/mindspore_importer.cc
@@ -22,9 +22,11 @@
 #include "tools/converter/parser/parser_utils.h"
 #include "tools/converter/import/primitive_adjust.h"
 #include "tools/converter/import/mindir_adjust.h"
+#include "tools/converter/import/mindir_control_flow_adjust.h"
 #include "tools/optimizer/common/gllo_utils.h"
 #include "tools/common/tensor_util.h"
 #include "tools/converter/parser/unify_format.h"
+#include "tools/converter/parser/lstm_adjust_pass.h"
 
 namespace mindspore::lite {
 namespace {
@@ -43,7 +45,14 @@ STATUS MindsporeImporter::Mindir2AnfAdjust(const FuncGraphPtr &func_graph, const
   mindir_adjust_pass->SetQuantType(flag.quantType);
   mindir_adjust_pass->SetTrainFlag(flag.trainModel);
   if (!mindir_adjust_pass->Run(func_graph)) {
-    MS_LOG(ERROR) << "mindir adjust failed.";
+    MS_LOG(ERROR) << "MindIr adjust failed.";
+    ReturnCode::GetSingleReturnCode()->UpdateReturnCode(RET_ERROR);
+    return RET_ERROR;
+  }
+  auto mindir_control_flow_adjust = std::make_shared<MindIRControlFlowAdjust>();
+  mindir_control_flow_adjust->SetFmkType(flag.fmk);
+  if (!mindir_control_flow_adjust->Run(func_graph)) {
+    MS_LOG(ERROR) << "MindIR control flow adjust failed.";
     ReturnCode::GetSingleReturnCode()->UpdateReturnCode(RET_ERROR);
     return RET_ERROR;
   }
@@ -112,18 +121,24 @@ FuncGraphPtr MindsporeImporter::ImportMindIR(const converter::Flags &flag) {
     return nullptr;
   }
   func_graph->set_attr("graph_name", MakeValue("main_graph"));
-  func_graph->set_attr("fmk", MakeValue(static_cast<int>(converter::FmkType_MS)));
+  func_graph->set_attr("fmk", MakeValue(static_cast<int>(converter::kFmkTypeMs)));
   STATUS status;
   if ((status = Mindir2AnfAdjust(func_graph, flag)) != RET_OK) {
     MS_LOG(ERROR) << "Mindir2AnfAdjust failed.";
     ReturnCode::GetSingleReturnCode()->UpdateReturnCode(status);
     return nullptr;
   }
-  auto unify_format = std::make_shared<UnifyFormatToNHWC>(lite::converter::FmkType_MS, flag.trainModel, flag.quantType);
+  auto unify_format = std::make_shared<UnifyFormatToNHWC>(converter::kFmkTypeMs, flag.trainModel, flag.quantType);
   if (!unify_format->Run(func_graph)) {
     MS_LOG(ERROR) << "Run insert transpose failed.";
     return nullptr;
   }
+
+  auto lstm_adjust_pass = std::make_shared<opt::LstmAdjustPass>();
+  if (!lstm_adjust_pass->Run(func_graph)) {
+    MS_LOG(ERROR) << "Run mindir lstm adjust failed.";
+    return nullptr;
+  }
   return func_graph;
 }
 }  // namespace mindspore::lite
diff --git a/mindspore/lite/tools/converter/import/mindspore_importer.h b/mindspore/lite/tools/converter/import/mindspore_importer.h
index 96bcabe1d7f..883421ba634 100644
--- a/mindspore/lite/tools/converter/import/mindspore_importer.h
+++ b/mindspore/lite/tools/converter/import/mindspore_importer.h
@@ -31,7 +31,7 @@ class MindsporeImporter {
 
  private:
   STATUS Mindir2AnfAdjust(const FuncGraphPtr &func_graph, const converter::Flags &flag);
-  QuantType quant_type_ = schema::QuantType_QUANT_NONE;
+  schema::QuantType quant_type_ = schema::QuantType_QUANT_NONE;
   size_t Hex2ByteArray(const std::string &hex_str, unsigned char *byte_array, size_t max_len);
 };
 
diff --git a/mindspore/lite/tools/converter/import/primitive_adjust.cc b/mindspore/lite/tools/converter/import/primitive_adjust.cc
index fe43ff544d2..a033f306dec 100644
--- a/mindspore/lite/tools/converter/import/primitive_adjust.cc
+++ b/mindspore/lite/tools/converter/import/primitive_adjust.cc
@@ -39,6 +39,8 @@
 #include "ops/fusion/max_pool_fusion.h"
 #include "ops/fusion/mul_fusion.h"
 #include "ops/fusion/pad_fusion.h"
+#include "ops/partial.h"
+#include "ops/fusion/partial_fusion.h"
 #include "ops/fusion/pow_fusion.h"
 #include "ops/fusion/prelu_fusion.h"
 #include "ops/fusion/reduce_fusion.h"
@@ -95,6 +97,8 @@ using mindspore::ops::kNameMaxPool;
 using mindspore::ops::kNameMaxPoolGrad;
 using mindspore::ops::kNameMul;
 using mindspore::ops::kNamePad;
+using mindspore::ops::kNamePartial;
+using mindspore::ops::kNamePartialFusion;
 using mindspore::ops::kNamePow;
 using mindspore::ops::kNamePReLU;
 using mindspore::ops::kNameReduceAll;
@@ -519,14 +523,24 @@ int MoveAttrMapResizeGrad(const CNodePtr &cnode) {
 }  // namespace
 
 bool PrimitiveAdjust::Run(const FuncGraphPtr &func_graphs) {
-  if (this->fmk_type_ != lite::converter::FmkType_MS) {
+  if (this->fmk_type_ != converter::kFmkTypeMs) {
     MS_LOG(INFO) << "The framework type of model should be mindir.";
     return lite::RET_OK;
   }
   MS_ASSERT(graph != nullptr);
+  static auto root_func_manager = Manage(func_graphs);
   std::set<FuncGraphPtr> all_func_graphs = {};
   lite::GetAllFuncGraph(func_graphs, &all_func_graphs);
+  int i = 0;
   for (auto func_graph : all_func_graphs) {
+    func_graph->set_manager(root_func_manager);
+    func_graph->set_attr("fmk", MakeValue(static_cast<int>(FmkType::kFmkTypeMs)));
+    if (i == 0) {
+      func_graph->set_attr("graph_name", MakeValue("main_graph"));
+    } else {
+      func_graph->set_attr("graph_name", MakeValue("subgraph" + std::to_string(i)));
+    }
+    i++;
     auto node_list = TopoSort(func_graph->get_return());
     int status = lite::RET_OK;
     for (auto &node : node_list) {
@@ -537,11 +551,16 @@ bool PrimitiveAdjust::Run(const FuncGraphPtr &func_graphs) {
       MS_ASSERT(cnode->size() > 0);
       auto value_node = cnode->input(0)->cast<ValueNodePtr>();
       if (value_node == nullptr) {
+        if (cnode->input(0)->cast<CNodePtr>() != nullptr) {
+          continue;
+        }
         MS_LOG(ERROR) << "cnode first input is invalid.";
         return false;
       }
       auto prim = GetValueNode<PrimitivePtr>(cnode->input(0));
-      MS_ASSERT(prim != nullptr);
+      if (prim == nullptr) {
+        continue;
+      }
       auto name = prim->name();
       auto adjust_func = PrimitiveAdjustRegistry::GetInstance()->GetPrimitiveCreator(name);
       if (adjust_func == nullptr) {
@@ -594,6 +613,7 @@ REGIST_PRIMITIVE_ADJUST(kNameMaxPool, MoveAttrPool)
 REGIST_PRIMITIVE_ADJUST(kNameMaxPoolGrad, MoveAttrPoolGrad)
 REGIST_PRIMITIVE_ADJUST(kNameMul, MoveAttrMapCommon<ops::MulFusion>)
 REGIST_PRIMITIVE_ADJUST(kNamePad, MoveAttrMapCommon<ops::PadFusion>)
+REGIST_PRIMITIVE_ADJUST(kNamePartial, MoveAttrMapCommon<ops::PartialFusion>)
 REGIST_PRIMITIVE_ADJUST(kNamePow, MoveAttrMapCommon<ops::PowFusion>)
 REGIST_PRIMITIVE_ADJUST(kNamePReLU, MoveAttrMapCommon<ops::PReLUFusion>)
 REGIST_PRIMITIVE_ADJUST(kNameReduceAll, MoveAttrMapReduce)
diff --git a/mindspore/lite/tools/converter/import/primitive_adjust.h b/mindspore/lite/tools/converter/import/primitive_adjust.h
index 9b810a452aa..8596059fdba 100644
--- a/mindspore/lite/tools/converter/import/primitive_adjust.h
+++ b/mindspore/lite/tools/converter/import/primitive_adjust.h
@@ -24,7 +24,7 @@
 #include "tools/converter/converter_flags.h"
 #include "tools/optimizer/common/gllo_utils.h"
 
-using mindspore::lite::converter::FmkType;
+using mindspore::converter::FmkType;
 namespace mindspore {
 namespace lite {
 typedef int (*PrimitiveAdjustCreator)(const CNodePtr &value_node);
@@ -71,7 +71,7 @@ class PrimitiveAdjust {
   bool Run(const FuncGraphPtr &func_graph);
 
  protected:
-  FmkType fmk_type_ = FmkType::FmkType_MS;
+  FmkType fmk_type_ = FmkType::kFmkTypeMs;
 };
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/tools/converter/legacy_optimizer/graph/batchnorm_convert_scale_pass.cc b/mindspore/lite/tools/converter/legacy_optimizer/graph/batchnorm_convert_scale_pass.cc
index 2f6a27ed99b..566078df01e 100644
--- a/mindspore/lite/tools/converter/legacy_optimizer/graph/batchnorm_convert_scale_pass.cc
+++ b/mindspore/lite/tools/converter/legacy_optimizer/graph/batchnorm_convert_scale_pass.cc
@@ -240,7 +240,7 @@ STATUS BatchNormConvertScalePass::GetBnWeightTensors(MetaGraphT *graph, BNWeight
   MS_ASSERT(graph->allTensors.size() > bnNode->inputIndex.at(1));
   auto bnWeightTensorIdxes = bnNode->inputIndex;
   bnWeightTensorIdxes.erase(bnWeightTensorIdxes.begin());
-  if (fmkType == converter::FmkType_CAFFE) {
+  if (fmkType == converter::kFmkTypeCaffe) {
     bnWeightTensors->meanTensor = graph->allTensors.at(bnWeightTensorIdxes[CAFFE_BATCHNORM_MEAN_INDEX]).get();
     bnWeightTensors->varianceTensor = graph->allTensors.at(bnWeightTensorIdxes[CAFFE_BATCHNORM_VARIANCE_INDEX]).get();
   } else {
@@ -258,7 +258,7 @@ STATUS BatchNormConvertScalePass::GetBnWeightTensors(MetaGraphT *graph, BNWeight
     MS_LOG(ERROR) << "BatchNorm's variance tensor is nullptr";
     return RET_ERROR;
   }
-  if (fmkType == converter::FmkType_CAFFE) {
+  if (fmkType == converter::kFmkTypeCaffe) {
     auto scaleTensor = graph->allTensors.at(bnWeightTensorIdxes[CAFFE_BATCHNORM_SCALE_INDEX]).get();
     // calibrate mean and variance
     float scale_factor_data = (reinterpret_cast<float *>(scaleTensor->data.data()))[0];
diff --git a/mindspore/lite/tools/converter/legacy_optimizer/graph/batchnorm_convert_scale_pass.h b/mindspore/lite/tools/converter/legacy_optimizer/graph/batchnorm_convert_scale_pass.h
index 3844f660975..8c3936103c0 100644
--- a/mindspore/lite/tools/converter/legacy_optimizer/graph/batchnorm_convert_scale_pass.h
+++ b/mindspore/lite/tools/converter/legacy_optimizer/graph/batchnorm_convert_scale_pass.h
@@ -61,7 +61,7 @@ class BatchNormConvertScalePass : public GraphPass {
   float *transBias = nullptr;
   std::unique_ptr<TensorT> newScaleWeightTensor = nullptr;
   std::unique_ptr<TensorT> newScaleBiasTensor = nullptr;
-  converter::FmkType fmkType = converter::FmkType_TF;
+  converter::FmkType fmkType = converter::kFmkTypeTf;
 };
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/tools/converter/legacy_optimizer/graph/infershape_pass.cc b/mindspore/lite/tools/converter/legacy_optimizer/graph/infershape_pass.cc
index 6c320553eb2..5f30de1f233 100644
--- a/mindspore/lite/tools/converter/legacy_optimizer/graph/infershape_pass.cc
+++ b/mindspore/lite/tools/converter/legacy_optimizer/graph/infershape_pass.cc
@@ -30,7 +30,7 @@
 #include "tools/converter/converter_flags.h"
 #include "src/common/string_util.h"
 
-using mindspore::lite::converter::FmkType_TF;
+using mindspore::converter::kFmkTypeTf;
 namespace mindspore {
 namespace lite {
 namespace {
@@ -203,7 +203,7 @@ STATUS NodeInferShape(const std::unique_ptr<schema::CNodeT> &node, const std::ve
     return RET_ERROR;
   }
 
-  auto ret = KernelInferShape(inputs, *outputs, prim, {});
+  auto ret = KernelInferShape(inputs, *outputs, prim, {}, SCHEMA_CUR);
   if (ret == lite::RET_NOT_SUPPORT) {
     auto parameter_gen = lite::PopulateRegistry::GetInstance()->GetParameterCreator(prim->value_type(), SCHEMA_CUR);
     if (parameter_gen == nullptr) {
diff --git a/mindspore/lite/tools/converter/legacy_optimizer/graph/infershape_pass.h b/mindspore/lite/tools/converter/legacy_optimizer/graph/infershape_pass.h
index 477876275b6..3fc4280dbd8 100644
--- a/mindspore/lite/tools/converter/legacy_optimizer/graph/infershape_pass.h
+++ b/mindspore/lite/tools/converter/legacy_optimizer/graph/infershape_pass.h
@@ -26,7 +26,7 @@
 #include "tools/converter/optimizer.h"
 #include "tools/converter/converter_flags.h"
 
-using mindspore::lite::converter::FmkType_TF;
+using mindspore::converter::kFmkTypeTf;
 using mindspore::schema::TensorT;
 namespace mindspore {
 namespace lite {
@@ -59,7 +59,7 @@ class InferShapePass : public GraphPass {
   void InitInferTensor(MetaGraphT *graph);
   int InferSubgraph(const int &subgraph_index, MetaGraphT *graph);
 
-  lite::converter::FmkType fmk_type_ = FmkType_TF;
+  converter::FmkType fmk_type_ = kFmkTypeTf;
   std::vector<InferTensor> tensors_ = {};
 };
 }  // namespace lite
diff --git a/mindspore/lite/tools/converter/model_parser.h b/mindspore/lite/tools/converter/model_parser.h
index 11f3be07e43..ce4d033c8aa 100644
--- a/mindspore/lite/tools/converter/model_parser.h
+++ b/mindspore/lite/tools/converter/model_parser.h
@@ -25,8 +25,7 @@
 #include "include/registry/model_parser_registry.h"
 #include "utils/log_adapter.h"
 
-namespace mindspore::lite {
-using namespace schema;
+namespace mindspore::converter {
 class ModelParser {
  public:
   ModelParser() = default;
@@ -50,6 +49,6 @@ ModelParser *LiteModelParserCreator() {
   }
   return parser;
 }
-}  // namespace mindspore::lite
+}  // namespace mindspore::converter
 
 #endif
diff --git a/mindspore/lite/tools/converter/optimizer_manager.cc b/mindspore/lite/tools/converter/optimizer_manager.cc
index 5ba17213fdd..354bfbd8128 100644
--- a/mindspore/lite/tools/converter/optimizer_manager.cc
+++ b/mindspore/lite/tools/converter/optimizer_manager.cc
@@ -18,51 +18,41 @@
 #include <string>
 #include <vector>
 #include "backend/optimizer/common/pass.h"
-#include "tools/converter/registry/pass_content.h"
 
 namespace mindspore {
-namespace opt {
-bool RunOptimizerPass(const FuncGraphPtr &func_graph, std::vector<std::string> pass_names) {
+namespace lite {
+bool RunOptimizerPass(const FuncGraphPtr &func_graph, const std::vector<std::string> &pass_names) {
   if (func_graph == nullptr) {
     MS_LOG(ERROR) << "func graph is nullptr.";
     return false;
   }
-  auto &passes_info = PassStoreRoomInfo();
-  for (auto &name : pass_names) {
-    if (passes_info.find(name) == passes_info.end()) {
-      MS_LOG(ERROR) << "cannot find required pass.";
-      return false;
-    }
-    if (!passes_info[name]->Run(func_graph)) {
-      MS_LOG(ERROR) << "run pass failed, pass name is " << name;
+  auto schedule_passes = registry::PassRegistry::GetPassFromStoreRoom(pass_names);
+  if (schedule_passes.size() != pass_names.size()) {
+    MS_LOG(ERROR) << "exited pass cannot be obtained.";
+    return false;
+  }
+  int index = 0;
+  for (auto &pass : schedule_passes) {
+    if (!pass->Run(func_graph)) {
+      MS_LOG(ERROR) << "run pass failed, pass name is " << pass_names[index];
       return false;
     }
+    ++index;
   }
   return true;
 }
 
-bool RunExternalPass(const FuncGraphPtr &func_graph, PassPosition position) {
+bool RunExternalPass(const FuncGraphPtr &func_graph, registry::PassPosition position) {
   if (func_graph == nullptr) {
     MS_LOG(ERROR) << "func graph is nullptr.";
     return false;
   }
-  auto &external_assigned = ExternalAssignedPassesInfo();
-  if (external_assigned.find(position) == external_assigned.end()) {
-    MS_LOG(DEBUG) << "there is no external pass in current position, position is " << position;
-    return true;
-  }
-  auto &passes_info = PassStoreRoomInfo();
-  for (auto &name : external_assigned[position]) {
-    if (passes_info.find(name) == passes_info.end()) {
-      MS_LOG(ERROR) << "cannot find required pass.";
-      return false;
-    }
-    if (!passes_info[name]->Run(func_graph)) {
-      MS_LOG(ERROR) << "run pass failed, pass name is " << name;
-      return false;
-    }
+  auto schedule_task = registry::PassRegistry::GetOuterScheduleTask(position);
+  if (!RunOptimizerPass(func_graph, schedule_task)) {
+    MS_LOG(ERROR) << "run external scheduled task failed.";
+    return false;
   }
   return true;
 }
-}  // namespace opt
+}  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/tools/converter/optimizer_manager.h b/mindspore/lite/tools/converter/optimizer_manager.h
index 09e761f3de4..f9f32ac61a6 100644
--- a/mindspore/lite/tools/converter/optimizer_manager.h
+++ b/mindspore/lite/tools/converter/optimizer_manager.h
@@ -23,10 +23,10 @@
 #include "ir/func_graph.h"
 
 namespace mindspore {
-namespace opt {
-bool RunOptimizerPass(const FuncGraphPtr &func_graph, std::vector<std::string> pass_names);
-bool RunExternalPass(const FuncGraphPtr &func_graph, PassPosition position);
-}  // namespace opt
+namespace lite {
+bool RunOptimizerPass(const FuncGraphPtr &func_graph, const std::vector<std::string> &pass_names);
+bool RunExternalPass(const FuncGraphPtr &func_graph, registry::PassPosition position);
+}  // namespace lite
 }  // namespace mindspore
 
 #endif  // MINDSPORE_LITE_TOOLS_CONVERTER_OPTIMIZER_MANAGER_H
diff --git a/mindspore/lite/tools/converter/parser/caffe/caffe_inspector.cc b/mindspore/lite/tools/converter/parser/caffe/caffe_inspector.cc
index fabbe6ffa26..fec4a5d1568 100644
--- a/mindspore/lite/tools/converter/parser/caffe/caffe_inspector.cc
+++ b/mindspore/lite/tools/converter/parser/caffe/caffe_inspector.cc
@@ -16,6 +16,7 @@
 
 #include "tools/converter/parser/caffe/caffe_inspector.h"
 #include "src/common/log_adapter.h"
+#include "src/common/utils.h"
 
 namespace mindspore {
 namespace lite {
@@ -48,13 +49,13 @@ STATUS CaffeInspector::ParseInput() {
 
 STATUS CaffeInspector::FindGraphInputsAndOutputs() {
   for (const auto &iter : layerBottoms) {
-    if (layerTops.find(iter) == layerTops.end()) {
+    if (!IsContain(layerTops, iter)) {
       graphInput.insert(iter);
     }
   }
   for (const auto &iter : layerTops) {
-    if (layerBottoms.find(iter) == layerBottoms.end()) {
-      graphOutput.insert(iter);
+    if (layerBottoms.find(iter) == layerBottoms.end() && !IsContain(graphOutput, iter)) {
+      graphOutput.push_back(iter);
     }
   }
   return RET_OK;
@@ -70,7 +71,9 @@ STATUS CaffeInspector::SetLayerTopsAndBottoms() {
       graphInput.insert(layer.top(0));
     }
     for (int j = 0; j < layer.top_size(); j++) {
-      layerTops.insert(layer.top(j));
+      if (!IsContain(layerTops, layer.top(j))) {
+        layerTops.push_back(layer.top(j));
+      }
     }
     for (int j = 0; j < layer.bottom_size(); j++) {
       layerBottoms.insert(layer.bottom(j));
diff --git a/mindspore/lite/tools/converter/parser/caffe/caffe_inspector.h b/mindspore/lite/tools/converter/parser/caffe/caffe_inspector.h
index bb2a6dffeec..76432e7b4de 100644
--- a/mindspore/lite/tools/converter/parser/caffe/caffe_inspector.h
+++ b/mindspore/lite/tools/converter/parser/caffe/caffe_inspector.h
@@ -21,6 +21,7 @@
 #include <string>
 #include <unordered_map>
 #include <memory>
+#include <vector>
 #include "proto/caffe.pb.h"
 #include "include/errorcode.h"
 
@@ -37,16 +38,16 @@ class CaffeInspector {
   STATUS SetLayerTopsAndBottoms();
 
   std::set<std::string> GetGraphInput() { return graphInput; }
-  std::set<std::string> GetGraphOutput() { return graphOutput; }
+  std::vector<std::string> GetGraphOutput() { return graphOutput; }
 
  private:
   caffe::NetParameter net;
 
-  std::set<std::string> layerTops;
+  std::vector<std::string> layerTops;
   std::set<std::string> layerBottoms;
 
   std::set<std::string> graphInput;
-  std::set<std::string> graphOutput;
+  std::vector<std::string> graphOutput;
 };
 
 using CaffeInspectorPtr = std::shared_ptr<CaffeInspector>;
diff --git a/mindspore/lite/tools/converter/parser/caffe/caffe_model_parser.cc b/mindspore/lite/tools/converter/parser/caffe/caffe_model_parser.cc
index 18564ad112f..7254a69d10c 100644
--- a/mindspore/lite/tools/converter/parser/caffe/caffe_model_parser.cc
+++ b/mindspore/lite/tools/converter/parser/caffe/caffe_model_parser.cc
@@ -33,7 +33,7 @@
 #include "tools/optimizer/common/gllo_utils.h"
 #include "tools/converter/parser/unify_format.h"
 
-using mindspore::lite::converter::FmkType_CAFFE;
+using mindspore::converter::kFmkTypeCaffe;
 namespace mindspore::lite {
 namespace {
 namespace {
@@ -77,9 +77,9 @@ CaffeModelParser::CaffeModelParser() = default;
 CaffeModelParser::~CaffeModelParser() = default;
 
 FuncGraphPtr CaffeModelParser::Parse(const converter::ConverterParameters &flag) {
-  auto model_file = flag.model_file_;
-  auto weight_file = flag.weight_file_;
-  quant_type_ = flag.quant_type_;
+  auto model_file = flag.model_file;
+  auto weight_file = flag.weight_file;
+  quant_type_ = flag.quant_type;
   STATUS status = InitOriginModel(model_file, weight_file);
   if (status != RET_OK) {
     ReturnCode::GetSingleReturnCode()->UpdateReturnCode(status);
@@ -104,7 +104,7 @@ FuncGraphPtr CaffeModelParser::Parse(const converter::ConverterParameters &flag)
     return nullptr;
   }
   res_graph_->set_attr("graph_name", MakeValue("main_graph"));
-  res_graph_->set_attr("fmk", MakeValue(static_cast<int>(converter::FmkType_CAFFE)));
+  res_graph_->set_attr("fmk", MakeValue(static_cast<int>(converter::kFmkTypeCaffe)));
   std::set<FuncGraphPtr> all_func_graphs = {};
   GetAllFuncGraph(res_graph_, &all_func_graphs);
   if ((status = CommonAnfAdjust(all_func_graphs)) != RET_OK) {
@@ -112,7 +112,7 @@ FuncGraphPtr CaffeModelParser::Parse(const converter::ConverterParameters &flag)
     ReturnCode::GetSingleReturnCode()->UpdateReturnCode(status);
     return nullptr;
   }
-  auto unify_format = std::make_shared<UnifyFormatToNHWC>(lite::converter::FmkType_CAFFE, false, quant_type_);
+  auto unify_format = std::make_shared<UnifyFormatToNHWC>(converter::kFmkTypeCaffe, false, quant_type_);
   if (!unify_format->Run(res_graph_)) {
     MS_LOG(ERROR) << "Run insert transpose failed.";
     return nullptr;
@@ -243,6 +243,10 @@ STATUS CaffeModelParser::ConvertGraphInputsOfLayer() {
   for (int i = 0; i < caffe_model_.layer_size(); i++) {
     auto layer = caffe_model_.layer(i);
     if (layer.type() == "Input") {
+      if (layer.bottom_size() != 0) {
+        MS_LOG(ERROR) << "The input layer should not have inputs";
+        return RET_ERROR;
+      }
       auto parameter = res_graph_->add_parameter();
       std::vector<int64_t> shape = ConverterContext::GetInstance()->GetGraphInputTensorShape(layer.name());
       if (ConverterContext::GetInstance()->GetGraphInputTensorShapeMapSize() > 0 && shape.empty()) {
@@ -259,7 +263,8 @@ STATUS CaffeModelParser::ConvertGraphInputsOfLayer() {
         return RET_ERROR;
       }
       parameter->set_abstract(abstract);
-      parameter->set_name("graph_input-" + std::to_string(i));
+      parameter->set_name(layer.name());
+      ConverterContext::GetInstance()->AddGraphInputTensorNames(layer.name());
       nodes_.insert(std::pair(layer.top(0), parameter));
     }
   }
@@ -291,7 +296,8 @@ STATUS CaffeModelParser::ConvertGraphInputsOfShape() {
       return RET_ERROR;
     }
     parameter->set_abstract(abstract);
-    parameter->set_name("graph_input-" + caffe_model_.input(i));
+    parameter->set_name(caffe_model_.input(i));
+    ConverterContext::GetInstance()->AddGraphInputTensorNames(caffe_model_.input(i));
     nodes_.insert(std::pair(caffe_model_.input(i), parameter));
   }
   return RET_OK;
@@ -323,7 +329,8 @@ STATUS CaffeModelParser::ConvertGraphInputsOfDim() {
       return RET_ERROR;
     }
     parameter->set_abstract(abstract);
-    parameter->set_name("graph_input-" + caffe_model_.input(i));
+    parameter->set_name(caffe_model_.input(i));
+    ConverterContext::GetInstance()->AddGraphInputTensorNames(caffe_model_.input(i));
     nodes_.insert(std::pair(caffe_model_.input(i), parameter));
   }
   return RET_OK;
@@ -334,12 +341,17 @@ STATUS CaffeModelParser::ConvertGraphInputs() {
   if (ret != RET_OK) {
     return ret;
   }
-  if (caffe_model_.input_dim_size() > 0) {
-    return ConvertGraphInputsOfDim();
-  } else {
-    return ConvertGraphInputsOfShape();
+  ret = ConvertGraphInputsOfShape();
+  if (ret != RET_OK) {
+    return ret;
   }
-  return ret;
+  if (caffe_model_.input_dim_size() > 0) {
+    ret = ConvertGraphInputsOfDim();
+    if (ret != RET_OK) {
+      return ret;
+    }
+  }
+  return RET_OK;
 }
 
 STATUS CaffeModelParser::ConvertGraphOutputs() {
@@ -385,11 +397,11 @@ STATUS CaffeModelParser::ConvertGraphOutputs() {
     }
     auto valueNode = NewValueNode(returnPrim);
     std::vector<AnfNodePtr> opInputs{valueNode};
-    if (nodes_.find(*caffeInspector.GetGraphOutput().begin()) == nodes_.end()) {
+    if (nodes_.find(caffeInspector.GetGraphOutput().front()) == nodes_.end()) {
       MS_LOG(ERROR) << "Can't find input node.";
       return RET_NOT_FIND_OP;
     }
-    auto cnode = nodes_.find(*caffeInspector.GetGraphOutput().begin())->second;
+    auto cnode = nodes_.find(caffeInspector.GetGraphOutput().front())->second;
     if (cnode == nullptr) {
       MS_LOG(ERROR) << "Can't find input node.";
       return RET_NOT_FIND_OP;
@@ -399,6 +411,8 @@ STATUS CaffeModelParser::ConvertGraphOutputs() {
     returnCnode->set_fullname_with_scope("Return");
     res_graph_->set_return(returnCnode);
   }
+  // save original output tensor names.
+  ConverterContext::GetInstance()->SetGraphOutputTensorNames(caffeInspector.GetGraphOutput());
   return RET_OK;
 }
 
@@ -553,5 +567,5 @@ std::string CaffeModelParser::GetOriginLayerName(const std::string &layer_name)
   }
   return layer.name();
 }
-REG_MODEL_PARSER(FmkType_CAFFE, LiteModelParserCreator<CaffeModelParser>)
+REG_MODEL_PARSER(kFmkTypeCaffe, converter::LiteModelParserCreator<CaffeModelParser>)
 }  // namespace mindspore::lite
diff --git a/mindspore/lite/tools/converter/parser/caffe/caffe_model_parser.h b/mindspore/lite/tools/converter/parser/caffe/caffe_model_parser.h
index 91a6c28a303..e7a0746fa73 100644
--- a/mindspore/lite/tools/converter/parser/caffe/caffe_model_parser.h
+++ b/mindspore/lite/tools/converter/parser/caffe/caffe_model_parser.h
@@ -28,7 +28,7 @@
 
 using STATUS = int;
 namespace mindspore::lite {
-class CaffeModelParser : public ModelParser {
+class CaffeModelParser : public converter::ModelParser {
  public:
   CaffeModelParser();
 
@@ -66,7 +66,7 @@ class CaffeModelParser : public ModelParser {
   caffe::NetParameter caffe_weight_;
   std::unordered_map<std::string, caffe::LayerParameter> caffe_layers_;
   std::unordered_map<std::string, AnfNodePtr> nodes_;
-  QuantType quant_type_ = schema::QuantType_QUANT_NONE;
+  schema::QuantType quant_type_ = schema::QuantType_QUANT_NONE;
 };
 }  // namespace mindspore::lite
 
diff --git a/mindspore/lite/tools/converter/parser/onnx/onnx_inputs_adjust.h b/mindspore/lite/tools/converter/parser/onnx/onnx_inputs_adjust.h
index 26d6071acc2..da3de8a1bb0 100644
--- a/mindspore/lite/tools/converter/parser/onnx/onnx_inputs_adjust.h
+++ b/mindspore/lite/tools/converter/parser/onnx/onnx_inputs_adjust.h
@@ -22,7 +22,7 @@
 #include "tools/converter/converter_flags.h"
 #include "tools/optimizer/common/gllo_utils.h"
 
-using mindspore::lite::converter::FmkType;
+using mindspore::converter::FmkType;
 namespace mindspore::lite {
 class OnnxInputAdjust {
  public:
diff --git a/mindspore/lite/tools/converter/parser/onnx/onnx_model_parser.cc b/mindspore/lite/tools/converter/parser/onnx/onnx_model_parser.cc
index 948cb8fbf48..40ca3fcf922 100644
--- a/mindspore/lite/tools/converter/parser/onnx/onnx_model_parser.cc
+++ b/mindspore/lite/tools/converter/parser/onnx/onnx_model_parser.cc
@@ -37,7 +37,7 @@
 #include "ops/transpose.h"
 #include "tools/converter/parser/unify_format.h"
 
-using mindspore::lite::converter::FmkType_ONNX;
+using mindspore::converter::kFmkTypeOnnx;
 namespace mindspore {
 namespace lite {
 namespace {
@@ -59,8 +59,8 @@ std::unordered_map<int, mindspore::TypeId> TYPE_MAP = {
   {onnx::TensorProto_DataType_BOOL, mindspore::kNumberTypeBool}};
 
 FuncGraphPtr OnnxModelParser::Parse(const converter::ConverterParameters &flag) {
-  string model_file = flag.model_file_;
-  quant_type_ = flag.quant_type_;
+  string model_file = flag.model_file;
+  quant_type_ = flag.quant_type;
   NotSupportOp::GetInstance()->set_fmk_type("ONNX");
   res_graph_ = std::make_shared<FuncGraph>();
   auto status = InitOriginModel(model_file);
@@ -79,10 +79,10 @@ FuncGraphPtr OnnxModelParser::Parse(const converter::ConverterParameters &flag)
   static auto root_func_manager = Manage(res_graph_);
   for (auto &subgraph : all_subgraphs_) {
     subgraph->set_manager(root_func_manager);
-    subgraph->set_attr("fmk", MakeValue(static_cast<int>(converter::FmkType_ONNX)));
+    subgraph->set_attr("fmk", MakeValue(static_cast<int>(converter::kFmkTypeOnnx)));
   }
   res_graph_->set_attr("graph_name", MakeValue("main_graph"));
-  res_graph_->set_attr("fmk", MakeValue(static_cast<int>(converter::FmkType_ONNX)));
+  res_graph_->set_attr("fmk", MakeValue(static_cast<int>(converter::kFmkTypeOnnx)));
   std::set<FuncGraphPtr> all_func_graphs = {};
   GetAllFuncGraph(res_graph_, &all_func_graphs);
   if ((status = CommonAnfAdjust(all_func_graphs)) != RET_OK) {
@@ -95,7 +95,7 @@ FuncGraphPtr OnnxModelParser::Parse(const converter::ConverterParameters &flag)
     ReturnCode::GetSingleReturnCode()->UpdateReturnCode(status);
     return nullptr;
   }
-  auto unify_format = std::make_shared<UnifyFormatToNHWC>(lite::converter::FmkType_ONNX, false, quant_type_);
+  auto unify_format = std::make_shared<UnifyFormatToNHWC>(converter::kFmkTypeOnnx, false, quant_type_);
   if (!unify_format->Run(res_graph_)) {
     MS_LOG(ERROR) << "Run insert transpose failed.";
     return nullptr;
@@ -118,7 +118,7 @@ STATUS OnnxModelParser::InitOriginModel(const std::string &model_file) {
   }
   OnnxNodeParser::set_opset_version(onnx_model_.opset_import().Get(0).version());
   onnx_root_graph_ = onnx_model_.graph();
-  res_graph_->set_attr("fmk", MakeValue(static_cast<int>(converter::FmkType_ONNX)));
+  res_graph_->set_attr("fmk", MakeValue(static_cast<int>(converter::kFmkTypeOnnx)));
   return RET_OK;
 }
 STATUS OnnxModelParser::ConvertOnnxGraph(const onnx::GraphProto &onnx_graph, const FuncGraphPtr &anf_graph,
@@ -157,6 +157,13 @@ STATUS OnnxModelParser::ConvertOnnxGraph(const onnx::GraphProto &onnx_graph, con
     MS_LOG(ERROR) << "convert graph outputs failed.";
     return RET_ERROR;
   }
+  // save original output tensor names.
+  if (root_node_name == "root_node") {
+    std::vector<std::string> output_names;
+    std::transform(onnx_graph.output().begin(), onnx_graph.output().end(), std::back_inserter(output_names),
+                   [](auto &graph_output) { return graph_output.name(); });
+    ConverterContext::GetInstance()->SetGraphOutputTensorNames(output_names);
+  }
   return status;
 }
 STATUS OnnxModelParser::ConvertConstTensors(const onnx::GraphProto &onnx_graph, const FuncGraphPtr &func_graph_ptr,
@@ -214,6 +221,7 @@ STATUS OnnxModelParser::ConvertGraphInputs(const onnx::GraphProto &onnx_graph, c
     }
     parameter->set_abstract(abstract_tensor);
     parameter->set_name(input_value.name());
+    ConverterContext::GetInstance()->AddGraphInputTensorNames(input_value.name());
     anf_nodes_map->emplace(input_value.name(), parameter);
   }
   return RET_OK;
@@ -246,7 +254,7 @@ STATUS OnnxModelParser::ConvertNodes(const onnx::GraphProto &onnx_graph, const F
       continue;
     }
     if (primitive_c->GetAttr(ops::kFormat) == nullptr) {
-      primitive_c->AddAttr(mindspore::ops::kFormat, MakeValue<int64_t>(Format_NCHW));
+      primitive_c->AddAttr(mindspore::ops::kFormat, MakeValue<int64_t>(mindspore::NCHW));
     }
     status = ConvertOpQuantParams(onnx_node, primitive_c);
     if (status != RET_OK) {
@@ -1246,6 +1254,6 @@ int OnnxModelParser::Onnx2AnfAdjust(const std::set<FuncGraphPtr> &all_func_graph
   return RET_OK;
 }
 
-REG_MODEL_PARSER(FmkType_ONNX, LiteModelParserCreator<OnnxModelParser>)
+REG_MODEL_PARSER(kFmkTypeOnnx, converter::LiteModelParserCreator<OnnxModelParser>)
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/tools/converter/parser/onnx/onnx_model_parser.h b/mindspore/lite/tools/converter/parser/onnx/onnx_model_parser.h
index 10ea0de5781..11c04d3ba12 100644
--- a/mindspore/lite/tools/converter/parser/onnx/onnx_model_parser.h
+++ b/mindspore/lite/tools/converter/parser/onnx/onnx_model_parser.h
@@ -36,7 +36,7 @@
 
 namespace mindspore {
 namespace lite {
-class OnnxModelParser : public ModelParser {
+class OnnxModelParser : public converter::ModelParser {
  public:
   OnnxModelParser() = default;
 
@@ -99,7 +99,7 @@ class OnnxModelParser : public ModelParser {
   std::unordered_map<std::string, AnfNodePtr> anf_nodes_map_;
   std::unordered_map<std::string, std::unordered_map<std::string, AnfNodePtr> *> control_nodes_map_;
   std::unordered_map<std::string, std::string> child_root_map_;  // for nest control flow node
-  QuantType quant_type_ = schema::QuantType_QUANT_NONE;
+  schema::QuantType quant_type_ = schema::QuantType_QUANT_NONE;
 };
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/tools/converter/parser/tf/functionalize_cond.cc b/mindspore/lite/tools/converter/parser/tf/functionalize_cond.cc
index ea341513d31..e0ce89c6281 100644
--- a/mindspore/lite/tools/converter/parser/tf/functionalize_cond.cc
+++ b/mindspore/lite/tools/converter/parser/tf/functionalize_cond.cc
@@ -79,7 +79,9 @@ STATUS FunctionalizeCond::BranchSubGraphAddNodes(const FuncGraphPtr &graph, cons
     } else {
       graph->AddNode(node);
     }
-    node->set_func_graph(graph);
+    if (!utils::isa<ValueNodePtr>(node)) {
+      node->set_func_graph(graph);
+    }
     if (utils::isa<CNodePtr>(node)) {
       auto cnode = utils::cast<CNodePtr>(node);
       for (size_t i = 1; i < cnode->inputs().size(); i++) {
@@ -133,7 +135,7 @@ STATUS FunctionalizeCond::IdentifySubgraphInput(const FuncGraphPtr &graph, std::
 }
 
 FuncGraphPtr FunctionalizeCond::CreateBranchGraph(const AnfNodePtr &node, std::string name, BranchType branch_type) {
-  auto graph = FunctionalizeControlOpPass::NewFuncGraph(name, mindspore::lite::converter::FmkType_TF);
+  auto graph = FunctionalizeControlOpPass::NewFuncGraph(name, converter::kFmkTypeTf);
   if (graph == nullptr) {
     MS_LOG(ERROR) << "new graph Partial Node return nullptr";
     return nullptr;
diff --git a/mindspore/lite/tools/converter/parser/tf/functionalize_cond.h b/mindspore/lite/tools/converter/parser/tf/functionalize_cond.h
index 602434c49bb..856d12c2cfe 100644
--- a/mindspore/lite/tools/converter/parser/tf/functionalize_cond.h
+++ b/mindspore/lite/tools/converter/parser/tf/functionalize_cond.h
@@ -25,7 +25,7 @@
 #include "tools/optimizer/common/gllo_utils.h"
 #include "tools/converter/parser/tf/functionalize_control_op_pass.h"
 
-using mindspore::lite::converter::FmkType;
+using mindspore::converter::FmkType;
 namespace mindspore::opt {
 
 typedef enum { kThenBranch = 0, kElseBranch = 1 } BranchType;
diff --git a/mindspore/lite/tools/converter/parser/tf/functionalize_control_op_pass.h b/mindspore/lite/tools/converter/parser/tf/functionalize_control_op_pass.h
index 2fb4c27096a..e1872173f42 100644
--- a/mindspore/lite/tools/converter/parser/tf/functionalize_control_op_pass.h
+++ b/mindspore/lite/tools/converter/parser/tf/functionalize_control_op_pass.h
@@ -26,7 +26,7 @@
 #include "tools/converter/ops/ops_def.h"
 #include "tools/optimizer/common/gllo_utils.h"
 
-using mindspore::lite::converter::FmkType;
+using mindspore::converter::FmkType;
 namespace mindspore::opt {
 using AimFunc = std::function<bool(const AnfNodePtr &)>;
 class FunctionalizeControlOpPass : public Pass {
diff --git a/mindspore/lite/tools/converter/parser/tf/functionalize_while.cc b/mindspore/lite/tools/converter/parser/tf/functionalize_while.cc
index 8bc518ab4b8..882ef2626d9 100644
--- a/mindspore/lite/tools/converter/parser/tf/functionalize_while.cc
+++ b/mindspore/lite/tools/converter/parser/tf/functionalize_while.cc
@@ -297,7 +297,9 @@ STATUS FunctionalizeWhile::CondSubgraphAddNodes() {
     } else {
       cond_sub_func_graph_->AddNode(node);
     }
-    node->set_func_graph(cond_sub_func_graph_);
+    if (!utils::isa<ValueNodePtr>(node)) {
+      node->set_func_graph(cond_sub_func_graph_);
+    }
     if (utils::isa<CNodePtr>(node)) {
       auto cnode = utils::cast<CNodePtr>(node);
       for (size_t i = 1; i < cnode->inputs().size(); i++) {
@@ -367,8 +369,7 @@ STATUS FunctionalizeWhile::IdentifyCondSubgraphOutput() {
 
 STATUS FunctionalizeWhile::BuildCondGraph() {
   cond_subgraph_name_ = FunctionalizeControlOpPass::NodeClusterName(loop_cond_node_) + "_cond";
-  cond_sub_func_graph_ =
-    FunctionalizeControlOpPass::NewFuncGraph(cond_subgraph_name_, mindspore::lite::converter::FmkType_TF);
+  cond_sub_func_graph_ = FunctionalizeControlOpPass::NewFuncGraph(cond_subgraph_name_, converter::kFmkTypeTf);
   if (cond_sub_func_graph_ == nullptr) {
     MS_LOG(ERROR) << "new cond_sub_func_graph_ return nullptr";
     return RET_NULL_PTR;
@@ -419,7 +420,9 @@ STATUS FunctionalizeWhile::BodySubgraphAddNodes() {
     } else {
       body_sub_func_graph_->AddNode(node);
     }
-    node->set_func_graph(body_sub_func_graph_);
+    if (!utils::isa<ValueNodePtr>(node)) {
+      node->set_func_graph(body_sub_func_graph_);
+    }
     if (utils::isa<CNodePtr>(node)) {
       auto cnode = utils::cast<CNodePtr>(node);
       for (size_t i = 1; i < cnode->inputs().size(); i++) {
@@ -523,8 +526,7 @@ STATUS FunctionalizeWhile::IdentifyBodySubgraphOutput() {
 
 STATUS FunctionalizeWhile::BuildBodyGraph() {
   body_subgraph_name_ = FunctionalizeControlOpPass::NodeClusterName(loop_cond_node_) + "_body";
-  body_sub_func_graph_ =
-    FunctionalizeControlOpPass::NewFuncGraph(body_subgraph_name_, mindspore::lite::converter::FmkType_TF);
+  body_sub_func_graph_ = FunctionalizeControlOpPass::NewFuncGraph(body_subgraph_name_, converter::kFmkTypeTf);
   if (body_sub_func_graph_ == nullptr) {
     MS_LOG(ERROR) << "new body_sub_func_graph_ return nullptr";
     return RET_NULL_PTR;
diff --git a/mindspore/lite/tools/converter/parser/tf/functionalize_while.h b/mindspore/lite/tools/converter/parser/tf/functionalize_while.h
index 8b8e18a8c49..3e58daaec3f 100644
--- a/mindspore/lite/tools/converter/parser/tf/functionalize_while.h
+++ b/mindspore/lite/tools/converter/parser/tf/functionalize_while.h
@@ -25,7 +25,7 @@
 #include "tools/optimizer/common/gllo_utils.h"
 #include "tools/converter/parser/tf/functionalize_control_op_pass.h"
 
-using mindspore::lite::converter::FmkType;
+using mindspore::converter::FmkType;
 namespace mindspore::opt {
 
 constexpr const int POS_INVALID = -1;
diff --git a/mindspore/lite/tools/converter/parser/tf/tf_conv_base_parser.cc b/mindspore/lite/tools/converter/parser/tf/tf_conv_base_parser.cc
index 28666d007ed..42cb2e56f95 100644
--- a/mindspore/lite/tools/converter/parser/tf/tf_conv_base_parser.cc
+++ b/mindspore/lite/tools/converter/parser/tf/tf_conv_base_parser.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,14 @@
 #include "schema/inner/model_generated.h"
 namespace mindspore {
 namespace lite {
+
+namespace {
+constexpr size_t kPadDims = 4;
+constexpr size_t kExplicitPaddingsDims = 8;
+constexpr size_t NHWCTopPadPos = 2;
+constexpr size_t NCHWTopPadPos = 4;
+}  // namespace
+
 STATUS TFConvBaseParser::ParseKernels(const tensorflow::NodeDef &node_def, const mindspore::Format &format,
                                       std::vector<int64_t> *kernel) {
   tensorflow::AttrValue attr_value;
@@ -60,6 +68,33 @@ STATUS TFConvBaseParser::ParseStrides(const tensorflow::NodeDef &node_def, const
   return RET_OK;
 }
 
+STATUS TFConvBaseParser::ParseExplicitPaddings(const tensorflow::NodeDef &node_def, const mindspore::Format &format,
+                                               std::vector<int64_t> *explicit_paddings) {
+  MS_ASSERT(explicit_paddings != nullptr);
+  tensorflow::AttrValue attr_value;
+  if (!TensorFlowUtils::FindAttrValue(node_def, "explicit_paddings", &attr_value)) {
+    MS_LOG(ERROR) << "The explicit paddings value should be specified";
+    return RET_ERROR;
+  } else {
+    auto explicit_paddings_list = attr_value.list();
+    if (explicit_paddings_list.i_size() != kExplicitPaddingsDims) {
+      MS_LOG(ERROR) << "The explicit paddings attr should contain only 8 elements";
+      return RET_ERROR;
+    }
+    explicit_paddings->clear();
+    if (format == mindspore::NHWC) {
+      for (size_t i = 0; i < kPadDims; ++i) {
+        explicit_paddings->push_back(explicit_paddings_list.i(i + NHWCTopPadPos));
+      }
+    } else {
+      for (size_t i = 0; i < kPadDims; ++i) {
+        explicit_paddings->push_back(explicit_paddings_list.i(i + NCHWTopPadPos));
+      }
+    }
+  }
+  return RET_OK;
+}
+
 STATUS TFConvBaseParser::ParseDilations(const tensorflow::NodeDef &node_def, const mindspore::Format &format,
                                         std::vector<int64_t> *dilations) {
   tensorflow::AttrValue attr_value;
@@ -87,6 +122,8 @@ mindspore::PadMode TFConvBaseParser::ParsePadMode(const tensorflow::NodeDef &nod
   }
   if (attr_value.s() == "SAME") {
     return mindspore::PadMode::SAME;
+  } else if (attr_value.s() == "EXPLICIT") {
+    return mindspore::PadMode::PAD;
   }
   return mindspore::PadMode::VALID;
 }
diff --git a/mindspore/lite/tools/converter/parser/tf/tf_conv_base_parser.h b/mindspore/lite/tools/converter/parser/tf/tf_conv_base_parser.h
index 37d195f504d..27e38ed2c8c 100644
--- a/mindspore/lite/tools/converter/parser/tf/tf_conv_base_parser.h
+++ b/mindspore/lite/tools/converter/parser/tf/tf_conv_base_parser.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -34,6 +34,8 @@ class TFConvBaseParser : public TFNodeParser {
                                std::vector<int64_t> *dilations);
   static STATUS ParseKernels(const tensorflow::NodeDef &node_def, const mindspore::Format &format,
                              std::vector<int64_t> *kernel);
+  static STATUS ParseExplicitPaddings(const tensorflow::NodeDef &node_def, const mindspore::Format &format,
+                                      std::vector<int64_t> *explicit_paddings);
   static mindspore::PadMode ParsePadMode(const tensorflow::NodeDef &node_def);
 };
 }  // namespace lite
diff --git a/mindspore/lite/tools/converter/parser/tf/tf_conv_parser.cc b/mindspore/lite/tools/converter/parser/tf/tf_conv_parser.cc
index 621fcee9dee..363716dc605 100644
--- a/mindspore/lite/tools/converter/parser/tf/tf_conv_parser.cc
+++ b/mindspore/lite/tools/converter/parser/tf/tf_conv_parser.cc
@@ -68,6 +68,14 @@ ops::PrimitiveC *TFConvParser::Parse(const tensorflow::NodeDef &tf_op,
 
   auto pad_mode = ParsePadMode(tf_op);
   prim->set_pad_mode(pad_mode);
+  if (pad_mode == PadMode::PAD) {
+    std::vector<int64_t> explicit_paddings;
+    if (ParseExplicitPaddings(tf_op, format, &explicit_paddings) != RET_OK) {
+      MS_LOG(ERROR) << "parse explicit paddings attr failed";
+      return nullptr;
+    }
+    prim->set_pad_list(explicit_paddings);
+  }
 
   *output_size = 1;
   if (AddOpInput(tf_op, 0, inputs) != RET_OK || AddOpInput(tf_op, 1, inputs) != RET_OK) {
diff --git a/mindspore/lite/tools/converter/parser/tf/tf_model_parser.cc b/mindspore/lite/tools/converter/parser/tf/tf_model_parser.cc
index bea44401e44..4ee52f4bf4a 100644
--- a/mindspore/lite/tools/converter/parser/tf/tf_model_parser.cc
+++ b/mindspore/lite/tools/converter/parser/tf/tf_model_parser.cc
@@ -35,7 +35,7 @@
 #include "tools/common/tensor_util.h"
 #include "tools/converter/parser/unify_format.h"
 
-using mindspore::lite::converter::FmkType_TF;
+using mindspore::converter::kFmkTypeTf;
 namespace mindspore {
 namespace lite {
 namespace {
@@ -414,7 +414,7 @@ STATUS TFModelParser::ConvertConstTensor(const tensorflow::NodeDef &node_def, co
 }
 
 STATUS TFModelParser::ConvertParameter(const tensorflow::NodeDef &node, const ParameterPtr &parameter,
-                                       std::unordered_map<std::string, AnfNodePtr> *anf_node_map) {
+                                       std::unordered_map<std::string, AnfNodePtr> *anf_node_map, bool root_graph) {
   MS_ASSERT(node != nullptr);
   MS_ASSERT(parameter != nullptr);
 
@@ -446,7 +446,10 @@ STATUS TFModelParser::ConvertParameter(const tensorflow::NodeDef &node, const Pa
       return status;
     }
   } else {
-    graph_input_names_.emplace_back(node.name());  // only root graph need set graph input names
+    if (root_graph) {
+      graph_input_names_.emplace_back(node.name());  // only root graph need set graph input names
+      ConverterContext::GetInstance()->AddGraphInputTensorNames(node.name());
+    }
   }
 
   type = (type == kNumberTypeInt64) ? kNumberTypeInt32 : type;
@@ -463,13 +466,14 @@ STATUS TFModelParser::ConvertParameter(const tensorflow::NodeDef &node, const Pa
   return RET_OK;
 }
 
-STATUS TFModelParser::ConvertGraphInputsAndConsts(
-  const std::map<std::string, const tensorflow::NodeDef *> &tf_graph_nodes, const FuncGraphPtr &anf_graph,
-  std::unordered_map<std::string, AnfNodePtr> *anf_node_map) {
-  for (auto &pair : tf_graph_nodes) {
+STATUS TFModelParser::ConvertGraphInputsAndConsts(const std::vector<const tensorflow::NodeDef *> &tf_graph_nodes,
+                                                  const FuncGraphPtr &anf_graph,
+                                                  std::unordered_map<std::string, AnfNodePtr> *anf_node_map,
+                                                  bool root_graph) {
+  for (auto &node : tf_graph_nodes) {
     bool have_data_depend = false;
-    for (int i = 0; i < pair.second->input_size(); ++i) {
-      auto name = pair.second->input(i);
+    for (int i = 0; i < node->input_size(); ++i) {
+      auto name = node->input(i);
       if (!name.empty() && name[0] != '^') {  // control_depend input start with "^"
         have_data_depend = true;
         break;
@@ -477,7 +481,7 @@ STATUS TFModelParser::ConvertGraphInputsAndConsts(
     }
     if (!have_data_depend) {
       auto parameter = anf_graph->add_parameter();
-      if (ConvertParameter(*pair.second, parameter, anf_node_map) != RET_OK) {
+      if (ConvertParameter(*node, parameter, anf_node_map, root_graph) != RET_OK) {
         MS_LOG(ERROR) << "convert Parameter Node failed";
         return RET_ERROR;
       }
@@ -487,8 +491,8 @@ STATUS TFModelParser::ConvertGraphInputsAndConsts(
 }
 
 FuncGraphPtr TFModelParser::Parse(const converter::ConverterParameters &flag) {
-  auto modelFile = flag.model_file_;
-  quant_type_ = flag.quant_type_;
+  auto modelFile = flag.model_file;
+  quant_type_ = flag.quant_type;
   NotSupportOp::GetInstance()->set_fmk_type("TF");
   auto status = ValidateFileStr(modelFile, ".pb");
   if (status != RET_OK) {
@@ -515,14 +519,15 @@ FuncGraphPtr TFModelParser::Parse(const converter::ConverterParameters &flag) {
     return nullptr;
   }
   res_graph_->set_attr("graph_name", MakeValue("main_graph"));
-  res_graph_->set_attr("fmk", MakeValue(static_cast<int>(converter::FmkType_TF)));
+  res_graph_->set_attr("fmk", MakeValue(static_cast<int>(converter::kFmkTypeTf)));
 
   for (int i = 0; i < tf_root_graph_->node_size(); i++) {
     auto &node_def = tf_root_graph_->node(i);
     tf_root_graph_nodes_[node_def.name()] = &node_def;
+    tf_root_graph_nodes_vec_.emplace_back(&node_def);
   }
 
-  status = ConvertGraphInputsAndConsts(tf_root_graph_nodes_, res_graph_, &anf_root_node_map_);
+  status = ConvertGraphInputsAndConsts(tf_root_graph_nodes_vec_, res_graph_, &anf_root_node_map_, true);
   if (status != RET_OK) {
     ReturnCode::GetSingleReturnCode()->UpdateReturnCode(status);
     return nullptr;
@@ -576,7 +581,7 @@ FuncGraphPtr TFModelParser::Parse(const converter::ConverterParameters &flag) {
     ReturnCode::GetSingleReturnCode()->UpdateReturnCode(status);
     return nullptr;
   }
-  auto unify_format = std::make_shared<UnifyFormatToNHWC>(lite::converter::FmkType_TF, false, quant_type_);
+  auto unify_format = std::make_shared<UnifyFormatToNHWC>(converter::kFmkTypeTf, false, quant_type_);
   if (!unify_format->Run(res_graph_)) {
     MS_LOG(ERROR) << "Run insert transpose failed.";
     return nullptr;
@@ -607,11 +612,13 @@ STATUS TFModelParser::ConvertSubgraphInputs(std::map<std::string, const tensorfl
     }
     sub_graph_inputs.emplace_back(parameter);
   }
+  std::vector<const tensorflow::NodeDef *> subgraph_tf_node_vec;
   for (int j = 0; j < tf_sub_fuction.node_def_size(); j++) {
     auto &node_def = tf_sub_fuction.node_def(j);
     (*tf_sub_node_map)[node_def.name()] = &node_def;
+    subgraph_tf_node_vec.emplace_back(&node_def);
   }
-  if (ConvertGraphInputsAndConsts(*tf_sub_node_map, sub_func_graph, anf_sub_node_map) != RET_OK) {
+  if (ConvertGraphInputsAndConsts(subgraph_tf_node_vec, sub_func_graph, anf_sub_node_map, false) != RET_OK) {
     MS_LOG(ERROR) << "Convert subgraph consts failed";
     return RET_ERROR;
   }
@@ -727,7 +734,7 @@ STATUS TFModelParser::ConvertSubgraph() {
 
     FuncGraphPtr sub_func_graph = std::make_shared<FuncGraph>();
     sub_func_graph->set_attr("graph_name", MakeValue(sub_graph_name));
-    sub_func_graph->set_attr("fmk", MakeValue(static_cast<int>(converter::FmkType_TF)));
+    sub_func_graph->set_attr("fmk", MakeValue(static_cast<int>(converter::kFmkTypeTf)));
     std::unordered_map<std::string, AnfNodePtr> anf_sub_node_map;
     std::map<std::string, const tensorflow::NodeDef *> tf_sub_node_map;
 
@@ -921,7 +928,6 @@ STATUS TFModelParser::ConvertOps(const tensorflow::NodeDef &node_def,
   if (op_type == "Placeholder" || op_type == "Const" || op_type == "Identity" || op_type == "StopGradient") {
     return RET_OK;
   }
-
   MS_LOG(INFO) << "parse op : " << op_type;
   auto node_parser = TFNodeParserRegistry::GetInstance()->GetNodeParser(op_type);
   if (node_parser == nullptr) {
@@ -1030,23 +1036,24 @@ STATUS TFModelParser::ConvertRootGraphOutputs() {
   // tf_root_graph_nodes_ but not anf_root_node_map_
   std::set<std::string> all_node_inputs;
   std::vector<AnfNodePtr> output_nodes;
-  for (auto &pair : tf_root_graph_nodes_) {
-    for (int i = 0; i < pair.second->input_size(); ++i) {
-      all_node_inputs.insert(TensorFlowUtils::GetNodeName(pair.second->input(i)));
-      auto input_name = pair.second->input(i);
+  for (auto &node : tf_root_graph_nodes_vec_) {
+    for (int i = 0; i < node->input_size(); ++i) {
+      all_node_inputs.insert(TensorFlowUtils::GetNodeName(node->input(i)));
+      auto input_name = node->input(i);
       if (input_name[0] == '^') {
         input_name.erase(0, 1);
       }
       all_node_inputs.insert(input_name);
     }
   }
-  for (auto &pair : tf_root_graph_nodes_) {
-    if (pair.second->op() == "Assert") {
+  for (auto &node : tf_root_graph_nodes_vec_) {
+    if (node->op() == "Assert") {
       continue;
     }
-    auto it = all_node_inputs.find(pair.first);
-    if (it == all_node_inputs.end() && pair.second->input_size() > 0) {  // output node not constraint to Identity
-      auto origin_name = GetOriginInputName(*(pair.second), tf_root_graph_nodes_);
+    auto it = all_node_inputs.find(node->name());
+    if (it == all_node_inputs.end() && node->input_size() > 0) {  // output node not constraint to Identity
+      auto origin_name = GetOriginInputName(*(node), tf_root_graph_nodes_);
+      // node with multiple outputs has been changed to tupleGetItem, and the original name changes to be name:idx.
       for (int i = 0; i < node_output_num_[origin_name]; i++) {
         auto anf_node = GetAnfNode(origin_name, anf_root_node_map_, i);
         if (anf_node == nullptr) {
@@ -1054,7 +1061,22 @@ STATUS TFModelParser::ConvertRootGraphOutputs() {
           return RET_ERROR;
         }
         output_nodes.push_back(anf_node);
-        graph_output_names_.push_back(anf_node->fullname_with_scope());
+        // Get the name of node 'Identity' and 'StopGradient'.
+        if (node->op() == "Identity" || node->op() == "StopGradient") {
+          auto tmp_node = node;
+          bool found_input = true;
+          while (tmp_node->name().empty() && (tmp_node->op() == "Identity" || tmp_node->op() == "StopGradient")) {
+            auto flatten_input_name = TensorFlowUtils::GetFlattenNodeName(tmp_node->input(0));
+            if (tf_root_graph_nodes_.find(flatten_input_name) != tf_root_graph_nodes_.end()) {
+              tmp_node = tf_root_graph_nodes_.at(flatten_input_name);
+            } else {
+              found_input = false;
+              break;
+            }
+          }
+          origin_name = found_input ? tmp_node->name() : origin_name;
+        }
+        graph_output_names_.push_back(origin_name);
       }
     }
   }
@@ -1063,6 +1085,8 @@ STATUS TFModelParser::ConvertRootGraphOutputs() {
     MS_LOG(ERROR) << "make anf graph outputs node error";
     return status;
   }
+  // save original output tensor names.
+  ConverterContext::GetInstance()->SetGraphOutputTensorNames(graph_output_names_);
   return RET_OK;
 }
 STATUS TFModelParser::MakeAnfGraphOutputs(std::vector<AnfNodePtr> *output_nodes, const FuncGraphPtr &anf_graph) {
@@ -1119,6 +1143,6 @@ int TFModelParser::TF2AnfAdjust(const std::set<FuncGraphPtr> &all_func_graphs) {
   return RET_OK;
 }
 
-REG_MODEL_PARSER(FmkType_TF, LiteModelParserCreator<TFModelParser>)
+REG_MODEL_PARSER(kFmkTypeTf, converter::LiteModelParserCreator<TFModelParser>)
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/tools/converter/parser/tf/tf_model_parser.h b/mindspore/lite/tools/converter/parser/tf/tf_model_parser.h
index f0ecc57a254..7ff914d8d1e 100644
--- a/mindspore/lite/tools/converter/parser/tf/tf_model_parser.h
+++ b/mindspore/lite/tools/converter/parser/tf/tf_model_parser.h
@@ -35,7 +35,7 @@
 
 namespace mindspore {
 namespace lite {
-class TFModelParser : public ModelParser {
+class TFModelParser : public converter::ModelParser {
  public:
   TFModelParser() = default;
   ~TFModelParser() override = default;
@@ -51,10 +51,11 @@ class TFModelParser : public ModelParser {
                                    std::vector<int64_t> *shape_vector);
   static STATUS SetTensorInfoFromType(const tensorflow::TensorProto &tensor_proto, tensor::TensorPtr *tensor_info);
   STATUS ConvertParameter(const tensorflow::NodeDef &node, const ParameterPtr &parameter,
-                          std::unordered_map<std::string, AnfNodePtr> *anf_node_map);
-  STATUS ConvertGraphInputsAndConsts(const std::map<std::string, const tensorflow::NodeDef *> &tf_graph_nodes,
+                          std::unordered_map<std::string, AnfNodePtr> *anf_node_map, bool root_graph = false);
+  STATUS ConvertGraphInputsAndConsts(const std::vector<const tensorflow::NodeDef *> &tf_graph_nodes,
                                      const FuncGraphPtr &anf_graph,
-                                     std::unordered_map<std::string, AnfNodePtr> *anf_node_map);
+                                     std::unordered_map<std::string, AnfNodePtr> *anf_node_map,
+                                     bool root_graph = false);
   static STATUS ConvertInputNodes(const tensorflow::NodeDef &node_def, const std::vector<std::string> &input_names,
                                   const std::map<std::string, const tensorflow::NodeDef *> &tf_node_map,
                                   const std::unordered_map<std::string, AnfNodePtr> &anf_node_map,
@@ -97,6 +98,7 @@ class TFModelParser : public ModelParser {
 
   std::unique_ptr<tensorflow::GraphDef> tf_root_graph_;                     // tf root graph def
   std::map<std::string, const tensorflow::NodeDef *> tf_root_graph_nodes_;  // tf root graph node map
+  std::vector<const tensorflow::NodeDef *> tf_root_graph_nodes_vec_;
   std::unordered_map<std::string, AnfNodePtr> anf_root_node_map_;
   std::vector<std::string> graph_input_names_;
   std::vector<std::string> graph_output_names_;
@@ -106,7 +108,7 @@ class TFModelParser : public ModelParser {
   std::vector<std::string> while_cond_branch_name_;
   std::vector<std::string> if_then_branch_name_;
   std::unordered_map<std::string, int> node_output_num_;
-  QuantType quant_type_ = schema::QuantType_QUANT_NONE;
+  schema::QuantType quant_type_ = schema::QuantType_QUANT_NONE;
   std::map<CNodePtr, FuncGraphPtr> while_cond_map_, while_body_map_, if_then_map_, if_else_map_;
 };
 }  // namespace lite
diff --git a/mindspore/lite/tools/converter/parser/tflite/tflite_model_parser.cc b/mindspore/lite/tools/converter/parser/tflite/tflite_model_parser.cc
index 7d29bfcb66b..bd1ba82d5c3 100644
--- a/mindspore/lite/tools/converter/parser/tflite/tflite_model_parser.cc
+++ b/mindspore/lite/tools/converter/parser/tflite/tflite_model_parser.cc
@@ -32,7 +32,7 @@
 #include "tools/converter/parser/parser_utils.h"
 #include "tools/converter/parser/unify_format.h"
 
-using mindspore::lite::converter::FmkType_TFLITE;
+using mindspore::converter::kFmkTypeTflite;
 namespace mindspore::lite {
 namespace {
 constexpr size_t kConvWeightIndex = 2;
@@ -53,8 +53,8 @@ std::unique_ptr<tflite::ModelT> TfliteModelParser::ReadTfliteModel(const std::st
 }
 
 FuncGraphPtr TfliteModelParser::Parse(const converter::ConverterParameters &flag) {
-  auto model_file = flag.model_file_;
-  quant_type_ = flag.quant_type_;
+  auto model_file = flag.model_file;
+  quant_type_ = flag.quant_type;
   // load graph
   tflite_model_ = ReadTfliteModel(model_file);
   if (tflite_model_ == nullptr) {
@@ -69,7 +69,7 @@ FuncGraphPtr TfliteModelParser::Parse(const converter::ConverterParameters &flag
     return nullptr;
   }
   res_graph_ = std::make_shared<FuncGraph>();
-  res_graph_->set_attr("fmk", MakeValue(static_cast<int>(converter::FmkType_TFLITE)));
+  res_graph_->set_attr("fmk", MakeValue(static_cast<int>(converter::kFmkTypeTflite)));
 
   auto status = ConvertGraphInputs();
   if (status != RET_OK) {
@@ -105,7 +105,7 @@ FuncGraphPtr TfliteModelParser::Parse(const converter::ConverterParameters &flag
     ReturnCode::GetSingleReturnCode()->UpdateReturnCode(status);
     return nullptr;
   }
-  auto unify_format = std::make_shared<UnifyFormatToNHWC>(lite::converter::FmkType_TFLITE, false, quant_type_);
+  auto unify_format = std::make_shared<UnifyFormatToNHWC>(converter::kFmkTypeTflite, false, quant_type_);
   if (!unify_format->Run(res_graph_)) {
     MS_LOG(ERROR) << "Run insert transpose failed.";
     return nullptr;
@@ -134,8 +134,8 @@ STATUS TfliteModelParser::ConvertOps() {
   int op_idx = 0;
   for (auto &op : tflite_subgraph->operators) {
     auto tflite_op_type = (tflite_model_->operator_codes[op->opcode_index])->builtin_code;
-    auto op_type = GetMSOpType(tflite_op_type);
-    auto op_name = op_type + "-" + std::to_string(op_idx);
+    std::string op_type = tflite::EnumNameBuiltinOperator(tflite_op_type);
+    std::string op_name = op_type + "-" + std::to_string(op_idx);
     op_idx++;
     // parse primitive
     MS_LOG(INFO) << "parse node :" << op_name;
@@ -336,7 +336,8 @@ STATUS TfliteModelParser::ConvertGraphInputs() {
       return RET_ERROR;
     }
     parameter->set_abstract(abstract_tensor);
-    parameter->set_name("graph_input-" + std::to_string(tflite_graph_input));
+    parameter->set_name(tensor->name);
+    ConverterContext::GetInstance()->AddGraphInputTensorNames(tensor->name);
     nodes_.insert(std::pair(tflite_graph_input, parameter));
   }
   return RET_OK;
@@ -398,6 +399,12 @@ STATUS TfliteModelParser::ConvertGraphOutputs() {
     returnCnode->set_fullname_with_scope("Return");
     res_graph_->set_return(returnCnode);
   }
+  // save original output tensor names.
+  std::vector<std::string> output_names;
+  auto output_idx = tflite_subgraph->outputs;
+  std::transform(output_idx.begin(), output_idx.end(), std::back_inserter(output_names),
+                 [&](auto out_idx) { return tflite_subgraph->tensors.at(out_idx)->name; });
+  ConverterContext::GetInstance()->SetGraphOutputTensorNames(output_names);
   return RET_OK;
 }
 
@@ -540,5 +547,5 @@ int TfliteModelParser::Tflite2AnfAdjust(const std::set<FuncGraphPtr> &all_func_g
   return RET_OK;
 }
 
-REG_MODEL_PARSER(FmkType_TFLITE, LiteModelParserCreator<TfliteModelParser>)
+REG_MODEL_PARSER(kFmkTypeTflite, converter::LiteModelParserCreator<TfliteModelParser>)
 }  // namespace mindspore::lite
diff --git a/mindspore/lite/tools/converter/parser/tflite/tflite_model_parser.h b/mindspore/lite/tools/converter/parser/tflite/tflite_model_parser.h
index 78d8b22d2e8..038a6c083df 100644
--- a/mindspore/lite/tools/converter/parser/tflite/tflite_model_parser.h
+++ b/mindspore/lite/tools/converter/parser/tflite/tflite_model_parser.h
@@ -28,7 +28,7 @@
 
 namespace mindspore {
 namespace lite {
-class TfliteModelParser : public ModelParser {
+class TfliteModelParser : public converter::ModelParser {
  public:
   TfliteModelParser() = default;
 
@@ -52,7 +52,7 @@ class TfliteModelParser : public ModelParser {
   STATUS ConvertGraphOutputs();
   static STATUS SetTensorQuantParam(const tflite::TensorT *tflite_tensor, std::vector<QuantParamT> *quant_params,
                                     int round_type = 1);
-  QuantType quant_type_ = schema::QuantType_QUANT_NONE;
+  schema::QuantType quant_type_ = schema::QuantType_QUANT_NONE;
 };
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/tools/converter/parser/tflite/tflite_util.cc b/mindspore/lite/tools/converter/parser/tflite/tflite_util.cc
index 63ff27e969e..052d7b89fee 100644
--- a/mindspore/lite/tools/converter/parser/tflite/tflite_util.cc
+++ b/mindspore/lite/tools/converter/parser/tflite/tflite_util.cc
@@ -24,107 +24,6 @@
 
 namespace mindspore {
 namespace lite {
-std::map<tflite::BuiltinOperator, std::string> tfMsOpTypeMap{
-  {tflite::BuiltinOperator_CONV_2D, "Conv2D"},
-  {tflite::BuiltinOperator_DEPTHWISE_CONV_2D, "DepthwiseConv2D"},
-  {tflite::BuiltinOperator_AVERAGE_POOL_2D, "MeanPooling"},
-  {tflite::BuiltinOperator_MAX_POOL_2D, "MaxPooling"},
-  {tflite::BuiltinOperator_ADD, "Add"},
-  {tflite::BuiltinOperator_CONCATENATION, "Concat"},
-  {tflite::BuiltinOperator_RESIZE_BILINEAR, "ResizeBilinear"},
-  {tflite::BuiltinOperator_RESHAPE, "Reshape"},
-  {tflite::BuiltinOperator_LOGISTIC, "Logistic"},
-  {tflite::BuiltinOperator_MUL, "Mul"},
-  {tflite::BuiltinOperator_SOFTMAX, "Softmax"},
-  {tflite::BuiltinOperator_FULLY_CONNECTED, "FullyConnected"},
-  {tflite::BuiltinOperator_SLICE, "Slice"},
-  {tflite::BuiltinOperator_SUB, "Sub"},
-  {tflite::BuiltinOperator_TRANSPOSE, "Transpose"},
-  {tflite::BuiltinOperator_PACK, "Stack"},
-  {tflite::BuiltinOperator_MEAN, "Mean"},
-  {tflite::BuiltinOperator_RELU6, "ReLU6"},
-  {tflite::BuiltinOperator_TANH, "Tanh"},
-  {tflite::BuiltinOperator_RSQRT, "Rsqrt"},
-  {tflite::BuiltinOperator_ARG_MAX, "Argmax"},
-  {tflite::BuiltinOperator_SQUARED_DIFFERENCE, "SquaredDifference"},
-  {tflite::BuiltinOperator_FAKE_QUANT, "FakeQuant"},
-  {tflite::BuiltinOperator_TRANSPOSE_CONV, "DeConv2D"},
-  {tflite::BuiltinOperator_PAD, "Pad"},
-  {tflite::BuiltinOperator_RESIZE_NEAREST_NEIGHBOR, "NearestNeighbor"},
-  {tflite::BuiltinOperator_RELU, "ReLU"},
-  {tflite::BuiltinOperator_LEAKY_RELU, "LeakyRelu"},
-  {tflite::BuiltinOperator_SQUEEZE, "Squeeze"},
-  {tflite::BuiltinOperator_POW, "Pow"},
-  {tflite::BuiltinOperator_ARG_MIN, "Argmin"},
-  {tflite::BuiltinOperator_CEIL, "Ceil"},
-  {tflite::BuiltinOperator_EXPAND_DIMS, "ExpandDims"},
-  {tflite::BuiltinOperator_FILL, "Fill"},
-  {tflite::BuiltinOperator_DIV, "Div"},
-  {tflite::BuiltinOperator_FLOOR, "flOOR"},
-  {tflite::BuiltinOperator_FLOOR_DIV, "FloorDiv"},
-  {tflite::BuiltinOperator_FLOOR_MOD, "FloorMod"},
-  {tflite::BuiltinOperator_GATHER, "Gather"},
-  {tflite::BuiltinOperator_GATHER_ND, "GatherND"},
-  {tflite::BuiltinOperator_REVERSE_V2, "reverse"},
-  {tflite::BuiltinOperator_RANGE, "Range"},
-  {tflite::BuiltinOperator_RANK, "Rank"},
-  {tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION, "LocalResponseNorm"},
-  {tflite::BuiltinOperator_GATHER, "Gather"},
-  {tflite::BuiltinOperator_EXP, "Exp"},
-  {tflite::BuiltinOperator_SPLIT_V, "SplitV"},
-  {tflite::BuiltinOperator_SPLIT, "Split"},
-  {tflite::BuiltinOperator_BATCH_TO_SPACE_ND, "BatchToSpaceND"},
-  {tflite::BuiltinOperator_STRIDED_SLICE, "StridedSlice"},
-  {tflite::BuiltinOperator_ONE_HOT, "OneHot"},
-  {tflite::BuiltinOperator_SHAPE, "Shape"},
-  {tflite::BuiltinOperator_SQUEEZE, "Squeeze"},
-  {tflite::BuiltinOperator_ABS, "Abs"},
-  {tflite::BuiltinOperator_SIN, "Sin"},
-  {tflite::BuiltinOperator_COS, "Cos"},
-  {tflite::BuiltinOperator_LOG, "Log"},
-  {tflite::BuiltinOperator_SQRT, "Sqrt"},
-  {tflite::BuiltinOperator_SQUARE, "Square"},
-  {tflite::BuiltinOperator_LOGICAL_NOT, "LogicalNot"},
-  {tflite::BuiltinOperator_LOGICAL_AND, "LogicalAnd"},
-  {tflite::BuiltinOperator_LOGICAL_OR, "LogicalOr"},
-  {tflite::BuiltinOperator_HARD_SWISH, "HSwish"},
-  {tflite::BuiltinOperator_SUM, "Sum"},
-  {tflite::BuiltinOperator_REDUCE_PROD, "ReduceProd"},
-  {tflite::BuiltinOperator_REDUCE_MAX, "ReduceMax"},
-  {tflite::BuiltinOperator_REDUCE_MIN, "ReduceMin"},
-  {tflite::BuiltinOperator_SCATTER_ND, "ScatterNd"},
-  {tflite::BuiltinOperator_MAXIMUM, "Maximum"},
-  {tflite::BuiltinOperator_MINIMUM, "Minimum"},
-  {tflite::BuiltinOperator_ADD_N, "AddN"},
-  {tflite::BuiltinOperator_CAST, "Cast"},
-  {tflite::BuiltinOperator_EQUAL, "Equal"},
-  {tflite::BuiltinOperator_NOT_EQUAL, "NotEqual"},
-  {tflite::BuiltinOperator_GREATER, "Greater"},
-  {tflite::BuiltinOperator_GREATER_EQUAL, "GreaterEqual"},
-  {tflite::BuiltinOperator_LESS, "Less"},
-  {tflite::BuiltinOperator_LESS_EQUAL, "LessEqual"},
-  {tflite::BuiltinOperator_DEPTH_TO_SPACE, "DepthToSpace"},
-  {tflite::BuiltinOperator_SPACE_TO_BATCH_ND, "SpaceToBatchND"},
-  {tflite::BuiltinOperator_SPACE_TO_DEPTH, "SpaceToDepth"},
-  {tflite::BuiltinOperator_ROUND, "Round"},
-  {tflite::BuiltinOperator_WHERE, "Where"},
-  {tflite::BuiltinOperator_SPARSE_TO_DENSE, "SparseToDense"},
-  {tflite::BuiltinOperator_ZEROS_LIKE, "ZerosLike"},
-  {tflite::BuiltinOperator_TILE, "Tile"},
-  {tflite::BuiltinOperator_TOPK_V2, "TopKV2"},
-  {tflite::BuiltinOperator_REVERSE_SEQUENCE, "ReverseSequence"},
-  {tflite::BuiltinOperator_UNIQUE, "Unique"},
-  {tflite::BuiltinOperator_UNPACK, "Unstack"},
-  {tflite::BuiltinOperator_CUSTOM, "Custom"},
-  {tflite::BuiltinOperator_MIRROR_PAD, "MirrorPad"},
-  {tflite::BuiltinOperator_NEG, "Neg"},
-  {tflite::BuiltinOperator_PRELU, "PRELU"},
-  {tflite::BuiltinOperator_HASHTABLE_LOOKUP, "HashtableLookup"},
-  {tflite::BuiltinOperator_LSH_PROJECTION, "LshProjection"},
-  {tflite::BuiltinOperator_SKIP_GRAM, "SKipGram"},
-  {tflite::BuiltinOperator_WHILE, "While"},
-};
-
 std::map<tflite::ActivationFunctionType, mindspore::ActivationType> tfMsActivationFunctionMap{
   {tflite::ActivationFunctionType_NONE, mindspore::ActivationType::NO_ACTIVATION},
   {tflite::ActivationFunctionType_RELU, mindspore::ActivationType::RELU},
@@ -144,14 +43,6 @@ mindspore::ActivationType GetActivationFunctionType(tflite::ActivationFunctionTy
   return tfMsActivationFunctionMap.at(tfliteAFType);
 }
 
-std::string GetMSOpType(tflite::BuiltinOperator tfliteOpType) {
-  auto iter = tfMsOpTypeMap.find(tfliteOpType);
-  if (iter == tfMsOpTypeMap.end()) {
-    return tflite::EnumNameBuiltinOperator(tfliteOpType);
-  }
-  return iter->second;
-}
-
 TypeId GetTfliteDataType(const tflite::TensorType &tflite_data_type) {
   auto iter = type_map.find(tflite_data_type);
   if (iter == type_map.end()) {
diff --git a/mindspore/lite/tools/converter/parser/tflite/tflite_util.h b/mindspore/lite/tools/converter/parser/tflite/tflite_util.h
index 71e11c1c50c..241857f2763 100644
--- a/mindspore/lite/tools/converter/parser/tflite/tflite_util.h
+++ b/mindspore/lite/tools/converter/parser/tflite/tflite_util.h
@@ -39,8 +39,6 @@ size_t GetDataTypeSize(const TypeId &data_type);
 
 mindspore::ActivationType GetActivationFunctionType(tflite::ActivationFunctionType tfliteAFType);
 
-std::string GetMSOpType(tflite::BuiltinOperator tfliteOpType);
-
 TypeId GetTfliteDataType(const tflite::TensorType &tflite_data_type);
 
 STATUS getPaddingParam(const std::unique_ptr<tflite::TensorT> &tensor, mindspore::PadMode pad_mode, int strideH,
diff --git a/mindspore/lite/tools/converter/parser/unify_format.cc b/mindspore/lite/tools/converter/parser/unify_format.cc
index f3a07842db2..23b264ad986 100644
--- a/mindspore/lite/tools/converter/parser/unify_format.cc
+++ b/mindspore/lite/tools/converter/parser/unify_format.cc
@@ -50,10 +50,10 @@ STATUS DecideTFConvWeightSrcFormat(const CNodePtr &cnode, schema::QuantType quan
   }
   bool is_depth_wise = prim->GetAttr(ops::kIsDepthWise) != nullptr && GetValue<bool>(prim->GetAttr(ops::kIsDepthWise));
   switch (quant_type) {
-    case QuantType_AwareTraining:
-    case QuantType_PostTraining:
-    case QuantType_WeightQuant:
-    case QuantType_QUANT_NONE: {
+    case schema::QuantType_AwareTraining:
+    case schema::QuantType_PostTraining:
+    case schema::QuantType_WeightQuant:
+    case schema::QuantType_QUANT_NONE: {
       if (opt::CheckPrimitiveType(cnode, prim::kPrimConv2DFusion)) {
         if (!is_depth_wise) {
           *src_format = schema::Format_HWCK;
@@ -85,10 +85,10 @@ STATUS DecideTFLITEConvWeightSrcFormat(const CNodePtr &cnode, schema::QuantType
   }
   bool is_depth_wise = prim->GetAttr(ops::kIsDepthWise) != nullptr && GetValue<bool>(prim->GetAttr(ops::kIsDepthWise));
   switch (quant_type) {
-    case QuantType_AwareTraining:
-    case QuantType_PostTraining:
-    case QuantType_WeightQuant:
-    case QuantType_QUANT_NONE: {
+    case schema::QuantType_AwareTraining:
+    case schema::QuantType_PostTraining:
+    case schema::QuantType_WeightQuant:
+    case schema::QuantType_QUANT_NONE: {
       if (opt::CheckPrimitiveType(cnode, prim::kPrimConv2DFusion)) {
         if (!is_depth_wise) {
           *src_format = schema::Format_KHWC;
@@ -127,7 +127,7 @@ STATUS DecideONNXConvWeightSrcFormat(const CNodePtr &cnode, schema::QuantType qu
   bool is_depth_wise = prim->GetAttr(ops::kIsDepthWise) != nullptr && GetValue<bool>(prim->GetAttr(ops::kIsDepthWise));
   int64_t format = prim->GetAttr(ops::kFormat) != nullptr ? GetValue<int64_t>(prim->GetAttr(ops::kFormat)) : 0;
   switch (quant_type) {
-    case QuantType_AwareTraining: {
+    case schema::QuantType_AwareTraining: {
       if (opt::CheckPrimitiveType(cnode, prim::kPrimConv2DFusion)) {
         if (!is_depth_wise) {
           *src_format = schema::Format_KHWC;
@@ -141,9 +141,9 @@ STATUS DecideONNXConvWeightSrcFormat(const CNodePtr &cnode, schema::QuantType qu
         return lite::RET_ERROR;
       }
     } break;
-    case QuantType_PostTraining:
-    case QuantType_WeightQuant:
-    case QuantType_QUANT_NONE: {
+    case schema::QuantType_PostTraining:
+    case schema::QuantType_WeightQuant:
+    case schema::QuantType_QUANT_NONE: {
       if (opt::CheckPrimitiveType(cnode, prim::kPrimConv2DFusion) ||
           opt::CheckPrimitiveType(cnode, prim::kPrimConv2dTransposeFusion)) {
         if (format == schema::Format_NHWC) {
@@ -173,16 +173,18 @@ STATUS UnifyFormatToNHWC::GetTransNodeFormatType(const CNodePtr &cnode, opt::Tra
   MS_ASSERT(cnode != nullptr && trans_info != nullptr);
   auto prim_node = cnode->input(0);
   auto prim = GetValueNode<PrimitivePtr>(prim_node);
-  MS_ASSERT(prim != nullptr);
+  if (prim == nullptr) {
+    return RET_OK;
+  }
   auto &specify_nhwc_op_map = opt::GetNHWCOpMap();
   auto &specify_nchw_op_map = opt::GetNCHWOpMap();
-  if (fmk_type_ == lite::converter::FmkType_TFLITE) {
+  if (fmk_type_ == converter::kFmkTypeTflite) {
     if (specify_nchw_op_map.find(prim->name()) == specify_nchw_op_map.end()) {
       return lite::RET_OK;
     }
     trans_info->pre_ = opt::kNHWC2NCHW;
     trans_info->post_ = opt::kNCHW2NHWC;
-  } else if (fmk_type_ == lite::converter::FmkType_TF) {
+  } else if (fmk_type_ == converter::kFmkTypeTf) {
     if (specify_nhwc_op_map.find(prim->name()) != specify_nhwc_op_map.end() && opt::GetFormat(cnode) == NCHW) {
       trans_info->pre_ = opt::kNCHW2NHWC;
       trans_info->post_ = opt::kNHWC2NCHW;
@@ -193,7 +195,7 @@ STATUS UnifyFormatToNHWC::GetTransNodeFormatType(const CNodePtr &cnode, opt::Tra
     }
   } else {
     if (specify_nhwc_op_map.find(prim->name()) != specify_nhwc_op_map.end()) {
-      if (fmk_type_ == lite::converter::FmkType_ONNX && prim->GetAttr(ops::kFormat) != nullptr &&
+      if (fmk_type_ == converter::kFmkTypeOnnx && prim->GetAttr(ops::kFormat) != nullptr &&
           GetValue<int64_t>(prim->GetAttr(ops::kFormat)) == NHWC) {
         return lite::RET_OK;
       }
@@ -213,10 +215,10 @@ void UnifyFormatToNHWC::SetSensitiveOps() {
 
 bool UnifyFormatToNHWC::DecideWhetherHandleGraphInput(const FuncGraphPtr &func_graph, const ShapeVector &shape) {
   MS_ASSERT(func_graph != nullptr);
-  if (fmk_type_ == converter::FmkType_TF || fmk_type_ == converter::FmkType_TFLITE) {
+  if (fmk_type_ == converter::kFmkTypeTf || fmk_type_ == converter::kFmkTypeTflite) {
     return false;
   }
-  if (func_graph->get_inputs().size() == 1 && fmk_type_ == lite::converter::FmkType_ONNX &&
+  if (func_graph->get_inputs().size() == 1 && fmk_type_ == converter::kFmkTypeOnnx &&
       shape[opt::kInputIndexThree] == kInputChannal && shape[1] == -1) {
     return false;
   }
@@ -230,11 +232,11 @@ STATUS UnifyFormatToNHWC::DecideConvWeightSrcAndDstFormat(const CNodePtr &cnode,
   MS_ASSERT(cnode != nullptr && src_format != nullptr && dst_format != nullptr);
   *dst_format = schema::Format_KHWC;
   std::map<converter::FmkType, std::function<int(const CNodePtr &, schema::QuantType, schema::Format *)>>
-    decide_functions = {{converter::FmkType_MS, DecideMINDIRConvWeightSrcFormat},
-                        {converter::FmkType_TF, DecideTFConvWeightSrcFormat},
-                        {converter::FmkType_TFLITE, DecideTFLITEConvWeightSrcFormat},
-                        {converter::FmkType_CAFFE, DecideCAFFEConvWeightSrcFormat},
-                        {converter::FmkType_ONNX, DecideONNXConvWeightSrcFormat}};
+    decide_functions = {{converter::kFmkTypeMs, DecideMINDIRConvWeightSrcFormat},
+                        {converter::kFmkTypeTf, DecideTFConvWeightSrcFormat},
+                        {converter::kFmkTypeTflite, DecideTFLITEConvWeightSrcFormat},
+                        {converter::kFmkTypeCaffe, DecideCAFFEConvWeightSrcFormat},
+                        {converter::kFmkTypeOnnx, DecideONNXConvWeightSrcFormat}};
   auto iter = decide_functions.find(fmk_type_);
   if (iter == decide_functions.end()) {
     MS_LOG(ERROR) << "current fmk don't support, please check.";
diff --git a/mindspore/lite/tools/converter/parser/unify_format.h b/mindspore/lite/tools/converter/parser/unify_format.h
index 1ef43187504..4fc5f0ec121 100644
--- a/mindspore/lite/tools/converter/parser/unify_format.h
+++ b/mindspore/lite/tools/converter/parser/unify_format.h
@@ -19,12 +19,12 @@
 
 #include "tools/optimizer/format/to_format_base.h"
 
-using mindspore::lite::converter::FmkType;
+using mindspore::converter::FmkType;
 namespace mindspore {
 namespace lite {
 class UnifyFormatToNHWC : public opt::ToFormatBase {
  public:
-  explicit UnifyFormatToNHWC(FmkType fmk_type = lite::converter::FmkType_MS, bool train_flag = false,
+  explicit UnifyFormatToNHWC(FmkType fmk_type = converter::kFmkTypeMs, bool train_flag = false,
                              schema::QuantType quant_type = schema::QuantType_QUANT_NONE)
       : ToFormatBase(fmk_type, train_flag), quant_type_(quant_type) {}
   ~UnifyFormatToNHWC() override = default;
diff --git a/mindspore/lite/tools/converter/quantizer/CMakeLists.txt b/mindspore/lite/tools/converter/quantizer/CMakeLists.txt
index 3e64f880908..7545f2722f4 100644
--- a/mindspore/lite/tools/converter/quantizer/CMakeLists.txt
+++ b/mindspore/lite/tools/converter/quantizer/CMakeLists.txt
@@ -12,6 +12,10 @@ file(GLOB QUANTIZER
         ${CMAKE_CURRENT_SOURCE_DIR}/quant_cast.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/weight_quantizer.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/huffman_encode.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/fse_decoder.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/fse_bit_stream.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/fse_encoder.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/fix_bit_weight_quantizer.cc
         )
 set_property(SOURCE ${QUANTIZER} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_LITE)
 add_library(quantizer_mid OBJECT ${QUANTIZER})
diff --git a/mindspore/lite/tools/converter/quantizer/post_training_quantizer.cc b/mindspore/lite/tools/converter/quantizer/post_training_quantizer.cc
index 6e4bce06232..4c728a2bf99 100644
--- a/mindspore/lite/tools/converter/quantizer/post_training_quantizer.cc
+++ b/mindspore/lite/tools/converter/quantizer/post_training_quantizer.cc
@@ -580,8 +580,9 @@ STATUS PostTrainingQuantizer::DoWeightQuant(const std::string &op_name, const An
       quant_min_t = -(1 << (unsigned int)(bit_num_t - 1));
     }
   }
+  auto weight_quant_type = perchanel ? WeightQuantType::FIXED_BIT_PER_CHANNEL : WeightQuantType::FIXED_BIT_PER_LAYER;
   auto status = QuantFilter<int8_t>(tensor_info, primitive, QuantType_PostTraining, quant_max_t, quant_min_t, bit_num_t,
-                                    perchanel, kNumberTypeInt8);
+                                    weight_quant_type, kNumberTypeInt8);
   if (status != RET_OK) {
     MS_LOG(ERROR) << "QuantFilter failed: " << status;
     return status;
diff --git a/mindspore/lite/tools/converter/quantizer/quantize_util.cc b/mindspore/lite/tools/converter/quantizer/quantize_util.cc
index 3a1bf47be9c..68b6a46463c 100644
--- a/mindspore/lite/tools/converter/quantizer/quantize_util.cc
+++ b/mindspore/lite/tools/converter/quantizer/quantize_util.cc
@@ -1017,4 +1017,52 @@ void CalQuantAssitInfo(const schema::PrimitiveT &primitive, const std::vector<in
     }
   }
 }
+
+STATUS QuantFilter(const tensor::TensorPtr &weight, const PrimitivePtr &primitive, QuantType quant_type,
+                   WeightQuantType weight_quant_type, TypeId quant_data_type, int index) {
+  MS_ASSERT(weight != nullptr);
+  MS_ASSERT(primitive != nullptr);
+  auto dims = weight->shape();
+  if (weight_quant_type == FIXED_BIT_PER_CHANNEL) {
+    if (dims.size() <= 1) {
+      MS_LOG(WARNING) << "dims is " << dims.size() << " can not per_channel";
+      weight_quant_type = FIXED_BIT_PER_LAYER;
+    }
+  }
+  std::vector<schema::QuantParamT> quant_params;
+  size_t elem_count = weight->DataSize();
+  auto *raw_data = static_cast<float *>(weight->data_c());
+  if (raw_data == nullptr) {
+    MS_LOG(ERROR) << "rawDatas is nullptr";
+    return RET_ERROR;
+  }
+
+  std::vector<int16_t> quant_data(elem_count);
+  int ret = RET_OK;
+  if (weight_quant_type == MIXED_BIT_PER_LAYER) {
+    FixBitWeightQuantizer quantizer(0.02);
+    quantizer.DoQuantization(static_cast<float *>(weight->data_c()), weight->shape_c(), 0, &quant_params, &quant_data);
+  } else {
+    MS_LOG(ERROR) << "Unsupported weight quant type:" << weight_quant_type;
+  }
+  auto status =
+    UpdateTensorDataAndSize(weight, quant_data.data(), quant_data.size() * sizeof(int16_t), TypeId::kNumberTypeInt16);
+  if (status != RET_OK) {
+    MS_LOG(ERROR) << "UpdateTensorDataAndSize error";
+    return RET_ERROR;
+  }
+
+  if (quant_params.empty()) {
+    MS_LOG(ERROR) << "quant_params empty";
+    return RET_ERROR;
+  }
+  auto quant_param_holder = GetCNodeQuantHolder(primitive);
+  if (quant_type == QuantType_PostTraining) {
+    quant_param_holder->AddInputQuantParam(quant_params);
+  } else {
+    quant_param_holder->set_input_quant_param(index, quant_params);
+  }
+  return ret;
+}
+
 }  // namespace mindspore::lite::quant
diff --git a/mindspore/lite/tools/converter/quantizer/quantize_util.h b/mindspore/lite/tools/converter/quantizer/quantize_util.h
index 63a5b0e7414..622abc4a419 100644
--- a/mindspore/lite/tools/converter/quantizer/quantize_util.h
+++ b/mindspore/lite/tools/converter/quantizer/quantize_util.h
@@ -40,12 +40,18 @@
 #include "abstract/dshape.h"
 #include "tools/converter/quantizer/huffman_encode.h"
 #include "tools/converter/quantizer/bitpacking.h"
+#include "tools/converter/quantizer/fix_bit_weight_quantizer.h"
 #include "src/lite_session.h"
 #include "tools/converter/graphdef_transform.h"
 #include "src/common/file_utils.h"
 #include "src/common/quant_utils.h"
 
 namespace mindspore::lite::quant {
+enum WeightQuantType {
+  FIXED_BIT_PER_CHANNEL = 0,
+  FIXED_BIT_PER_LAYER = 1,
+  MIXED_BIT_PER_LAYER = 2,
+};
 constexpr size_t kUint8Quantization = 8;
 constexpr size_t kMaxBit = 8;
 constexpr size_t kMaxNum1024 = 1024;
@@ -155,17 +161,20 @@ STATUS DoBitPack(const tensor::TensorPtr &weight, const size_t &bit_num, const s
   return RET_OK;
 }
 
+STATUS QuantFilter(const tensor::TensorPtr &weight, const PrimitivePtr &primitive, QuantType quant_type,
+                   WeightQuantType weight_quant_type, TypeId quant_data_type, int index = 1);
+
 template <typename T>
 STATUS QuantFilter(const tensor::TensorPtr &weight, const PrimitivePtr &primitive, QuantType quant_type, int quant_max,
-                   int quant_min, size_t bit_num, bool per_channel, TypeId quant_data_type, int index = 1,
-                   bool k_means = false) {
+                   int quant_min, size_t bit_num, WeightQuantType weight_quant_type, TypeId quant_data_type,
+                   int index = 1, bool k_means = false) {
   MS_ASSERT(weight != nullptr);
   MS_ASSERT(primitive != nullptr);
   auto dims = weight->shape();
-  if (per_channel) {
+  if (weight_quant_type == FIXED_BIT_PER_CHANNEL) {
     if (dims.size() <= 1) {
       MS_LOG(WARNING) << "dims is " << dims.size() << " can not per_channel";
-      per_channel = false;
+      weight_quant_type = FIXED_BIT_PER_LAYER;
     }
   }
 
@@ -179,7 +188,7 @@ STATUS QuantFilter(const tensor::TensorPtr &weight, const PrimitivePtr &primitiv
 
   std::vector<T> quant_data(elem_count);
   int ret = RET_OK;
-  if (per_channel) {
+  if (weight_quant_type == FIXED_BIT_PER_CHANNEL) {
     bool channel_at_first = true;
     int channel_cnt = -1;
     CalQuantAssitInfo(primitive, dims, index, &channel_at_first, &channel_cnt);
@@ -197,13 +206,15 @@ STATUS QuantFilter(const tensor::TensorPtr &weight, const PrimitivePtr &primitiv
       MS_LOG(ERROR) << "Do per channel quant failed.";
       return ret;
     }
-  } else {
+  } else if (weight_quant_type == FIXED_BIT_PER_LAYER) {
     ret = DoPerLayerQuant<T>(static_cast<float *>(weight->data_c()), weight->DataSize(), &quant_params, quant_max,
                              quant_min, bit_num, k_means, &quant_data);
     if (ret != RET_OK) {
       MS_LOG(ERROR) << "Do per layer quant failed.";
       return ret;
     }
+  } else {
+    MS_LOG(ERROR) << "Unsupported weight quant type:" << weight_quant_type;
   }
   auto status = UpdateTensorDataAndSize(weight, quant_data.data(), quant_data.size() * sizeof(T), quant_data_type);
   if (status != RET_OK) {
diff --git a/mindspore/lite/tools/converter/quantizer/quantizer.h b/mindspore/lite/tools/converter/quantizer/quantizer.h
index fdb9bc8fe03..e3c41070a9e 100644
--- a/mindspore/lite/tools/converter/quantizer/quantizer.h
+++ b/mindspore/lite/tools/converter/quantizer/quantizer.h
@@ -53,7 +53,7 @@ class Quantizer {
 
   virtual STATUS DoQuantize(FuncGraphPtr func_graph) = 0;
 
-  mindspore::lite::converter::Flags flags;
+  converter::Flags flags;
 
  protected:
   FuncGraphPtr funcGraph = nullptr;
diff --git a/mindspore/lite/tools/converter/quantizer/weight_quantizer.cc b/mindspore/lite/tools/converter/quantizer/weight_quantizer.cc
index 53ff184e15e..7234c40f22c 100644
--- a/mindspore/lite/tools/converter/quantizer/weight_quantizer.cc
+++ b/mindspore/lite/tools/converter/quantizer/weight_quantizer.cc
@@ -35,6 +35,10 @@ WeightQuantizer::WeightQuantizer(FuncGraphPtr graph, const converter::Flags &con
   this->config_file_ = config.configFile;
   auto quant_size = config.quantWeightSize;
   this->bit_num_ = config.bitNum;
+  if (this->bit_num_ == 0) {
+    type_id_ = kNumberTypeInt16;
+    this->is_mixed_bit_ = true;
+  }
   auto convQuantWeightChannelThreshold = config.quantWeightChannel;
   quant_strategy_ = std::make_unique<QuantStrategy>(quant_size, convQuantWeightChannelThreshold);
   quant_max_ = (1 << (unsigned int)(this->bit_num_ - 1)) - 1;
@@ -75,7 +79,7 @@ STATUS WeightQuantizer::SetAbstract(const tensor::TensorPtr &tensor_info, const
   auto quant_param_holder = GetCNodeQuantHolder(primitive);
   quant_param_holder->set_quant_type(schema::QuantType_QUANT_WEIGHT);
 
-  weight_quantized_tensors.insert({tensor_info, param_node});
+  weight_quantized_tensors_.insert({tensor_info, param_node});
   return RET_OK;
 }
 
@@ -105,12 +109,15 @@ STATUS WeightQuantizer::DoConvQuantize(const CNodePtr &cnode) {
     return RET_OK;
   }
   auto status = RET_ERROR;
-  if (type_id_ == kNumberTypeInt8) {
-    status = QuantFilter<int8_t>(tensor_info, primitive, QuantType_WeightQuant, quant_max_, quant_min_, bit_num_, true,
-                                 type_id_);
+  if (is_mixed_bit_) {
+    type_id_ = kNumberTypeInt16;
+    status = QuantFilter(tensor_info, primitive, QuantType_WeightQuant, WeightQuantType::MIXED_BIT_PER_LAYER, type_id_);
+  } else if (type_id_ == kNumberTypeInt8) {
+    status = QuantFilter<int8_t>(tensor_info, primitive, QuantType_WeightQuant, quant_max_, quant_min_, bit_num_,
+                                 WeightQuantType::FIXED_BIT_PER_CHANNEL, type_id_);
   } else if (type_id_ == kNumberTypeInt16) {
-    status = QuantFilter<int16_t>(tensor_info, primitive, QuantType_WeightQuant, quant_max_, quant_min_, bit_num_, true,
-                                  type_id_);
+    status = QuantFilter<int16_t>(tensor_info, primitive, QuantType_WeightQuant, quant_max_, quant_min_, bit_num_,
+                                  WeightQuantType::FIXED_BIT_PER_CHANNEL, type_id_);
   }
   if (status == RET_CONTINUE) {
     return RET_OK;
@@ -142,16 +149,19 @@ STATUS WeightQuantizer::DoMulQuantize(const CNodePtr &cnode) {
           }
 
           auto status = RET_ERROR;
-          auto per_channel = true;
-          if (i == kInputSize2) {
-            per_channel = false;
+          auto weight_quant_type = WeightQuantType::FIXED_BIT_PER_CHANNEL;
+          if (i == 3) {
+            weight_quant_type = WeightQuantType::FIXED_BIT_PER_LAYER;
           }
-          if (type_id_ == kNumberTypeInt8) {
+          if (is_mixed_bit_) {
+            status = QuantFilter(tensor_info, primitive, QuantType_WeightQuant, WeightQuantType::MIXED_BIT_PER_LAYER,
+                                 type_id_, i - 1);
+          } else if (type_id_ == kNumberTypeInt8) {
             status = QuantFilter<int8_t>(tensor_info, primitive, QuantType_WeightQuant, quant_max_, quant_min_,
-                                         bit_num_, per_channel, type_id_, i - 1);
+                                         bit_num_, weight_quant_type, type_id_, i - 1);
           } else if (type_id_ == kNumberTypeInt16) {
             status = QuantFilter<int16_t>(tensor_info, primitive, QuantType_WeightQuant, quant_max_, quant_min_,
-                                          bit_num_, per_channel, type_id_, i - 1);
+                                          bit_num_, weight_quant_type, type_id_, i - 1);
           }
           if (status == RET_CONTINUE) {
             continue;
@@ -224,12 +234,15 @@ STATUS WeightQuantizer::DoGatherQuantize(const CNodePtr &cnode) {
   }
 
   auto status = RET_ERROR;
-  if (type_id_ == kNumberTypeInt8) {
-    status = QuantFilter<int8_t>(tensor_info, primitive, QuantType_WeightQuant, quant_max_, quant_min_, bit_num_, false,
-                                 type_id_, 0);
+  if (is_mixed_bit_) {
+    status =
+      QuantFilter(tensor_info, primitive, QuantType_WeightQuant, WeightQuantType::MIXED_BIT_PER_LAYER, type_id_, 0);
+  } else if (type_id_ == kNumberTypeInt8) {
+    status = QuantFilter<int8_t>(tensor_info, primitive, QuantType_WeightQuant, quant_max_, quant_min_, bit_num_,
+                                 WeightQuantType::FIXED_BIT_PER_LAYER, type_id_, 0);
   } else if (type_id_ == kNumberTypeInt16) {
     status = QuantFilter<int16_t>(tensor_info, primitive, QuantType_WeightQuant, quant_max_, quant_min_, bit_num_,
-                                  false, type_id_, 0);
+                                  WeightQuantType::FIXED_BIT_PER_LAYER, type_id_, 0);
   }
   if (status == RET_CONTINUE) {
     return RET_OK;
@@ -274,10 +287,10 @@ STATUS WeightQuantizer::DoOptimizerQuantize(const CNodePtr &cnode) {
     auto status = RET_ERROR;
     if (type_id_ == kNumberTypeInt8) {
       status = QuantFilter<int8_t>(tensor_info, primitive, QuantType_WeightQuant, quant_max_, quant_min_, bit_num_,
-                                   false, type_id_, idx - 1);
+                                   WeightQuantType::FIXED_BIT_PER_LAYER, type_id_, idx - 1);
     } else if (type_id_ == kNumberTypeInt16) {
       status = QuantFilter<int16_t>(tensor_info, primitive, QuantType_WeightQuant, quant_max_, quant_min_, bit_num_,
-                                    false, type_id_, idx - 1);
+                                    WeightQuantType::FIXED_BIT_PER_LAYER, type_id_, idx - 1);
     }
     if (status != RET_OK && status != RET_CONTINUE) {
       MS_LOG(ERROR) << "QuantFilter failed : " << status;
@@ -311,8 +324,8 @@ STATUS WeightQuantizer::DoMarkWeightQuantizeIfQuantized(const CNodePtr &cnode) {
       ParameterPtr param_node;
       tensor::TensorPtr tensor_info;
       GetLiteParameter(inputNode, &param_node, &tensor_info);
-      auto param = weight_quantized_tensors.find(tensor_info);
-      if (param != weight_quantized_tensors.end()) {
+      auto param = weight_quantized_tensors_.find(tensor_info);
+      if (param != weight_quantized_tensors_.end()) {
         quant_param_holder->set_quant_type(schema::QuantType_QUANT_WEIGHT);
         continue;
       }
@@ -343,12 +356,15 @@ STATUS WeightQuantizer::ProcessLstmWeightByIndex(const CNodePtr &cnode, const Pr
     return RET_OK;
   }
   auto status = RET_ERROR;
-  if (type_id_ == kNumberTypeInt8) {
-    status = QuantFilter<int8_t>(tensor_info, primitive, QuantType_WeightQuant, quant_max_, quant_min_, bit_num_, true,
-                                 type_id_, index - 1);
+  if (is_mixed_bit_) {
+    status = QuantFilter(tensor_info, primitive, QuantType_WeightQuant, WeightQuantType::MIXED_BIT_PER_LAYER, type_id_,
+                         index - 1);
+  } else if (type_id_ == kNumberTypeInt8) {
+    status = QuantFilter<int8_t>(tensor_info, primitive, QuantType_WeightQuant, quant_max_, quant_min_, bit_num_,
+                                 WeightQuantType::FIXED_BIT_PER_CHANNEL, type_id_, index - 1);
   } else if (type_id_ == kNumberTypeInt16) {
-    status = QuantFilter<int16_t>(tensor_info, primitive, QuantType_WeightQuant, quant_max_, quant_min_, bit_num_, true,
-                                  type_id_, index - 1);
+    status = QuantFilter<int16_t>(tensor_info, primitive, QuantType_WeightQuant, quant_max_, quant_min_, bit_num_,
+                                  WeightQuantType::FIXED_BIT_PER_CHANNEL, type_id_, index - 1);
   }
   if (status == RET_CONTINUE) {
     return RET_OK;
@@ -559,10 +575,10 @@ STATUS WeightQuantizer::TryQuant(const int &bit_num_t, const ParameterPtr &param
 
   if (type_id_ == TypeId::kNumberTypeInt8) {
     status = QuantFilter<int8_t>(tensor_info, primitive, QuantType::QuantType_WeightQuant, quant_max_t, quant_min_t,
-                                 bit_num_t, true, type_id_);
+                                 bit_num_t, WeightQuantType::FIXED_BIT_PER_CHANNEL, type_id_);
   } else if (type_id_ == TypeId::kNumberTypeInt16) {
     status = QuantFilter<int16_t>(tensor_info, primitive, QuantType::QuantType_WeightQuant, quant_max_t, quant_min_t,
-                                  bit_num_t, true, type_id_);
+                                  bit_num_t, WeightQuantType::FIXED_BIT_PER_CHANNEL, type_id_);
   } else {
     MS_LOG(ERROR) << "unexpected type_id_: " << type_id_;
     return RET_ERROR;
@@ -727,7 +743,7 @@ STATUS WeightQuantizer::DoMixedQuant(const FuncGraphPtr &func_graph) {
 
 STATUS WeightQuantizer::DoFixedQuant(const FuncGraphPtr &func_graph) {
   MS_ASSERT(func_graph != nullptr);
-  weight_quantized_tensors.clear();
+  weight_quantized_tensors_.clear();
 
   for (auto &cnode : func_graph->GetOrderedCnodes()) {
     auto primitive = GetValueNode<std::shared_ptr<ops::PrimitiveC>>(cnode->input(0));
diff --git a/mindspore/lite/tools/converter/quantizer/weight_quantizer.h b/mindspore/lite/tools/converter/quantizer/weight_quantizer.h
index 9b21f71cefb..aa8e260209d 100644
--- a/mindspore/lite/tools/converter/quantizer/weight_quantizer.h
+++ b/mindspore/lite/tools/converter/quantizer/weight_quantizer.h
@@ -41,9 +41,9 @@ class WeightQuantizer : public Quantizer {
   ~WeightQuantizer() override;
 
   STATUS DoQuantize(FuncGraphPtr func_graph) override;
-  STATUS DoConvQuantize(const CNodePtr &);
-  STATUS DoMulQuantize(const CNodePtr &);
-  STATUS DoOptimizerQuantize(const CNodePtr &);
+  STATUS DoConvQuantize(const CNodePtr &cnode);
+  STATUS DoMulQuantize(const CNodePtr &cnode);
+  STATUS DoOptimizerQuantize(const CNodePtr &cnode);
   STATUS DoLstmQuantize(const CNodePtr &cnode);
   STATUS DoGatherQuantize(const CNodePtr &cnode);
 
@@ -58,10 +58,11 @@ class WeightQuantizer : public Quantizer {
   std::unique_ptr<QuantStrategy> quant_strategy_;
   size_t bit_num_{8};
   std::string config_file_;
-  std::map<tensor::TensorPtr, ParameterPtr> weight_quantized_tensors;
+  std::map<tensor::TensorPtr, ParameterPtr> weight_quantized_tensors_;
   PostQuantConfig config_param_;
   std::vector<std::vector<std::string>> images_;  // multi_input, [[mode_input_0], [model_input_1]...]
   std::vector<std::unordered_map<std::string, mindspore::tensor::MSTensor *>> fp32_output_tensors_;
+  bool is_mixed_bit_ = false;
 
   STATUS DoMixedQuant(const FuncGraphPtr &);
   STATUS SetAbstract(const tensor::TensorPtr &tensor_info, const ParameterPtr &param_node,
@@ -78,7 +79,6 @@ class WeightQuantizer : public Quantizer {
   STATUS TryQuant(const int &bit_num_t, const ParameterPtr &param_node, const tensor::TensorPtr &tensor_info,
                   const PrimitivePtr &primitive);
   STATUS DoQuantSearch(const FuncGraphPtr &func_graph);
-  STATUS DoTensorQuantize(const CNodePtr &);
 };
 }  // namespace mindspore::lite::quant
 #endif  // MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_WEIGHT_QUANTIZER_H_
diff --git a/mindspore/lite/tools/converter/registry/CMakeLists.txt b/mindspore/lite/tools/converter/registry/CMakeLists.txt
index ca6c0ddb445..c254482534c 100644
--- a/mindspore/lite/tools/converter/registry/CMakeLists.txt
+++ b/mindspore/lite/tools/converter/registry/CMakeLists.txt
@@ -13,11 +13,11 @@ set(REG_SRC ${CONVERT_REG_SRC}
         ${KERNEL_REG_DIR}/../tensor.cc
         ${KERNEL_REG_DIR}/../runtime/inner_allocator.cc
         ${KERNEL_REG_DIR}/../common/string_util.cc
+        ${KERNEL_REG_DIR}/../common/lite_utils.cc
         ${CORE_DIR}/utils/log_adapter.cc
         ${CORE_DIR}/utils/status.cc
         ${CORE_DIR}/gvar/log_adapter_common.cc
-        ${CORE_DIR}/gvar/logging_level.cc
-        ${CMAKE_CURRENT_SOURCE_DIR}/../dump_graph.cc)
+        ${CORE_DIR}/gvar/logging_level.cc)
 set_property(SOURCE ${REG_SRC} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_LITE)
 add_library(mslite_converter_plugin SHARED ${REG_SRC})
 target_link_libraries(mslite_converter_plugin mindspore::glog)
diff --git a/mindspore/lite/tools/converter/registry/model_parser_registry.cc b/mindspore/lite/tools/converter/registry/model_parser_registry.cc
index 975df7a09e0..2e67d816d13 100644
--- a/mindspore/lite/tools/converter/registry/model_parser_registry.cc
+++ b/mindspore/lite/tools/converter/registry/model_parser_registry.cc
@@ -15,36 +15,30 @@
  */
 
 #include "include/registry/model_parser_registry.h"
-#include <string>
-#include <set>
-#include <unordered_map>
-#include "include/errorcode.h"
+#include <map>
 #include "src/common/log_adapter.h"
 
 namespace mindspore {
-namespace lite {
-ModelParserRegistry *ModelParserRegistry::GetInstance() {
-  static ModelParserRegistry instance;
-  return &instance;
+namespace registry {
+namespace {
+std::map<FmkType, ModelParserCreator> model_parser_room;
+}  // namespace
+
+ModelParserRegistry::ModelParserRegistry(FmkType fmk, ModelParserCreator creator) {
+  if (fmk < converter::kFmkTypeTf || fmk > converter::kFmkTypeTflite) {
+    MS_LOG(ERROR) << "ILLEGAL FMK: fmk must be in FmkType.";
+    return;
+  }
+  model_parser_room[fmk] = creator;
 }
 
-ModelParser *ModelParserRegistry::GetModelParser(const FmkType fmk) {
-  auto it = parsers_.find(fmk);
-  if (it != parsers_.end()) {
+converter::ModelParser *ModelParserRegistry::GetModelParser(FmkType fmk) {
+  auto it = model_parser_room.find(fmk);
+  if (it != model_parser_room.end()) {
     auto creator = it->second;
     return creator();
   }
   return nullptr;
 }
-
-int ModelParserRegistry::RegParser(const FmkType fmk, ModelParserCreator creator) {
-  if (fmk < converter::FmkType_TF || fmk > converter::FmkType_TFLITE) {
-    MS_LOG(ERROR) << "ILLEGAL FMK: fmk must be in FmkType.";
-    return RET_ERROR;
-  }
-  auto instance = ModelParserRegistry::GetInstance();
-  instance->parsers_[fmk] = creator;
-  return RET_OK;
-}
-}  // namespace lite
+}  // namespace registry
 }  // namespace mindspore
diff --git a/mindspore/lite/tools/converter/registry/pass_content.h b/mindspore/lite/tools/converter/registry/pass_content.h
deleted file mode 100644
index b184f8b4af0..00000000000
--- a/mindspore/lite/tools/converter/registry/pass_content.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_REGISTRY_PASS_CONTENT_H
-#define MINDSPORE_LITE_TOOLS_CONVERTER_REGISTRY_PASS_CONTENT_H
-
-#include <map>
-#include <string>
-#include <vector>
-#include "include/registry/pass_registry.h"
-
-namespace mindspore {
-namespace opt {
-std::map<std::string, PassPtr> &MS_API PassStoreRoomInfo();
-std::map<PassPosition, std::vector<std::string>> &MS_API ExternalAssignedPassesInfo();
-}  // namespace opt
-}  // namespace mindspore
-
-#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_REGISTRY_PASS_CONTENT_H
diff --git a/mindspore/lite/tools/converter/registry/pass_registry.cc b/mindspore/lite/tools/converter/registry/pass_registry.cc
index d377c54d450..6e2c0dc6ad6 100644
--- a/mindspore/lite/tools/converter/registry/pass_registry.cc
+++ b/mindspore/lite/tools/converter/registry/pass_registry.cc
@@ -19,16 +19,15 @@
 #include <mutex>
 #include <string>
 #include <vector>
-#include "tools/converter/registry/pass_content.h"
 #include "src/common/log_adapter.h"
 
 namespace mindspore {
-namespace opt {
+namespace registry {
 namespace {
-std::map<std::string, PassPtr> pass_store_room;
-std::map<PassPosition, std::vector<std::string>> external_assigned_passes;
+std::map<std::string, opt::PassPtr> pass_store_room;
+std::map<registry::PassPosition, std::vector<std::string>> external_assigned_passes;
 std::mutex pass_mutex;
-void RegPass(const std::string &pass_name, const PassPtr &pass) {
+void RegPass(const std::string &pass_name, const opt::PassPtr &pass) {
   if (pass == nullptr) {
     MS_LOG(ERROR) << "pass is nullptr.";
     return;
@@ -38,15 +37,27 @@ void RegPass(const std::string &pass_name, const PassPtr &pass) {
 }
 }  // namespace
 
-PassRegistry::PassRegistry(const std::string &pass_name, const PassPtr &pass) { RegPass(pass_name, pass); }
+PassRegistry::PassRegistry(const std::string &pass_name, const opt::PassPtr &pass) { RegPass(pass_name, pass); }
 
-PassRegistry::PassRegistry(PassPosition position, const std::vector<std::string> &assigned) {
+PassRegistry::PassRegistry(PassPosition position, const std::vector<std::string> &names) {
   std::unique_lock<std::mutex> lock(pass_mutex);
-  external_assigned_passes[position] = assigned;
+  external_assigned_passes[position] = names;
 }
 
-std::map<std::string, PassPtr> &PassStoreRoomInfo() { return pass_store_room; }
+std::vector<std::string> PassRegistry::GetOuterScheduleTask(PassPosition position) {
+  return external_assigned_passes[position];
+}
 
-std::map<PassPosition, std::vector<std::string>> &ExternalAssignedPassesInfo() { return external_assigned_passes; }
-}  // namespace opt
+std::vector<opt::PassPtr> PassRegistry::GetPassFromStoreRoom(const std::vector<std::string> &pass_names) {
+  std::vector<opt::PassPtr> schedule_passes;
+  for (auto &name : pass_names) {
+    auto iter = pass_store_room.find(name);
+    if (iter == pass_store_room.end()) {
+      continue;
+    }
+    schedule_passes.push_back(iter->second);
+  }
+  return schedule_passes;
+}
+}  // namespace registry
 }  // namespace mindspore
diff --git a/mindspore/lite/tools/cropper/build_cropper_config.sh b/mindspore/lite/tools/cropper/build_cropper_config.sh
index feba431a9ab..f2e1fe6a1f9 100644
--- a/mindspore/lite/tools/cropper/build_cropper_config.sh
+++ b/mindspore/lite/tools/cropper/build_cropper_config.sh
@@ -118,8 +118,6 @@ getCommonFile() {
     mindspore/lite/src/runtime/infer_manager.h
     mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/infer_register.h
     mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/nnacl_utils.h
-    mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/pack.h
-    mindspore/lite/src/runtime/kernel/arm/fp16/common_fp16.h
     mindspore/lite/src/ops/populate/populate_register.h
     mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/op_base.h
     mindspore/core/ir/dtype/type_id.h
@@ -129,7 +127,6 @@ getCommonFile() {
     mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions_fp16.h
     mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/infer.h
     mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/tensor_c.h
-    mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/common_infer.h
     mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/errorcode.h
   )
   all_files_h=("${include_h[@]}" "${regist_include_h[@]}" "${src_files_h[@]}" "${common_files_h[@]}" "${runtime_files_h[@]}" "${others_files_h[@]}")
@@ -163,12 +160,10 @@ getCommonFile() {
   while IFS='' read -r line; do assembly_files+=("$line"); done < <(ls mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/*/*.S)
   others_files_c=(
     mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/nnacl_utils.c
-    mindspore/lite/src/runtime/kernel/arm/fp16/common_fp16.cc
     mindspore/lite/src/runtime/infer_manager.cc
     mindspore/lite/src/ops/populate/populate_register.cc
     mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/infer_register.c
     mindspore/core/utils/status.cc
-    mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/common_infer.c
   )
   all_files=("${src_files[@]}" "${regist_files[@]}" "${common_files[@]}" "${runtime_files_cc[@]}"
     "${others_files_c[@]}" "${assembly_files[@]}" "${mindrt_files[@]}"
diff --git a/mindspore/lite/tools/optimizer/common/format_utils.cc b/mindspore/lite/tools/optimizer/common/format_utils.cc
index b21d600b7a0..61a32d0bac7 100644
--- a/mindspore/lite/tools/optimizer/common/format_utils.cc
+++ b/mindspore/lite/tools/optimizer/common/format_utils.cc
@@ -136,9 +136,9 @@ STATUS GetTransposePerm(const CNodePtr &cnode, std::vector<int> *perm) {
   lite::DataInfo data_info;
   int status;
   if (utils::isa<ParameterPtr>(cnode->input(kInputIndexTwo))) {
-    status = lite::FetchDataFromParameterNode(cnode, kInputIndexTwo, lite::converter::FmkType_MS, false, &data_info);
+    status = lite::FetchDataFromParameterNode(cnode, kInputIndexTwo, converter::kFmkTypeMs, false, &data_info);
   } else {
-    status = lite::FetchDataFromValueNode(cnode, kInputIndexTwo, lite::converter::FmkType_MS, false, &data_info);
+    status = lite::FetchDataFromValueNode(cnode, kInputIndexTwo, converter::kFmkTypeMs, false, &data_info);
   }
   if (status != lite::RET_OK) {
     MS_LOG(ERROR) << "fetch transpose perm data failed.";
diff --git a/mindspore/lite/tools/optimizer/fisson/fisson_util.cc b/mindspore/lite/tools/optimizer/fisson/fisson_util.cc
index f9c2d654294..fe4f1f969d0 100644
--- a/mindspore/lite/tools/optimizer/fisson/fisson_util.cc
+++ b/mindspore/lite/tools/optimizer/fisson/fisson_util.cc
@@ -26,7 +26,7 @@
 #include "tools/optimizer/parallel/split_strategy.h"
 #include "nnacl/op_base.h"
 
-using mindspore::lite::converter::FmkType;
+using mindspore::converter::FmkType;
 namespace mindspore {
 namespace opt {
 std::vector<int64_t> GetSplitPadList(const std::shared_ptr<ops::Conv2DFusion> &ori_conv_prim, int64_t input_h,
@@ -172,8 +172,8 @@ bool UpdateSplitInfo(const FuncGraphPtr &func_graph, const std::vector<AnfNodePt
   }
   auto splited_axis = split_info->axis;
   // need to check
-  if (split_info->fmk_type == FmkType::FmkType_CAFFE ||
-      split_info->fmk_type == FmkType::FmkType_ONNX) {  // NHWC -> NCHW
+  if (split_info->fmk_type == FmkType::kFmkTypeCaffe ||
+      split_info->fmk_type == FmkType::kFmkTypeOnnx) {  // NHWC -> NCHW
     splited_axis += 1;
   }
 
diff --git a/mindspore/lite/tools/optimizer/fisson/multi_conv_split_pass.cc b/mindspore/lite/tools/optimizer/fisson/multi_conv_split_pass.cc
index ae315cfcba1..df9910a682a 100644
--- a/mindspore/lite/tools/optimizer/fisson/multi_conv_split_pass.cc
+++ b/mindspore/lite/tools/optimizer/fisson/multi_conv_split_pass.cc
@@ -23,7 +23,7 @@
 #include "tools/optimizer/common/gllo_utils.h"
 #include "tools/optimizer/parallel/split_strategy.h"
 
-using mindspore::lite::converter::FmkType;
+using mindspore::converter::FmkType;
 using mindspore::schema::PrimitiveType_Conv2dTransposeFusion;
 namespace mindspore {
 namespace opt {
diff --git a/mindspore/lite/tools/optimizer/format/conv_weight_format.cc b/mindspore/lite/tools/optimizer/format/conv_weight_format.cc
deleted file mode 100644
index 0141aca177d..00000000000
--- a/mindspore/lite/tools/optimizer/format/conv_weight_format.cc
+++ /dev/null
@@ -1,129 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "tools/optimizer/format/conv_weight_format.h"
-#include <vector>
-#include "tools/common/tensor_util.h"
-#include "tools/converter/parser/parser_utils.h"
-
-namespace mindspore {
-namespace opt {
-namespace {
-constexpr size_t kConvWeightIndex = 2;
-}  // namespace
-STATUS ConvWeightFormatBase::ConvWeightFormatTrans(const FuncGraphPtr &graph) {
-  MS_ASSERT(graph != nullptr);
-  auto node_list = TopoSort(graph->get_return());
-  for (auto &node : node_list) {
-    if (!utils::isa<CNodePtr>(node)) {
-      continue;
-    }
-    auto cnode = node->cast<CNodePtr>();
-    if (CheckPrimitiveType(node, prim::kPrimIf) || CheckPrimitiveType(node, prim::kPrimWhile)) {
-      auto sub_func_graph = GetValueNode<FuncGraphPtr>(cnode->input(1));
-      if (sub_func_graph == nullptr) {
-        lite::ReturnCode::GetSingleReturnCode()->UpdateReturnCode(lite::RET_NULL_PTR);
-        return false;
-      }
-      if (ConvWeightFormatTrans(sub_func_graph) != lite::RET_OK) {
-        MS_LOG(ERROR) << "transform conv weight format failed.";
-        return lite::RET_ERROR;
-      }
-      sub_func_graph = GetValueNode<FuncGraphPtr>(cnode->input(kInputIndexTwo));
-      if (sub_func_graph == nullptr) {
-        lite::ReturnCode::GetSingleReturnCode()->UpdateReturnCode(lite::RET_NULL_PTR);
-        return false;
-      }
-      if (ConvWeightFormatTrans(sub_func_graph) != lite::RET_OK) {
-        MS_LOG(ERROR) << "transform conv weight format failed.";
-        return lite::RET_ERROR;
-      }
-      continue;
-    }
-    if (!CheckPrimitiveType(node, prim::kPrimConv2DFusion) &&
-        !CheckPrimitiveType(node, opt::kPrimConv2DBackpropInputFusion) &&
-        !CheckPrimitiveType(node, prim::kPrimConv2dTransposeFusion)) {
-      continue;
-    }
-    MS_ASSERT(cnode->inputs().size() > kConvWeightIndex);
-    auto weight_node = cnode->input(kConvWeightIndex);
-    MS_ASSERT(weight_node != nullptr);
-    if (utils::isa<CNodePtr>(weight_node)) {
-      if (lite::HandleWeightConst(graph, cnode, weight_node->cast<CNodePtr>(), src_format_, dst_format_) !=
-          lite::RET_OK) {
-        MS_LOG(ERROR) << "handle cnode weight failed.";
-        return RET_ERROR;
-      }
-      continue;
-    }
-    if (TransferConvWeight(weight_node) != lite::RET_OK) {
-      MS_LOG(ERROR) << "transfer weight format failed.";
-      return lite::RET_ERROR;
-    }
-    if (utils::isa<Parameter>(weight_node)) {
-      if (lite::HandleWeightSharing(graph, dst_format_, weight_node->cast<ParameterPtr>(), src_format_, dst_format_) !=
-          lite::RET_OK) {
-        MS_LOG(ERROR) << "handle weight-sharing failed.";
-        return RET_ERROR;
-      }
-    }
-  }
-  return RET_OK;
-}
-
-STATUS ConvWeightFormatBase::TransferConvWeight(const AnfNodePtr &weight_node) {
-  MS_ASSERT(weight_node != nullptr);
-  auto weight_value = GetTensorInfo(weight_node);
-  if (weight_value == nullptr) {
-    MS_LOG(ERROR) << "weight node must const value";
-    return lite::RET_ERROR;
-  }
-  auto status = TransFilterFormat(weight_value, src_format_, dst_format_);
-  if (status != lite::RET_OK) {
-    MS_LOG(ERROR) << "trans conv weight failed.";
-    return lite::RET_ERROR;
-  }
-  auto type_id = static_cast<TypeId>(weight_value->data_type());
-  auto shape = weight_value->shape();
-  std::vector<int64_t> shape_vector(shape.begin(), shape.end());
-  auto abstract = lite::CreateTensorAbstract(shape_vector, type_id);
-  if (abstract == nullptr) {
-    MS_LOG(ERROR) << "Create tensor abstarct failed";
-    return lite::RET_ERROR;
-  }
-  weight_node->set_abstract(abstract);
-  return lite::RET_OK;
-}
-
-bool ConvWeightFormatBase::Run(const FuncGraphPtr &graph) {
-  MS_ASSERT(graph != nullptr);
-  if (src_format_ == dst_format_) {
-    return true;
-  }
-  auto manager = Manage(graph, true);
-  if (manager == nullptr) {
-    MS_LOG(ERROR) << "manager is nullptr.";
-    return false;
-  }
-  auto status = ConvWeightFormatTrans(graph);
-  if (status != lite::RET_OK) {
-    MS_LOG(ERROR) << "Conv2D weight FormatTrans failed: " << status;
-    return false;
-  }
-  return true;
-}
-}  // namespace opt
-}  // namespace mindspore
diff --git a/mindspore/lite/tools/optimizer/format/conv_weight_format.h b/mindspore/lite/tools/optimizer/format/conv_weight_format.h
deleted file mode 100644
index c05164e02f5..00000000000
--- a/mindspore/lite/tools/optimizer/format/conv_weight_format.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_LITE_TOOLS_OPTIMIZER_FORMAT_CONV_WEIGHT_FORMAT_H_
-#define MINDSPORE_LITE_TOOLS_OPTIMIZER_FORMAT_CONV_WEIGHT_FORMAT_H_
-
-#include <string>
-#include "backend/optimizer/common/pass.h"
-#include "tools/optimizer/common/gllo_utils.h"
-
-namespace mindspore {
-namespace opt {
-class ConvWeightFormatBase : public Pass {
- public:
-  explicit ConvWeightFormatBase(const std::string &name = "ConvWeightFormatBase") : Pass(name) {}
-  ~ConvWeightFormatBase() override = default;
-  bool Run(const FuncGraphPtr &graph) override;
-
- private:
-  STATUS ConvWeightFormatTrans(const FuncGraphPtr &graph);
-  STATUS TransferConvWeight(const AnfNodePtr &weight_node);
-
- protected:
-  schema::Format src_format_{schema::Format_KHWC};
-  schema::Format dst_format_{schema::Format_KHWC};
-};
-
-class ConvWeightToKHWC : public ConvWeightFormatBase {
- public:
-  ConvWeightToKHWC() : ConvWeightFormatBase("ConvWeightToKHWC") { src_format_ = schema::Format_KCHW; }
-  ~ConvWeightToKHWC() override = default;
-};
-
-class ConvWeightToKCHW : public ConvWeightFormatBase {
- public:
-  ConvWeightToKCHW() : ConvWeightFormatBase("ConvWeightToKCHW") { dst_format_ = schema::Format_KCHW; }
-  ~ConvWeightToKCHW() override = default;
-};
-}  // namespace opt
-}  // namespace mindspore
-#endif  // MINDSPORE_LITE_TOOLS_OPTIMIZER_FORMAT_CONV_WEIGHT_FORMAT_H_
diff --git a/mindspore/lite/tools/optimizer/format/to_format_base.h b/mindspore/lite/tools/optimizer/format/to_format_base.h
index 03a214697f5..6c6765c9f41 100644
--- a/mindspore/lite/tools/optimizer/format/to_format_base.h
+++ b/mindspore/lite/tools/optimizer/format/to_format_base.h
@@ -27,12 +27,12 @@
 #include "tools/optimizer/common/format_utils.h"
 #include "tools/optimizer/graph/infershape_pass.h"
 
-using mindspore::lite::converter::FmkType;
+using mindspore::converter::FmkType;
 namespace mindspore {
 namespace opt {
 class ToFormatBase : public Pass {
  public:
-  explicit ToFormatBase(FmkType fmk_type = lite::converter::FmkType_MS, bool train_flag = false,
+  explicit ToFormatBase(FmkType fmk_type = converter::kFmkTypeMs, bool train_flag = false,
                         std::string pass_name = "to_format_base")
       : Pass(pass_name), fmk_type_(fmk_type), train_flag_(train_flag) {}
   ~ToFormatBase() override = default;
@@ -56,7 +56,7 @@ class ToFormatBase : public Pass {
   virtual bool DecideWhetherInferShapeForNewNode() { return true; }
   virtual STATUS DecideConvWeightSrcAndDstFormat(const CNodePtr &cnode, schema::Format *src_format,
                                                  schema::Format *dst_format) = 0;
-  FmkType fmk_type_{lite::converter::FmkType_MS};
+  FmkType fmk_type_{converter::kFmkTypeMs};
   bool train_flag_{false};
   mindspore::Format format_{mindspore::NHWC};
   std::shared_ptr<NodeInferShape> node_infer_shape_{nullptr};
diff --git a/mindspore/lite/tools/optimizer/format/to_nchw_format.h b/mindspore/lite/tools/optimizer/format/to_nchw_format.h
index d2e2d000ea6..93a8d344008 100644
--- a/mindspore/lite/tools/optimizer/format/to_nchw_format.h
+++ b/mindspore/lite/tools/optimizer/format/to_nchw_format.h
@@ -23,7 +23,7 @@ namespace mindspore {
 namespace opt {
 class ToNCHWFormat : public ToFormatBase {
  public:
-  explicit ToNCHWFormat(FmkType fmk_type = lite::converter::FmkType_MS, bool train_flag = false)
+  explicit ToNCHWFormat(FmkType fmk_type = converter::kFmkTypeMs, bool train_flag = false)
       : ToFormatBase(fmk_type, train_flag, "to_nchw_format") {
     format_ = mindspore::NCHW;
   }
diff --git a/mindspore/lite/tools/optimizer/format/to_nhwc_format.h b/mindspore/lite/tools/optimizer/format/to_nhwc_format.h
index d16b861b6fc..2c40967629f 100644
--- a/mindspore/lite/tools/optimizer/format/to_nhwc_format.h
+++ b/mindspore/lite/tools/optimizer/format/to_nhwc_format.h
@@ -23,7 +23,7 @@ namespace mindspore {
 namespace opt {
 class ToNHWCFormat : public ToFormatBase {
  public:
-  explicit ToNHWCFormat(FmkType fmk_type = lite::converter::FmkType_MS, bool train_flag = false)
+  explicit ToNHWCFormat(FmkType fmk_type = converter::kFmkTypeMs, bool train_flag = false)
       : ToFormatBase(fmk_type, train_flag, "to_nhwc_format") {}
   ~ToNHWCFormat() = default;
 
diff --git a/mindspore/lite/tools/optimizer/fusion/constant_folding_fusion.cc b/mindspore/lite/tools/optimizer/fusion/constant_folding_fusion.cc
index 51d6780e40c..e2fc448ca33 100644
--- a/mindspore/lite/tools/optimizer/fusion/constant_folding_fusion.cc
+++ b/mindspore/lite/tools/optimizer/fusion/constant_folding_fusion.cc
@@ -53,7 +53,7 @@ void FreeTensors(std::vector<Tensor *> *input_tensor, std::vector<Tensor *> *out
   }
 }
 
-std::vector<Tensor *> GetCNodeInputTensors(const CNodePtr &cnode, lite::converter::FmkType fmk_type) {
+std::vector<Tensor *> GetCNodeInputTensors(const CNodePtr &cnode, converter::FmkType fmk_type) {
   MS_ASSERT(CNode != nullptr);
   std::vector<Tensor *> tensors;
   for (size_t i = 1; i < cnode->size(); ++i) {
diff --git a/mindspore/lite/tools/optimizer/fusion/constant_folding_fusion.h b/mindspore/lite/tools/optimizer/fusion/constant_folding_fusion.h
index ef60b12f9ac..5d8e726455e 100644
--- a/mindspore/lite/tools/optimizer/fusion/constant_folding_fusion.h
+++ b/mindspore/lite/tools/optimizer/fusion/constant_folding_fusion.h
@@ -31,7 +31,7 @@ namespace mindspore {
 namespace opt {
 class ConstFoldPass : public PatternProcessPass {
  public:
-  explicit ConstFoldPass(lite::converter::FmkType fmk_type = lite::converter::FmkType_MS, bool multigraph = true)
+  explicit ConstFoldPass(converter::FmkType fmk_type = converter::kFmkTypeMs, bool multigraph = true)
       : PatternProcessPass("constfold_pass", multigraph), fmk_type_(fmk_type) {
     context_ = std::make_shared<lite::InnerContext>();
     context_->Init();
@@ -41,7 +41,7 @@ class ConstFoldPass : public PatternProcessPass {
   const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
 
  private:
-  lite::converter::FmkType fmk_type_{lite::converter::FmkType_MS};
+  converter::FmkType fmk_type_{converter::kFmkTypeMs};
   std::shared_ptr<lite::InnerContext> context_{nullptr};
   std::shared_ptr<mindspore::Context> ms_context_{nullptr};
 };
diff --git a/mindspore/lite/tools/optimizer/fusion/conv_transform_fusion.h b/mindspore/lite/tools/optimizer/fusion/conv_transform_fusion.h
index e1ac64ff0d3..eccb90d4b52 100644
--- a/mindspore/lite/tools/optimizer/fusion/conv_transform_fusion.h
+++ b/mindspore/lite/tools/optimizer/fusion/conv_transform_fusion.h
@@ -21,7 +21,7 @@
 #include "backend/optimizer/common/optimizer.h"
 #include "tools/converter/converter_flags.h"
 
-using mindspore::lite::converter::FmkType;
+using mindspore::converter::FmkType;
 namespace mindspore::opt {
 class ConvTransformFusion : public PatternProcessPass {
  public:
@@ -37,7 +37,7 @@ class ConvTransformFusion : public PatternProcessPass {
   void SetFmkType(FmkType type) { this->fmk_type_ = type; }
 
  private:
-  FmkType fmk_type_ = lite::converter::FmkType_TF;
+  FmkType fmk_type_ = converter::kFmkTypeTf;
 };
 }  // namespace mindspore::opt
 #endif  // MINDSPORE_LITE_SRC_PASS_FUSION_CONV_TRANSFORM_FUSION_H_
diff --git a/mindspore/lite/tools/optimizer/fusion/gelu_fusion.cc b/mindspore/lite/tools/optimizer/fusion/gelu_fusion.cc
index 7cb4176b1a8..319b594f8fe 100644
--- a/mindspore/lite/tools/optimizer/fusion/gelu_fusion.cc
+++ b/mindspore/lite/tools/optimizer/fusion/gelu_fusion.cc
@@ -41,7 +41,7 @@ CNodePtr GeLUFusion::CreateGeLUNode(const FuncGraphPtr &func_graph, const AnfNod
 const float GeLUFusion::GetParameterValue(const EquivPtr &equiv, const VarPtr &input) const {
   MS_ASSERT(equiv != nullptr);
   MS_ASSERT(input != nullptr);
-  float value = -1;
+  const float value = -1;
   auto node = utils::cast<AnfNodePtr>((*equiv)[input]);
   if (node == nullptr || !utils::isa<ParameterPtr>(node)) {
     return value;
diff --git a/mindspore/lite/tools/optimizer/fusion/pooling_activation_fusion.cc b/mindspore/lite/tools/optimizer/fusion/pooling_activation_fusion.cc
deleted file mode 100644
index a97d40ab8bf..00000000000
--- a/mindspore/lite/tools/optimizer/fusion/pooling_activation_fusion.cc
+++ /dev/null
@@ -1,77 +0,0 @@
-/**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "tools/optimizer/fusion/pooling_activation_fusion.h"
-#include <memory>
-#include "src/ops/pooling.h"
-#include "src/ops/activation.h"
-#include "schema/inner/model_generated.h"
-#include "tools/optimizer/common/gllo_utils.h"
-
-namespace mindspore::opt {
-namespace {
-constexpr size_t kActivationInputsLength = 2;
-}
-const BaseRef PoolingActivationFusion::DefinePattern() const {
-  auto pooling_var = std::make_shared<CondVar>(IsPoolingNode);
-  auto prim = new (std::nothrow) schema::PrimitiveT();
-  if (prim == nullptr) {
-    MS_LOG(ERROR) << "new primitiveT failed";
-    return nullptr;
-  }
-  prim->value.type = primitive_type;
-  auto prim_value = std::make_shared<lite::PrimitiveC>(prim);
-  return VectorRef({prim_value, pooling_var});
-}
-
-const AnfNodePtr PoolingActivationFusion::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
-                                                  const EquivPtr &) const {
-  MS_ASSERT(func_graph != nullptr);
-  MS_ASSERT(node != nullptr);
-  MS_LOG(DEBUG) << "pooling activation pass process:" << schema::EnumNamesPrimitiveType()[primitive_type];
-  CheckIfFuncGraphIsNull(func_graph);
-  CheckIfAnfNodeIsNull(node);
-  auto act_node = node->cast<CNodePtr>();
-  CheckIfCNodeIsNull(act_node);
-  CheckInputSize(act_node, kActivationInputsLength);
-
-  auto primitivec = GetValueNode<std::shared_ptr<lite::PrimitiveC>>(act_node->input(0));
-  MS_ASSERT(utils::isa<std::shared_ptr<mindspore::lite::Activation>>(primitivec));
-  auto act_primitivec = utils::cast<std::shared_ptr<mindspore::lite::Activation>>(primitivec);
-  MS_ASSERT(act_primitivec != nullptr);
-  if (act_primitivec->GetType() != activation_type) {
-    return node;
-  }
-  AnfNodePtr pre_node = act_node->input(1);
-  CheckIfAnfNodeIsNull(pre_node);
-  if (pre_node != nullptr && pre_node->isa<CNode>()) {
-    if (IsMultiOutputTensors(func_graph, pre_node)) {
-      return node;
-    }
-    auto pooling_node = pre_node->cast<CNodePtr>();
-    auto primitive_c = GetValueNode<std::shared_ptr<lite::PrimitiveC>>(pooling_node->input(0));
-
-    MS_ASSERT(utils::isa<std::shared_ptr<mindspore::lite::Pooling>>(primitive_c));
-    auto primc = utils::cast<std::shared_ptr<mindspore::lite::Pooling>>(primitive_c);
-    MS_ASSERT(primc != nullptr);
-    if (primc->GetActivationType() == schema::ActivationType_NO_ACTIVATION) {
-      primc->SetActivationType(activation_type);
-      return pre_node;
-    }
-  }
-  return node;
-}
-}  // namespace mindspore::opt
diff --git a/mindspore/lite/tools/optimizer/fusion/quant_dtype_cast_fusion.cc b/mindspore/lite/tools/optimizer/fusion/quant_dtype_cast_fusion.cc
deleted file mode 100644
index e811f7361de..00000000000
--- a/mindspore/lite/tools/optimizer/fusion/quant_dtype_cast_fusion.cc
+++ /dev/null
@@ -1,48 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "tools/optimizer/fusion/quant_dtype_cast_fusion.h"
-#include <memory>
-#include "tools/optimizer/common/gllo_utils.h"
-namespace mindspore::opt {
-namespace {
-constexpr size_t kActivationInputsLength = 2;
-}
-const BaseRef QuantDtypeCastFusion::DefinePattern() const {
-  auto quant_var = std::make_shared<CondVar>(IsQuantNode);
-  auto input_var = std::make_shared<Var>();
-  return VectorRef({quant_var, input_var});
-}
-
-const AnfNodePtr QuantDtypeCastFusion::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
-                                               const EquivPtr &) const {
-  MS_ASSERT(func_graph != nullptr);
-  MS_ASSERT(node != nullptr);
-  MS_LOG(DEBUG) << "quant dtype cast fusion pass process";
-  if (CheckIfFuncGraphIsNull(func_graph) != lite::RET_OK || CheckIfAnfNodeIsNull(node) != lite::RET_OK) {
-    return nullptr;
-  }
-  auto act_node = node->cast<CNodePtr>();
-  if (CheckIfCNodeIsNull(act_node) != lite::RET_OK ||
-      CheckInputSize(act_node, kActivationInputsLength) != lite::RET_OK) {
-    return nullptr;
-  }
-  AnfNodePtr pre_node = act_node->input(1);
-  if (CheckIfAnfNodeIsNull(pre_node) != lite::RET_OK) {
-    return nullptr;
-  }
-  return pre_node;
-}
-}  // namespace mindspore::opt
diff --git a/mindspore/lite/tools/optimizer/fusion/quant_dtype_cast_fusion.h b/mindspore/lite/tools/optimizer/fusion/quant_dtype_cast_fusion.h
deleted file mode 100644
index b60153b99ce..00000000000
--- a/mindspore/lite/tools/optimizer/fusion/quant_dtype_cast_fusion.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef LITE_QUANT_DTYPE_CAST_FUSION_H
-#define LITE_QUANT_DTYPE_CAST_FUSION_H
-
-#include <string>
-#include "backend/optimizer/common/optimizer.h"
-
-namespace mindspore {
-namespace opt {
-class QuantDtypeCastFusion : public PatternProcessPass {
- public:
-  explicit QuantDtypeCastFusion(bool multigraph = true, const std::string &name = "quant_dtype_cast_fusion")
-      : PatternProcessPass(name, multigraph) {}
-  ~QuantDtypeCastFusion() override = default;
-  const BaseRef DefinePattern() const override;
-  const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
-};
-}  // namespace opt
-}  // namespace mindspore
-#endif  // LITE_QUANT_DTYPE_CAST_FUSION_H
diff --git a/mindspore/lite/tools/optimizer/graph/clip_convert_activation_pass.h b/mindspore/lite/tools/optimizer/graph/clip_convert_activation_pass.h
index e49705b4ec6..b73de3de91a 100644
--- a/mindspore/lite/tools/optimizer/graph/clip_convert_activation_pass.h
+++ b/mindspore/lite/tools/optimizer/graph/clip_convert_activation_pass.h
@@ -20,7 +20,7 @@
 #include "tools/converter/converter_flags.h"
 #include "backend/optimizer/common/pass.h"
 
-using mindspore::lite::converter::FmkType;
+using mindspore::converter::FmkType;
 using mindspore::schema::QuantType;
 namespace mindspore::opt {
 class ClipConvertActivationPass : public Pass {
diff --git a/mindspore/lite/tools/optimizer/graph/control_flow_pass.cc b/mindspore/lite/tools/optimizer/graph/control_flow_pass.cc
index 24972f4f6ac..ae32f8ed8c6 100644
--- a/mindspore/lite/tools/optimizer/graph/control_flow_pass.cc
+++ b/mindspore/lite/tools/optimizer/graph/control_flow_pass.cc
@@ -22,20 +22,9 @@
 #include "include/errorcode.h"
 #include "tools/optimizer/common/gllo_utils.h"
 #include "src/common/log_adapter.h"
+#include "tools/common/node_util.h"
 
 namespace mindspore::opt {
-ValueNodePtr ControlFlowPass::GetSwitchAnfPrim() {
-  auto switch_prim = std::make_shared<mindspore::ops::Switch>();
-  ValueNodePtr switch_anf_prim = NewValueNode(switch_prim);
-  return switch_anf_prim;
-}
-
-ValueNodePtr ControlFlowPass::GetPartialAnfPrim() {
-  auto partial_prim = std::make_shared<mindspore::ops::PartialFusion>();
-  ValueNodePtr partial_anf_prim = NewValueNode(partial_prim);
-  return partial_anf_prim;
-}
-
 void ControlFlowPass::ReplaceNode(const FuncGraphPtr &fg,
                                   const std::unordered_map<AnfNodePtr, AnfNodePtr> &replace_pairs) {
   for (auto &node : fg->nodes()) {
@@ -199,7 +188,7 @@ int ControlFlowPass::CreateAfterGraph(const FuncGraphPtr &main_fg, const std::ve
   *after_fg = std::make_shared<FuncGraph>();
   auto manager = main_fg->manager();
   manager->AddFuncGraph(*after_fg);
-  (*after_fg)->set_attr("fmk", MakeValue(static_cast<int>(lite::converter::FmkType_TF)));
+  (*after_fg)->set_attr("fmk", MakeValue(static_cast<int>(converter::kFmkTypeTf)));
   (*after_fg)->set_attr("graph_name", MakeValue(aim_cnode->fullname_with_scope() + "_after_fg"));
   (*after_fg)->set_manager(main_fg->manager());
 
@@ -211,7 +200,9 @@ int ControlFlowPass::CreateAfterGraph(const FuncGraphPtr &main_fg, const std::ve
       continue;
     }
     (*after_fg)->AddNode(cur_node);
-    cur_node->set_func_graph(*after_fg);
+    if (!utils::isa<ValueNodePtr>(cur_node)) {
+      cur_node->set_func_graph(*after_fg);
+    }
     if (cur_node == main_fg->output()) {
       (*after_fg)->set_output(cur_node, false);
     }
@@ -233,9 +224,9 @@ int ControlFlowPass::CreateWhileCondCallNode(
   }
 
   // create after partial node
-  ValueNodePtr cond_partial_anf_primitive = GetPartialAnfPrim();
+  ValueNodePtr cond_partial_anf_primitive = lite::GetPartialFusionPrim();
   if (cond_partial_anf_primitive == nullptr) {
-    MS_LOG(ERROR) << "GetPartialAnfPrim failed.";
+    MS_LOG(ERROR) << "GetPartialFusionPrim failed.";
     return RET_FAILED;
   }
 
@@ -290,9 +281,9 @@ int ControlFlowPass::CreateWhileBodyPartialNode(const FuncGraphPtr &cond_fg, con
     return RET_FAILED;
   }
 
-  ValueNodePtr partial_anf_primitive = GetPartialAnfPrim();
+  ValueNodePtr partial_anf_primitive = lite::GetPartialFusionPrim();
   if (partial_anf_primitive == nullptr) {
-    MS_LOG(ERROR) << "GetPartialAnfPrim failed.";
+    MS_LOG(ERROR) << "GetPartialFusionPrim failed.";
     return RET_FAILED;
   }
 
@@ -358,9 +349,9 @@ int ControlFlowPass::CreateWhileAfterPartialNode(
   }
 
   auto after_value_node = NewValueNode(after_fg);
-  ValueNodePtr partial_anf_primitive = GetPartialAnfPrim();
+  ValueNodePtr partial_anf_primitive = lite::GetPartialFusionPrim();
   if (partial_anf_primitive == nullptr) {
-    MS_LOG(ERROR) << "GetPartialAnfPrim failed.";
+    MS_LOG(ERROR) << "GetPartialFusionPrim failed.";
     return RET_FAILED;
   }
 
@@ -463,7 +454,7 @@ int ControlFlowPass::ProcessWhileOp(const FuncGraphPtr &fg, const std::set<AnfNo
   }
 
   // create switch cnode
-  ValueNodePtr switch_anf_primitive = GetSwitchAnfPrim();
+  ValueNodePtr switch_anf_primitive = lite::GetSwitchAnfPrim();
   if (switch_anf_primitive == nullptr) {
     MS_LOG(ERROR) << "GetSwitchAnfPrim failed.";
     return false;
@@ -534,9 +525,9 @@ int ControlFlowPass::CreateIfPartialNode(const FuncGraphPtr &fg, const size_t &i
   }
 
   // create then partial node
-  ValueNodePtr then_partial_anf_primitive = GetPartialAnfPrim();
+  ValueNodePtr then_partial_anf_primitive = lite::GetPartialFusionPrim();
   if (then_partial_anf_primitive == nullptr) {
-    MS_LOG(ERROR) << "GetPartialAnfPrim failed.";
+    MS_LOG(ERROR) << "GetPartialFusionPrim failed.";
     return RET_FAILED;
   }
   std::vector<AnfNodePtr> then_partial_cnode_inputs{then_partial_anf_primitive, then_vnode};
@@ -584,9 +575,9 @@ int ControlFlowPass::CreateIfPartialNode(const FuncGraphPtr &fg, const size_t &i
   (*then_partial_cnode)->set_fullname_with_scope("partial_" + then_fg_name);
 
   // create after partial node
-  ValueNodePtr after_partial_anf_primitive = GetPartialAnfPrim();
+  ValueNodePtr after_partial_anf_primitive = lite::GetPartialFusionPrim();
   if (after_partial_anf_primitive == nullptr) {
-    MS_LOG(ERROR) << "GetPartialAnfPrim failed.";
+    MS_LOG(ERROR) << "GetPartialFusionPrim failed.";
     return RET_FAILED;
   }
   auto after_value_node = NewValueNode(*after_fg);
@@ -701,7 +692,7 @@ int ControlFlowPass::ProcessIfOp(const FuncGraphPtr &fg, const std::set<AnfNodeP
   }
 
   // create switch cnode
-  ValueNodePtr switch_anf_primitive = GetSwitchAnfPrim();
+  ValueNodePtr switch_anf_primitive = lite::GetSwitchAnfPrim();
   if (switch_anf_primitive == nullptr) {
     MS_LOG(ERROR) << "GetSwitchAnfPrim failed.";
     return false;
diff --git a/mindspore/lite/tools/optimizer/graph/control_flow_pass.h b/mindspore/lite/tools/optimizer/graph/control_flow_pass.h
index beb123ed461..bddbe4da887 100644
--- a/mindspore/lite/tools/optimizer/graph/control_flow_pass.h
+++ b/mindspore/lite/tools/optimizer/graph/control_flow_pass.h
@@ -33,9 +33,6 @@ class ControlFlowPass : public Pass {
   bool Run(const FuncGraphPtr &fg) override;
 
  private:
-  // utility function
-  static ValueNodePtr GetSwitchAnfPrim();
-  static ValueNodePtr GetPartialAnfPrim();
   void ReplaceNode(const FuncGraphPtr &fg, const std::unordered_map<AnfNodePtr, AnfNodePtr> &replace_pairs);
   void VisitedNodesUsedByAfterParts(const std::set<AnfNodePtr> &visited_nodes,
                                     const std::vector<AnfNodePtr> &remain_nodes,
diff --git a/mindspore/lite/tools/optimizer/graph/decrease_transpose_algo.cc b/mindspore/lite/tools/optimizer/graph/decrease_transpose_algo.cc
index 74e039d6934..ae158b228a0 100644
--- a/mindspore/lite/tools/optimizer/graph/decrease_transpose_algo.cc
+++ b/mindspore/lite/tools/optimizer/graph/decrease_transpose_algo.cc
@@ -343,6 +343,7 @@ STATUS DecreaseTransposeAlgo::InsertPreTransNode(const FuncGraphPtr &func_graph,
       return lite::RET_ERROR;
     }
   }
+  ModifyCNodeFormat(cnode, trans_insert_info->pre_);
   status = node_infer_shape_.InferShape(cnode);
 
   if (status != lite::RET_OK && status != lite::RET_INFER_INVALID) {
@@ -442,6 +443,7 @@ STATUS DecreaseTransposeAlgo::HandleGraphMultiNode(const FuncGraphPtr &func_grap
       MS_LOG(ERROR) << "change op attr failed.";
       return lite::RET_ERROR;
     }
+    ModifyCNodeFormat(middle_cnode, trans_info.post_);
     status = node_infer_shape_.InferShape(middle_cnode);
     if (status != lite::RET_OK && status != lite::RET_INFER_INVALID) {
       MS_LOG(ERROR) << "infer shape failed.";
@@ -587,9 +589,22 @@ void DecreaseTransposeAlgo::SetSubGraphAbstract(const CNodePtr &cnode, const Fun
   prim->AddAttr(kInferDone, MakeValue<bool>(infer_done));
 }
 
+void DecreaseTransposeAlgo::ModifyCNodeFormat(const CNodePtr &cnode, FormatTransNodeType pre_trans_type) {
+  MS_ASSERT(cnode != nullptr);
+  if (pre_trans_type == kNONE) {
+    return;
+  }
+  auto primitive = GetValueNode<PrimitivePtr>(cnode->input(0));
+  MS_ASSERT(primitive != nullptr);
+  if (pre_trans_type == kNHWC2NCHW) {
+    primitive->AddAttr(ops::kFormat, MakeValue<int64_t>(mindspore::NCHW));
+  } else {
+    primitive->AddAttr(ops::kFormat, MakeValue<int64_t>(mindspore::NHWC));
+  }
+}
+
 bool DecreaseTransposeAlgo::DecreaseTransposeForSingleOp(const FuncGraphPtr &func_graph) {
   MS_ASSERT(func_graph != nullptr);
-  auto graph_name = GetValue<std::string>(func_graph->get_attr("graph_name"));
   auto manager = Manage(func_graph, true);
   if (manager == nullptr) {
     MS_LOG(ERROR) << "manager is nullptr.";
diff --git a/mindspore/lite/tools/optimizer/graph/decrease_transpose_algo.h b/mindspore/lite/tools/optimizer/graph/decrease_transpose_algo.h
index c25b3f530fb..b32079bd181 100644
--- a/mindspore/lite/tools/optimizer/graph/decrease_transpose_algo.h
+++ b/mindspore/lite/tools/optimizer/graph/decrease_transpose_algo.h
@@ -28,12 +28,12 @@
 #include "tools/optimizer/common/format_utils.h"
 #include "tools/optimizer/graph/transpose_strategy.h"
 
-using mindspore::lite::converter::FmkType;
+using mindspore::converter::FmkType;
 namespace mindspore {
 namespace opt {
 class DecreaseTransposeAlgo : public Pass {
  public:
-  explicit DecreaseTransposeAlgo(FmkType fmk_type = FmkType::FmkType_MS, bool train_flag = false)
+  explicit DecreaseTransposeAlgo(FmkType fmk_type = FmkType::kFmkTypeMs, bool train_flag = false)
       : Pass("DecreaseTransposeAlgo"), fmk_type_(fmk_type), train_flag_(train_flag) {}
   ~DecreaseTransposeAlgo() override = default;
   void Init(FmkType fmk_type, bool train_flag) {
@@ -62,7 +62,8 @@ class DecreaseTransposeAlgo : public Pass {
   void ResetSubGraphInput();
   void SetSubGraphOutput(const CNodePtr &cnode, const FuncGraphPtr &sub_graph);
   void SetSubGraphAbstract(const CNodePtr &cnode, const FuncGraphPtr &sub_graph);
-  FmkType fmk_type_{lite::converter::FmkType_MS};
+  void ModifyCNodeFormat(const CNodePtr &cnode, FormatTransNodeType pre_trans_type);
+  FmkType fmk_type_{converter::kFmkTypeMs};
   bool train_flag_{false};
   NodeInferShape node_infer_shape_;
   TransposeStrategy transpose_strategy_;
diff --git a/mindspore/lite/tools/optimizer/graph/infershape_pass.cc b/mindspore/lite/tools/optimizer/graph/infershape_pass.cc
index 60a81f5071f..9a94ed61740 100644
--- a/mindspore/lite/tools/optimizer/graph/infershape_pass.cc
+++ b/mindspore/lite/tools/optimizer/graph/infershape_pass.cc
@@ -15,9 +15,80 @@
  */
 
 #include "tools/optimizer/graph/infershape_pass.h"
+#include "tools/common/node_util.h"
 
 namespace mindspore {
 namespace opt {
+namespace {
+int GetCNodeCertainInputFormat(const CNodePtr cnode, int index, mindspore::Format *format) {
+  MS_ASSERT(cnode != nullptr && format != nullptr);
+  auto origin_inputs = cnode->inputs();
+  lite::RemoveIfDepend(cnode);
+  lite::RemoveIfMakeTuple(cnode);
+  RemoveIfMonad(cnode);
+  if (index <= 0 || static_cast<size_t>(index) >= cnode->size()) {
+    MS_LOG(ERROR) << "input index out of range";
+    cnode->set_inputs(origin_inputs);
+    return lite::RET_ERROR;
+  }
+  if (!utils::isa<CNode>(cnode->input(index))) {
+    cnode->set_inputs(origin_inputs);
+    return lite::RET_NO_CHANGE;
+  }
+  auto real_cnode = cnode->input(index)->cast<CNodePtr>();
+  if (CheckPrimitiveType(real_cnode, prim::kPrimTupleGetItem)) {
+    real_cnode = real_cnode->input(1)->cast<CNodePtr>();
+  }
+  cnode->set_inputs(origin_inputs);
+  MS_ASSERT(real_cnode != nullptr);
+  auto primitive = GetValueNode<PrimitivePtr>(real_cnode->input(0));
+  MS_ASSERT(primitive != nullptr);
+  if (primitive->GetAttr(ops::kFormat) == nullptr) {
+    MS_LOG(ERROR) << "cnode has no format attr. " << real_cnode->fullname_with_scope();
+    return lite::RET_ERROR;
+  }
+  *format = static_cast<mindspore::Format>(GetValue<int64_t>(primitive->GetAttr(ops::kFormat)));
+  if (CheckPrimitiveType(real_cnode, prim::kPrimTranspose)) {
+    std::vector<int> perm;
+    if (GetTransposePerm(real_cnode, &perm) != lite::RET_OK) {
+      MS_LOG(ERROR) << "get transpose perm failed.";
+      return lite::RET_ERROR;
+    }
+    if (perm.size() != 4) {
+      return RET_OK;
+    }
+    if (perm == kNH2NC && *format == mindspore::NHWC) {
+      *format = mindspore::NCHW;
+    } else if (perm == kNC2NH && *format == mindspore::NCHW) {
+      *format = mindspore::NHWC;
+    }
+  }
+  return lite::RET_OK;
+}
+
+int ModifySubGraphInputCNodeFormat(const FuncGraphPtr &sub_graph, const ParameterPtr &certain_input,
+                                   mindspore::Format format) {
+  MS_ASSERT(sub_graph != nullptr && certain_input != nullptr);
+  auto manager = sub_graph->manager();
+  MS_ASSERT(manager != nullptr);
+  auto node_users = manager->node_users()[certain_input];
+  for (auto &node_user : node_users) {
+    if (node_user.second != 1) {
+      continue;
+    }
+    auto post_cnode = node_user.first->cast<CNodePtr>();
+    if (post_cnode == nullptr) {
+      MS_LOG(ERROR) << "post node is not cnode, which is invalid.";
+      return lite::RET_ERROR;
+    }
+    auto primitive = GetValueNode<PrimitivePtr>(post_cnode->input(0));
+    MS_ASSERT(primitive != nullptr);
+    primitive->AddAttr(ops::kFormat, MakeValue<int64_t>(format));
+  }
+  return lite::RET_OK;
+}
+}  // namespace
+
 bool InferShapePass::Run(const FuncGraphPtr &func_graph) {
   if (func_graph == nullptr) {
     MS_LOG(ERROR) << "func_graph is nullptr.";
@@ -53,6 +124,10 @@ bool InferShapePass::JudgeAllOpsCanInfer(const FuncGraphPtr &func_graph) {
     if (IsSpecialType(cnode)) {
       continue;
     }
+    if (lite::IsCall(cnode) || lite::IsPartialFusion(node)) {
+      all_op_can_infer = false;
+      return all_op_can_infer;
+    }
     if (CheckPrimitiveType(node, prim::kPrimIf) || CheckPrimitiveType(node, prim::kPrimWhile)) {
       auto sub_func_graph = GetValueNode<FuncGraphPtr>(cnode->input(1));
       if (sub_func_graph == nullptr) {
@@ -105,7 +180,7 @@ STATUS InferShapePass::InferProcess(const FuncGraphPtr &func_graph) {
         return false;
       }
       SetSubGraphOutput(cnode, sub_func_graph);
-      sub_func_graph = GetValueNode<FuncGraphPtr>(cnode->input(2));
+      sub_func_graph = GetValueNode<FuncGraphPtr>(cnode->input(kInputIndexTwo));
       if (sub_func_graph == nullptr) {
         lite::ReturnCode::GetSingleReturnCode()->UpdateReturnCode(lite::RET_NULL_PTR);
         return false;
@@ -149,6 +224,14 @@ void InferShapePass::SetSubGraphInput(const CNodePtr &cnode, const FuncGraphPtr
       if (out_prim->GetAttr(opt::kInferDone) == nullptr || !GetValue<bool>(out_prim->GetAttr(opt::kInferDone))) {
         param_node->abstract()->set_shape(std::make_shared<abstract::Shape>(shape_vec));
       }
+      mindspore::Format format = mindspore::NHWC;
+      if (GetCNodeCertainInputFormat(cnode, index, &format) != lite::RET_OK) {
+        MS_LOG(DEBUG) << "has no change for current control node." << cnode->fullname_with_scope();
+        continue;
+      }
+      if (ModifySubGraphInputCNodeFormat(sub_graph, param_node, format) != lite::RET_OK) {
+        MS_LOG(DEBUG) << "modify subgraph input cnode format failed." << cnode->func_graph_as_var();
+      }
     } else {
       lite::DataInfo data_info;
       if (utils::isa<ParameterPtr>(cnode->input(index))) {
diff --git a/mindspore/lite/tools/optimizer/graph/infershape_pass.h b/mindspore/lite/tools/optimizer/graph/infershape_pass.h
index 1bede691662..5150d26effb 100644
--- a/mindspore/lite/tools/optimizer/graph/infershape_pass.h
+++ b/mindspore/lite/tools/optimizer/graph/infershape_pass.h
@@ -27,7 +27,7 @@ namespace mindspore {
 namespace opt {
 class InferShapePass : public Pass {
  public:
-  explicit InferShapePass(FmkType fmk_type = lite::converter::FmkType_MS, bool train_flag = false)
+  explicit InferShapePass(FmkType fmk_type = converter::kFmkTypeMs, bool train_flag = false)
       : Pass("infer_shape"), fmk_type_(fmk_type), train_flag_(train_flag) {}
   ~InferShapePass() override = default;
   bool Run(const FuncGraphPtr &func_graph) override;
@@ -40,7 +40,7 @@ class InferShapePass : public Pass {
   void SetSubGraphAbstract(const CNodePtr &cnode, const FuncGraphPtr &sub_graph);
   void ResetSubGraphInput();
 
-  FmkType fmk_type_{lite::converter::FmkType_MS};
+  FmkType fmk_type_{converter::kFmkTypeMs};
   bool train_flag_{false};
   std::shared_ptr<NodeInferShape> node_infer_shape_{nullptr};
   std::map<FuncGraphPtr, std::vector<AnfNodePtr>> sub_inputs_map_;
diff --git a/mindspore/lite/tools/optimizer/graph/node_infershape.cc b/mindspore/lite/tools/optimizer/graph/node_infershape.cc
index c34d8bc8c56..ca2d9936166 100644
--- a/mindspore/lite/tools/optimizer/graph/node_infershape.cc
+++ b/mindspore/lite/tools/optimizer/graph/node_infershape.cc
@@ -45,7 +45,7 @@ void FreeTensors(std::vector<lite::Tensor *> *tensors) {
 
 void RectifyFormat(const CNodePtr &cnode, const std::vector<lite::Tensor *> &inputs, FmkType fmk_type) {
   MS_ASSERT(cnode != nullptr);
-  if (fmk_type != lite::converter::FmkType_ONNX) {
+  if (fmk_type != converter::kFmkTypeOnnx) {
     return;
   }
   for (auto &input : inputs) {
@@ -122,7 +122,7 @@ STATUS NodeInferShape::InferShape(const CNodePtr &cnode) {
     fbb.Clear();
     return lite::RET_ERROR;
   }
-  auto ret = KernelInferShape(inputs, outputs, prim, {});
+  auto ret = KernelInferShape(inputs, outputs, prim, {}, lite::SCHEMA_CUR);
   if (ret == lite::RET_NOT_SUPPORT) {
     auto parameter_gen =
       lite::PopulateRegistry::GetInstance()->GetParameterCreator(prim->value_type(), lite::SCHEMA_CUR);
diff --git a/mindspore/lite/tools/optimizer/graph/node_infershape.h b/mindspore/lite/tools/optimizer/graph/node_infershape.h
index f6bcffb31f5..74e09ebaabe 100644
--- a/mindspore/lite/tools/optimizer/graph/node_infershape.h
+++ b/mindspore/lite/tools/optimizer/graph/node_infershape.h
@@ -27,12 +27,12 @@
 #include "tools/converter/converter_flags.h"
 #include "tools/optimizer/common/format_utils.h"
 
-using mindspore::lite::converter::FmkType;
+using mindspore::converter::FmkType;
 namespace mindspore {
 namespace opt {
 class NodeInferShape {
  public:
-  explicit NodeInferShape(FmkType fmk_type = lite::converter::FmkType_MS, bool train_flag = false)
+  explicit NodeInferShape(FmkType fmk_type = converter::kFmkTypeMs, bool train_flag = false)
       : fmk_type_(fmk_type), train_flag_(train_flag) {}
   virtual ~NodeInferShape() = default;
   void Init(FmkType fmk_type, bool train_flag) {
@@ -54,7 +54,7 @@ class NodeInferShape {
   STATUS SetCNodeAbstract(const std::shared_ptr<CNode> &cnode, const std::vector<lite::Tensor *> &outputs, int status);
   abstract::AbstractBasePtr ConvertLiteTensorToAbstract(lite::Tensor *tensor);
   abstract::AbstractBasePtr ConvertTensorListToAbstract(lite::Tensor *tensor);
-  FmkType fmk_type_{lite::converter::FmkType_MS};
+  FmkType fmk_type_{converter::kFmkTypeMs};
   bool train_flag_{false};
 };
 }  // namespace opt
diff --git a/mindspore/lite/tools/optimizer/graph/reduce_same_act_pass.h b/mindspore/lite/tools/optimizer/graph/reduce_same_act_pass.h
index 5e15dad9725..e4b917f8c2a 100644
--- a/mindspore/lite/tools/optimizer/graph/reduce_same_act_pass.h
+++ b/mindspore/lite/tools/optimizer/graph/reduce_same_act_pass.h
@@ -27,7 +27,7 @@
 #include "tools/optimizer/common/format_utils.h"
 #include "tools/optimizer/graph/transpose_strategy.h"
 
-using mindspore::lite::converter::FmkType;
+using mindspore::converter::FmkType;
 namespace mindspore {
 namespace opt {
 class ReduceSameActPass : public Pass {
diff --git a/mindspore/lite/tools/optimizer/graph/redundant_op_remove_pass.cc b/mindspore/lite/tools/optimizer/graph/redundant_op_remove_pass.cc
index 4a9eed1c325..b76b23b6451 100644
--- a/mindspore/lite/tools/optimizer/graph/redundant_op_remove_pass.cc
+++ b/mindspore/lite/tools/optimizer/graph/redundant_op_remove_pass.cc
@@ -261,13 +261,13 @@ int RemoveRedundantOpPass::RemoveInvalidPadOp(const AnfNodePtr &anf_node, const
     auto padding_node = cnode->input(kInputIndexTwo);
     lite::DataInfo data_info;
     if (utils::isa<Parameter>(padding_node)) {
-      auto status = lite::FetchDataFromParameterNode(cnode, 2, lite::converter::FmkType_MS, false, &data_info);
+      auto status = lite::FetchDataFromParameterNode(cnode, 2, converter::kFmkTypeMs, false, &data_info);
       if (status != lite::RET_OK && status != lite::RET_NO_CHANGE) {
         MS_LOG(ERROR) << "fetch data from parameter node failed.";
         return lite::RET_ERROR;
       }
     } else if (utils::isa<ValueNode>(padding_node)) {
-      auto status = lite::FetchDataFromValueNode(cnode, 2, lite::converter::FmkType_MS, false, &data_info);
+      auto status = lite::FetchDataFromValueNode(cnode, 2, converter::kFmkTypeMs, false, &data_info);
       if (status != lite::RET_OK && status != lite::RET_NO_CHANGE) {
         MS_LOG(ERROR) << "fetch data from value node failed.";
         return lite::RET_ERROR;
diff --git a/mindspore/lite/tools/optimizer/graph/redundant_op_remove_pass.h b/mindspore/lite/tools/optimizer/graph/redundant_op_remove_pass.h
index b0216d79911..034133fecf7 100644
--- a/mindspore/lite/tools/optimizer/graph/redundant_op_remove_pass.h
+++ b/mindspore/lite/tools/optimizer/graph/redundant_op_remove_pass.h
@@ -22,7 +22,7 @@
 #include "tools/converter/converter_flags.h"
 #include "tools/optimizer/common/gllo_utils.h"
 
-using mindspore::lite::converter::FmkType;
+using mindspore::converter::FmkType;
 namespace mindspore::opt {
 class RemoveRedundantOpPass : public Pass {
  public:
diff --git a/mindspore/lite/tools/optimizer/graph/slice_prepose_pass.cc b/mindspore/lite/tools/optimizer/graph/slice_prepose_pass.cc
index ad862310a21..97ab621fc0b 100644
--- a/mindspore/lite/tools/optimizer/graph/slice_prepose_pass.cc
+++ b/mindspore/lite/tools/optimizer/graph/slice_prepose_pass.cc
@@ -1411,7 +1411,7 @@ bool SlicePreposePass::DoPrepose(const FuncGraphPtr &graph, const CNodePtr &slic
 }
 
 bool SlicePreposePass::Run(const FuncGraphPtr &graph) {
-  if (fmk_type != lite::converter::FmkType_TF && fmk_type != lite::converter::FmkType_TFLITE) {
+  if (fmk_type != converter::kFmkTypeTf && fmk_type != converter::kFmkTypeTflite) {
     MS_LOG(INFO) << "The framework type of model should be tf/tflite.";
     return false;
   }
diff --git a/mindspore/lite/tools/optimizer/graph/slice_prepose_pass.h b/mindspore/lite/tools/optimizer/graph/slice_prepose_pass.h
index 3ad4b5fcf9a..67fd914ec7e 100644
--- a/mindspore/lite/tools/optimizer/graph/slice_prepose_pass.h
+++ b/mindspore/lite/tools/optimizer/graph/slice_prepose_pass.h
@@ -25,7 +25,7 @@
 #include "include/errorcode.h"
 #include "mindspore/core/ir/manager.h"
 
-using mindspore::lite::converter::FmkType;
+using mindspore::converter::FmkType;
 namespace mindspore::opt {
 using lite::RET_ERROR;
 using lite::RET_OK;
@@ -95,7 +95,7 @@ class SlicePreposePass : public Pass {
   static bool MergeParallelSlice(const FuncGraphPtr &graph, const NodeUsedListPtr &slices);
 
  private:
-  FmkType fmk_type = lite::converter::FmkType_ONNX;
+  FmkType fmk_type = converter::kFmkTypeOnnx;
 };
 }  // namespace mindspore::opt
 
diff --git a/mindspore/lite/tools/optimizer/graph/split_one_pass.h b/mindspore/lite/tools/optimizer/graph/split_one_pass.h
index 551d288e2b9..848983999bf 100644
--- a/mindspore/lite/tools/optimizer/graph/split_one_pass.h
+++ b/mindspore/lite/tools/optimizer/graph/split_one_pass.h
@@ -27,7 +27,7 @@
 #include "tools/optimizer/common/format_utils.h"
 #include "tools/optimizer/graph/transpose_strategy.h"
 
-using mindspore::lite::converter::FmkType;
+using mindspore::converter::FmkType;
 namespace mindspore {
 namespace opt {
 class SplitOnePass : public Pass {
diff --git a/mindspore/lite/tools/optimizer/graph/transpose_strategy.h b/mindspore/lite/tools/optimizer/graph/transpose_strategy.h
index b9b6ee2b974..dff8e69a475 100644
--- a/mindspore/lite/tools/optimizer/graph/transpose_strategy.h
+++ b/mindspore/lite/tools/optimizer/graph/transpose_strategy.h
@@ -25,7 +25,7 @@
 #include "tools/optimizer/common/format_utils.h"
 #include "tools/optimizer/graph/node_infershape.h"
 
-using mindspore::lite::converter::FmkType;
+using mindspore::converter::FmkType;
 namespace mindspore {
 namespace opt {
 class TransposeStrategy {
@@ -58,7 +58,7 @@ class TransposeStrategy {
   void TransformAttrByAxes(const FuncGraphPtr &func_graph, const CNodePtr &cnode, size_t input_index,
                            const std::vector<int> &axes, FormatTransNodeType trans_type);
   std::vector<int> TransformOpAxesAttr(const std::vector<int> &origin_axes, FormatTransNodeType trans_type);
-  FmkType fmk_type_{lite::converter::FmkType_MS};
+  FmkType fmk_type_{converter::kFmkTypeMs};
   bool train_flag_{false};
   NodeInferShape node_infer_shape_;
 };
diff --git a/mindspore/lite/tools/optimizer/graph/unused_cast_node_remove_pass.cc b/mindspore/lite/tools/optimizer/graph/unused_cast_node_remove_pass.cc
index cb88fb439a0..0e3c3b26836 100644
--- a/mindspore/lite/tools/optimizer/graph/unused_cast_node_remove_pass.cc
+++ b/mindspore/lite/tools/optimizer/graph/unused_cast_node_remove_pass.cc
@@ -22,7 +22,7 @@ constexpr size_t kCastInputNum = 3;
 void RemoveUnusedCastOpPass::SetFmkType(FmkType type) { this->fmk_type = type; }
 
 bool RemoveUnusedCastOpPass::Run(const FuncGraphPtr &func_graph) {
-  if (this->fmk_type != lite::converter::FmkType_MS) {
+  if (this->fmk_type != converter::kFmkTypeMs) {
     MS_LOG(ERROR) << "The framework type of model should be mindspore.";
     return RET_ERROR;
   }
diff --git a/mindspore/lite/tools/optimizer/graph/unused_cast_node_remove_pass.h b/mindspore/lite/tools/optimizer/graph/unused_cast_node_remove_pass.h
index 4536e0f06c6..57675c78f7f 100644
--- a/mindspore/lite/tools/optimizer/graph/unused_cast_node_remove_pass.h
+++ b/mindspore/lite/tools/optimizer/graph/unused_cast_node_remove_pass.h
@@ -20,7 +20,7 @@
 #include "backend/optimizer/common/pass.h"
 #include "tools/converter/converter_flags.h"
 
-using mindspore::lite::converter::FmkType;
+using mindspore::converter::FmkType;
 namespace mindspore::opt {
 class RemoveUnusedCastOpPass : public Pass {
  public:
@@ -30,7 +30,7 @@ class RemoveUnusedCastOpPass : public Pass {
   bool Run(const FuncGraphPtr &graph) override;
 
  private:
-  FmkType fmk_type = lite::converter::FmkType_TF;
+  FmkType fmk_type = converter::kFmkTypeTf;
 };
 }  // namespace mindspore::opt
 #endif  // MINDSPORE_LITE_SRC_PASS_REMOVE_UNUSED_CAST_PASS_H_
diff --git a/mindspore/lite/tools/optimizer/graph/unused_transpose_node_remove_pass.cc b/mindspore/lite/tools/optimizer/graph/unused_transpose_node_remove_pass.cc
index d97a0d79577..e095dff88b8 100644
--- a/mindspore/lite/tools/optimizer/graph/unused_transpose_node_remove_pass.cc
+++ b/mindspore/lite/tools/optimizer/graph/unused_transpose_node_remove_pass.cc
@@ -57,7 +57,7 @@ std::vector<int> GetTransposePerm(const CNodePtr &node) {
 }
 
 bool RemoveUnusedTransposeOpPass::Run(const FuncGraphPtr &func_graph) {
-  if (this->fmk_type != lite::converter::FmkType_ONNX) {
+  if (this->fmk_type != converter::kFmkTypeOnnx) {
     MS_LOG(ERROR) << "The framework type of model should be onnx.";
     return RET_ERROR;
   }
diff --git a/mindspore/lite/tools/optimizer/graph/unused_transpose_node_remove_pass.h b/mindspore/lite/tools/optimizer/graph/unused_transpose_node_remove_pass.h
index 9725ed48137..954d64a4c8a 100644
--- a/mindspore/lite/tools/optimizer/graph/unused_transpose_node_remove_pass.h
+++ b/mindspore/lite/tools/optimizer/graph/unused_transpose_node_remove_pass.h
@@ -20,7 +20,7 @@
 #include "backend/optimizer/common/pass.h"
 #include "tools/converter/converter_flags.h"
 
-using mindspore::lite::converter::FmkType;
+using mindspore::converter::FmkType;
 namespace mindspore::opt {
 class RemoveUnusedTransposeOpPass : public Pass {
  public:
@@ -30,7 +30,7 @@ class RemoveUnusedTransposeOpPass : public Pass {
   bool Run(const FuncGraphPtr &graph) override;
 
  private:
-  FmkType fmk_type = lite::converter::FmkType_TF;
+  FmkType fmk_type = converter::kFmkTypeTf;
 };
 }  // namespace mindspore::opt
 #endif  // MINDSPORE_LITE_SRC_PASS_REMOVE_UNUSED_TRANSPOSE_PASS_H_
diff --git a/mindspore/lite/tools/optimizer/graph/update_conv2d_param_pass.cc b/mindspore/lite/tools/optimizer/graph/update_conv2d_param_pass.cc
index 18bec57c950..1499c5261c8 100644
--- a/mindspore/lite/tools/optimizer/graph/update_conv2d_param_pass.cc
+++ b/mindspore/lite/tools/optimizer/graph/update_conv2d_param_pass.cc
@@ -30,7 +30,7 @@ constexpr int kAnfPopulaterInputNumTwo = 2;
 
 lite::STATUS UpdateConv2DParamPass::UpdateCommonConv2D(const CNodePtr &cnode) {
   MS_ASSERT(cnode != nullptr);
-  if (fmk_type_ != lite::converter::FmkType_TF) {
+  if (fmk_type_ != converter::kFmkTypeTf) {
     return lite::RET_OK;
   }
   auto conv = GetValueNode<std::shared_ptr<ops::Conv2DFusion>>(cnode->input(0));
diff --git a/mindspore/lite/tools/optimizer/graph/update_conv2d_param_pass.h b/mindspore/lite/tools/optimizer/graph/update_conv2d_param_pass.h
index 79944381b0b..c15a9bea1f6 100644
--- a/mindspore/lite/tools/optimizer/graph/update_conv2d_param_pass.h
+++ b/mindspore/lite/tools/optimizer/graph/update_conv2d_param_pass.h
@@ -21,7 +21,7 @@
 #include "tools/optimizer/common/gllo_utils.h"
 #include "tools/converter/converter_flags.h"
 
-using mindspore::lite::converter::FmkType;
+using mindspore::converter::FmkType;
 namespace mindspore::opt {
 class UpdateConv2DParamPass : public Pass {
  public:
@@ -33,7 +33,7 @@ class UpdateConv2DParamPass : public Pass {
   void SetFmkType(FmkType fmk_type) { this->fmk_type_ = fmk_type; }
 
  private:
-  FmkType fmk_type_ = lite::converter::FmkType_ONNX;
+  FmkType fmk_type_ = converter::kFmkTypeOnnx;
 };
 }  // namespace mindspore::opt
 #endif  // MINDSPORE_LITE_SRC_PASS_UPDATE_CONV2D_PARAM_PASS_H_
diff --git a/mindspore/lite/tools/optimizer/parallel/multi_conv_info.cc b/mindspore/lite/tools/optimizer/parallel/multi_conv_info.cc
index 81800dd94cc..8e2e9ba264a 100644
--- a/mindspore/lite/tools/optimizer/parallel/multi_conv_info.cc
+++ b/mindspore/lite/tools/optimizer/parallel/multi_conv_info.cc
@@ -21,7 +21,7 @@
 #include "tools/optimizer/parallel/split_strategy.h"
 #include "nnacl/op_base.h"
 
-using mindspore::lite::converter::FmkType;
+using mindspore::converter::FmkType;
 using mindspore::schema::PrimitiveType_Conv2dTransposeFusion;
 namespace mindspore {
 namespace opt {
diff --git a/mindspore/lite/tools/providers/NNIE/Hi3516D/compile_nnie.sh b/mindspore/lite/tools/providers/NNIE/Hi3516D/compile_nnie.sh
index a04025fc294..db96d48d6cf 100644
--- a/mindspore/lite/tools/providers/NNIE/Hi3516D/compile_nnie.sh
+++ b/mindspore/lite/tools/providers/NNIE/Hi3516D/compile_nnie.sh
@@ -14,6 +14,8 @@ function Run_Build_x86() {
   rm -rf ${nnie_code_path}/mindspore/mindspore/lite/tools/converter/nnie/third_party/ms_lite/
   mkdir -p ${nnie_code_path}/mindspore/mindspore/lite/tools/converter/nnie/third_party/ms_lite/ || exit 1
   cp -r ./tools/ ${nnie_code_path}/mindspore/mindspore/lite/tools/converter/nnie/third_party/ms_lite/ || exit 1
+  mkdir -pv ${open_source_ms_path}/mindspore/lite/test/do_test
+  cp ./tools/converter/lib/*.so* ${open_source_ms_path}/mindspore/lite/test/do_test
 
   # compile nnie converter so
   export MSLITE_ENABLE_NNIE=on
diff --git a/mindspore/log.py b/mindspore/log.py
index 7f81f4b66d0..5d5b5d16ed5 100644
--- a/mindspore/log.py
+++ b/mindspore/log.py
@@ -422,7 +422,6 @@ def _get_stack_info(frame):
     Returns:
         str, the string of the stack information.
     """
-    sinfo = None
     stack_prefix = 'Stack (most recent call last):\n'
     sinfo = stack_prefix + "".join(traceback.format_stack(frame))
     return sinfo
diff --git a/mindspore/nn/cell.py b/mindspore/nn/cell.py
index 8ab61f3a042..85ba988c58c 100755
--- a/mindspore/nn/cell.py
+++ b/mindspore/nn/cell.py
@@ -21,6 +21,7 @@ from collections import OrderedDict
 
 import numpy
 
+from mindspore._checkparam import args_type_check
 from mindspore import log as logger
 from mindspore.common.parameter import PARAMETER_NAME_DEFAULT
 from mindspore.common._decorator import deprecated
@@ -85,6 +86,7 @@ class Cell(Cell_):
         self._cells = OrderedDict()
         self._params_list = OrderedDict()
         self._tensor_list = OrderedDict()
+        self._primitives = OrderedDict()
         self.training = False
         self.requires_grad = False
         self.pynative = False
@@ -337,7 +339,7 @@ class Cell(Cell_):
 
     def run_construct(self, cast_inputs, kwargs):
         if self.enable_hook:
-            output = self._hook_construct(*cast_inputs, **kwargs)
+            output = self._hook_construct(*cast_inputs)
         else:
             output = self.construct(*cast_inputs, **kwargs)
         return output
@@ -510,6 +512,7 @@ class Cell(Cell_):
         else:
             if isinstance(value, Primitive):
                 value.set_prim_instance_name(name)
+                self._primitives[name] = value
             object.__setattr__(self, name, value)
         if name not in Cell.IGNORE_LIST:
             self._attr_synced = False
@@ -1206,7 +1209,7 @@ class Cell(Cell_):
         self.add_flags(auto_parallel=True)
         self._get_construct_inputs_number_and_name()
 
-    def _hook_construct(self, *inputs, **kwargs):
+    def _hook_construct(self, *inputs):
         """Hook construct method to replace original construct method when hook function enabled."""
         inputs = self._backward_hook(*inputs)
         inputs = self.construct(inputs)
@@ -1287,7 +1290,26 @@ class Cell(Cell_):
         elif not self._scope is None and self._scope.startswith(prefix):
             self._scope = self._scope[len(prefix):]
 
-    def recompute(self, mode=True, output_recompute=False):
+    def _mp_comm_recompute(self, mp_comm_recompute=True):
+        for _, value in self._primitives.items():
+            if value:
+                value.add_prim_attr("recompute_comm_op", mp_comm_recompute)
+        for cell in self.cells():
+            cell._mp_comm_recompute(mp_comm_recompute)
+
+    def _recompute(self, mode=True, output_recompute=False):
+        if context.get_context("mode") == context.PYNATIVE_MODE:
+            raise TypeError("Recompute is not supported in pynative mode currently.")
+        Validator.check_bool(mode)
+        Validator.check_bool(output_recompute)
+        self._set_recompute_scope(mode)
+        if mode and not output_recompute:
+            self.add_flags(output_no_recompute=True)
+        for cell in self.cells():
+            cell._recompute(mode, True)
+
+    @args_type_check(mode=bool, output_recompute=bool, mp_comm_recompute=bool)
+    def recompute(self, **kwargs):
         """
         Set the cell recomputed. All the primitive in the cell will be set recomputed. If a primitive
         set recomputed feeds into some backward nodes for computing gradient, rather than storing the
@@ -1304,16 +1326,25 @@ class Cell(Cell_):
             mode (bool): Specifies whether the cell is recomputed. Default: True.
             output_recompute (bool): Specifies whether the output of this cell is recomputed when
                 the mode is true. Note that when the mode is false, this arg is not working. Default: False.
+            mp_comm_recompute (bool): Specifies whether the model parallel communication operators in the
+                cell is recomputed in auto parallel or semi auto parallel mode. Default: True.
         """
-        if context.get_context("mode") == context.PYNATIVE_MODE:
-            raise TypeError("Recompute is not supported in pynative mode currently.")
-        Validator.check_bool(mode)
-        Validator.check_bool(output_recompute)
-        self._set_recompute_scope(mode)
-        if mode and not output_recompute:
-            self.add_flags(output_no_recompute=True)
-        for cell in self.cells():
-            cell.recompute(mode, True)
+        if not kwargs:
+            self._recompute()
+        if 'mode' in kwargs.keys() or 'output_recompute' in kwargs.keys():
+            mode = True
+            output_recompute = False
+            if 'mode' in kwargs.keys():
+                mode = kwargs['mode']
+            if 'output_recompute' in kwargs.keys():
+                output_recompute = kwargs['output_recompute']
+            self._recompute(mode, output_recompute)
+        if 'mp_comm_recompute' in kwargs.keys():
+            self._mp_comm_recompute(kwargs['mp_comm_recompute'])
+        for key, _ in kwargs.items():
+            if key not in ('mode', 'output_recompute', 'mp_comm_recompute'):
+                raise ValueError("Recompute keyword %s is not recognized!" % key)
+
 
     def infer_param_pipeline_stage(self):
         """
@@ -1427,4 +1458,6 @@ class GraphCell(Cell):
         return self.graph(*inputs)
 
     def __call__(self, *inputs):
+        self.phase = "graph_load_from_mindir"
+        self._add_attr("graph_load_from_mindir", self.graph)
         return self.compile_and_run(*inputs)
diff --git a/mindspore/nn/layer/activation.py b/mindspore/nn/layer/activation.py
index 634d3d0ee07..362c22aeadc 100644
--- a/mindspore/nn/layer/activation.py
+++ b/mindspore/nn/layer/activation.py
@@ -332,14 +332,15 @@ class LeakyReLU(Cell):
         validator.check_value_type('alpha', alpha, [float, int], self.cls_name)
         self.greater_equal = P.GreaterEqual()
         self.mul = P.Mul()
+        self.maximum = P.Maximum()
         self.alpha = alpha
 
     def construct(self, x):
         alpha_array = P.Cast()(F.scalar_to_array(self.alpha), P.DType()(x))
         if self.alpha <= 1:
-            out = P.Maximum()(alpha_array * x, x)
+            out = self.maximum(alpha_array * x, x)
         else:
-            out = P.Minimum()(alpha_array * x, x)
+            out = self.maximum(alpha_array * x, x)
         return out
 
 
diff --git a/mindspore/nn/layer/basic.py b/mindspore/nn/layer/basic.py
index 11ee7cfae41..1c85b3843a3 100644
--- a/mindspore/nn/layer/basic.py
+++ b/mindspore/nn/layer/basic.py
@@ -33,7 +33,7 @@ from ..cell import Cell
 from .activation import get_activation
 
 __all__ = ['Dropout', 'Flatten', 'Dense', 'ClipByNorm', 'Norm', 'OneHot', 'Pad', 'Unfold',
-           'Tril', 'Triu', 'ResizeBilinear', 'MatrixDiag', 'MatrixDiagPart', 'MatrixSetDiag', 'L1Regularizer']
+           'Tril', 'Triu', 'ResizeBilinear', 'MatrixDiag', 'MatrixDiagPart', 'MatrixSetDiag', 'L1Regularizer', 'Roll']
 
 
 class L1Regularizer(Cell):
@@ -1355,3 +1355,88 @@ class MatrixSetDiag(Cell):
         assist = _get_matrix_diag_part_assist(x_shape, x_dtype)
         out_matrix_set_diag = self.matrix_set_diag(input_x, diagonal, assist)
         return out_matrix_set_diag
+
+
+@constexpr
+def _check_input_dim(axis, dim, cls_name):
+    Validator.check_int_range(axis, -dim, dim, Rel.INC_LEFT, 'axis', cls_name)
+
+
+class Roll(Cell):
+    """
+    Rolls the elements of a tensor along an axis.
+
+    The elements are shifted positively (towards larger indices) by the offset of `shift` along the dimension of `axis`.
+    Negative `shift` values will shift elements in the opposite direction. Elements that roll passed the last position
+    will wrap around to the first and vice versa. Multiple shifts along multiple axes may be specified.
+
+    Args:
+        shift (Union[list(int), tuple(int), int]): Specifies the number of places by which elements are shifted
+            positively (towards larger indices) along the specified dimension. Negative shifts will roll the elements
+            in the opposite direction.
+        axis (Union[list(int), tuple(int), int]): Specifies the dimension indexes of shape to be rolled.
+
+    Inputs:
+        - **input_x** (Tensor) - Input tensor.
+
+    Outputs:
+        Tensor, has the same shape and type as `input_x`.
+
+    Raises:
+        TypeError: If `shift` is not an int, a tuple or a list.
+        TypeError: If `axis` is not an int, a tuple or a list.
+        TypeError: If element of `shift` is not an int.
+        TypeError: If element of `axis` is not an int.
+        ValueError: If axis is out of the range [-len(input_x.shape), len(input_x.shape)).
+        ValueError: If length of shape of `shift` is not equal to length of shape of `axis`.
+
+    Supported Platforms:
+        ``Ascend``
+
+    Examples:
+        >>> input_x = Tensor(np.array([0, 1, 2, 3, 4]).astype(np.float32))
+        >>> op = nn.Roll(shift=2, axis=0)
+        >>> output = op(input_x)
+        >>> print(output)
+        [3. 4. 0. 1. 2.]
+        >>> input_x = Tensor(np.array([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]).astype(np.float32))
+        >>> op = nn.Roll(shift=[1, -2], axis=[0, 1])
+        >>> output = op(input_x)
+        >>> print(output)
+        [[7. 8. 9. 5. 6.]
+         [2. 3. 4. 0. 1.]]
+    """
+
+    def __init__(self, shift, axis):
+        """Initialize Roll"""
+        super(Roll, self).__init__()
+        Validator.check_value_type("shift", shift, [int, tuple, list], self.cls_name)
+        Validator.check_value_type("axis", axis, [int, tuple, list], self.cls_name)
+        self.shape_op = P.Shape()
+        self.shift = shift
+        self.axis = axis
+        self.op_list = []
+
+        if not isinstance(self.axis, (list, tuple)):
+            self.op_list.append((inner.Roll(shift=self.shift, axis=0), self.axis))
+        else:
+            if len(self.shift) != len(self.axis):
+                raise ValueError('The shape of shift and the shape of axis must be the same.')
+            for idx, _ in enumerate(self.axis):
+                self.op_list.append((inner.Roll(shift=self.shift[idx], axis=0), self.axis[idx]))
+
+    def construct(self, input_x):
+        dim = len(self.shape_op(input_x))
+        for single_op_roll, single_axis in self.op_list:
+            _check_input_dim(single_axis, dim, self.cls_name)
+            if single_axis < 0:
+                single_axis += dim
+            transpose_perm = []
+            for i in range(dim):
+                transpose_perm.append(i)
+            transpose_perm[0], transpose_perm[single_axis] = single_axis, 0
+
+            input_x = input_x.transpose(transpose_perm)
+            input_x = single_op_roll(input_x)
+            input_x = input_x.transpose(transpose_perm)
+        return input_x
diff --git a/mindspore/nn/layer/math.py b/mindspore/nn/layer/math.py
index d43908d161d..700066fd330 100644
--- a/mindspore/nn/layer/math.py
+++ b/mindspore/nn/layer/math.py
@@ -276,8 +276,8 @@ class LGamma(Cell):
         reflection_denom = self.log(self.sin(self.pi * reduced_frac_input))
 
         reflection = self.select(self.isfinite(reflection_denom),
-                                 -reflection_denom - log_y + self.log_pi,
-                                 -reflection_denom)
+                                 -reflection_denom - log_y + self.log_pi, # pylint: disable=invalid-unary-operand-type
+                                 -reflection_denom)  # pylint: disable=invalid-unary-operand-type
 
         result = self.select(need_to_reflect, reflection, log_y)
 
@@ -642,15 +642,17 @@ class IGamma(Cell):
 
 class LBeta(Cell):
     r"""
+    This method avoids the numeric cancellation by explicitly
+    decomposing lgamma into the Stirling approximation and an explicit log_gamma_correction, and cancelling
+    the large terms from the Striling analytically.
+
     This is semantically equal to
 
     .. math::
         P(x, y) = lgamma(x) + lgamma(y) - lgamma(x + y).
 
     The method is more accurate for arguments above 8. The reason for accuracy loss in the naive computation
-    is catastrophic cancellation between the lgammas. This method avoids the numeric cancellation by explicitly
-    decomposing lgamma into the Stirling approximation and an explicit log_gamma_correction, and cancelling
-    the large terms from the Striling analytically.
+    is catastrophic cancellation between the lgammas.
 
     Inputs:
         - **x** (Tensor) - The input tensor. With float16 or float32 data type. `x` should have
diff --git a/mindspore/nn/loss/loss.py b/mindspore/nn/loss/loss.py
index 29acf71030f..bbe4919e9f1 100644
--- a/mindspore/nn/loss/loss.py
+++ b/mindspore/nn/loss/loss.py
@@ -76,8 +76,8 @@ class LossBase(Cell):
 
         Args:
             weights (Union[float, Tensor]): Optional `Tensor` whose rank is either 0, or the same rank as inputs,
-            and must be broadcastable to inputs (i.e., all dimensions must be either `1`,
-            or the same as the corresponding inputs dimension).
+                and must be broadcastable to inputs (i.e., all dimensions must be either `1`,
+                or the same as the corresponding inputs dimension).
         """
         input_dtype = x.dtype
         x = self.cast(x, mstype.float32)
diff --git a/mindspore/nn/metrics/accuracy.py b/mindspore/nn/metrics/accuracy.py
index e65f4ae6a0c..3280c5379a6 100644
--- a/mindspore/nn/metrics/accuracy.py
+++ b/mindspore/nn/metrics/accuracy.py
@@ -22,7 +22,7 @@ class Accuracy(EvaluationBase):
     Calculates the accuracy for classification and multilabel data.
 
     The accuracy class creates two local variables, the correct number and the total number that are used to compute the
-    frequency with which predictions matches labels. This frequency is ultimately returned as the accuracy: an
+    frequency with which `y_pred` matches `y`. This frequency is ultimately returned as the accuracy: an
     idempotent operation that simply divides the correct number by the total number.
 
     .. math::
@@ -30,7 +30,7 @@ class Accuracy(EvaluationBase):
         {\text{true_positive} + \text{true_negative} + \text{false_positive} + \text{false_negative}}
 
     Args:
-        eval_type (str): Metric to calculate the accuracy over a dataset, for
+        eval_type (str): The metric to calculate the accuracy over a dataset, for
             classification (single-label), and multilabel (multilabel classification).
             Default: 'classification'.
 
@@ -77,7 +77,7 @@ class Accuracy(EvaluationBase):
             ValueError: If the number of the inputs is not 2.
         """
         if len(inputs) != 2:
-            raise ValueError('Accuracy need 2 inputs (y_pred, y), but got {}'.format(len(inputs)))
+            raise ValueError('The accuracy needs 2 inputs (y_pred, y), but got {}'.format(len(inputs)))
         y_pred = self._convert_data(inputs[0])
         y = self._convert_data(inputs[1])
         if self._type == 'classification' and y_pred.ndim == y.ndim and self._check_onehot_data(y):
@@ -88,8 +88,9 @@ class Accuracy(EvaluationBase):
         if self._class_num == 0:
             self._class_num = y_pred.shape[1]
         elif y_pred.shape[1] != self._class_num:
-            raise ValueError('Class number not match, last input data contain {} classes, but current data contain {} '
-                             'classes'.format(self._class_num, y_pred.shape[1]))
+            raise ValueError('The y_pred shape does not match the class number, the last input data contains '
+                             '{} classes, but the current data contains {} classes'
+                             .format(self._class_num, y_pred.shape[1]))
 
         if self._type == 'classification':
             indices = y_pred.argmax(axis=1)
diff --git a/mindspore/nn/metrics/auc.py b/mindspore/nn/metrics/auc.py
index a20eca4205c..a7d43b8d741 100644
--- a/mindspore/nn/metrics/auc.py
+++ b/mindspore/nn/metrics/auc.py
@@ -18,7 +18,7 @@ import numpy as np
 
 def auc(x, y, reorder=False):
     """
-    Computes the Area Under the Curve (AUC) using the trapezoidal rule. This is a general function, given points on a
+    Computes the AUC(Area Under the Curve) using the trapezoidal rule. This is a general function, given points on a
     curve. For computing the area under the ROC-curve.
 
     Args:
@@ -78,12 +78,10 @@ def auc(x, y, reorder=False):
 
 def _column_or_1d(y):
     """
-     Ravel column or 1d numpy array, otherwise raise an error.
+     Ravel column or 1D numpy array, otherwise raise a ValueError.
     """
     shape = np.shape(y)
-    if len(shape) == 1:
-        return np.ravel(y)
-    if len(shape) == 2 and shape[1] == 1:
+    if len(shape) == 1 or(len(shape) == 2 and shape[1] == 1):
         return np.ravel(y)
 
     raise ValueError("Bad input shape {0}.".format(shape))
diff --git a/mindspore/nn/metrics/bleu_score.py b/mindspore/nn/metrics/bleu_score.py
index bc2ad981b7a..507052dc2e1 100644
--- a/mindspore/nn/metrics/bleu_score.py
+++ b/mindspore/nn/metrics/bleu_score.py
@@ -24,11 +24,11 @@ class BleuScore(Metric):
     Calculates BLEU score of machine translated text with one or more references.
 
     Args:
-        n_gram (int): The n_gram value ranged from 1 to 4. Default: 4.
+        n_gram (int): The n_gram value ranges from 1 to 4. Default: 4.
         smooth (bool): Whether or not to apply smoothing. Default: False.
 
     Raises:
-        ValueError: If the value range of n_gram is not 1 to 4.
+        ValueError: If the value range of n_gram is not from 1 to 4.
 
     Supported Platforms:
         ``Ascend`` ``GPU`` ``CPU``
@@ -48,7 +48,7 @@ class BleuScore(Metric):
         super().__init__()
         self.n_gram = validator.check_value_type("n_gram", n_gram, [int])
         if self.n_gram > 4 or self.n_gram < 1:
-            raise ValueError('The n_gram value ranged from 1 to 4, but got {}'.format(n_gram))
+            raise ValueError('The n_gram value ranges from 1 to 4, but got {}'.format(n_gram))
 
         self.smooth = validator.check_value_type("smooth", smooth, [bool])
         self.clear()
@@ -70,7 +70,7 @@ class BleuScore(Metric):
 
         Args:
             ngram_input_list (list): A list of translated text or reference texts.
-            n_gram (int): gram value ranged 1 to 4.
+            n_gram (int): gram value ranges from 1 to 4.
 
         Return:
             ngram_counter: a collections.Counter object of ngram.
@@ -99,12 +99,12 @@ class BleuScore(Metric):
             ValueError: If the number of input is not 2.
         """
         if len(inputs) != 2:
-            raise ValueError('The bleu_score need 2 inputs (candidate_corpus, reference_corpus), '
+            raise ValueError('The bleu_score needs 2 inputs (candidate_corpus, reference_corpus), '
                              'but got {}'.format(len(inputs)))
         candidate_corpus = inputs[0]
         reference_corpus = inputs[1]
         if len(candidate_corpus) != len(reference_corpus):
-            raise ValueError('translate_corpus and reference_corpus should be equal in length, '
+            raise ValueError('The translate_corpus and reference_corpus should be equal in length, '
                              'but got {} {}'.format(len(candidate_corpus), len(reference_corpus)))
 
         for (candidate, references) in zip(candidate_corpus, reference_corpus):
diff --git a/mindspore/nn/metrics/confusion_matrix.py b/mindspore/nn/metrics/confusion_matrix.py
index a5c0af1973b..48721c69afa 100644
--- a/mindspore/nn/metrics/confusion_matrix.py
+++ b/mindspore/nn/metrics/confusion_matrix.py
@@ -95,13 +95,13 @@ class ConfusionMatrix(Metric):
             ValueError: If the number of the inputs is not 2.
         """
         if len(inputs) != 2:
-            raise ValueError('ConfusionMatrix need 2 inputs (y_pred, y), but got {}.'.format(len(inputs)))
+            raise ValueError('The ConfusionMatrix needs 2 inputs (y_pred, y), but got {}.'.format(len(inputs)))
 
         y_pred = self._convert_data(inputs[0])
         y = self._convert_data(inputs[1])
 
         if not (y_pred.ndim == y.ndim or y_pred.ndim == y.ndim + 1):
-            raise ValueError("y_pred and y should have the same number of dimensions, or the dimension of y_pred "
+            raise ValueError("The y_pred and y should have the same number of dimensions, or the dimension of y_pred "
                              "equals the dimension of y add 1.")
 
         if y_pred.ndim == y.ndim + 1:
@@ -165,9 +165,9 @@ class ConfusionMatrixMetric(Metric):
                            "fall out", "false discovery rate", "false omission rate", "prevalence threshold",
                            "threat score", "accuracy", "balanced accuracy", "f1 score",
                            "matthews correlation coefficient", "fowlkes mallows index", "informedness", "markedness"].
-        calculation_method (bool): If true, the measurement for each sample is calculated first. If it is false, the
-                                   confusion matrix of all samples is accumulated first. As for classification task,
-                                   'calculation_method' should be False. Default: False.
+        calculation_method (bool): If true, the measurement for each sample will be calculated first.
+                                   If not, the confusion matrix of all samples will be accumulated first.
+                                   As for classification task, 'calculation_method' should be False. Default: False.
         decrease (str): Define the mode to reduce the calculation result of one batch of data. Decrease is used only if
                         calculation_method is True. Default: "mean". Choose from:
                         ["none", "mean", "sum", "mean_batch", "sum_batch", "mean_channel", "sum_channel"].
@@ -233,7 +233,7 @@ class ConfusionMatrixMetric(Metric):
             ValueError: If the number of the inputs is not 2.
         """
         if len(inputs) != 2:
-            raise ValueError('ConfusionMatrixMetric need 2 inputs (y_pred, y), but got {}.'.format(len(inputs)))
+            raise ValueError('The ConfusionMatrixMetric needs 2 inputs (y_pred, y), but got {}.'.format(len(inputs)))
 
         y_pred = self._convert_data(inputs[0])
         y = self._convert_data(inputs[1])
@@ -261,7 +261,8 @@ class ConfusionMatrixMetric(Metric):
 
         if self.calculation_method is True:
             if self._class_num == 0:
-                raise RuntimeError("ConfusionMatrixMetric must have at least one example before it can be computed.")
+                raise RuntimeError("The ConfusionMatrixMetric must have at least one example "
+                                   "before it can be computed.")
 
             return self._total_num / self._class_num
 
@@ -278,8 +279,8 @@ class _ConfusionMatrix:
                              output. Default: True.
         metric_name (str): The names of indicators are in the following range. Of course, you can also set the industry
                            common aliases for these indicators.
-        calculation_method (bool): If true, the measurement for each sample is calculated first. If it is false, the
-                                   confusion  matrix for each image (the output of function '_get_confusion_matrix')
+        calculation_method (bool): If true, the measurement for each sample will be calculated first. If not, the
+                                   confusion matrix for each image (the output of function '_get_confusion_matrix')
                                    will be returned. In this way, users should achieve the confusion matrixes for all
                                    images during an epochand then use '_compute_confusion_matrix_metric' to calculate
                                    the metric. Default: False.
@@ -310,11 +311,11 @@ class _ConfusionMatrix:
             ValueError: when `y_pred` has less than two dimensions.
         """
         if not np.all(y.astype(np.uint8) == y):
-            raise ValueError("y should be a binarized ndarray.")
+            raise ValueError("The y should be a binarized ndarray.")
 
         dims = y_pred.ndim
         if dims < 2:
-            raise ValueError("y_pred should have at least two dimensions.")
+            raise ValueError("The y_pred should have at least two dimensions.")
 
         if dims == 2 or (dims == 3 and y_pred.shape[-1] == 1):
             if self.calculation_method:
@@ -616,8 +617,7 @@ def _compute_confusion_matrix_metric(metric_name, confusion_matrix):
                         "mcc": _calculate_mcc(tp, fp, tn, fn),
                         "fm": _calculate_fm(tp, fp, p),
                         "bm": _calculate_bm(tp, tn, p, n),
-                        "mk": _calculate_mk(tp, fp, tn, fn)
-                        }
+                        "mk": _calculate_mk(tp, fp, tn, fn)}
     numerator, denominator = metric_name_dict.get(metric)
 
     if isinstance(denominator, np.ndarray):
@@ -685,8 +685,7 @@ def _check_metric_name(metric_name):
                         "bm": "bm",
                         "markedness": "mk",
                         "deltap": "mk",
-                        "mk": "mk"
-                        }
+                        "mk": "mk"}
 
     metric_name_info = metric_name_dict.get(metric_name)
 
diff --git a/mindspore/nn/metrics/cosine_similarity.py b/mindspore/nn/metrics/cosine_similarity.py
index cb7d238be00..be4a18f66a6 100644
--- a/mindspore/nn/metrics/cosine_similarity.py
+++ b/mindspore/nn/metrics/cosine_similarity.py
@@ -25,11 +25,11 @@ class CosineSimilarity(Metric):
     Args:
         similarity (str): 'dot' or 'cosine'. Default: 'cosine'
         reduction (str): 'none', 'sum', 'mean' (all along dim -1). Default: 'none'
-        zero_diagonal (bool): if True, the diagonals are set to zero. Default: True
+        zero_diagonal (bool): If true, the diagonals are set to zero. Default: True
 
     Return:
         A square matrix (input1, input1) with the similarity scores between all elements.
-        If sum or mean are used, then returns (b, 1) with the reduced value for each row.
+        If sum or mean is used, then returns (b, 1) with the reduced value for each row.
 
     Supported Platforms:
         ``Ascend`` ``GPU`` ``CPU``
diff --git a/mindspore/nn/metrics/dice.py b/mindspore/nn/metrics/dice.py
index 4d1a693b945..f127d03741a 100644
--- a/mindspore/nn/metrics/dice.py
+++ b/mindspore/nn/metrics/dice.py
@@ -21,7 +21,7 @@ from .metric import Metric, rearrange_inputs
 class Dice(Metric):
     r"""
     The Dice coefficient is a set similarity metric. It is used to calculate the similarity between two samples. The
-    value of the Dice coefficient is 1 when the segmentation result is the best and 0 when the segmentation result
+    value of the Dice coefficient is 1 when the segmentation result is the best and is 0 when the segmentation result
     is the worst. The Dice coefficient indicates the ratio of the area between two objects to the total area.
     The function is shown as follows:
 
@@ -73,17 +73,17 @@ class Dice(Metric):
 
         Raises:
             ValueError: If the number of the inputs is not 2.
-            RuntimeError: If y_pred and y should have different the dimension.
+            RuntimeError: If y_pred and y do not have the same shape.
         """
         if len(inputs) != 2:
-            raise ValueError('Dice need 2 inputs (y_pred, y), but got {}'.format(len(inputs)))
+            raise ValueError('The Dice needs 2 inputs (y_pred, y), but got {}'.format(len(inputs)))
 
         y_pred = self._convert_data(inputs[0])
         y = self._convert_data(inputs[1])
         self._samples_num += y.shape[0]
 
         if y_pred.shape != y.shape:
-            raise RuntimeError('y_pred and y should have same the dimension, but the shape of y_pred is{}, '
+            raise RuntimeError('The y_pred and y should have the same shape, but the shape of y_pred is {}, '
                                'the shape of y is {}.'.format(y_pred.shape, y.shape))
 
         intersection = np.dot(y_pred.flatten(), y.flatten())
@@ -100,9 +100,9 @@ class Dice(Metric):
             Float, the computed result.
 
         Raises:
-            RuntimeError: If the total samples num is 0.
+            RuntimeError: If the total number of samples is 0.
         """
         if self._samples_num == 0:
-            raise RuntimeError('Total samples num must not be 0.')
+            raise RuntimeError('The total number of samples can not be 0.')
 
         return self._dice_coeff_sum / float(self._samples_num)
diff --git a/mindspore/nn/metrics/error.py b/mindspore/nn/metrics/error.py
index 6ede282f3bd..4f9e78f5cc9 100644
--- a/mindspore/nn/metrics/error.py
+++ b/mindspore/nn/metrics/error.py
@@ -19,10 +19,10 @@ from .metric import Metric, rearrange_inputs
 
 class MAE(Metric):
     r"""
-    Calculates the mean absolute error.
+    Calculates the mean absolute error(MAE).
 
-    Creates a criterion that measures the mean absolute error (MAE)
-    between each element in the input: :math:`x` and the target: :math:`y`.
+    Creates a criterion that measures the MAE between each element
+    in the input: :math:`x` and the target: :math:`y`.
 
     .. math::
         \text{MAE} = \frac{\sum_{i=1}^n \|y_i - x_i\|}{n}
@@ -60,14 +60,14 @@ class MAE(Metric):
         Updates the internal evaluation result :math:`y_{pred}` and :math:`y`.
 
         Args:
-            inputs: Input `y_pred` and `y` for calculating mean absolute error where the shape of
+            inputs: Input `y_pred` and `y` for calculating MAE where the shape of
                 `y_pred` and `y` are both N-D and the shape are the same.
 
         Raises:
             ValueError: If the number of the input is not 2.
         """
         if len(inputs) != 2:
-            raise ValueError('Mean absolute error need 2 inputs (y_pred, y), but got {}'.format(len(inputs)))
+            raise ValueError('The MAE needs 2 inputs (y_pred, y), but got {}'.format(len(inputs)))
         y_pred = self._convert_data(inputs[0])
         y = self._convert_data(inputs[1])
         abs_error_sum = np.abs(y.reshape(y_pred.shape) - y_pred)
@@ -76,25 +76,25 @@ class MAE(Metric):
 
     def eval(self):
         """
-        Computes the mean absolute error.
+        Computes the mean absolute error(MAE).
 
         Returns:
             Float, the computed result.
 
         Raises:
-            RuntimeError: If the number of the total samples is 0.
+            RuntimeError: If the total number of samples is 0.
         """
         if self._samples_num == 0:
-            raise RuntimeError('Total samples num must not be 0.')
+            raise RuntimeError('The total number of samples must not be 0.')
         return self._abs_error_sum / self._samples_num
 
 
 class MSE(Metric):
     r"""
-    Measures the mean squared error.
+    Measures the mean squared error(MSE).
 
-    Creates a criterion that measures the mean squared error (squared L2
-    norm) between each element in the input: :math:`x` and the target: :math:`y`.
+    Creates a criterion that measures the MSE (squared L2 norm) between
+    each element in the input: :math:`x` and the target: :math:`y`.
 
     .. math::
         \text{MSE}(x,\ y) = \frac{\sum_{i=1}^n(y_i - x_i)^2}{n}
@@ -127,14 +127,14 @@ class MSE(Metric):
         Updates the internal evaluation result :math:`y_{pred}` and :math:`y`.
 
         Args:
-            inputs: Input `y_pred` and `y` for calculating mean square error where the shape of
+            inputs: Input `y_pred` and `y` for calculating the MSE where the shape of
                 `y_pred` and `y` are both N-D and the shape are the same.
 
         Raises:
             ValueError: If the number of input is not 2.
         """
         if len(inputs) != 2:
-            raise ValueError('Mean squared error need 2 inputs (y_pred, y), but got {}'.format(len(inputs)))
+            raise ValueError('The MSE needs 2 inputs (y_pred, y), but got {}'.format(len(inputs)))
 
         y_pred = self._convert_data(inputs[0])
         y = self._convert_data(inputs[1])
@@ -144,7 +144,7 @@ class MSE(Metric):
 
     def eval(self):
         """
-        Compute the mean squared error.
+        Computes the mean squared error(MSE).
 
         Returns:
             Float, the computed result.
diff --git a/mindspore/nn/metrics/fbeta.py b/mindspore/nn/metrics/fbeta.py
index 33cc9b024b4..ff5dd034ee8 100755
--- a/mindspore/nn/metrics/fbeta.py
+++ b/mindspore/nn/metrics/fbeta.py
@@ -49,7 +49,7 @@ class Fbeta(Metric):
         super(Fbeta, self).__init__()
         self.eps = sys.float_info.min
         if not beta > 0:
-            raise ValueError('`beta` must greater than zero, but got {}'.format(beta))
+            raise ValueError('The `beta` must be greater than zero, but got {}'.format(beta))
         self.beta = beta
         self.clear()
 
@@ -73,7 +73,7 @@ class Fbeta(Metric):
                 if one-hot encoding is used. Shape can also be :math:`(N,)` if category index is used.
         """
         if len(inputs) != 2:
-            raise ValueError('Fbeta need 2 inputs (y_pred, y), but got {}'.format(len(inputs)))
+            raise ValueError('The fbeta needs 2 inputs (y_pred, y), but got {}'.format(len(inputs)))
         y_pred = self._convert_data(inputs[0])
         y = self._convert_data(inputs[1])
         if y_pred.ndim == y.ndim and self._check_onehot_data(y):
@@ -82,12 +82,12 @@ class Fbeta(Metric):
         if self._class_num == 0:
             self._class_num = y_pred.shape[1]
         elif y_pred.shape[1] != self._class_num:
-            raise ValueError('Class number not match, last input data contain {} classes, but current data contain {} '
-                             'classes'.format(self._class_num, y_pred.shape[1]))
+            raise ValueError('The class number does not match, the last input data contains {} classes, '
+                             'but the current data contains {} classes'.format(self._class_num, y_pred.shape[1]))
         class_num = self._class_num
 
         if y.max() + 1 > class_num:
-            raise ValueError('y_pred contains {} classes less than y contains {} classes.'.
+            raise ValueError('The y_pred contains {} classes is less than y contains {} classes.'.
                              format(class_num, y.max() + 1))
         y = np.eye(class_num)[y.reshape(-1)]
         indices = y_pred.argmax(axis=1).reshape(-1)
@@ -113,7 +113,7 @@ class Fbeta(Metric):
         """
         validator.check_value_type("average", average, [bool], self.__class__.__name__)
         if self._class_num == 0:
-            raise RuntimeError('Input number of samples can not be 0.')
+            raise RuntimeError('The input number of samples can not be 0.')
 
         fbeta = (1.0 + self.beta ** 2) * self._true_positives / \
                 (self.beta ** 2 * self._actual_positives + self._positives + self.eps)
diff --git a/mindspore/nn/metrics/hausdorff_distance.py b/mindspore/nn/metrics/hausdorff_distance.py
index 299c35eec71..bc13e79aead 100644
--- a/mindspore/nn/metrics/hausdorff_distance.py
+++ b/mindspore/nn/metrics/hausdorff_distance.py
@@ -128,7 +128,7 @@ class HausdorffDistance(Metric):
             result = tuple(tup)
 
         if result is None:
-            raise ValueError(f"Sequence must have length {dim}, but got {len(tup)}.")
+            raise ValueError(f"The sequence length should be {dim}, but got {len(tup)}.")
 
         return result
 
@@ -172,7 +172,7 @@ class HausdorffDistance(Metric):
         box_end = list()
         for i in range(data.ndim):
             if nonzero_idx[i].size <= 0:
-                raise ValueError("did not find nonzero index at the spatial dim {}".format(i))
+                raise ValueError("Did not find nonzero index at the spatial dim {}".format(i))
             box_start.append(max(0, np.min(nonzero_idx[i]) - margin[i]))
             box_end.append(min(data.shape[i], np.max(nonzero_idx[i]) + margin[i] + 1))
         return box_start, box_end
@@ -195,7 +195,7 @@ class HausdorffDistance(Metric):
         if 0 <= self.percentile <= 100:
             return np.percentile(surface_distance, self.percentile)
 
-        raise ValueError(f"percentile should be a value between 0 and 100, get {self.percentile}.")
+        raise ValueError(f"The percentile value should be between 0 and 100, but got {self.percentile}.")
 
     def _get_surface_distance(self, y_pred_edges, y_edges):
         """
@@ -268,7 +268,7 @@ class HausdorffDistance(Metric):
         self._is_update = True
 
         if len(inputs) != 3:
-            raise ValueError('HausdorffDistance need 3 inputs (y_pred, y, label), but got {}'.format(len(inputs)))
+            raise ValueError('The HausdorffDistance needs 3 inputs (y_pred, y, label), but got {}'.format(len(inputs)))
 
         y_pred = self._convert_data(inputs[0])
         y = self._convert_data(inputs[1])
diff --git a/mindspore/nn/metrics/loss.py b/mindspore/nn/metrics/loss.py
index 37ce9543e70..d3505ef52cc 100644
--- a/mindspore/nn/metrics/loss.py
+++ b/mindspore/nn/metrics/loss.py
@@ -54,10 +54,10 @@ class Loss(Metric):
 
         Raises:
             ValueError: If the length of inputs is not 1.
-            ValueError: If the dimensions of loss is not 1.
+            ValueError: If the dimension of loss is not 1.
         """
         if len(inputs) != 1:
-            raise ValueError('Length of inputs must be 1, but got {}'.format(len(inputs)))
+            raise ValueError('The length of inputs must be 1, but got {}'.format(len(inputs)))
 
         loss = self._convert_data(inputs[0])
 
@@ -65,7 +65,7 @@ class Loss(Metric):
             loss = loss.reshape(1)
 
         if loss.ndim != 1:
-            raise ValueError("Dimensions of loss must be 1, but got {}".format(loss.ndim))
+            raise ValueError("The dimension of loss must be 1, but got {}".format(loss.ndim))
 
         loss = loss.mean(-1)
         self._sum_loss += loss
@@ -82,5 +82,5 @@ class Loss(Metric):
             RuntimeError: If the total number is 0.
         """
         if self._total_num == 0:
-            raise RuntimeError('Total number can not be 0.')
+            raise RuntimeError('The total number can not be 0.')
         return self._sum_loss / self._total_num
diff --git a/mindspore/nn/metrics/mean_surface_distance.py b/mindspore/nn/metrics/mean_surface_distance.py
index 4a1b16fdb05..714295d40f8 100644
--- a/mindspore/nn/metrics/mean_surface_distance.py
+++ b/mindspore/nn/metrics/mean_surface_distance.py
@@ -99,9 +99,9 @@ class MeanSurfaceDistance(Metric):
 
         Raises:
             ValueError: If the number of the inputs is not 3.
-            TypeError: If the data type of label_idx not be int or float.
+            TypeError: If the data type of label_idx is not int or float.
             ValueError: If the value of label_idx is not in y_pred or y.
-            ValueError: If y_pred and y should have different shape.
+            ValueError: If y_pred and y have different shapes.
         """
         if len(inputs) != 3:
             raise ValueError('MeanSurfaceDistance need 3 inputs (y_pred, y, label), but got {}.'.format(len(inputs)))
diff --git a/mindspore/nn/metrics/metric.py b/mindspore/nn/metrics/metric.py
index 057a4312af7..4715148febe 100644
--- a/mindspore/nn/metrics/metric.py
+++ b/mindspore/nn/metrics/metric.py
@@ -89,18 +89,18 @@ class Metric(metaclass=ABCMeta):
         elif isinstance(data, np.ndarray):
             pass
         else:
-            raise TypeError('Input data type must be tensor, list or numpy.ndarray')
+            raise TypeError('The input data type must be a tensor, list or numpy.ndarray')
         return data
 
     def _check_onehot_data(self, data):
         """
-        Whether input data are one-hot encoding.
+        Whether input data is one-hot encoding.
 
         Args:
             data (numpy.array): Input data.
 
         Returns:
-            bool, return true, if input data are one-hot encoding.
+            bool, return true, if input data is one-hot encoding.
         """
         if data.ndim > 1 and np.equal(data ** 2, data).all():
             shp = (data.shape[0],) + data.shape[2:]
@@ -139,13 +139,13 @@ class Metric(metaclass=ABCMeta):
 
     @property
     def indexes(self):
-        """The `_indexes` is a private attributes, and you can retrieve it by `self.indexes`.
+        """The `_indexes` is a private attribute, and you can retrieve it by `self.indexes`.
         """
         return getattr(self, '_indexes', None)
 
     def set_indexes(self, indexes):
         """
-        The `_indexes` is a private attributes, and you can modify it by this function.
+        The `_indexes` is a private attribute and you can modify it by this function.
         This allows you to determine the order of logits and labels to be calculated in the
         inputs, specially when you call the method `update` within this metrics.
 
@@ -183,7 +183,7 @@ class Metric(metaclass=ABCMeta):
         Evaluate input data once.
 
         Args:
-            inputs (tuple): The first item is predict array, the second item is target array.
+            inputs (tuple): The first item is a predict array, the second item is a target array.
 
         Returns:
             Float, compute result.
@@ -262,10 +262,10 @@ class EvaluationBase(Metric):
                                  'got y_pred shape is {} and y shape is {}'.format(y_pred.shape, y.shape))
         else:
             if y_pred.ndim != y.ndim:
-                raise ValueError('{} case, dims of y_pred need equal with dims of y, but got y_pred: {} '
+                raise ValueError('{} case, dims of y_pred must be equal to dims of y, but got y_pred: {} '
                                  'dims and y: {} dims.'.format(self._type, y_pred.ndim, y.ndim))
             if y_pred.shape != y.shape:
-                raise ValueError('{} case, y_pred shape need equal with y shape, but got y_pred: {} and y: {}'.
+                raise ValueError('{} case, y_pred shape must be equal to y shape, but got y_pred: {} and y: {}'.
                                  format(self._type, y_pred.shape, y.shape))
 
     def _check_value(self, y_pred, y):
@@ -296,7 +296,7 @@ class EvaluationBase(Metric):
             All subclasses must override this interface.
 
         Args:
-            inputs: The first item is predicted array and the second item is target array.
+            inputs: The first item is a predicted array and the second item is a target array.
         """
         raise NotImplementedError
 
diff --git a/mindspore/nn/metrics/occlusion_sensitivity.py b/mindspore/nn/metrics/occlusion_sensitivity.py
index 370224d5c67..0073ec18f58 100644
--- a/mindspore/nn/metrics/occlusion_sensitivity.py
+++ b/mindspore/nn/metrics/occlusion_sensitivity.py
@@ -35,7 +35,7 @@ class OcclusionSensitivity(Metric):
 
     For a given result, the output probability is the probability of a region.
 
-    The higher the value in the output image, the greater the decline of certainty, indicating that
+    The higher the value in the output image is, the greater the decline of certainty, indicating that
     the occluded area is more important in the decision-making process.
 
     Args:
@@ -96,7 +96,8 @@ class OcclusionSensitivity(Metric):
             b_box_min = b_box_max = None
         else:
             if len(b_box) != 2 * len(im_shape):
-                raise ValueError("Bounding box should contain upper and lower for all dimensions (except batch number)")
+                raise ValueError("The bounding box should contain upper and lower "
+                                 "for all dimensions (except batch number)")
 
             b_box_min = np.array(b_box[::2])
             b_box_max = np.array(b_box[1::2])
@@ -130,7 +131,7 @@ class OcclusionSensitivity(Metric):
 
         Inputs:
             - **model** (nn.Cell) - classification model to use for inference.
-            - **y_pred** (Union[Tensor, list, np.ndarray]) - image to test. Should be tensor consisting of 1 batch,
+            - **y_pred** (Union[Tensor, list, np.ndarray]) - image to test. Should be a tensor consisting of 1 batch,
               can be 2- or 3D.
             - **label** (Union[int, Tensor]) - classification label to check for changes (normally the true label,
               but doesn't have to be
@@ -141,7 +142,8 @@ class OcclusionSensitivity(Metric):
             RuntimeError: If the number of labels is different from the number of batches.
         """
         if len(inputs) != 3:
-            raise ValueError('occlusion_sensitivity need 3 inputs (model, y_pred, y), but got {}'.format(len(inputs)))
+            raise ValueError('The occlusion_sensitivity needs 3 inputs (model, y_pred, y), '
+                             'but got {}'.format(len(inputs)))
 
         model = inputs[0]
         y_pred = self._convert_data(inputs[1])
diff --git a/mindspore/nn/metrics/perplexity.py b/mindspore/nn/metrics/perplexity.py
index 79d5c833e68..c7fc3a6f1f7 100644
--- a/mindspore/nn/metrics/perplexity.py
+++ b/mindspore/nn/metrics/perplexity.py
@@ -77,17 +77,17 @@ class Perplexity(Metric):
 
         Raises:
             ValueError: If the number of the inputs is not 2.
-            RuntimeError: If preds and labels should have different length.
-            RuntimeError: If label shape should not be equal to pred shape.
+            RuntimeError: If preds and labels have different lengths.
+            RuntimeError: If label shape is not equal to pred shape.
         """
         if len(inputs) != 2:
-            raise ValueError('Perplexity needs 2 inputs (preds, labels), but got {}.'.format(len(inputs)))
+            raise ValueError('The perplexity needs 2 inputs (preds, labels), but got {}.'.format(len(inputs)))
 
         preds = [self._convert_data(inputs[0])]
         labels = [self._convert_data(inputs[1])]
 
         if len(preds) != len(labels):
-            raise RuntimeError('preds and labels should have the same length, but the length of preds is{}, '
+            raise RuntimeError('The preds and labels should have the same length, but the length of preds is{}, '
                                'the length of labels is {}.'.format(len(preds), len(labels)))
 
         loss = 0.
@@ -121,6 +121,6 @@ class Perplexity(Metric):
             RuntimeError: If the sample size is 0.
         """
         if self._num_inst == 0:
-            raise RuntimeError('Perplexity can not be calculated, because the number of samples is 0.')
+            raise RuntimeError('The perplexity can not be calculated, because the number of samples is 0.')
 
         return math.exp(self._sum_metric / self._num_inst)
diff --git a/mindspore/nn/metrics/precision.py b/mindspore/nn/metrics/precision.py
index 4419c512ff7..d7c570f38b6 100644
--- a/mindspore/nn/metrics/precision.py
+++ b/mindspore/nn/metrics/precision.py
@@ -91,7 +91,7 @@ class Precision(EvaluationBase):
             ValueError: If the number of input is not 2.
         """
         if len(inputs) != 2:
-            raise ValueError('Precision need 2 inputs (y_pred, y), but got {}'.format(len(inputs)))
+            raise ValueError('The precision needs 2 inputs (y_pred, y), but got {}'.format(len(inputs)))
         y_pred = self._convert_data(inputs[0])
         y = self._convert_data(inputs[1])
         if self._type == 'classification' and y_pred.ndim == y.ndim and self._check_onehot_data(y):
@@ -141,7 +141,7 @@ class Precision(EvaluationBase):
             Float, the computed result.
         """
         if self._class_num == 0:
-            raise RuntimeError('Input number of samples can not be 0.')
+            raise RuntimeError('The input number of samples can not be 0.')
 
         validator.check_value_type("average", average, [bool], self.__class__.__name__)
         result = self._true_positives / (self._positives + self.eps)
diff --git a/mindspore/nn/metrics/recall.py b/mindspore/nn/metrics/recall.py
index 6aecfb2afeb..c4541c313f4 100644
--- a/mindspore/nn/metrics/recall.py
+++ b/mindspore/nn/metrics/recall.py
@@ -37,7 +37,7 @@ class Recall(EvaluationBase):
         In the multi-label cases, the elements of :math:`y` and :math:`y_{pred}` must be 0 or 1.
 
     Args:
-        eval_type (str): Metric to calculate the recall over a dataset, for classification or
+        eval_type (str): The metric to calculate the recall over a dataset, for classification or
                          multilabel. Default: 'classification'.
 
     Examples:
@@ -91,7 +91,7 @@ class Recall(EvaluationBase):
             ValueError: If the number of input is not 2.
         """
         if len(inputs) != 2:
-            raise ValueError('Recall need 2 inputs (y_pred, y), but got {}'.format(len(inputs)))
+            raise ValueError('The recall needs 2 inputs (y_pred, y), but got {}'.format(len(inputs)))
         y_pred = self._convert_data(inputs[0])
         y = self._convert_data(inputs[1])
         if self._type == 'classification' and y_pred.ndim == y.ndim and self._check_onehot_data(y):
@@ -102,8 +102,8 @@ class Recall(EvaluationBase):
         if self._class_num == 0:
             self._class_num = y_pred.shape[1]
         elif y_pred.shape[1] != self._class_num:
-            raise ValueError('Class number not match, last input data contain {} classes, but current data contain {} '
-                             'classes'.format(self._class_num, y_pred.shape[1]))
+            raise ValueError('The class number does not match, the last input data contains {} classes, '
+                             'but the current data contains {} classes'.format(self._class_num, y_pred.shape[1]))
 
         class_num = self._class_num
         if self._type == "classification":
@@ -140,7 +140,7 @@ class Recall(EvaluationBase):
             Float, the computed result.
         """
         if self._class_num == 0:
-            raise RuntimeError('Input number of samples can not be 0.')
+            raise RuntimeError('The input number of samples can not be 0.')
 
         validator.check_value_type("average", average, [bool], self.__class__.__name__)
         result = self._true_positives / (self._actual_positives + self.eps)
diff --git a/mindspore/nn/metrics/roc.py b/mindspore/nn/metrics/roc.py
index aed53398082..c9afa4f7f4d 100644
--- a/mindspore/nn/metrics/roc.py
+++ b/mindspore/nn/metrics/roc.py
@@ -90,7 +90,8 @@ class ROC(Metric):
         # single class evaluation
         if len(y_pred.shape) == len(y.shape):
             if class_num is not None and class_num != 1:
-                raise ValueError('y_pred and y should have the same shape, but number of classes is different from 1.')
+                raise ValueError('The y_pred and y should have the same shape, '
+                                 'but the number of classes is different from 1.')
             class_num = 1
             if pos_label is None:
                 pos_label = 1
diff --git a/mindspore/nn/metrics/root_mean_square_surface_distance.py b/mindspore/nn/metrics/root_mean_square_surface_distance.py
index f3160ee7b41..e9c746d62e2 100644
--- a/mindspore/nn/metrics/root_mean_square_surface_distance.py
+++ b/mindspore/nn/metrics/root_mean_square_surface_distance.py
@@ -101,9 +101,9 @@ class RootMeanSquareDistance(Metric):
 
         Raises:
             ValueError: If the number of the inputs is not 3.
-            TypeError: If the data type of label_idx not be int or float.
+            TypeError: If the data type of label_idx is not int or float.
             ValueError: If the value of label_idx is not in y_pred or y.
-            ValueError: If y_pred and y should have different shape.
+            ValueError: If y_pred and y have different shapes.
         """
         if len(inputs) != 3:
             raise ValueError('MeanSurfaceDistance need 3 inputs (y_pred, y, label), but got {}.'.format(len(inputs)))
diff --git a/mindspore/nn/metrics/topk.py b/mindspore/nn/metrics/topk.py
index 65cc38e2f10..e7421de76e3 100644
--- a/mindspore/nn/metrics/topk.py
+++ b/mindspore/nn/metrics/topk.py
@@ -73,7 +73,7 @@ class TopKCategoricalAccuracy(Metric):
                 if one-hot encoding is used. Shape can also be :math:`(N,)` if category index is used.
         """
         if len(inputs) != 2:
-            raise ValueError('Topk need 2 inputs (y_pred, y), but got {}'.format(len(inputs)))
+            raise ValueError('The topk needs 2 inputs (y_pred, y), but got {}'.format(len(inputs)))
 
         y_pred = self._convert_data(inputs[0])
         y = self._convert_data(inputs[1])
@@ -93,7 +93,7 @@ class TopKCategoricalAccuracy(Metric):
             Float, computed result.
         """
         if self._samples_num == 0:
-            raise RuntimeError('Total samples num must not be 0.')
+            raise RuntimeError('The total number of samples must not be 0.')
         return self._correct_num / self._samples_num
 
 
diff --git a/mindspore/nn/optim/__init__.py b/mindspore/nn/optim/__init__.py
index 469ef9632b9..87aedfbaceb 100644
--- a/mindspore/nn/optim/__init__.py
+++ b/mindspore/nn/optim/__init__.py
@@ -30,6 +30,7 @@ from .proximal_ada_grad import ProximalAdagrad
 from .lazyadam import LazyAdam
 from .ada_grad import Adagrad
 from .thor import thor
+from .adafactor import AdaFactor
 
 __all__ = ['Optimizer', 'Momentum', 'LARS', 'Adam', 'AdamWeightDecay', 'LazyAdam', 'AdamOffload',
-           'Lamb', 'SGD', 'FTRL', 'RMSProp', 'ProximalAdagrad', 'Adagrad', 'thor']
+           'Lamb', 'SGD', 'FTRL', 'RMSProp', 'ProximalAdagrad', 'Adagrad', 'thor', 'AdaFactor']
diff --git a/mindspore/nn/optim/optimizer.py b/mindspore/nn/optim/optimizer.py
index a5a83dc8d15..24f768616df 100644
--- a/mindspore/nn/optim/optimizer.py
+++ b/mindspore/nn/optim/optimizer.py
@@ -218,7 +218,7 @@ class Optimizer(Cell):
         else:
             self.use_parallel = False
         if self.use_parallel:
-            if self.cls_name not in ["Lamb", "AdamWeightDecay"]:
+            if self.cls_name not in ["Lamb", "AdamWeightDecay", "AdaFactor"]:
                 raise RuntimeError("Parallel optimizer does not support optimizer {}".format(self.cls_name))
             self.dev_num = _get_device_num()
             if self.dev_num > self.param_length:
diff --git a/mindspore/nn/optim/thor.py b/mindspore/nn/optim/thor.py
index 5011ebbfc28..632be9ee8ad 100644
--- a/mindspore/nn/optim/thor.py
+++ b/mindspore/nn/optim/thor.py
@@ -113,13 +113,10 @@ def _check_param(momentum, frequency, lr, cls_name):
 
 
 def caculate_device_shape(matrix_dim, channel, is_a):
-    ll = (0)
     if is_a:
         if channel // C0 == 0:
             matrix_dim = (matrix_dim / channel) * C0
-        ll = (int(matrix_dim // C0), int(matrix_dim // C0), C0, C0), int(matrix_dim)
-    else:
-        ll = (int(matrix_dim // C0), int(matrix_dim // C0), C0, C0), int(matrix_dim)
+    ll = (int(matrix_dim // C0), int(matrix_dim // C0), C0, C0), int(matrix_dim)
     return ll
 
 
diff --git a/mindspore/nn/parallel/__init__.py b/mindspore/nn/parallel/__init__.py
deleted file mode 100644
index b7bd96acdae..00000000000
--- a/mindspore/nn/parallel/__init__.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""
-Parallel Networks.
-This is an experimental interface that is subject to change and/or deletion.
-"""
-from .transformer import *
-__all__ = []
-__all__.extend(transformer.__all__)
diff --git a/mindspore/nn/parallel/transformer/__init__.py b/mindspore/nn/parallel/transformer/__init__.py
deleted file mode 100644
index c48b6839191..00000000000
--- a/mindspore/nn/parallel/transformer/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""
-Transformer Networks
-This is an experimental interface that is subject to change and/or deletion.
-"""
-from .transformer import *
-
-__all__ = []
-__all__.extend(transformer.__all__)
diff --git a/mindspore/nn/parallel/transformer/transformer.py b/mindspore/nn/parallel/transformer/transformer.py
deleted file mode 100644
index ba875dca898..00000000000
--- a/mindspore/nn/parallel/transformer/transformer.py
+++ /dev/null
@@ -1,1225 +0,0 @@
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""Transformer Networks. This is an experimental interface that is subject to change and/or deletion."""
-import math
-import numpy as np
-from mindspore.common.tensor import Tensor
-from mindspore.common.parameter import Parameter
-from mindspore.common.initializer import initializer
-from mindspore.common.seed import _get_graph_seed
-from mindspore._checkparam import Validator
-from mindspore import context
-from mindspore import nn
-import mindspore.common.dtype as mstype
-from mindspore.ops import operations as P
-from mindspore.ops import functional as F
-from mindspore._extends import cell_attr_register
-from mindspore.nn.cell import Cell
-from mindspore.nn.layer import Dense
-
-__all__ = [
-    "Dropout",
-    "LayerNorm",
-    "Linear",
-    "AttentionMask",
-    "VocabEmbedding",
-    "MultiHeadAttention",
-    "FeedForward",
-    "TransformerEncoder",
-    "TransformerDecoder",
-    "TransformerEncoderLayer",
-    "TransformerDecoderLayer",
-    "Transformer",
-    "TransformerParallelConfig"]
-
-
-class TransformerParallelConfig:
-    r"""
-        TransformerParallelConfig for the setting the global data parallel, model parallel and fusion group.
-        The parallel configure setting
-
-        Args:
-            dp (int): The data parallel way. Default: 1
-            mp (int): The model parallel way. Default: 1
-            pp (int): The number of the pipeline stage. Should be a positive value. Default: 1.
-            optimizer_parallel (bool): Enable optimizer state sharding or not. Default: True.
-            gradient_aggregation_group (int): The fusion group size of the optimizer state sharding. Default: 4.
-            recompute (bool): Enable recomputation of the transformer block or not. Default: False.
-            vocab_emb_dp (bool): Shard embedding in model parallel or data parallel. Default: True
-
-        Supported Platforms:
-            ``Ascend`` ``GPU``
-
-        Examples:
-            >>> config=TransformerParallelConfig(dp=1, mp=1)
-    """
-    def __init__(self, dp=1, mp=1, pp=1, recompute=False, optimizer_parallel=True, gradient_aggregation_group=4,
-                 vocab_emb_dp=True):
-        self.dp = dp
-        self.mp = mp
-        self.pp = pp
-        self.recompute = recompute
-        self.optimizer_parallel = optimizer_parallel
-        self.gradient_aggregation_group = gradient_aggregation_group
-        self.vocab_emb_dp = vocab_emb_dp
-
-    def __str__(self):
-        info = "[TransformerParallelConfig]" + '\n'
-        for k, v in self.__dict__.items():
-            var_info = "{}:{}\n".format(k, v)
-            info += var_info
-        return info
-
-
-# In case the user doesn't pass a config as args.
-default_transformer_config = TransformerParallelConfig()
-
-class Dropout(Cell):
-    r"""
-        A Dropout Implements with P.DropoutGenMask and  P.DropoutDoMask for parallel training.
-        Args:
-            keep_prob: the keep probability of the inputs. Default 0.5
-            dtype: the input type. Default mstype.float32
-
-        Inputs:
-            x: To be dropped tensor.
-
-        Returns:
-            a tensor with dropped value.
-        Examples:
-            >>> x = Tensor(np.ones([2, 2, 3]), mindspore.float32)
-            >>> net = nn.transformer.Dropout(keep_prob=0.8)
-            >>> net.set_train()
-            Dropout<keep_prob=0.8>
-            >>> output = net(x)
-            >>> print(output.shape)
-            (2, 2, 3)
-    """
-
-    def __init__(self, keep_prob=0.5, dtype=mstype.float32):
-        super(Dropout, self).__init__()
-        if keep_prob <= 0 or keep_prob > 1:
-            raise ValueError(
-                "dropout probability should be a number in range (0, 1], but got {}".format(
-                    keep_prob))
-        Validator.check_subclass("dtype", dtype, mstype.number_type, self.cls_name)
-        Validator.check_value_type('keep_prob', keep_prob, [float], self.cls_name)
-        self.keep_prob = keep_prob
-        self.is_ascend = context.get_context('device_target') in ["Ascend"]
-        if self.is_ascend:
-            seed0, seed1 = _get_graph_seed(0, "dropout")
-            self.seed0 = seed0
-            self.seed1 = seed1
-            self.dtype = dtype
-            self.get_shape = P.Shape()
-            self.dropout_gen_mask = P.DropoutGenMask(Seed0=self.seed0, Seed1=self.seed1)
-            self.dropout_do_mask = P.DropoutDoMask()
-            self.cast = P.Cast()
-        else:
-            self.dropout = P.Dropout(keep_prob)
-
-    def construct(self, x):
-        if not self.training:
-            return x
-
-        if not self.is_ascend:
-            out, _ = self.dropout(x)
-            return out
-
-        if self.keep_prob == 1:
-            return x
-
-        shape = self.get_shape(x)
-        dtype = P.DType()(x)
-        keep_prob = self.cast(self.keep_prob, dtype)
-        output = self.dropout_gen_mask(shape, keep_prob)
-        return self.dropout_do_mask(x, output, keep_prob)
-
-    def extend_repr(self):
-        return 'keep_prob={}, dtype={}'.format(self.keep_prob, self.dtype)
-
-    def shard(self, strategy):
-        r"""
-        Set the shard for the dropout. the strategy size should be equal to the inputs.
-
-        Args:
-            strategy (tuple): The strategy for the dropout. Should be the same shape as the inputs.
-        Examples:
-            >>> net = nn.transformer.Dropout(keep_prob=0.8)
-            >>> net.set_train()
-            Dropout<keep_prob=0.8>
-            >>> net.shard(((2, 1),))
-        """
-        if self.is_ascend:
-            self.dropout_gen_mask.shard(strategy)
-            self.dropout_do_mask.shard(strategy)
-        else:
-            self.dropout.shard(strategy)
-
-
-class LayerNorm(Cell):
-    r"""
-        A self-defined layer norm operation using reduce sum and reduce mean
-
-        Args:
-            normalized_shape (tuple): The shape of the input tensor
-            dp (int): The data parallel way of the inputs, Default:1
-            eps (float): The epsilon value of the denominator. Default 1e-5.
-        Inputs:
-            - **x** (Tensor) - Tensor of shape :math:`(batch, seq\_length, hidden\_size)`.
-
-        Outputs:
-            Tensor of shape :math:`(batch, seq_length, hidden_size)`.
-    """
-
-    def __init__(self, normalized_shape, eps=1e-5):
-        super(LayerNorm, self).__init__()
-        self.gamma = Parameter(initializer('ones', normalized_shape), name="gamma", parallel_optimizer=False)
-        self.beta = Parameter(initializer('zeros', normalized_shape), name="beta", parallel_optimizer=False)
-        self.mean = P.ReduceMean(keep_dims=True)
-        self.square = P.Square()
-        self.sqrt = P.Sqrt()
-        self.sub1 = P.Sub()
-        self.sub2 = P.Sub()
-        self.add = P.TensorAdd()
-        self.eps = eps
-        self.mul = P.Mul()
-        self.add2 = P.TensorAdd()
-        self.real_div = P.RealDiv()
-
-    def construct(self, x):
-        r"""
-          x : batch x seq_length x hidden_size
-        """
-        mean = self.mean(x, -1)
-        diff = self.sub1(x, mean)
-        variance = self.mean(self.square(diff), -1)
-        variance_eps = self.sqrt(self.add(variance, self.eps))
-        output = self.real_div(diff, variance_eps)
-        output = self.add2(self.mul(output, self.gamma), self.beta)
-        return output
-
-    def shard(self, strategy):
-        r"""
-        Set the shard for the layer norm. the strategy size should be equal to the inputs.
-
-        Args:
-            strategy (tuple): The strategy for the dropout. Should be the same shape as the inputs.
-        Examples:
-            >>> net = nn.transformer.LayerNorm(normalized_shape=(1024, 10))
-            >>> net.shard(((10, 2, 1),))
-        """
-        self.mean.shard(strategy)
-        self.square.shard(strategy)
-        self.sqrt.shard(strategy)
-        self.sub1.shard((strategy[0], strategy[0]))
-        self.sub2.shard((strategy[0], strategy[0]))
-        self.add.shard((strategy[0], ()))
-        self.mul.shard((strategy[0], (1,)))
-        self.add2.shard((strategy[0], (1,)))
-        self.real_div.shard((strategy[0], strategy[0]))
-
-
-class Linear(Dense):
-    r"""
-    The dense connected layer. Once the parallel mode is enabled, the input shape should be
-    3-D tensor.
-
-    Applies dense connected layer for the input. This layer implements the operation as:
-
-    .. math::
-        \text{outputs} = \text{activation}(\text{X} * \text{kernel} + \text{bias}),
-
-    where :math:`X` is the input tensors, :math:`\text{activation}` is the activation function passed as the activation
-    argument (if passed in), :math:`\text{kernel}` is a weight matrix with the same
-    data type as the :math:`X` created by the layer, and :math:`\text{bias}` is a bias vector
-    with the same data type as the :math:`X` created by the layer (only if has_bias is True).
-
-    Args:
-        in_channels (int): The number of channels in the input space.
-        out_channels (int): The number of channels in the output space.
-        weight_init (Union[Tensor, str, Initializer, numbers.Number]): The trainable weight_init parameter. The dtype
-            is same as `x`. The values of str refer to the function `initializer`. Default: 'normal'.
-        bias_init (Union[Tensor, str, Initializer, numbers.Number]): The trainable bias_init parameter. The dtype is
-            same as `x`. The values of str refer to the function `initializer`. Default: 'zeros'.
-        has_bias (bool): Specifies whether the layer uses a bias vector. Default: True.
-        activation (str): activate function applied to the output of the fully connected layer,
-            eg. 'ReLU'.Default: None.
-        compute_dtype (mstype): The computation type. Default: mstype.float16
-    Inputs:
-        - **x** (Tensor) - Tensor of shape :math:`(*, in\_channels)`. The `in_channels` in `Args` should be equal
-          to :math:`in\_channels` in `Inputs`.
-
-    Outputs:
-        Tensor of shape :math:`(*, out\_channels)`.
-
-    Raises:
-        TypeError: If `in_channels` or `out_channels` is not an int.
-        TypeError: If `has_bias` is not a bool.
-        TypeError: If `activation` is not one of str, Cell, Primitive, None.
-        ValueError: If length of shape of `weight_init` is not equal to 2 or shape[0] of `weight_init`
-                    is not equal to `out_channels` or shape[1] of `weight_init` is not equal to `in_channels`.
-        ValueError: If length of shape of `bias_init` is not equal to 1
-                    or shape[0] of `bias_init` is not equal to `out_channels`.
-
-    Supported Platforms:
-        ``Ascend`` ``GPU``
-
-    Examples:
-        >>> x = Tensor(np.ones((10, 20, 3)), mindspore.float32)
-        >>> net = Linear(3, 4)
-        >>> output = net(x)
-        >>> print(output.shape)
-        (10, 20, 4)
-    """
-
-    @cell_attr_register(attrs=['has_bias', 'in_channels', 'out_channels', 'shard_output', 'activation'])
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 weight_init='normal',
-                 bias_init='zeros',
-                 has_bias=True,
-                 activation=None,
-                 compute_dtype=mstype.float16):
-        super(Linear, self).__init__(in_channels=in_channels,
-                                     out_channels=out_channels,
-                                     weight_init=weight_init,
-                                     bias_init=bias_init,
-                                     has_bias=has_bias,
-                                     activation=activation)
-        if activation and not isinstance(activation, str):
-            raise ValueError("Activation can only be str, but found type {}".format(activation))
-        self.act_name = activation
-        self.dtype = compute_dtype
-        self.cast = P.Cast()
-        self.has_bias = self.has_bias
-
-    def construct(self, x):
-        out_shape = P.Shape()(x)[:-1] + (self.out_channels,)
-        x = P.Reshape()(x, (-1, self.in_channels))
-        weight = self.cast(self.weight, self.dtype)
-        x = self.matmul(x, weight)
-        x = self.bias_add(x, self.cast(self.bias, self.dtype))
-        output = P.Reshape()(x, out_shape)
-        if self.activation_flag:
-            output = self.activation(output)
-        return output
-
-    def shard(self, strategy_matmul, strategy_bias=None, strategy_activation=None):
-        r"""
-         Set the shard for the linear. the strategy size should be equal to the inputs.
-
-         Args:
-             strategy_matmul (tuple): The strategy for the matmul. Should be the same shape as the inputs.
-             strategy_bias (tuple): The strategy for the bias_add. Should be the same shape as the inputs.
-             strategy_activation (tuple): The strategy for the strategy_activation. Should be the same shape as
-                the inputs.
-         Examples:
-             >>> net = nn.transformer.Linear(16, 8, has_bias=True)
-             >>> net.shard(strategy_matmul=((2, 1), (2, 1)),
-             >>>           strategy_bias=((2, 2), (2,)))
-         """
-        self.matmul.shard(strategy_matmul)
-        if self.has_bias:
-            self.bias_add.shard(strategy_bias)
-        if self.activation_flag:
-            getattr(self.activation, self.act_name).shard(strategy_activation)
-
-
-class FeedForward(Cell):
-    """
-    The multilayer perceptron with two linear layers with dropout applied at final output. The first linear
-    will project the input dimension from hidden_size to ffn_hidden_size, the second linear will project the
-    dimension from ffn_hidden_size to hidden_size. The first linear is sharded on the relative dimension,
-    the second linear is sharded on the output dimension.
-    Args:
-        hidden_size (int): The dimension of the inputs.
-        ffn_hidden_size (int): The intermediate hidden size.
-        dropout_rate (float): The dropout rate for the second linear's output.
-        hidden_act (str): The activate type of the first linear, Default: gelu.
-        parallel_config(TransformerParallelConfig): the config of parallel setting, see `TransformerParallelConfig`
-    Inputs:
-        x: should be `[batch, seq_length, hidden_size]`.
-    Returns:
-        output: Tensor, the output of this layer after mapping. The shape is `[batch, seq_length, hidden_size]`.
-
-    Supported Platforms:
-        ``Ascend`` ``GPU``
-    Examples:
-        >>> model = FeedForward(hidden_size=15, ffn_hidden_size=30, dropout_rate=0.1)
-        >>> tensor = Tensor(np.ones((2, 20, 15)), dtype.float32)
-        >>> output = model(tensor)
-    """
-
-    def __init__(self, hidden_size,
-                 ffn_hidden_size,
-                 dropout_rate,
-                 hidden_act='gelu',
-                 parallel_config=default_transformer_config):
-        super(FeedForward, self).__init__()
-        dp = parallel_config.dp
-        mp = parallel_config.mp
-        input_size = hidden_size
-        output_size = ffn_hidden_size
-        # Project to ffn_hidden_size
-        self.mapping = Linear(in_channels=input_size,
-                              out_channels=output_size,
-                              activation=hidden_act)
-        self.mapping.shard(strategy_bias=((dp, mp), (mp,)),
-                           strategy_matmul=((dp, 1), (mp, 1)),
-                           strategy_activation=((dp, 1, mp),))
-        # Project back to embedding_size
-        self.projection = Linear(in_channels=output_size,
-                                 out_channels=input_size)
-        self.projection.shard(strategy_bias=((dp, 1), (1,)),
-                              strategy_matmul=((dp, mp), (1, mp)))
-        self.dropout = Dropout(1 - dropout_rate)
-        self.dropout.shard(((dp, 1, 1),))
-        self.cast = P.Cast()
-
-    def construct(self, x):
-        x = self.cast(x, mstype.float16)
-        # [bs, seq_length, ffn_hidden_size]
-        hidden = self.mapping(x)
-        output = self.projection(hidden)
-        # [bs, seq_length, hidden_size]
-        output = self.dropout(output)
-        return output
-
-
-class AttentionMask(Cell):
-    r"""
-    Get the Lower triangular matrix.
-    Args:
-        seq_length: the length of the
-        parallel_config(parallel_config): the parallel configure
-    Inputs:
-        input_mask: the mask indicating whether each position is a valid input with (batch_size, seq_length)
-    Outputs:
-        attention_mask: the attention mask matrix with shape (batch_size, 1, seq_length, seq_length)
-
-    Supported Platforms:
-        ``Ascend`` ``GPU``
-    """
-
-    def __init__(self, seq_length, parallel_config=default_transformer_config):
-        super(AttentionMask, self).__init__()
-        self.reshape = P.Reshape()
-        self.mul = P.BatchMatMul().shard(
-            ((parallel_config.dp, 1, 1), (parallel_config.dp, 1, 1)))
-        self.expand_dim = P.ExpandDims().shard(((1, 1),))
-        ones = np.ones(shape=(seq_length, seq_length))
-        # Default lower triangle mask matrix
-        self.lower_triangle_mask = Tensor(np.tril(ones), mstype.float32)
-        self.multiply = P.Mul().shard(((parallel_config.dp, 1, 1), (1, 1, 1)))
-
-    def construct(self, input_mask):
-        r"""
-        Generate the attention mask matrix.
-        """
-        input_shape = P.Shape()(input_mask)
-        shape_right = (input_shape[0], 1, input_shape[1])
-        shape_left = input_shape + (1,)
-        # Mask the padded inputs
-        mask_left = self.reshape(input_mask, shape_left)
-        mask_right = self.reshape(input_mask, shape_right)
-        attention_mask = self.mul(mask_left, mask_right)
-        lower_traiangle = self.expand_dim(self.lower_triangle_mask, 0)
-        # [bs, seq_length, seq_length]
-        attention_mask = self.multiply(
-            attention_mask, lower_traiangle)
-        return attention_mask
-
-
-class VocabEmbedding(Cell):
-    """
-    The embedding lookup table for vocabulary
-    Args:
-        vocab_size (int): Size of the dictionary of embeddings.
-        embedding_size (int): The size of each embedding vector.
-        param_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the embedding_table.
-            Refer to class `initializer` for the values of string when a string
-            is specified. Default: 'normal'.
-        parallel_config(TransformerParallelConfig): the parallel config of network.
-    Inputs:
-        input_ids: the tokenized inputs with datatype int32 with shape (batch_size, seq_length)
-    Outputs:
-        output: Tensor, the embedding vector for the input with shape (batch_size,
-        seq_length, embedding_size)
-        self.weight: Tensor, the embedding table for the vocabulary
-
-    Raises:
-        ValueError: If the parallel_config.vocab_emb_dp is True, the vocab size is not a multiple of
-            parallel_config.mp
-    Supported Platforms:
-        ``Ascend`` ``GPU``
-    Examples:
-        >>> model = VocabEmbedding(vocab_size=30, embedding_size=30)
-        >>> tensor = Tensor(np.ones((20, 15)), dtype.int32)
-        >>> output = model(tensor)
-    """
-
-    def __init__(self, vocab_size, embedding_size, parallel_config=default_transformer_config, param_init='normal'):
-        super(VocabEmbedding, self).__init__()
-        self.vocab_size = vocab_size
-        self.embedding_size = embedding_size
-        self.weight = Parameter(initializer(param_init, [self.vocab_size, self.embedding_size]),
-                                name='embedding_table')
-        if parallel_config.vocab_emb_dp:
-            self.gather = P.GatherV2().shard(((1, 1), (parallel_config.dp, 1)))
-        else:
-            if self.embedding_size % parallel_config.mp != 0:
-                raise ValueError(f"The vocab size of the embedding {self.vocab_size} must be a "
-                                 f"multiple of parallel_config.mp {parallel_config.mp}.")
-            self.gather = P.GatherV2().shard(((parallel_config.mp, 1), (1, 1)))
-
-    def construct(self, input_ids):
-        output = self.gather(self.weight, input_ids, 0)
-        return output, self.weight
-
-
-class MultiHeadAttention(Cell):
-    """
-    MultiHeadAttention module.
-
-    Args:
-        hidden_size(int): The hidden size of the input.
-        num_heads(int): The number of the heads.
-        hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1
-        attention_dropout_rate(float): The dropout rate of the attention scores. Default:0.1
-        compute_dtype(mstype): The computation type. Default mstype.float16. The computation of the
-            softmax will be converted to the float32.
-        use_past(bool): Use the past state to compute. Default False.
-        parallel_config(TransformerParallelConfig): The parallel configure.
-    Inputs:
-        from_tensor: the query vector with shape (batch_size, src_seq_length, hidden_size).
-        to_tensor: the key and value vector with shape (batch_size, tgt_seq_length, hidden_size).
-        attention_mask: the attention mask matrix with shape (batch_size, 1,
-        seq_length, seq_length)
-        layer_past: the previous feature map
-
-    Outputs:
-        output: Tensor, the output logit of this layer
-        layer_present: Tensor, the feature map of current layer
-
-    Supported Platforms:
-        ``Ascend`` ``GPU``
-    Examples:
-        >>> model = MultiHeadAttention(hidden_size=15, from_seq_length=20, to_seq_length=20,
-        >>>                           num_heads=3)
-        >>> from_tensor = Tensor(np.ones((2, 20, 15)), dtype.float32)
-        >>> to_tensor = Tensor(np.ones((2, 20, 15)), dtype.float16)
-        >>> attention_mask = Tensor(np.ones((2, 1, 20, 20)), dtype.float16)
-        >>> model(from_tensor, to_tensor, attention_mask)
-    """
-
-    def __init__(self, hidden_size,
-                 num_heads,
-                 hidden_dropout_rate=0.1,
-                 attention_dropout_rate=0.1,
-                 compute_dtype=mstype.float16,
-                 use_past=False,
-                 parallel_config=default_transformer_config):
-        super(MultiHeadAttention, self).__init__()
-        # Output layer
-        self.projection = Linear(in_channels=hidden_size,
-                                 out_channels=hidden_size).to_float(compute_dtype)
-        self.projection.shard(strategy_bias=((parallel_config.dp, 1), (1,)),
-                              strategy_matmul=((parallel_config.dp, parallel_config.mp), (1, parallel_config.mp)))
-        self.transpose = P.Transpose().shard(((parallel_config.dp, 1, parallel_config.mp, 1),))
-        self.merger_head_transpose = P.Transpose().shard(
-            ((parallel_config.dp, parallel_config.mp, 1, 1),))
-        self.reshape = P.Reshape()
-        self.n_head = num_heads
-        # embedding size per head
-        self.size_per_head = hidden_size // self.n_head
-        self.concat_k = P.Concat(axis=3)
-        self.concat_v = P.Concat(axis=2)
-        self.multiply_data = Tensor([
-            -10000.0,
-        ], dtype=mstype.float32)
-        self.batch_matmul = P.BatchMatMul().shard(
-            ((parallel_config.dp, parallel_config.mp, 1, 1), (parallel_config.dp, parallel_config.mp, 1, 1)))
-        self.real_div = P.RealDiv().shard(((parallel_config.dp, parallel_config.mp, 1, 1), ()))
-        self.sub = P.Sub().shard(
-            ((1,), (parallel_config.dp, 1, 1, 1)))
-        self.mul = P.Mul().shard(
-            ((parallel_config.dp, 1, 1, 1), (1,)))
-        self.add = P.TensorAdd().shard(
-            ((parallel_config.dp, 1, 1, 1), (parallel_config.dp, parallel_config.mp, 1, 1)))
-        # Normalize factor for attention, sqrt(dk) as widely used
-        self.scale_factor = Tensor(math.sqrt(self.size_per_head))
-        self.use_past = use_past
-        self.dropout = Dropout(1 - hidden_dropout_rate)
-        self.dropout.shard(((parallel_config.dp, 1, 1),))
-        self.prob_dropout = Dropout(1 - attention_dropout_rate)
-        self.prob_dropout.shard(
-            ((parallel_config.dp, parallel_config.mp, 1, 1),))
-        self.softmax = nn.Softmax()
-        self.softmax.softmax.shard(((parallel_config.dp, parallel_config.mp, 1),))
-        self.expand_dims = P.ExpandDims().shard(((parallel_config.dp, 1, 1),))
-
-        # Query
-        self.dense1 = Linear(hidden_size,
-                             hidden_size).to_float(compute_dtype)
-        self.dense1.shard(strategy_matmul=((parallel_config.dp, 1), (parallel_config.mp, 1)),
-                          strategy_bias=((parallel_config.dp, parallel_config.mp), (parallel_config.mp,)))
-        # Key
-        self.dense2 = Linear(hidden_size,
-                             hidden_size).to_float(compute_dtype)
-        self.dense2.shard(strategy_matmul=((parallel_config.dp, 1), (parallel_config.mp, 1)),
-                          strategy_bias=((parallel_config.dp, parallel_config.mp), (parallel_config.mp,)))
-
-        # Value
-        self.dense3 = Linear(hidden_size,
-                             hidden_size).to_float(compute_dtype)
-        self.dense3.shard(strategy_matmul=((parallel_config.dp, 1), (parallel_config.mp, 1)),
-                          strategy_bias=((parallel_config.dp, parallel_config.mp), (parallel_config.mp,)))
-
-
-    def construct(self, from_tensor, to_tensor, attention_mask, layer_past=None):
-        """
-        multi-head attention
-        """
-
-        from_tensor_original_shape = F.shape(from_tensor)
-        from_tensor = F.reshape(from_tensor, (-1, from_tensor_original_shape[-1]))
-
-        to_tensor_original_shape = F.shape(to_tensor)
-        to_tensor = F.reshape(to_tensor, (-1, to_tensor_original_shape[-1]))
-
-        # Self attention: query, key, value are derived from the same inputs
-        query = self.dense1(from_tensor)
-        key = self.dense2(to_tensor)
-        value = self.dense3(to_tensor)
-        # [bs, num_heads, seq_length, size_per_head]
-        query = self.transpose(
-            F.reshape(
-                query,
-                (-1, from_tensor_original_shape[1], self.n_head, self.size_per_head)),
-            (0, 2, 1, 3))
-        # [bs, num_heads, size_per_head, seq_length]
-        key = self.transpose(
-            F.reshape(
-                key, (-1, to_tensor_original_shape[1], self.n_head, self.size_per_head)),
-            (0, 2, 3, 1))
-        # [bs, num_heads, seq_length, size_per_head]
-        value = self.transpose(
-            F.reshape(
-                value,
-                (-1, to_tensor_original_shape[1], self.n_head, self.size_per_head)),
-            (0, 2, 1, 3))
-        if self.use_past:
-            past_value = layer_past[1]
-            past_key = self.transpose(layer_past[0], (0, 1, 3, 2))
-            key = self.concat_k((past_key, key))
-            value = self.concat_v(past_value, value)
-        layer_present = (key, value)
-        # attention considering attention mask
-        attention = self._attn(query, key, value, attention_mask)
-        # [bs, seq_length, embedding_size]
-        attention_merge = self.merge_heads(attention)
-        # Output
-        output = self.projection(attention_merge)
-        output = self.dropout(output)
-        return output, layer_present
-
-    def split_heads(self, x, transpose):
-        """
-        split 3d tensor to 4d and switch certain axes
-        Inputs:
-            x: input tensor
-            transpose: tuple, the transpose sequence
-        Outputs:
-            x_transpose: the 4d output
-        """
-        x_size = P.Shape()(x)
-        new_x_shape = x_size[:-1] + (self.n_head, self.size_per_head)
-        x = self.reshape(x, new_x_shape)
-        x_transpose = self.transpose(x, transpose)
-        return x_transpose
-
-    def merge_heads(self, x):
-        """
-        convert a 4d input to a 3d output
-
-        Inputs:
-            x: input tensor
-
-        Output:
-            x_merge: the 3d output
-        """
-        x = self.merger_head_transpose(
-            x, (0, 2, 1, 3))  # bs, seq_length, head, size_per_head
-        x_shape = P.Shape()(x)
-        new_shape = x_shape[:-2] + (x_shape[-2] * x_shape[-1],)
-        x_merge = self.reshape(x, new_shape)
-        return x_merge
-
-    def _attn(self, query, key, value, attention_mask):
-        """
-        Get the weighted score along the seq_length
-
-        Inputs:
-            query: the query matrix
-            key: the key matrix
-            value: the value matrix
-            attention_mask: the attention mask matrix with shape (batch_size,
-            1, seq_length, seq_length)
-        Outputs:
-            weighted_values: Tensor, the weighted sum scores
-        """
-        # Normalize query and key before MatMul, default off
-        # Attention score [bs, num_heads, seq_length, seq_length]
-        score = self.batch_matmul(query, key)
-        # Normalize after query and key MatMul
-        score = self.real_div(
-            score,
-            P.Cast()(self.scale_factor, P.DType()(score)))
-
-        ori_dtype = P.DType()(score)
-        score = P.Cast()(score, mstype.float32)
-        # Minus 10000 for the position where masked to exclude them from softmax
-        multiplu_out = self.sub(
-            P.Cast()(F.tuple_to_array((1.0,)), P.DType()(score)),
-            P.Cast()(attention_mask, P.DType()(score)))
-
-        adder = self.mul(multiplu_out, self.multiply_data)
-        attention_scores = self.add(adder, score)
-
-        shape = F.shape(attention_scores)
-        # attention probs
-        attention_probs = self.softmax(
-            F.reshape(attention_scores,
-                      (shape[0], -1, shape[-1])))
-        attention_probs = P.Cast()(attention_probs, ori_dtype)
-        attention_probs = F.reshape(attention_probs, shape)
-
-        attention_probs = self.prob_dropout(attention_probs)
-        # Weighted sum output [bs, num_heads, seq_length, size_per_head]
-        weighted_values = self.batch_matmul(attention_probs, value)
-        return weighted_values
-
-
-class TransformerEncoderLayer(Cell):
-    r"""
-    Transformer Encoder module.
-
-    Args:
-        hidden_size(int): The hidden size of the input.
-        ffn_hidden_size(int): The hidden size of bottleneck in the feedforward layer.
-        num_heads(int): The number of the heads.
-        hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1
-        attention_dropout_rate(float): The dropout rate of the attention scores. Default:0.1
-        post_layernorm_residual(bool): Do residuals adds before the layernorm. Default False.
-        hidden_act(str): The activation of the internal feedforward layer. Default 'gelu'.
-        parallel_config(TransformerParallelConfig): The parallel configure.
-    Inputs:
-        x: Tensor, shape should be [batch_size, seq_length, hidden_size]
-        input_mask: Tensor, attention mask with shape [batch_size, 1, seq_length, seq_length]
-        layer_past: the past the feature map.
-    Outputs:
-        output: Tensor, the output logit of this layer
-        layer_present: Tensor, the feature map of current layer
-
-    Supported Platforms:
-        ``Ascend`` ``GPU``
-    Examples:
-        >>> model = TransformerEncoderLayer(hidden_size=8, ffn_hidden_size=64, seq_length=16,
-        >>>                                 num_heads=2)
-        >>> encoder_input_value = Tensor(np.ones((2, 16, 8)), dtype.float32)
-        >>> encoder_input_mask = Tensor(np.ones((2, 1, 16, 16)), dtype.float16)
-        >>> model(encoder_input_value, encoder_input_value)
-    """
-
-    def __init__(self,
-                 hidden_size,
-                 ffn_hidden_size,
-                 num_heads,
-                 seq_length,
-                 attention_dropout_rate=0.1,
-                 hidden_dropout_rate=0.1,
-                 post_layernorm_residual=False,
-                 hidden_act='gelu',
-                 parallel_config=default_transformer_config):
-        super(TransformerEncoderLayer, self).__init__()
-        if num_heads % parallel_config.mp != 0:
-            raise ValueError(
-                f"num heads must be divisibled by the model parallel way {parallel_config.mp}, but found {num_heads}")
-
-        self.layernorm1 = LayerNorm((hidden_size,)).to_float(mstype.float32)
-        self.layernorm1.shard(((parallel_config.dp, 1, 1),))
-        self.layernorm2 = LayerNorm((hidden_size,)).to_float(mstype.float32)
-        self.layernorm2.shard(((parallel_config.dp, 1, 1),))
-
-        self.attention = MultiHeadAttention(hidden_size=hidden_size,
-                                            num_heads=num_heads,
-                                            hidden_dropout_rate=hidden_dropout_rate,
-                                            attention_dropout_rate=attention_dropout_rate,
-                                            parallel_config=parallel_config)
-        # Feed Forward Network, FFN
-        self.output = FeedForward(hidden_size=hidden_size,
-                                  dropout_rate=hidden_dropout_rate,
-                                  ffn_hidden_size=ffn_hidden_size,
-                                  hidden_act=hidden_act,
-                                  parallel_config=parallel_config)
-        self.post_layernorm_residual = post_layernorm_residual
-        self.add = P.TensorAdd().shard(((parallel_config.dp, 1, 1), (parallel_config.dp, 1, 1)))
-        self.dtype = mstype.float16
-
-    def construct(self, x, input_mask, layer_past=None):
-        r"""
-        The forward process of the block.
-        """
-        # [bs, seq_length, embedding_size]
-        input_x = self.layernorm1(x)
-        input_x = F.cast(input_x, self.dtype)
-        attention, layer_present = self.attention(input_x, input_x, input_mask,
-                                                  layer_past)
-        # For post-layernorm the inputs for residual path are output of self-attention and output of layernorm
-        if self.post_layernorm_residual:
-            x = self.add(input_x, attention)
-        # For pre-layernorm the inputs for residual path are output of self-attention and input of this layer
-        else:
-            x = self.add(x, attention)
-
-        output_x = self.layernorm2(x)
-        output_x = F.cast(output_x, self.dtype)
-        mlp_logit = self.output(output_x)
-        if self.post_layernorm_residual:
-            output = self.add(output_x, mlp_logit)
-        else:
-            output = self.add(x, mlp_logit)
-        return output, layer_present
-
-
-class TransformerDecoderLayer(Cell):
-    r"""
-    Transformer Decoder module.
-
-    Args:
-        hidden_size(int): The hidden size of the input.
-        ffn_hidden_size(int): The hidden size of bottleneck in the feedforward layer.
-        num_heads(int): The number of the heads.
-        hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1
-        attention_dropout_rate(float): The dropout rate of the attention scores. Default:0.1
-        post_layernorm_residual(bool): Do residuals adds before the layernorm. Default False.
-        hidden_act(str): The activation of the internal feedforward layer. Default 'gelu'.
-        parallel_config(TransformerParallelConfig): The parallel configure.
-    Inputs:
-        hidden_stats: the input tensor with shape [batch_size, seq_length, hidden_size]
-        decoder_mask: the attention mask for decoder with shape [batch_size, 1, seq_length, seq_length]
-        encoder_output: the output of the encoder with shape [batch_size, seq_length, hidden_size]
-        memory_mask: the memory mask of the cross attention with shape [batch, 1, tgt_seq_length, src_seq_length]
-         where tgt_seq_length is the length of the decoder.
-        layer_past: the past the feature map.
-    Outputs:
-        output: Tensor, the output logit of this layer. The shape is [batch, seq_length, hidden_size]
-        layer_present: Tensor, the feature map of current layer
-    Supported Platforms:
-        ``Ascend`` ``GPU``
-    Examples:
-        >>> model = TransformerDecoderLayer(hidden_size=64, ffn_hidden_size=64, num_heads=2, seq_length=10)
-        >>> encoder_input_value = Tensor(np.ones((2, 20, 64)), dtype.float32)
-        >>> decoder_input_value = Tensor(np.ones((2, 10, 64)), dtype.float32)
-        >>> decoder_input_mask = Tensor(np.ones((2, 1, 10, 10)), dtype.float16)
-        >>> memory_mask = Tensor(np.ones((2, 1, 10, 20)), dtype.float16)
-        >>> model(decoder_input_value, decoder_input_mask, encoder_input_value, memory_mask)
-    """
-
-    def __init__(self, hidden_size,
-                 ffn_hidden_size,
-                 num_heads,
-                 seq_length,
-                 attention_dropout_rate=0.1,
-                 hidden_dropout_rate=0.1,
-                 post_layernorm_residual=False,
-                 hidden_act='gelu',
-                 parallel_config=default_transformer_config):
-        super(TransformerDecoderLayer, self).__init__()
-        if num_heads % parallel_config.mp != 0:
-            raise ValueError(
-                f"num heads must be divisibled by the model parallel way {parallel_config.mp}, but found {num_heads}")
-
-        self.layernorm1 = LayerNorm((hidden_size,), parallel_config.dp).to_float(mstype.float32)
-        self.layernorm1.shard(((parallel_config.dp, 1, 1),))
-        self.layernorm2 = LayerNorm((hidden_size,), parallel_config.dp).to_float(mstype.float32)
-        self.layernorm2.shard(((parallel_config.dp, 1, 1),))
-
-        self.attention = MultiHeadAttention(hidden_size=hidden_size,
-                                            num_heads=num_heads,
-                                            hidden_dropout_rate=hidden_dropout_rate,
-                                            attention_dropout_rate=attention_dropout_rate,
-                                            parallel_config=parallel_config)
-        # Cross attention with the output of encoder as memory tensor
-        self.cross_attention = MultiHeadAttention(hidden_size=hidden_size,
-                                                  num_heads=num_heads,
-                                                  hidden_dropout_rate=hidden_dropout_rate,
-                                                  attention_dropout_rate=attention_dropout_rate,
-                                                  parallel_config=parallel_config)
-        self.cross_attention_layernorm = LayerNorm((hidden_size,), parallel_config.dp).to_float(mstype.float32)
-        self.cross_attention_layernorm.shard(((parallel_config.dp, 1, 1),))
-
-        # Feed Forward Network, FFN
-        self.output = FeedForward(hidden_size=hidden_size,
-                                  dropout_rate=hidden_dropout_rate,
-                                  ffn_hidden_size=ffn_hidden_size,
-                                  hidden_act=hidden_act,
-                                  parallel_config=parallel_config)
-        self.post_layernorm_residual = post_layernorm_residual
-        self.add = P.TensorAdd().shard(((parallel_config.dp, 1, 1), (parallel_config.dp, 1, 1)))
-        self.dtype = mstype.float16
-
-    def construct(self, hidden_stats,
-                  decoder_mask,
-                  encoder_output,
-                  memory_mask,
-                  layer_past=None):
-        r"""
-        The forward process of the block.
-        """
-        # [bs, seq_length, embedding_size]
-        input_x = self.layernorm1(hidden_stats)
-        input_x = F.cast(input_x, self.dtype)
-        attention, layer_present = self.attention(input_x, input_x, decoder_mask, layer_past)
-        # For post-layernorm the inputs for residual path are output of self-attention and output of layernorm
-        if self.post_layernorm_residual:
-            x = self.add(input_x, attention)
-        # For pre-layernorm the inputs for residual path are output of self-attention and input of this layer
-        else:
-            x = self.add(hidden_stats, attention)
-
-        middle_output = self.cross_attention_layernorm(x)
-        middle_output = F.cast(middle_output, self.dtype)
-        cross_attn_output, layer_present = self.cross_attention(middle_output, encoder_output,
-                                                                memory_mask, layer_past)
-        if self.post_layernorm_residual:
-            x = self.add(middle_output, cross_attn_output)
-        else:
-            x = self.add(x, cross_attn_output)
-
-        output_x = self.layernorm2(x)
-        output_x = F.cast(output_x, self.dtype)
-        mlp_logit = self.output(output_x)
-        if self.post_layernorm_residual:
-            output = self.add(output_x, mlp_logit)
-        else:
-            output = self.add(x, mlp_logit)
-        return output, layer_present
-
-
-def set_parallel_configure_for_layer(network, layer_id, offset, layers, parallel_config):
-    # Used for the pipeline's stages setting
-    network.pipeline_stage = (layer_id + offset) // int(layers / parallel_config.pp)
-    # Used for optimizer's fusion tag
-    network.set_comm_fusion(int((layer_id + offset) / parallel_config.gradient_aggregation_group))
-    # Used for enabling recomputation of the block
-    if parallel_config.recompute:
-        network.recompute()
-
-
-class TransformerEncoder(Cell):
-    r"""
-    Transformer Encoder module with multi-layer.
-
-    Args:
-        num_layers(int): The layers of the `TransformerEncoderLayer`
-        hidden_size(int): The hidden size of the input.
-        ffn_hidden_size(int): The hidden size of bottleneck in the feedforward layer.
-        seq_length(int): The seq_length of the input tensor.
-        num_heads(int): The number of the heads.
-        hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1
-        attention_dropout_rate(float): The dropout rate of the attention scores. Default:0.1
-        post_layernorm_residual(bool): Do residuals adds before the layernorm. Default False.
-        hidden_act(str): The activation of the internal feedforward layer. Default 'gelu'.
-        lambda_func: a function can specific the fusion index, pipeline stages and recompute attribute.
-            Default: set_parallel_configure_for_layer
-        offset(int): The initial layer index for the `decoder`. Used for setting the fusion id and stage id, to not
-            overlap with the encoder layer.
-        parallel_config(TransformerParallelConfig): The parallel configure.
-    Inputs:
-        hidden_states: Tensor, shape should be [batch_size, seq_length, hidden_size]
-        attention_mask: Tensor, attention mask with shape [batch_size, 1, seq_length, seq_length]
-        layer_past: the past the feature map.
-    Outputs:
-        output: Tensor, the output logit of this layer
-        layer_present: Tensor, the feature map of current layer
-
-    Supported Platforms:
-        ``Ascend`` ``GPU``
-
-    Examples:
-        >>> model = TransformerEncoder(num_layers=2, hidden_size=8, ffn_hidden_size=64, seq_length=16,
-        >>>                       num_heads=2)
-        >>> encoder_input_value = Tensor(np.ones((2, 16, 8)), dtype.float32)
-        >>> encoder_input_mask = Tensor(np.ones((2, 1, 16, 16)), dtype.float16)
-        >>> model(encoder_input_value, encoder_input_mask)
-    """
-
-    def __init__(self,
-                 num_layers,
-                 hidden_size,
-                 ffn_hidden_size,
-                 seq_length,
-                 num_heads,
-                 attention_dropout_rate=0.1,
-                 hidden_dropout_rate=0.1,
-                 hidden_act='gelu',
-                 post_layernorm_residual=False,
-                 lambda_func=set_parallel_configure_for_layer,
-                 offset=0,
-                 parallel_config=default_transformer_config):
-        super(TransformerEncoder, self).__init__()
-        self.num_layers = num_layers
-        self.blocks = nn.CellList()
-        for i in range(num_layers):
-            block = TransformerEncoderLayer(hidden_size=hidden_size,
-                                            ffn_hidden_size=ffn_hidden_size,
-                                            seq_length=seq_length,
-                                            attention_dropout_rate=attention_dropout_rate,
-                                            hidden_dropout_rate=hidden_dropout_rate,
-                                            num_heads=num_heads,
-                                            hidden_act=hidden_act,
-                                            post_layernorm_residual=post_layernorm_residual,
-                                            parallel_config=parallel_config)
-            lambda_func(block, layer_id=i, offset=offset,
-                        layers=num_layers, parallel_config=parallel_config)
-            self.blocks.append(block)
-
-    def construct(self, hidden_states, attention_mask, layer_past=None):
-        r"""
-        The forward process of the block.
-        """
-        present_layer = ()
-        for i in range(self.num_layers):
-            hidden_states, present = self.blocks[i](hidden_states,
-                                                    attention_mask,
-                                                    layer_past)
-            present_layer = present_layer + (present,)
-
-        return hidden_states, present_layer
-
-
-class TransformerDecoder(Cell):
-    r"""
-    Transformer Decoder module with multi-layer.
-
-    Args:
-        num_layers(int): The layers of the `TransformerEncoderLayer`
-        hidden_size(int): The hidden size of the input.
-        ffn_hidden_size(int): The hidden size of bottleneck in the feedforward layer.
-        seq_length(int): The seq_length of the input tensor.
-        num_heads(int): The number of the heads.
-        hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1
-        attention_dropout_rate(float): The dropout rate of the attention scores. Default:0.1
-        post_layernorm_residual(bool): Do residuals adds before the layernorm. Default False.
-        hidden_act(str): The activation of the internal feedforward layer. Default 'gelu'.
-        offset(int): The initial layer index for the `decoder`. Used for setting the fusion id and stage id, to not
-            overlap with the encoder layer.
-        lambda_func: a function can specific the fusion index, pipeline stages and recompute attribute.
-            Default: set_parallel_configure_for_layer
-        parallel_config(TransformerParallelConfig): The parallel configure.
-    Inputs:
-        hidden_stats: the input tensor with shape [batch_size, seq_length, hidden_size]
-        attention_mask: the attention mask for decoder with shape [batch_size, 1, seq_length, seq_length]
-        encoder_output: the output of the encoder with shape [batch_size, seq_length, hidden_size]
-        memory_mask: the memory mask of the cross attention with shape [batch, 1, tgt_seq_length, src_seq_length]
-         where tgt_seq_length is the length of the decoder. the output of the encoder with shape
-         [batch_size, seq_length, hidden_size],
-        layer_past: the past the feature map.
-    Outputs:
-        output: Tensor, the output logit of this layer
-        layer_present: Tensor, the feature map of current layer
-    Supported Platforms:
-        ``Ascend`` ``GPU``
-    Examples:
-        >>> model = TransformerDecoder(num_layers=1, hidden_size=64, ffn_hidden_size=64, num_heads=2, seq_length=10)
-        >>> encoder_input_value = Tensor(np.ones((2, 20, 64)), dtype.float32)
-        >>> decoder_input_value = Tensor(np.ones((2, 10, 64)), dtype.float32)
-        >>> decoder_input_mask = Tensor(np.ones((2, 1, 10, 10)), dtype.float16)
-        >>> memory_mask = Tensor(np.ones((2, 1, 10, 20)), dtype.float16)
-        >>> model(decoder_input_value, decoder_input_mask, encoder_input_value, memory_mask)
-    """
-
-    def __init__(self,
-                 num_layers,
-                 hidden_size,
-                 ffn_hidden_size,
-                 seq_length,
-                 num_heads,
-                 attention_dropout_rate=0.1,
-                 hidden_dropout_rate=0.1,
-                 post_layernorm_residual=False,
-                 hidden_act='gelu',
-                 lambda_func=set_parallel_configure_for_layer,
-                 offset=0,
-                 parallel_config=default_transformer_config):
-        super(TransformerDecoder, self).__init__()
-        self.num_layers = num_layers
-        self.blocks = nn.CellList()
-        for i in range(num_layers):
-            block = TransformerDecoderLayer(hidden_size=hidden_size,
-                                            ffn_hidden_size=ffn_hidden_size,
-                                            seq_length=seq_length,
-                                            attention_dropout_rate=attention_dropout_rate,
-                                            hidden_dropout_rate=hidden_dropout_rate,
-                                            num_heads=num_heads,
-                                            hidden_act=hidden_act,
-                                            post_layernorm_residual=post_layernorm_residual,
-                                            parallel_config=parallel_config)
-
-            # Used for the pipeline's stages setting
-            lambda_func(block, layer_id=i, offset=offset,
-                        layers=num_layers + offset, parallel_config=parallel_config)
-            self.blocks.append(block)
-
-    def construct(self, hidden_states, attention_mask, encoder_output, memory_mask, layer_past=None):
-        r"""
-        The forward process of the block.
-        """
-        present_layer = ()
-        # Loop through each self-attention layer
-        for i in range(self.num_layers):
-            hidden_states, present = self.blocks[i](hidden_states,
-                                                    attention_mask,
-                                                    encoder_output,
-                                                    memory_mask,
-                                                    layer_past)
-            present_layer = present_layer + (present,)
-
-        return hidden_states, present_layer
-
-
-class Transformer(Cell):
-    r"""
-    Transformer Decoder module.
-
-    .. warning::
-        This is an experimental interface that is subject to change and/or deletion.
-
-    Args:
-        encoder_layers(int): The layers of the `TransformerEncoderLayer`
-        decoder_layers(int): The layers of the `TransformerDecoderLayer`
-        hidden_size(int): The hidden size of the input.
-        ffn_hidden_size(int): The hidden size of bottleneck in the feedforward layer.
-        src_seq_length(int): The seq_length of the encoder's input tensor.
-        tgt_seq_length(int): The seq_length of the decoder's input tensor.
-        num_heads(int): The number of the heads.
-        hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1
-        attention_dropout_rate(float): The dropout rate of the attention scores. Default:0.1
-        post_layernorm_residual(bool): Do residuals adds before the layernorm. Default False.
-        hidden_act(str): The activation of the internal feedforward layer. Default 'gelu'.
-        lambda_func: a function can specific the fusion index, pipeline stages and recompute attribute.
-            Default: set_parallel_configure_for_layer
-        parallel_config(TransformerParallelConfig): The parallel configure. Default 'default_transformer_config'
-    Inputs:
-        encoder_inputs: the input tensor with shape [batch_size, seq_length, hidden_size]
-        encoder_masks: the attention mask for decoder with shape [batch_size, 1, seq_length, seq_length]
-        decoder_inputs: the output of the encoder with shape [batch_size, seq_length, hidden_size], this can be none if
-            the decoder layer is 0.
-        decoder_masks: the attention mask for decoder with shape [batch_size, 1, seq_length, seq_length]
-        memory_mask: the memory mask of the cross attention with shape [batch, 1, tgt_seq_length, src_seq_length]
-         where tgt_seq_length is the length of the decoder. the output of the encoder with shape [batch_size,
-         seq_length, hidden_size], this can be none if the decoder layer is 0.
-    Outputs:
-        output: Tensor, the output logit of this layer
-        layer_present: Tensor, the feature map of current layer
-    Supported Platforms:
-        ``Ascend`` ``GPU``
-    Examples:
-        >>> model = Transformer(encoder_layers=1, decoder_layers=2, hidden_size=64, ffn_hidden_size=64, \
-        >>>      src_seq_length=20, tgt_seq_length=20)
-        >>> encoder_input_value = Tensor(np.ones((2, 20, 64)), dtype.float32)
-        >>> encoder_input_mask = Tensor(np.ones((2, 1, 20, 20)), dtype.float16)
-        >>> decoder_input_value = Tensor(np.ones((2, 10, 64)), dtype.float32)
-        >>> decoder_input_mask = Tensor(np.ones((2, 1, 10, 10)), dtype.float16)
-        >>> memory_mask = Tensor(np.ones((2, 1, 10, 20)), dtype.float16)
-        >>> model(encoder_input_value, encoder_input_mask, decoder_input_value, decoder_input_mask, \
-        >>>              memory_mask)
-    """
-
-    def __init__(self,
-                 hidden_size,
-                 ffn_hidden_size,
-                 src_seq_length,
-                 tgt_seq_length,
-                 encoder_layers=3,
-                 decoder_layers=3,
-                 num_heads=2,
-                 attention_dropout_rate=0.1,
-                 hidden_dropout_rate=0.1,
-                 hidden_act='gelu',
-                 post_layernorm_residual=False,
-                 lambda_func=set_parallel_configure_for_layer,
-                 parallel_config=default_transformer_config):
-        super(Transformer, self).__init__()
-
-        # The shard setting of Transformer is set within the class StackedTransformer
-        if encoder_layers > 0:
-            self.encoder = TransformerEncoder(num_layers=encoder_layers,
-                                              hidden_size=hidden_size,
-                                              ffn_hidden_size=ffn_hidden_size,
-                                              num_heads=num_heads,
-                                              seq_length=src_seq_length,
-                                              attention_dropout_rate=attention_dropout_rate,
-                                              hidden_dropout_rate=hidden_dropout_rate,
-                                              hidden_act=hidden_act,
-                                              post_layernorm_residual=post_layernorm_residual,
-                                              lambda_func=lambda_func,
-                                              parallel_config=parallel_config)
-        else:
-            self.encoder = None
-
-        # Offset is needed as the encoder has consumed some flags.
-        # so the decoder need to increase the flags based on the encoder layer
-        if decoder_layers > 0:
-            self.decoder = TransformerDecoder(num_layers=decoder_layers,
-                                              hidden_size=hidden_size,
-                                              ffn_hidden_size=ffn_hidden_size,
-                                              parallel_config=parallel_config,
-                                              num_heads=num_heads,
-                                              seq_length=tgt_seq_length,
-                                              attention_dropout_rate=attention_dropout_rate,
-                                              hidden_dropout_rate=hidden_dropout_rate,
-                                              hidden_act=hidden_act,
-                                              post_layernorm_residual=post_layernorm_residual,
-                                              lambda_func=lambda_func,
-                                              offset=encoder_layers)
-        else:
-            self.decoder = None
-
-    def construct(self, encoder_inputs,
-                  encoder_masks,
-                  decoder_inputs=None,
-                  decoder_masks=None,
-                  memory_mask=None):
-
-        encoder_output = None
-        output = None
-        encoder_layer_present = None
-        decoder_layer_present = None
-        if self.encoder is not None:
-            encoder_output, encoder_layer_present = self.encoder(encoder_inputs, encoder_masks)
-            output = encoder_output
-
-        if self.decoder is not None:
-            # decoder mask can be created outside of the model
-            decoder_output, decoder_layer_present = self.decoder(decoder_inputs,
-                                                                 decoder_masks,
-                                                                 encoder_output,
-                                                                 memory_mask)
-            output = decoder_output
-        return output, encoder_layer_present, decoder_layer_present
diff --git a/mindspore/nn/probability/bijector/bijector.py b/mindspore/nn/probability/bijector/bijector.py
index 2adc02068b6..6a89d338cf1 100644
--- a/mindspore/nn/probability/bijector/bijector.py
+++ b/mindspore/nn/probability/bijector/bijector.py
@@ -147,6 +147,7 @@ class Bijector(Cell):
         return (shape_tensor + dist_shape_tensor).shape
 
     def shape_mapping(self, shape):
+        """Map shape."""
         return self._shape_mapping(shape)
 
     def _add_parameter(self, value, name):
@@ -161,7 +162,7 @@ class Bijector(Cell):
             self.common_dtype = None
         # cast value to a tensor if it is not None
         if isinstance(value, bool) or value is None:
-            raise TypeError(f"{name} cannot be type {type(value)}")
+            raise TypeError("{} cannot be type {}".format(name, type(value)))
         value_t = Tensor(value)
         # if the bijector's dtype is not specified
         if self.dtype is None:
diff --git a/mindspore/nn/probability/bijector/exp.py b/mindspore/nn/probability/bijector/exp.py
index 404366ac7f1..b8984588368 100644
--- a/mindspore/nn/probability/bijector/exp.py
+++ b/mindspore/nn/probability/bijector/exp.py
@@ -57,8 +57,9 @@ class Exp(PowerTransform):
         super(Exp, self).__init__(name=name)
 
     def extend_repr(self):
+        """Display instance object as string."""
         if self.is_scalar_batch:
             str_info = 'exp'
         else:
-            str_info = f'batch_shape = {self.batch_shape}'
+            str_info = 'batch_shape = {}'.format(self.batch_shape)
         return str_info
diff --git a/mindspore/nn/probability/bijector/gumbel_cdf.py b/mindspore/nn/probability/bijector/gumbel_cdf.py
index fd66ce2787a..9030cdd3aee 100644
--- a/mindspore/nn/probability/bijector/gumbel_cdf.py
+++ b/mindspore/nn/probability/bijector/gumbel_cdf.py
@@ -28,7 +28,7 @@ class GumbelCDF(Bijector):
         Y = \exp(-\exp(\frac{-(X - loc)}{scale}))
 
     Args:
-        loc (float, list, numpy.ndarray, Tensor): The location. Default: 0..
+        loc (float, list, numpy.ndarray, Tensor): The location. Default: 0.0.
         scale (float, list, numpy.ndarray, Tensor): The scale. Default: 1.0.
         name (str): The name of the Bijector. Default: 'GumbelCDF'.
 
@@ -101,10 +101,11 @@ class GumbelCDF(Bijector):
         return self._scale
 
     def extend_repr(self):
+        """Display instance object as string."""
         if self.is_scalar_batch:
-            str_info = f'loc = {self.loc}, scale = {self.scale}'
+            str_info = 'loc = {}, scale = {}'.format(self.loc, self.scale)
         else:
-            str_info = f'batch_shape = {self.batch_shape}'
+            str_info = 'batch_shape = {}'.format(self.batch_shape)
         return str_info
 
     def _forward(self, x):
@@ -112,9 +113,12 @@ class GumbelCDF(Bijector):
         loc_local = self.cast_param_by_value(x, self.loc)
         scale_local = self.cast_param_by_value(x, self.scale)
         z = (x - loc_local) / scale_local
+        # pylint: disable=E1130
         return self.exp(-self.exp(-z))
 
     def _inverse(self, y):
+        # pylint false positive
+        # pylint: disable=E1130
         y = self._check_value_dtype(y)
         loc_local = self.cast_param_by_value(y, self.loc)
         scale_local = self.cast_param_by_value(y, self.scale)
diff --git a/mindspore/nn/probability/bijector/invert.py b/mindspore/nn/probability/bijector/invert.py
index 55e43a40abb..725f7dedf90 100644
--- a/mindspore/nn/probability/bijector/invert.py
+++ b/mindspore/nn/probability/bijector/invert.py
@@ -23,7 +23,8 @@ class Invert(Bijector):
 
     Args:
         bijector (Bijector): Base Bijector.
-        name (str): The name of the Bijector. Default: 'Invert' + bijector.name.
+        name (str): The name of the Bijector. Default: "". When name is set to "", it is actually
+            'Invert' + bijector.name.
 
     Supported Platforms:
         ``Ascend`` ``GPU``
@@ -67,16 +68,29 @@ class Invert(Bijector):
 
     @property
     def bijector(self):
+        """Return base bijector."""
         return self._bijector
 
     def inverse(self, y):
+        """
+        Forward transformation: transform the input value to another distribution.
+        """
         return self.bijector("forward", y)
 
     def forward(self, x):
+        """
+        Inverse transformation: transform the input value back to the original distribution.
+        """
         return self.bijector("inverse", x)
 
     def inverse_log_jacobian(self, y):
+        """
+        Logarithm of the derivative of the forward transformation.
+        """
         return self.bijector("forward_log_jacobian", y)
 
     def forward_log_jacobian(self, x):
+        """
+        Logarithm of the derivative of the inverse transformation.
+        """
         return self.bijector("inverse_log_jacobian", x)
diff --git a/mindspore/nn/probability/bijector/power_transform.py b/mindspore/nn/probability/bijector/power_transform.py
index 91fc2ed9fa1..c7a4465a687 100644
--- a/mindspore/nn/probability/bijector/power_transform.py
+++ b/mindspore/nn/probability/bijector/power_transform.py
@@ -95,13 +95,13 @@ class PowerTransform(Bijector):
         return self._power
 
     def extend_repr(self):
+        """Display instance object as string."""
         if self.is_scalar_batch:
-            str_info = f'power = {self.power}'
+            str_info = 'power = {}'.format(self.power)
         else:
-            str_info = f'batch_shape = {self.batch_shape}'
+            str_info = 'batch_shape = {}'.format(self.batch_shape)
         return str_info
 
-
     def _forward(self, x):
         """
         Evaluate the forward mapping.
diff --git a/mindspore/nn/probability/bijector/scalar_affine.py b/mindspore/nn/probability/bijector/scalar_affine.py
index 0f183521c98..45f7c4780eb 100644
--- a/mindspore/nn/probability/bijector/scalar_affine.py
+++ b/mindspore/nn/probability/bijector/scalar_affine.py
@@ -101,10 +101,11 @@ class ScalarAffine(Bijector):
         return self._shift
 
     def extend_repr(self):
+        """Display instance object as string."""
         if self.is_scalar_batch:
-            str_info = f'scale = {self.scale}, shift = {self.shift}'
+            str_info = 'scale = {}, shift = {}'.format(self.scale, self.shift)
         else:
-            str_info = f'batch_shape = {self.batch_shape}'
+            str_info = 'batch_shape = {}'.format(self.batch_shape)
         return str_info
 
     def _forward(self, x):
diff --git a/mindspore/nn/probability/bijector/softplus.py b/mindspore/nn/probability/bijector/softplus.py
index 7955b3a849c..6b4c55e4697 100644
--- a/mindspore/nn/probability/bijector/softplus.py
+++ b/mindspore/nn/probability/bijector/softplus.py
@@ -122,6 +122,7 @@ class Softplus(Bijector):
         ones = self.fill(self.dtypeop(x), self.shape(x), 1.0)
         too_small_or_too_large = self.logicalor(too_small, too_large)
         x = self.select(too_small_or_too_large, ones, x)
+        # pylint: disable=E1130
         y = x + self.log(self.abs(self.expm1(-x)))
         return self.select(too_small, too_small_value, self.select(too_large, too_large_value, y))
 
@@ -130,10 +131,11 @@ class Softplus(Bijector):
         return self._sharpness
 
     def extend_repr(self):
+        """Display instance object as string."""
         if self.is_scalar_batch:
-            str_info = f'sharpness = {self.sharpness}'
+            str_info = 'sharpness = {}'.format(self.sharpness)
         else:
-            str_info = f'batch_shape = {self.batch_shape}'
+            str_info = 'batch_shape = {}'.format(self.batch_shape)
         return str_info
 
     def _forward(self, x):
diff --git a/mindspore/nn/probability/bnn_layers/conv_variational.py b/mindspore/nn/probability/bnn_layers/conv_variational.py
index c7d4a117f8e..131891f04bc 100644
--- a/mindspore/nn/probability/bnn_layers/conv_variational.py
+++ b/mindspore/nn/probability/bnn_layers/conv_variational.py
@@ -72,9 +72,7 @@ class _ConvVariational(_Conv):
         self.group = group
         self.has_bias = has_bias
 
-        # distribution trainable parameters
-        self.shape = [self.out_channels,
-                      self.in_channels // self.group, *self.kernel_size]
+        self.shape = [self.out_channels, self.in_channels // self.group, *self.kernel_size]
 
         self.weight.requires_grad = False
         self.weight_prior = check_prior(weight_prior_fn, "weight_prior_fn")
@@ -108,6 +106,7 @@ class _ConvVariational(_Conv):
         return outputs
 
     def extend_repr(self):
+        """Display instance object as string."""
         s = 'in_channels={}, out_channels={}, kernel_size={}, stride={}, pad_mode={}, ' \
             'padding={}, dilation={}, group={}, weight_mean={}, weight_std={}, has_bias={}' \
             .format(self.in_channels, self.out_channels, self.kernel_size, self.stride, self.pad_mode, self.padding,
@@ -135,6 +134,7 @@ class _ConvVariational(_Conv):
         return kl_loss
 
     def apply_variational_bias(self, inputs):
+        """Calculate bias."""
         bias_posterior_tensor = self.bias_posterior("sample")
         return self.bias_add(inputs, bias_posterior_tensor)
 
@@ -261,6 +261,7 @@ class ConvReparam(_ConvVariational):
         )
 
     def apply_variational_weight(self, inputs):
+        """Calculate weight."""
         weight_posterior_tensor = self.weight_posterior("sample")
         outputs = self.conv2d(inputs, weight_posterior_tensor)
         return outputs
diff --git a/mindspore/nn/probability/bnn_layers/dense_variational.py b/mindspore/nn/probability/bnn_layers/dense_variational.py
index 22041bdbf5b..7dc2f953bde 100644
--- a/mindspore/nn/probability/bnn_layers/dense_variational.py
+++ b/mindspore/nn/probability/bnn_layers/dense_variational.py
@@ -78,6 +78,7 @@ class _DenseVariational(Cell):
         return outputs
 
     def extend_repr(self):
+        """Display instance object as string."""
         s = 'in_channels={}, out_channels={}, weight_mean={}, weight_std={}, has_bias={}' \
             .format(self.in_channels, self.out_channels, self.weight_posterior.mean,
                     self.weight_posterior.untransformed_std, self.has_bias)
@@ -89,6 +90,7 @@ class _DenseVariational(Cell):
         return s
 
     def apply_variational_bias(self, inputs):
+        """Calculate bias."""
         bias_posterior_tensor = self.bias_posterior("sample")
         return self.bias_add(inputs, bias_posterior_tensor)
 
@@ -196,6 +198,7 @@ class DenseReparam(_DenseVariational):
         )
 
     def apply_variational_weight(self, inputs):
+        """Calculate weight."""
         weight_posterior_tensor = self.weight_posterior("sample")
         outputs = self.matmul(inputs, weight_posterior_tensor)
         return outputs
@@ -292,6 +295,7 @@ class DenseLocalReparam(_DenseVariational):
         self.normal = Normal()
 
     def apply_variational_weight(self, inputs):
+        """Calculate weight."""
         mean = self.matmul(inputs, self.weight_posterior("mean"))
         std = self.sqrt(self.matmul(self.square(inputs), self.square(self.weight_posterior("sd"))))
         weight_posterior_affine_tensor = self.normal("sample", mean=mean, sd=std)
diff --git a/mindspore/nn/probability/distribution/bernoulli.py b/mindspore/nn/probability/distribution/bernoulli.py
index edf15e53cca..210fd18cc0a 100644
--- a/mindspore/nn/probability/distribution/bernoulli.py
+++ b/mindspore/nn/probability/distribution/bernoulli.py
@@ -27,7 +27,7 @@ class Bernoulli(Distribution):
     Bernoulli Distribution.
 
     Args:
-        probs (float, list, numpy.ndarray, Tensor): The probability of that the outcome is 1.
+        probs (float, list, numpy.ndarray, Tensor): The probability of that the outcome is 1. Default: None.
         seed (int): The seed used in sampling. The global seed is used if it is None. Default: None.
         dtype (mindspore.dtype): The type of the event samples. Default: mstype.int32.
         name (str): The name of the distribution. Default: 'Bernoulli'.
@@ -153,10 +153,11 @@ class Bernoulli(Distribution):
         self.uniform = C.uniform
 
     def extend_repr(self):
+        """Display instance object as string."""
         if self.is_scalar_batch:
-            s = f'probs = {self.probs}'
+            s = 'probs = {}'.format(self.probs)
         else:
-            s = f'batch_shape = {self._broadcast_shape}'
+            s = 'batch_shape = {}'.format(self._broadcast_shape)
         return s
 
     @property
diff --git a/mindspore/nn/probability/distribution/beta.py b/mindspore/nn/probability/distribution/beta.py
index 52ba0f2b464..146bee90674 100644
--- a/mindspore/nn/probability/distribution/beta.py
+++ b/mindspore/nn/probability/distribution/beta.py
@@ -181,10 +181,11 @@ class Beta(Distribution):
         self.lbeta = nn.LBeta()
 
     def extend_repr(self):
+        """Display instance object as string."""
         if self.is_scalar_batch:
-            s = f'concentration1 = {self._concentration1}, concentration0 = {self._concentration0}'
+            s = 'concentration1 = {}, concentration0 = {}'.format(self._concentration1, self._concentration0)
         else:
-            s = f'batch_shape = {self._broadcast_shape}'
+            s = 'batch_shape = {}'.format(self._broadcast_shape)
         return s
 
     @property
diff --git a/mindspore/nn/probability/distribution/categorical.py b/mindspore/nn/probability/distribution/categorical.py
index 077cfcec0be..63db680628b 100644
--- a/mindspore/nn/probability/distribution/categorical.py
+++ b/mindspore/nn/probability/distribution/categorical.py
@@ -171,10 +171,11 @@ class Categorical(Distribution):
         return self._probs
 
     def extend_repr(self):
+        """Display instance object as string."""
         if self.is_scalar_batch:
-            s = f'probs = {self.probs}'
+            s = 'probs = {}'.format(self.probs)
         else:
-            s = f'batch_shape = {self._broadcast_shape}'
+            s = 'batch_shape = {}'.format(self._broadcast_shape)
         return s
 
     def _get_dist_type(self):
diff --git a/mindspore/nn/probability/distribution/cauchy.py b/mindspore/nn/probability/distribution/cauchy.py
index 150034b2e47..4b13cc737ae 100644
--- a/mindspore/nn/probability/distribution/cauchy.py
+++ b/mindspore/nn/probability/distribution/cauchy.py
@@ -173,10 +173,11 @@ class Cauchy(Distribution):
 
 
     def extend_repr(self):
+        """Display instance object as string."""
         if self.is_scalar_batch:
-            str_info = f'location = {self._loc}, scale = {self._scale}'
+            str_info = 'location = {}, scale = {}'.format(self._loc, self._scale)
         else:
-            str_info = f'batch_shape = {self._broadcast_shape}'
+            str_info = 'batch_shape = {}'.format(self._broadcast_shape)
         return str_info
 
     @property
@@ -249,6 +250,7 @@ class Cauchy(Distribution):
         value = self.cast(value, self.dtype)
         loc, scale = self._check_param_type(loc, scale)
         z = (value - loc) / scale
+        # pylint: disable=E1130
         log_unnormalized_prob = - self.log1p(self.sq(z))
         log_normalization = self.log(np.pi * scale)
         return log_unnormalized_prob - log_normalization
diff --git a/mindspore/nn/probability/distribution/exponential.py b/mindspore/nn/probability/distribution/exponential.py
index f5e90d7d99b..4bb5bacf5c6 100644
--- a/mindspore/nn/probability/distribution/exponential.py
+++ b/mindspore/nn/probability/distribution/exponential.py
@@ -28,7 +28,7 @@ class Exponential(Distribution):
     Example class: Exponential Distribution.
 
     Args:
-        rate (float, list, numpy.ndarray, Tensor): The inverse scale.
+        rate (float, list, numpy.ndarray, Tensor): The inverse scale. Default: None.
         seed (int): The seed used in sampling. The global seed is used if it is None. Default: None.
         dtype (mindspore.dtype): The type of the event samples. Default: mstype.float32.
         name (str): The name of the distribution. Default: 'Exponential'.
@@ -156,10 +156,11 @@ class Exponential(Distribution):
         self.uniform = C.uniform
 
     def extend_repr(self):
+        """Display instance object as string."""
         if self.is_scalar_batch:
-            s = f'rate = {self.rate}'
+            s = 'rate = {}'.format(self.rate)
         else:
-            s = f'batch_shape = {self._broadcast_shape}'
+            s = 'batch_shape = {}'.format(self._broadcast_shape)
         return s
 
     @property
diff --git a/mindspore/nn/probability/distribution/gamma.py b/mindspore/nn/probability/distribution/gamma.py
index c0620746b6f..9d64209c020 100644
--- a/mindspore/nn/probability/distribution/gamma.py
+++ b/mindspore/nn/probability/distribution/gamma.py
@@ -180,10 +180,11 @@ class Gamma(Distribution):
         self.igamma = nn.IGamma()
 
     def extend_repr(self):
+        """Display instance object as string."""
         if self.is_scalar_batch:
-            s = f'concentration = {self._concentration}, rate = {self._rate}'
+            s = 'concentration = {}, rate = {}'.format(self._concentration, self._rate)
         else:
-            s = f'batch_shape = {self._broadcast_shape}'
+            s = 'batch_shape = {}'.format(self._broadcast_shape)
         return s
 
     @property
diff --git a/mindspore/nn/probability/distribution/geometric.py b/mindspore/nn/probability/distribution/geometric.py
index 80d36621ac5..be27a28ca2a 100644
--- a/mindspore/nn/probability/distribution/geometric.py
+++ b/mindspore/nn/probability/distribution/geometric.py
@@ -165,10 +165,11 @@ class Geometric(Distribution):
         self.uniform = C.uniform
 
     def extend_repr(self):
+        """Display instance object as string."""
         if not self.is_scalar_batch:
-            s = f'batch_shape = {self._broadcast_shape}'
+            s = 'batch_shape = {}'.format(self._broadcast_shape)
         else:
-            s = f'probs = {self.probs}'
+            s = 'probs = {}'.format(self.probs)
         return s
 
     @property
diff --git a/mindspore/nn/probability/distribution/gumbel.py b/mindspore/nn/probability/distribution/gumbel.py
index 337a6d3156d..c3da61eda32 100644
--- a/mindspore/nn/probability/distribution/gumbel.py
+++ b/mindspore/nn/probability/distribution/gumbel.py
@@ -112,10 +112,11 @@ class Gumbel(TransformedDistribution):
         return self._scale
 
     def extend_repr(self):
+        """Display instance object as string."""
         if self.is_scalar_batch:
-            str_info = f'loc = {self._loc}, scale = {self._scale}'
+            str_info = 'loc = {}, scale = {}'.format(self._loc, self._scale)
         else:
-            str_info = f'batch_shape = {self._broadcast_shape}'
+            str_info = 'batch_shape = {}'.format(self._broadcast_shape)
         return str_info
 
     def _get_dist_type(self):
diff --git a/mindspore/nn/probability/distribution/log_normal.py b/mindspore/nn/probability/distribution/log_normal.py
index 12eaa368d97..ece47ea7734 100644
--- a/mindspore/nn/probability/distribution/log_normal.py
+++ b/mindspore/nn/probability/distribution/log_normal.py
@@ -129,10 +129,11 @@ class LogNormal(msd.TransformedDistribution):
         return loc, scale
 
     def extend_repr(self):
+        """Display instance object as string."""
         if self.is_scalar_batch:
-            s = f'loc = {self.loc}, scale = {self.scale}'
+            s = 'loc = {}, scale = {}'.format(self.loc, self.scale)
         else:
-            s = f'batch_shape = {self.broadcast_shape}'
+            s = 'batch_shape = {}'.format(self.broadcast_shape)
         return s
 
     def _mean(self, loc=None, scale=None):
diff --git a/mindspore/nn/probability/distribution/logistic.py b/mindspore/nn/probability/distribution/logistic.py
index 5b1a72bb783..6a1b77ab31f 100644
--- a/mindspore/nn/probability/distribution/logistic.py
+++ b/mindspore/nn/probability/distribution/logistic.py
@@ -173,10 +173,11 @@ class Logistic(Distribution):
         return self.select(too_small, too_small_value, self.select(too_large, too_large_value, y))
 
     def extend_repr(self):
+        """Display instance object as string."""
         if self.is_scalar_batch:
-            s = f'location = {self._loc}, scale = {self._scale}'
+            s = 'location = {}, scale = {}'.format(self._loc, self._scale)
         else:
-            s = f'batch_shape = {self._broadcast_shape}'
+            s = 'batch_shape = {}'.format(self._broadcast_shape)
         return s
 
     @property
@@ -291,6 +292,7 @@ class Logistic(Distribution):
         value = self.cast(value, self.dtype)
         loc, scale = self._check_param_type(loc, scale)
         z = (value - loc) / scale
+        # pylint: disable=E1130
         return -self.softplus(-z)
 
     def _survival_function(self, value, loc=None, scale=None):
@@ -327,6 +329,7 @@ class Logistic(Distribution):
         value = self.cast(value, self.dtype)
         loc, scale = self._check_param_type(loc, scale)
         z = (value - loc) / scale
+        # pylint: disable=E1130
         return -self.softplus(z)
 
     def _sample(self, shape=(), loc=None, scale=None):
diff --git a/mindspore/nn/probability/distribution/normal.py b/mindspore/nn/probability/distribution/normal.py
index b15a2d23080..736b455455d 100644
--- a/mindspore/nn/probability/distribution/normal.py
+++ b/mindspore/nn/probability/distribution/normal.py
@@ -164,10 +164,11 @@ class Normal(Distribution):
         self.sqrt = P.Sqrt()
 
     def extend_repr(self):
+        """Display instance object as string."""
         if self.is_scalar_batch:
-            s = f'mean = {self._mean_value}, standard deviation = {self._sd_value}'
+            s = 'mean = {}, standard deviation = {}'.format(self._mean_value, self._sd_value)
         else:
-            s = f'batch_shape = {self._broadcast_shape}'
+            s = 'batch_shape = {}'.format(self._broadcast_shape)
         return s
 
     def _get_dist_type(self):
diff --git a/mindspore/nn/probability/distribution/poisson.py b/mindspore/nn/probability/distribution/poisson.py
index b3d81886c23..ac398daf798 100644
--- a/mindspore/nn/probability/distribution/poisson.py
+++ b/mindspore/nn/probability/distribution/poisson.py
@@ -155,10 +155,11 @@ class Poisson(Distribution):
         return self._rate
 
     def extend_repr(self):
+        """Display instance object as string."""
         if self.is_scalar_batch:
-            s = f'rate = {self.rate}'
+            s = 'rate = {}'.format(self.rate)
         else:
-            s = f'batch_shape = {self._broadcast_shape}'
+            s = 'batch_shape = {}'.format(self._broadcast_shape)
         return s
 
     def _get_dist_type(self):
@@ -219,6 +220,7 @@ class Poisson(Distribution):
         safe_x = self.select(self.less(value, zeros), zeros, value)
         y = log_rate * safe_x - self.lgamma(safe_x + 1.)
         comp = self.equal(value, safe_x)
+        # pylint: disable=E1130
         log_unnormalized_prob = self.select(comp, y, -inf)
         log_normalization = self.exp(log_rate)
         return log_unnormalized_prob - log_normalization
diff --git a/mindspore/nn/probability/distribution/uniform.py b/mindspore/nn/probability/distribution/uniform.py
index 5095bb9f328..3c825ac1f3f 100644
--- a/mindspore/nn/probability/distribution/uniform.py
+++ b/mindspore/nn/probability/distribution/uniform.py
@@ -170,10 +170,11 @@ class Uniform(Distribution):
         self.uniform = C.uniform
 
     def extend_repr(self):
+        """Display instance object as string."""
         if self.is_scalar_batch:
-            s = f'low = {self.low}, high = {self.high}'
+            s = 'low = {}, high = {}'.format(self.low, self.high)
         else:
-            s = f'batch_shape = {self._broadcast_shape}'
+            s = 'batch_shape = {}'.format(self._broadcast_shape)
         return s
 
     @property
diff --git a/mindspore/numpy/array_creations.py b/mindspore/numpy/array_creations.py
index b6c0bc4f537..90a2b7f8ab0 100644
--- a/mindspore/numpy/array_creations.py
+++ b/mindspore/numpy/array_creations.py
@@ -235,12 +235,12 @@ def copy_(a):
 
     Args:
         a (Union[int, float, bool, list, tuple, Tensor]): Input data, in any form that can
-            be converted to a `Tensor`. This includes Tensor, list, tuple and numbers.
+            be converted to a Tensor. This includes Tensor, list, tuple and numbers.
 
     Returns:
         Tensor, has the same data as `a`.
 
-     Raises:
+    Raises:
         TypeError: If input `a` has type not specified above.
         ValueError: If input `a` has different sizes at different dimensions.
 
diff --git a/mindspore/numpy/array_ops.py b/mindspore/numpy/array_ops.py
index 92189ae52c0..8fecaa811ee 100644
--- a/mindspore/numpy/array_ops.py
+++ b/mindspore/numpy/array_ops.py
@@ -200,7 +200,7 @@ def rollaxis(x, axis, start=0):
 
     axis = _check_axes_range(axis, ndim)
     start = _check_start_normalize(start, ndim)
-    if start - axis >= 0 and start - axis <= 1:
+    if 0 <= start - axis <= 1:
         return x
     perm = F.make_range(0, ndim)
     new_perm = None
diff --git a/mindspore/numpy/math_ops.py b/mindspore/numpy/math_ops.py
index 031e4716993..beb7cb110f2 100644
--- a/mindspore/numpy/math_ops.py
+++ b/mindspore/numpy/math_ops.py
@@ -1444,14 +1444,14 @@ def amin(a, axis=None, keepdims=False, initial=None, where=True):
             axes along which to operate. By default, flattened input is used. If
             this is a tuple of ints, the minimum is selected over multiple axes,
             instead of a single axis or all the axes as before.
-        keepdims (boolean, optional): defaults to False.
+        keepdims (bool, optional): defaults to False.
             If this is set to True, the axes which are reduced are left in the
             result as dimensions with size one. With this option, the result will
             broadcast correctly against the input array.
-        initial (scalar, optional):
+        initial (Number, optional):
             The maximum value of an output element. Must be present to allow
             computation on empty slice.
-        where (boolean Tensor, optional): defaults to True.
+        where (bool Tensor, optional): defaults to True.
             A boolean array which is broadcasted to match the dimensions of array,
             and selects elements to include in the reduction. If non-default value
             is passed, initial must also be provided.
diff --git a/mindspore/ops/_grad/grad_array_ops.py b/mindspore/ops/_grad/grad_array_ops.py
index d025f7a8ec1..b08035c5429 100644
--- a/mindspore/ops/_grad/grad_array_ops.py
+++ b/mindspore/ops/_grad/grad_array_ops.py
@@ -202,7 +202,7 @@ def get_bprop_squeeze(self):
 @bprop_getters.register(P.Flatten)
 def get_bprop_flatten(self):
     """Generate bprop for Flatten"""
-    flatten_grad = G.FlattenGrad()
+    flatten_grad = P.Reshape()
 
     def bprop(x, out, dout):
         dx = flatten_grad(dout, shape_op(x))
diff --git a/mindspore/ops/_grad_experimental/grad_comm_ops.py b/mindspore/ops/_grad_experimental/grad_comm_ops.py
index 878a2d094a1..d0cff6bc7c6 100644
--- a/mindspore/ops/_grad_experimental/grad_comm_ops.py
+++ b/mindspore/ops/_grad_experimental/grad_comm_ops.py
@@ -25,9 +25,11 @@ def get_bprop_neighborexchange(self):
     send_rank_ids = self.recv_rank_ids
     recv_rank_ids = self.send_rank_ids
     recv_shapes = self.send_shapes
+    send_shapes = self.recv_shapes
     recv_type = self.recv_type
-    neighborexchange_grad = NeighborExchange(send_rank_ids, recv_rank_ids, recv_shapes, recv_shapes, recv_type, group)
+    neighborexchange_grad = NeighborExchange(send_rank_ids, recv_rank_ids, recv_shapes, send_shapes, recv_type, group)
 
     def bprop(x, out, dout):
         return (neighborexchange_grad(dout),)
+
     return bprop
diff --git a/mindspore/ops/_grad_experimental/grad_inner_ops.py b/mindspore/ops/_grad_experimental/grad_inner_ops.py
index ff84e8ffd65..be38eefaa61 100644
--- a/mindspore/ops/_grad_experimental/grad_inner_ops.py
+++ b/mindspore/ops/_grad_experimental/grad_inner_ops.py
@@ -31,3 +31,17 @@ def get_bprop_tensor_copy_slices(self):
         return x_grad, update_grad, zeros_like(begin), zeros_like(end), zeros_like(stride)
 
     return bprop
+
+
+@bprop_getters.register(inner.Roll)
+def get_bprop_roll(self):
+    """Generate bprop for Roll"""
+    shift = self.shift
+    axis = self.axis
+    roll_grad = inner.Roll(-shift, axis)
+
+    def bprop(x_input, out, dout):
+        dx = roll_grad(dout)
+        return (dx,)
+
+    return bprop
diff --git a/mindspore/ops/_op_impl/_custom_op/batchnorm_fold2_grad_reduce.py b/mindspore/ops/_op_impl/_custom_op/batchnorm_fold2_grad_reduce.py
index cc7938b9374..12dbf4bcac7 100644
--- a/mindspore/ops/_op_impl/_custom_op/batchnorm_fold2_grad_reduce.py
+++ b/mindspore/ops/_op_impl/_custom_op/batchnorm_fold2_grad_reduce.py
@@ -21,7 +21,6 @@ from te.platform.cce_build import build_config
 from topi import generic
 from topi.cce import util
 from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
-from impl.bn_training_reduce import bn_training_reduce_schedule_nd
 
 SHAPE_SIZE_LIMIT = 2147483648
 
@@ -100,7 +99,7 @@ def batchnorm_fold2_grad_reduce(dout, x, dout_reduce, dout_x_reduce, kernel_name
 
         te.lang.cce.cce_build_code(sch, config)
         return
-
+    from impl.bn_training_reduce import bn_training_reduce_schedule_nd
     sch, tensor_list = bn_training_reduce_schedule_nd(res_list)
     with build_config:
         tvm.build(sch, tensor_list, "cce", name=kernel_name)
diff --git a/mindspore/ops/_op_impl/cpu/__init__.py b/mindspore/ops/_op_impl/cpu/__init__.py
index 0b1f418ecf8..3863143050f 100644
--- a/mindspore/ops/_op_impl/cpu/__init__.py
+++ b/mindspore/ops/_op_impl/cpu/__init__.py
@@ -65,3 +65,7 @@ from .pad import _pad_cpu
 from .range import _range_cpu
 from .tensor_copy_slices import _tensor_copy_slices_cpu
 from .l2loss import _l2loss_cpu
+from .pyfunc import _pyfunc_cpu
+from .buffer_append import _buffer_append_cpu
+from .buffer_get import _buffer_get_cpu
+from .buffer_sample import _buffer_sample_cpu
diff --git a/mindspore/ops/_op_impl/cpu/mirror_pad.py b/mindspore/ops/_op_impl/cpu/mirror_pad.py
index 9ab0a4f65ea..47454eb4fe1 100644
--- a/mindspore/ops/_op_impl/cpu/mirror_pad.py
+++ b/mindspore/ops/_op_impl/cpu/mirror_pad.py
@@ -21,9 +21,11 @@ mirror_pad_op_info = CpuRegOp("MirrorPad") \
     .output(0, "y", "required") \
     .dtype_format(DataType.F16_Default, DataType.I64_Default, DataType.F16_Default) \
     .dtype_format(DataType.F32_Default, DataType.I64_Default, DataType.F32_Default) \
+    .dtype_format(DataType.F64_Default, DataType.I64_Default, DataType.F64_Default) \
     .dtype_format(DataType.I32_Default, DataType.I64_Default, DataType.I32_Default) \
     .dtype_format(DataType.F16_Default, DataType.I32_Default, DataType.F16_Default) \
     .dtype_format(DataType.F32_Default, DataType.I32_Default, DataType.F32_Default) \
+    .dtype_format(DataType.F64_Default, DataType.I32_Default, DataType.F64_Default) \
     .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \
     .get_op_info()
 
diff --git a/mindspore/ops/_op_impl/cpu/mirror_pad_grad.py b/mindspore/ops/_op_impl/cpu/mirror_pad_grad.py
index 6bd0c88025a..feb69cd9528 100644
--- a/mindspore/ops/_op_impl/cpu/mirror_pad_grad.py
+++ b/mindspore/ops/_op_impl/cpu/mirror_pad_grad.py
@@ -21,9 +21,11 @@ mirror_pad_grad_op_info = CpuRegOp("MirrorPadGrad") \
     .output(0, "y", "required") \
     .dtype_format(DataType.F16_Default, DataType.I64_Default, DataType.F16_Default) \
     .dtype_format(DataType.F32_Default, DataType.I64_Default, DataType.F32_Default) \
+    .dtype_format(DataType.F64_Default, DataType.I64_Default, DataType.F64_Default) \
     .dtype_format(DataType.I32_Default, DataType.I64_Default, DataType.I32_Default) \
     .dtype_format(DataType.F16_Default, DataType.I32_Default, DataType.F16_Default) \
     .dtype_format(DataType.F32_Default, DataType.I32_Default, DataType.F32_Default) \
+    .dtype_format(DataType.F64_Default, DataType.I32_Default, DataType.F64_Default) \
     .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \
     .get_op_info()
 
diff --git a/mindspore/ops/_op_impl/cpu/pad.py b/mindspore/ops/_op_impl/cpu/pad.py
index 08e0dd4ea00..6b6be6fed20 100644
--- a/mindspore/ops/_op_impl/cpu/pad.py
+++ b/mindspore/ops/_op_impl/cpu/pad.py
@@ -21,6 +21,7 @@ pad_op_info = CpuRegOp("Pad") \
     .output(0, "y", "required") \
     .dtype_format(DataType.F16_Default, DataType.F16_Default) \
     .dtype_format(DataType.F32_Default, DataType.F32_Default) \
+    .dtype_format(DataType.F64_Default, DataType.F64_Default) \
     .dtype_format(DataType.I32_Default, DataType.I32_Default) \
     .get_op_info()
 
diff --git a/mindspore/ops/_op_impl/tbe/__init__.py b/mindspore/ops/_op_impl/tbe/__init__.py
index 7108c57a533..a1d88ff4faa 100644
--- a/mindspore/ops/_op_impl/tbe/__init__.py
+++ b/mindspore/ops/_op_impl/tbe/__init__.py
@@ -289,7 +289,6 @@ from .reciprocal_grad import _reciprocal_grad_tbe
 from .sqrt_grad import _sqrt_grad_tbe
 from .sqrt_grad_ds import _sqrt_grad_ds_tbe
 from .rsqrt_grad import _rsqrt_grad_tbe
-from .flatten_grad import _flatten_grad_tbe
 from .scatter_add import _scatter_add_tbe
 from .scatter_add_ds import _scatter_add_ds_tbe
 from .atan2 import _atan2_tbe
@@ -326,6 +325,7 @@ from .basic_lstm_cell_input_grad import _basic_lstm_cell_input_grad_tbe
 from .dynamic_rnn import _dynamic_rnn_tbe
 from .dynamic_gru_v2 import _dynamic_gru_v2_tbe
 from .gru_v2_hidden_grad import _gru_v2_hidden_grad_tbe
+from .gru_v2_hidden_grad_cell import _gru_v2_hidden_grad_cell_tbe
 from .lstm_input_grad import _lstm_input_grad_tbe
 from .confusion_matrix import _confusion_matrix_tbe
 from .broadcast_to import _broadcast_to_tbe
@@ -393,6 +393,7 @@ from .not_equal_ds import _not_ds_equal_tbe
 from .reciprocal_ds import _reciprocal_ds_tbe
 from .ctc_loss_v2 import _ctc_loss_v2_tbe
 from .ctc_loss_v2_grad import _ctc_loss_v2_grad_tbe
+from .roll import _roll_tbe
 from .soft_shrink import _soft_shrink_tbe
 from .soft_shrink_grad import _soft_shrink_grad_tbe
 from .hsigmoid_grad import _hsigmoid_grad_tbe
diff --git a/mindspore/ops/_op_impl/tbe/flatten_grad.py b/mindspore/ops/_op_impl/tbe/flatten_grad.py
deleted file mode 100644
index 43046bb619b..00000000000
--- a/mindspore/ops/_op_impl/tbe/flatten_grad.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""Reshape op"""
-from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
-flatten_grad_op_info = TBERegOp("FlattenGrad") \
-    .fusion_type("OPAQUE") \
-    .async_flag(False) \
-    .binfile_name("reshape.so") \
-    .compute_cost(10) \
-    .kernel_name("reshape") \
-    .partial_flag(True) \
-    .attr("shape", "required", "listInt", "all") \
-    .input(0, "x", False, "required", "all") \
-    .output(0, "y", False, "required", "all") \
-    .dtype_format(DataType.I32_Default, DataType.I32_Default) \
-    .dtype_format(DataType.F16_Default, DataType.F16_Default) \
-    .dtype_format(DataType.F32_Default, DataType.F32_Default) \
-    .get_op_info()
-@op_info_register(flatten_grad_op_info)
-def _flatten_grad_tbe():
-    """Reshape TBE register"""
-    return
diff --git a/mindspore/ops/_op_impl/tbe/reshape.py b/mindspore/ops/_op_impl/tbe/reshape.py
deleted file mode 100644
index d46fd966d8c..00000000000
--- a/mindspore/ops/_op_impl/tbe/reshape.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-
-"""Reshape op"""
-from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
-
-reshape_op_info = TBERegOp("Reshape") \
-    .fusion_type("OPAQUE") \
-    .async_flag(False) \
-    .binfile_name("reshape.so") \
-    .compute_cost(10) \
-    .kernel_name("reshape") \
-    .partial_flag(True) \
-    .attr("shape", "required", "listInt", "all") \
-    .input(0, "x", False, "required", "all") \
-    .output(0, "y", False, "required", "all") \
-    .dtype_format(DataType.I32_Default, DataType.I32_Default) \
-    .dtype_format(DataType.F16_Default, DataType.F16_Default) \
-    .dtype_format(DataType.F32_Default, DataType.F32_Default) \
-    .get_op_info()
-
-
-@op_info_register(reshape_op_info)
-def _reshape_tbe():
-    """Reshape TBE register"""
-    return
diff --git a/mindspore/ops/_register_for_op.py b/mindspore/ops/_register_for_op.py
index beeda21a509..c82a9bbba04 100644
--- a/mindspore/ops/_register_for_op.py
+++ b/mindspore/ops/_register_for_op.py
@@ -47,3 +47,12 @@ class Registry(UserDict):
                 if key in self:
                     fn = self[prim_obj.name]
         return fn
+
+class PyFuncRegistry(UserDict):
+    def register(self, key, value):
+        self[key] = value
+
+    def get(self, key):
+        if key not in self:
+            raise ValueError(f"Python function with key{key} not registered.")
+        return self[key]
diff --git a/mindspore/ops/bprop_mindir/Identity_bprop.mindir b/mindspore/ops/bprop_mindir/Identity_bprop.mindir
index 39bfa0862c2..03c502861c7 100644
--- a/mindspore/ops/bprop_mindir/Identity_bprop.mindir
+++ b/mindspore/ops/bprop_mindir/Identity_bprop.mindir
@@ -1,9 +1,9 @@
 
-0.1.0	MindSpore*1.4.0:�
+0.1.0	MindSpore*1.4.0.20210815:�
 �
-bprop.15:doutbprop.15:[CNode]17:2bprop.15:[CNode]16:1"S-Prim-MakeTuple:HGradients/Default/network-NetIdentity/gradIdentity/S-Prim-MakeTuple-op15bprop.15*
+bprop.10:doutbprop.10:[CNode]12:2bprop.10:[CNode]11:1"S-Prim-MakeTuple:HGradients/Default/network-NetIdentity/gradIdentity/S-Prim-MakeTuple-op22bprop.10*
 
-bprop.15:x*
-bprop.15:out*
-bprop.15:dout2
-bprop.15:[CNode]17:2:�027af68f320ba40d9fbd0893da424c07f9c3a4ec82e98f9543bff9b5a15547a2102a58399653345b09bd6f5b337c4b81c4f8900664c0abc09fb80f38f8e95be82366f7bd59ea5ec135e982de03b4f7cab6b61d833d046a6e13f78bdaf2fb2b224c332efad4a51b4773cb78093dd53a4ca850b2dc6cdd5f2ae47106b3fda77bb3565f906930f68ca2413e9ad958d105e129e717cd183b95d11d65a8b0b030fc0d65c0e00bc893ef15ec6199798d6c8c46997153587d375b3240c1195ff2c7278c7e635a08323207b4cb3f73fd8437b4d7ee28a7676a68f005a7749bd19e5ed4ec99802e8da0efad2a3f80e99bfdcc99c4d54f2769de69733086a4722cb141371ba6c407ad6a3b57190d3702d6a45031d13b97bb6952735edf94fb36f73dbff6cdab258748286fc6d783abacce203dfc79d2fc31e23a427ce1f86e08777a687f71c414b8c313aac4f85c6217fbbb7009dd079b2d5548f8b695a470a11cb8cc83e6f5e78f5b3c67f2e7bf339b250c3638aee952e1a073002e2834011401f3827260
\ No newline at end of file
+bprop.10:x*
+bprop.10:out*
+bprop.10:dout2
+bprop.10:[CNode]12:2:�027af68f320ba40d9fbd0893da424c07f9c3a4ec82e98f9543bff9b5a15547a2087787fe3abde92d74a97b5b9f48f23d8ccdd6de450a931c64f578b83dcb5c2f102a58399653345b09bd6f5b337c4b81c4f8900664c0abc09fb80f38f8e95be82366f7bd59ea5ec135e982de03b4f7cab6b61d833d046a6e13f78bdaf2fb2b224c332efad4a51b4773cb78093dd53a4ca850b2dc6cdd5f2ae47106b3fda77bb3565f906930f68ca2413e9ad958d105e129e717cd183b95d11d65a8b0b030fc0d65c0e00bc893ef15ec6199798d6c8c46997153587d375b3240c1195ff2c7278c7e635a08323207b4cb3f73fd8437b4d7ee28a7676a68f005a7749bd19e5ed4eca6c407ad6a3b57190d3702d6a45031d13b97bb6952735edf94fb36f73dbff6cdab258748286fc6d783abacce203dfc79d2fc31e23a427ce1f86e08777a687f71c414b8c313aac4f85c6217fbbb7009dd079b2d5548f8b695a470a11cb8cc83e6f5e78f5b3c67f2e7bf339b250c3638aee952e1a073002e2834011401f3827260
\ No newline at end of file
diff --git a/mindspore/ops/bprop_mindir/ReLU_bprop.mindir b/mindspore/ops/bprop_mindir/ReLU_bprop.mindir
index 728be19742d..b4bc4ccf0f4 100644
--- a/mindspore/ops/bprop_mindir/ReLU_bprop.mindir
+++ b/mindspore/ops/bprop_mindir/ReLU_bprop.mindir
@@ -1,11 +1,11 @@
 
-0.1.0	MindSpore*1.4.0:�
+0.1.0	MindSpore*1.4.0.20210815:�
 �
-bprop.4:dout
-bprop.4:outbprop.4:dx:1bprop.4:dx:1"S-Prim-ReluGrad:>Gradients/Default/network-NetRelu/gradReLU/S-Prim-ReluGrad-op5
+bprop.2:dout
+bprop.2:outbprop.2:dx:1bprop.2:dx:1"S-Prim-ReluGrad:>Gradients/Default/network-NetRelu/gradReLU/S-Prim-ReluGrad-op5
 �
-bprop.4:dx:1bprop.4:[CNode]6:3bprop.4:[CNode]5:2"S-Prim-MakeTuple:?Gradients/Default/network-NetRelu/gradReLU/S-Prim-MakeTuple-op6bprop.4*
-	bprop.4:x*
-bprop.4:out*
-bprop.4:dout2
-bprop.4:[CNode]6:3:�027af68f320ba40d9fbd0893da424c07f9c3a4ec82e98f9543bff9b5a15547a2102a58399653345b09bd6f5b337c4b81c4f8900664c0abc09fb80f38f8e95be82366f7bd59ea5ec135e982de03b4f7cab6b61d833d046a6e13f78bdaf2fb2b224c332efad4a51b4773cb78093dd53a4ca850b2dc6cdd5f2ae47106b3fda77bb3565f906930f68ca2413e9ad958d105e129e717cd183b95d11d65a8b0b030fc0d65c0e00bc893ef15ec6199798d6c8c46997153587d375b3240c1195ff2c7278c7e635a08323207b4cb3f73fd8437b4d7ee28a7676a68f005a7749bd19e5ed4ec99802e8da0efad2a3f80e99bfdcc99c4d54f2769de69733086a4722cb141371ba6c407ad6a3b57190d3702d6a45031d13b97bb6952735edf94fb36f73dbff6cdab258748286fc6d783abacce203dfc79d2fc31e23a427ce1f86e08777a687f71c414b8c313aac4f85c6217fbbb7009dd079b2d5548f8b695a470a11cb8cc83e6f5e78f5b3c67f2e7bf339b250c3638aee952e1a073002e2834011401f3827260
\ No newline at end of file
+bprop.2:dx:1bprop.2:[CNode]4:3bprop.2:[CNode]3:2"S-Prim-MakeTuple:?Gradients/Default/network-NetRelu/gradReLU/S-Prim-MakeTuple-op6bprop.2*
+	bprop.2:x*
+bprop.2:out*
+bprop.2:dout2
+bprop.2:[CNode]4:3:�027af68f320ba40d9fbd0893da424c07f9c3a4ec82e98f9543bff9b5a15547a2087787fe3abde92d74a97b5b9f48f23d8ccdd6de450a931c64f578b83dcb5c2f102a58399653345b09bd6f5b337c4b81c4f8900664c0abc09fb80f38f8e95be82366f7bd59ea5ec135e982de03b4f7cab6b61d833d046a6e13f78bdaf2fb2b224c332efad4a51b4773cb78093dd53a4ca850b2dc6cdd5f2ae47106b3fda77bb3565f906930f68ca2413e9ad958d105e129e717cd183b95d11d65a8b0b030fc0d65c0e00bc893ef15ec6199798d6c8c46997153587d375b3240c1195ff2c7278c7e635a08323207b4cb3f73fd8437b4d7ee28a7676a68f005a7749bd19e5ed4eca6c407ad6a3b57190d3702d6a45031d13b97bb6952735edf94fb36f73dbff6cdab258748286fc6d783abacce203dfc79d2fc31e23a427ce1f86e08777a687f71c414b8c313aac4f85c6217fbbb7009dd079b2d5548f8b695a470a11cb8cc83e6f5e78f5b3c67f2e7bf339b250c3638aee952e1a073002e2834011401f3827260
\ No newline at end of file
diff --git a/mindspore/ops/composite/array_ops.py b/mindspore/ops/composite/array_ops.py
index dc751eab9df..0649eeccb66 100644
--- a/mindspore/ops/composite/array_ops.py
+++ b/mindspore/ops/composite/array_ops.py
@@ -139,7 +139,7 @@ def sequence_mask(lengths, maxlen=None):
           less than or equal to `maxlen`. Values greater than `maxlen` will be treated as `maxlen`.
           Must be type int32 or int64.
         - **maxlen** (int) - size of the last dimension of returned tensor. Must be positive and same
-          type as elements in `lengths`.
+          type as elements in `lengths`. Default is None.
 
     Outputs:
         One mask tensor of shape lengths.shape + (maxlen,).
diff --git a/mindspore/ops/composite/clip_ops.py b/mindspore/ops/composite/clip_ops.py
index 78d3474ecf2..6efc3699926 100644
--- a/mindspore/ops/composite/clip_ops.py
+++ b/mindspore/ops/composite/clip_ops.py
@@ -152,7 +152,7 @@ def clip_by_global_norm(x, clip_norm=1.0, use_norm=None):
 
     Returns:
         tuple[Tensor], a clipped Tensor. It has the same data type as `x` and each Tensor in the output tuple is the
-          same as the original input shape.
+        same as the original input shape.
 
     Supported Platforms:
         ``Ascend`` ``GPU``
@@ -161,7 +161,7 @@ def clip_by_global_norm(x, clip_norm=1.0, use_norm=None):
         >>> x1 = np.array([[2., 3.], [1., 2.]]).astype(np.float32)
         >>> x2 = np.array([[1., 4.], [3., 1.]]).astype(np.float32)
         >>> input_x = (Tensor(x1), Tensor(x2))
-        >>> out = clip_by_global_norm(input_x, 1.0)
+        >>> out = ops.clip_by_global_norm(input_x, 1.0)
         >>> print(out)
         (Tensor(shape=[2, 2], dtype=Float32, value=
         [[ 2.98142403e-01,  4.47213590e-01],
diff --git a/mindspore/ops/composite/math_ops.py b/mindspore/ops/composite/math_ops.py
index a5b0f2f6c96..5c82443d650 100644
--- a/mindspore/ops/composite/math_ops.py
+++ b/mindspore/ops/composite/math_ops.py
@@ -528,6 +528,7 @@ def batch_dot(x1, x2, axes=None):
         - **axes** (Union[int, tuple(int), list(int)]) - Single value or tuple/list of length 2 with dimensions
           specified for `a` and `b` each. If single value `N` passed, automatically picks up last N dims from
           `a` input shape and last N dimensions from `b` input shape in order as axes for each respectively.
+          Default: None.
 
     Outputs:
         Tensor, batch dot product of `x1` and `x2`.For example: The Shape of output
diff --git a/mindspore/ops/op_info_register.py b/mindspore/ops/op_info_register.py
index 4833c4caa1b..4251ef4e80e 100644
--- a/mindspore/ops/op_info_register.py
+++ b/mindspore/ops/op_info_register.py
@@ -381,7 +381,25 @@ class TBERegOp(RegOp):
     Class for TBE operator information register.
 
     Args:
-        op_name (string):kernel name.
+        op_name (str):kernel name.
+
+    Examples:
+        >>> from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
+        >>> abs_op_info = TBERegOp("Abs") \
+        ...    .fusion_type("ELEMWISE") \
+        ...    .async_flag(False) \
+        ...    .binfile_name("abs.so") \
+        ...    .compute_cost(10) \
+        ...    .kernel_name("abs") \
+        ...    .partial_flag(True) \
+        ...    .op_pattern("formatAgnostic") \
+        ...    .input(0, "x", None, "required", None) \
+        ...    .output(0, "y", True, "required", "all") \
+        ...    .dtype_format(DataType.F16_None, DataType.F16_None) \
+        ...    .dtype_format(DataType.F32_None, DataType.F32_None) \
+        ...    .dtype_format(DataType.I32_None, DataType.I32_None) \
+        ...    .get_op_info()
+        >>>
     """
 
     def __init__(self, op_name):
@@ -890,3 +908,6 @@ class DataType:
     F64_HWCN = ("float64", "HWCN")
     F64_NDHWC = ("float64", "NDHWC")
     F64_ChannelLast = ("float64", "ChannelLast")
+
+    C64_Default = ("complex64", "DefaultFormat")
+    C128_Default = ("complex128", "DefaultFormat")
diff --git a/mindspore/ops/operations/__init__.py b/mindspore/ops/operations/__init__.py
index 2237f7f0f5f..6540b5623c3 100644
--- a/mindspore/ops/operations/__init__.py
+++ b/mindspore/ops/operations/__init__.py
@@ -92,7 +92,7 @@ from ._quant_ops import *
 from .other_ops import (Assign, InplaceAssign, IOU, BoundingBoxDecode, BoundingBoxEncode,
                         ConfusionMatrix, PopulationCount, UpdateState, Load,
                         CheckValid, Partial, Depend, identity, CheckBprop, Push, Pull, PullWeight, PushWeight,
-                        StartFLJob, UpdateModel, GetModel)
+                        StartFLJob, UpdateModel, GetModel, PyFunc)
 from ._thor_ops import (CusBatchMatMul, CusCholeskyTrsm, CusFusedAbsMax1, CusImg2Col, CusMatMulCubeDenseLeft,
                         CusMatMulCubeFraczRightMul, CusMatMulCube, CusMatrixCombine, CusTranspose02314,
                         CusMatMulCubeDenseRight,
@@ -110,16 +110,17 @@ from .sponge_ops import (BondForce, BondEnergy, BondAtomEnergy, BondForceWithAto
                          Dihedral14LJForceWithDirectCF, Dihedral14LJEnergy, Dihedral14LJCFForceWithAtomEnergy,
                          Dihedral14LJAtomEnergy, Dihedral14CFEnergy, Dihedral14CFAtomEnergy,
                          GetCenterOfGeometry, MDTemperature, MDIterationLeapFrogLiujian,
-                         CrdToUintCrd, MDIterationSetupRandState, TransferCrd, FFT3D, IFFT3D)
+                         CrdToUintCrd, MDIterationSetupRandState, TransferCrd, FFT3D, IFFT3D, NeighborListUpdate)
 from .sponge_update_ops import (v0coordinaterefresh, v1coordinaterefresh, v2coordinaterefresh, v3coordinaterefresh,
                                 v0forceredistribute, v1forceredistribute, v2forceredistribute, v3forceredistribute,
                                 restrainenergy, restrainforcewithatomenergyandvirial, constrainforcecyclewithvirial,
                                 refreshuintcrd, lastcrdtodr, refreshcrdvel, calculatenowrapcrd, refreshboxmaptimes,
                                 totalc6get, copyfrctosystemgrad, CrdToUintCrdQuarter,
                                 MDIterationLeapFrogLiujianWithMaxVel, GetCenterOfMass, MapCenterOfMass,
-                                NeighborListUpdate, MDIterationLeapFrog,
+                                NeighborListUpdateNew, MDIterationLeapFrog,
                                 MDIterationLeapFrogWithMaxVel, MDIterationGradientDescent,
                                 BondForceWithAtomEnergyAndVirial, ConstrainForceCycle)
+from .rl_ops import (BufferAppend, BufferGetItem, BufferSample)
 
 __all__ = [
     'Unique',
@@ -524,6 +525,11 @@ __all__ = [
     "MDIterationGradientDescent",
     "BondForceWithAtomEnergyAndVirial",
     "ConstrainForceCycle",
+    "PyFunc",
+    "BufferAppend",
+    "BufferGetItem",
+    "BufferSample",
+    "NeighborListUpdateNew",
 ]
 
 __all__.sort()
diff --git a/mindspore/ops/operations/_inner_ops.py b/mindspore/ops/operations/_inner_ops.py
index 07acdef27f4..e76db64bba2 100755
--- a/mindspore/ops/operations/_inner_ops.py
+++ b/mindspore/ops/operations/_inner_ops.py
@@ -500,10 +500,10 @@ class NeighborExchange(Primitive):
     as while receive data from recv_rank_ids.
 
     Args:
-        send_rank_ids (list): Ranks which the data is sent to.
-        recv_rank_ids (list): Ranks which the data is received from.
-        recv_shapes (list): Data shape which received from recv_rank_ids.
-        send_shapes (list): Data shape which send to the send_rank_ids.
+        send_rank_ids (list(int)): Ranks which the data is sent to.
+        recv_rank_ids (list(int)): Ranks which the data is received from.
+        recv_shapes (tuple(list(int))): Data shape which received from recv_rank_ids.
+        send_shapes (tuple(list(int))): Data shape which send to the send_rank_ids.
         recv_type (type): Data type which received from recv_rank_ids
         group (str):
     """
@@ -518,6 +518,9 @@ class NeighborExchange(Primitive):
         self.send_shapes = send_shapes
         self.recv_type = recv_type
 
+    def __call__(self, tensor):
+        raise NotImplementedError
+
 
 class MatrixSetDiag(PrimitiveWithInfer):
     r"""
@@ -954,6 +957,7 @@ class StackInit(PrimitiveWithInfer):
         [[1 3]
          [2 0]]
     """
+
     @prim_attr_register
     def __init__(self, index=1):
         """StackInit"""
@@ -979,6 +983,7 @@ class StackPush(PrimitiveWithInfer):
     Examples:
         Please refer to the usage of `StackInit`.
     """
+
     @prim_attr_register
     def __init__(self, index=1):
         """StackPush"""
@@ -1007,6 +1012,7 @@ class StackPop(PrimitiveWithInfer):
     Examples:
         Please refer to the usage of `StackInit`.
     """
+
     @prim_attr_register
     def __init__(self, index=1, shape=(1,), dtype=mstype.float32):
         """StackPop"""
@@ -1046,6 +1052,7 @@ class StackDestroy(PrimitiveWithInfer):
     Examples:
         Please refer to the usage of `StackInit`.
     """
+
     @prim_attr_register
     def __init__(self, index=1):
         """StackDestroy"""
@@ -1220,3 +1227,69 @@ class TensorCopySlices(Primitive):
     def __init__(self):
         """Initialize TensorScatterUpdate"""
         self.init_prim_io_names(inputs=['x', 'value', 'begin', 'end', 'strides'], outputs=['y'])
+
+
+class Roll(Primitive):
+    """
+    Rolls the elements of a tensor along an axis.
+
+    The elements are shifted positively (towards larger indices) by the offset of `shift` along the dimension of `axis`.
+    Negative `shift` values will shift elements in the opposite direction. Elements that roll passed the last position
+    will wrap around to the first and vice versa. Multiple shifts along multiple axes may be specified.
+
+    Note:
+        This inner operation is valid only if the axis is equal to 0. If the shift and the axis are tuples or lists,
+        this inner operation is valid only for the first pair of elements.
+
+    Args:
+        shift (Union[list(int), tuple(int), int]): Specifies the number of places by which elements are shifted
+            positively (towards larger indices) along the specified dimension. Negative shifts will roll the elements
+            in the opposite direction.
+        axis (Union[list(int), tuple(int), int]): Specifies the dimension indexes of shape to be rolled. The value is
+            forced to be zero in this operation.
+
+    Inputs:
+        - **input_x** (Tensor) - Input tensor.
+
+    Outputs:
+        Tensor, has the same shape and type as `input_x`.
+
+    Raises:
+        TypeError: If `shift` is not an int, a tuple or a list.
+        TypeError: If `axis` is not an int, a tuple or a list.
+        TypeError: If element of `shift` is not an int.
+        TypeError: If element of `axis` is not an int.
+        ValueError: If axis is not equal to 0.
+        ValueError: If shape of `shift` is not equal to 1.
+        ValueError: If shape of `axis` is not equal to 1.
+
+    Supported Platforms:
+        ``Ascend``
+
+    Examples:
+        >>> from mindspore.ops.operations import _inner_ops as inner
+        >>> input_x = Tensor(np.array([0, 1, 2, 3, 4]).astype(np.float32))
+        >>> op = inner.Roll(shift=2, axis=0)
+        >>> output = op(input_x)
+        >>> print(output)
+        [3. 4. 0. 1. 2.]
+        >>> input_x = Tensor(np.array([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]).astype(np.float32))
+        >>> op = inner.Roll(shift=-1, axis=0)
+        >>> output = op(input_x)
+        >>> print(output)
+        [[5. 6. 7. 8. 9.]
+         [0. 1. 2. 3. 4.]]
+    """
+
+    @prim_attr_register
+    def __init__(self, shift, axis):
+        """Initialize Roll"""
+        validator.check_value_type("shift", shift, [int, tuple, list], self.name)
+        validator.check_value_type("axis", axis, [int, tuple, list], self.name)
+        if isinstance(shift, (tuple, list)) and isinstance(axis, (tuple, list)):
+            validator.check_equal_int(len(shift), 1, "shift size", self.name)
+            validator.check_equal_int(len(axis), 1, "shift size", self.name)
+            validator.check_equal_int(axis[0], 0, "axis", self.name)
+        elif isinstance(shift, int) and isinstance(axis, int):
+            validator.check_equal_int(axis, 0, "axis", self.name)
+        self.init_prim_io_names(inputs=['input_x'], outputs=['output'])
diff --git a/mindspore/ops/operations/array_ops.py b/mindspore/ops/operations/array_ops.py
index 50afe154728..94447db60c0 100755
--- a/mindspore/ops/operations/array_ops.py
+++ b/mindspore/ops/operations/array_ops.py
@@ -1264,7 +1264,7 @@ class Fill(PrimitiveWithInfer):
         return out
 
 
-class Ones(PrimitiveWithInfer):
+class Ones(Primitive):
     r"""
     Creates a tensor filled with value ones.
 
@@ -1302,27 +1302,6 @@ class Ones(PrimitiveWithInfer):
     def __init__(self):
         """Initialize Ones"""
 
-    def __infer__(self, dims, dtype):
-        if isinstance(dims['value'], int):
-            shape = (dims['value'],)
-        else:
-            shape = dims['value']
-        validator.check_value_type("shape", shape, [tuple], self.name)
-        for i, item in enumerate(shape):
-            validator.check_non_negative_int(item, shape[i], self.name)
-        valid_types = [mstype.bool_, mstype.int8, mstype.int16, mstype.int32, mstype.int64,
-                       mstype.uint8, mstype.uint16, mstype.uint32, mstype.uint64,
-                       mstype.float16, mstype.float32, mstype.float64]
-        validator.check_types_same_and_valid({"value": dtype['value']}, valid_types, self.name)
-        x_nptype = mstype.dtype_to_nptype(dtype['value'])
-        ret = np.ones(shape, x_nptype)
-        out = {
-            'value': Tensor(ret),
-            'shape': shape,
-            'dtype': x_nptype,
-        }
-        return out
-
 
 class Zeros(Primitive):
     r"""
@@ -2308,7 +2287,7 @@ class Concat(PrimitiveWithInfer):
 
     Outputs:
         Tensor, the shape is :math:`(x_1, x_2, ..., \sum_{i=1}^Nx_{mi}, ..., x_R)`.
-          The data type is the same with `input_x`.
+        The data type is the same with `input_x`.
 
     Raises:
         TypeError: If `axis` is not an int.
@@ -5725,6 +5704,9 @@ class EmbeddingLookup(PrimitiveWithCheck):
         validator.check_subclass("params", params['dtype'], mstype.tensor, self.name)
         validator.check_tensor_dtype_valid("indices", indices['dtype'], mstype.int_type, self.name)
         validator.check_subclass("offset", offset['dtype'], mstype.int_, self.name)
+        indices_shp = indices['shape']
+        if not indices_shp:
+            raise ValueError("'indices' should NOT be a scalar.")
         params_shp = params['shape']
         if len(params_shp) > 2:
             raise ValueError("The dimension of 'params' in EmbeddingLookup must <= 2, but got %d." % len(params_shp))
@@ -5980,8 +5962,15 @@ class SearchSorted(PrimitiveWithInfer):
 
 class TensorScatterMax(PrimitiveWithInfer):
     """
-    This operator is equivalent to TensorScatterAdd, except we take the maximum instead
-    of adding values together.
+    By comparing the value at the position indicated by the index in input_x with the value in the update,
+    the value at the index will eventually be equal to the largest one to create a new tensor.
+
+    The last axis of the index is the depth of each index vector. For each index vector,
+    there must be a corresponding value in update. The shape of update should be equal to the shape of input_x[indices].
+
+    Note:
+        If some values of the `indices` are out of bound, instead of raising an index error,
+        the corresponding `update` will not be updated to `input_x`.
 
     Inputs:
         - **input_x** (Tensor) - The target tensor. The dimension of input_x must be no less than indices.shape[-1].
@@ -6031,8 +6020,15 @@ class TensorScatterMax(PrimitiveWithInfer):
 
 class TensorScatterMin(PrimitiveWithInfer):
     """
-    This operator is equivalent to TensorScatterAdd, except we take the minimum instead
-    of adding values together.
+    By comparing the value at the position indicated by the index in input_x with the value in the update,
+    the value at the index will eventually be equal to the smallest one to create a new tensor.
+
+    The last axis of the index is the depth of each index vector. For each index vector,
+    there must be a corresponding value in update. The shape of update should be equal to the shape of input_x[indices].
+
+    Note:
+        If some values of the `indices` are out of bound, instead of raising an index error,
+        the corresponding `update` will not be updated to `input_x`.
 
     Inputs:
         - **input_x** (Tensor) - The target tensor. The dimension of input_x must be no less than indices.shape[-1].
@@ -6083,8 +6079,18 @@ class TensorScatterMin(PrimitiveWithInfer):
 
 class TensorScatterSub(PrimitiveWithInfer):
     """
-    This operator is equivalent to TensorScatterAdd, except we subtract, instead of
-    adding values together.
+    Creates a new tensor by subtracting the values from the positions in `input_x` indicicated by
+    `indices`, with values from `update`. When multiple values are provided for the same
+    index, the result of the update will be to subtract these values respectively. This operation is almost
+    equivalent to using ScatterNdSub, except that the updates are applied on `Tensor` instead of `Parameter`.
+
+    The last axis of `indices` is the depth of each index vectors. For each index vector,
+    there must be a corresponding value in `update`. The shape of `update` should be
+    equal to the shape of `input_x[indices]`.
+
+    Note:
+        If some values of the `indices` are out of bound, instead of raising an index error,
+        the corresponding `update` will not be updated to `input_x`.
 
     Inputs:
         - **input_x** (Tensor) - The target tensor. The dimension of input_x must be no less than indices.shape[-1].
@@ -6110,8 +6116,8 @@ class TensorScatterSub(PrimitiveWithInfer):
         >>> op = ops.TensorScatterSub()
         >>> output = op(input_x, indices, update)
         >>> print(output)
-        [[ -3.3  0.3  3.6]
-         [ 0.4  0.5 -3.2]]
+        [[-3.3000002  0.3        3.6      ]
+         [ 0.4        0.5       -3.2      ]]
     """
 
     @prim_attr_register
diff --git a/mindspore/ops/operations/comm_ops.py b/mindspore/ops/operations/comm_ops.py
index 19826bd2263..5cee0d71877 100644
--- a/mindspore/ops/operations/comm_ops.py
+++ b/mindspore/ops/operations/comm_ops.py
@@ -26,7 +26,15 @@ from ...common.api import context
 
 class ReduceOp:
     """
-    Operation options for reducing tensors.
+    Operation options for reducing tensors. This is an enumerated type, not an operator.
+    Mainly used in data parallel mode.
+
+    The main calling methods are as follows:
+
+    - SUM: ReduceOp.SUM.
+    - MAX: ReduceOp.MAX.
+    - MIN: ReduceOp.MIN.
+    - PROD: ReduceOp.PROD.
 
     There are four kinds of operation options, "SUM", "MAX", "MIN", and "PROD".
 
@@ -35,8 +43,33 @@ class ReduceOp:
     - MIN: Take the minimum.
     - PROD: Take the product.
 
+    For more, refer to example. Note: This needs to run in an environment with multiple graphics cards.
+
     Supported Platforms:
         ``Ascend`` ``GPU``
+
+    Examples:
+        >>> from mindspore.communication import init
+        >>> from mindspore import Tensor
+        >>> from mindspore.ops.operations.comm_ops import ReduceOp
+        >>> import mindspore.nn as nn
+        >>> import mindspore.ops.operations as ops
+        >>>
+        >>> init()
+        >>> class Net(nn.Cell):
+        ...     def __init__(self):
+        ...         super(Net, self).__init__()
+        ...         self.allreduce_sum = ops.AllReduce(ReduceOp.SUM, group="nccl_world_group")
+        ...
+        ...     def construct(self, x):
+        ...         return self.allreduce_sum(x)
+        ...
+        >>> input_ = Tensor(np.ones([2, 8]).astype(np.float32))
+        >>> net = Net()
+        >>> output = net(input_)
+        >>> print(output)
+        [[4. 5. 6. 0. 0. 0. 0. 0.]
+         [0. 0. 0. 0. 0. 0. 0. 0.]]
     """
     SUM = "sum"
     MAX = "max"
@@ -218,6 +251,7 @@ class _MiniStepAllGather(PrimitiveWithInfer):
         group (str): The communication group to work on. Default: None.
         grad_accumulation_step (int): The grad accumulation step. Default: None.
     """
+
     @prim_attr_register
     def __init__(self, group=GlobalComm.WORLD_COMM_GROUP, grad_accumulation_step=None, mean_flag=None):
         """Initialize _MiniStepAllGather."""
@@ -250,6 +284,7 @@ class _MicroStepAllGather(PrimitiveWithInfer):
     Args:
         group (str): The communication group to work on. Default: None.
     """
+
     @prim_attr_register
     def __init__(self, group=GlobalComm.WORLD_COMM_GROUP, mean_flag=None):
         validator.check_value_type('group', _get_group(group), (str,), self.name)
@@ -421,6 +456,7 @@ class _HostReduceScatter(PrimitiveWithInfer):
         ValueError: If the first dimension of input can not be divided by group size,
                     or group is not set, or rank_id not in [0, 7].
     """
+
     @prim_attr_register
     def __init__(self, op=ReduceOp.SUM, group=None):
         """Initialize _HostReduceScatter."""
@@ -603,12 +639,21 @@ class _AlltoAll(PrimitiveWithInfer):
     def __init__(self, split_count, split_dim, concat_dim, group=GlobalComm.WORLD_COMM_GROUP):
         """Initialize AlltoAll"""
         validator.check_value_type('group', _get_group(group), (str,), self.name)
+        validator.check_is_int(split_count, int)
+        validator.check_is_int(split_dim, int)
+        validator.check_is_int(concat_dim, int)
         self.split_count = split_count
         self.split_dim = split_dim
         self.concat_dim = concat_dim
         self.add_prim_attr('group', _get_group(group))
 
     def infer_shape(self, x_shape):
+        rank_size = get_group_size(_get_group(self.group))
+        if self.split_count != rank_size:
+            raise ValueError(f"split count '{self.split_count}' must be equal to rank size '{rank_size}'.")
+        if x_shape[self.split_dim] % self.split_count != 0:
+            raise ValueError(
+                f"split count '{self.split_count}' must be divisible by rank size '{x_shape[self.split_dim]}'.")
         x_shape[self.concat_dim] = x_shape[self.concat_dim] * self.split_count
         x_shape[self.split_dim] = int(x_shape[self.split_dim] / self.split_count)
         return x_shape
@@ -618,7 +663,7 @@ class _AlltoAll(PrimitiveWithInfer):
         return x_dtype
 
     def __call__(self, tensor):
-        return
+        raise NotImplementedError
 
 
 class _MirrorOperator(PrimitiveWithInfer):
@@ -687,6 +732,7 @@ class _VirtualDiv(PrimitiveWithInfer):
     Args:
         divisor: float32
     """
+
     @prim_attr_register
     def __init__(self, divisor=None):
         """Initialize _VirtualDiv."""
@@ -704,6 +750,7 @@ virtual_div = _VirtualDiv()
 
 class _VirtualAdd(PrimitiveWithInfer):
     """Auto parallel virtual operator. Do nothing in forward, do Add in backward."""
+
     @prim_attr_register
     def __init__(self):
         """Initialize _VirtualAdd."""
@@ -742,6 +789,7 @@ class _VirtualAssignAdd(PrimitiveWithInfer):
     internal use of parallel modules and cannot be called by users.
 
     """
+
     @prim_attr_register
     def __init__(self):
         """Initialize _VirtualAssignAdd."""
@@ -761,6 +809,7 @@ class _VirtualAccuGrad(PrimitiveWithInfer):
     Auto parallel virtual operator. Do nothing in forward, return y in backward. It is only for
     internal use of parallel modules and cannot be called by users.
     """
+
     @prim_attr_register
     def __init__(self):
         """Initialize _VirtualAccuGrad."""
@@ -817,6 +866,7 @@ class _VirtualOutput(PrimitiveWithInfer):
     def infer_dtype(self, x_dtype):
         return x_dtype
 
+
 class _GetTensorSlice(PrimitiveWithInfer):
     """
     Gets tensor slice by device matrix and tensor map.
diff --git a/mindspore/ops/operations/math_ops.py b/mindspore/ops/operations/math_ops.py
index dd036edc5c4..241a69da3c9 100644
--- a/mindspore/ops/operations/math_ops.py
+++ b/mindspore/ops/operations/math_ops.py
@@ -1037,7 +1037,7 @@ class MatMul(PrimitiveWithCheck):
     r"""
     Multiplies matrix `x` and matrix `y`.
 
-     .. math::
+    .. math::
 
         (Output)_{i j}=\sum_{k=1}^{p} a_{i k} b_{k j}=a_{i 1} b_{1 j}+a_{i 2} b_{2 j}+\cdots+a_{i p} b_{p j}, p\in N
 
diff --git a/mindspore/ops/operations/nn_ops.py b/mindspore/ops/operations/nn_ops.py
index 46b6ce6ffd7..b8af8a4916d 100755
--- a/mindspore/ops/operations/nn_ops.py
+++ b/mindspore/ops/operations/nn_ops.py
@@ -353,10 +353,10 @@ class Softplus(Primitive):
 
     Raises:
         TypeError: If `input_x` is not a Tensor.
-        TypeError: If dtype of `input_x` is neither float16 nor float32.
+        TypeError: If the dtype of `input_x` is neither float16 nor float32.
 
     Supported Platforms:
-        ``Ascend``  ``GPU``
+        ``Ascend``  ``GPU`` ``CPU``
 
     Examples:
         >>> input_x = Tensor(np.array([1, 2, 3, 4, 5]), mindspore.float32)
diff --git a/mindspore/ops/operations/other_ops.py b/mindspore/ops/operations/other_ops.py
index 4e746ac04a7..2a5098e7518 100644
--- a/mindspore/ops/operations/other_ops.py
+++ b/mindspore/ops/operations/other_ops.py
@@ -15,13 +15,14 @@
 
 """Other operators."""
 import functools
+from mindspore import log as logger
 from mindspore.common import monad
 from mindspore.common._decorator import deprecated
 from .. import signature as sig
 from ..._checkparam import Validator as validator, Rel
 from ...common import dtype as mstype
 from ..primitive import Primitive, PrimitiveWithCheck, PrimitiveWithInfer, prim_attr_register
-
+from .._register_for_op import PyFuncRegistry
 
 class Assign(Primitive):
     """
@@ -842,3 +843,89 @@ class identity(Primitive):
 
     def __call__(self, x):
         return x
+
+pyfunc_register = PyFuncRegistry()
+def get_pyfunc(fn_id):
+    return pyfunc_register.get(fn_id)
+
+class PyFunc(PrimitiveWithInfer):
+    r"""
+    Execute Python function.
+
+    `PyFunc` encapsulates Python functions as an operator which could be compiled into computation graph.
+    Unlike normal operators, it cannot be exported to MindIR as it is executed in current Python context.
+    As only the weights of the network is stored in the checkpoint, network include `PyFunc` could save
+    checkpoint and load to the network again, but will lose any Python function state.
+
+    .. warning::
+        This is an experimental prototype that is subject to change and/or deletion.
+
+    Args:
+        fn (function): Python function which inputs and outputs should be Python built-in scalar or numpy ndarray.
+        in_types (list[:class:`mindspore.dtype`]): The type of the inputs.
+        in_shapes (list[tuple[int]]): The dimensionality of the inputs. An empty list represents a scalar, otherwise it
+                                      represent a numpy array.
+        out_types (list[:class:`mindspore.dtype`]): The type of the outputs.
+        out_shapes (list[tuple[int]]): The dimensionality of the outputs. An empty list represents a scalar, otherwise
+                                       it represent a numpy array.
+        stateful (bool): Whether the function is stateful or not.
+                         If True, the execution order is same with model definition.
+
+    Inputs:
+        - **input_x** (Union(tuple[Tensor], list[Tensor])) - The input tuple or list
+          is made up of multiple tensors.
+
+    Outputs:
+        tuple[Tensor], execution results Python functions.
+
+    Raises:
+        TypeError: The Python function execution failed.
+        TypeError: The attributes(in_types/in_shapes/out_types/out_shapes) are inconsistent with Python function
+                   specifications.
+
+    Supported Platforms:
+        ``CPU``
+
+    Examples:
+        >>> def func(x1, x2):
+        >>>     return x1 + x2
+        >>> x1 = Tensor(np.array([1, 2, 3]).astype(np.float32))
+        >>> x2 = Tensor(np.array([1, 2, 3]).astype(np.float32))
+        >>> op = P.PyFunc(func, [x1.dtype, x2.dtype], [x1.shape, x2.shape], [x1.dtype], [x1.dtype])
+        >>> output = op((x1, x2))
+        >>> print(output[0].asnumpy())
+        [2. 4. 6.]
+    """
+
+    def __init__(self, fn, in_types, in_shapes, out_types, out_shapes, stateful=True):
+        super(PyFunc, self).__init__(self.__class__.__name__)
+        pyfunc_register.register(id(fn), fn)
+        self.add_prim_attr('fn_id', id(fn))
+        self.add_prim_attr('in_types', in_types)
+        self.add_prim_attr('in_shapes', in_shapes)
+        self.add_prim_attr('out_types', out_types)
+        self.add_prim_attr('out_shapes', out_shapes)
+        validator.check_value_type("in_types", in_types, [list, tuple], self.name)
+        validator.check_value_type("in_shapes", in_shapes, [list, tuple], self.name)
+        validator.check("in_types length", len(in_types), "in_shapes length", len(in_shapes), Rel.EQ, self.name)
+        validator.check_value_type("out_types", out_types, [list, tuple], self.name)
+        validator.check_value_type("out_shapes", out_shapes, [list, tuple], self.name)
+        validator.check("out_types length", len(out_types), "out_shapes length", len(out_shapes), Rel.EQ, self.name)
+        self.add_prim_attr("side_effect_io", stateful)
+        self.add_prim_attr("primitive_target", "CPU")
+
+    def infer_shape(self, *args):
+        if self.out_shapes:
+            return tuple(self.out_shapes)
+
+        logger.warning("The function output are empty tuple. Add a placeholder instead. "
+                       "Do not use it as it could be any uninitialized data.")
+        return ((1,),)
+
+    def infer_dtype(self, *args):
+        if self.out_shapes:
+            return tuple(self.out_types)
+
+        logger.warning("The function output are empty tuple. Add a placeholder instead. "
+                       "Do not use it as it could be any uninitialized data.")
+        return (mstype.int32,)
diff --git a/mindspore/ops/operations/sponge_ops.py b/mindspore/ops/operations/sponge_ops.py
index 8e9773ba5af..093d0c09f5e 100644
--- a/mindspore/ops/operations/sponge_ops.py
+++ b/mindspore/ops/operations/sponge_ops.py
@@ -3045,3 +3045,189 @@ class IFFT3D(PrimitiveWithInfer):
         validator.check_tensor_dtype_valid('input_real', input_real_dtype, mstype.number_type, self.name)
         validator.check_tensor_dtype_valid('input_imag', input_imag_dtype, mstype.number_type, self.name)
         return input_real_dtype
+
+class NeighborListUpdate(PrimitiveWithInfer):
+    """
+    Update (or construct if first time) the Verlet neighbor list for the
+    calculation of short-ranged force. Assume the number of atoms is N,
+    the number of grids divided is G, the maximum number of atoms in one
+    grid is M, the maximum number of atoms in single atom's neighbor list
+    is L, and the number of total atom in excluded list is E.
+
+    Args:
+        grid_numbers(int32): the total number of grids divided.
+        not_first_time(int32): whether to construct the neighbor
+          list first time or not.
+        Nxy(int32): the total number of grids divided in xy plane.
+        excluded_atom_numbers(int32): the total atom numbers in the excluded list.
+        cutoff(float32): the cutoff distance for short-range force calculation.
+        skin(float32): the overflow value of cutoff to maintain a neighbor list.
+        cutoff_square(float32): the suqare value of cutoff.
+        half_skin_square(float32): skin*skin/4, indicates the maximum
+          square value of the distance atom allowed to move between two updates.
+        cutoff_with_skin(float32): cutoff + skin, indicates the
+          radius of the neighbor list for each atom.
+        half_cutoff_with_skin(float32): cutoff_with_skin/2.
+        cutoff_with_skin_square(float32): the square value of cutoff_with_skin.
+        refresh_interval(int32): the number of iteration steps between two updates of neighbor list.
+        max_atom_in_grid_numbers(int32): the maximum number of atoms in one grid.
+
+    Inputs:
+        - **atom_numbers_in_grid_bucket** (Tensor, int32) - [G,], the number of atoms in each grid bucket.
+        - **bucket** (Tensor, int32) - (Tensor,int32) - [G, M], the atom indices in each grid bucket.
+        - **crd** (Tensor, float32) - [N,], the coordinates of each atom.
+        - **box_length** (Tensor, float32) - [3,], the length of 3 dimensions of the simulation box.
+        - **grid_N** (Tensor, int32) - [3,], the number of grids divided of 3 dimensions of the simulation box.
+        - **grid_length_inverse** (float32) - the inverse value of grid length.
+        - **atom_in_grid_serial** (Tensor, int32) - [N,], the grid index for each atom.
+        - **old_crd** (Tensor, float32) - [N, 3], the coordinates before update of each atom.
+        - **crd_to_uint_crd_cof** (Tensor, float32) - [3,], the scale factor
+          between the unsigned int value and the real space coordinates.
+        - **uint_crd** (Tensor, uint32) - [N, 3], the unsigned int coordinates value fo each atom.
+        - **gpointer** (Tensor, int32) - [G, 125], the 125 nearest neighbor grids (including self) of each grid.
+          G is the number of nearest neighbor grids.
+        - **nl_atom_numbers** (Tensor, int32) - [N,], the number of atoms in neighbor list of each atom.
+        - **nl_atom_serial** (Tensor, int32) - [N, L], the indices of atoms in neighbor list of each atom.
+        - **uint_dr_to_dr_cof** (Tensor, float32) - [3,], the scale factor between
+          the real space coordinates and the unsigned int value.
+        - **excluded_list_start** (Tensor, int32) - [N,], the start excluded index in excluded list for each atom.
+        - **excluded_numbers** (Tensor, int32) - [N,], the number of atom excluded in excluded list for each atom.
+        - **excluded_list** (Tensor, int32) - [E,], the contiguous join of excluded list of each atom.
+        - **need_refresh_flag** (Tensor, int32) - [N,], whether the neighbor list of each atom need update or not.
+        - **refresh_count** (Tensor, int32) - [1,], count how many iteration steps have passed since last update.
+
+    Outputs:
+        - **res** (float32)
+
+    Supported Platforms:
+        ``GPU``
+    """
+
+    @prim_attr_register
+    def __init__(self, grid_numbers, atom_numbers, not_first_time, Nxy, excluded_atom_numbers,
+                 cutoff_square, half_skin_square, cutoff_with_skin, half_cutoff_with_skin, cutoff_with_skin_square,
+                 refresh_interval=20, cutoff=10.0, skin=2.0, max_atom_in_grid_numbers=64, max_neighbor_numbers=800):
+        self.grid_numbers = grid_numbers
+        self.atom_numbers = atom_numbers
+        self.refresh_interval = refresh_interval
+        self.not_first_time = not_first_time
+        self.cutoff = cutoff
+        self.skin = skin
+        self.max_atom_in_grid_numbers = max_atom_in_grid_numbers
+        self.Nxy = Nxy
+        self.excluded_atom_numbers = excluded_atom_numbers
+        self.cutoff_square = cutoff_square
+        self.half_skin_square = half_skin_square
+        self.cutoff_with_skin = cutoff_with_skin
+        self.half_cutoff_with_skin = half_cutoff_with_skin
+        self.cutoff_with_skin_square = cutoff_with_skin_square
+        self.max_neighbor_numbers = max_neighbor_numbers
+        self.init_prim_io_names(
+            inputs=['atom_numbers_in_grid_bucket', 'bucket', 'crd', 'box_length', 'grid_N', 'grid_length_inverse',
+                    'atom_in_grid_serial', 'old_crd', 'crd_to_uint_crd_cof', 'uint_crd', 'gpointer', 'nl_atom_numbers',
+                    'nl_atom_serial', 'uint_dr_to_dr_cof', 'excluded_list_start', 'excluded_list', 'excluded_numbers',
+                    'need_refresh_flag', 'refresh_count'], outputs=['res'])
+
+        self.add_prim_attr('grid_numbers', self.grid_numbers)
+        self.add_prim_attr('atom_numbers', self.atom_numbers)
+        self.add_prim_attr('refresh_interval', self.refresh_interval)
+        self.add_prim_attr('not_first_time', self.not_first_time)
+        self.add_prim_attr('cutoff', self.cutoff)
+        self.add_prim_attr('skin', self.skin)
+        self.add_prim_attr('max_atom_in_grid_numbers', self.max_atom_in_grid_numbers)
+        self.add_prim_attr('Nxy', self.Nxy)
+        self.add_prim_attr('excluded_atom_numbers', self.excluded_atom_numbers)
+        self.add_prim_attr('cutoff_square', self.cutoff_square)
+        self.add_prim_attr('half_skin_square', self.half_skin_square)
+        self.add_prim_attr('cutoff_with_skin', self.cutoff_with_skin)
+        self.add_prim_attr('half_cutoff_with_skin', self.half_cutoff_with_skin)
+        self.add_prim_attr('cutoff_with_skin_square', self.cutoff_with_skin_square)
+
+    def infer_shape(self, atom_numbers_in_grid_bucket_shape, bucket_shape, crd_shape, box_length_shape, grid_N_shape,
+                    grid_length_inverse_shape, atom_in_grid_serial_shape, old_crd_shape, crd_to_uint_crd_cof_shape,
+                    uint_crd_shape, gpointer_shape, nl_atom_numbers_shape, nl_atom_serial_shape,
+                    uint_dr_to_dr_cof_shape, excluded_list_start_shape, excluded_list_shape, excluded_numbers_shape,
+                    need_refresh_flag_shape, refresh_count_shape):
+        assert len(atom_numbers_in_grid_bucket_shape) == 1
+        assert len(bucket_shape) == 2
+        assert len(crd_shape) == 2
+        assert len(box_length_shape) == 1
+        assert len(grid_N_shape) == 1
+        assert len(grid_length_inverse_shape) == 1
+        assert len(atom_in_grid_serial_shape) == 1
+        assert len(old_crd_shape) == 2
+        assert len(crd_to_uint_crd_cof_shape) == 1
+        assert len(uint_crd_shape) == 2
+        assert len(gpointer_shape) == 2
+        assert len(nl_atom_numbers_shape) == 1
+        assert len(nl_atom_serial_shape) == 2
+        assert len(uint_dr_to_dr_cof_shape) == 1
+        assert len(excluded_list_start_shape) == 1
+        assert len(excluded_list_shape) == 1
+        assert len(excluded_numbers_shape) == 1
+        assert len(need_refresh_flag_shape) == 1
+
+        validator.check_int(atom_numbers_in_grid_bucket_shape[0], self.grid_numbers, Rel.EQ,
+                            "atom_numbers_in_grid_bucket", self.name)
+        validator.check_int(bucket_shape[0], self.grid_numbers, Rel.EQ, "bucket", self.name)
+        validator.check_int(bucket_shape[1], self.max_atom_in_grid_numbers, Rel.EQ, "bucket", self.name)
+        validator.check_int(crd_shape[0], self.atom_numbers, Rel.EQ, "crd", self.name)
+        validator.check_int(crd_shape[1], 3, Rel.EQ, "crd", self.name)
+        validator.check_int(box_length_shape[0], 3, Rel.EQ, "box_length", self.name)
+        validator.check_int(grid_N_shape[0], 3, Rel.EQ, "grid_N", self.name)
+        validator.check_int(grid_length_inverse_shape[0], 3, Rel.EQ, "grid_length_inverse", self.name)
+        validator.check_int(atom_in_grid_serial_shape[0], self.atom_numbers, Rel.EQ, "atom_in_grid_serial",
+                            self.name)
+        validator.check_int(old_crd_shape[0], self.atom_numbers, Rel.EQ, "old_crd", self.name)
+        validator.check_int(old_crd_shape[1], 3, Rel.EQ, "old_crd", self.name)
+        validator.check_int(crd_to_uint_crd_cof_shape[0], 3, Rel.EQ, "crd_to_uint_crd_cof", self.name)
+        validator.check_int(uint_crd_shape[0], self.atom_numbers, Rel.EQ, "uint_crd", self.name)
+        validator.check_int(uint_crd_shape[1], 3, Rel.EQ, "uint_crd", self.name)
+        validator.check_int(gpointer_shape[0], self.grid_numbers, Rel.EQ, "gpointer", self.name)
+        validator.check_int(gpointer_shape[1], 125, Rel.EQ, "gpointer", self.name)
+        validator.check_int(nl_atom_numbers_shape[0], self.atom_numbers, Rel.EQ, "nl_atom_numbers", self.name)
+        validator.check_int(nl_atom_serial_shape[0], self.atom_numbers, Rel.EQ, "nl_atom_serial", self.name)
+        validator.check_int(nl_atom_serial_shape[1], self.max_neighbor_numbers, Rel.EQ, "nl_atom_serial",
+                            self.name)
+        validator.check_int(uint_dr_to_dr_cof_shape[0], 3, Rel.EQ, "uint_dr_to_dr_cof", self.name)
+        validator.check_int(excluded_list_start_shape[0], self.atom_numbers, Rel.EQ, "excluded_list_start",
+                            self.name)
+        validator.check_int(excluded_list_shape[0], self.excluded_atom_numbers, Rel.EQ, "excluded_list",
+                            self.name)
+        validator.check_int(excluded_numbers_shape[0], self.atom_numbers, Rel.EQ, "excluded_numbers", self.name)
+        validator.check_int(need_refresh_flag_shape[0], 1, Rel.EQ, "need_refresh_flag", self.name)
+
+        return [1,]
+
+    def infer_dtype(self, atom_numbers_in_grid_bucket_dtype, bucket_dtype, crd_dtype, box_length_dtype, grid_N_dtype,
+                    grid_length_inverse_dtype, atom_in_grid_serial_dtype, old_crd_dtype, crd_to_uint_crd_cof_dtype,
+                    uint_crd_dtype, gpointer_dtype, nl_atom_numbers_dtype, nl_atom_serial_dtype,
+                    uint_dr_to_dr_cof_dtype, excluded_list_start_dtype, excluded_list_dtype, excluded_numbers_dtype,
+                    need_refresh_flag_dtype, refresh_count_dtype):
+        validator.check_tensor_dtype_valid('atom_numbers_in_grid_bucket', atom_numbers_in_grid_bucket_dtype,
+                                           [mstype.int32], self.name)
+        validator.check_tensor_dtype_valid('bucket', bucket_dtype, [mstype.int32], self.name)
+        validator.check_tensor_dtype_valid('crd', crd_dtype, [mstype.float32], self.name)
+        validator.check_tensor_dtype_valid('box_length', box_length_dtype, [mstype.float32], self.name)
+        validator.check_tensor_dtype_valid('grid_N', grid_N_dtype, [mstype.int32], self.name)
+        validator.check_tensor_dtype_valid('grid_length_inverse', grid_length_inverse_dtype, [mstype.float32],
+                                           self.name)
+        validator.check_tensor_dtype_valid('atom_in_grid_serial', atom_in_grid_serial_dtype, [mstype.int32],
+                                           self.name)
+        validator.check_tensor_dtype_valid('old_crd', old_crd_dtype, [mstype.float32], self.name)
+        validator.check_tensor_dtype_valid('crd_to_uint_crd_cof', crd_to_uint_crd_cof_dtype, [mstype.float32],
+                                           self.name)
+        validator.check_tensor_dtype_valid('uint_crd', uint_crd_dtype, [mstype.uint32], self.name)
+        validator.check_tensor_dtype_valid('gpointer', gpointer_dtype, [mstype.int32], self.name)
+        validator.check_tensor_dtype_valid('nl_atom_numbers', nl_atom_numbers_dtype, [mstype.int32], self.name)
+        validator.check_tensor_dtype_valid('nl_atom_serial', nl_atom_serial_dtype, [mstype.int32], self.name)
+        validator.check_tensor_dtype_valid('uint_dr_to_dr_cof', uint_dr_to_dr_cof_dtype, [mstype.float32],
+                                           self.name)
+        validator.check_tensor_dtype_valid('excluded_list_start', excluded_list_start_dtype, [mstype.int32],
+                                           self.name)
+        validator.check_tensor_dtype_valid('excluded_list', excluded_list_dtype, [mstype.int32], self.name)
+        validator.check_tensor_dtype_valid('excluded_numbers', excluded_numbers_dtype, [mstype.int32], self.name)
+        validator.check_tensor_dtype_valid('need_refresh_flag', need_refresh_flag_dtype, [mstype.int32],
+                                           self.name)
+
+        return mstype.float32
diff --git a/mindspore/ops/operations/sponge_update_ops.py b/mindspore/ops/operations/sponge_update_ops.py
index 1c1be718ef5..85f6b33e848 100644
--- a/mindspore/ops/operations/sponge_update_ops.py
+++ b/mindspore/ops/operations/sponge_update_ops.py
@@ -998,7 +998,7 @@ class MapCenterOfMass(PrimitiveWithInfer):
         return mstype.float32
 
 
-class NeighborListUpdate(PrimitiveWithInfer):
+class NeighborListUpdateNew(PrimitiveWithInfer):
     """
     Update (or construct if first time) the Verlet neighbor list for the
     calculation of short-ranged force. Assume the number of atoms is n,
diff --git a/mindspore/ops/primitive.py b/mindspore/ops/primitive.py
index 2c94d657245..d59830a9991 100644
--- a/mindspore/ops/primitive.py
+++ b/mindspore/ops/primitive.py
@@ -619,9 +619,10 @@ def constexpr(fn=None, get_instance=True, name=None):
     to compute constant value using the constants in the constructor.
 
     Args:
-        fn (function): A `fn` use as the infer_value of the output operator.
-        get_instance (bool): If true, return the instance of operator, otherwise return the operator class.
-        name (str): Defines the operator name. If `name` is None, use the function name as op name.
+        fn (function): A `fn` use as the infer_value of the output operator. Default: None.
+        get_instance (bool): If true, return the instance of operator,
+                             otherwise return the operator class. Default: True.
+        name (str): Defines the operator name. If `name` is None, use the function name as op name. Default: None.
 
     Examples:
         >>> from mindspore.ops import constexpr
diff --git a/mindspore/parallel/_auto_parallel_context.py b/mindspore/parallel/_auto_parallel_context.py
index d8069f1056d..50c2e31a55b 100644
--- a/mindspore/parallel/_auto_parallel_context.py
+++ b/mindspore/parallel/_auto_parallel_context.py
@@ -14,6 +14,7 @@
 # ============================================================================
 """Context of auto parallel"""
 import threading
+
 import mindspore.context as context
 import mindspore.log as logger
 from mindspore.parallel._dp_allreduce_fusion import _set_fusion_strategy_by_idx, _set_fusion_strategy_by_size
@@ -39,6 +40,7 @@ class _AutoParallelContext:
 
     def __init__(self):
         self._context_handle = AutoParallelContext.get_instance()
+        self._dataset_strategy_using_str = True
 
     def __new__(cls):
         if cls._instance is None:
@@ -261,24 +263,34 @@ class _AutoParallelContext:
         Set dataset sharding strategy.
 
         Args:
-            dataset_strategy (tuple(tuple)): The dataset sharding strategy.
+            dataset_strategy (str or tuple(tuple)): The dataset sharding strategy.
         """
         self.check_context_handle()
+        if isinstance(dataset_strategy, str):
+            if dataset_strategy not in ("full_batch", "data_parallel"):
+                raise ValueError("The dataset_strategy string should be 'full_batch' or 'data_parallel', "
+                                 "otherwise, incoming tuple(tuple) type strategy")
+            self._context_handle.set_full_batch(dataset_strategy == "full_batch")
+            self._dataset_strategy_using_str = True
+            return
         if not isinstance(dataset_strategy, tuple):
-            raise TypeError(f'strategy must be tuple type, but got:{type(dataset_strategy)}')
+            raise TypeError(f'strategy must be str or tuple type, but got:{type(dataset_strategy)}')
         for ele in dataset_strategy:
             if not isinstance(ele, tuple):
                 raise TypeError(f'The element of strategy must be tuple type, but got:{type(ele)}')
             for dim in ele:
                 if not isinstance(dim, int):
                     raise TypeError(f'The dim of each strategy value must be int type, but got:{type(dim)}')
+        self._dataset_strategy_using_str = False
         self._context_handle.set_dataset_strategy(dataset_strategy)
 
     def get_dataset_strategy(self):
         """Get dataset sharding strategy."""
         self.check_context_handle()
-        if _is_role_pserver():
-            return False
+        if self._dataset_strategy_using_str:
+            if self._context_handle.get_full_batch():
+                return "full_batch"
+            return "data_parallel"
         return self._context_handle.get_dataset_strategy()
 
     def set_grad_accumulation_step(self, grad_accumulation_step):
@@ -659,7 +671,7 @@ _get_auto_parallel_context_func_map = {
 @args_type_check(device_num=int, global_rank=int, gradients_mean=bool, gradient_fp32_sync=bool,
                  loss_repeated_mean=bool, parallel_mode=str, auto_parallel_search_mode=str,
                  parameter_broadcast=bool, strategy_ckpt_load_file=str,
-                 strategy_ckpt_save_file=str, full_batch=bool, dataset_strategy=tuple, enable_parallel_optimizer=bool,
+                 strategy_ckpt_save_file=str, full_batch=bool, enable_parallel_optimizer=bool,
                  grad_accumulation_step=int, all_reduce_fusion_config=list, group_ckpt_save_file=str,
                  communi_parallel_mode=str, optimizer_weight_shard_size=int,
                  optimizer_weight_shard_aggregated_save=bool,
@@ -706,7 +718,7 @@ def _set_auto_parallel_context(**kwargs):
         strategy_ckpt_save_file (str): The path to save parallel strategy checkpoint. Default: ''
         group_ckpt_save_file (str): The path to save parallel group checkpoint. Default: ''
         full_batch (bool): Whether to load the whole batch on each device. Default: False.
-        dataset_strategy (tuplr): Dataset sharding strategy. Default: ().
+        dataset_strategy Union[str, tuple]: Dataset sharding strategy. Default: "data_parallel".
         enable_parallel_optimizer (bool): Enable using optimizer segmentation or not. Default: False.
         all_reduce_fusion_config (list): Set allreduce fusion strategy by parameters indices.
         pipeline_stages (int): Set the stage information for pipeline parallel. This indicates how
diff --git a/mindspore/parallel/_utils.py b/mindspore/parallel/_utils.py
index 617b34bcf2f..4730432508c 100644
--- a/mindspore/parallel/_utils.py
+++ b/mindspore/parallel/_utils.py
@@ -58,20 +58,20 @@ def _check_full_batch():
 
 def _need_to_full():
     """Check whether to convert input to full shape or tensor."""
+    if _get_parallel_mode() not in ("semi_auto_parallel", "auto_parallel"):
+        return False
     dataset_strategy = context.get_auto_parallel_context("dataset_strategy")
-    if dataset_strategy:
+    if dataset_strategy and dataset_strategy not in ("data_parallel", "full_batch"):
         return True
-    parallel_mode = _get_parallel_mode()
-    full_batch = _get_full_batch()
-    need = ((parallel_mode in ("semi_auto_parallel", "auto_parallel"))
-            and (not full_batch))
-    return need
+    return not _get_full_batch()
 
 
 def _to_full_shapes(shapes, device_num):
     """Expanding batch dimension according to device_num, adapt to mindspore minddata graph solution."""
     new_shapes = []
-    dataset_strategy = context.get_auto_parallel_context("dataset_strategy")
+    dataset_strategy = ()
+    if context.get_auto_parallel_context("dataset_strategy") not in ("data_parallel", "full_batch"):
+        dataset_strategy = context.get_auto_parallel_context("dataset_strategy")
     if dataset_strategy:
         if len(shapes) != len(dataset_strategy):
             raise ValueError("The input shapes size {} is not equal to "
@@ -108,7 +108,9 @@ def _to_full_tensor(elem, global_device_num, global_rank, scaling_sens=None):
     if stage_rank >= device_num:
         raise ValueError("The global rank must be smaller than device number, the global rank is {}, "
                          "the device num is {}".format(stage_rank, device_num))
-    dataset_strategy = context.get_auto_parallel_context("dataset_strategy")
+    dataset_strategy = ()
+    if context.get_auto_parallel_context("dataset_strategy") not in ("data_parallel", "full_batch"):
+        dataset_strategy = context.get_auto_parallel_context("dataset_strategy")
     if elem and dataset_strategy:
         if len(elem) != len(dataset_strategy):
             raise ValueError("The input size {} is not equal to "
diff --git a/mindspore/profiler/parser/flops_parser.py b/mindspore/profiler/parser/flops_parser.py
index f779ba8678d..43525582af3 100644
--- a/mindspore/profiler/parser/flops_parser.py
+++ b/mindspore/profiler/parser/flops_parser.py
@@ -78,6 +78,9 @@ class FlopsParser:
             op_name = self._get_op_name(result)
             if op_name in op_name_set or op_name == "":
                 continue
+            if op_name not in op_avg_time_dict:
+                logger.warning("Op name {op_name} is not exist in op average time dict.")
+                continue
             # Convert the unit of task_fops to MFLOPs(1e6).
             task_fops = self._compute_task_flops(result) * 1e-6
             op_avg_time = op_avg_time_dict[op_name]
diff --git a/mindspore/profiler/parser/minddata_analyzer.py b/mindspore/profiler/parser/minddata_analyzer.py
index 34390da9882..00a57dc99dc 100644
--- a/mindspore/profiler/parser/minddata_analyzer.py
+++ b/mindspore/profiler/parser/minddata_analyzer.py
@@ -32,7 +32,6 @@ class MinddataProfilingAnalyzer:
 
     Args:
         source_dir (str): The source directory for MindData profiling input files.
-        device_target (str): Device target, either 'CPU', 'GPU' or 'Ascend'.
         device_id (str): The device ID.
         output_path (str): The target directory for the analyzed summary. Default: `./`.
 
@@ -42,9 +41,8 @@ class MinddataProfilingAnalyzer:
         ProfilerFileNotFoundException: If any of the MindData profiling input files do not exist.
     """
 
-    def __init__(self, source_dir, device_target, device_id, output_path='./'):
+    def __init__(self, source_dir, device_id, output_path='./'):
         # Validate and save input parameters
-        self._validate_device_target(device_target)
         self._device_id = device_id
         self._source_dir = self._validate_directory(source_dir, 'Source directory')
         self._output_path = self._validate_directory(output_path, 'Output path')
@@ -52,7 +50,7 @@ class MinddataProfilingAnalyzer:
         # Get MindData profiling input filenames
         self._pipeline_path_filename = self._get_pipeline_path_filename(source_dir)
         self._cpu_utilization_path_filename = self._get_cpu_utilization_path_filename(source_dir)
-        self._device_trace_path_filename, self._device_trace_file_flag = \
+        self._device_trace_path_filename, self._device_queue_file_found = \
             self._get_device_trace_path_filename(source_dir)
 
         # Save output filename
@@ -106,39 +104,22 @@ class MinddataProfilingAnalyzer:
             logger.warning('The MindData CPU utilization file <%s> is empty.', self._cpu_utilization_path_filename)
             raise ProfilerRawFileException('The MindData CPU utilization file is empty.')
 
-        # Check if a device trace profiling filename was identified
-        if self._device_trace_file_flag:
-            # Open the dataset iterator (CPU) or device queue (GPU, Ascend) trace profiling file
-            with open(self._device_trace_path_filename, 'r') as device_trace_file:
-                try:
-                    device_trace_info = device_trace_file.readlines()
-                except (TypeError) as path_filename_error:
-                    logger.warning(path_filename_error)
-                    raise ProfilerRawFileException(
-                        'Failed to find the MindData trace profiling file.') from path_filename_error
-            if not device_trace_info:
-                logger.warning('The MindData trace profiling file <%s> is empty.', self._device_trace_path_filename)
-                raise ProfilerRawFileException('The MindData trace profiling file is empty.')
-        else:
-            device_trace_info = None
+        # Open the device queue or dataset iterator trace profiling file
+        with open(self._device_trace_path_filename, 'r') as device_trace_file:
+            try:
+                device_trace_info = device_trace_file.readlines()
+            except (TypeError) as path_filename_error:
+                logger.warning(path_filename_error)
+                raise ProfilerRawFileException(
+                    'Failed to find the MindData trace profiling file.') from path_filename_error
+        if not device_trace_info:
+            logger.warning('The MindData trace profiling file <%s> is empty.', self._device_trace_path_filename)
+            raise ProfilerRawFileException('The MindData trace profiling file is empty.')
 
         # Analyze the MindData profiling file information and save the result
         summary_dict = self._analyze_and_save(pipeline_info, cpu_util_info, device_trace_info)
         return summary_dict
 
-    def _validate_device_target(self, device_target):
-        """
-        Validate the device_target.
-
-        Args:
-            device_target (str): Device target, either 'CPU', 'GPU' or 'Ascend'.
-        """
-        if device_target not in ('CPU', 'GPU', 'Ascend'):
-            msg = 'Invalid device target "', device_target, '". Must be "CPU", "GPU" or "Ascend."'
-            logger.warning(msg)
-            raise ValueError(msg)
-        self._device_target = device_target
-
     @staticmethod
     def _validate_directory(dir_name, dir_type):
         """
@@ -219,41 +200,43 @@ class MinddataProfilingAnalyzer:
     def _get_device_trace_path_filename(self, source_dir):
         """
         Get the MindData device trace profiling full path filename.
-        On CPU, the filename is 'dataset_iterator_profiling_<device_id>.txt'.
-        On GPU and Ascend, the filename is 'device_trace_profiling_<device_id>.txt'.
+        File search order:
+        1) 'device_queue_profiling_<device_id>.txt' and then
+        2) 'dataset_iterator_profiling_<device_id>.txt'.
 
         Args:
             source_dir (str): The source directory for MindData profiling files.
 
         Returns:
             str, the MindData device trace profiling full path filename.
-            bool, flag which indicates if device trace profiling filename has been identified or not
+            bool, flag which indicates if 'device_queue_profiling_<device_id>.txt' has been found or not
         """
-        # Initialize flag that device trace file as correctly identified
-        device_trace_file_flag = True
+        # Initialize variable for MindData device trace profiling filename
+        device_trace_path_filename = ''
+        # Initialize flag that 'device_queue_profiling_<device_id>.txt' has not yet been found
+        device_queue_file_found = False
 
-        # Determine the device trace profiling filename
-        if self._device_target in ('GPU', 'Ascend'):
-            device_trace_template_filename = 'device_queue_profiling_{}.txt'
-        elif self._device_target == 'CPU':
-            device_trace_template_filename = 'dataset_iterator_profiling_{}.txt'
-        # Note: No need to else statement since self._device_target has already been verified to be valid
-
-        device_trace_path_filename = os.path.join(
+        txt_names = [os.path.join(
             source_dir,
-            device_trace_template_filename.format(self._device_id))
+            txt_name.format(self._device_id)) for txt_name in
+                     ('device_queue_profiling_{}.txt', 'dataset_iterator_profiling_{}.txt')]
 
-        try:
-            device_trace_path_filename = validate_and_normalize_path(device_trace_path_filename)
-        except RuntimeError:
-            logger.warning('The MindData profiling path <%s> is invalid.', device_trace_path_filename)
-            device_trace_file_flag = False
+        # Search for a device trace profiling file
+        if os.path.exists(txt_names[0]):
+            device_trace_path_filename = txt_names[0]
+            device_queue_file_found = True
+        elif os.path.exists(txt_names[1]):
+            device_trace_path_filename = txt_names[1]
+        else:
+            logger.warning('A MindData device trace profiling file <%s> nor <%s> cannot be found.',
+                           txt_names[0], txt_names[1])
+            raise ProfilerPathErrorException('A MindData device trace profiling file cannot be found.')
 
-        if device_trace_file_flag and not os.path.isfile(device_trace_path_filename):
+        if not os.path.isfile(device_trace_path_filename):
             logger.warning('The MindData device trace profiling file <%s> is not found.', device_trace_path_filename)
-            device_trace_file_flag = False
+            raise ProfilerFileNotFoundException(device_trace_path_filename)
 
-        return device_trace_path_filename, device_trace_file_flag
+        return device_trace_path_filename, device_queue_file_found
 
     def _get_save_path(self, output_path):
         """
@@ -503,7 +486,8 @@ class MinddataProfilingAnalyzer:
             if record[0] == 0:  # type 0: time record
                 q_time[record[1]].append(record[3])
             elif record[0] == 1:  # type 1: connector size record
-                if self._device_target == 'CPU':
+                # Check if dataset_iterator trace profiling file was found
+                if not self._device_queue_file_found:
                     q_time[2].append(record[4] - prev_time)
                     prev_time = record[4]
 
@@ -704,7 +688,8 @@ class BottleneckAnalyzer:
         for op_id in self.op_ids:
             if op_id == self.op_id_not_exist or self.op_names[op_id] in self.non_multithreaded_ops:
                 continue
-            elif self.avg_cpu_pct_per_worker[op_id] > self._AVG_CPU_UTIL_PCT_PER_WORKER_MAXIMUM and \
+
+            if self.avg_cpu_pct_per_worker[op_id] > self._AVG_CPU_UTIL_PCT_PER_WORKER_MAXIMUM and \
                     self.op_names[op_id]:
                 cpu_usage_analysis.append(
                     ("{} is using {}% CPU per worker."
@@ -727,7 +712,8 @@ class BottleneckAnalyzer:
         for op_id in self.op_ids:
             if op_id == self.op_id_not_exist or self.op_names[op_id] in self.non_multithreaded_ops:
                 continue
-            elif self.op_names[op_id] == "Batch":
+
+            if self.op_names[op_id] == "Batch":
                 pass
             else:
                 in_op_id, out_q = self.__get_non_inline_child_recur(
@@ -772,12 +758,12 @@ class BottleneckAnalyzer:
                     self.op_names[op_id] in self.non_multithreaded_ops \
                     or self.op_names[op_id] == "DeviceQueue":
                 continue
-            elif wkr_cpu > self._AVG_CPU_UTIL_PCT_PER_WORKER_MAXIMUM:
+
+            if wkr_cpu > self._AVG_CPU_UTIL_PCT_PER_WORKER_MAXIMUM:
                 bottleneck = self.pipeline_ops[op_id]
                 suggestion = "{} has high CPU utilization per worker of {}%".format(
                     self.pipeline_ops[op_id], wkr_cpu)
                 suggestion += " Try increasing num_parallel_workers above {}.".format(self.num_workers[op_id])
-                break
             elif wkr_cpu < self._AVG_CPU_UTIL_PCT_PER_WORKER_MINIMUM:
                 in_op_id = self.__get_non_inline_child_recur(op_id)
                 in_q_usage = self.queue_utilization_pct[in_op_id]
@@ -789,6 +775,4 @@ class BottleneckAnalyzer:
                         self.pipeline_ops[op_id], wkr_cpu)
                     suggestion += " and abnormal queue usage. Try increasing prefetch_size."
 
-                    break
-
         return [bottleneck], [suggestion]
diff --git a/mindspore/profiler/profiling.py b/mindspore/profiler/profiling.py
index 7f8474ddb03..22a00b37eaa 100644
--- a/mindspore/profiler/profiling.py
+++ b/mindspore/profiler/profiling.py
@@ -299,8 +299,7 @@ class Profiler:
 
         # Analyze minddata information
         try:
-            md_analyzer = MinddataProfilingAnalyzer(self._output_path, self._device_target, self._dev_id,
-                                                    self._output_path)
+            md_analyzer = MinddataProfilingAnalyzer(self._output_path, self._dev_id, self._output_path)
             md_analyzer.analyze()
         except ProfilerException as err:
             logger.warning(err.message)
@@ -358,8 +357,7 @@ class Profiler:
 
         # Analyze minddata information
         try:
-            md_analyzer = MinddataProfilingAnalyzer(self._output_path, self._device_target, self._dev_id,
-                                                    self._output_path)
+            md_analyzer = MinddataProfilingAnalyzer(self._output_path, self._dev_id, self._output_path)
             md_analyzer.analyze()
         except ProfilerException as err:
             logger.warning(err.message)
diff --git a/mindspore/run_check/_check_version.py b/mindspore/run_check/_check_version.py
index fa52264ac12..e70264027f6 100644
--- a/mindspore/run_check/_check_version.py
+++ b/mindspore/run_check/_check_version.py
@@ -207,7 +207,7 @@ class AscendEnvChecker(EnvChecker):
     """ascend environment check"""
 
     def __init__(self):
-        self.version = ["1.79.T10.0.B100"]
+        self.version = ["1.79.T15.0.B150"]
         atlas_nnae_version = "/usr/local/Ascend/nnae/latest/fwkacllib/version.info"
         atlas_toolkit_version = "/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/version.info"
         hisi_fwk_version = "/usr/local/Ascend/fwkacllib/version.info"
diff --git a/mindspore/schema/fl_job.fbs b/mindspore/schema/fl_job.fbs
index 1b798c128b7..e7a3d60a2b0 100644
--- a/mindspore/schema/fl_job.fbs
+++ b/mindspore/schema/fl_job.fbs
@@ -160,3 +160,12 @@ table ResponsePullWeight{
 table FeatureMapList {
   feature_map:[FeatureMap];
 }
+
+table RequestPushMetrics{
+  loss:float;
+  accuracy:float;
+}
+
+table ResponsePushMetrics{
+  retcode:int;
+}
diff --git a/mindspore/train/callback/_loss_monitor.py b/mindspore/train/callback/_loss_monitor.py
index 9f11a7c3cb1..b77c97d6b3d 100644
--- a/mindspore/train/callback/_loss_monitor.py
+++ b/mindspore/train/callback/_loss_monitor.py
@@ -43,6 +43,12 @@ class LossMonitor(Callback):
         self._per_print_times = per_print_times
 
     def step_end(self, run_context):
+        """
+        Print training loss at the end of step.
+
+        Args:
+            run_context (RunContext): Context of the train running.
+        """
         cb_params = run_context.original_args()
         loss = cb_params.net_outputs
 
diff --git a/mindspore/train/callback/_lr_scheduler_callback.py b/mindspore/train/callback/_lr_scheduler_callback.py
index 536b5c2202a..5d0c070f2aa 100644
--- a/mindspore/train/callback/_lr_scheduler_callback.py
+++ b/mindspore/train/callback/_lr_scheduler_callback.py
@@ -51,7 +51,6 @@ class LearningRateScheduler(Callback):
         >>> dataset = create_custom_dataset("custom_dataset_path")
         >>> model.train(1, dataset, callbacks=[LearningRateScheduler(learning_rate_function)],
         ...             dataset_sink_mode=False)
-
     """
 
     def __init__(self, learning_rate_function):
@@ -59,6 +58,12 @@ class LearningRateScheduler(Callback):
         self.learning_rate_function = learning_rate_function
 
     def step_end(self, run_context):
+        """
+        Change the learning_rate at the end of step.
+
+        Args:
+            run_context (RunContext): Context of the train running.
+        """
         cb_params = run_context.original_args()
         arr_lr = cb_params.optimizer.learning_rate.asnumpy()
         lr = float(np.array2string(arr_lr))
diff --git a/mindspore/train/callback/_time_monitor.py b/mindspore/train/callback/_time_monitor.py
index 8adb26713db..a35e060da35 100644
--- a/mindspore/train/callback/_time_monitor.py
+++ b/mindspore/train/callback/_time_monitor.py
@@ -38,9 +38,21 @@ class TimeMonitor(Callback):
         self.epoch_time = time.time()
 
     def epoch_begin(self, run_context):
+        """
+        Record time at the begin of epoch.
+
+        Args:
+            run_context (RunContext): Context of the process running.
+        """
         self.epoch_time = time.time()
 
     def epoch_end(self, run_context):
+        """
+        Print process cost time at the end of epoch.
+
+        Args:
+           run_context (RunContext): Context of the process running.
+        """
         epoch_seconds = (time.time() - self.epoch_time) * 1000
         step_size = self.data_size
         cb_params = run_context.original_args()
diff --git a/mindspore/train/model.py b/mindspore/train/model.py
index 23412cd1f5d..c3ab4c3c511 100644
--- a/mindspore/train/model.py
+++ b/mindspore/train/model.py
@@ -598,6 +598,8 @@ class Model:
             of data will be transferred one by one. The limitation of data transmission per time is 256M.
             If sink_size > 0, each epoch the dataset can be traversed unlimited times until you get sink_size
             elements of the dataset. Next epoch continues to traverse from the end position of the previous traversal.
+            The interface builds the computational graphs and then executes the computational graphs.
+            However, when the 'model.build' is executed first, it only performs the graphs execution.
 
         Args:
             epoch (int): Generally, total number of iterations on the data per epoch.
@@ -653,6 +655,42 @@ class Model:
                     dataset_sink_mode=dataset_sink_mode,
                     sink_size=sink_size)
 
+    def build(self, train_dataset=None, valid_dataset=None, sink_size=-1):
+        """
+        Build computational graphs and data graphs with the sink mode.
+
+        .. warning::
+            This is an experimental prototype that is subject to change and/or deletion.
+
+        Note:
+            Pre-build process only supports `GRAPH_MODE` and `Ascend` target currently.
+            The interface builds the computational graphs, when the interface is executed first,
+            'model.train' only performs the graphs execution.
+            It only support dataset sink mode.
+
+        Args:
+            train_dataset (Dataset): A training dataset iterator. If `train_dataset` is defined, training graphs will be
+                                     initialized. Default: None.
+            valid_dataset (Dataset): An evaluating dataset iterator. If `valid_dataset` is defined, evaluation graphs
+                                     will be initialized, and `metrics` in `Model` can not be None. Default: None.
+            sink_size (int): Control the amount of data in each sink. Default: -1.
+
+        Examples:
+            >>> from mindspore import Model, nn, FixedLossScaleManager
+            >>>
+            >>> # For details about how to build the dataset, please refer to the tutorial
+            >>> # document on the official website.
+            >>> dataset = create_custom_dataset()
+            >>> net = Net()
+            >>> loss = nn.SoftmaxCrossEntropyWithLogits()
+            >>> loss_scale_manager = FixedLossScaleManager()
+            >>> optim = nn.Momentum(params=net.trainable_params(), learning_rate=0.1, momentum=0.9)
+            >>> model = Model(net, loss_fn=loss, optimizer=optim, metrics=None, loss_scale_manager=loss_scale_manager)
+            >>> model.build(dataset)
+            >>> model.train(2, dataset)
+        """
+        self._init(train_dataset, valid_dataset, sink_size)
+
     def _eval_dataset_sink_process(self, valid_dataset, list_callback=None, cb_params=None):
         """
         Evaluation. The data would be passed to network through dataset channel.
diff --git a/mindspore/train/serialization.py b/mindspore/train/serialization.py
index dea204c29ff..671bf02cd85 100644
--- a/mindspore/train/serialization.py
+++ b/mindspore/train/serialization.py
@@ -690,7 +690,8 @@ def export(net, *inputs, file_name, file_format='AIR', **kwargs):
     Export the MindSpore prediction model to a file in the specified format.
 
     Note:
-        When exporting to AIR、ONNX format, the size of a single tensor can not exceed 2GB.
+        1. When exporting to AIR、ONNX format, the size of a single tensor can not exceed 2GB.
+        2. When `file_name` does not have a suffix, the system will automatically add according to the `file_format`.
 
     Args:
         net (Cell): MindSpore network.
@@ -699,12 +700,9 @@ def export(net, *inputs, file_name, file_format='AIR', **kwargs):
         file_format (str): MindSpore currently supports 'AIR', 'ONNX' and 'MINDIR' format for exported model.
 
             - AIR: Ascend Intermediate Representation. An intermediate representation format of Ascend model.
-              Recommended suffix for output file is '.air'.
             - ONNX: Open Neural Network eXchange. An open format built to represent machine learning models.
-              Recommended suffix for output file is '.onnx'.
             - MINDIR: MindSpore Native Intermediate Representation for Anf. An intermediate representation format
               for MindSpore models.
-              Recommended suffix for output file is '.mindir'.
 
         kwargs (dict): Configuration options dictionary.
 
@@ -826,7 +824,6 @@ def _save_mindir(net, file_name, *inputs, **kwargs):
         if os.path.exists(data_path):
             shutil.rmtree(data_path)
         os.makedirs(data_path, exist_ok=True)
-        os.chmod(data_path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
         index = 0
         graphproto = graph_proto()
         data_size = 0
@@ -1191,8 +1188,7 @@ def merge_sliced_parameter(sliced_parameters, strategy=None):
 
     Examples:
         >>> import numpy as np
-        >>> from mindspore import Tensor, merge_sliced_parameter
-        >>> from mindspore.common.parameter import Parameter
+        >>> from mindspore import Tensor, merge_sliced_parameter, Parameter
         >>>
         >>> sliced_parameters = [
         ...                      Parameter(Tensor(np.array([0.00023915, 0.00013939, -0.00098059])),
diff --git a/mindspore/train/train_thor/convert_utils.py b/mindspore/train/train_thor/convert_utils.py
index 7ce34a94b95..26ef00045ac 100644
--- a/mindspore/train/train_thor/convert_utils.py
+++ b/mindspore/train/train_thor/convert_utils.py
@@ -20,7 +20,7 @@ import mindspore.common.dtype as mstype
 from mindspore import context
 
 
-class ConvertNetUtils():
+class ConvertNetUtils:
     """
     Convert net to thor layer net
     """
@@ -29,7 +29,6 @@ class ConvertNetUtils():
                                     nn.Embedding: ConvertNetUtils._convert_embedding,
                                     nn.Conv2d: ConvertNetUtils._convert_conv2d}
 
-
     @staticmethod
     def _convert_dense(subcell):
         """
@@ -64,7 +63,6 @@ class ConvertNetUtils():
             new_subcell.bias = subcell.bias
         return new_subcell
 
-
     @staticmethod
     def _convert_embedding(subcell):
         """
@@ -76,7 +74,6 @@ class ConvertNetUtils():
         new_subcell.embedding_table = subcell.embedding_table
         return new_subcell
 
-
     @staticmethod
     def _convert_conv2d(subcell):
         """
@@ -95,7 +92,6 @@ class ConvertNetUtils():
                                     has_bias=has_bias, weight_init=weight)
         return new_subcell
 
-
     def _convert_to_thor_net(self, net):
         """
         Convert net to thor net
@@ -114,9 +110,6 @@ class ConvertNetUtils():
             elif isinstance(subcell, (nn.Embedding, nn.Dense, nn.Conv2d)):
                 prefix = subcell.param_prefix
                 new_subcell = self._convert_method_map[type(subcell)](subcell)
-                print("subcell name: ", name, "prefix is", prefix, flush=True)
-                if isinstance(new_subcell, (nn.DenseThor, nn.EmbeddingThor, nn.Conv2dThor)):
-                    print("convert to thor layer success.", flush=True)
                 new_subcell.update_parameters_name(prefix + '.')
                 net.insert_child_to_cell(name, new_subcell)
                 change = True
@@ -124,10 +117,8 @@ class ConvertNetUtils():
                 self._convert_to_thor_net(subcell)
 
         if isinstance(net, nn.SequentialCell) and change:
-            print("is nn.SequentialCell and change")
             net.cell_list = list(net.cells())
 
-
     def convert_to_thor_net(self, net):
         """
         This interface is used to convert a network to thor layer network, in order to calculate and store the
@@ -152,7 +143,7 @@ class ConvertNetUtils():
         net.update_cell_type("second-order")
 
 
-class ConvertModelUtils():
+class ConvertModelUtils:
     """
     Convert model to thor model.
     """
@@ -203,7 +194,7 @@ class ConvertModelUtils():
             ...            frequency=100)
             >>> model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_manager, metrics={"acc"},
             ...               amp_level="O2", keep_batchnorm_fp32=False)
-            >>> model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=opt,
+            >>> model = ConvertModelUtils.convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=opt,
             ...                                                   metrics={'acc'}, amp_level="O2",
             ...                                                   loss_scale_manager=loss_manager,
             ...                                                   keep_batchnorm_fp32=False)
diff --git a/model_zoo/README.md b/model_zoo/README.md
index 93c786b0b12..5a1c7fdf09d 100644
--- a/model_zoo/README.md
+++ b/model_zoo/README.md
@@ -113,3 +113,9 @@ MindSpore is Apache 2.0 licensed. Please see the LICENSE file.
 ## License
 
 [Apache License 2.0](https://gitee.com/mindspore/mindspore/blob/master/LICENSE)
+
+## FAQ
+
+- **Q: How to resolve the lack of memory while using `PYNATIVE_MODE` with errors such as *Failed to alloc memory pool memory*?**
+
+  **A**: `PYNATIVE_MODE` usually requires more memory than `GRAPH_MODE`, especially in training process which have to deal with back propagation. You could try using smaller batch size.
diff --git a/model_zoo/README_CN.md b/model_zoo/README_CN.md
index 2c64e2bc521..7becc6aa44d 100644
--- a/model_zoo/README_CN.md
+++ b/model_zoo/README_CN.md
@@ -113,3 +113,9 @@ MindSpore已获得Apache 2.0许可，请参见LICENSE文件。
 ## 许可证
 
 [Apache 2.0许可证](https://gitee.com/mindspore/mindspore/blob/master/LICENSE)
+
+## FAQ
+
+- **Q: 使用`PYNATIVE_MODE`运行模型出现错误内存不足，例如*Failed to alloc memory pool memory*, 该怎么处理?**
+
+  **A**: `PYNATIVE_MODE`通常比`GRAPH_MODE`使用更多内存，尤其是在需要进行反向传播计算的训练图中，你可以尝试使用一些更小的batch size.
diff --git a/model_zoo/official/cv/FCN8s/gpu_default_config.yaml b/model_zoo/official/cv/FCN8s/gpu_default_config.yaml
index 86834c491fc..e2d24840ac5 100644
--- a/model_zoo/official/cv/FCN8s/gpu_default_config.yaml
+++ b/model_zoo/official/cv/FCN8s/gpu_default_config.yaml
@@ -21,6 +21,7 @@ image_std: [57.375, 57.120, 58.395]
 ignore_label: 255
 num_classes: 21
 model: "FCN8s"
+parallel_mode: "data_parallel"
 
 # ======================================================================================
 # Training options
diff --git a/model_zoo/official/cv/centerface/README.md b/model_zoo/official/cv/centerface/README.md
index 00be4179e63..fef31b6d2ec 100644
--- a/model_zoo/official/cv/centerface/README.md
+++ b/model_zoo/official/cv/centerface/README.md
@@ -151,10 +151,7 @@ ls ./dataset/centerface/images/train/images # img_dir
 
     ```python
     # enter script dir, train CenterFace
-    bash train_distribute_gpu.sh
-    # after training
-    mkdir ./model
-    cp train_distribute_gpu/output/*/*.ckpt ./model # cp model to [MODEL_PATH]
+    bash train_distribute_gpu.sh [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [PRETRAINED_BACKBONE] [ANNOTATIONS] [DATASET]
     ```
 
 step5: test
@@ -186,7 +183,7 @@ ls ./dataset/centerface/ground_truth/val.mat # annot_path
 
     ```bash
     # test CenterFace
-    bash test_distribute GPU
+    bash test_distribute_gpu.sh [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [CKPT_PATH] [DATASET] [GROUND_TRUTH_MAT]
     ```
 
 step6: eval
@@ -321,10 +318,14 @@ bash eval_all.sh [ground_truth_path]
         ├── scripts
         │   ├──run_infer_310.sh          // shell script for infer on ascend310
         │   ├──eval.sh                   // evaluate a single testing result
+        │   ├──eval.sh                   // evaluate a single testing result
         │   ├──eval_all.sh               // choose a range of testing results to evaluate
         │   ├──test.sh                   // testing a single model
+        │   ├──test_gpu.sh               // testing a single model on GPU
         │   ├──test_distribute.sh        // testing a range of models
+        │   ├──test_distribute_gpu.sh    // testing a range of models on GPU
         │   ├──test_and_eval.sh          // test then evaluate a single model
+        │   ├──test_and_eval_gpu.sh      // test then evaluate a single model on GPU
         │   ├──train_standalone.sh       // train in ascend with single npu
         │   ├──train_standalone_gpu.sh   // train on GPU with single npu
         │   ├──train_distribute.sh       // train in ascend with multi npu
@@ -519,12 +520,9 @@ Major parameters eval.py as follows:
     # or use the command as follow:
     #   USE_DEVICE_ID: your device
     #   PRETRAINED_BACKBONE: your pretrained model path
-    #   DATASET: dataset path
     #   ANNOTATIONS: annotation path
-    #   images: img_dir in dataset path
-    bash train_standalone_gpu.sh [USE_DEVICE_ID] [PRETRAINED_BACKBONE] [DATASET] [ANNOTATIONS] [IMAGES]
-    # after training
-    cp train_standalone_gpu/output/*/*.ckpt [MODEL_PATH]
+    #   DATASET: image dataset path
+    bash train_standalone_gpu.sh [USE_DEVICE_ID] [PRETRAINED_BACKBONE] [ANNOTATIONS] [DATASET]
     ```
 
     - Multi-device (recommended)
@@ -536,9 +534,7 @@ Major parameters eval.py as follows:
     # or use symbolic link as quick start
     # or use the command as follow, most are the same as train_standalone_gpu.sh, the different is DEVICE_NUM
     #   DEVICE_NUM: for multi-device only, number of devices
-    bash train_distribute_gpu.sh [DEVICE_NUM] [PRETRAINED_BACKBONE] [DATASET] [ANNOTATIONS] [IMAGES]
-    # after training
-    cp train_distribute_gpu/output/*/*.ckpt [MODEL_PATH]
+    bash train_distribute_gpu.sh [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [PRETRAINED_BACKBONE] [ANNOTATIONS] [DATASET]
     ```
 
     After training with 8 device, the loss value will be achieved as follows:
@@ -581,15 +577,21 @@ mkdir [SAVE_PATH]
     ```python
     # you need to change the parameter in test.sh
     # or use symbolic link as quick start
+    - On Ascend
     # or use the command as follow:
-    #   DEVICE_TARGET: device where the code will be implemented. Either Ascend or GPU (default: Ascend)
     #   MODEL_PATH: ckpt path saved during training
     #   DATASET: img dir
     #   GROUND_TRUTH_MAT: ground_truth file, mat type
     #   SAVE_PATH: save_path for evaluate
     #   DEVICE_ID: use device id
     #   CKPT: test model name
-    bash test.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID] [CKPT]
+    bash test.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID] [CKPT]
+    - On GPU
+    # or use the command as follow:
+    #   CKPT: test model name
+    #   DATASET: img dir
+    #   GROUND_TRUTH_MAT: ground_truth file, mat type
+    bash test_gpu.sh [DEVICE_ID] [CKPT] [DATASET] [GROUND_TRUTH_MAT]
     ```
 
 2. test many out ckpt for user to choose the best one
@@ -597,13 +599,20 @@ mkdir [SAVE_PATH]
     ```python
     # you need to change the parameter in test.sh
     # or use symbolic link as quick start
+    - On Ascend
     # or use the command as follow, most are the same as test.sh, the different are:
-    #   DEVICE_TARGET: device where the code will be implemented. Either Ascend or GPU (default: Ascend)
     #   DEVICE_NUM: training device number
     #   STEPS_PER_EPOCH: steps for each epoch
     #   START: start loop number, used to calculate first epoch number
     #   END: end loop number, used to calculate last epoch number
-    bash test_distribute.sh [DEVICE_TARGET][MODEL_PATH] [DATASET][GROUND_TRUTH_MAT] [SAVE_PATH][DEVICE_NUM] [STEPS_PER_EPOCH][START] [END]
+    bash test_distribute.sh [MODEL_PATH] [DATASET][GROUND_TRUTH_MAT] [SAVE_PATH][DEVICE_NUM] [STEPS_PER_EPOCH][START] [END]
+    - On GPU
+    # or use the command as follow, most are the same as test.sh, the different are:
+    #   DEVICE_NUM: training device number
+    #   CKPT_PATH: test model path
+    #   DATASET: img dir
+    #   GROUND_TRUTH_MAT: ground_truth file, mat type
+    bash test_distribute_gpu.sh [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [CKPT_PATH] [DATASET] [GROUND_TRUTH_MAT]
     ```
 
 =======
@@ -648,11 +657,14 @@ cd ../../../scripts;
 3. test+eval
 
     ```python
+    - On Ascend
     # you need to change the parameter in test_and_eval.sh
     # or use symbolic link as quick start, default eval the ckpt saved in ./scripts/output/centerface/999
     # or use the command as follow, most are the same as test.sh, the different are:
     #   GROUND_TRUTH_PATH: ground truth path
-    bash test_and_eval.sh [DEVICE_TARGET][MODEL_PATH] [DATASET][GROUND_TRUTH_MAT] [SAVE_PATH][CKPT] [GROUND_TRUTH_PATH]
+    bash test_and_eval.sh [MODEL_PATH] [DATASET][GROUND_TRUTH_MAT] [SAVE_PATH][CKPT] [GROUND_TRUTH_PATH]
+    - On GPU
+    bash test_and_eval_gpu.sh [DEVICE_ID] [CKPT] [DATASET] [GROUND_TRUTH_MAT]
     ```
 
 - Running on Ascend
diff --git a/model_zoo/official/cv/centerface/dependency/evaluate/eval.py b/model_zoo/official/cv/centerface/dependency/evaluate/eval.py
index 031aa1497b5..b565cce4028 100644
--- a/model_zoo/official/cv/centerface/dependency/evaluate/eval.py
+++ b/model_zoo/official/cv/centerface/dependency/evaluate/eval.py
@@ -39,7 +39,7 @@ from bbox import bbox_overlaps
 def get_gt_boxes(gt_dir):
     """ gt dir: (wider_face_val.mat, wider_easy_val.mat, wider_medium_val.mat, wider_hard_val.mat)"""
 
-    gt_mat = loadmat(os.path.join(gt_dir, 'val.mat')) # you own ground_truth name
+    gt_mat = loadmat(os.path.join(gt_dir, 'wider_face_val.mat')) # you own ground_truth name
     hard_mat = loadmat(os.path.join(gt_dir, 'wider_hard_val.mat'))
     medium_mat = loadmat(os.path.join(gt_dir, 'wider_medium_val.mat'))
     easy_mat = loadmat(os.path.join(gt_dir, 'wider_easy_val.mat'))
diff --git a/model_zoo/official/cv/centerface/scripts/eval.sh b/model_zoo/official/cv/centerface/scripts/eval.sh
index 8f2a65a5e17..e390bfc7f39 100644
--- a/model_zoo/official/cv/centerface/scripts/eval.sh
+++ b/model_zoo/official/cv/centerface/scripts/eval.sh
@@ -16,7 +16,19 @@
 
 root=$PWD
 save_path=$root/output/centerface/
+if [ ! -d $save_path ]
+then
+    echo "error: save_path=$save_path is not a dir"
+exit 1
+fi
+
 ground_truth_path=$1
+if [ ! -d $ground_truth_path ]
+then
+    echo "error: ground_truth_path=$ground_truth_path is not a dir"
+exit 1
+fi
+
 echo "start eval"
 python ../dependency/evaluate/eval.py --pred=$save_path --gt=$ground_truth_path
 echo "end eval"
diff --git a/model_zoo/official/cv/centerface/scripts/eval_all.sh b/model_zoo/official/cv/centerface/scripts/eval_all.sh
index a38c137cab7..816cd5ec174 100644
--- a/model_zoo/official/cv/centerface/scripts/eval_all.sh
+++ b/model_zoo/official/cv/centerface/scripts/eval_all.sh
@@ -16,7 +16,19 @@
 
 root=$PWD
 save_path=$root/output/centerface/
+if [ ! -d $save_path ]
+then
+    echo "error: save_path=$save_path is not a dir"
+exit 1
+fi
+
 ground_truth_path=$1
+if [ ! -d $ground_truth_path ]
+then
+    echo "error: ground_truth_path=$ground_truth_path is not a dir"
+exit 1
+fi
+
 #for i in $(seq start_epoch end_epoch+1)
 for i in $(seq 89 200)
 do
diff --git a/model_zoo/official/cv/centerface/scripts/test.sh b/model_zoo/official/cv/centerface/scripts/test.sh
index ee719554631..4d623bd608f 100644
--- a/model_zoo/official/cv/centerface/scripts/test.sh
+++ b/model_zoo/official/cv/centerface/scripts/test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2020-21 Huawei Technologies Co., Ltd
+# Copyright 2020 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,15 +14,14 @@
 # limitations under the License.
 # ============================================================================
 
-if [ $# -gt 7 ]
+if [ $# -gt 6 ]
 then
-    echo "Usage: sh test.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID] [CKPT]"
-    echo "   or: sh test.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID]"
-    echo "   or: sh test.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH]"
-    echo "   or: sh test.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT]"
-    echo "   or: sh test.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET]"
-    echo "   or: sh test.sh [DEVICE_TARGET] [MODEL_PATH]"
-    echo "   or: sh test.sh [DEVICE_TARGET]"
+    echo "Usage: sh test.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID] [CKPT]"
+    echo "   or: sh test.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID]"
+    echo "   or: sh test.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH]"
+    echo "   or: sh test.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT]"
+    echo "   or: sh test.sh [MODEL_PATH] [DATASET]"
+    echo "   or: sh test.sh [MODEL_PATH]"
     echo "   or: sh test.sh "
 exit 1
 fi
@@ -51,43 +50,32 @@ dataset_root=$root/dataset
 dataset_path=$dataset_root/centerface/images/val/images/
 ground_truth_mat=$dataset_root/centerface/ground_truth/val.mat
 save_path=$root/output/centerface/
-device_target="Ascend"
 device_id=0
-ckpt="0-140_221620.ckpt" # the model saved for epoch=140
+ckpt="0-125_24750.ckpt" # the model saved for epoch=125
 
-if [ $# -ge 1 ]
+if [ $# == 1 ]
 then
-    device_target="$1"
-    if [ "$device_target" != "Ascend" ] && [ "$device_target" != "GPU" ]
-    then
-        echo "error: device_target=$device_target is not a valid option (Ascend or GPU)"
-    exit 1
-    fi
-fi
-
-if [ $# -ge 2 ]
-then
-    model_path=$(get_real_path $2)
-    if [ ! -d $model_path ]
+    model_path=$(get_real_path $1)
+    if [ ! -f $model_path ]
     then
         echo "error: model_path=$model_path is not a file"
     exit 1
     fi
 fi
 
-if [ $# -ge 3 ]
+if [ $# == 2 ]
 then
-    dataset_path=$(get_real_path $3)
-    if [ ! -d $dataset_path ]
+    dataset_path=$(get_real_path $2)
+    if [ ! -f $dataset_path ]
     then
         echo "error: dataset_path=$dataset_path is not a file"
     exit 1
     fi
 fi
 
-if [ $# -ge 4 ]
+if [ $# == 3 ]
 then
-    ground_truth_mat=$(get_real_path $4)
+    ground_truth_mat=$(get_real_path $3)
     if [ ! -f $ground_truth_mat ]
     then
         echo "error: ground_truth_mat=$ground_truth_mat is not a file"
@@ -95,24 +83,24 @@ then
     fi
 fi
 
-if [ $# -ge 5 ]
+if [ $# == 4 ]
 then
-    save_path=$(get_real_path $5)
-    if [ ! -d $save_path ]
+    save_path=$(get_real_path $4)
+    if [ ! -f $save_path ]
     then
         echo "error: save_path=$save_path is not a file"
     exit 1
     fi
 fi
 
-if [ $# -ge 6 ]
+if [ $# == 5 ]
 then
-    device_id=$6
+    device_id=$5
 fi
 
-if [ $# == 7 ]
+if [ $# == 6 ]
 then
-    ckpt=$7
+    ckpt=$6
 fi
 
 echo $model_path
@@ -138,7 +126,6 @@ python ${dirname_path}/${SCRIPT_NAME} \
     --ground_truth_mat=$ground_truth_mat \
     --save_dir=$save_path \
     --rank=$device_id \
-    --device_target=$device_target \
     --ckpt_name=$ckpt > test.log  2>&1 &
 
 echo 'running'
diff --git a/model_zoo/official/cv/centerface/scripts/test_and_eval.sh b/model_zoo/official/cv/centerface/scripts/test_and_eval.sh
index e52e0a59fae..6a6e1ea4f34 100644
--- a/model_zoo/official/cv/centerface/scripts/test_and_eval.sh
+++ b/model_zoo/official/cv/centerface/scripts/test_and_eval.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2020-21 Huawei Technologies Co., Ltd
+# Copyright 2020 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,16 +14,15 @@
 # limitations under the License.
 # ============================================================================
 
-if [ $# -gt 8 ]
+if [ $# -gt 6 ]
 then
-    echo "Usage: sh test_and_eval.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID] [CKPT] [GROUND_TRUTH_PATH]"
-    echo "   or: sh test_and_eval.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID] [CKPT]"
-    echo "   or: sh test_and_eval.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID]"
-    echo "   or: sh test_and_eval.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH]"
-    echo "   or: sh test_and_eval.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT]"
-    echo "   or: sh test_and_eval.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET]"
-    echo "   or: sh test_and_eval.sh [DEVICE_TARGET] [MODEL_PATH]"
-    echo "   or: sh test_and_eval.sh [DEVICE_TARGET]"
+    echo "Usage: sh test_and_eval.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID] [CKPT] [GROUND_TRUTH_PATH]"
+    echo "   or: sh test_and_eval.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID] [CKPT]"
+    echo "   or: sh test_and_eval.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID]"
+    echo "   or: sh test_and_eval.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH]"
+    echo "   or: sh test_and_eval.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT]"
+    echo "   or: sh test_and_eval.sh [MODEL_PATH] [DATASET]"
+    echo "   or: sh test_and_eval.sh [MODEL_PATH]"
     echo "   or: sh test_and_eval.sh "
 exit 1
 fi
@@ -52,24 +51,14 @@ dataset_root=$root/dataset
 dataset_path=$dataset_root/centerface/images/val/images/
 ground_truth_mat=$dataset_root/centerface/ground_truth/val.mat
 save_path=$root/output/centerface/999
-device_target="Ascend"
 device_id=0
-ckpt="0-140_221620.ckpt" # the model saved for epoch=125
+ckpt="0-125_24750.ckpt" # the model saved for epoch=125
 ground_truth_path=$root/dataset/centerface/ground_truth
 
 if [ $# -ge 1 ]
 then
-    device_target="$1"
-    if [ "$device_target" != "Ascend" ] && [ "$device_target" != "GPU" ]
-    then
-        echo "error: device_target=$device_target is not a valid option (Ascend or GPU)"
-    exit 1
-    fi
-fi
-
-if [ $# -ge 2 ]
-then
-    model_path=$(get_real_path $2)
+    model_path=$(get_real_path $1)
+#    if [ ! -f $model_path ]
     if [ ! -d $model_path ]
     then
         echo "error: model_path=$model_path is not a dir"
@@ -77,9 +66,9 @@ then
     fi
 fi
 
-if [ $# -ge 3 ]
+if [ $# -ge 2 ]
 then
-    dataset_path=$(get_real_path $3)
+    dataset_path=$(get_real_path $2)
     if [ ! -d $dataset_path ]
     then
         echo "error: dataset_path=$dataset_path is not a dir"
@@ -87,9 +76,9 @@ then
     fi
 fi
 
-if [ $# -ge 4 ]
+if [ $# -ge 3 ]
 then
-    ground_truth_mat=$(get_real_path $4)
+    ground_truth_mat=$(get_real_path $3)
     if [ ! -f $ground_truth_mat ]
     then
         echo "error: ground_truth_mat=$ground_truth_mat is not a file"
@@ -97,9 +86,9 @@ then
     fi
 fi
 
-if [ $# -ge 5 ]
+if [ $# -ge 4 ]
 then
-    save_path=$(get_real_path $5)
+    save_path=$(get_real_path $4)
     if [ ! -d $save_path ]
     then
         echo "error: save_path=$save_path is not a dir"
@@ -107,19 +96,19 @@ then
     fi
 fi
 
+if [ $# -ge 5 ]
+then
+    device_id=$5
+fi
+
 if [ $# -ge 6 ]
 then
-    device_id=$6
+    ckpt=$6
 fi
 
 if [ $# -ge 7 ]
 then
-    ckpt=$7
-fi
-
-if [ $# == 8 ]
-then
-    ground_truth_path=$(get_real_path $8)
+    ground_truth_path=$(get_real_path $7)
     if [ ! -f $ground_truth_path ]
     then
         echo "error: ground_truth_path=$ground_truth_path is not a file"
@@ -153,7 +142,6 @@ python ${dirname_path}/${SCRIPT_NAME} \
     --rank=$device_id \
     --ckpt_name=$ckpt \
     --eval=1 \
-    --device_target=$device_target \
     --ground_truth_path=$ground_truth_path > test.log  2>&1 &
 
 echo 'running'
diff --git a/model_zoo/official/cv/centerface/scripts/test_distribute.sh b/model_zoo/official/cv/centerface/scripts/test_distribute.sh
index d14c84df6c8..3cfc82934e4 100644
--- a/model_zoo/official/cv/centerface/scripts/test_distribute.sh
+++ b/model_zoo/official/cv/centerface/scripts/test_distribute.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2020-21 Huawei Technologies Co., Ltd
+# Copyright 2020 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,19 +14,18 @@
 # limitations under the License.
 # ============================================================================
 
-if [ $# -gt 9 ]
+if [ $# -gt 8 ]
 then
-    echo "Usage: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM] [STEPS_PER_EPOCH] [START] [END]"
-    echo "   or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM] [STEPS_PER_EPOCH] [START]"
-    echo "   or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM] [STEPS_PER_EPOCH]"
-    echo "   or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM]"
-    echo "   or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM]"
-    echo "   or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH]"
-    echo "   or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT]"
-    echo "   or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET]"
-    echo "   or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET]"
-    echo "   or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH]"
-    echo "   or: sh test_distribute.sh [DEVICE_TARGET]"
+    echo "Usage: sh test_distribute.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM] [STEPS_PER_EPOCH] [START] [END]"
+    echo "   or: sh test_distribute.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM] [STEPS_PER_EPOCH] [START]"
+    echo "   or: sh test_distribute.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM] [STEPS_PER_EPOCH]"
+    echo "   or: sh test_distribute.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM]"
+    echo "   or: sh test_distribute.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM]"
+    echo "   or: sh test_distribute.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH]"
+    echo "   or: sh test_distribute.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT]"
+    echo "   or: sh test_distribute.sh [MODEL_PATH] [DATASET]"
+    echo "   or: sh test_distribute.sh [MODEL_PATH] [DATASET]"
+    echo "   or: sh test_distribute.sh [MODEL_PATH]"
     echo "   or: sh test_distribute.sh "
 exit 1
 fi
@@ -59,7 +58,6 @@ save_path=$root/output/centerface/
 # model/ckpt name is "0-" + str(ckpt_num) + "_" + str(198*ckpt_num) + ".ckpt";
 # ckpt_num is epoch number, can be calculated by device_num
 # detail can be found in "test.py"
-device_target="Ascend"
 device_num=8
 steps_per_epoch=198 #198 for 8P; 1583 for 1p
 start=11 # start epoch number = start * device_num + min(device_phy_id) + 1
@@ -67,17 +65,8 @@ end=18 # end epoch number = end * device_num + max(device_phy_id) + 1
 
 if [ $# -ge 1 ]
 then
-    device_target="$1"
-    if [ "$device_target" != "Ascend" ] && [ "$device_target" != "GPU" ]
-    then
-        echo "error: device_target=$device_target is not a valid option (Ascend or GPU)"
-    exit 1
-    fi
-fi
-
-if [ $# -ge 2 ]
-then
-    model_path=$(get_real_path $2)
+    model_path=$(get_real_path $1)
+#    if [ ! -f $model_path ]
     if [ ! -d $model_path ]
     then
         echo "error: model_path=$model_path is not a dir"
@@ -85,9 +74,9 @@ then
     fi
 fi
 
-if [ $# -ge 3 ]
+if [ $# -ge 2 ]
 then
-    dataset_path=$(get_real_path $3)
+    dataset_path=$(get_real_path $2)
     if [ ! -d $dataset_path ]
     then
         echo "error: dataset_path=$dataset_path is not a dir"
@@ -95,9 +84,9 @@ then
     fi
 fi
 
-if [ $# -ge 4 ]
+if [ $# -ge 3 ]
 then
-    ground_truth_mat=$(get_real_path $4)
+    ground_truth_mat=$(get_real_path $3)
     if [ ! -f $ground_truth_mat ]
     then
         echo "error: ground_truth_mat=$ground_truth_mat is not a file"
@@ -105,9 +94,9 @@ then
     fi
 fi
 
-if [ $# -ge 5 ]
+if [ $# -ge 4 ]
 then
-    save_path=$(get_real_path $5)
+    save_path=$(get_real_path $4)
     if [ ! -d $save_path ]
     then
         echo "error: save_path=$save_path is not a dir"
@@ -115,24 +104,24 @@ then
     fi
 fi
 
+if [ $# -ge 5 ]
+then
+    device_num=$5
+fi
+
 if [ $# -ge 6 ]
 then
-    device_num=$6
+    steps_per_epoch=$6
 fi
 
 if [ $# -ge 7 ]
 then
-    steps_per_epoch=$7
+    start=$7
 fi
 
-if [ $# -ge 8 ]
+if [ $# == 8 ]
 then
-    start=$8
-fi
-
-if [ $# == 9 ]
-then
-    end=$9
+    end=$8
 fi
 
 echo $model_path
@@ -161,7 +150,6 @@ do
         --save_dir=$save_path \
         --rank=$i \
         --device_num=$device_num \
-        --device_target=$device_target \
         --steps_per_epoch=$steps_per_epoch \
         --start=$start \
         --end=$end > test.log  2>&1 &
diff --git a/model_zoo/official/cv/centerface/scripts/train_distribute_gpu.sh b/model_zoo/official/cv/centerface/scripts/train_distribute_gpu.sh
index c8b626de9ee..3abd6008ff9 100644
--- a/model_zoo/official/cv/centerface/scripts/train_distribute_gpu.sh
+++ b/model_zoo/official/cv/centerface/scripts/train_distribute_gpu.sh
@@ -14,15 +14,10 @@
 # limitations under the License.
 # ============================================================================
 
-if [ $# != 0 ] && [ $# != 1 ] && [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ]
+if [ $# != 5 ]
 then
-    echo "Usage: sh train_distribute_gpu.sh [DEVICE_NUM] [PRETRAINED_BACKBONE] [DATASET] [ANNOTATIONS] [IMAGES]"
-    echo "   or: sh train_distribute_gpu.sh [DEVICE_NUM] [PRETRAINED_BACKBONE] [DATASET] [ANNOTATIONS]"
-    echo "   or: sh train_distribute_gpu.sh [DEVICE_NUM] [PRETRAINED_BACKBONE] [DATASET]"
-    echo "   or: sh train_distribute_gpu.sh [DEVICE_NUM] [PRETRAINED_BACKBONE]"
-    echo "   or: sh train_distribute_gpu.sh [DEVICE_NUM]"
-    echo "   or: sh train_distribute_gpu.sh "
-exit 1
+    echo "Usage: bash train_distribute_gpu.sh [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [PRETRAINED_BACKBONE] [ANNOTATIONS] [DATASET]"
+    exit 1
 fi
 
 get_real_path(){
@@ -44,73 +39,48 @@ SCRIPT_NAME='train.py'
 
 ulimit -c unlimited
 
-root=${current_exec_path} # your script path
-pretrained_backbone=${dirname_path}/mobilenet_v2.ckpt # or mobilenet_v2-b0353104.ckpt
-dataset_path=$root/dataset/centerface
-annot_path=$dataset_path/annotations/train.json
-img_dir=$dataset_path/images/train/images
-num_devices=8
-
-if [ $# == 1 ]
+if [ $1 -lt 1 ] && [ $1 -gt 8 ]
 then
-    num_devices=$1
+    echo "error: DEVICE_NUM=$1 is not in (1-8)"
+    exit 1
 fi
 
-if [ $# == 2 ]
+export CUDA_VISIBLE_DEVICES="$2"
+
+pretrained_backbone=$(get_real_path $3)
+if [ ! -f $pretrained_backbone ]
 then
-    pretrained_backbone=$(get_real_path $2)
-    if [ ! -f $pretrained_backbone ]
-    then
-        echo "error: pretrained_backbone=$pretrained_backbone is not a file"
+    echo "error: pretrained_backbone=$pretrained_backbone is not a file"
     exit 1
-    fi
 fi
 
-if [ $# == 3 ]
+annot_path=$(get_real_path $4)
+if [ ! -f $annot_path ]
 then
-    dataset_path=$(get_real_path $3)
-    if [ ! -f $dataset_path ]
-    then
-        echo "error: dataset_path=$dataset_path is not a file"
+    echo "error: annot_path=$annot_path is not a file"
     exit 1
-    fi
 fi
 
-if [ $# == 4 ]
+dataset_path=$(get_real_path $5)
+if [ ! -d $dataset_path ]
 then
-    annot_path=$(get_real_path $4)
-    if [ ! -f $annot_path ]
-    then
-        echo "error: annot_path=$annot_path is not a file"
+    echo "error: dataset_path=$dataset_path is not a dir"
     exit 1
-    fi
-fi
-
-if [ $# == 5 ]
-then
-    img_dir=$(get_real_path $5)
-    if [ ! -f $img_dir ]
-    then
-        echo "error: img_dir=$img_dir is not a file"
-    exit 1
-    fi
 fi
 
 echo $pretrained_backbone
-echo $dataset_path
 echo $annot_path
-echo $img_dir
+echo $dataset_path
 
 export PYTHONPATH=${dirname_path}:$PYTHONPATH
-export RANK_SIZE=$num_devices
-export DEVICE_ID=0
+export RANK_SIZE=$1
 
 echo "start training on $RANK_SIZE devices"
 
 mkdir ${current_exec_path}/train_distribute_gpu
 cd ${current_exec_path}/train_distribute_gpu || exit
 
-mpirun -n $RANK_SIZE \
+mpirun -n $1 \
     python ${dirname_path}/${SCRIPT_NAME} \
     --lr=4e-3 \
     --per_batch_size=8 \
@@ -123,10 +93,8 @@ mpirun -n $RANK_SIZE \
     --weight_decay=0.0000 \
     --loss_scale=1024 \
     --pretrained_backbone=$pretrained_backbone \
-    --data_dir=$dataset_path \
     --annot_path=$annot_path \
-    --img_dir=$img_dir \
+    --img_dir=$dataset_path \
     --device_target="GPU" > train.log  2>&1 &
 
-
 echo 'running'
diff --git a/model_zoo/official/cv/centerface/scripts/train_standalone_gpu.sh b/model_zoo/official/cv/centerface/scripts/train_standalone_gpu.sh
index 6a187d66936..fda44d38d7a 100644
--- a/model_zoo/official/cv/centerface/scripts/train_standalone_gpu.sh
+++ b/model_zoo/official/cv/centerface/scripts/train_standalone_gpu.sh
@@ -14,15 +14,10 @@
 # limitations under the License.
 # ============================================================================
 
-if [ $# != 0 ] && [ $# != 1 ] && [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ]
+if [ $# != 1 ]
 then
-    echo "Usage: sh train_standalone_gpu.sh [USE_DEVICE_ID] [PRETRAINED_BACKBONE] [DATASET] [ANNOTATIONS] [IMAGES]"
-    echo "   or: sh train_standalone_gpu.sh [USE_DEVICE_ID] [PRETRAINED_BACKBONE] [DATASET] [ANNOTATIONS]"
-    echo "   or: sh train_standalone_gpu.sh [USE_DEVICE_ID] [PRETRAINED_BACKBONE] [DATASET]"
-    echo "   or: sh train_standalone_gpu.sh [USE_DEVICE_ID] [PRETRAINED_BACKBONE]"
-    echo "   or: sh train_standalone_gpu.sh [USE_DEVICE_ID]"
-    echo "   or: sh train_standalone_gpu.sh "
-exit 1
+    echo "Usage: bash train_standalone_gpu.sh [USE_DEVICE_ID] [PRETRAINED_BACKBONE] [ANNOTATIONS] [DATASET]"
+    exit 1
 fi
 
 get_real_path(){
@@ -43,89 +38,48 @@ SCRIPT_NAME='train.py'
 
 ulimit -c unlimited
 
-root=${current_exec_path} # your script path
-pretrained_backbone=${dirname_path}/mobilenet_v2.ckpt # or mobilenet_v2-b0353104.ckpt
-dataset_path=$root/dataset/centerface
-annot_path=$dataset_path/annotations/train.json
-img_dir=$dataset_path/images/train/images
-use_device_id=0
-
-if [ $# == 1 ]
+if [ $1 -lt 0 ] && [ $1 -gt 7 ]
 then
-    use_device_id=$1
+    echo "error: DEVICE_ID=$1 is not in (0-7)"
+    exit 1
 fi
 
-if [ $# == 2 ]
-then
-    use_device_id=$1
-    pretrained_backbone=$(get_real_path $2)
-fi
-
-if [ $# == 3 ]
-then
-    use_device_id=$1
-    pretrained_backbone=$(get_real_path $2)
-    dataset_path=$(get_real_path $3)
-fi
-
-if [ $# == 4 ]
-then
-    use_device_id=$1
-    pretrained_backbone=$(get_real_path $2)
-    dataset_path=$(get_real_path $3)
-    annot_path=$(get_real_path $4)
-fi
-
-if [ $# == 5 ]
-then
-    use_device_id=$1
-    pretrained_backbone=$(get_real_path $2)
-    dataset_path=$(get_real_path $3)
-    annot_path=$(get_real_path $4)
-    img_dir=$(get_real_path $5)
-fi
-
-echo "use_device_id: "   $use_device_id
-echo "pretrained_backbone: "   $pretrained_backbone
-echo "dataset_path: "   $dataset_path
-echo "annot_path: "   $annot_path
-echo "img_dir: "   $img_dir
+export CUDA_VISIBLE_DEVICES="$1"
 
+pretrained_backbone=$(get_real_path $2)
 if [ ! -f $pretrained_backbone ]
 then
     echo "error: pretrained_backbone=$pretrained_backbone is not a file"
-exit 1
-fi
-
-if [ ! -d $dataset_path ]
-then
-    echo "error: dataset_path=$dataset_path is not a directory"
-exit 1
+    exit 1
 fi
 
+annot_path=$(get_real_path $3)
 if [ ! -f $annot_path ]
 then
     echo "error: annot_path=$annot_path is not a file"
-exit 1
+    exit 1
 fi
 
-if [ ! -d $img_dir ]
+dataset_path=$(get_real_path $4)
+if [ ! -d $dataset_path ]
 then
-    echo "error: img_dir=$img_dir is not a directory"
-exit 1
+    echo "error: dataset_path=$dataset_path is not a dir"
+    exit 1
 fi
 
+echo $pretrained_backbone
+echo $annot_path
+echo $dataset_path
+
 export PYTHONPATH=${dirname_path}:$PYTHONPATH
 export RANK_SIZE=1
 
 echo 'start training'
-echo 'start rank '$use_device_id
 rm -rf ${current_exec_path}/train_standalone_gpu
 mkdir ${current_exec_path}/train_standalone_gpu
 cd ${current_exec_path}/train_standalone_gpu || exit
 export RANK_ID=0
-dev=`expr $use_device_id + 0`
-export DEVICE_ID=$dev
+
 python ${dirname_path}/${SCRIPT_NAME} \
     --lr=5e-4 \
     --per_batch_size=8 \
@@ -138,9 +92,8 @@ python ${dirname_path}/${SCRIPT_NAME} \
     --weight_decay=0.0000 \
     --loss_scale=1024 \
     --pretrained_backbone=$pretrained_backbone \
-    --data_dir=$dataset_path \
     --annot_path=$annot_path \
-    --img_dir=$img_dir \
+    --img_dir=$dataset_path \
     --device_target="GPU" > train.log  2>&1 &
 
 echo 'running'
diff --git a/model_zoo/official/cv/centerface/src/centerface.py b/model_zoo/official/cv/centerface/src/centerface.py
index b023ada4c6c..aae19169a39 100644
--- a/model_zoo/official/cv/centerface/src/centerface.py
+++ b/model_zoo/official/cv/centerface/src/centerface.py
@@ -310,8 +310,8 @@ class TrainingWrapper(nn.Cell):
         else:
             cond = self.less_equal(self.base, flag_sum)
 
-        self.optimizer(grads)
-        return (loss, cond, sens)
+        ret = (loss, cond, sens)
+        return F.depend(ret, self.optimizer(grads))
 
 
 class CenterFaceWithNms(nn.Cell):
diff --git a/model_zoo/official/cv/centerface/test.py b/model_zoo/official/cv/centerface/test.py
index b5635e79cf8..40a1f1d891b 100644
--- a/model_zoo/official/cv/centerface/test.py
+++ b/model_zoo/official/cv/centerface/test.py
@@ -35,9 +35,10 @@ from dependency.evaluate.eval import evaluation
 
 dev_id = get_device_id()
 context.set_context(mode=context.GRAPH_MODE,
-                    device_target=config.device_target, save_graphs=False, device_id=dev_id)
+                    device_target=config.device_target, save_graphs=False)
 
 if config.device_target == "Ascend":
+    context.set_context(device_id=dev_id)
     context.set_context(enable_auto_mixed_precision=False)
 
 def modelarts_process():
@@ -65,7 +66,7 @@ def test_centerface():
         else:
             ckpt_name = config.ckpt_name
 
-        test_model = config.test_model + ckpt_name
+        test_model = config.test_model + "/" + ckpt_name
         if not test_model:
             print('load_model {} none'.format(test_model))
             continue
@@ -112,8 +113,8 @@ def test_centerface():
             if not os.path.exists(save_path + im_dir):
                 os.makedirs(save_path + im_dir)
                 print('save_path + im_dir={}'.format(save_path + im_dir))
-            for num, file in enumerate(file_list_item):
-                im_name = file[0][0]
+            for num, file_obj in enumerate(file_list_item):
+                im_name = file_obj[0][0]
                 zip_name = '%s/%s.jpg' % (im_dir, im_name)
                 img_path = os.path.join(config.data_dir, zip_name)
                 print('img_path={}'.format(img_path))
diff --git a/model_zoo/official/cv/crnn/README.md b/model_zoo/official/cv/crnn/README.md
index 9f77bac55d6..048ac41595b 100644
--- a/model_zoo/official/cv/crnn/README.md
+++ b/model_zoo/official/cv/crnn/README.md
@@ -22,6 +22,7 @@
         - [Export MindIR](#export-mindir)
         - [Infer on Ascend310](#infer-on-ascend310)
         - [result](#result)
+        - [Post Training Quantization](#post-training-quantization)
     - [Model Description](#model-description)
         - [Performance](#performance)
             - [Training Performance](#training-performance)
@@ -364,6 +365,41 @@ correct num: 2042 , total num: 3000
 result CRNNAccuracy is: 0.806666666666
 ```
 
+### [Post Training Quantization](#contents)
+
+Relative executing script files reside in the directory "ascend310_quant_infer". Please implement following steps sequentially to complete post quantization.
+Current quantization project bases on IIIT5K dataset.
+
+1. Generate data of .bin format required for AIR model inference at Ascend310 platform.
+
+```shell
+python export_bin.py --eval_dataset [DATASET NAME] --eval_dataset_path [DATA PATH]
+```
+
+2. Export quantized AIR model.
+
+Post quantization of model requires special toolkits for exporting quantized AIR model. Please refer to [official website](https://www.hiascend.com/software/cann/community).
+
+```shell
+python post_quant.py --eval_dataset [DATASET NAME] --eval_dataset_path [DATA PATH] --ckpt_file [CKPT_PATH]
+```
+
+The quantized AIR file will be stored as "./results/crnn_quant.air".
+
+3. Implement inference at Ascend310 platform.
+
+```shell
+# Ascend310 quant inference
+bash run_quant_infer.sh [AIR_PATH] [DATA_PATH] [LABEL_PATH]
+```
+
+Inference result is saved in current path, you can find result like this in acc.log file.
+
+```bash
+correct num: 2398 , total num: 3000
+result CRNNAccuracy is: 0.7933333333333
+```
+
 ## [Model Description](#contents)
 
 ### [Performance](#contents)
diff --git a/model_zoo/official/cv/ctpn/README.md b/model_zoo/official/cv/ctpn/README.md
index bd220d68779..792464bdac3 100644
--- a/model_zoo/official/cv/ctpn/README.md
+++ b/model_zoo/official/cv/ctpn/README.md
@@ -1,6 +1,6 @@
 ![logo](https://www.mindspore.cn/static/img/logo_black.6a5c850d.png)
 
-# CTPN for Ascend
+# CTPN
 
 <!-- TOC -->
 
diff --git a/model_zoo/official/cv/deeplabv3/README.md b/model_zoo/official/cv/deeplabv3/README.md
index 7d1d8e09ba3..e1b6b46bccc 100644
--- a/model_zoo/official/cv/deeplabv3/README.md
+++ b/model_zoo/official/cv/deeplabv3/README.md
@@ -29,6 +29,7 @@
     - [Inference Process](#inference-process)
         - [Usage](#usage-2)
         - [result](#result-2)
+    - [Post Training Quantization](#post-training-quantization)
 - [Model Description](#model-description)
     - [Performance](#performance)
         - [Evaluation Performance](#evaluation-performance)
@@ -112,13 +113,7 @@ After installing MindSpore via the official website, you can start training and
 
 - Prepare backbone
 
-Download resnet101 for here(https://download.pytorch.org/models/resnet101-5d3b4d8f.pth).
-
-Use convert_resnet101.py to convert as backbone.
-
-```shell
-python convert_resnet101.py
-```
+Download resnet101 for here(https://download.mindspore.cn/model_zoo/r1.2/resnet101_ascend_v120_imagenet2012_official_cv_bs32_acc78/resnet101_ascend_v120_imagenet2012_official_cv_bs32_acc78.ckpt).
 
 - Running on Ascend
 
@@ -809,6 +804,40 @@ Inference result is saved in current path, you can find result in acc.log file.
 | :----------: | :-----: | :----: | :----: | :-----: | :-----: | :-------------: |
 | deeplab_v3 |       | √    |      |       | 78.84 | 78.51    |
 
+## [Post Training Quantization](#contents)
+
+Relative executing script files reside in the directory "ascend310_quant_infer". Please implement following steps sequentially to complete post quantization.
+In this project, the model is set as deeplab_v3_s8.
+
+1. Generate data of .bin format required for AIR model inference at Ascend310 platform.
+
+```shell
+python export_bin.py --model [MODEL] --data_root [DATA ROOT] --data_lst [DATA LST]
+```
+
+2. Export quantized AIR model.
+
+Post quantization of model requires special toolkits for exporting quantized AIR model. Please refer to [official website](https://www.hiascend.com/software/cann/community).
+
+```shell
+python post_quant.py --model [MODEL] --data_root [DATA ROOT] --data_lst [DATA LST] --ckpt_file [CKPT_PATH]
+```
+
+The quantized AIR file will be stored as "./results/deeplabv3_quant.air".
+
+3. Implement inference at Ascend310 platform.
+
+```shell
+# Ascend310 quant inference
+bash run_quant_infer.sh [AIR_PATH] [DATA_PATH] [LABEL_PATH] [SHAPE_PATH]
+```
+
+Inference result is saved in current path, you can find result like this in acc.log file.
+
+```bash
+mean Iou 0.7854572371350974
+```
+
 # [Model Description](#contents)
 
 ## [Performance](#contents)
diff --git a/model_zoo/official/cv/deeplabv3/README_CN.md b/model_zoo/official/cv/deeplabv3/README_CN.md
index 21f85fbdce8..893910256c7 100644
--- a/model_zoo/official/cv/deeplabv3/README_CN.md
+++ b/model_zoo/official/cv/deeplabv3/README_CN.md
@@ -31,6 +31,7 @@
     - [推理过程](#推理过程)
         - [用法](#用法-2)
         - [结果](#结果-2)
+    - [训练后量化推理](#训练后量化推理)
 - [模型描述](#模型描述)
     - [性能](#性能)
         - [训练性能](#训练性能)
@@ -62,13 +63,7 @@ Pascal VOC数据集和语义边界数据集（Semantic Boundaries Dataset，SBD
 
 - 准备Backbone模型
 
-准备resnet101模型，点此下载(https://download.pytorch.org/models/resnet101-5d3b4d8f.pth).
-
-使用convert_resnet101.py脚本转换Backbone模型.
-
-```shell
-python convert_resnet101.py
-```
+准备resnet101模型，点此下载(https://download.mindspore.cn/model_zoo/r1.2/resnet101_ascend_v120_imagenet2012_official_cv_bs32_acc78/resnet101_ascend_v120_imagenet2012_official_cv_bs32_acc78.ckpt).
 
 - 下载分段数据集。
 
@@ -810,6 +805,40 @@ bash run_infer_310.sh [MINDIR_PATH] [DATA_PATH] [DATA_ROOT] [DATA_LIST] [DEVICE_
 | :----------: | :-----: | :----: | :----: | :-----: | :-----: | :-------------: |
 | deeplab_v3 |       | √    |      |       | 78.84 | 78.51    |
 
+## [训练后量化推理](#contents)
+
+训练后量化推理的相关执行脚本文件在"ascend310_quant_infer"目录下，依次执行以下步骤实现训练后量化推理。
+本训练后量化工程的模型类型是deeplab_v3_s8。
+
+1、生成Ascend310平台AIR模型推理需要的.bin格式数据。
+
+```shell
+python export_bin.py --model [MODEL] --data_root [DATA ROOT] --data_lst [DATA LST]
+```
+
+2、导出训练后量化的AIR格式模型。
+
+导出训练后量化模型需要配套的量化工具包，参考[官方地址](https://www.hiascend.com/software/cann/community)
+
+```shell
+python post_quant.py --model [MODEL] --data_root [DATA ROOT] --data_lst [DATA LST] --ckpt_file [CKPT_PATH]
+```
+
+导出的模型会存储在./result/deeplabv3_quant.air。
+
+3、在Ascend310执行推理量化模型。
+
+```shell
+# Ascend310 inference
+bash run_quant_infer.sh [AIR_PATH] [DATA_PATH] [LABEL_PATH] [SHAPE_PATH]
+```
+
+推理结果保存在脚本执行的当前路径，可以在acc.log中看到精度计算结果。
+
+```bash
+mean Iou 0.7854572371350974
+```
+
 # 模型描述
 
 ## 性能
diff --git a/model_zoo/official/cv/deeplabv3/convert_resnet101.py b/model_zoo/official/cv/deeplabv3/convert_resnet101.py
deleted file mode 100644
index 6c455a2414e..00000000000
--- a/model_zoo/official/cv/deeplabv3/convert_resnet101.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""convert backbone resnet101"""
-import torch
-from mindspore import Tensor
-from mindspore.train.serialization import save_checkpoint
-
-
-def torch2ms():
-    pretrained_dict = torch.load('./resnet101-5d3b4d8f.pth')
-    new_params = []
-
-    for key, value in pretrained_dict.items():
-        if not key.__contains__('fc'):
-            if key.__contains__('bn'):
-                key = key.replace('running_mean', 'moving_mean')
-                key = key.replace('running_var', 'moving_variance')
-                key = key.replace('weight', 'gamma')
-                key = key.replace('bias', 'beta')
-            param_dict = {'name': key, 'data': Tensor(value.detach().numpy())}
-            new_params.append(param_dict)
-    save_checkpoint(new_params, './resnet101-5d3b4d8f.ckpt')
-    print("Convert resnet-101 completed!")
-
-
-if __name__ == '__main__':
-    torch2ms()
diff --git a/model_zoo/official/cv/deeplabv3/train.py b/model_zoo/official/cv/deeplabv3/train.py
index c1226fdd494..9c145ba94f2 100644
--- a/model_zoo/official/cv/deeplabv3/train.py
+++ b/model_zoo/official/cv/deeplabv3/train.py
@@ -161,8 +161,15 @@ def train():
                         continue
                     print('filter {}'.format(key))
                     del param_dict[key]
-        load_param_into_net(train_net, param_dict)
-        print('load_model {} success'.format(args.ckpt_pre_trained))
+            load_param_into_net(train_net, param_dict)
+            print('load_model {} success'.format(args.ckpt_pre_trained))
+        else:
+            trans_param_dict = {}
+            for key, val in param_dict.items():
+                key = key.replace("down_sample_layer", "downsample")
+                trans_param_dict[f"network.resnet.{key}"] = val
+            load_param_into_net(train_net, trans_param_dict)
+            print('load_model {} success'.format(args.ckpt_pre_trained))
 
     # optimizer
     iters_per_epoch = dataset.get_dataset_size()
diff --git a/model_zoo/official/cv/deeptext/README.md b/model_zoo/official/cv/deeptext/README.md
index d9e5f4888f9..908469bf2e5 100644
--- a/model_zoo/official/cv/deeptext/README.md
+++ b/model_zoo/official/cv/deeptext/README.md
@@ -1,4 +1,4 @@
-# DeepText for Ascend
+# DeepText
 
 - [DeepText Description](#DeepText-description)
 - [Model Architecture](#model-architecture)
@@ -73,9 +73,13 @@ Here we used 4 datasets for training, and 1 datasets for Evaluation.
     └─moxing_adapter.py                 # Moxing adapter for ModelArts
   ├─scripts
     ├─run_standalone_train_ascend.sh    # launch standalone training with ascend platform(1p)
+    ├─run_standalone_train_gpu.sh       # launch standalone training with GPU platform(1p)
     ├─run_distribute_train_ascend.sh    # launch distributed training with ascend platform(8p)
+    ├─run_distribute_train_gpu.sh       # launch distributed training with GPU platform(8p)
     ├─run_infer_310.sh                  # shell script for 310 inference
+    ├─run_eval_gpu.sh                   # launch evaluation with GPU platform
     └─run_eval_ascend.sh                # launch evaluating with ascend platform
+
   ├─src
     ├─DeepText
       ├─__init__.py                     # package init file
@@ -115,6 +119,17 @@ bash run_standalone_train_ascend.sh [IMGS_PATH] [ANNOS_PATH] [PRETRAINED_PATH] [
 bash run_eval_ascend.sh [IMGS_PATH] [ANNOS_PATH] [CHECKPOINT_PATH] [COCO_TEXT_PARSER_PATH] [DEVICE_ID]
 ```
 
+- GPU:
+
+```bash
+# distribute training example(8p)
+sh run_distribute_train_gpu.sh [IMGS_PATH] [ANNOS_PATH] [PRETRAINED_PATH] [COCO_TEXT_PARSER_PATH]
+# standalone training
+sh run_standalone_train_gpu.sh [IMGS_PATH] [ANNOS_PATH] [PRETRAINED_PATH] [COCO_TEXT_PARSER_PATH] [DEVICE_ID]
+# evaluation:
+sh run_eval_gpu.sh [IMGS_PATH] [ANNOS_PATH] [CHECKPOINT_PATH] [COCO_TEXT_PARSER_PATH] [DEVICE_ID]
+```
+
 > Notes:
 > RANK_TABLE_FILE can refer to [Link](https://www.mindspore.cn/docs/programming_guide/en/master/distributed_training_ascend.html) , and the device_ip can be got as [Link](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools). For large models like InceptionV4, it's better to export an external environment variable `export HCCL_CONNECT_TIMEOUT=600` to extend hccl connection checking time from the default 120 seconds to 600 seconds. Otherwise, the connection could be timeout since compiling time increases with the growth of model size.
 >
@@ -287,6 +302,14 @@ Evaluation result will be stored in the example path, you can find result like t
 class 1 precision is 88.01%, recall is 82.77%
 ```
 
+Evaluation result on GPU will be as follows:
+
+```python
+========================================
+
+class 1 precision is 84.49%, recall is 88.28%
+```
+
 ## Model Export
 
 ```shell
@@ -322,34 +345,34 @@ class 1 precision is 84.24%, recall is 87.40%, F1 is 85.79%
 
 ### Training Performance
 
-| Parameters                 | Ascend                                                       |
-| -------------------------- | ------------------------------------------------------------ |
-| Model Version              | Deeptext                                                     |
-| Resource                   | Ascend 910; cpu 2.60GHz, 192cores; memory 755G; OS Euler2.8   |
-| uploaded Date              | 12/26/2020                                                   |
-| MindSpore Version          | 1.1.0                                                        |
-| Dataset                    | 66040 images                                                 |
-| Batch_size                 | 2                                                            |
-| Training Parameters        | src/config.py                                                |
-| Optimizer                  | Momentum                                                     |
-| Loss Function              | SoftmaxCrossEntropyWithLogits for classification, SmoothL2Loss for bbox regression|
-| Loss                       | ~0.008                                                       |
-| Total time (8p)            | 4h                                                           |
-| Scripts                    | [deeptext script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/deeptext) |
+| Parameters                 | Ascend                                                                                              | GPU                            |
+| -------------------------- | --------------------------------------------------------------------------------------------------- |--------------------------------------- |
+| Model Version              | Deeptext                                                                                            | Deeptext                       |
+| Resource                   | Ascend 910; cpu 2.60GHz, 192cores; memory 755G; OS Euler2.8                                         | Tesla V100 PCIe 32GB; CPU 2.70GHz; 52cores; Memory 1510G; OS Ubuntu 18.04.5       |
+| uploaded Date              | 12/26/2020                                                                                          | 7/29/2021 (month/day/year)     |
+| MindSpore Version          | 1.1.0                                                                                               | 1.3.0                        |
+| Dataset                    | 66040 images                                                                                        | 66040 images                 |
+| Batch_size                 | 2                                                                                                   | 2                        |
+| Training Parameters        | src/config.py                                                                                       | src/config.py            |
+| Optimizer                  | Momentum                                                                                            | Momentum             |
+| Loss Function              | SoftmaxCrossEntropyWithLogits for classification, SmoothL2Loss for bbox regression                  | SoftmaxCrossEntropyWithLogits for classification, SmoothL2Loss for bbox regression  |
+| Loss                       | ~0.008                                                                                              | ~0.116               |
+| Total time (8p)            | 4h                                                                                                  | 9h                   |
+| Scripts                    | [deeptext script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/deeptext) | [deeptext script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/deeptext)  |
 
 #### Inference Performance
 
-| Parameters          | Ascend                 |
-| ------------------- | --------------------------- |
-| Model Version       | Deeptext                 |
-| Resource            | Ascend 910; cpu 2.60GHz, 192cores; memory 755G; OS Euler2.8         |
-| Uploaded Date       | 12/26/2020                 |
-| MindSpore Version   | 1.1.0              |
-| Dataset             | 229 images                  |
-| Batch_size          | 2                         |
-| Accuracy            | F1 score is 84.50% |
-| Total time          | 1 min                      |
-| Model for inference | 3492M (.ckpt file)   |
+| Parameters          | Ascend                                                       | GPU                        |
+| ------------------- | -------------------------------------------------------------| --------------------------- |
+| Model Version       | Deeptext                                                     | Deeptext
+| Resource            | Ascend 910; cpu 2.60GHz, 192cores; memory 755G; OS Euler2.8  | Tesla V100 PCIe 32GB; CPU 2.70GHz; 52cores; Memory 1510G; OS Ubuntu 18.04.5  |
+| Uploaded Date       | 12/26/2020                                                   | 7/29/2021 (month/day/year)    |
+| MindSpore Version   | 1.1.0                                                        | 1.3.0                         |
+| Dataset             | 229 images                                                   | 229 images              |
+| Batch_size          | 2                                                            | 2                       |
+| Accuracy            | F1 score is 84.50%                                           | F1 score is 86.34%      |
+| Total time          | 1 min                                                        | 1 min                   |
+| Model for inference | 3492M (.ckpt file)                                           | 3492M (.ckpt)           |
 
 #### Training performance results
 
@@ -359,7 +382,15 @@ class 1 precision is 84.24%, recall is 87.40%, F1 is 85.79%
 
 | **Ascend** | train performance |
 | :--------: | :---------------: |
-|     8p     |     50 img/s     |
+|     8p     |     50 img/s      |
+
+|   **GPU**   |  train performance  |
+| :---------: | :---------------: |
+|     1p      |     5 img/s       |
+
+|   **GPU**   |  train performance  |
+| :---------: | :-----------------: |
+|     8p      |     25 img/s     |
 
 # [Description of Random Situation](#contents)
 
diff --git a/model_zoo/official/cv/deeptext/src/Deeptext/proposal_generator.py b/model_zoo/official/cv/deeptext/src/Deeptext/proposal_generator.py
index 33b667f3b50..3edd1c68fff 100644
--- a/model_zoo/official/cv/deeptext/src/Deeptext/proposal_generator.py
+++ b/model_zoo/official/cv/deeptext/src/Deeptext/proposal_generator.py
@@ -19,9 +19,6 @@ import mindspore.nn as nn
 import mindspore.common.dtype as mstype
 from mindspore.ops import operations as P
 from mindspore import Tensor
-from mindspore import context
-
-context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
 
 
 class Proposal(nn.Cell):
diff --git a/model_zoo/official/cv/deeptext/src/Deeptext/rcnn.py b/model_zoo/official/cv/deeptext/src/Deeptext/rcnn.py
index e30f198846e..b1d34bbb702 100644
--- a/model_zoo/official/cv/deeptext/src/Deeptext/rcnn.py
+++ b/model_zoo/official/cv/deeptext/src/Deeptext/rcnn.py
@@ -21,7 +21,12 @@ from mindspore.ops import operations as P
 from mindspore.common.tensor import Tensor
 from mindspore.common.initializer import initializer
 from mindspore.common.parameter import Parameter
+from model_utils.config import config as default_config
 
+if default_config.export_device_target == "Ascend":
+    mtype = mstype.float16
+else:
+    mtype = mstype.float32
 
 class DenseNoTranpose(nn.Cell):
     """Dense method"""
@@ -38,8 +43,8 @@ class DenseNoTranpose(nn.Cell):
         self.cast = P.Cast()
 
     def construct(self, x):
-        x = self.cast(x, mstype.float16)
-        weight = self.cast(self.weight, mstype.float16)
+        x = self.cast(x, mtype)
+        weight = self.cast(self.weight, mtype)
         output = self.bias_add(self.matmul(x, weight), self.bias)
         return output
 
diff --git a/model_zoo/official/cv/deeptext/train.py b/model_zoo/official/cv/deeptext/train.py
index cb4b11474f9..4e3c752c1f6 100644
--- a/model_zoo/official/cv/deeptext/train.py
+++ b/model_zoo/official/cv/deeptext/train.py
@@ -29,9 +29,9 @@ from model_utils.moxing_adapter import moxing_wrapper
 from model_utils.device_adapter import get_device_id, get_device_num, get_rank_id
 
 import mindspore.common.dtype as mstype
-from mindspore import context, Tensor
+from mindspore import context, Tensor, Parameter
 from mindspore.common import set_seed
-from mindspore.communication.management import init
+from mindspore.communication.management import init, get_group_size, get_rank
 from mindspore.context import ParallelMode
 from mindspore.nn import Momentum
 from mindspore.train import Model
@@ -42,7 +42,8 @@ np.set_printoptions(threshold=np.inf)
 
 set_seed(1)
 
-context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=get_device_id())
+context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, device_id=get_device_id())
+
 
 def modelarts_pre_process():
     '''modelarts pre process function.'''
@@ -54,8 +55,7 @@ def modelarts_pre_process():
             if zip_isexist:
                 fz = zipfile.ZipFile(zip_file, 'r')
                 data_num = len(fz.namelist())
-                print("Extract Start...", flush=True)
-                print("unzip file num: {}".format(data_num), flush=True)
+                print("Extract Start. unzip file num: {}".format(data_num), flush=True)
                 data_print = int(data_num / 100) if data_num > 100 else 1
                 i = 0
                 for file in fz.namelist():
@@ -100,12 +100,21 @@ def modelarts_pre_process():
 
 @moxing_wrapper(pre_process=modelarts_pre_process)
 def run_train():
+    device_type = "Ascend" if context.get_context("device_target") == "Ascend" else "GPU"
     if config.run_distribute:
-        rank = get_rank_id()
-        device_num = get_device_num()
+        if device_type == "Ascend":
+            rank = get_rank_id()
+            device_num = get_device_num()
+
+        else:
+            context.reset_auto_parallel_context()
+            rank = get_rank()
+            device_num = get_group_size()
+
         context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                           gradients_mean=True)
         init()
+
     else:
         rank = get_rank_id()
         device_num = 1
@@ -151,9 +160,13 @@ def run_train():
     load_path = config.pre_trained
     if load_path != "":
         param_dict = load_checkpoint(load_path)
+        if device_type == "GPU":
+            print("Converting pretrained checkpoint from fp16 to fp32", flush=True)
+            for key, value in param_dict.items():
+                tensor = value.asnumpy().astype(np.float32)
+                param_dict[key] = Parameter(tensor, key)
         load_param_into_net(net, param_dict)
 
-    device_type = "Ascend" if context.get_context("device_target") == "Ascend" else "Others"
     if device_type == "Ascend":
         net.to_float(mstype.float16)
 
diff --git a/model_zoo/official/cv/dpn/ascend310_infer/src/main.cc b/model_zoo/official/cv/dpn/ascend310_infer/src/main.cc
index 31fea6c4346..2fe237e0ea6 100644
--- a/model_zoo/official/cv/dpn/ascend310_infer/src/main.cc
+++ b/model_zoo/official/cv/dpn/ascend310_infer/src/main.cc
@@ -64,6 +64,7 @@ int load_model(Model *model, std::vector<MSTensor> *model_inputs, std::string mi
   auto context = std::make_shared<Context>();
   auto ascend310 = std::make_shared<mindspore::Ascend310DeviceInfo>();
   ascend310->SetDeviceID(device_id);
+  ascend310->SetPrecisionMode("allow_fp32_to_fp16");
   context->MutableDeviceInfo().push_back(ascend310);
   mindspore::Graph graph;
   Serialization::Load(mindir_path, ModelType::kMindIR, &graph);
diff --git a/model_zoo/official/cv/inceptionv4/README.md b/model_zoo/official/cv/inceptionv4/README.md
index 95ef8bfeef3..e06f370a662 100644
--- a/model_zoo/official/cv/inceptionv4/README.md
+++ b/model_zoo/official/cv/inceptionv4/README.md
@@ -1,4 +1,4 @@
-# InceptionV4 for Ascend/GPU
+# InceptionV4
 
 - [InceptionV4 Description](#InceptionV4-description)
 - [Model Architecture](#model-architecture)
diff --git a/model_zoo/official/cv/lenet_quant/scripts/run_infer_310.sh b/model_zoo/official/cv/lenet_quant/scripts/run_infer_310.sh
index a675b881be1..86d84a89143 100644
--- a/model_zoo/official/cv/lenet_quant/scripts/run_infer_310.sh
+++ b/model_zoo/official/cv/lenet_quant/scripts/run_infer_310.sh
@@ -49,9 +49,10 @@ if [ -d ${ASCEND_HOME}/ascend-toolkit ]; then
     export PYTHONPATH=${TBE_IMPL_PATH}:$ASCEND_HOME/ascend-toolkit/latest/fwkacllib/python/site-packages:$PYTHONPATH
     export ASCEND_OPP_PATH=$ASCEND_HOME/ascend-toolkit/latest/opp
 else
-    export PATH=$ASCEND_HOME/atc/ccec_compiler/bin:$ASCEND_HOME/atc/bin:$PATH
-    export LD_LIBRARY_PATH=/usr/local/lib:$ASCEND_HOME/atc/lib64:$ASCEND_HOME/acllib/lib64:$ASCEND_HOME/driver/lib64:$ASCEND_HOME/add-ons:$LD_LIBRARY_PATH
-    export PYTHONPATH=$ASCEND_HOME/atc/python/site-packages:$PYTHONPATH
+    export PATH=$ASCEND_HOME/fwkacllib/ccec_compiler/bin:$ASCEND_HOME/fwkacllib/bin:$PATH
+    export LD_LIBRARY_PATH=/usr/local/lib:$ASCEND_HOME/fwkacllib/lib64:$ASCEND_HOME/driver/lib64:$LD_LIBRARY_PATH
+    export TBE_IMPL_PATH=$ASCEND_HOME/opp/op_impl/built-in/ai_core/tbe
+    export PYTHONPATH=$PYTHONPATH:$TBE_IMPL_PATH
     export ASCEND_OPP_PATH=$ASCEND_HOME/opp
 fi
 
@@ -104,4 +105,4 @@ cal_acc
 if [ $? -ne 0 ]; then
     echo "calculate accuracy failed"
     exit 1
-fi
\ No newline at end of file
+fi
diff --git a/model_zoo/official/cv/maskrcnn/README.md b/model_zoo/official/cv/maskrcnn/README.md
index 47f440f967b..d5597471c3f 100644
--- a/model_zoo/official/cv/maskrcnn/README.md
+++ b/model_zoo/official/cv/maskrcnn/README.md
@@ -23,6 +23,7 @@
     - [Inference Process](#inference-process)
         - [Usage](#usage)
         - [result](#result)
+    - [Post Training Quantization](#post-training-quantization)
 - [Model Description](#model-description)
     - [Performance](#performance)
         - [Evaluation Performance](#evaluation-performance)
@@ -701,6 +702,69 @@ Accumulating evaluation results...
  Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.594
 ```
 
+## [Post Training Quantization](#contents)
+
+Relative executing script files reside in the directory "ascend310_quant_infer". Please implement following steps sequentially to complete post quantization.
+Current quantization project bases on COCO2017 dataset.
+The inference process needs about 600G hard disk space to save the reasoning results.
+
+1. Generate data of .bin format required for AIR model inference at Ascend310 platform.
+
+```shell
+python export_bin.py --coco_root [COCO DATA PATH] --mindrecord_dir [MINDRECORD PATH] --ann_file [ANNOTATION PATH]
+```
+
+2. Export quantized AIR model.
+
+Post quantization of model requires special toolkits for exporting quantized AIR model. Please refer to [official website](https://www.hiascend.com/software/cann/community).
+
+```shell
+python post_quant.py --coco_root [COCO DATA PATH] --mindrecord_dir [MINDRECORD PATH] --ckpt_file [CKPT_PATH]
+```
+
+The quantized AIR file will be stored as "./results/maskrcnn_quant.air".
+
+3. Implement inference at Ascend310 platform.
+
+```shell
+# Ascend310 quant inference
+bash run_quant_infer.sh [AIR_PATH] [DATA_PATH] [SHAPE_PATH] [ANNOTATION_PATH]
+```
+
+Inference result is saved in current path, you can find result like this in acc.log file.
+
+```bash
+Evaluate annotation type *bbox*
+Accumulating evaluation results...
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.378
+ Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.602
+ Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.407
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.240
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.420
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.481
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.311
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.500
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.528
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.367
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.572
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.657
+
+Evaluate annotation type *segm*
+Accumulating evaluation results...
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.321
+ Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.553
+ Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.328
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.164
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.350
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.466
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.276
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.422
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.441
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.279
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.476
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.578
+```
+
 # Model Description
 
 ## Performance
diff --git a/model_zoo/official/cv/maskrcnn/README_CN.md b/model_zoo/official/cv/maskrcnn/README_CN.md
index 4b336df14c9..6b1969a7585 100644
--- a/model_zoo/official/cv/maskrcnn/README_CN.md
+++ b/model_zoo/official/cv/maskrcnn/README_CN.md
@@ -25,6 +25,7 @@
     - [推理过程](#推理过程)
         - [使用方法](#使用方法)
         - [结果](#结果)
+    - [训练后量化推理](#训练后量化推理)
 - [模型说明](#模型说明)
     - [性能](#性能)
         - [训练性能](#训练性能)
@@ -696,6 +697,68 @@ Accumulating evaluation results...
  Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.594
 ```
 
+## [训练后量化推理](#contents)
+
+训练后量化推理的相关执行脚本文件在"ascend310_quant_infer"目录下，依次执行以下步骤实现训练后量化推理。本训练后量化工程基于COCO2017数据集。
+推理过程需要占用大约600G的硬盘空间来保存推理的结果。
+
+1、生成Ascend310平台AIR模型推理需要的.bin格式数据。
+
+```shell
+python export_bin.py --coco_root [COCO DATA PATH] --mindrecord_dir [MINDRECORD PATH] --ann_file [ANNOTATION PATH]
+```
+
+2、导出训练后量化的AIR格式模型。
+
+导出训练后量化模型需要配套的量化工具包，参考[官方地址](https://www.hiascend.com/software/cann/community)
+
+```shell
+python post_quant.py --coco_root [COCO DATA PATH] --mindrecord_dir [MINDRECORD PATH] --ckpt_file [CKPT_PATH]
+```
+
+导出的模型会存储在./result/maskrcnn_quant.air。
+
+3、在Ascend310执行推理量化模型。
+
+```shell
+# Ascend310 inference
+bash run_quant_infer.sh [AIR_PATH] [DATA_PATH] [SHAPE_PATH] [ANNOTATION_PATH]
+```
+
+推理结果保存在脚本执行的当前路径，可以在acc.log中看到精度计算结果。
+
+```bash
+Evaluate annotation type *bbox*
+Accumulating evaluation results...
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.378
+ Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.602
+ Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.407
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.240
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.420
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.481
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.311
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.500
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.528
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.367
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.572
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.657
+
+Evaluate annotation type *segm*
+Accumulating evaluation results...
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.321
+ Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.553
+ Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.328
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.164
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.350
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.466
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.276
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.422
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.441
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.279
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.476
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.578
+```
+
 # 模型说明
 
 ## 性能
diff --git a/model_zoo/official/cv/maskrcnn_mobilenetv1/README.md b/model_zoo/official/cv/maskrcnn_mobilenetv1/README.md
index a82498b9f1e..a616368eb1c 100644
--- a/model_zoo/official/cv/maskrcnn_mobilenetv1/README.md
+++ b/model_zoo/official/cv/maskrcnn_mobilenetv1/README.md
@@ -58,8 +58,8 @@ Note that you can run the scripts based on the dataset mentioned in original pap
 
 # [Environment Requirements](#contents)
 
-- Hardware（Ascend）
-    - Prepare hardware environment with Ascend processor.
+- Hardware（Ascend/CPU）
+    - Prepare hardware environment with Ascend or CPU processor.
 - Framework
     - [MindSpore](https://gitee.com/mindspore/mindspore)
 - For more information, please check the resources below:
@@ -78,7 +78,7 @@ pip install mmcv=0.2.14
 
 1. Download the dataset COCO2017.
 
-2. Change the COCO_ROOT and other settings you need in `config.py`. The directory structure should look like the follows:
+2. Change the COCO_ROOT and other settings you need in `default_config.yaml`. The directory structure should look like the follows:
 
     ```
     .
@@ -90,24 +90,31 @@ pip install mmcv=0.2.14
       └─train2017
     ```
 
-     If you use your own dataset to train the network, **Select dataset to other when run script.**
+    If you use your own dataset to train the network, **Select dataset to other when run script.**
     Create a txt file to store dataset information organized in the way as shown as following:
 
     ```
     train2017/0000001.jpg 0,259,401,459,7 35,28,324,201,2 0,30,59,80,2
     ```
 
-    Each row is an image annotation split by spaces. The first column is a relative path of image, followed by columns containing box and class information in the format [xmin,ymin,xmax,ymax,class]. We read image from an image path joined by the `IMAGE_DIR`(dataset directory) and the relative path in `ANNO_PATH`(the TXT file path), which can be set in `config.py`.
+    Each row is an image annotation split by spaces. The first column is a relative path of image, followed by columns containing box and class information in the format [xmin,ymin,xmax,ymax,class]. We read image from an image path joined by the `IMAGE_DIR`(dataset directory) and the relative path in `ANNO_PATH`(the TXT file path), which can be set in `default_config.yaml`.
 
 3. Execute train script.
     After dataset preparation, you can start training as follows:
 
-    ```
+    ```bash
+    On Ascend:
+
     # distributed training
     bash run_distribute_train.sh [RANK_TABLE_FILE] [PRETRAINED_CKPT]
 
     # standalone training
     bash run_standalone_train.sh [PRETRAINED_CKPT]
+
+    On CPU:
+
+    # standalone training
+    bash run_standalone_train_cpu.sh [PRETRAINED_PATH](optional)
     ```
 
     Note:
@@ -116,27 +123,32 @@ pip install mmcv=0.2.14
     3. For large models like maskrcnn_mobilenetv1, it's better to export an external environment variable `export HCCL_CONNECT_TIMEOUT=600` to extend hccl connection checking time from the default 120 seconds to 600 seconds. Otherwise, the connection could be timeout since compiling time increases with the growth of model size.
 
 4. Execute eval script.
-   After training, you can start evaluation as follows:
 
-   ```bash
-   # Evaluation
-   bash run_eval.sh [VALIDATION_JSON_FILE] [CHECKPOINT_PATH]
-   ```
+    After training, you can start evaluation as follows:
 
-   Note:
-   1. VALIDATION_JSON_FILE is a label json file for evaluation.
+    ```bash
+    # Evaluation on Ascend
+    bash run_eval.sh [VALIDATION_JSON_FILE] [CHECKPOINT_PATH]
+
+    # Evaluation on CPU
+    bash run_eval_cpu.sh [ANN_FILE] [CHECKPOINT_PATH]
+    ```
+
+    Note:
+    1. VALIDATION_JSON_FILE is a label json file for evaluation.
 
 5. Execute inference script.
-   After training, you can start inference as follows:
 
-   ```shell
-   # inference
-   bash run_infer_310.sh [MODEL_PATH] [DATA_PATH] [ANN_FILE_PATH]
-   ```
+    After training, you can start inference as follows:
 
-   Note:
-   1. MODEL_PATH is a model file, exported by export script file.
-   2. ANN_FILE_PATH is a annotation file for inference.
+    ```shell
+    # inference
+    bash run_infer_310.sh [MODEL_PATH] [DATA_PATH] [ANN_FILE_PATH]
+    ```
+
+    Note:
+    1. MODEL_PATH is a model file, exported by export script file.
+    2. ANN_FILE_PATH is a annotation file for inference.
 
 - Running on [ModelArts](https://support.huaweicloud.com/modelarts/)
 
@@ -284,14 +296,16 @@ pip install mmcv=0.2.14
 
 ```shell
 .
-└─MaskRcnn
+└─MaskRcnn_Mobilenetv1
   ├─README.md                             # README
-  ├─ascend310_infer                       #application for 310 inference
+  ├─ascend310_infer                       # application for 310 inference
   ├─scripts                               # shell script
-    ├─run_standalone_train.sh             # training in standalone mode(1pcs)
-    ├─run_distribute_train.sh             # training in parallel mode(8 pcs)
-    ├─run_infer_310.sh                    #shell script for 310 inference
-    └─run_eval.sh                         # evaluation
+    ├─run_standalone_train.sh             # training in standalone mode on Ascend(1pcs)
+    ├─run_standalone_train_cpu.sh         # training in standalone mode on CPU(1pcs)
+    ├─run_distribute_train.sh             # training in parallel mode on Ascend(8 pcs)
+    ├─run_infer_310.sh                    # shell script for 310 inference
+    ├─run_eval_cpu.sh                     # evaluation on CPU
+    └─run_eval.sh                         # evaluation on Ascend
   ├─src
     ├─maskrcnn_mobilenetv1
       ├─__init__.py
@@ -306,11 +320,18 @@ pip install mmcv=0.2.14
       ├─mobilenetv1.py                    # backbone network
       ├─roi_align.py                      # roi align network
       └─rpn.py                            # reagion proposal network
-    ├─config.py                           # network configuration
+    ├─util.py                             # routine operation
+    ├─model_utils                         # network configuration
+      ├─__init__.py
+      ├─config.py                         # network configuration
+      ├─device_adapter.py                 # Get cloud ID
+      ├─local_adapter.py                  # Get local ID
+      ├─moxing_adapter.py                 # Parameter processing
     ├─dataset.py                          # dataset utils
     ├─lr_schedule.py                      # leanring rate geneatore
     ├─network_define.py                   # network define for maskrcnn_mobilenetv1
     └─util.py                             # routine operation
+  ├─default_config.yaml                   # default configuration settings
   ├─mindspore_hub_conf.py                 # mindspore hub interface
   ├─export.py                             #script to export AIR,MINDIR model
   ├─eval.py                               # evaluation scripts
@@ -323,11 +344,18 @@ pip install mmcv=0.2.14
 ### [Training Script Parameters](#contents)
 
 ```bash
+On Ascend:
+
 # distributed training
 Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL]
 
 # standalone training
 Usage: bash run_standalone_train.sh [PRETRAINED_MODEL]
+
+On CPU:
+
+# standalone training
+Usage: bash run_standalone_train_cpu.sh [PRETRAINED_MODEL](optional)
 ```
 
 ### [Parameters Configuration](#contents)
@@ -474,20 +502,27 @@ Usage: bash run_standalone_train.sh [PRETRAINED_MODEL]
 
 ## [Training Process](#contents)
 
-- Set options in `config.py`, including loss_scale, learning rate and network hyperparameters. Click [here](https://www.mindspore.cn/docs/programming_guide/en/master/dataset_sample.html) for more information about dataset.
+- Set options in `default_config.yaml`, including loss_scale, learning rate and network hyperparameters. Click [here](https://www.mindspore.cn/docs/programming_guide/en/master/dataset_sample.html) for more information about dataset.
 
 ### [Training](#content)
 
-- Run `run_standalone_train.sh` for non-distributed training of maskrcnn_mobilenetv1 model.
+- Run `run_standalone_train.sh` for non-distributed training of maskrcnn_mobilenetv1 model on Ascend.
 
 ```bash
 # standalone training
 bash run_standalone_train.sh [PRETRAINED_MODEL]
 ```
 
+- Run `run_standalone_train_cpu.sh` for non-distributed training of maskrcnn_mobilenetv1 model on CPU.
+
+```bash
+# standalone training
+bash run_standalone_train_cpu.sh [PRETRAINED_MODEL](optional)
+```
+
 ### [Distributed Training](#content)
 
-- Run `run_distribute_train.sh` for distributed training of Mask model.
+- Run `run_distribute_train.sh` for distributed training of Mask model on Ascend.
 
 ```bash
 bash run_distribute_train.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL]
@@ -526,7 +561,7 @@ bash run_eval.sh [VALIDATION_ANN_FILE_JSON] [CHECKPOINT_PATH]
 ```
 
 > As for the COCO2017 dataset, VALIDATION_ANN_FILE_JSON is refer to the annotations/instances_val2017.json in the dataset directory.  
-> checkpoint can be produced and saved in training process, whose folder name begins with "train/checkpoint" or "train_parallel*/checkpoint".
+> Checkpoint can be produced and saved in training process, whose folder name begins with "train/checkpoint" or "train_parallel*/checkpoint".
 
 ### [Evaluation result](#content)
 
diff --git a/model_zoo/official/cv/maskrcnn_mobilenetv1/eval.py b/model_zoo/official/cv/maskrcnn_mobilenetv1/eval.py
index 2fe4998b145..056ede03896 100644
--- a/model_zoo/official/cv/maskrcnn_mobilenetv1/eval.py
+++ b/model_zoo/official/cv/maskrcnn_mobilenetv1/eval.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2020-21 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -31,7 +31,9 @@ from src.util import coco_eval, bbox2result_1image, results2json, get_seg_masks
 
 
 set_seed(1)
-context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=get_device_id())
+context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
+if config.device_target == "Ascend":
+    context.set_context(device_id=config.device_id)
 
 def maskrcnn_eval(dataset_path, ckpt_path, ann_file):
     """MaskRcnn evaluation."""
diff --git a/model_zoo/official/cv/maskrcnn_mobilenetv1/src/dataset.py b/model_zoo/official/cv/maskrcnn_mobilenetv1/src/dataset.py
index 99e4a7d85aa..094a0856fc1 100644
--- a/model_zoo/official/cv/maskrcnn_mobilenetv1/src/dataset.py
+++ b/model_zoo/official/cv/maskrcnn_mobilenetv1/src/dataset.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2020-21 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -26,6 +26,7 @@ from numpy import random
 import mindspore.dataset as de
 import mindspore.dataset.vision.c_transforms as C
 from mindspore.mindrecord import FileWriter
+from mindspore import context
 
 from src.model_utils.config import config
 
@@ -264,7 +265,7 @@ def impad_to_multiple_column(img, img_shape, gt_bboxes, gt_label, gt_num, gt_mas
 
 def imnormalize_column(img, img_shape, gt_bboxes, gt_label, gt_num, gt_mask):
     """imnormalize operation for image"""
-    img_data = mmcv.imnormalize(img, [123.675, 116.28, 103.53], [58.395, 57.12, 57.375], True)
+    img_data = mmcv.imnormalize(img, np.array([123.675, 116.28, 103.53]), np.array([58.395, 57.12, 57.375]), True)
     img_data = img_data.astype(np.float32)
     return (img_data, img_shape, gt_bboxes, gt_label, gt_num, gt_mask)
 
@@ -284,10 +285,15 @@ def flip_column(img, img_shape, gt_bboxes, gt_label, gt_num, gt_mask):
 
 def transpose_column(img, img_shape, gt_bboxes, gt_label, gt_num, gt_mask):
     """transpose operation for image"""
+    if context.get_context("device_target") == "CPU":
+        platform_dtype = np.float32
+    else:
+        platform_dtype = np.float16
+
     img_data = img.transpose(2, 0, 1).copy()
-    img_data = img_data.astype(np.float16)
-    img_shape = img_shape.astype(np.float16)
-    gt_bboxes = gt_bboxes.astype(np.float16)
+    img_data = img_data.astype(platform_dtype)
+    img_shape = img_shape.astype(platform_dtype)
+    gt_bboxes = gt_bboxes.astype(platform_dtype)
     gt_label = gt_label.astype(np.int32)
     gt_num = gt_num.astype(np.bool)
     gt_mask_data = gt_mask.astype(np.bool)
diff --git a/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample.py b/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample.py
index 537792c79da..ae1477f51f7 100644
--- a/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample.py
+++ b/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2020-21 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@ import mindspore.nn as nn
 from mindspore.ops import operations as P
 from mindspore.common.tensor import Tensor
 import mindspore.common.dtype as mstype
+from mindspore import context
 
 
 class BboxAssignSample(nn.Cell):
@@ -79,7 +80,6 @@ class BboxAssignSample(nn.Cell):
         self.reshape = P.Reshape()
         self.equal = P.Equal()
         self.bounding_box_encode = P.BoundingBoxEncode(means=(0.0, 0.0, 0.0, 0.0), stds=(1.0, 1.0, 1.0, 1.0))
-        self.scatterNdUpdate = P.ScatterNdUpdate()
         self.scatterNd = P.ScatterNd()
         self.logicalnot = P.LogicalNot()
         self.tile = P.Tile()
@@ -93,8 +93,13 @@ class BboxAssignSample(nn.Cell):
 
         self.check_neg_mask = Tensor(np.array(np.ones(self.num_expected_neg - self.num_expected_pos), dtype=np.bool))
         self.range_pos_size = Tensor(np.arange(self.num_expected_pos).astype(np.float16))
-        self.check_gt_one = Tensor(np.array(-1 * np.ones((self.num_gts, 4)), dtype=np.float16))
-        self.check_anchor_two = Tensor(np.array(-2 * np.ones((self.num_bboxes, 4)), dtype=np.float16))
+
+        if context.get_context("device_target") == "CPU":
+            self.check_gt_one = Tensor(np.array(-1 * np.ones((self.num_gts, 4)), dtype=np.float32))
+            self.check_anchor_two = Tensor(np.array(-2 * np.ones((self.num_bboxes, 4)), dtype=np.float32))
+        else:
+            self.check_gt_one = Tensor(np.array(-1 * np.ones((self.num_gts, 4)), dtype=np.float16))
+            self.check_anchor_two = Tensor(np.array(-2 * np.ones((self.num_bboxes, 4)), dtype=np.float16))
 
 
     def construct(self, gt_bboxes_i, gt_labels_i, valid_mask, bboxes, gt_valids):
diff --git a/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py b/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py
index 8165fffa1d0..dcb31f4473b 100644
--- a/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py
+++ b/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/bbox_assign_sample_stage2.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2020-21 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@ import mindspore.nn as nn
 import mindspore.common.dtype as mstype
 from mindspore.ops import operations as P
 from mindspore.common.tensor import Tensor
+from mindspore import context
 
 class BboxAssignSampleForRcnn(nn.Cell):
     """
@@ -78,8 +79,12 @@ class BboxAssignSampleForRcnn(nn.Cell):
         self.tile = P.Tile()
 
         # Check
-        self.check_gt_one = Tensor(np.array(-1 * np.ones((self.num_gts, 4)), dtype=np.float16))
-        self.check_anchor_two = Tensor(np.array(-2 * np.ones((self.num_bboxes, 4)), dtype=np.float16))
+        if context.get_context("device_target") == "CPU":
+            self.check_gt_one = Tensor(np.array(-1 * np.ones((self.num_gts, 4)), dtype=np.float32))
+            self.check_anchor_two = Tensor(np.array(-2 * np.ones((self.num_bboxes, 4)), dtype=np.float32))
+        else:
+            self.check_gt_one = Tensor(np.array(-1 * np.ones((self.num_gts, 4)), dtype=np.float16))
+            self.check_anchor_two = Tensor(np.array(-2 * np.ones((self.num_bboxes, 4)), dtype=np.float16))
 
         # Init tensor
         self.assigned_gt_inds = Tensor(np.array(-1 * np.ones(num_bboxes), dtype=np.int32))
@@ -91,8 +96,13 @@ class BboxAssignSampleForRcnn(nn.Cell):
         self.gt_ignores = Tensor(np.array(-1 * np.ones(self.num_gts), dtype=np.int32))
         self.range_pos_size = Tensor(np.arange(self.num_expected_pos).astype(np.float16))
         self.check_neg_mask = Tensor(np.array(np.ones(self.num_expected_neg - self.num_expected_pos), dtype=np.bool))
-        self.bboxs_neg_mask = Tensor(np.zeros((self.num_expected_neg, 4), dtype=np.float16))
-        self.labels_neg_mask = Tensor(np.array(np.zeros(self.num_expected_neg), dtype=np.uint8))
+
+        if context.get_context("device_target") == "CPU":
+            self.bboxs_neg_mask = Tensor(np.zeros((self.num_expected_neg, 4), dtype=np.float32))
+            self.labels_neg_mask = Tensor(np.array(np.zeros(self.num_expected_neg), dtype=np.int32))
+        else:
+            self.bboxs_neg_mask = Tensor(np.zeros((self.num_expected_neg, 4), dtype=np.float16))
+            self.labels_neg_mask = Tensor(np.array(np.zeros(self.num_expected_neg), dtype=np.uint8))
 
         self.reshape_shape_pos = (self.num_expected_pos, 1)
         self.reshape_shape_neg = (self.num_expected_neg, 1)
diff --git a/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py b/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py
index d40413a622e..649c2ae62fa 100644
--- a/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py
+++ b/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/fpn_neck.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2020-21 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@ from mindspore.ops import operations as P
 from mindspore.common.tensor import Tensor
 from mindspore.common import dtype as mstype
 from mindspore.common.initializer import initializer
+from mindspore import context
 
 
 def bias_init_zeros(shape):
@@ -66,6 +67,10 @@ class FeatPyramidNeck(nn.Cell):
                  out_channels,
                  num_outs):
         super(FeatPyramidNeck, self).__init__()
+        if context.get_context("device_target") == "CPU":
+            self.platform_mstype = mstype.float32
+        else:
+            self.platform_mstype = mstype.float16
         self.num_outs = num_outs
         self.in_channels = in_channels
         self.fpn_layer = len(self.in_channels)
@@ -96,9 +101,9 @@ class FeatPyramidNeck(nn.Cell):
             x += (self.lateral_convs_list[i](inputs[i]),)
 
         y = (x[3],)
-        y = y + (x[2] + self.cast(self.interpolate1(y[self.fpn_layer - 4]), mstype.float16),)
-        y = y + (x[1] + self.cast(self.interpolate2(y[self.fpn_layer - 3]), mstype.float16),)
-        y = y + (x[0] + self.cast(self.interpolate3(y[self.fpn_layer - 2]), mstype.float16),)
+        y = y + (x[2] + self.cast(self.interpolate1(y[self.fpn_layer - 4]), self.platform_mstype),)
+        y = y + (x[1] + self.cast(self.interpolate2(y[self.fpn_layer - 3]), self.platform_mstype),)
+        y = y + (x[0] + self.cast(self.interpolate3(y[self.fpn_layer - 2]), self.platform_mstype),)
 
         z = ()
         for i in range(self.fpn_layer - 1, -1, -1):
diff --git a/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/mask_rcnn_mobilenetv1.py b/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/mask_rcnn_mobilenetv1.py
index 7bde4e78568..86efb268d68 100644
--- a/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/mask_rcnn_mobilenetv1.py
+++ b/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/mask_rcnn_mobilenetv1.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2020-21 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@ from mindspore.ops import operations as P
 from mindspore.common.tensor import Tensor
 import mindspore.common.dtype as mstype
 from mindspore.ops import functional as F
+from mindspore import context
 from .mobilenetv1 import MobileNetV1_FeatureSelector
 from .bbox_assign_sample_stage2 import BboxAssignSampleForRcnn
 from .fpn_neck import FeatPyramidNeck
@@ -59,16 +60,15 @@ class Mask_Rcnn_Mobilenetv1(nn.Cell):
         self.anchor_strides = config.anchor_strides
         self.target_means = tuple(config.rcnn_target_means)
         self.target_stds = tuple(config.rcnn_target_stds)
+        self.init_datatype()
 
         # Anchor generator
         anchor_base_sizes = None
-        self.anchor_base_sizes = list(
-            self.anchor_strides) if anchor_base_sizes is None else anchor_base_sizes
+        self.anchor_base_sizes = list(self.anchor_strides) if anchor_base_sizes is None else anchor_base_sizes
 
         self.anchor_generators = []
         for anchor_base in self.anchor_base_sizes:
-            self.anchor_generators.append(
-                AnchorGenerator(anchor_base, self.anchor_scales, self.anchor_ratios))
+            self.anchor_generators.append(AnchorGenerator(anchor_base, self.anchor_scales, self.anchor_ratios))
 
         self.num_anchors = len(self.anchor_ratios) * len(self.anchor_scales)
 
@@ -78,30 +78,21 @@ class Mask_Rcnn_Mobilenetv1(nn.Cell):
         self.anchor_list = self.get_anchors(featmap_sizes)
 
         # Backbone mobilenetv1
-        self.backbone = MobileNetV1_FeatureSelector(1001, features_only=True).to_float(mstype.float16)
+        self.backbone = MobileNetV1_FeatureSelector(1001, features_only=True).to_float(self.platform_mstype)
         # Fpn
-        self.fpn_ncek = FeatPyramidNeck(config.fpn_in_channels,
-                                        config.fpn_out_channels,
-                                        config.fpn_num_outs)
+        self.fpn_neck = FeatPyramidNeck(config.fpn_in_channels, config.fpn_out_channels, config.fpn_num_outs)
 
         # Rpn and rpn loss
-        self.gt_labels_stage1 = Tensor(np.ones((self.train_batch_size, config.num_gts)).astype(np.uint8))
-        self.rpn_with_loss = RPN(config,
-                                 self.train_batch_size,
-                                 config.rpn_in_channels,
-                                 config.rpn_feat_channels,
-                                 config.num_anchors,
-                                 config.rpn_cls_out_channels)
+        self.gt_labels_stage1 = Tensor(np.ones((self.train_batch_size, config.num_gts)).astype(self.int_dtype))
+
+        self.rpn_with_loss = RPN(config, self.train_batch_size, config.rpn_in_channels, config.rpn_feat_channels,
+                                 config.num_anchors, config.rpn_cls_out_channels)
 
         # Proposal
-        self.proposal_generator = Proposal(config,
-                                           self.train_batch_size,
-                                           config.activate_num_classes,
+        self.proposal_generator = Proposal(config, self.train_batch_size, config.activate_num_classes,
                                            config.use_sigmoid_cls)
         self.proposal_generator.set_train_local(config, True)
-        self.proposal_generator_test = Proposal(config,
-                                                config.test_batch_size,
-                                                config.activate_num_classes,
+        self.proposal_generator_test = Proposal(config, config.test_batch_size, config.activate_num_classes,
                                                 config.use_sigmoid_cls)
         self.proposal_generator_test.set_train_local(config, False)
 
@@ -112,40 +103,24 @@ class Mask_Rcnn_Mobilenetv1(nn.Cell):
                                           stds=self.target_stds)
 
         # Roi
-        self.roi_align = SingleRoIExtractor(config,
-                                            config.roi_layer,
-                                            config.roi_align_out_channels,
-                                            config.roi_align_featmap_strides,
-                                            self.train_batch_size,
-                                            config.roi_align_finest_scale,
-                                            mask=False)
+        self.roi_align = SingleRoIExtractor(config, config.roi_layer, config.roi_align_out_channels,
+                                            config.roi_align_featmap_strides, self.train_batch_size,
+                                            config.roi_align_finest_scale, mask=False)
         self.roi_align.set_train_local(config, True)
 
-        self.roi_align_mask = SingleRoIExtractor(config,
-                                                 config.roi_layer,
-                                                 config.roi_align_out_channels,
-                                                 config.roi_align_featmap_strides,
-                                                 self.train_batch_size,
-                                                 config.roi_align_finest_scale,
-                                                 mask=True)
+        self.roi_align_mask = SingleRoIExtractor(config, config.roi_layer, config.roi_align_out_channels,
+                                                 config.roi_align_featmap_strides, self.train_batch_size,
+                                                 config.roi_align_finest_scale, mask=True)
         self.roi_align_mask.set_train_local(config, True)
 
-        self.roi_align_test = SingleRoIExtractor(config,
-                                                 config.roi_layer,
-                                                 config.roi_align_out_channels,
-                                                 config.roi_align_featmap_strides,
-                                                 1,
-                                                 config.roi_align_finest_scale,
+        self.roi_align_test = SingleRoIExtractor(config, config.roi_layer, config.roi_align_out_channels,
+                                                 config.roi_align_featmap_strides, 1, config.roi_align_finest_scale,
                                                  mask=False)
         self.roi_align_test.set_train_local(config, False)
 
-        self.roi_align_mask_test = SingleRoIExtractor(config,
-                                                      config.roi_layer,
-                                                      config.roi_align_out_channels,
-                                                      config.roi_align_featmap_strides,
-                                                      1,
-                                                      config.roi_align_finest_scale,
-                                                      mask=True)
+        self.roi_align_mask_test = SingleRoIExtractor(config, config.roi_layer, config.roi_align_out_channels,
+                                                      config.roi_align_featmap_strides, 1,
+                                                      config.roi_align_finest_scale, mask=True)
         self.roi_align_mask_test.set_train_local(config, False)
 
         # Rcnn
@@ -176,7 +151,7 @@ class Mask_Rcnn_Mobilenetv1(nn.Cell):
 
         self.rpn_max_num = config.rpn_max_num
 
-        self.zeros_for_nms = Tensor(np.zeros((self.rpn_max_num, 3)).astype(np.float16))
+        self.zeros_for_nms = Tensor(np.zeros((self.rpn_max_num, 3)).astype(self.platform_dtype))
         self.ones_mask = np.ones((self.rpn_max_num, 1)).astype(np.bool)
         self.zeros_mask = np.zeros((self.rpn_max_num, 1)).astype(np.bool)
         self.bbox_mask = Tensor(np.concatenate((self.ones_mask, self.zeros_mask,
@@ -184,10 +159,11 @@ class Mask_Rcnn_Mobilenetv1(nn.Cell):
         self.nms_pad_mask = Tensor(np.concatenate((self.ones_mask, self.ones_mask,
                                                    self.ones_mask, self.ones_mask, self.zeros_mask), axis=1))
 
-        self.test_score_thresh = Tensor(np.ones((self.rpn_max_num, 1)).astype(np.float16) * config.test_score_thr)
-        self.test_score_zeros = Tensor(np.ones((self.rpn_max_num, 1)).astype(np.float16) * 0)
-        self.test_box_zeros = Tensor(np.ones((self.rpn_max_num, 4)).astype(np.float16) * -1)
-        self.test_iou_thr = Tensor(np.ones((self.rpn_max_num, 1)).astype(np.float16) * config.test_iou_thr)
+        self.test_score_thresh = Tensor(np.ones((self.rpn_max_num, 1)).astype(self.platform_dtype)
+                                        * config.test_score_thr)
+        self.test_score_zeros = Tensor(np.ones((self.rpn_max_num, 1)).astype(self.platform_dtype) * 0)
+        self.test_box_zeros = Tensor(np.ones((self.rpn_max_num, 4)).astype(self.platform_dtype) * -1)
+        self.test_iou_thr = Tensor(np.ones((self.rpn_max_num, 1)).astype(self.platform_dtype) * config.test_iou_thr)
         self.test_max_per_img = config.test_max_per_img
         self.nms_test = P.NMSWithMask(config.test_iou_thr)
         self.softmax = P.Softmax(axis=1)
@@ -201,42 +177,14 @@ class Mask_Rcnn_Mobilenetv1(nn.Cell):
         self.concat_end = (self.num_classes - 1)
 
         # Init tensor
-        roi_align_index = [np.array(np.ones((config.num_expected_pos_stage2 + config.num_expected_neg_stage2, 1)) * i,
-                                    dtype=np.float16) for i in range(self.train_batch_size)]
+        self.init_tensors(config)
 
-        roi_align_index_test = [np.array(np.ones((config.rpn_max_num, 1)) * i, dtype=np.float16) \
-                                for i in range(self.test_batch_size)]
-
-        self.roi_align_index_tensor = Tensor(np.concatenate(roi_align_index))
-        self.roi_align_index_test_tensor = Tensor(np.concatenate(roi_align_index_test))
-
-        roi_align_index_pos = [np.array(np.ones((config.num_expected_pos_stage2, 1)) * i,
-                                        dtype=np.float16) for i in range(self.train_batch_size)]
-        self.roi_align_index_tensor_pos = Tensor(np.concatenate(roi_align_index_pos))
-
-        self.rcnn_loss_cls_weight = Tensor(np.array(config.rcnn_loss_cls_weight).astype(np.float16))
-        self.rcnn_loss_reg_weight = Tensor(np.array(config.rcnn_loss_reg_weight).astype(np.float16))
-        self.rcnn_loss_mask_fb_weight = Tensor(np.array(config.rcnn_loss_mask_fb_weight).astype(np.float16))
-
-        self.argmax_with_value = P.ArgMaxWithValue(axis=1)
-        self.on_value = Tensor(1.0, mstype.float32)
-        self.off_value = Tensor(0.0, mstype.float32)
-        self.onehot = P.OneHot()
-        self.reducesum = P.ReduceSum()
-        self.sigmoid = P.Sigmoid()
-        self.expand_dims = P.ExpandDims()
-        self.test_mask_fb_zeros = Tensor(np.zeros((self.rpn_max_num, 28, 28)).astype(np.float16))
-        self.value = Tensor(1.0, mstype.float16)
     def construct(self, img_data, img_metas, gt_bboxes, gt_labels, gt_valids, gt_masks):
         x = self.backbone(img_data)
-        x = self.fpn_ncek(x)
+        x = self.fpn_neck(x)
 
-        rpn_loss, cls_score, bbox_pred, rpn_cls_loss, rpn_reg_loss, _ = self.rpn_with_loss(x,
-                                                                                           img_metas,
-                                                                                           self.anchor_list,
-                                                                                           gt_bboxes,
-                                                                                           self.gt_labels_stage1,
-                                                                                           gt_valids)
+        rpn_loss, cls_score, bbox_pred, rpn_cls_loss, rpn_reg_loss, _ = \
+            self.rpn_with_loss(x, img_metas, self.anchor_list, gt_bboxes, self.gt_labels_stage1, gt_valids)
 
         if self.training:
             proposal, proposal_mask = self.proposal_generator(cls_score, bbox_pred, self.anchor_list)
@@ -258,23 +206,13 @@ class Mask_Rcnn_Mobilenetv1(nn.Cell):
         if self.training:
             for i in range(self.train_batch_size):
                 gt_bboxes_i = self.squeeze(gt_bboxes[i:i + 1:1, ::])
-
-                gt_labels_i = self.squeeze(gt_labels[i:i + 1:1, ::])
-                gt_labels_i = self.cast(gt_labels_i, mstype.uint8)
-
-                gt_valids_i = self.squeeze(gt_valids[i:i + 1:1, ::])
-                gt_valids_i = self.cast(gt_valids_i, mstype.bool_)
-
-                gt_masks_i = self.squeeze(gt_masks[i:i + 1:1, ::])
-                gt_masks_i = self.cast(gt_masks_i, mstype.bool_)
+                gt_labels_i = self.cast(self.squeeze(gt_labels[i:i + 1:1, ::]), self.int_mstype)
+                gt_valids_i = self.cast(self.squeeze(gt_valids[i:i + 1:1, ::]), mstype.bool_)
+                gt_masks_i = self.cast(self.squeeze(gt_masks[i:i + 1:1, ::]), mstype.bool_)
 
                 bboxes, deltas, labels, mask, pos_bboxes, pos_mask_fb, pos_labels, pos_mask = \
-                    self.bbox_assigner_sampler_for_rcnn(gt_bboxes_i,
-                                                        gt_labels_i,
-                                                        proposal_mask[i],
-                                                        proposal[i][::, 0:4:1],
-                                                        gt_valids_i,
-                                                        gt_masks_i)
+                    self.bbox_assigner_sampler_for_rcnn(gt_bboxes_i, gt_labels_i, proposal_mask[i], \
+                                                        proposal[i][::, 0:4:1], gt_valids_i, gt_masks_i)
                 bboxes_tuple += (bboxes,)
                 deltas_tuple += (deltas,)
                 labels_tuple += (labels,)
@@ -288,14 +226,12 @@ class Mask_Rcnn_Mobilenetv1(nn.Cell):
             bbox_targets = self.concat(deltas_tuple)
             rcnn_labels = self.concat(labels_tuple)
             bbox_targets = F.stop_gradient(bbox_targets)
-            rcnn_labels = F.stop_gradient(rcnn_labels)
-            rcnn_labels = self.cast(rcnn_labels, mstype.int32)
+            rcnn_labels = self.cast(F.stop_gradient(rcnn_labels), mstype.int32)
 
             rcnn_pos_masks_fb = self.concat(pos_mask_fb_tuple)
             rcnn_pos_masks_fb = F.stop_gradient(rcnn_pos_masks_fb)
             rcnn_pos_labels = self.concat(pos_labels_tuple)
-            rcnn_pos_labels = F.stop_gradient(rcnn_pos_labels)
-            rcnn_pos_labels = self.cast(rcnn_pos_labels, mstype.int32)
+            rcnn_pos_labels = self.cast(F.stop_gradient(rcnn_pos_labels), mstype.int32)
         else:
             mask_tuple += proposal_mask
             bbox_targets = proposal_mask
@@ -316,8 +252,7 @@ class Mask_Rcnn_Mobilenetv1(nn.Cell):
                 pos_bboxes_all = pos_bboxes_tuple[0]
             rois = self.concat_1((self.roi_align_index_tensor, bboxes_all))
             pos_rois = self.concat_1((self.roi_align_index_tensor_pos, pos_bboxes_all))
-            pos_rois = self.cast(pos_rois, mstype.float32)
-            pos_rois = F.stop_gradient(pos_rois)
+            pos_rois = F.stop_gradient(self.cast(pos_rois, mstype.float32))
         else:
             if self.test_batch_size > 1:
                 bboxes_all = self.concat(bboxes_tuple)
@@ -325,24 +260,17 @@ class Mask_Rcnn_Mobilenetv1(nn.Cell):
                 bboxes_all = bboxes_tuple[0]
             rois = self.concat_1((self.roi_align_index_test_tensor, bboxes_all))
 
-        rois = self.cast(rois, mstype.float32)
-        rois = F.stop_gradient(rois)
+        rois = F.stop_gradient(self.cast(rois, mstype.float32))
 
         if self.training:
-            roi_feats = self.roi_align(rois,
-                                       self.cast(x[0], mstype.float32),
-                                       self.cast(x[1], mstype.float32),
-                                       self.cast(x[2], mstype.float32),
-                                       self.cast(x[3], mstype.float32))
+            roi_feats = self.roi_align(rois, self.cast(x[0], mstype.float32), self.cast(x[1], mstype.float32), \
+                                       self.cast(x[2], mstype.float32), self.cast(x[3], mstype.float32))
         else:
-            roi_feats = self.roi_align_test(rois,
-                                            self.cast(x[0], mstype.float32),
-                                            self.cast(x[1], mstype.float32),
-                                            self.cast(x[2], mstype.float32),
-                                            self.cast(x[3], mstype.float32))
+            roi_feats = self.roi_align_test(rois, self.cast(x[0], mstype.float32), self.cast(x[1], mstype.float32), \
+                                            self.cast(x[2], mstype.float32), self.cast(x[3], mstype.float32))
 
 
-        roi_feats = self.cast(roi_feats, mstype.float16)
+        roi_feats = self.cast(roi_feats, self.platform_mstype)
         rcnn_masks = self.concat(mask_tuple)
         rcnn_masks = F.stop_gradient(rcnn_masks)
         rcnn_mask_squeeze = self.squeeze(self.cast(rcnn_masks, mstype.bool_))
@@ -351,22 +279,15 @@ class Mask_Rcnn_Mobilenetv1(nn.Cell):
         rcnn_pos_masks = F.stop_gradient(rcnn_pos_masks)
         rcnn_pos_mask_squeeze = self.squeeze(self.cast(rcnn_pos_masks, mstype.bool_))
 
-        rcnn_cls_loss, rcnn_reg_loss = self.rcnn_cls(roi_feats,
-                                                     bbox_targets,
-                                                     rcnn_labels,
-                                                     rcnn_mask_squeeze)
+        rcnn_cls_loss, rcnn_reg_loss = self.rcnn_cls(roi_feats, bbox_targets, rcnn_labels, rcnn_mask_squeeze)
 
         output = ()
         if self.training:
-            roi_feats_mask = self.roi_align_mask(pos_rois,
-                                                 self.cast(x[0], mstype.float32),
-                                                 self.cast(x[1], mstype.float32),
-                                                 self.cast(x[2], mstype.float32),
+            roi_feats_mask = self.roi_align_mask(pos_rois, self.cast(x[0], mstype.float32),
+                                                 self.cast(x[1], mstype.float32), self.cast(x[2], mstype.float32),
                                                  self.cast(x[3], mstype.float32))
-            roi_feats_mask = self.cast(roi_feats_mask, mstype.float16)
-            rcnn_mask_fb_loss = self.rcnn_mask(roi_feats_mask,
-                                               rcnn_pos_labels,
-                                               rcnn_pos_mask_squeeze,
+            roi_feats_mask = self.cast(roi_feats_mask, self.platform_mstype)
+            rcnn_mask_fb_loss = self.rcnn_mask(roi_feats_mask, rcnn_pos_labels, rcnn_pos_mask_squeeze, \
                                                rcnn_pos_masks_fb)
 
             rcnn_loss = self.rcnn_loss_cls_weight * rcnn_cls_loss + self.rcnn_loss_reg_weight * rcnn_reg_loss + \
@@ -374,7 +295,7 @@ class Mask_Rcnn_Mobilenetv1(nn.Cell):
             output += (rpn_loss, rcnn_loss, rpn_cls_loss, rpn_reg_loss, rcnn_cls_loss, rcnn_reg_loss, rcnn_mask_fb_loss)
         else:
             mask_fb_pred_all = self.rcnn_mask_test(x, bboxes_all, rcnn_cls_loss, rcnn_reg_loss)
-            output = self.get_det_bboxes(rcnn_cls_loss, rcnn_reg_loss, rcnn_masks, bboxes_all,
+            output = self.get_det_bboxes(rcnn_cls_loss, rcnn_reg_loss, rcnn_masks, bboxes_all, \
                                          img_metas, mask_fb_pred_all)
 
         return output
@@ -526,7 +447,7 @@ class Mask_Rcnn_Mobilenetv1(nn.Cell):
         for i in range(num_levels):
             anchors = self.anchor_generators[i].grid_anchors(
                 featmap_sizes[i], self.anchor_strides[i])
-            multi_level_anchors += (Tensor(anchors.astype(np.float16)),)
+            multi_level_anchors += (Tensor(anchors.astype(self.platform_dtype)),)
 
         return multi_level_anchors
 
@@ -543,7 +464,7 @@ class Mask_Rcnn_Mobilenetv1(nn.Cell):
         for i in range(self.test_batch_size):
             cls_score_max_index, _ = self.argmax_with_value(cls_scores_all[i])
             cls_score_max_index = self.cast(self.onehot(cls_score_max_index, self.num_classes,
-                                                        self.on_value, self.off_value), mstype.float16)
+                                                        self.on_value, self.off_value), self.platform_mstype)
             cls_score_max_index = self.expand_dims(cls_score_max_index, -1)
             cls_score_max_index = self.tile(cls_score_max_index, (1, 1, 4))
             reg_pred_max = reg_pred_all[i] * cls_score_max_index
@@ -559,6 +480,47 @@ class Mask_Rcnn_Mobilenetv1(nn.Cell):
                                                        self.cast(x[1], mstype.float32),
                                                        self.cast(x[2], mstype.float32),
                                                        self.cast(x[3], mstype.float32))
-        roi_feats_mask_test = self.cast(roi_feats_mask_test, mstype.float16)
+        roi_feats_mask_test = self.cast(roi_feats_mask_test, self.platform_mstype)
         mask_fb_pred_all = self.rcnn_mask(roi_feats_mask_test)
         return mask_fb_pred_all
+
+    def init_datatype(self):
+        self.platform = context.get_context("device_target")
+        if self.platform == "CPU":
+            self.platform_dtype = np.float32
+            self.platform_mstype = mstype.float32
+            self.int_dtype = np.int32
+            self.int_mstype = mstype.int32
+        else:
+            self.platform_dtype = np.float16
+            self.platform_mstype = mstype.float16
+            self.int_dtype = np.uint8
+            self.int_mstype = mstype.uint8
+
+    def init_tensors(self, config):
+        roi_align_index = [np.array(np.ones((config.num_expected_pos_stage2 + config.num_expected_neg_stage2, 1)) * i,
+                                    dtype=self.platform_dtype) for i in range(self.train_batch_size)]
+
+        roi_align_index_test = [np.array(np.ones((config.rpn_max_num, 1)) * i, dtype=self.platform_dtype) \
+                                for i in range(self.test_batch_size)]
+
+        self.roi_align_index_tensor = Tensor(np.concatenate(roi_align_index))
+        self.roi_align_index_test_tensor = Tensor(np.concatenate(roi_align_index_test))
+
+        roi_align_index_pos = [np.array(np.ones((config.num_expected_pos_stage2, 1)) * i,
+                                        dtype=self.platform_dtype) for i in range(self.train_batch_size)]
+        self.roi_align_index_tensor_pos = Tensor(np.concatenate(roi_align_index_pos))
+
+        self.rcnn_loss_cls_weight = Tensor(np.array(config.rcnn_loss_cls_weight).astype(self.platform_dtype))
+        self.rcnn_loss_reg_weight = Tensor(np.array(config.rcnn_loss_reg_weight).astype(self.platform_dtype))
+        self.rcnn_loss_mask_fb_weight = Tensor(np.array(config.rcnn_loss_mask_fb_weight).astype(self.platform_dtype))
+
+        self.argmax_with_value = P.ArgMaxWithValue(axis=1)
+        self.on_value = Tensor(1.0, mstype.float32)
+        self.off_value = Tensor(0.0, mstype.float32)
+        self.onehot = P.OneHot()
+        self.reducesum = P.ReduceSum()
+        self.sigmoid = P.Sigmoid()
+        self.expand_dims = P.ExpandDims()
+        self.test_mask_fb_zeros = Tensor(np.zeros((self.rpn_max_num, 28, 28)).astype(self.platform_dtype))
+        self.value = Tensor(1.0, self.platform_mstype)
diff --git a/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/proposal_generator.py b/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/proposal_generator.py
index 3c7ae5f7d93..d32223cdb55 100644
--- a/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/proposal_generator.py
+++ b/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/proposal_generator.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2020-21 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@ import mindspore.nn as nn
 import mindspore.common.dtype as mstype
 from mindspore.ops import operations as P
 from mindspore import Tensor
-
+from mindspore import context
 
 class Proposal(nn.Cell):
     """
@@ -104,6 +104,8 @@ class Proposal(nn.Cell):
 
         self.multi_10 = Tensor(10.0, mstype.float16)
 
+        self.platform = context.get_context("device_target")
+
     def set_train_local(self, config, training=True):
         """Set training flag."""
         self.training_local = training
@@ -174,6 +176,10 @@ class Proposal(nn.Cell):
             proposals_decode = self.decode(anchors_sorted, bboxes_sorted)
 
             proposals_decode = self.concat_axis1((proposals_decode, self.reshape(scores_sorted, self.topK_shape[idx])))
+
+            if self.platform == "CPU":
+                proposals_decode = self.cast(proposals_decode, mstype.float32)
+
             proposals, _, mask_valid = self.nms(proposals_decode)
 
             mlvl_proposals = mlvl_proposals + (proposals,)
@@ -184,7 +190,10 @@ class Proposal(nn.Cell):
 
         _, _, _, _, scores = self.split(proposals)
         scores = self.squeeze(scores)
-        topk_mask = self.cast(self.topK_mask, mstype.float16)
+        if self.platform == "CPU":
+            topk_mask = self.cast(self.topK_mask, mstype.float32)
+        else:
+            topk_mask = self.cast(self.topK_mask, mstype.float16)
         scores_using = self.select(masks, scores, topk_mask)
 
         _, topk_inds = self.topKv2(scores_using, self.max_num)
diff --git a/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rcnn_cls.py b/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rcnn_cls.py
index d96c2461632..6b35ab3222e 100644
--- a/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rcnn_cls.py
+++ b/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rcnn_cls.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2020-21 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@ from mindspore.ops import operations as P
 from mindspore.common.tensor import Tensor
 from mindspore.common.initializer import initializer
 from mindspore.common.parameter import Parameter
+from mindspore import context
 
 class DenseNoTranpose(nn.Cell):
     """Dense method"""
@@ -40,20 +41,25 @@ class FpnCls(nn.Cell):
     """dense layer of classification and box head"""
     def __init__(self, input_channels, output_channels, num_classes, pool_size):
         super(FpnCls, self).__init__()
+        if context.get_context("device_target") == "CPU":
+            self.platform_mstype = mstype.float32
+        else:
+            self.platform_mstype = mstype.float16
         representation_size = input_channels * pool_size * pool_size
         shape_0 = (output_channels, representation_size)
         weights_0 = initializer("XavierUniform", shape=shape_0[::-1], dtype=mstype.float32)
         shape_1 = (output_channels, output_channels)
         weights_1 = initializer("XavierUniform", shape=shape_1[::-1], dtype=mstype.float32)
-        self.shared_fc_0 = DenseNoTranpose(representation_size, output_channels, weights_0).to_float(mstype.float16)
-        self.shared_fc_1 = DenseNoTranpose(output_channels, output_channels, weights_1).to_float(mstype.float16)
+        self.shared_fc_0 = DenseNoTranpose(representation_size, output_channels, weights_0) \
+                           .to_float(self.platform_mstype)
+        self.shared_fc_1 = DenseNoTranpose(output_channels, output_channels, weights_1).to_float(self.platform_mstype)
 
         cls_weight = initializer('Normal', shape=[num_classes, output_channels][::-1],
                                  dtype=mstype.float32)
         reg_weight = initializer('Normal', shape=[num_classes * 4, output_channels][::-1],
                                  dtype=mstype.float32)
-        self.cls_scores = DenseNoTranpose(output_channels, num_classes, cls_weight).to_float(mstype.float16)
-        self.reg_scores = DenseNoTranpose(output_channels, num_classes * 4, reg_weight).to_float(mstype.float16)
+        self.cls_scores = DenseNoTranpose(output_channels, num_classes, cls_weight).to_float(self.platform_mstype)
+        self.reg_scores = DenseNoTranpose(output_channels, num_classes * 4, reg_weight).to_float(self.platform_mstype)
 
         self.relu = P.ReLU()
         self.flatten = P.Flatten()
@@ -99,8 +105,10 @@ class RcnnCls(nn.Cell):
                  ):
         super(RcnnCls, self).__init__()
         cfg = config
-        self.rcnn_loss_cls_weight = Tensor(np.array(cfg.rcnn_loss_cls_weight).astype(np.float16))
-        self.rcnn_loss_reg_weight = Tensor(np.array(cfg.rcnn_loss_reg_weight).astype(np.float16))
+        if context.get_context("device_target") == "CPU":
+            self.platform_mstype = mstype.float32
+        else:
+            self.platform_mstype = mstype.float16
         self.rcnn_fc_out_channels = cfg.rcnn_fc_out_channels
         self.target_means = target_means
         self.target_stds = target_stds
@@ -128,7 +136,6 @@ class RcnnCls(nn.Cell):
 
         self.on_value = Tensor(1.0, mstype.float32)
         self.off_value = Tensor(0.0, mstype.float32)
-        self.value = Tensor(1.0, mstype.float16)
 
         self.num_bboxes = (cfg.num_expected_pos_stage2 + cfg.num_expected_neg_stage2) * batch_size
 
@@ -143,7 +150,8 @@ class RcnnCls(nn.Cell):
 
         if self.training:
             bbox_weights = self.cast(self.logicaland(self.greater(labels, 0), mask), mstype.int32) * labels
-            labels = self.cast(self.onehot(labels, self.num_classes, self.on_value, self.off_value), mstype.float16)
+            labels = self.onehot(labels, self.num_classes, self.on_value, self.off_value)
+            labels = self.cast(labels, self.platform_mstype)
             bbox_targets = self.tile(self.expandims(bbox_targets, 1), (1, self.num_classes, 1))
 
             loss_cls, loss_reg = self.loss(x_cls, x_reg,
@@ -160,13 +168,13 @@ class RcnnCls(nn.Cell):
         """Loss method."""
         # loss_cls
         loss_cls, _ = self.loss_cls(cls_score, labels)
-        weights = self.cast(weights, mstype.float16)
+        weights = self.cast(weights, self.platform_mstype)
         loss_cls = loss_cls * weights
         loss_cls = self.sum_loss(loss_cls, (0,)) / self.sum_loss(weights, (0,))
 
         # loss_reg
         bbox_weights = self.cast(self.onehot(bbox_weights, self.num_classes, self.on_value, self.off_value),
-                                 mstype.float16)
+                                 self.platform_mstype)
         bbox_weights = bbox_weights * self.rmv_first_tensor   #  * self.rmv_first_tensor  exclude background
         pos_bbox_pred = self.reshape(bbox_pred, (self.num_bboxes, -1, 4))
         loss_reg = self.loss_bbox(pos_bbox_pred, bbox_targets)
diff --git a/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rcnn_mask.py b/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rcnn_mask.py
index 08e4f9c3e6d..93cc2b9ef41 100644
--- a/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rcnn_mask.py
+++ b/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rcnn_mask.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2020-21 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@ import mindspore.nn as nn
 from mindspore.ops import operations as P
 from mindspore.common.tensor import Tensor
 from mindspore.common.initializer import initializer
+from mindspore import context
 
 def _conv(in_channels, out_channels, kernel_size=1, stride=1, padding=0, pad_mode='pad'):
     """Conv2D wrapper."""
@@ -45,27 +46,32 @@ class FpnMask(nn.Cell):
     """conv layers of mask head"""
     def __init__(self, input_channels, output_channels, num_classes):
         super(FpnMask, self).__init__()
+        self.platform = context.get_context("device_target")
+        if self.platform == "CPU":
+            self.platform_mstype = mstype.float32
+        else:
+            self.platform_mstype = mstype.float16
         self.mask_conv1 = _conv(input_channels, output_channels, kernel_size=3,
-                                pad_mode="same").to_float(mstype.float16)
+                                pad_mode="same").to_float(self.platform_mstype)
         self.mask_relu1 = P.ReLU()
 
         self.mask_conv2 = _conv(output_channels, output_channels, kernel_size=3,
-                                pad_mode="same").to_float(mstype.float16)
+                                pad_mode="same").to_float(self.platform_mstype)
         self.mask_relu2 = P.ReLU()
 
         self.mask_conv3 = _conv(output_channels, output_channels, kernel_size=3,
-                                pad_mode="same").to_float(mstype.float16)
+                                pad_mode="same").to_float(self.platform_mstype)
         self.mask_relu3 = P.ReLU()
 
         self.mask_conv4 = _conv(output_channels, output_channels, kernel_size=3,
-                                pad_mode="same").to_float(mstype.float16)
+                                pad_mode="same").to_float(self.platform_mstype)
         self.mask_relu4 = P.ReLU()
 
         self.mask_deconv5 = _convTanspose(output_channels, output_channels, kernel_size=2,
-                                          stride=2, pad_mode="valid").to_float(mstype.float16)
+                                          stride=2, pad_mode="valid").to_float(self.platform_mstype)
         self.mask_relu5 = P.ReLU()
         self.mask_conv6 = _conv(output_channels, num_classes, kernel_size=1, stride=1,
-                                pad_mode="valid").to_float(mstype.float16)
+                                pad_mode="valid").to_float(self.platform_mstype)
 
     def construct(self, x):
         x = self.mask_conv1(x)
@@ -114,6 +120,11 @@ class RcnnMask(nn.Cell):
                  ):
         super(RcnnMask, self).__init__()
         cfg = config
+        self.platform = context.get_context("device_target")
+        if self.platform == "CPU":
+            self.platform_mstype = mstype.float32
+        else:
+            self.platform_mstype = mstype.float16
         self.rcnn_loss_mask_fb_weight = Tensor(np.array(cfg.rcnn_loss_mask_fb_weight).astype(np.float16))
         self.rcnn_mask_out_channels = cfg.rcnn_mask_out_channels
         self.target_means = target_means
@@ -130,7 +141,7 @@ class RcnnMask(nn.Cell):
         self.cast = P.Cast()
         self.sum_loss = P.ReduceSum()
         self.tile = P.Tile()
-        self.expandims = P.ExpandDims()
+        self.expanddims = P.ExpandDims()
 
         self.on_value = Tensor(1.0, mstype.float32)
         self.off_value = Tensor(0.0, mstype.float32)
@@ -140,13 +151,14 @@ class RcnnMask(nn.Cell):
         rmv_first[:, 0] = np.zeros((self.num_bboxes,))
         self.rmv_first_tensor = Tensor(rmv_first.astype(np.float16))
         self.mean_loss = P.ReduceMean()
+        self.maximum = P.Maximum()
 
     def construct(self, mask_featuremap, labels=None, mask=None, mask_fb_targets=None):
         x_mask_fb = self.fpn_mask(mask_featuremap)
 
         if self.training:
             bbox_weights = self.cast(self.logicaland(self.greater(labels, 0), mask), mstype.int32) * labels
-            mask_fb_targets = self.tile(self.expandims(mask_fb_targets, 1), (1, self.num_classes, 1, 1))
+            mask_fb_targets = self.tile(self.expanddims(mask_fb_targets, 1), (1, self.num_classes, 1, 1))
 
             loss_mask_fb = self.loss(x_mask_fb, bbox_weights, mask, mask_fb_targets)
             out = loss_mask_fb
@@ -158,17 +170,21 @@ class RcnnMask(nn.Cell):
 
     def loss(self, masks_fb_pred, bbox_weights, weights, masks_fb_targets):
         """Loss method."""
-        weights = self.cast(weights, mstype.float16)
+        weights = self.cast(weights, self.platform_mstype)
         bbox_weights = self.cast(self.onehot(bbox_weights, self.num_classes, self.on_value, self.off_value),
-                                 mstype.float16)
+                                 self.platform_mstype)
         bbox_weights = bbox_weights * self.rmv_first_tensor   #  * self.rmv_first_tensor  exclude background
 
         # loss_mask_fb
-        masks_fb_targets = self.cast(masks_fb_targets, mstype.float16)
+        masks_fb_targets = self.cast(masks_fb_targets, self.platform_mstype)
         loss_mask_fb = self.loss_mask(masks_fb_pred, masks_fb_targets)
         loss_mask_fb = self.mean_loss(loss_mask_fb, (2, 3))
         loss_mask_fb = loss_mask_fb * bbox_weights
-        loss_mask_fb = loss_mask_fb / self.sum_loss(weights, (0,))
+        if self.platform == "CPU":
+            sum_weight = self.sum_loss(weights, (0,))
+            loss_mask_fb = loss_mask_fb / self.maximum(self.expanddims(sum_weight, 0), 1)
+        else:
+            loss_mask_fb = loss_mask_fb / self.sum_loss(weights, (0,))
         loss_mask_fb = self.sum_loss(loss_mask_fb, (0, 1))
 
         return loss_mask_fb
diff --git a/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rpn.py b/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rpn.py
index b7effb3d1bb..5ab88584c5c 100644
--- a/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rpn.py
+++ b/model_zoo/official/cv/maskrcnn_mobilenetv1/src/maskrcnn_mobilenetv1/rpn.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2020-21 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@ from mindspore.ops import operations as P
 from mindspore import Tensor
 from mindspore.ops import functional as F
 from mindspore.common.initializer import initializer
+from mindspore import context
 from .bbox_assign_sample import BboxAssignSample
 
 
@@ -100,6 +101,10 @@ class RPN(nn.Cell):
                  cls_out_channels):
         super(RPN, self).__init__()
         cfg_rpn = config
+        if context.get_context("device_target") == "CPU":
+            self.platform_mstype = mstype.float32
+        else:
+            self.platform_mstype = mstype.float16
         self.num_bboxes = cfg_rpn.num_bboxes
         self.slice_index = ()
         self.feature_anchor_shape = ()
@@ -180,7 +185,7 @@ class RPN(nn.Cell):
         for i in range(num_layers):
             rpn_layer.append(RpnRegClsBlock(in_channels, feat_channels, num_anchors, cls_out_channels, \
                                             weight_conv, bias_conv, weight_cls, \
-                                            bias_cls, weight_reg, bias_reg).to_float(mstype.float16))
+                                            bias_cls, weight_reg, bias_reg).to_float(self.platform_mstype))
 
         for i in range(1, num_layers):
             rpn_layer[i].rpn_conv.weight = rpn_layer[0].rpn_conv.weight
@@ -248,9 +253,9 @@ class RPN(nn.Cell):
                                                                                            mstype.bool_),
                                                                                  anchor_using_list, gt_valids_i)
 
-                bbox_weight = self.cast(bbox_weight, mstype.float16)
-                label = self.cast(label, mstype.float16)
-                label_weight = self.cast(label_weight, mstype.float16)
+                bbox_weight = self.cast(bbox_weight, self.platform_mstype)
+                label = self.cast(label, self.platform_mstype)
+                label_weight = self.cast(label_weight, self.platform_mstype)
 
                 for j in range(self.num_layers):
                     begin = self.slice_index[j]
diff --git a/model_zoo/official/cv/maskrcnn_mobilenetv1/src/network_define.py b/model_zoo/official/cv/maskrcnn_mobilenetv1/src/network_define.py
index 4c5b4a89b45..45e2773bcc3 100644
--- a/model_zoo/official/cv/maskrcnn_mobilenetv1/src/network_define.py
+++ b/model_zoo/official/cv/maskrcnn_mobilenetv1/src/network_define.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2020-21 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@ from mindspore.ops import composite as C
 from mindspore import ParameterTuple
 from mindspore.train.callback import Callback
 from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
+from mindspore import context
 from src.maskrcnn_mobilenetv1.mask_rcnn_mobilenetv1 import Mask_Rcnn_Mobilenetv1
 
 time_stamp_init = False
@@ -97,6 +98,8 @@ class LossCallBack(Callback):
             time_stamp_current = time.time()
             total_loss = self.loss_sum/self.count
 
+            print("%lu epoch: %s step: %s total_loss: %.5f" %
+                  (time_stamp_current - time_stamp_first, cb_params.cur_epoch_num, cur_step_in_epoch, total_loss))
             loss_file = open("./loss_{}.log".format(self.rank_id), "a+")
             loss_file.write("%lu epoch: %s step: %s total_loss: %.5f" %
                             (time_stamp_current - time_stamp_first, cb_params.cur_epoch_num, cur_step_in_epoch,
@@ -164,7 +167,10 @@ class TrainOneStepCell(nn.Cell):
         self.optimizer = optimizer
         self.grad = C.GradOperation(get_by_list=True,
                                     sens_param=True)
-        self.sens = Tensor((np.ones((1,)) * sens).astype(np.float16))
+        if context.get_context("device_target") == "CPU":
+            self.sens = Tensor((np.ones((1,)) * sens).astype(np.float32))
+        else:
+            self.sens = Tensor((np.ones((1,)) * sens).astype(np.float16))
         self.reduce_flag = reduce_flag
         self.hyper_map = C.HyperMap()
         if reduce_flag:
diff --git a/model_zoo/official/cv/maskrcnn_mobilenetv1/train.py b/model_zoo/official/cv/maskrcnn_mobilenetv1/train.py
index d073cad3b56..22f6615eb5a 100644
--- a/model_zoo/official/cv/maskrcnn_mobilenetv1/train.py
+++ b/model_zoo/official/cv/maskrcnn_mobilenetv1/train.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2020-21 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -96,13 +96,15 @@ def modelarts_pre_process():
     config.pre_trained = os.path.join(config.output_path, config.pre_trained)
 
 
-context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=get_device_id())
+context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
+if config.device_target == "Ascend":
+    context.set_context(device_id=config.device_id)
 
 @moxing_wrapper(pre_process=modelarts_pre_process)
 def train_maskrcnn_mobilenetv1():
     config.mindrecord_dir = os.path.join(config.coco_root, config.mindrecord_dir)
     print('config:\n', config)
-    print("Start train for maskrcnn_mobilenetv1!")
+    print("Start training for maskrcnn_mobilenetv1!")
     if not config.do_eval and config.run_distribute:
         rank = get_rank_id()
         device_num = get_device_num()
diff --git a/model_zoo/official/cv/mobilenetv2_quant/scripts/run_infer_310.sh b/model_zoo/official/cv/mobilenetv2_quant/scripts/run_infer_310.sh
index d1e16bbcee0..928e828053e 100644
--- a/model_zoo/official/cv/mobilenetv2_quant/scripts/run_infer_310.sh
+++ b/model_zoo/official/cv/mobilenetv2_quant/scripts/run_infer_310.sh
@@ -49,9 +49,10 @@ if [ -d ${ASCEND_HOME}/ascend-toolkit ]; then
     export PYTHONPATH=${TBE_IMPL_PATH}:$ASCEND_HOME/ascend-toolkit/latest/fwkacllib/python/site-packages:$PYTHONPATH
     export ASCEND_OPP_PATH=$ASCEND_HOME/ascend-toolkit/latest/opp
 else
-    export PATH=$ASCEND_HOME/atc/ccec_compiler/bin:$ASCEND_HOME/atc/bin:$PATH
-    export LD_LIBRARY_PATH=/usr/local/lib:$ASCEND_HOME/atc/lib64:$ASCEND_HOME/acllib/lib64:$ASCEND_HOME/driver/lib64:$ASCEND_HOME/add-ons:$LD_LIBRARY_PATH
-    export PYTHONPATH=$ASCEND_HOME/atc/python/site-packages:$PYTHONPATH
+    export PATH=$ASCEND_HOME/fwkacllib/ccec_compiler/bin:$ASCEND_HOME/fwkacllib/bin:$PATH
+    export LD_LIBRARY_PATH=/usr/local/lib:$ASCEND_HOME/fwkacllib/lib64:$ASCEND_HOME/driver/lib64:$LD_LIBRARY_PATH
+    export TBE_IMPL_PATH=$ASCEND_HOME/opp/op_impl/built-in/ai_core/tbe
+    export PYTHONPATH=$PYTHONPATH:$TBE_IMPL_PATH
     export ASCEND_OPP_PATH=$ASCEND_HOME/opp
 fi
 
@@ -104,4 +105,4 @@ cal_acc
 if [ $? -ne 0 ]; then
     echo "calculate accuracy failed"
     exit 1
-fi
\ No newline at end of file
+fi
diff --git a/model_zoo/official/cv/openpose/README.md b/model_zoo/official/cv/openpose/README.md
index 077d62954d0..2bca04112c7 100644
--- a/model_zoo/official/cv/openpose/README.md
+++ b/model_zoo/official/cv/openpose/README.md
@@ -79,7 +79,7 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil
 - Framework
     - [MindSpore](https://www.mindspore.cn/install/en)
 - Download the VGG19 model of the MindSpore version:
-    - vgg19-0-97_5004.ckpt
+    - [vgg19-0-97_5004.ckpt](https://download.mindspore.cn/model_zoo/converted_pretrained/vgg/vgg19-0-97_5004.ckpt)
 - For more information, please check the resources below：
     - [MindSpore Tutorials](https://www.mindspore.cn/tutorials/en/master/index.html)
     - [MindSpore Python API](https://www.mindspore.cn/docs/api/en/master/index.html)
diff --git a/model_zoo/official/cv/psenet/README.md b/model_zoo/official/cv/psenet/README.md
index a8654fc34b1..efccccaa459 100644
--- a/model_zoo/official/cv/psenet/README.md
+++ b/model_zoo/official/cv/psenet/README.md
@@ -85,8 +85,9 @@ wget https://github.com/opencv/opencv/archive/3.4.9.zip
 unzip 3.4.9.zip
 cd opencv-3.4.9
 mkdir build
+cd build
 cmake -D CMAKE_BUILD_TYPE=Release -D CMAKE_INSTALL_PREFIX=/usr/local -D WITH_WEBP=OFF ..
-make -j4 # -j指定线程数，用户根据机器配置修改参数
+make -j4 # -j specifies the number of threads, the user can modify the parameters according to the machine configuration
 make install
 
 # export environment variables
diff --git a/model_zoo/official/cv/psenet/README_CN.md b/model_zoo/official/cv/psenet/README_CN.md
index 770e9872bac..18c54414ef3 100644
--- a/model_zoo/official/cv/psenet/README_CN.md
+++ b/model_zoo/official/cv/psenet/README_CN.md
@@ -85,6 +85,7 @@ wget https://github.com/opencv/opencv/archive/3.4.9.zip
 unzip 3.4.9.zip
 cd opencv-3.4.9
 mkdir build
+cd build
 cmake -D CMAKE_BUILD_TYPE=Release -D CMAKE_INSTALL_PREFIX=/usr/local -D WITH_WEBP=OFF ..
 make -j4 # -j指定线程数，用户根据机器配置修改参数
 make install
diff --git a/model_zoo/official/cv/psenet/postprocess.py b/model_zoo/official/cv/psenet/postprocess.py
index 7df75d72ef0..b51b6d18873 100644
--- a/model_zoo/official/cv/psenet/postprocess.py
+++ b/model_zoo/official/cv/psenet/postprocess.py
@@ -62,8 +62,7 @@ if __name__ == "__main__":
     for k in file_list:
         if os.path.splitext(k)[-1].lower() in ['.jpg', '.jpeg', '.png']:
             img_path = os.path.join(config.img_path, k)
-            img = get_img(img_path).reshape(1, 720, 1280, 3)
-            img = img[0].astype(np.uint8).copy()
+            img = get_img(img_path).astype(np.uint8).copy()
             img_name = os.path.split(img_path)[-1]
 
             score = np.fromfile(os.path.join(config.result_path, k.split('.')[0] + '_0.bin'), np.float32)
diff --git a/model_zoo/official/cv/psenet/src/network_define.py b/model_zoo/official/cv/psenet/src/network_define.py
index 3f55a996903..09ffe610209 100644
--- a/model_zoo/official/cv/psenet/src/network_define.py
+++ b/model_zoo/official/cv/psenet/src/network_define.py
@@ -23,6 +23,7 @@ from mindspore import ParameterTuple
 from mindspore.common.tensor import Tensor
 from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
 from mindspore.ops import composite as C
+from mindspore.ops import functional as F
 from mindspore.train.callback import Callback
 
 __all__ = ['LossCallBack', 'WithLossCell', 'TrainOneStepCell']
@@ -143,5 +144,4 @@ class TrainOneStepCell(nn.Cell):
         grads = self.grad(self.network, weights)(img, gt_text, gt_kernels, training_mask, self.sens)
         if self.reducer_flag:
             grads = self.grad_reducer(grads)
-        self.optimizer(grads)
-        return loss
+        return F.depend(loss, self.optimizer(grads))
diff --git a/model_zoo/official/cv/resnet/resnet101_imagenet2012_config.yaml b/model_zoo/official/cv/resnet/resnet101_imagenet2012_config.yaml
deleted file mode 100644
index 0ce8e0161d0..00000000000
--- a/model_zoo/official/cv/resnet/resnet101_imagenet2012_config.yaml
+++ /dev/null
@@ -1,80 +0,0 @@
-# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
-enable_modelarts: False
-# Url for modelarts
-data_url: ""
-train_url: ""
-checkpoint_url: ""
-# Path for local
-run_distribute: False
-enable_profiling: False
-data_path: "/cache/data"
-output_path: "/cache/train"
-load_path: "/cache/checkpoint_path/"
-device_target: "Ascend"
-checkpoint_path: "./checkpoint/"
-checkpoint_file_path: ""
-
-# ==============================================================================
-# Training options
-optimizer: "Momentum"
-infer_label: ""
-class_num: 1001
-batch_size: 32
-loss_scale: 1024
-momentum: 0.9
-weight_decay: 0.0001
-epoch_size: 120
-pretrain_epoch_size: 0
-save_checkpoint: True
-save_checkpoint_epochs: 5
-keep_checkpoint_max: 10
-warmup_epochs: 0
-lr_decay_mode: "cosine"
-use_label_smooth: True
-label_smooth_factor: 0.1
-lr: 0.1
-
-net_name: "resnet101"
-dataset: "imagenet2012"
-device_num: 1
-pre_trained: ""
-run_eval: False
-eval_dataset_path: ""
-parameter_server: False
-filter_weight: False
-save_best_ckpt: True
-eval_start_epoch: 40
-eval_interval: 1
-enable_cache: False
-cache_session_id: ""
-mode_name: "GRAPH"
-acc_mode: "O0"
-all_reduce_fusion_config:
-    - 2
-    - 60
-    - 220
-
-# Export options
-device_id: 0
-width: 224
-height: 224
-file_name: "resnet101"
-file_format: "AIR"
-ckpt_file: ""
-network_dataset: "resnet101_imagenet2012"
-
----
-# Help description for each configuration
-enable_modelarts: "Whether training on modelarts, default: False"
-data_url: "Dataset url for obs"
-checkpoint_url: "The location of checkpoint for obs"
-data_path: "Dataset path for local"
-output_path: "Training output path for local"
-load_path: "The location of checkpoint for obs"
-device_target: "Target device type, available: [Ascend, GPU, CPU]"
-enable_profiling: "Whether enable profiling while training, default: False"
-num_classes: "Class for dataset"
-batch_size: "Batch size for training and evaluation"
-epoch_size: "Total training epochs."
-checkpoint_path: "The location of the checkpoint file."
-checkpoint_file_path: "The location of the checkpoint file."
diff --git a/model_zoo/official/cv/resnet/resnet18_cifar10_config.yaml b/model_zoo/official/cv/resnet/resnet18_cifar10_config.yaml
deleted file mode 100644
index e164bffd506..00000000000
--- a/model_zoo/official/cv/resnet/resnet18_cifar10_config.yaml
+++ /dev/null
@@ -1,76 +0,0 @@
-# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
-enable_modelarts: False
-# Url for modelarts
-data_url: ""
-train_url: ""
-checkpoint_url: ""
-# Path for local
-run_distribute: False
-enable_profiling: False
-data_path: "/cache/data"
-output_path: "/cache/train"
-load_path: "/cache/checkpoint_path/"
-device_target: "Ascend"
-checkpoint_path: "./checkpoint/"
-checkpoint_file_path: ""
-
-# ==============================================================================
-# Training options
-optimizer: "Momentum"
-infer_label: ""
-class_num: 10
-batch_size: 32
-loss_scale: 1024
-momentum: 0.9
-weight_decay: 0.0001
-epoch_size: 90
-pretrain_epoch_size: 0
-save_checkpoint: True
-save_checkpoint_epochs: 5
-keep_checkpoint_max: 10
-warmup_epochs: 5
-lr_decay_mode: "poly"
-lr_init: 0.01
-lr_end: 0.00001
-lr_max: 0.1
-
-net_name: "resnet18"
-dataset: "cifar10"
-device_num: 1
-pre_trained: ""
-run_eval: False
-eval_dataset_path: ""
-parameter_server: False
-filter_weight: False
-save_best_ckpt: True
-eval_start_epoch: 40
-eval_interval: 1
-enable_cache: False
-cache_session_id: ""
-mode_name: "GRAPH"
-acc_mode: "O0"
-
-# Export options
-device_id: 0
-width: 224
-height: 224
-file_name: "resnet18"
-file_format: "AIR"
-ckpt_file: ""
-network_dataset: "resnet18_cifar10"
-
----
-# Help description for each configuration
-enable_modelarts: "Whether training on modelarts, default: False"
-data_url: "Dataset url for obs"
-checkpoint_url: "The location of checkpoint for obs"
-data_path: "Dataset path for local"
-output_path: "Training output path for local"
-load_path: "The location of checkpoint for obs"
-device_target: "Target device type, available: [Ascend, GPU, CPU]"
-enable_profiling: "Whether enable profiling while training, default: False"
-num_classes: "Class for dataset"
-batch_size: "Batch size for training and evaluation"
-epoch_size: "Total training epochs."
-checkpoint_path: "The location of the checkpoint file."
-checkpoint_file_path: "The location of the checkpoint file."
diff --git a/model_zoo/official/cv/resnet/resnet18_imagenet2012_config.yaml b/model_zoo/official/cv/resnet/resnet18_imagenet2012_config.yaml
deleted file mode 100644
index 92c66f238a2..00000000000
--- a/model_zoo/official/cv/resnet/resnet18_imagenet2012_config.yaml
+++ /dev/null
@@ -1,78 +0,0 @@
-# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
-enable_modelarts: False
-# Url for modelarts
-data_url: ""
-train_url: ""
-checkpoint_url: ""
-# Path for local
-run_distribute: False
-enable_profiling: False
-data_path: "/cache/data"
-output_path: "/cache/train"
-load_path: "/cache/checkpoint_path/"
-device_target: "Ascend"
-checkpoint_path: "./checkpoint/"
-checkpoint_file_path: ""
-
-# ==============================================================================
-# Training options
-optimizer: "Momentum"
-infer_label: ""
-class_num: 1001
-batch_size: 256
-loss_scale: 1024
-momentum: 0.9
-weight_decay: 0.0001
-epoch_size: 90
-pretrain_epoch_size: 0
-save_checkpoint: True
-save_checkpoint_epochs: 5
-keep_checkpoint_max: 10
-warmup_epochs: 0
-lr_decay_mode: "linear"
-use_label_smooth: True
-label_smooth_factor: 0.1
-lr_init: 0
-lr_max: 0.8
-lr_end: 0.0
-
-net_name: "resnet18"
-dataset: "imagenet2012"
-device_num: 1
-pre_trained: ""
-run_eval: False
-eval_dataset_path: ""
-parameter_server: False
-filter_weight: False
-save_best_ckpt: True
-eval_start_epoch: 40
-eval_interval: 1
-enable_cache: False
-cache_session_id: ""
-mode_name: "GRAPH"
-acc_mode: "O0"
-
-# Export options
-device_id: 0
-width: 224
-height: 224
-file_name: "resnet18"
-file_format: "AIR"
-ckpt_file: ""
-network_dataset: "resnet18_imagenet2012"
-
----
-# Help description for each configuration
-enable_modelarts: "Whether training on modelarts, default: False"
-data_url: "Dataset url for obs"
-checkpoint_url: "The location of checkpoint for obs"
-data_path: "Dataset path for local"
-output_path: "Training output path for local"
-load_path: "The location of checkpoint for obs"
-device_target: "Target device type, available: [Ascend, GPU, CPU]"
-enable_profiling: "Whether enable profiling while training, default: False"
-num_classes: "Class for dataset"
-batch_size: "Batch size for training and evaluation"
-epoch_size: "Total training epochs."
-checkpoint_path: "The location of the checkpoint file."
-checkpoint_file_path: "The location of the checkpoint file."
diff --git a/model_zoo/official/cv/resnet/resnet34_imagenet2012_config.yaml b/model_zoo/official/cv/resnet/resnet34_imagenet2012_config.yaml
deleted file mode 100644
index 5b4b0493dfa..00000000000
--- a/model_zoo/official/cv/resnet/resnet34_imagenet2012_config.yaml
+++ /dev/null
@@ -1,78 +0,0 @@
-# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
-enable_modelarts: False
-# Url for modelarts
-data_url: ""
-train_url: ""
-checkpoint_url: ""
-# Path for local
-run_distribute: False
-enable_profiling: False
-data_path: "/cache/data"
-output_path: "/cache/train"
-load_path: "/cache/checkpoint_path/"
-device_target: "Ascend"
-checkpoint_path: "./checkpoint/"
-checkpoint_file_path: ""
-
-# ==============================================================================
-# Training options
-optimizer: "Momentum"
-infer_label: ""
-class_num: 1001
-batch_size: 256
-loss_scale: 1024
-momentum: 0.9
-weight_decay: 0.0001
-epoch_size: 90
-pretrain_epoch_size: 0
-save_checkpoint: True
-save_checkpoint_epochs: 5
-keep_checkpoint_max: 10
-warmup_epochs: 0
-lr_decay_mode: "linear"
-use_label_smooth: True
-label_smooth_factor: 0.1
-lr_init: 0
-lr_max: 0.8
-lr_end: 0.0
-
-net_name: "resnet34"
-dataset: "imagenet2012"
-device_num: 1
-pre_trained: ""
-run_eval: False
-eval_dataset_path: ""
-parameter_server: False
-filter_weight: False
-save_best_ckpt: True
-eval_start_epoch: 40
-eval_interval: 1
-enable_cache: False
-cache_session_id: ""
-mode_name: "GRAPH"
-acc_mode: "O0"
-
-# Export options
-device_id: 0
-width: 224
-height: 224
-file_name: "resnet34"
-file_format: "AIR"
-ckpt_file: ""
-network_dataset: "resnet34_imagenet2012"
-
----
-# Help description for each configuration
-enable_modelarts: "Whether training on modelarts, default: False"
-data_url: "Dataset url for obs"
-checkpoint_url: "The location of checkpoint for obs"
-data_path: "Dataset path for local"
-output_path: "Training output path for local"
-load_path: "The location of checkpoint for obs"
-device_target: "Target device type, available: [Ascend, GPU, CPU]"
-enable_profiling: "Whether enable profiling while training, default: False"
-num_classes: "Class for dataset"
-batch_size: "Batch size for training and evaluation"
-epoch_size: "Total training epochs."
-checkpoint_path: "The location of the checkpoint file."
-checkpoint_file_path: "The location of the checkpoint file."
diff --git a/model_zoo/official/cv/resnet/resnet50_cifar10_config.yaml b/model_zoo/official/cv/resnet/resnet50_cifar10_config.yaml
deleted file mode 100644
index 51021bb5a39..00000000000
--- a/model_zoo/official/cv/resnet/resnet50_cifar10_config.yaml
+++ /dev/null
@@ -1,79 +0,0 @@
-# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
-enable_modelarts: False
-# Url for modelarts
-data_url: ""
-train_url: ""
-checkpoint_url: ""
-# Path for local
-run_distribute: False
-enable_profiling: False
-data_path: "/cache/data"
-output_path: "/cache/train"
-load_path: "/cache/checkpoint_path/"
-device_target: "Ascend"
-checkpoint_path: "./checkpoint/"
-checkpoint_file_path: ""
-
-# ==============================================================================
-# Training options
-optimizer: "Momentum"
-infer_label: ""
-class_num: 10
-batch_size: 32
-loss_scale: 1024
-momentum: 0.9
-weight_decay: 0.0001
-epoch_size: 90
-pretrain_epoch_size: 0
-save_checkpoint: True
-save_checkpoint_epochs: 5
-keep_checkpoint_max: 10
-warmup_epochs: 5
-lr_decay_mode: "poly"
-lr_init: 0.01
-lr_end: 0.00001
-lr_max: 0.1
-
-net_name: "resnet50"
-dataset: "cifar10"
-device_num: 1
-pre_trained: ""
-run_eval: False
-eval_dataset_path: ""
-parameter_server: False
-filter_weight: False
-save_best_ckpt: True
-eval_start_epoch: 40
-eval_interval: 1
-enable_cache: False
-cache_session_id: ""
-mode_name: "GRAPH"
-acc_mode: "O0"
-all_reduce_fusion_config:
-    - 2
-    - 115
-
-# Export options
-device_id: 0
-width: 224
-height: 224
-file_name: "resnet50"
-file_format: "AIR"
-ckpt_file: ""
-network_dataset: "resnet50_cifar10"
-
----
-# Help description for each configuration
-enable_modelarts: "Whether training on modelarts, default: False"
-data_url: "Dataset url for obs"
-checkpoint_url: "The location of checkpoint for obs"
-data_path: "Dataset path for local"
-output_path: "Training output path for local"
-load_path: "The location of checkpoint for obs"
-device_target: "Target device type, available: [Ascend, GPU, CPU]"
-enable_profiling: "Whether enable profiling while training, default: False"
-num_classes: "Class for dataset"
-batch_size: "Batch size for training and evaluation"
-epoch_size: "Total training epochs."
-checkpoint_path: "The location of the checkpoint file."
-checkpoint_file_path: "The location of the checkpoint file."
diff --git a/model_zoo/official/cv/resnet/resnet50_imagenet2012_Acc_config.yaml b/model_zoo/official/cv/resnet/resnet50_imagenet2012_Acc_config.yaml
deleted file mode 100644
index 80456c685db..00000000000
--- a/model_zoo/official/cv/resnet/resnet50_imagenet2012_Acc_config.yaml
+++ /dev/null
@@ -1,81 +0,0 @@
-# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
-enable_modelarts: False
-# Url for modelarts
-data_url: ""
-train_url: ""
-checkpoint_url: ""
-# Path for local
-run_distribute: False
-enable_profiling: False
-data_path: "/cache/data"
-output_path: "/cache/train"
-load_path: "/cache/checkpoint_path/"
-device_target: "Ascend"
-checkpoint_path: "./checkpoint/"
-checkpoint_file_path: ""
-
-# ==============================================================================
-# Training options
-optimizer: "Momentum"
-infer_label: ""
-class_num: 1001
-batch_size: 256
-loss_scale: 1024
-momentum: 0.9
-weight_decay: 0.0001
-epoch_size: 90
-pretrain_epoch_size: 0
-save_checkpoint: True
-save_checkpoint_epochs: 5
-keep_checkpoint_max: 10
-warmup_epochs: 5
-lr_decay_mode: "cosine"
-use_label_smooth: True
-label_smooth_factor: 0.1
-lr_init: 0
-lr_max: 0.8
-lr_end: 0.0
-
-net_name: "resnet50"
-dataset: "imagenet2012"
-device_num: 1
-pre_trained: ""
-run_eval: False
-eval_dataset_path: ""
-parameter_server: False
-filter_weight: False
-save_best_ckpt: True
-eval_start_epoch: 40
-eval_interval: 1
-enable_cache: False
-cache_session_id: ""
-mode_name: "GRAPH"
-acc_mode: "O1"
-all_reduce_fusion_config:
-    - 85
-    - 160
-
-# Export options
-device_id: 0
-width: 224
-height: 224
-file_name: "resnet50"
-file_format: "AIR"
-ckpt_file: ""
-network_dataset: "resnet50_imagenet2012"
-
----
-# Help description for each configuration
-enable_modelarts: "Whether training on modelarts, default: False"
-data_url: "Dataset url for obs"
-checkpoint_url: "The location of checkpoint for obs"
-data_path: "Dataset path for local"
-output_path: "Training output path for local"
-load_path: "The location of checkpoint for obs"
-device_target: "Target device type, available: [Ascend, GPU, CPU]"
-enable_profiling: "Whether enable profiling while training, default: False"
-num_classes: "Class for dataset"
-batch_size: "Batch size for training and evaluation"
-epoch_size: "Total training epochs."
-checkpoint_path: "The location of the checkpoint file."
-checkpoint_file_path: "The location of the checkpoint file."
diff --git a/model_zoo/official/cv/resnet/resnet50_imagenet2012_Ascend_Thor_config.yaml b/model_zoo/official/cv/resnet/resnet50_imagenet2012_Ascend_Thor_config.yaml
deleted file mode 100644
index 2b730eb81ff..00000000000
--- a/model_zoo/official/cv/resnet/resnet50_imagenet2012_Ascend_Thor_config.yaml
+++ /dev/null
@@ -1,82 +0,0 @@
-# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
-enable_modelarts: False
-# Url for modelarts
-data_url: ""
-train_url: ""
-checkpoint_url: ""
-# Path for local
-run_distribute: False
-enable_profiling: False
-data_path: "/cache/data"
-output_path: "/cache/train"
-load_path: "/cache/checkpoint_path/"
-device_target: "Ascend"
-checkpoint_path: "./checkpoint/"
-checkpoint_file_path: ""
-
-# ==============================================================================
-# Training options
-optimizer: "Thor"
-infer_label: ""
-class_num: 1001
-batch_size: 32
-loss_scale: 128
-momentum: 0.9
-weight_decay: 0.0005
-epoch_size: 45
-pretrain_epoch_size: 0
-save_checkpoint: True
-save_checkpoint_epochs: 2
-keep_checkpoint_max: 15
-use_label_smooth: True
-label_smooth_factor: 0.1
-lr_init: 0.05803
-lr_decay: 4.04839
-lr_end_epoch: 53
-damping_init: 0.02714
-damping_decay: 0.50036
-frequency: 834
-
-net_name: "resnet50"
-dataset: "imagenet2012"
-device_num: 1
-pre_trained: ""
-run_eval: False
-eval_dataset_path: ""
-parameter_server: False
-filter_weight: False
-save_best_ckpt: True
-eval_start_epoch: 40
-eval_interval: 1
-enable_cache: False
-cache_session_id: ""
-mode_name: "GRAPH"
-acc_mode: "O0"
-all_reduce_fusion_config:
-    - 85
-    - 160
-
-# Export options
-device_id: 0
-width: 224
-height: 224
-file_name: "resnet50"
-file_format: "AIR"
-ckpt_file: ""
-network_dataset: "resnet50_imagenet2012"
-
----
-# Help description for each configuration
-enable_modelarts: "Whether training on modelarts, default: False"
-data_url: "Dataset url for obs"
-checkpoint_url: "The location of checkpoint for obs"
-data_path: "Dataset path for local"
-output_path: "Training output path for local"
-load_path: "The location of checkpoint for obs"
-device_target: "Target device type, available: [Ascend, GPU, CPU]"
-enable_profiling: "Whether enable profiling while training, default: False"
-num_classes: "Class for dataset"
-batch_size: "Batch size for training and evaluation"
-epoch_size: "Total training epochs."
-checkpoint_path: "The location of the checkpoint file."
-checkpoint_file_path: "The location of the checkpoint file."
diff --git a/model_zoo/official/cv/resnet/resnet50_imagenet2012_GPU_Thor_config.yaml b/model_zoo/official/cv/resnet/resnet50_imagenet2012_GPU_Thor_config.yaml
deleted file mode 100644
index dd4b492f7e3..00000000000
--- a/model_zoo/official/cv/resnet/resnet50_imagenet2012_GPU_Thor_config.yaml
+++ /dev/null
@@ -1,82 +0,0 @@
-# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
-enable_modelarts: False
-# Url for modelarts
-data_url: ""
-train_url: ""
-checkpoint_url: ""
-# Path for local
-run_distribute: False
-enable_profiling: False
-data_path: "/cache/data"
-output_path: "/cache/train"
-load_path: "/cache/checkpoint_path/"
-device_target: "GPU"
-checkpoint_path: "./checkpoint/"
-checkpoint_file_path: ""
-
-# ==============================================================================
-# Training options
-optimizer: "Thor"
-infer_label: ""
-class_num: 1001
-batch_size: 32
-loss_scale: 128
-momentum: 0.9
-weight_decay: 0.0005
-epoch_size: 40
-pretrain_epoch_size: 0
-save_checkpoint: True
-save_checkpoint_epochs: 1
-keep_checkpoint_max: 15
-use_label_smooth: True
-label_smooth_factor: 0.1
-lr_init: 0.05672
-lr_decay: 4.9687
-lr_end_epoch: 50
-damping_init: 0.02345
-damping_decay: 0.5467
-frequency: 834
-
-net_name: "resnet50"
-dataset: "imagenet2012"
-device_num: 1
-pre_trained: ""
-run_eval: False
-eval_dataset_path: ""
-parameter_server: False
-filter_weight: False
-save_best_ckpt: True
-eval_start_epoch: 40
-eval_interval: 1
-enable_cache: False
-cache_session_id: ""
-mode_name: "GRAPH"
-acc_mode: "O0"
-all_reduce_fusion_config:
-    - 85
-    - 160
-
-# Export options
-device_id: 0
-width: 224
-height: 224 
-file_name: "resnet50"
-file_format: "AIR"
-ckpt_file: ""
-network_dataset: "resnet50_imagenet2012"
-
----
-# Help description for each configuration
-enable_modelarts: "Whether training on modelarts, default: False"
-data_url: "Dataset url for obs"
-checkpoint_url: "The location of checkpoint for obs"
-data_path: "Dataset path for local"
-output_path: "Training output path for local"
-load_path: "The location of checkpoint for obs"
-device_target: "Target device type, available: [Ascend, GPU, CPU]"
-enable_profiling: "Whether enable profiling while training, default: False"
-num_classes: "Class for dataset"
-batch_size: "Batch size for training and evaluation"
-epoch_size: "Total training epochs."
-checkpoint_path: "The location of the checkpoint file."
-checkpoint_file_path: "The location of the checkpoint file."
diff --git a/model_zoo/official/cv/resnet/resnet50_imagenet2012_config.yaml b/model_zoo/official/cv/resnet/resnet50_imagenet2012_config.yaml
deleted file mode 100644
index a9873711004..00000000000
--- a/model_zoo/official/cv/resnet/resnet50_imagenet2012_config.yaml
+++ /dev/null
@@ -1,81 +0,0 @@
-# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
-enable_modelarts: False
-# Url for modelarts
-data_url: ""
-train_url: ""
-checkpoint_url: ""
-# Path for local
-run_distribute: False
-enable_profiling: False
-data_path: "/cache/data"
-output_path: "/cache/train"
-load_path: "/cache/checkpoint_path/"
-device_target: "Ascend"
-checkpoint_path: "./checkpoint/"
-checkpoint_file_path: ""
-
-# ==============================================================================
-# Training options
-optimizer: "Momentum"
-infer_label: ""
-class_num: 1001
-batch_size: 256
-loss_scale: 1024
-momentum: 0.9
-weight_decay: 0.0001
-epoch_size: 90
-pretrain_epoch_size: 0
-save_checkpoint: True
-save_checkpoint_epochs: 5
-keep_checkpoint_max: 10
-warmup_epochs: 0
-lr_decay_mode: "linear"
-use_label_smooth: True
-label_smooth_factor: 0.1
-lr_init: 0
-lr_max: 0.8
-lr_end: 0.0
-
-net_name: "resnet50"
-dataset: "imagenet2012"
-device_num: 1
-pre_trained: ""
-run_eval: False
-eval_dataset_path: ""
-parameter_server: False
-filter_weight: False
-save_best_ckpt: True
-eval_start_epoch: 40
-eval_interval: 1
-enable_cache: False
-cache_session_id: ""
-mode_name: "GRAPH"
-acc_mode: "O0"
-all_reduce_fusion_config:
-    - 85
-    - 160
-
-# Export options
-device_id: 0
-width: 224
-height: 224
-file_name: "resnet50"
-file_format: "AIR"
-ckpt_file: ""
-network_dataset: "resnet50_imagenet2012"
-
----
-# Help description for each configuration
-enable_modelarts: "Whether training on modelarts, default: False"
-data_url: "Dataset url for obs"
-checkpoint_url: "The location of checkpoint for obs"
-data_path: "Dataset path for local"
-output_path: "Training output path for local"
-load_path: "The location of checkpoint for obs"
-device_target: "Target device type, available: [Ascend, GPU, CPU]"
-enable_profiling: "Whether enable profiling while training, default: False"
-num_classes: "Class for dataset"
-batch_size: "Batch size for training and evaluation"
-epoch_size: "Total training epochs."
-checkpoint_path: "The location of the checkpoint file."
-checkpoint_file_path: "The location of the checkpoint file."
diff --git a/model_zoo/official/cv/resnet/resnet_benchmark_GPU.yaml b/model_zoo/official/cv/resnet/resnet_benchmark_GPU.yaml
deleted file mode 100644
index fb4d21e54c9..00000000000
--- a/model_zoo/official/cv/resnet/resnet_benchmark_GPU.yaml
+++ /dev/null
@@ -1,53 +0,0 @@
-# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
-enable_modelarts: False
-# Url for modelarts
-data_url: ""
-train_url: ""
-checkpoint_url: ""
-# Path for local
-run_distribute: False
-enable_profiling: False
-data_path: "/cache/data"
-output_path: "/cache/train"
-load_path: "/cache/checkpoint_path/"
-device_target: "GPU"
-checkpoint_path: "./checkpoint/"
-checkpoint_file_path: ''
-
-# ==============================================================================
-# Training options
-optimizer: "Momentum"
-infer_label: ""
-batch_size: 256
-epoch_size: 2
-print_per_steps: 20
-eval: False
-save_ckpt: False
-mode_name: "GRAPH"
-dtype: "fp16"
-acc_mode: "O0"
-
-# Export options
-device_id: 0
-width: 224
-height: 224
-file_name: "resnet"
-file_format: "AIR"
-ckpt_file: ""
-network_dataset: "resnet50_imagenet2012"
-
----
-# Help description for each configuration
-enable_modelarts: "Whether training on modelarts, default: False"
-data_url: "Dataset url for obs"
-checkpoint_url: "The location of checkpoint for obs"
-data_path: "Dataset path for local"
-output_path: "Training output path for local"
-load_path: "The location of checkpoint for obs"
-device_target: "Target device type, available: [Ascend, GPU, CPU]"
-enable_profiling: "Whether enable profiling while training, default: False"
-num_classes: "Class for dataset"
-batch_size: "Batch size for training and evaluation"
-epoch_size: "Total training epochs."
-checkpoint_path: "The location of the checkpoint file."
-checkpoint_file_path: "The location of the checkpoint file."
diff --git a/model_zoo/official/cv/resnet/se-resnet50_imagenet2012_config.yaml b/model_zoo/official/cv/resnet/se-resnet50_imagenet2012_config.yaml
deleted file mode 100644
index 7d98865ddc9..00000000000
--- a/model_zoo/official/cv/resnet/se-resnet50_imagenet2012_config.yaml
+++ /dev/null
@@ -1,82 +0,0 @@
-# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
-enable_modelarts: False
-# Url for modelarts
-data_url: ""
-train_url: ""
-checkpoint_url: ""
-# Path for local
-run_distribute: False
-enable_profiling: False
-data_path: "/cache/data"
-output_path: "/cache/train"
-load_path: "/cache/checkpoint_path/"
-device_target: "Ascend"
-checkpoint_path: "./checkpoint/"
-checkpoint_file_path: ""
-
-# ==============================================================================
-# Training options
-optimizer: "Momentum"
-infer_label: ""
-class_num: 1001
-batch_size: 32
-loss_scale: 1024
-momentum: 0.9
-weight_decay: 0.0001
-epoch_size: 28
-train_epoch_size: 24
-pretrain_epoch_size: 0
-save_checkpoint: True
-save_checkpoint_epochs: 4
-keep_checkpoint_max: 10
-warmup_epochs: 3
-lr_decay_mode: "cosine"
-use_label_smooth: True
-label_smooth_factor: 0.1
-lr_init: 0
-lr_end: 0.0001
-lr_max: 0.3
-
-net_name: "se-resnet50"
-dataset: "imagenet2012"
-device_num: 1
-pre_trained: ""
-run_eval: False
-eval_dataset_path: ""
-parameter_server: False
-filter_weight: False
-save_best_ckpt: True
-eval_start_epoch: 40
-eval_interval: 1
-enable_cache: False
-cache_session_id: ""
-mode_name: "GRAPH"
-acc_mode: "O0"
-all_reduce_fusion_config:
-    - 1
-    - 100
-
-# Export options
-device_id: 0
-width: 256
-height: 256
-file_name: "se-resnet50"
-file_format: "AIR"
-ckpt_file: ""
-network_dataset: "se-resnet50_imagenet2012"
-
----
-# Help description for each configuration
-enable_modelarts: "Whether training on modelarts, default: False"
-data_url: "Dataset url for obs"
-checkpoint_url: "The location of checkpoint for obs"
-data_path: "Dataset path for local"
-output_path: "Training output path for local"
-load_path: "The location of checkpoint for obs"
-device_target: "Target device type, available: [Ascend, GPU, CPU]"
-enable_profiling: "Whether enable profiling while training, default: False"
-num_classes: "Class for dataset"
-batch_size: "Batch size for training and evaluation"
-epoch_size: "Total training epochs."
-checkpoint_path: "The location of the checkpoint file."
-checkpoint_file_path: "The location of the checkpoint file."
diff --git a/model_zoo/official/cv/resnet50_quant/scripts/run_infer_310.sh b/model_zoo/official/cv/resnet50_quant/scripts/run_infer_310.sh
index 4a19f62069f..ef1b6ad1a7f 100644
--- a/model_zoo/official/cv/resnet50_quant/scripts/run_infer_310.sh
+++ b/model_zoo/official/cv/resnet50_quant/scripts/run_infer_310.sh
@@ -49,9 +49,10 @@ if [ -d ${ASCEND_HOME}/ascend-toolkit ]; then
     export PYTHONPATH=${TBE_IMPL_PATH}:$ASCEND_HOME/ascend-toolkit/latest/fwkacllib/python/site-packages:$PYTHONPATH
     export ASCEND_OPP_PATH=$ASCEND_HOME/ascend-toolkit/latest/opp
 else
-    export PATH=$ASCEND_HOME/atc/ccec_compiler/bin:$ASCEND_HOME/atc/bin:$PATH
-    export LD_LIBRARY_PATH=/usr/local/lib:$ASCEND_HOME/atc/lib64:$ASCEND_HOME/acllib/lib64:$ASCEND_HOME/driver/lib64:$ASCEND_HOME/add-ons:$LD_LIBRARY_PATH
-    export PYTHONPATH=$ASCEND_HOME/atc/python/site-packages:$PYTHONPATH
+    export PATH=$ASCEND_HOME/fwkacllib/ccec_compiler/bin:$ASCEND_HOME/fwkacllib/bin:$PATH
+    export LD_LIBRARY_PATH=/usr/local/lib:$ASCEND_HOME/fwkacllib/lib64:$ASCEND_HOME/driver/lib64:$LD_LIBRARY_PATH
+    export TBE_IMPL_PATH=$ASCEND_HOME/opp/op_impl/built-in/ai_core/tbe
+    export PYTHONPATH=$PYTHONPATH:$TBE_IMPL_PATH
     export ASCEND_OPP_PATH=$ASCEND_HOME/opp
 fi
 
@@ -104,4 +105,4 @@ cal_acc
 if [ $? -ne 0 ]; then
     echo "calculate accuracy failed"
     exit 1
-fi
\ No newline at end of file
+fi
diff --git a/model_zoo/official/cv/ssd/README.md b/model_zoo/official/cv/ssd/README.md
index 93cd8d543e7..7b486f2d2e1 100644
--- a/model_zoo/official/cv/ssd/README.md
+++ b/model_zoo/official/cv/ssd/README.md
@@ -23,6 +23,7 @@
         - [Export MindIR](#export-mindir)
         - [Infer on Ascend310](#infer-on-ascend310)
         - [result](#result)
+        - [Post Training Quantization](#post-training-quantization)
     - [Model Description](#model-description)
         - [Performance](#performance)
             - [Evaluation Performance](#evaluation-performance)
@@ -541,6 +542,52 @@ Average Recall    (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.659
 mAP: 0.33880018942412393
 ```
 
+### [Post Training Quantization](#contents)
+
+Relative executing script files reside in the directory "ascend310_quant_infer". Please implement following steps sequentially to complete post quantization.
+Current quantization project bases on COCO2017 dataset.
+
+1. Generate data of .bin format required for AIR model inference at Ascend310 platform.
+
+```shell
+python export_bin.py --config_path [YMAL CONFIG PATH] --coco_root [COCO DATA DIR] --mindrecord_dir [MINDRECORD PATH]
+```
+
+2. Export quantized AIR model.
+
+Post quantization of model requires special toolkits for exporting quantized AIR model. Please refer to [official website](https://www.hiascend.com/software/cann/community).
+
+```shell
+python post_quant.py --config_path [YMAL CONFIG PATH] --checkpoint_path [CKPT_PATH] --coco_root [COCO DATA DIR] --mindrecord_dir [MINDRECORD PATH]
+```
+
+The quantized AIR file will be stored as "./results/ssd_quant.air".
+
+3. Implement inference at Ascend310 platform.
+
+```shell
+# Ascend310 quant inference
+bash run_quant_infer.sh [AIR_PATH] [IMAGE_DATA] [IMAGE_ID] [IMAGE_SHAPE] [ANN_FILE]
+```
+
+Inference result is saved in current path, you can find result like this in acc.log file.
+
+```bash
+Average Precision (AP) @[ IoU=0.50:0.95 | area= all   | maxDets=100 ] = 0.237
+Average Precision (AP) @[ IoU=0.50      | area= all   | maxDets=100 ] = 0.386
+Average Precision (AP) @[ IoU=0.75      | area= all   | maxDets=100 ] = 0.240
+Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.042
+Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.200
+Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.425
+Average Recall    (AR) @[ IoU=0.50:0.95 | area= all   | maxDets=  1 ] = 0.255
+Average Recall    (AR) @[ IoU=0.50:0.95 | area= all   | maxDets= 10 ] = 0.404
+Average Recall    (AR) @[ IoU=0.50:0.95 | area= all   | maxDets=100 ] = 0.441
+Average Recall    (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.136
+Average Recall    (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.455
+Average Recall    (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.707
+mAP: 0.23657619676441116
+```
+
 ## [Model Description](#contents)
 
 ### [Performance](#contents)
diff --git a/model_zoo/official/cv/ssd/README_CN.md b/model_zoo/official/cv/ssd/README_CN.md
index 525c1e596f3..6ff9b47f19d 100644
--- a/model_zoo/official/cv/ssd/README_CN.md
+++ b/model_zoo/official/cv/ssd/README_CN.md
@@ -21,6 +21,7 @@
         - [导出MindIR](#导出mindir)
         - [在Ascend310执行推理](#在ascend310执行推理)
         - [结果](#结果)
+        - [训练后量化推理](#训练后量化推理)
 - [模型描述](#模型描述)
     - [性能](#性能)
         - [评估性能](#评估性能)
@@ -463,6 +464,51 @@ Average Recall    (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.659
 mAP: 0.33880018942412393
 ```
 
+### [训练后量化推理](#contents)
+
+训练后量化推理的相关执行脚本文件在"ascend310_quant_infer"目录下，依次执行以下步骤实现训练后量化推理。本训练后量化工程基于COCO2017数据集。
+
+1、生成Ascend310平台AIR模型推理需要的.bin格式数据。
+
+```shell
+python export_bin.py --config_path [YMAL CONFIG PATH] --coco_root [COCO DATA DIR] --mindrecord_dir [MINDRECORD PATH]
+```
+
+2、导出训练后量化的AIR格式模型。
+
+导出训练后量化模型需要配套的量化工具包，参考[官方地址](https://www.hiascend.com/software/cann/community)
+
+```shell
+python post_quant.py --config_path [YMAL CONFIG PATH] --checkpoint_path [CKPT_PATH] --coco_root [COCO DATA DIR] --mindrecord_dir [MINDRECORD PATH]
+```
+
+导出的模型会存储在./result/ssd_quant.air。
+
+3、在Ascend310执行推理量化模型。
+
+```shell
+# Ascend310 quant inference
+bash run_quant_infer.sh [AIR_PATH] [IMAGE_DATA] [IMAGE_ID] [IMAGE_SHAPE] [ANN_FILE]
+```
+
+推理结果保存在脚本执行的当前路径，可以在acc.log中看到精度计算结果。
+
+```bash
+Average Precision (AP) @[ IoU=0.50:0.95 | area= all   | maxDets=100 ] = 0.237
+Average Precision (AP) @[ IoU=0.50      | area= all   | maxDets=100 ] = 0.386
+Average Precision (AP) @[ IoU=0.75      | area= all   | maxDets=100 ] = 0.240
+Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.042
+Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.200
+Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.425
+Average Recall    (AR) @[ IoU=0.50:0.95 | area= all   | maxDets=  1 ] = 0.255
+Average Recall    (AR) @[ IoU=0.50:0.95 | area= all   | maxDets= 10 ] = 0.404
+Average Recall    (AR) @[ IoU=0.50:0.95 | area= all   | maxDets=100 ] = 0.441
+Average Recall    (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.136
+Average Recall    (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.455
+Average Recall    (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.707
+mAP: 0.23657619676441116
+```
+
 # 模型描述
 
 ## 性能
diff --git a/model_zoo/official/cv/unet/README.md b/model_zoo/official/cv/unet/README.md
index 00736eb65cb..e9c85295bd8 100644
--- a/model_zoo/official/cv/unet/README.md
+++ b/model_zoo/official/cv/unet/README.md
@@ -21,6 +21,7 @@
         - [How to use](#how-to-use)
             - [Inference](#inference)
                 - [Running on Ascend 310](#running-on-ascend-310)
+                - [Post Training Quantization](#post-training-quantization)
             - [Continue Training on the Pretrained Model](#continue-training-on-the-pretrained-model)
             - [Transfer training](#transfer-training)
     - [Description of Random Situation](#description-of-random-situation)
@@ -127,7 +128,7 @@ After installing MindSpore via the official website, you can start training and
 
 - Run on Ascend
 
-```python
+```shell
 # run training example
 python train.py --data_path=/path/to/data/ --config_path=/path/to/yaml > train.log 2>&1 &
 OR
@@ -142,6 +143,26 @@ OR
 bash scripts/run_standalone_eval.sh [DATASET] [CHECKPOINT] [CONFIG_PATH]
 ```
 
+- Run on GPU
+
+```shell
+# run training example
+python train.py --data_path=/path/to/data/ --config_path=/path/to/yaml --device_target=GPU > train.log 2>&1 &
+OR
+bash scripts/run_standalone_train_gpu.sh [DATASET] [CONFIG_PATH] [DEVICE_ID](optional)
+
+# run distributed training example
+bash scripts/run_distribute_train.sh [RANKSIZE] [DATASET] [CONFIG_PATH] [CUDA_VISIBLE_DEVICES(0,1,2,3,4,5,6,7)](optional)
+
+# run evaluation example
+python eval.py --data_path=/path/to/data/ --checkpoint_file_path=/path/to/checkpoint/ --config_path=/path/to/yaml > eval.log 2>&1 &
+OR
+bash scripts/run_standalone_eval_gpu.sh [DATASET] [CHECKPOINT] [CONFIG_PATH] [DEVICE_ID](optional)
+
+# run export
+python export.py --config_path=[CONFIG_PATH] --checkpoint_file_path=[model_ckpt_path] --file_name=[air_model_name] --file_format=MINDIR --device_target=GPU
+```
+
 - Run on docker
 
 Build docker images(Change version to the one you actually used)
@@ -162,7 +183,7 @@ Then you can run everything just like on ascend.
 
 If you want to run in modelarts, please check the official documentation of [modelarts](https://support.huaweicloud.com/modelarts/), and you can start training and evaluation as follows:
 
-```python
+```text
 # run distributed training on modelarts example
 # (1) First, Perform a or b.
 #       a. Set "enable_modelarts=True" on yaml file.
@@ -191,33 +212,18 @@ If you want to run in modelarts, please check the official documentation of [mod
 # (7) Create your job.
 ```
 
-- Run on GPU
-
-  ```python
-  # run training example
-  python train.py --data_path=/path/to/data/ --config_path=/path/to/config/ --output ./output > train.log  2>&1 &
-  OR
-  bash scripts/run_standalone_train_gpu.sh [DATASET] [CONFIG_PATH]
-
-  # run distributed training example
-  bash scripts/run_distribute_train_gpu.sh [RANKSIZE] [DATASET] [CONFIG_PATH]
-
-  # run evaluation example
-  python eval.py --data_path=/path/to/data/ --checkpoint_file_path=/path/to/checkpoint/ --config_path=/path/to/config/ > eval.log  2>&1 &
-  OR
-  bash scripts/run_standalone_eval_gpu.sh [DATASET] [CHECKPOINT] [CONFIG_PATH]
-  ```
-
 ## [Script Description](#contents)
 
 ### [Script and Sample Code](#contents)
 
-```shell
+```text
 ├── model_zoo
     ├── README.md                           // descriptions about all the models
     ├── unet
         ├── README.md                       // descriptions about Unet
+        ├── README_CN.md                    // chinese descriptions about Unet
         ├── ascend310_infer                 // code of infer on ascend 310
+        ├── Dockerfile
         ├── scripts
         │   ├──docker_start.sh              // shell script for quick docker start
         │   ├──run_disribute_train.sh       // shell script for distributed on Ascend
@@ -228,7 +234,7 @@ If you want to run in modelarts, please check the official documentation of [mod
         │   ├──run_standalone_eval_gpu.sh       // shell script forevaluation on GPU
         │   ├──run_distribute_train_gpu.sh      // shell script for distributed on GPU
         ├── src
-        │   ├──config.py                    // parameter configuration
+        │   ├──__init__.py
         │   ├──data_loader.py               // creating dataset
         │   ├──loss.py                      // loss
         │   ├──eval_callback.py             // evaluation callback while training
@@ -236,18 +242,21 @@ If you want to run in modelarts, please check the official documentation of [mod
         │   ├──unet_medical                 // Unet medical architecture
                 ├──__init__.py              // init file
                 ├──unet_model.py            // unet model
-                ├──unet_parts.py            // unet part
+                └──unet_parts.py            // unet part
         │   ├──unet_nested                  // Unet++ architecture
                 ├──__init__.py              // init file
                 ├──unet_model.py            // unet model
-                ├──unet_parts.py            // unet part
-                ├── model_utils
-                │   ├── config.py          // parameter configuration
-                │   ├── device_adapter.py  // device adapter
-                │   ├── local_adapter.py   // local adapter
-                │   ├── moxing_adapter.py  // moxing adapter
+                └──unet_parts.py            // unet part
+        │   ├──model_utils
+                ├──__init__.py
+                ├── config.py               // parameter configuration
+                ├── device_adapter.py       // device adapter
+                ├── local_adapter.py        // local adapter
+                └── moxing_adapter.py       // moxing adapter
         ├── unet_medical_config.yaml        // parameter configuration
+        ├── unet_medicl_gpu_config.yaml     // parameter configuration
         ├── unet_nested_cell_config.yaml    // parameter configuration
+        ├── unet_nested_coco_config.yaml    // parameter configuration
         ├── unet_nested_config.yaml         // parameter configuration
         ├── unet_simple_config.yaml         // parameter configuration
         ├── unet_simple_coco_config.yaml    // parameter configuration
@@ -258,16 +267,16 @@ If you want to run in modelarts, please check the official documentation of [mod
         ├── postprocess.py                  // unet 310 infer postprocess.
         ├── preprocess.py                   // unet 310 infer preprocess dataset
         ├── preprocess_dataset.py           // the script to adapt MultiClass dataset
-        ├── requirements.txt                // Requirements of third party package.
+        └── requirements.txt                // Requirements of third party package.
 ```
 
 ### [Script Parameters](#contents)
 
-Parameters for both training and evaluation can be set in config.py
+Parameters for both training and evaluation can be set in *.yaml
 
 - config for Unet, ISBI dataset
 
-  ```python
+  ```yaml
   'name': 'Unet',                     # model name
   'lr': 0.0001,                       # learning rate
   'epochs': 400,                      # total training epochs when run 1p
@@ -298,7 +307,7 @@ Parameters for both training and evaluation can be set in config.py
 
 - config for Unet++, cell nuclei dataset
 
-  ```python
+  ```yaml
   'model': 'unet_nested',             # model name
   'dataset': 'Cell_nuclei',           # dataset name
   'img_size': [96, 96],               # image size
@@ -366,9 +375,9 @@ The model checkpoint will be saved in the current directory.
 #### running on GPU
 
 ```shell
-python train.py --data_path=/path/to/data/ --config_path=/path/to/config/ --output ./output > train.log  2>&1 &
+python train.py --data_path=/path/to/data/ --config_path=/path/to/config/ --output ./output --device_target GPU > train.log  2>&1 &
 OR
-bash scripts/run_standalone_train_gpu.sh [DATASET] [CONFIG_PATH]
+bash scripts/run_standalone_train_gpu.sh [DATASET] [CONFIG_PATH] [DEVICE_ID](optional)
 ```
 
 The python command above will run in the background, you can view the results through the file train.log. The model checkpoint will be saved in the current directory.
@@ -466,6 +475,25 @@ The above python command will run in the background. You can view the results th
 | Checkpoint for Fine tuning | 355.11M (.ckpt file)                                         | 355.11M (.ckpt file)                                         |
 | Scripts                    | [unet script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/unet) | [unet script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/unet) |
 
+| Parameters | Ascend | GPU |
+| -----| ----- | ----- |
+| Model Version | U-Net nested(unet++) | U-Net nested(unet++) |
+| Resource | Ascend 910 ;CPU 2.60GHz,192cores; Memory,755G; OS Euler2.8 | NV SMX2 V100-32G |
+| uploaded Date | 2021-8-20 | 2021-8-20 |
+| MindSpore Version | 1.3.0 | 1.3.0 |
+| Dataset | Cell_nuclei | Cell_nuclei |
+| Training Parameters | 1pc: epoch=200, total steps=6700, batch_size=16, lr=0.0003, 8pc: epoch=1600, total steps=6560, batch_size=16*8, lr=0.0003 | 1pc: epoch=200, total steps=6700, batch_size=16, lr=0.0003, 8pc: epoch=1600, total steps=6560, batch_size=16*8, lr=0.0003 |
+| Optimizer | ADAM | ADAM |
+| Loss Function | Softmax Cross Entropy | Softmax Cross Entropy |
+| outputs | probability |  probability |
+| probability | cross valid dice coeff is 0.966, cross valid IOU is 0.936 | cross valid dice coeff is 0.976,cross valid IOU is 0.955 |
+| Loss | <0.1 | <0.1 |
+| Speed | 1pc: 150~200 fps | 1pc：230~280 fps, 8pc：(170~210)*8 fps |
+| Total time | 1pc: 10.8min | 1pc：8min |
+| Parameters (M)  | 27M | 27M |
+| Checkpoint for Fine tuning | 103.4M(.ckpt file) | 103.4M(.ckpt file) |
+| Scripts | [unet script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/unet) | [unet script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/unet) |
+
 ## [How to use](#contents)
 
 ### Inference
@@ -489,7 +517,7 @@ The checkpoint_file_path parameter is required,
 
 Export on ModelArts (If you want to run in modelarts, please check the official documentation of [modelarts](https://support.huaweicloud.com/modelarts/), and you can start as follows)
 
-```python
+```text
 # Export on ModelArts
 # (1) Perform a or b.
 #       a. Set "enable_modelarts=True" on default_config.yaml file.
@@ -526,11 +554,45 @@ Inference result is saved in current path, you can find result in acc.log file.
 Cross valid dice coeff is: 0.9054352151297033
 ```
 
+##### [Post Training Quantization](#contents)
+
+Relative executing script files reside in the directory "ascend310_quant_infer". Please implement following steps sequentially to complete post quantization.
+Current quantization project bases on ISBI dataset.
+
+1. Generate data of .bin format required for AIR model inference at Ascend310 platform.
+
+```shell
+python export_bin.py --config_path [YMAL CONFIG PATH] --data_path [DATA DIR] --result_path [RESULT PATH]
+```
+
+2. Export quantized AIR model.
+
+Post quantization of model requires special toolkits for exporting quantized AIR model. Please refer to [official website](https://www.hiascend.com/software/cann/community).
+
+```shell
+python post_quant.py --config_path [YMAL CONFIG PATH] --data_path [DATASET PATH] --checkpoint_file_path [CKPT_PATH]
+```
+
+The quantized AIR file will be stored as "./results/unet_quant.air".
+
+3. Implement inference at Ascend310 platform.
+
+```shell
+# Ascend310 quant inference
+bash run_quant_infer.sh [AIR_PATH] [DATA_PATH] [LABEL_PATH]
+```
+
+Inference result is saved in current path, you can find result like this in acc.log file.
+
+```bash
+Cross valid dice coeff is: 0.9139793866877975
+```
+
 #### Continue Training on the Pretrained Model
 
 Set options `resume` to True in `*.yaml`, and set `resume_ckpt` to the path of your checkpoint. e.g.
 
-```python
+```yaml
   'resume': True,
   'resume_ckpt': 'ckpt_unet_sample_adam_1-1_600.ckpt',
   'transfer_training': False,
@@ -541,7 +603,7 @@ Set options `resume` to True in `*.yaml`, and set `resume_ckpt` to the path of y
 
 Do the same thing as resuming traing above. In addition, set `transfer_training` to True. The `filter_weight` shows the weights which will be filtered for different dataset. Usually, the default value of `filter_weight` don't need to be changed. The default values includes the weights which depends on the class number. e.g.
 
-```python
+```yaml
   'resume': True,
   'resume_ckpt': 'ckpt_unet_sample_adam_1-1_600.ckpt',
   'transfer_training': True,
diff --git a/model_zoo/official/cv/unet/README_CN.md b/model_zoo/official/cv/unet/README_CN.md
index cd2641c329a..1de86be0285 100644
--- a/model_zoo/official/cv/unet/README_CN.md
+++ b/model_zoo/official/cv/unet/README_CN.md
@@ -22,6 +22,7 @@
         - [用法](#用法-1)
             - [推理](#推理)
                 - [Ascend 310环境运行](#ascend-310环境运行)
+                - [训练后量化推理](#训练后量化推理)
             - [继续训练预训练模型](#继续训练预训练模型)
             - [迁移学习](#迁移学习)
     - [随机情况说明](#随机情况说明)
@@ -131,9 +132,9 @@ python preprocess_dataset.py --config_path path/unet/*.yaml  --data_path /data/s
 
 - Ascend处理器环境运行
 
-  ```python
+  ```shell
   # 训练示例
-python train.py --data_path=/path/to/data/ --config_path=/path/to/yaml > train.log 2>&1 &
+  python train.py --data_path=/path/to/data/ --config_path=/path/to/yaml > train.log 2>&1 &
   OR
   bash scripts/run_standalone_train.sh [DATASET] [CONFIG_PATH]
 
@@ -141,11 +142,31 @@ python train.py --data_path=/path/to/data/ --config_path=/path/to/yaml > train.l
   bash scripts/run_distribute_train.sh [RANK_TABLE_FILE] [DATASET] [CONFIG_PATH]
 
   # 评估示例
-python eval.py --data_path=/path/to/data/ --checkpoint_file_path=/path/to/checkpoint/ --config_path=/path/to/yaml > eval.log 2>&1 &
+  python eval.py --data_path=/path/to/data/ --checkpoint_file_path=/path/to/checkpoint/ --config_path=/path/to/yaml > eval.log 2>&1 &
   OR
   bash scripts/run_standalone_eval.sh [DATASET] [CHECKPOINT] [CONFIG_PATH]
   ```
 
+- GPU处理器环境运行
+
+  ```shell
+  # 训练示例
+  python train.py --data_path=/path/to/data/ --config_path=/path/to/yaml --device_target=GPU > train.log 2>&1 &
+  OR
+  bash scripts/run_standalone_train_gpu.sh [DATASET] [CONFIG_PATH] [DEVICE_ID](optional)
+
+  # 分布式训练示例
+  bash scripts/run_distribute_train.sh [RANKSIZE] [DATASET] [CONFIG_PATH] [CUDA_VISIBLE_DEVICES(0,1,2,3,4,5,6,7)](optional)
+
+  # 评估示例
+  python eval.py --data_path=/path/to/data/ --checkpoint_file_path=/path/to/checkpoint/ --config_path=/path/to/yaml > eval.log 2>&1 &
+  OR
+  bash scripts/run_standalone_eval_gpu.sh [DATASET] [CHECKPOINT] [CONFIG_PATH] [DEVICE_ID](optional)
+
+  # 模型导出
+  python export.py --config_path=[CONFIG_PATH] --checkpoint_file_path=[model_ckpt_path] --file_name=[air_model_name] --file_format=MINDIR --device_target=GPU
+  ```
+
 - Docker中运行
 
 创建docker镜像(讲版本号换成你实际使用的版本)
@@ -167,7 +188,7 @@ bash scripts/docker_start.sh unet:20.1.0 [DATA_DIR] [MODEL_DIR]
 如果要在modelarts上进行模型的训练，可以参考modelarts的官方指导文档(https://support.huaweicloud.com/modelarts/)
 开始进行模型的训练和推理，具体操作如下：
 
-```python
+```text
 # 在modelarts上使用分布式训练的示例：
 # (1) 选址a或者b其中一种方式。
 #       a. 设置 "enable_modelarts=True" 。
@@ -198,35 +219,20 @@ bash scripts/docker_start.sh unet:20.1.0 [DATA_DIR] [MODEL_DIR]
 # (7) 开始模型的推理。
 ```
 
-- GPU处理器环境运行
-
-  ```python
-  # 训练示例
-  python train.py --data_path=/path/to/data/ --config_path=/path/to/config/ --output ./output > train.log  2>&1 &
-  OR
-  bash scripts/run_standalone_train_gpu.sh [DATASET] [CONFIG_PATH]
-
-  # 分布式训练示例
-  bash scripts/run_distribute_train_gpu.sh [RANKSIZE] [DATASET] [CONFIG_PATH]
-
-  # 评估示例
-  python eval.py --data_path=/path/to/data/ --checkpoint_file_path=/path/to/checkpoint/ --config_path=/path/to/config/ > eval.log  2>&1 &
-  OR
-  bash scripts/run_standalone_eval_gpu.sh [DATASET] [CHECKPOINT] [CONFIG_PATH]
-  ```
-
 # 脚本说明
 
 ## 脚本说明
 
 ### 脚本及样例代码
 
-```path
+```text
 ├── model_zoo
     ├── README.md                           // 模型描述
     ├── unet
         ├── README.md                       // Unet描述
+        ├── README_CN.md                    // Unet中文描述
         ├── ascend310_infer                 // Ascend 310 推理代码
+        ├── Dockerfile
         ├── scripts
         │   ├──docker_start.sh              // docker 脚本
         │   ├──run_disribute_train.sh       // Ascend 上分布式训练脚本
@@ -237,26 +243,29 @@ bash scripts/docker_start.sh unet:20.1.0 [DATA_DIR] [MODEL_DIR]
         │   ├──run_standalone_eval_gpu.sh   // GPU 上评估脚本
         │   ├──run_distribute_train_gpu.sh  // GPU 上分布式训练脚本
         ├── src
-        │   ├──config.py                    // 参数配置
+        │   ├──__init__.py
         │   ├──data_loader.py               // 数据处理
         │   ├──loss.py                      // 损失函数
-        │   ├─  eval_callback.py            // 训练时推理回调函数
+        │   ├──eval_callback.py             // 训练时推理回调函数
         │   ├──utils.py                     // 通用组件（回调函数）
         │   ├──unet_medical                 // 医学图像处理Unet结构
                 ├──__init__.py
                 ├──unet_model.py            // Unet 网络结构
-                ├──unet_parts.py            // Unet 子网
+                └──unet_parts.py            // Unet 子网
         │   ├──unet_nested                  // Unet++
                 ├──__init__.py
                 ├──unet_model.py            // Unet++ 网络结构
-                ├──unet_parts.py            // Unet++ 子网
-                        ├── model_utils
-                │   ├── config.py          // 参数配置
-                │   ├── device_adapter.py  // 设备配置
-                │   ├── local_adapter.py   // 本地设备配置
-                │   ├── moxing_adapter.py  // modelarts设备配置
+                └──net_parts.py            // Unet++ 子网
+        │   ├──model_utils
+                ├──__init__.py
+                ├──config.py          // 参数配置
+                ├──device_adapter.py  // 设备配置
+                ├──local_adapter.py   // 本地设备配置
+                └──moxing_adapter.py  // modelarts设备配置
         ├── unet_medical_config.yaml        // 配置文件
+        ├── unet_medicl_gpu_config.yaml     // 配置文件
         ├── unet_nested_cell_config.yaml    // 配置文件
+        ├── unet_nested_coco_config.yaml    // 配置文件
         ├── unet_nested_config.yaml         // 配置文件
         ├── unet_simple_config.yaml         // 配置文件
         ├── unet_simple_coco_config.yaml    // 配置文件
@@ -267,16 +276,16 @@ bash scripts/docker_start.sh unet:20.1.0 [DATA_DIR] [MODEL_DIR]
         ├── postprocess.py                  // 310 推理后处理脚本
         ├── preprocess.py                   // 310 推理前处理脚本
         ├── preprocess_dataset.py           // 适配MultiClass数据集脚本
-        ├── requirements.txt                // 需要的三方库.
+        └── requirements.txt                // 需要的三方库.
 ```
 
 ### 脚本参数
 
-在config.py中可以同时配置训练参数和评估参数。
+在*.yaml中可以同时配置训练参数和评估参数。
 
 - U-Net配置，ISBI数据集
 
-  ```python
+  ```yaml
   'name': 'Unet',                     # 模型名称
   'lr': 0.0001,                       # 学习率
   'epochs': 400,                      # 运行1p时的总训练轮次
@@ -300,7 +309,7 @@ bash scripts/docker_start.sh unet:20.1.0 [DATA_DIR] [MODEL_DIR]
 
 - Unet++配置, cell nuclei数据集
 
-  ```python
+  ```yaml
   'model': 'unet_nested',             # 模型名称
   'dataset': 'Cell_nuclei',           # 数据集名称
   'img_size': [96, 96],               # 输入图像大小
@@ -335,7 +344,7 @@ bash scripts/docker_start.sh unet:20.1.0 [DATA_DIR] [MODEL_DIR]
 - Ascend处理器环境运行
 
   ```shell
-python train.py --data_path=/path/to/data/ --config_path=/path/to/yaml > train.log 2>&1 &
+  python train.py --data_path=/path/to/data/ --config_path=/path/to/yaml > train.log 2>&1 &
   OR
   bash scripts/run_standalone_train.sh [DATASET] [CONFIG_PATH]
   ```
@@ -363,9 +372,9 @@ python train.py --data_path=/path/to/data/ --config_path=/path/to/yaml > train.l
 - GPU处理器环境运行
 
   ```shell
-  python train.py --data_path=/path/to/data/ --config_path=/path/to/config/ --output ./output > train.log  2>&1 &
+  python train.py --data_path=/path/to/data/ --config_path=/path/to/config/ --output ./output --device_target GPU > train.log  2>&1 &
   OR
-  bash scripts/run_standalone_train_gpu.sh [DATASET] [CONFIG_PATH]
+  bash scripts/run_standalone_train_gpu.sh [DATASET] [CONFIG_PATH] [DEVICE_ID](optional)
   ```
 
   上述python命令在后台运行，可通过`train.log`文件查看结果。
@@ -412,7 +421,7 @@ bash scripts/run_distribute_train_gpu.sh [RANKSIZE] [DATASET] [CONFIG_PATH]
   在运行以下命令之前，请检查用于评估的检查点路径。将检查点路径设置为绝对全路径，如"username/unet/ckpt_unet_medical_adam-48_600.ckpt"。
 
   ```shell
-python eval.py --data_path=/path/to/data/ --checkpoint_file_path=/path/to/checkpoint/ --config_path=/path/to/yaml > eval.log 2>&1 &
+  python eval.py --data_path=/path/to/data/ --checkpoint_file_path=/path/to/checkpoint/ --config_path=/path/to/yaml > eval.log 2>&1 &
   OR
   bash scripts/run_standalone_eval.sh [DATASET] [CHECKPOINT] [CONFIG_PATH]
   ```
@@ -465,6 +474,25 @@ python eval.py --data_path=/path/to/data/ --checkpoint_file_path=/path/to/checkp
 | 微调检查点 | 355.11M (.ckpt文件)                                         | 355.11M (.ckpt文件)                        |
 | 脚本                    | [U-Net脚本](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/unet) | [U-Net脚本](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/unet) |
 
+| 参数 | Ascend | GPU |
+| ----- | ------ | ----- |
+| 模型版本 | U-Net nested(unet++) | U-Net nested(unet++) |
+| 资源 | Ascend 910；CPU：2.60GHz，192核；内存：755 GB；系统 Euler2.8  | NV SMX2 V100，内存：32G  |
+| 上传日期 | 2021-8-20 | 2021-8-20 |
+| MindSpore版本 | 1.3.0 | 1.3.0 |
+| 数据集 | Cell_nuclei | Cell_nuclei |
+| 训练参数   | 1卡: epoch=200, total steps=6700, batch_size=16, lr=0.0003; 8卡: epoch=1600, total steps=6560, batch_size=16*8, lr=0.0003 | 1卡: epoch=200, total steps=6700, batch_size=16, lr=0.0003; 8卡: epoch=1600, total steps=6560, batch_size=16*8, lr=0.0003 |
+| 优化器 | ADAM | ADAM |
+| 损失函数 | Softmax交叉熵 | Softmax交叉熵 |
+| 输出 | 概率 | 概率 |
+| 概率 | cross valid dice coeff is 0.966, cross valid IOU is 0.936 | cross valid dice coeff is 0.976,cross valid IOU is 0.955 |
+| 损失 | <0.1 | <0.1 |
+| 速度 | 1卡：150~200 fps | 1卡：230~280 fps, 8卡：(170~210)*8 fps|
+| 总时长 | 1卡: 10.8分钟 | 1卡: 8分钟 |
+| 参数(M)  | 27M | 27M |
+| 微调检查点 | 103.4M(.ckpt文件) | 103.4M(.ckpt文件) |
+| 脚本 | [U-Net脚本](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/unet) | [U-Net脚本](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/unet) |
+
 ### 用法
 
 #### 推理
@@ -485,7 +513,7 @@ python export.py --config_path=[CONFIG_PATH] --checkpoint_file_path=[model_ckpt_
 
 ModelArts导出mindir
 
-```python
+```text
 # (1) 把训练好的模型地方到桶的对应位置。
 # (2) 选址a或者b其中一种方式。
 #       a.  设置 "enable_modelarts=True"
@@ -520,11 +548,44 @@ bash run_infer_310.sh [NETWORK] [MINDIR_PATH] [DEVICE_ID] [NEED_PREPROCESS]
 Cross valid dice coeff is: 0.9054352151297033
 ```
 
+##### [训练后量化推理](#contents)
+
+训练后量化推理的相关执行脚本文件在"ascend310_quant_infer"目录下，依次执行以下步骤实现训练后量化推理。本训练后量化工程基于ISBI数据集。
+
+1、生成Ascend310平台AIR模型推理需要的.bin格式数据。
+
+```shell
+python export_bin.py --config_path [YMAL CONFIG PATH] --data_path [DATA DIR] --result_path [RESULT PATH]
+```
+
+2、导出训练后量化的AIR格式模型。
+
+导出训练后量化模型需要配套的量化工具包，参考[官方地址](https://www.hiascend.com/software/cann/community)
+
+```shell
+python post_quant.py --config_path [YMAL CONFIG PATH] --data_path [DATASET PATH] --checkpoint_file_path [CKPT_PATH]
+```
+
+导出的模型会存储在./result/unet_quant.air。
+
+3、在Ascend310执行推理量化模型。
+
+```shell
+# Ascend310 inference
+bash run_quant_infer.sh [AIR_PATH] [DATA_PATH] [LABEL_PATH]
+```
+
+推理结果保存在脚本执行的当前路径，可以在acc.log中看到精度计算结果。
+
+```bash
+Cross valid dice coeff is: 0.9139793866877975
+```
+
 #### 继续训练预训练模型
 
-在`config.py`里将`resume`设置成True，并将`resume_ckpt`设置成对应的权重文件路径，例如：
+在`*.yaml`里将`resume`设置成True，并将`resume_ckpt`设置成对应的权重文件路径，例如：
 
-```python
+```yaml
     'resume': True,
     'resume_ckpt': 'ckpt_unet_medical_adam_1-1_600.ckpt',
     'transfer_training': False,
@@ -535,7 +596,7 @@ Cross valid dice coeff is: 0.9054352151297033
 
 首先像上面讲的那样讲继续训练的权重加载进来。然后将`transfer_training`设置成True。配置中还有一个 `filter_weight`参数，用于将一些不能适用于不同数据集的权重过滤掉。通常这个`filter_weight`的参数不需要修改，其默认值通常是和模型的分类数相关的参数。例如：
 
-```python
+```yaml
     'resume': True,
     'resume_ckpt': 'ckpt_unet_medical_adam_1-1_600.ckpt',
     'transfer_training': True,
diff --git a/model_zoo/official/cv/unet/eval.py b/model_zoo/official/cv/unet/eval.py
index 3f2dd9a7f4c..1c037200af2 100644
--- a/model_zoo/official/cv/unet/eval.py
+++ b/model_zoo/official/cv/unet/eval.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 # ============================================================================
 
-import os
 import logging
 from mindspore import context, Model
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
@@ -24,6 +23,7 @@ from src.unet_nested import NestedUNet, UNet
 from src.utils import UnetEval, TempLoss, dice_coeff
 from src.model_utils.config import config
 from src.model_utils.moxing_adapter import moxing_wrapper
+from src.model_utils.device_adapter import get_device_id
 
 @moxing_wrapper()
 def test_net(data_dir,
@@ -62,7 +62,7 @@ if __name__ == '__main__':
     logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
     context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False)
     if config.device_target == "Ascend":
-        device_id = int(os.getenv('DEVICE_ID'))
+        device_id = get_device_id()
         context.set_context(device_id=device_id)
     test_net(data_dir=config.data_path,
              ckpt_path=config.checkpoint_file_path,
diff --git a/model_zoo/official/cv/unet/scripts/run_distribute_train_gpu.sh b/model_zoo/official/cv/unet/scripts/run_distribute_train_gpu.sh
index 8cdcc6a1c8c..03b39237dd3 100644
--- a/model_zoo/official/cv/unet/scripts/run_distribute_train_gpu.sh
+++ b/model_zoo/official/cv/unet/scripts/run_distribute_train_gpu.sh
@@ -13,10 +13,55 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-echo "=============================================================================================================="
-echo "Please run the script as: "
-echo "bash scripts/run_distribute_train_gpu.sh [RANKSIZE] [DATASET] [CONFIG_PATH]"
-echo "for example: bash run_distribute_train_gpu.sh 8 /path/to/data/ /path/to/config/"
-echo "=============================================================================================================="
-mpirun -n $1 --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \
-python train.py  --run_distribute=True --data_path=$2  --config_path=$3  --output=./output > train.log 2>&1 &
+
+
+get_real_path() {
+  if [ "${1:0:1}" == "/" ]; then
+    echo "$1"
+  else
+    echo "$(realpath -m $PWD/$1)"
+  fi
+}
+
+if [ $# != 3 ]  && [ $# != 4 ]
+then
+  echo "=============================================================================================================="
+  echo "Please run the script as: "
+  echo "bash scripts/run_distribute_train_gpu.sh [RANKSIZE] [DATASET] [CONFIG_PATH] [CUDA_VISIBLE_DEVICES(0,1,2,3,4,5,6,7)](optional)"
+  echo "for example: bash run_distribute_train_gpu.sh 8 /path/to/data/ /path/to/config/"
+  echo "=============================================================================================================="
+  exit 1
+fi
+
+RANK_SIZE=`expr $1 + 0`
+if [ $? != 0 ]; then
+  echo RANK_SIZE=$1 is not integer!
+  exit 1
+fi
+export RANK_SIZE=$RANK_SIZE
+DATASET=$(get_real_path $2)
+CONFIG_PATH=$(get_real_path $3)
+if [ $# != 4 ]; then
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+else
+  export CUDA_VISIBLE_DEVICES=$4
+fi
+PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
+TRAIN_OUTPUT=${PROJECT_DIR}/../train_distributed_gpu
+if [ -d $TRAIN_OUTPUT ]; then
+  rm -rf $TRAIN_OUTPUT
+fi
+mkdir $TRAIN_OUTPUT
+cd $TRAIN_OUTPUT || exit
+cp ../train.py ./
+cp ../eval.py ./
+cp -r ../src ./
+cp $CONFIG_PATH ./
+env > env.log
+
+mpirun -n $RANK_SIZE --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \
+python train.py  --run_distribute=True \
+                 --data_path=$DATASET  \
+                 --config_path=${CONFIG_PATH##*/}  \
+                 --output=./output \
+                 --device_target=GPU> train.log 2>&1 &
diff --git a/model_zoo/official/cv/unet/scripts/run_standalone_eval_gpu.sh b/model_zoo/official/cv/unet/scripts/run_standalone_eval_gpu.sh
index b3655bca169..fbf9c68c69a 100644
--- a/model_zoo/official/cv/unet/scripts/run_standalone_eval_gpu.sh
+++ b/model_zoo/official/cv/unet/scripts/run_standalone_eval_gpu.sh
@@ -13,10 +13,50 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
+get_real_path() {
+  if [ "${1:0:1}" == "/" ]; then
+    echo "$1"
+  else
+    echo "$(realpath -m $PWD/$1)"
+  fi
+}
 
-echo "=============================================================================================================="
-echo "Please run the script as: "
-echo "bash scripts/run_standalone_eval_gpu.sh [DATASET] [CHECKPOINT] [CONFIG_PATH]"
-echo "for example: bash run_standalone_eval_gpu.sh /path/to/data/ /path/to/checkpoint/ /path/to/config/"
-echo "=============================================================================================================="
-python eval.py --data_path=$1 --checkpoint_file_path=$2 --config_path=$3 > eval.log  2>&1 &
+if [ $# != 3 ]  && [ $# != 4 ]
+then
+  echo "=============================================================================================================="
+  echo "Please run the script as: "
+  echo "bash scripts/run_standalone_eval_gpu.sh [DATASET] [CHECKPOINT] [CONFIG_PATH] [DEVICE_ID](optional)"
+  echo "for example: bash run_standalone_eval_gpu.sh /path/to/data/ /path/to/checkpoint/ /path/to/config/"
+  echo "=============================================================================================================="
+  exit 1
+fi
+
+if [ $# != 4 ]; then
+  DEVICE_ID=0
+else
+  DEVICE_ID=`expr $4 + 0`
+  if [ $? != 0 ]; then
+    echo "DEVICE_ID=$4 is not an integer"
+    exit 1
+  fi
+fi
+
+export CUDA_VISIBLE_DEVICES=$DEVICE_ID
+DATASET=$(get_real_path $1)
+CHECKPOINT=$(get_real_path $2)
+CONFIG_PATH=$(get_real_path $3)
+PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
+TRAIN_OUTPUT=${PROJECT_DIR}/../eval_gpu
+if [ -d $TRAIN_OUTPUT ]; then
+  rm -rf $TRAIN_OUTPUT
+fi
+mkdir $TRAIN_OUTPUT
+cd $TRAIN_OUTPUT || exit
+cp ../eval.py ./
+cp -r ../src ./
+cp $CONFIG_PATH ./
+env > env.log
+python eval.py   --data_path=$DATASET  \
+                 --checkpoint_file_path=$CHECKPOINT \
+                 --config_path=${CONFIG_PATH##*/} \
+                 --device_target=GPU > eval.log  2>&1 &
\ No newline at end of file
diff --git a/model_zoo/official/cv/unet/scripts/run_standalone_train_gpu.sh b/model_zoo/official/cv/unet/scripts/run_standalone_train_gpu.sh
index e64e09b921c..24f35df5c91 100644
--- a/model_zoo/official/cv/unet/scripts/run_standalone_train_gpu.sh
+++ b/model_zoo/official/cv/unet/scripts/run_standalone_train_gpu.sh
@@ -14,9 +14,50 @@
 # limitations under the License.
 # ============================================================================
 
-echo "=============================================================================================================="
-echo "Please run the script as: "
-echo "bash scripts/run_standalone_train_gpu.sh [DATASET] [CONFIG_PATH]  "
-echo "for example: bash scripts/run_standalone_train_gpu.sh  /path/to/data/ /path/to/config/"
-echo "=============================================================================================================="
-python train.py  --data_path=$1  --config_path=$2 --output ./output > train.log  2>&1 &
+get_real_path() {
+  if [ "${1:0:1}" == "/" ]; then
+    echo "$1"
+  else
+    echo "$(realpath -m $PWD/$1)"
+  fi
+}
+
+if [ $# != 2 ]  && [ $# != 3 ]
+then
+  echo "=============================================================================================================="
+  echo "Please run the script as: "
+  echo "bash scripts/run_standalone_train_gpu.sh [DATASET] [CONFIG_PATH] [DEVICE_ID](optional)"
+  echo "for example: bash scripts/run_standalone_train_gpu.sh  /path/to/data/ /path/to/config/"
+  echo "=============================================================================================================="
+  exit 1
+fi
+
+if [ $# != 3 ]; then
+  DEVICE_ID=0
+else
+  DEVICE_ID=`expr $3 + 0`
+  if [ $? != 0 ]; then
+    echo "DEVICE_ID=$3 is not an integer"
+    exit 1
+  fi
+fi
+
+export CUDA_VISIBLE_DEVICES=$DEVICE_ID
+DATASET=$(get_real_path $1)
+CONFIG_PATH=$(get_real_path $2)
+PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
+TRAIN_OUTPUT=${PROJECT_DIR}/../train_standalone_gpu
+if [ -d $TRAIN_OUTPUT ]; then
+  rm -rf $TRAIN_OUTPUT
+fi
+mkdir $TRAIN_OUTPUT
+cd $TRAIN_OUTPUT || exit
+cp ../train.py ./
+cp ../eval.py ./
+cp -r ../src ./
+cp $CONFIG_PATH ./
+env > env.log
+python train.py  --data_path=$DATASET  \
+                 --config_path=${CONFIG_PATH##*/} \
+                 --output ./output \
+                 --device_target=GPU > train.log  2>&1 &
diff --git a/model_zoo/official/cv/unet/train.py b/model_zoo/official/cv/unet/train.py
index d7de5bd547d..781fa726c7b 100644
--- a/model_zoo/official/cv/unet/train.py
+++ b/model_zoo/official/cv/unet/train.py
@@ -32,6 +32,7 @@ from src.eval_callback import EvalCallBack
 
 from src.model_utils.config import config
 from src.model_utils.moxing_adapter import moxing_wrapper
+from src.model_utils.device_adapter import get_device_id
 
 mindspore.set_seed(1)
 
@@ -79,9 +80,11 @@ def train_net(cross_valid_ind=1,
         per_print_times = 0
         repeat = config.repeat if hasattr(config, "repeat") else 1
         split = config.split if hasattr(config, "split") else 0.8
+        python_multiprocessing = not (config.device_target == "GPU" and run_distribute)
         train_dataset = create_multi_class_dataset(data_dir, config.image_size, repeat, batch_size,
                                                    num_classes=config.num_classes, is_train=True, augment=True,
-                                                   split=split, rank=rank, group_size=group_size, shuffle=True)
+                                                   split=split, rank=rank, group_size=group_size, shuffle=True,
+                                                   python_multiprocessing=python_multiprocessing)
         valid_dataset = create_multi_class_dataset(data_dir, config.image_size, 1, 1,
                                                    num_classes=config.num_classes, is_train=False,
                                                    eval_resize=config.eval_resize, split=split,
@@ -110,9 +113,9 @@ def train_net(cross_valid_ind=1,
                         loss_scale=config.loss_scale)
 
     loss_scale_manager = mindspore.train.loss_scale_manager.FixedLossScaleManager(config.FixedLossScaleManager, False)
-
-    model = Model(net, loss_fn=criterion, loss_scale_manager=loss_scale_manager, optimizer=optimizer, amp_level="O3")
-
+    amp_level = "O0" if config.device_target == "GPU" else "O3"
+    model = Model(net, loss_fn=criterion, loss_scale_manager=loss_scale_manager, optimizer=optimizer,
+                  amp_level=amp_level)
     print("============== Starting Training ==============")
     callbacks = [StepLossTimeMonitor(batch_size=batch_size, per_print_times=per_print_times), ckpoint_cb]
     if config.run_eval:
@@ -132,7 +135,7 @@ if __name__ == '__main__':
     logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
     context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False)
     if config.device_target == "Ascend":
-        device_id = int(os.getenv('DEVICE_ID'))
+        device_id = get_device_id()
         context.set_context(device_id=device_id)
     epoch_size = config.epochs if not config.run_distribute else config.distribute_epochs
     batchsize = config.batch_size
diff --git a/model_zoo/official/cv/unet/unet_nested_cell_config.yaml b/model_zoo/official/cv/unet/unet_nested_cell_config.yaml
index 30ade34ad91..c49846a5bbe 100644
--- a/model_zoo/official/cv/unet/unet_nested_cell_config.yaml
+++ b/model_zoo/official/cv/unet/unet_nested_cell_config.yaml
@@ -25,6 +25,7 @@ epochs: 200
 repeat: 10
 distribute_epochs: 1600
 batch_size: 16
+distribute_batchsize: 16
 cross_valid_ind: 1
 num_classes: 2
 num_channels: 3
@@ -69,6 +70,7 @@ device_target: "Target device type, available: [Ascend, GPU, CPU]"
 enable_profiling: "Whether enable profiling while training, default: False"
 num_classes: "Class for dataset"
 batch_size: "Batch size for training and evaluation"
+distribute_batchsize: "Batch size for distribute training"
 weight_decay: "Weight decay."
 keep_checkpoint_max: "keep the last keep_checkpoint_max checkpoint"
 checkpoint_path: "The location of the checkpoint file."
diff --git a/model_zoo/official/cv/vgg16/ascend310_quant_infer/run_quant_infer.sh b/model_zoo/official/cv/vgg16/ascend310_quant_infer/run_quant_infer.sh
index 56f958ea641..31bba45de8e 100644
--- a/model_zoo/official/cv/vgg16/ascend310_quant_infer/run_quant_infer.sh
+++ b/model_zoo/official/cv/vgg16/ascend310_quant_infer/run_quant_infer.sh
@@ -16,6 +16,7 @@
 
 if [ $# -lt 3 ]; then
     echo "Usage: bash run_quant_infer.sh [AIR_PATH] [DATA_PATH] [LABEL_PATH]"
+    echo "Example: bash run_quant_infer.sh ./vgg_quant.air ./00_data ./cifar10_label_ids.npy"
 exit 1
 fi
 
diff --git a/model_zoo/official/cv/yolov3_darknet53/README.md b/model_zoo/official/cv/yolov3_darknet53/README.md
index 4ee4204ccda..7635f403d97 100644
--- a/model_zoo/official/cv/yolov3_darknet53/README.md
+++ b/model_zoo/official/cv/yolov3_darknet53/README.md
@@ -16,6 +16,7 @@
             - [Evaluation](#evaluation)
         - [Export MindIR](#export-mindir)
         - [Inference Process](#inference-process)
+        - [Post Training Quantization](#post-training-quantization)
     - [Model Description](#model-description)
         - [Performance](#performance)
             - [Evaluation Performance](#evaluation-performance)
@@ -340,7 +341,7 @@ For GPU device, distributed training example(8p) by shell script
 bash run_distribute_train_gpu.sh dataset/coco2014 darknet53_backbone.ckpt
 ```
 
-The above shell script will run distribute training in the background. You can view the results through the file `train_parallel[X]/log.txt`. The loss value will be achieved as follows:
+The above shell script will run distribute training in the background. You can view the results through the file `train_parallel0/log.txt`. The loss value will be achieved as follows:
 
 ```log
 # distribute training result(8p)
@@ -440,6 +441,52 @@ Inference result is saved in current path, you can find result in acc.log file.
  Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.551
 ```
 
+### [Post Training Quantization](#contents)
+
+Relative executing script files reside in the directory "ascend310_quant_infer". Please implement following steps sequentially to complete post quantization.
+Current quantization project bases on COCO2014 dataset.
+
+1. Generate data of .bin format required for AIR model inference at Ascend310 platform.
+
+```shell
+python export_bin.py --config_path [YMAL CONFIG PATH] --data_dir [DATA DIR] --annFile [ANNOTATION FILE PATH]
+```
+
+2. Export quantized AIR model.
+
+Post quantization of model requires special toolkits for exporting quantized AIR model. Please refer to [official website](https://www.hiascend.com/software/cann/community).
+
+```shell
+python post_quant.py --config_path [YMAL CONFIG PATH] --ckpt_file [CKPT_PATH] --data_dir [DATASET PATH] --annFile [ANNOTATION FILE PATH]
+```
+
+The quantized AIR file will be stored as "./results/yolov3_quant.air".
+
+3. Implement inference at Ascend310 platform.
+
+```shell
+# Ascend310 quant inference
+bash run_quant_infer.sh [AIR_PATH] [DATA_PATH] [IMAGE_ID] [IMAGE_SHAPE] [ANN_FILE]
+```
+
+Inference result is saved in current path, you can find result like this in acc.log file.
+
+```bash
+=============coco eval result=========
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.306
+ Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.524
+ Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.314
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.122
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.319
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.423
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.256
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.395
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.419
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.219
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.438
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.548
+```
+
 ## [Model Description](#contents)
 
 ### [Performance](#contents)
diff --git a/model_zoo/official/cv/yolov3_darknet53/README_CN.md b/model_zoo/official/cv/yolov3_darknet53/README_CN.md
index 802e4ba1a51..f618e5cb548 100644
--- a/model_zoo/official/cv/yolov3_darknet53/README_CN.md
+++ b/model_zoo/official/cv/yolov3_darknet53/README_CN.md
@@ -20,6 +20,7 @@
     - [推理过程](#推理过程)
         - [用法](#用法-2)
         - [结果](#结果-2)
+    - [训练后量化推理](#训练后量化推理)
 - [模型描述](#模型描述)
     - [性能](#性能)
         - [评估性能](#评估性能)
@@ -117,7 +118,9 @@ YOLOv3使用DarkNet53执行特征提取，这是YOLOv2中的Darknet-19和残差
       --data_dir=./dataset/coco2014 \
       --pretrained_backbone=darknet53_backbone.ckpt \
       --is_distributed=0 \
-      --lr=0.1 \
+      --lr=0.001 \
+      --loss_scale=1024 \
+      --weight_decay=0.016 \
       --T_max=320 \
       --max_epoch=320 \
       --warmup_epochs=4 \
@@ -295,7 +298,9 @@ python train.py \
     --data_dir=./dataset/coco2014 \
     --pretrained_backbone=darknet53_backbone.ckpt \
     --is_distributed=0 \
-    --lr=0.1 \
+    --lr=0.001 \
+    --loss_scale=1024 \
+    --weight_decay=0.016 \
     --T_max=320 \
     --max_epoch=320 \
     --warmup_epochs=4 \
@@ -331,7 +336,7 @@ bash run_distribute_train.sh dataset/coco2014 darknet53_backbone.ckpt rank_table
 bash run_distribute_train_gpu.sh dataset/coco2014 darknet53_backbone.ckpt
 ```
 
-上述shell脚本将在后台运行分布训练。您可以通过`train_parallel[X]/log.txt`文件查看结果。损失值的实现如下：
+上述shell脚本将在后台运行分布训练。您可以通过`train_parallel0/log.txt`文件查看结果。损失值的实现如下：
 
 ```text
 # 分布式训练示例(8卡)
@@ -430,6 +435,51 @@ bash run_infer_310.sh [MINDIR_PATH] [DATA_PATH] [ANNO_PATH] [DEVICE_ID]
  Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.551
 ```
 
+## [训练后量化推理](#contents)
+
+训练后量化推理的相关执行脚本文件在"ascend310_quant_infer"目录下，依次执行以下步骤实现训练后量化推理。本训练后量化工程基于COCO2014数据集。
+
+1、生成Ascend310平台AIR模型推理需要的.bin格式数据。
+
+```shell
+python export_bin.py --config_path [YMAL CONFIG PATH] --data_dir [DATA DIR] --annFile [ANNOTATION FILE PATH]
+```
+
+2、导出训练后量化的AIR格式模型。
+
+导出训练后量化模型需要配套的量化工具包，参考[官方地址](https://www.hiascend.com/software/cann/community)
+
+```shell
+python post_quant.py --config_path [YMAL CONFIG PATH] --ckpt_file [CKPT_PATH] --data_dir [DATASET PATH] --annFile [ANNOTATION FILE PATH]
+```
+
+导出的模型会存储在./result/yolov3_quant.air。
+
+3、在Ascend310执行推理量化模型。
+
+```shell
+# Ascend310 quant inference
+bash run_quant_infer.sh [AIR_PATH] [DATA_PATH] [IMAGE_ID] [IMAGE_SHAPE] [ANN_FILE]
+```
+
+推理结果保存在脚本执行的当前路径，可以在acc.log中看到精度计算结果。
+
+```bash
+=============coco eval result=========
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.306
+ Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.524
+ Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.314
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.122
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.319
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.423
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.256
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.395
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.419
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.219
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.438
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.548
+```
+
 # 模型描述
 
 ## 性能
diff --git a/model_zoo/official/cv/yolov3_darknet53/default_config.yaml b/model_zoo/official/cv/yolov3_darknet53/default_config.yaml
index e949d1d48db..5c8dd49e64e 100644
--- a/model_zoo/official/cv/yolov3_darknet53/default_config.yaml
+++ b/model_zoo/official/cv/yolov3_darknet53/default_config.yaml
@@ -75,6 +75,10 @@ file_name: "yolov3_darknet53"
 file_format: "AIR" # ["AIR", "ONNX", "MINDIR"]
 keep_detect: True
 
+# PostProcess option
+result_path: ""
+img_path: ""
+
 # convert weight option
 input_file: "./darknet53.conv.74"
 output_file: "./backbone_darknet53.ckpt"
diff --git a/model_zoo/official/cv/yolov3_darknet53/postprocess.py b/model_zoo/official/cv/yolov3_darknet53/postprocess.py
index 5c0e8679c58..fe71cfbf25e 100644
--- a/model_zoo/official/cv/yolov3_darknet53/postprocess.py
+++ b/model_zoo/official/cv/yolov3_darknet53/postprocess.py
@@ -14,43 +14,33 @@
 # ============================================================================
 """YoloV3 postprocess."""
 import os
-import argparse
 import datetime
 import numpy as np
 from PIL import Image
 from eval import DetectionEngine
+from model_utils.config import config
 
 def get_img_size(file_name):
     img = Image.open(file_name)
     return img.size
 
-parser = argparse.ArgumentParser('YoloV3 postprocess')
-parser.add_argument('--result_path', type=str, required=True, help='result files path.')
-parser.add_argument('--img_path', type=str, required=True, help='train data dir.')
-parser.add_argument('--per_batch_size', default=1, type=int, help='batch size for per gpu')
-parser.add_argument('--nms_thresh', type=float, default=0.5, help='threshold for NMS')
-parser.add_argument('--annFile', type=str, default='', help='path to annotation')
-parser.add_argument('--ignore_threshold', type=float, default=0.001, help='threshold to throw low quality boxes')
-parser.add_argument('--log_path', type=str, default='outputs/', help='inference result save location')
-
-args, _ = parser.parse_known_args()
-
 if __name__ == "__main__":
-    args.outputs_dir = os.path.join(args.log_path,
-                                    datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
-    if not os.path.exists(args.outputs_dir):
-        os.makedirs(args.outputs_dir)
+    config.outputs_dir = os.path.join(config.log_path,
+                                      datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
+    if not os.path.exists(config.outputs_dir):
+        os.makedirs(config.outputs_dir)
 
-    detection = DetectionEngine(args)
-    bs = args.per_batch_size
+    detection = DetectionEngine(config)
+    bs = config.per_batch_size
 
-    f_list = os.listdir(args.img_path)
+    f_list = os.listdir(config.img_path)
     for f in f_list:
-        image_size = get_img_size(os.path.join(args.img_path, f))
+        image_size = get_img_size(os.path.join(config.img_path, f))
         f = f.split('.')[0]
-        output_big = np.fromfile(os.path.join(args.result_path, f + '_0.bin'), np.float32).reshape(bs, 13, 13, 3, 85)
-        output_me = np.fromfile(os.path.join(args.result_path, f + '_1.bin'), np.float32).reshape(bs, 26, 26, 3, 85)
-        output_small = np.fromfile(os.path.join(args.result_path, f + '_2.bin'), np.float32).reshape(bs, 52, 52, 3, 85)
+        output_big = np.fromfile(os.path.join(config.result_path, f + '_0.bin'), np.float32).reshape(bs, 13, 13, 3, 85)
+        output_me = np.fromfile(os.path.join(config.result_path, f + '_1.bin'), np.float32).reshape(bs, 26, 26, 3, 85)
+        output_small = np.fromfile(os.path.join(config.result_path,
+                                                f + '_2.bin'), np.float32).reshape(bs, 52, 52, 3, 85)
         image_id = [int(f.split('_')[-1])]
         image_shape = [[image_size[0], image_size[1]]]
 
diff --git a/model_zoo/official/cv/yolov3_darknet53/scripts/run_infer_310.sh b/model_zoo/official/cv/yolov3_darknet53/scripts/run_infer_310.sh
index 65fb91e17f2..848daf83a86 100644
--- a/model_zoo/official/cv/yolov3_darknet53/scripts/run_infer_310.sh
+++ b/model_zoo/official/cv/yolov3_darknet53/scripts/run_infer_310.sh
@@ -80,7 +80,7 @@ function infer()
 
 function cal_acc()
 {
-    python3.7 ../postprocess.py --result_path=./result_Files --img_path=$data_path --annFile=$anno_path &> acc.log
+    python3.7 ../postprocess.py --per_batch_size=1 --result_path=./result_Files --img_path=$data_path --annFile=$anno_path &> acc.log
 }
 
 compile_app
diff --git a/model_zoo/official/cv/yolov3_darknet53_quant/scripts/run_infer_310.sh b/model_zoo/official/cv/yolov3_darknet53_quant/scripts/run_infer_310.sh
index 66e114a8b67..b72b9454431 100644
--- a/model_zoo/official/cv/yolov3_darknet53_quant/scripts/run_infer_310.sh
+++ b/model_zoo/official/cv/yolov3_darknet53_quant/scripts/run_infer_310.sh
@@ -53,9 +53,10 @@ if [ -d ${ASCEND_HOME}/ascend-toolkit ]; then
     export PYTHONPATH=${TBE_IMPL_PATH}:$ASCEND_HOME/ascend-toolkit/latest/fwkacllib/python/site-packages:$PYTHONPATH
     export ASCEND_OPP_PATH=$ASCEND_HOME/ascend-toolkit/latest/opp
 else
-    export PATH=$ASCEND_HOME/atc/ccec_compiler/bin:$ASCEND_HOME/atc/bin:$PATH
-    export LD_LIBRARY_PATH=/usr/local/lib:$ASCEND_HOME/atc/lib64:$ASCEND_HOME/acllib/lib64:$ASCEND_HOME/driver/lib64:$ASCEND_HOME/add-ons:$LD_LIBRARY_PATH
-    export PYTHONPATH=$ASCEND_HOME/atc/python/site-packages:$PYTHONPATH
+    export PATH=$ASCEND_HOME/fwkacllib/ccec_compiler/bin:$ASCEND_HOME/fwkacllib/bin:$PATH
+    export LD_LIBRARY_PATH=/usr/local/lib:$ASCEND_HOME/fwkacllib/lib64:$ASCEND_HOME/driver/lib64:$LD_LIBRARY_PATH
+    export TBE_IMPL_PATH=$ASCEND_HOME/opp/op_impl/built-in/ai_core/tbe
+    export PYTHONPATH=$PYTHONPATH:$TBE_IMPL_PATH
     export ASCEND_OPP_PATH=$ASCEND_HOME/opp
 fi
 
@@ -111,4 +112,4 @@ cal_acc
 if [ $? -ne 0 ]; then
     echo "calculate accuracy failed"
     exit 1
-fi
\ No newline at end of file
+fi
diff --git a/model_zoo/official/cv/yolov3_resnet18/src/yolov3.py b/model_zoo/official/cv/yolov3_resnet18/src/yolov3.py
index 91ac4081e4b..f6751ed5516 100644
--- a/model_zoo/official/cv/yolov3_resnet18/src/yolov3.py
+++ b/model_zoo/official/cv/yolov3_resnet18/src/yolov3.py
@@ -156,13 +156,17 @@ class ResNet(nn.Cell):
                  in_channels,
                  out_channels,
                  strides=None,
-                 num_classes=80):
+                 num_classes=None,
+                 feature_only=True):
         super(ResNet, self).__init__()
 
         if not len(layer_nums) == len(in_channels) == len(out_channels) == 4:
             raise ValueError("the length of "
                              "layer_num, inchannel, outchannel list must be 4!")
 
+        self.feature_only = feature_only
+        if num_classes is None:
+            self.feature_only = True
         self.conv1 = _conv2d(3, 64, 7, stride=2)
         self.bn1 = _fused_bn(64)
         self.relu = P.ReLU()
@@ -240,7 +244,7 @@ class ResNet(nn.Cell):
         c5 = self.layer4(c4)
 
         out = c5
-        if self.num_classes:
+        if self.feature_only:
             out = self.reduce_mean(c5, (2, 3))
             out = self.squeeze(out)
             out = self.end_point(out)
@@ -266,7 +270,8 @@ def resnet18(class_num=10):
                   [64, 64, 128, 256],
                   [64, 128, 256, 512],
                   [1, 2, 2, 2],
-                  num_classes=class_num)
+                  num_classes=class_num,
+                  feature_only=False)
 
 
 class YoloBlock(nn.Cell):
@@ -586,7 +591,8 @@ class yolov3_resnet18(nn.Cell):
                                                   self.config.backbone_input_shape,
                                                   self.config.backbone_shape,
                                                   self.config.backbone_stride,
-                                                  num_classes=None),
+                                                  num_classes=None,
+                                                  feature_only=True),
                                   backbone_shape=self.config.backbone_shape,
                                   out_channel=self.config.out_channel)
 
@@ -672,8 +678,7 @@ class TrainingWrapper(nn.Cell):
         if self.reducer_flag:
             # apply grad reducer on grads
             grads = self.grad_reducer(grads)
-        self.optimizer(grads)
-        return loss
+        return F.depend(loss, self.optimizer(grads))
 
 
 class YoloBoxScores(nn.Cell):
diff --git a/model_zoo/official/cv/yolov4/README.md b/model_zoo/official/cv/yolov4/README.md
index 0b6e5e396ae..338495e0521 100644
--- a/model_zoo/official/cv/yolov4/README.md
+++ b/model_zoo/official/cv/yolov4/README.md
@@ -15,6 +15,7 @@
         - [Evaluation](#evaluation)
     - [Convert Process](#convert-process)
         - [Convert](#convert)
+    - [Post Training Quantization](#post-training-quantization)
 - [Model Description](#model-description)
     - [Performance](#performance)
         - [Evaluation Performance](#evaluation-performance)
@@ -529,6 +530,52 @@ Average Recall    (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.636
 Average Recall    (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.716
 ```
 
+## [Post Training Quantization](#contents)
+
+Relative executing script files reside in the directory "ascend310_quant_infer". Please implement following steps sequentially to complete post quantization.
+Current quantization project bases on COCO2017 dataset.
+
+1. Generate data of .bin format required for AIR model inference at Ascend310 platform.
+
+```shell
+python export_bin.py --config_path [YMAL CONFIG PATH] --data_dir [DATA DIR] --annFile [ANNOTATION FILE PATH]
+```
+
+2. Export quantized AIR model.
+
+Post quantization of model requires special toolkits for exporting quantized AIR model. Please refer to [official website](https://www.hiascend.com/software/cann/community).
+
+```shell
+python post_quant.py --config_path [YMAL CONFIG PATH] --ckpt_file [CKPT_PATH] --data_dir [DATASET PATH] --annFile [ANNOTATION FILE PATH]
+```
+
+The quantized AIR file will be stored as "./results/yolov4_quant.air".
+
+3. Implement inference at Ascend310 platform.
+
+```shell
+# Ascend310 quant inference
+bash run_quant_infer.sh [AIR_PATH] [DATA_PATH] [IMAGE_ID] [IMAGE_SHAPE] [ANN_FILE]
+```
+
+Inference result is saved in current path, you can find result like this in acc.log file.
+
+```bash
+=============coco eval result=========
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.433
+ Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.633
+ Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.467
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.273
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.475
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.555
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.329
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.532
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.568
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.395
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.611
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.690
+```
+
 # [Model Description](#contents)
 
 ## [Performance](#contents)
diff --git a/model_zoo/official/cv/yolov4/README_CN.md b/model_zoo/official/cv/yolov4/README_CN.md
index 2d560cb13e8..142ad148efe 100644
--- a/model_zoo/official/cv/yolov4/README_CN.md
+++ b/model_zoo/official/cv/yolov4/README_CN.md
@@ -22,6 +22,7 @@
     - [推理过程](#推理过程)
         - [用法](#用法)
         - [结果](#结果)
+    - [训练后量化推理](#训练后量化推理)
 - [模型说明](#模型说明)
     - [性能](#性能)
         - [评估性能](#评估性能)
@@ -536,6 +537,51 @@ Average Recall    (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.636
 Average Recall    (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.716
 ```
 
+## [训练后量化推理](#contents)
+
+训练后量化推理的相关执行脚本文件在"ascend310_quant_infer"目录下，依次执行以下步骤实现训练后量化推理。本训练后量化工程基于COCO2017数据集。
+
+1、生成Ascend310平台AIR模型推理需要的.bin格式数据。
+
+```shell
+python export_bin.py --config_path [YMAL CONFIG PATH] --data_dir [DATA DIR] --annFile [ANNOTATION FILE PATH]
+```
+
+2、导出训练后量化的AIR格式模型。
+
+导出训练后量化模型需要配套的量化工具包，参考[官方地址](https://www.hiascend.com/software/cann/community)
+
+```shell
+python post_quant.py --config_path [YMAL CONFIG PATH] --ckpt_file [CKPT_PATH] --data_dir [DATASET PATH] --annFile [ANNOTATION FILE PATH]
+```
+
+导出的模型会存储在./result/yolov4_quant.air。
+
+3、在Ascend310执行推理量化模型。
+
+```shell
+# Ascend310 quant inference
+bash run_quant_infer.sh [AIR_PATH] [DATA_PATH] [IMAGE_ID] [IMAGE_SHAPE] [ANN_FILE]
+```
+
+推理结果保存在脚本执行的当前路径，可以在acc.log中看到精度计算结果。
+
+```bash
+=============coco eval result=========
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.433
+ Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.633
+ Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.467
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.273
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.475
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.555
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.329
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.532
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.568
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.395
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.611
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.690
+```
+
 # [模型说明](#目录)
 
 ## [性能](#目录)
diff --git a/model_zoo/official/gnn/gat/src/utils.py b/model_zoo/official/gnn/gat/src/utils.py
index 441ef7c48ee..c7bae8c8b86 100644
--- a/model_zoo/official/gnn/gat/src/utils.py
+++ b/model_zoo/official/gnn/gat/src/utils.py
@@ -18,6 +18,7 @@ from mindspore.common.parameter import ParameterTuple
 from mindspore import Tensor
 from mindspore.common import dtype as mstype
 from mindspore.ops import composite as C
+from mindspore.ops import functional as F
 from mindspore.ops import operations as P
 
 
@@ -149,8 +150,7 @@ class TrainOneStepCell(nn.Cell):
         loss = self.network(feature, biases)
         sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)
         grads = self.grad(self.network, weights)(feature, biases, sens)
-        self.optimizer(grads)
-        return loss
+        return F.depend(loss, self.optimizer(grads))
 
 
 class TrainGAT(nn.Cell):
diff --git a/model_zoo/official/nlp/bert/README.md b/model_zoo/official/nlp/bert/README.md
index c2753423fad..382e8766ed0 100644
--- a/model_zoo/official/nlp/bert/README.md
+++ b/model_zoo/official/nlp/bert/README.md
@@ -654,8 +654,10 @@ The result will be as follows:
 
 - Export on local
 
+We only support export with fine-tuned downstream task model and yaml config file, because the pretrained model is useless in inferences task.
+
 ```shell
-python export.py --config_path [../../*.yaml] --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]
+python export.py --config_path [../../*.yaml] --export_ckpt_file [CKPT_PATH] --export_file_name [FILE_NAME] --file_format [FILE_FORMAT]
 ```
 
 - Export on ModelArts (If you want to run in modelarts, please check the official documentation of [modelarts](https://support.huaweicloud.com/modelarts/), and you can start as follows)
@@ -686,8 +688,7 @@ python export.py --config_path [../../*.yaml] --ckpt_file [CKPT_PATH] --file_nam
 # You will see bert_ner.mindir under {Output file path}.
 ```
 
-The ckpt_file parameter is required,
-`EXPORT_FORMAT` should be in ["AIR", "MINDIR"]
+The `export_ckpt_file` parameter is required, and `file_format` should be in ["AIR", "MINDIR"]
 
 ### [Inference Process](#contents)
 
@@ -789,6 +790,8 @@ Please check the official [homepage](https://gitee.com/mindspore/mindspore/tree/
 
 # FAQ
 
+Refer to the [ModelZoo FAQ](https://gitee.com/mindspore/mindspore/tree/master/model_zoo#FAQ) for some common question.
+
 - **Q: How to resolve the continually overflow?**
 
   **A**: Continually overflow is usually caused by using too high learning rate.
@@ -797,4 +800,3 @@ Please check the official [homepage](https://gitee.com/mindspore/mindspore/tree/
 - **Q: Why the training process failed with error for the shape can not match?**
   **A**: This is usually caused by the config `seq_length` of model can't match the dataset. You could check and modified the `seq_length` in yaml config according to the dataset you used.
   The parameter of model won't change with `seq_length`, the shapes of parameter only depends on model config `max_position_embeddings`.
-
diff --git a/model_zoo/official/nlp/bert/README_CN.md b/model_zoo/official/nlp/bert/README_CN.md
index 26cb64eb178..e0d53f64e86 100644
--- a/model_zoo/official/nlp/bert/README_CN.md
+++ b/model_zoo/official/nlp/bert/README_CN.md
@@ -613,10 +613,12 @@ bash scripts/squad.sh
 
 ## 导出mindir模型
 
+由于预训练模型通常没有应用场景，需要经过下游任务的finetune之后才能使用，所以当前仅支持使用下游任务模型和yaml配置文件进行export操作。
+
 - 在本地导出
 
 ```shell
-python export.py --config_path [../../*.yaml] --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]
+python export.py --config_path [../../*.yaml] --export_ckpt_file [CKPT_PATH] --export_file_name [FILE_NAME] --file_format [FILE_FORMAT]
 ```
 
 - 在ModelArts上导出
@@ -647,7 +649,7 @@ python export.py --config_path [../../*.yaml] --ckpt_file [CKPT_PATH] --file_nam
 # 你将在{Output file path}下看到 'bert_ner.mindir'文件
 ```
 
-参数`ckpt_file` 是必需的，`EXPORT_FORMAT` 必须在 ["AIR", "MINDIR"]中进行选择。
+参数`export_ckpt_file` 是必需的，`file_format` 必须在 ["AIR", "MINDIR"]中进行选择。
 
 ## 推理过程
 
@@ -747,8 +749,10 @@ run_pretrain.py中设置了随机种子，确保分布式训练中每个节点
 
 # FAQ
 
+优先参考[ModelZoo FAQ](https://gitee.com/mindspore/mindspore/tree/master/model_zoo#FAQ)来查找一些常见的公共问题。
+
 - **Q: 运行过程中发生持续溢出怎么办？**
   **A**： 持续溢出通常是因为使用了较高的学习率导致训练不收敛。可以考虑修改yaml配置文件中的参数，调低`learning_rate`来降低初始学习率或提高`power`加速学习率衰减。
 
 - **Q: 运行报错shape不匹配是什么问题？**
-  **A**： Bert模型中的shape不匹配通常是因为模型参数配置和使用的数据集规格不匹配，主要是句长问题，可以考虑修改`seq_length`参数来匹配所使用的具体数据集。改变该参数不影响权重的规格，权重的规格仅与`max_position_embeddings`参数有关。
\ No newline at end of file
+  **A**： Bert模型中的shape不匹配通常是因为模型参数配置和使用的数据集规格不匹配，主要是句长问题，可以考虑修改`seq_length`参数来匹配所使用的具体数据集。改变该参数不影响权重的规格，权重的规格仅与`max_position_embeddings`参数有关。
diff --git a/model_zoo/official/nlp/bert/scripts/run_infer_310.sh b/model_zoo/official/nlp/bert/scripts/run_infer_310.sh
index 61b85bb6257..acd330e027c 100644
--- a/model_zoo/official/nlp/bert/scripts/run_infer_310.sh
+++ b/model_zoo/official/nlp/bert/scripts/run_infer_310.sh
@@ -23,7 +23,9 @@ exit 1
 fi
 
 get_real_path(){
-    if [ "${1:0:1}" == "/" ]; then
+    if [ -z "$1" ]; then
+        echo ""
+    elif [ "${1:0:1}" == "/" ]; then
         echo "$1"
     else
         echo "$(realpath -m $PWD/$1)"
diff --git a/model_zoo/official/nlp/dgu/README_CN.md b/model_zoo/official/nlp/dgu/README_CN.md
index 6e0c63d6dd9..48f6bb37f62 100644
--- a/model_zoo/official/nlp/dgu/README_CN.md
+++ b/model_zoo/official/nlp/dgu/README_CN.md
@@ -25,10 +25,9 @@
         - [用法](#用法-1)
             - [Ascend处理器上运行后评估各个任务的模型](#Ascend处理器上运行后评估各个任务的模型)
             - [GPU上运行后评估各个任务的模型](#GPU上运行后评估各个任务的模型)
-    - [模型描述](#模型描述)
-    - [性能](#性能)
-        - [预训练性能](#预训练性能)
-            - [推理性能](#推理性能)
+    - [310推理](#310推理)
+        - [导出模型](#导出模型)
+        - [用法](#在ascend310执行推理)
 - [随机情况说明](#随机情况说明)
 - [ModelZoo主页](#modelzoo主页)
 
@@ -406,6 +405,36 @@ evaling...
 Accuracy  : 0.8082890070921985
 ```
 
+## 310推理
+
+### 导出模型
+
+```shell
+bash scripts/export.sh
+# export finetune ckpt to mindir
+```
+
+参数`ckpt_file`，`file_format`需要在`export.sh`中设置。
+
+### 在Ascend310执行推理
+
+以下展示了使用mindir模型执行推理的示例。
+
+```shell
+# Ascend310推理
+bash scripts/run_infer_310.sh [MINDIR_PATH] [DATA_FILE_PATH] [NEED_PREPROCESS] [DEVICE_ID] [DATASET]
+```
+
+- `MINDIR_PATH` 为ckpt导出的mindir模型文件路径。
+- `DATA_FILE_PATH` 为预处理为MindRecord格式的测试数据。
+- `NEED_PREPROCESS` 表示数据是否需要预处理，取值范围为：'y' 或者 'n'。
+- `DEVICE_ID` 可选，默认值为0。
+- `DATASET` 为执行推理的数据集，可选，数据集包括['atis', 'mrda', 'swda', 'udc'],默认值为'atis'。
+
+### 结果
+
+推理结果保存在脚本执行的当前路径，精度计算结果可以在acc.log中看到。
+
 # 随机情况说明
 
 run_dgu.sh中设置train_data_shuffle为true，eval_data_shuffle为false，默认对数据集进行轮换操作。
diff --git a/model_zoo/official/nlp/dgu/export.py b/model_zoo/official/nlp/dgu/export.py
index c434790002e..d756ad7dbc6 100644
--- a/model_zoo/official/nlp/dgu/export.py
+++ b/model_zoo/official/nlp/dgu/export.py
@@ -20,11 +20,11 @@ import mindspore.common.dtype as mstype
 from mindspore import Tensor, context, load_checkpoint, export
 
 from src.finetune_eval_config import bert_net_cfg
-from src.finetune_eval_model import BertCLSModel
+from src.bert_for_finetune import BertCLS
 parser = argparse.ArgumentParser(description="Bert export")
 parser.add_argument("--device_id", type=int, default=0, help="Device id")
-parser.add_argument("--batch_size", type=int, default=16, help="batch size")
-parser.add_argument("--number_labels", type=int, default=16, help="batch size")
+parser.add_argument("--batch_size", type=int, default=1, help="batch size")
+parser.add_argument("--number_labels", type=int, default=26, help="batch size")
 parser.add_argument("--ckpt_file", type=str, required=True, help="Bert ckpt file.")
 parser.add_argument("--file_name", type=str, default="Bert", help="bert output air name.")
 parser.add_argument("--file_format", type=str, choices=["AIR", "ONNX", "MINDIR"], default="AIR", help="file format")
@@ -38,7 +38,7 @@ if args.device_target == "Ascend":
 
 
 if __name__ == "__main__":
-    net = BertCLSModel(bert_net_cfg, False, num_labels=args.number_labels)
+    net = BertCLS(bert_net_cfg, False, num_labels=args.number_labels)
 
     load_checkpoint(args.ckpt_file, net=net)
     net.set_train(False)
@@ -49,4 +49,4 @@ if __name__ == "__main__":
     label_ids = Tensor(np.zeros([args.batch_size, bert_net_cfg.seq_length]), mstype.int32)
 
     input_data = [input_ids, input_mask, token_type_id]
-    export(net, *input_data, file_name=args.file_name, file_format=args.file_format)
+    export(net.bert, *input_data, file_name=args.file_name, file_format=args.file_format)
diff --git a/model_zoo/official/nlp/dgu/run_dgu.py b/model_zoo/official/nlp/dgu/run_dgu.py
index 2f139155446..7dc9b0e141f 100644
--- a/model_zoo/official/nlp/dgu/run_dgu.py
+++ b/model_zoo/official/nlp/dgu/run_dgu.py
@@ -148,13 +148,13 @@ def run_dgu(args_input):
         netwithloss = BertCLS(net_cfg, True, num_labels=num_class, dropout_prob=0.1)
         train_ds = create_classification_dataset(batch_size=args_input.train_batch_size, repeat_count=1, \
                         data_file_path=args_input.train_data_file_path, \
-                        do_shuffle=(args_input.train_data_shuffle.lower() == "true"))
+                        do_shuffle=(args_input.train_data_shuffle.lower() == "true"), drop_remainder=True)
         do_train(train_ds, netwithloss, load_pretrain_checkpoint_path, save_finetune_checkpoint_path, epoch_num)
 
     if args_input.do_eval.lower() == "true":
         eval_ds = create_classification_dataset(batch_size=args_input.eval_batch_size, repeat_count=1, \
                     data_file_path=args_input.eval_data_file_path, \
-                    do_shuffle=(args_input.eval_data_shuffle.lower() == "true"))
+                    do_shuffle=(args_input.eval_data_shuffle.lower() == "true"), drop_remainder=True)
         if args_input.task_name in ['atis_intent', 'mrda', 'swda']:
             eval_metric = metric_class("classification")
         else:
diff --git a/model_zoo/official/nlp/dgu/scripts/export.sh b/model_zoo/official/nlp/dgu/scripts/export.sh
index c6228495687..07231c26248 100644
--- a/model_zoo/official/nlp/dgu/scripts/export.sh
+++ b/model_zoo/official/nlp/dgu/scripts/export.sh
@@ -15,9 +15,33 @@
 # ============================================================================
 
 python export.py --device_id=0 \
-        --batch_size=32  \
+        --batch_size=1  \
         --number_labels=26  \
-        --ckpt_file=/home/ma-user/work/ckpt/atis_intent/0.9791666666666666_atis_intent-11_155.ckpt  \
+        --ckpt_file=./ckpt/atis_intent/atis_intent-11_155.ckpt  \
         --file_name=atis_intent.mindir  \
         --file_format=MINDIR  \
         --device_target=Ascend
+
+python export.py --device_id=0 \
+        --batch_size=1  \
+        --number_labels=5  \
+        --ckpt_file=./ckpt/mrda/mrda-7_2364.ckpt  \
+        --file_name=mrda.mindir  \
+        --file_format=MINDIR  \
+        --device_target=Ascend
+
+python export.py --device_id=0 \
+        --batch_size=1  \
+        --number_labels=42  \
+        --ckpt_file=./ckpt/swda/swda-3_6094.ckpt  \
+        --file_name=swda.mindir  \
+        --file_format=MINDIR  \
+        --device_target=Ascend
+
+python export.py --device_id=0 \
+        --batch_size=1  \
+        --number_labels=2  \
+        --ckpt_file=./ckpt/udc/udc-2_31250.ckpt  \
+        --file_name=udc.mindir  \
+        --file_format=MINDIR  \
+        --device_target=Ascend
diff --git a/model_zoo/official/nlp/dgu/src/utils.py b/model_zoo/official/nlp/dgu/src/utils.py
index 27486775a7d..474bd2b7e2e 100644
--- a/model_zoo/official/nlp/dgu/src/utils.py
+++ b/model_zoo/official/nlp/dgu/src/utils.py
@@ -37,7 +37,7 @@ from mindspore.train.callback import Callback
 
 
 def create_classification_dataset(batch_size=32, repeat_count=1,
-                                  data_file_path=None, schema_file_path=None, do_shuffle=True):
+                                  data_file_path=None, schema_file_path=None, do_shuffle=True, drop_remainder=False):
     """create finetune or evaluation dataset from mindrecord file"""
     type_cast_op = C.TypeCast(mstype.int32)
     data_set = ds.MindDataset([data_file_path],  \
@@ -48,7 +48,7 @@ def create_classification_dataset(batch_size=32, repeat_count=1,
     data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
     data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
     #data_set = data_set.repeat(repeat_count)
-    data_set = data_set.batch(batch_size, drop_remainder=True)
+    data_set = data_set.batch(batch_size, drop_remainder=drop_remainder)
     return data_set
 
 
diff --git a/model_zoo/official/nlp/emotect/README_CN.md b/model_zoo/official/nlp/emotect/README_CN.md
index 0541e6be050..08cb38a238b 100755
--- a/model_zoo/official/nlp/emotect/README_CN.md
+++ b/model_zoo/official/nlp/emotect/README_CN.md
@@ -14,6 +14,9 @@
         - [用法](#用法)
     - [评估过程](#评估过程)
         - [用法](#用法-1)
+    - [310推理](#310推理)
+        - [导出模型](#导出模型)
+        - [用法](#在ascend310执行推理)
 - [ModelZoo主页](#modelzoo主页)
 
 # 概述
@@ -56,10 +59,10 @@ label   text_a
 - 硬件（Ascend/GPU）
     - 使用Ascend或GPU处理器来搭建硬件环境。
 - 框架
-    - [MindSpore](https://www.mindspore.cn/install)
+    - [MindSpore](https://www.mindspore.cn/install/en)
 - 如需查看详情，请参见如下资源：
     - [MindSpore教程](https://www.mindspore.cn/tutorials/zh-CN/master/index.html)
-    - [MindSpore Python API](https://www.mindspore.cn/docs/api/zh-CN/master/index.html)
+    - [MindSpore Python API](https://www.mindspore.cn/docs/api/en/master/index.html)
 
 # 快速入门
 
@@ -150,7 +153,7 @@ bash script/download_data.sh
 bash scripts/convert_dataset.sh
 # `convert_dataset.sh` depend on ERNIE vocabulary,
 # you should download ERNIE model first by:
-# bash script/download_model.sh
+# sh script/download_model.sh
 ```
 
 #### Ascend处理器或GPU上运行
@@ -191,6 +194,34 @@ bash scripts/run_classifier_eval_{platform}.sh
 # platform: gpu or ascend
 ```
 
+## 310推理
+
+### 导出模型
+
+```shell
+bash scripts/export.sh
+# export finetune ckpt to mindir
+```
+
+参数`ckpt_file`，`file_format`已在`export.sh`中设置。
+
+### 在Ascend310执行推理
+
+以下展示了使用minir模型执行推理的示例。
+
+```shell
+# Ascend310推理
+bash scripts/run_infer_310.sh [MINDIR_PATH] [DATA_FILE_PATH] [NEED_PREPROCESS] [DEVICE_ID]
+```
+
+- `DATA_FILE_PATH` 为预处理为MindRecord格式的测试数据。
+- `NEED_PREPROCESS` 表示数据是否需要预处理，取值范围为：'y' 或者 'n'。
+- `DEVICE_ID` 可选，默认值为0。
+
+### 结果
+
+推理结果保存在脚本执行的当前路径，精度计算结果可以在acc.log中看到。
+
 # ModelZoo主页
 
 请浏览官网[主页](https://gitee.com/mindspore/mindspore/tree/master/model_zoo)。
diff --git a/model_zoo/official/nlp/emotect/export.py b/model_zoo/official/nlp/emotect/export.py
index bf2d115ac7e..2598a83d660 100644
--- a/model_zoo/official/nlp/emotect/export.py
+++ b/model_zoo/official/nlp/emotect/export.py
@@ -20,7 +20,7 @@ import mindspore.common.dtype as mstype
 from mindspore import Tensor, context, load_checkpoint, export
 
 from src.finetune_eval_config import ernie_net_cfg
-from src.finetune_eval_model import ErnieCLSModel
+from src.ernie_for_finetune import ErnieCLS
 parser = argparse.ArgumentParser(description="Emotect export")
 parser.add_argument("--device_id", type=int, default=0, help="Device id")
 parser.add_argument("--batch_size", type=int, default=32, help="batch size")
@@ -38,7 +38,7 @@ if args.device_target == "Ascend":
     context.set_context(device_id=args.device_id)
 
 if __name__ == "__main__":
-    net = ErnieCLSModel(ernie_net_cfg, False, num_labels=args.number_labels)
+    net = ErnieCLS(ernie_net_cfg, False, num_labels=args.number_labels)
 
     load_checkpoint(args.ckpt_file, net=net)
     net.set_train(False)
@@ -49,4 +49,4 @@ if __name__ == "__main__":
     label_ids = Tensor(np.zeros([args.batch_size, ernie_net_cfg.seq_length]), mstype.int32)
 
     input_data = [input_ids, input_mask, token_type_id]
-    export(net, *input_data, file_name=args.file_name, file_format=args.file_format)
+    export(net.ernie, *input_data, file_name=args.file_name, file_format=args.file_format)
diff --git a/model_zoo/official/nlp/emotect/requirements.txt b/model_zoo/official/nlp/emotect/requirements.txt
index 193513a0cf8..651bc72dc35 100644
--- a/model_zoo/official/nlp/emotect/requirements.txt
+++ b/model_zoo/official/nlp/emotect/requirements.txt
@@ -1,4 +1,4 @@
 easydict
 six
 numpy
-paddleocr
+paddlepaddle
\ No newline at end of file
diff --git a/model_zoo/official/nlp/emotect/scripts/export.sh b/model_zoo/official/nlp/emotect/scripts/export.sh
index 1d5b0d9ddd1..94e0ac2b613 100644
--- a/model_zoo/official/nlp/emotect/scripts/export.sh
+++ b/model_zoo/official/nlp/emotect/scripts/export.sh
@@ -17,7 +17,7 @@ CUR_DIR=`pwd`
 SAVE_PATH=${CUR_DIR}/save_models
 EXPORT_PATH=${SAVE_PATH}
 python ${CUR_DIR}/export.py --device_id=0 \
-        --batch_size=32  \
+        --batch_size=1  \
         --number_labels=3  \
         --ckpt_file="${SAVE_PATH}/classifier-3_302.ckpt"  \
         --file_name="${EXPORT_PATH}/emotect.mindir"  \
diff --git a/model_zoo/official/nlp/emotect/scripts/run_classifier_eval_ascend.sh b/model_zoo/official/nlp/emotect/scripts/run_classifier_eval_ascend.sh
index 4a69d2b010f..67d2711e6f6 100755
--- a/model_zoo/official/nlp/emotect/scripts/run_classifier_eval_ascend.sh
+++ b/model_zoo/official/nlp/emotect/scripts/run_classifier_eval_ascend.sh
@@ -28,6 +28,6 @@ python ${CUR_DIR}/run_ernie_classifier.py  \
     --train_data_shuffle="true" \
     --eval_data_shuffle="false" \
     --eval_batch_size=32 \
-    --load_finetune_checkpoint_path="${SAVE_PATH}/classifier-3_302.ckpt" \
+    --load_finetune_checkpoint_path="${SAVE_PATH}/classifier-3_301.ckpt" \
     --eval_data_file_path="${DATA_PATH}/test.mindrecord" \
     --schema_file_path="" > ${GLOG_log_dir}/eval_classifier_log.txt 2>&1 &
diff --git a/model_zoo/official/nlp/fasttext/README.md b/model_zoo/official/nlp/fasttext/README.md
index ab109a88178..479f762dc29 100644
--- a/model_zoo/official/nlp/fasttext/README.md
+++ b/model_zoo/official/nlp/fasttext/README.md
@@ -68,10 +68,10 @@ After dataset preparation, you can start training and evaluation as follows:
     ```bash
     # run training example
     cd ./scripts
-    bash run_standalone_train.sh [TRAIN_DATASET] [DEVICEID]
+    bash run_standalone_train.sh [TRAIN_DATASET] [DEVICEID] [DATANAME]
 
     # run distributed training example
-    bash run_distribute_train.sh [TRAIN_DATASET] [RANK_TABLE_PATH]
+    bash run_distribute_train.sh [TRAIN_DATASET] [RANK_TABLE_PATH] [DATANAME]
 
     # run evaluation example
     bash run_eval.sh [EVAL_DATASET_PATH] [DATASET_NAME] [MODEL_CKPT] [DEVICEID]
@@ -219,14 +219,14 @@ Parameters for both training and evaluation can be set in config.py. All the dat
 
         ```bash
         cd ./scripts
-        bash run_standalone_train.sh [DATASET_PATH]
+        bash run_standalone_train.sh [DATASET_PATH] [DEVICE_ID] [DATANAME]
         ```
 
     - Running scripts for distributed training of FastText. Task training on multiple device and run the following command in bash to be executed in `scripts/`:
 
         ```bash
         cd ./scripts
-        bash run_distributed_train.sh [DATASET_PATH] [RANK_TABLE_PATH]
+        bash run_distributed_train.sh [DATASET_PATH] [RANK_TABLE_PATH] [DATANAME]
         ```
 
 - Running on GPU
diff --git a/model_zoo/official/nlp/fasttext/scripts/run_distribute_train_8p.sh b/model_zoo/official/nlp/fasttext/scripts/run_distribute_train_8p.sh
index a8a44296f06..ea0d2183db7 100644
--- a/model_zoo/official/nlp/fasttext/scripts/run_distribute_train_8p.sh
+++ b/model_zoo/official/nlp/fasttext/scripts/run_distribute_train_8p.sh
@@ -17,8 +17,9 @@
 echo "=============================================================================================================="
 echo "Please run the script as: "
 echo "sh run_distributed_train.sh DATASET_PATH RANK_TABLE_PATH"
-echo "for example: sh run_distributed_train.sh /home/workspace/ag /home/workspace/rank_table_file.json"
+echo "for example: sh run_distributed_train.sh /home/workspace/ag /home/workspace/rank_table_file.json ag"
 echo "It is better to use absolute path."
+echo "Please pay attention that the dataset should corresponds to dataset_name"
 echo "=============================================================================================================="
 get_real_path(){
   if [ "${1:0:1}" == "/" ]; then
@@ -28,11 +29,15 @@ get_real_path(){
   fi
 }
 
+if [ $3 != "ag" ] && [ $3 != "dbpedia" ] && [ $3 != "yelp_p" ]
+then
+  echo "Unrecognized dataset name, the name can choose from [ag, dbpedia, yelp_p]"
+exit 1
+fi
+
 DATASET=$(get_real_path $1)
 echo $DATASET
-DATANAME=$(basename $DATASET)
 RANK_TABLE_PATH=$(get_real_path $2)
-echo $DATANAME
 if [ ! -d $DATASET ]
 then
   echo "Error: DATA_PATH=$DATASET is not a file"
@@ -48,6 +53,19 @@ echo $RANK_TABLE_FILE
 export RANK_SIZE=8
 export DEVICE_NUM=8
 
+if [ $# -ge 1 ]; then
+  if [ $3 == 'ag' ]; then
+    DATANAME='ag'
+  elif [ $3 == 'dbpedia' ]; then
+    DATANAME='dbpedia'
+  elif [ $3 == 'yelp_p' ]; then
+    DATANAME='yelp_p'
+  else
+    echo "Unrecognized dataset name,he name can choose from [ag, dbpedia, yelp_p]"
+    exit 1
+  fi
+fi
+
 config_path="./${DATANAME}_config.yaml"
 echo "config path is : ${config_path}"
 
diff --git a/model_zoo/official/nlp/fasttext/scripts/run_standalone_train.sh b/model_zoo/official/nlp/fasttext/scripts/run_standalone_train.sh
index 9b3ef21733c..908d5453719 100644
--- a/model_zoo/official/nlp/fasttext/scripts/run_standalone_train.sh
+++ b/model_zoo/official/nlp/fasttext/scripts/run_standalone_train.sh
@@ -16,9 +16,21 @@
 echo "=============================================================================================================="
 echo "Please run the script as: "
 echo "sh run_standalone_train.sh DATASET_PATH"
-echo "for example: sh run_standalone_train.sh /home/workspace/ag"
+echo "for example: sh run_standalone_train.sh /home/workspace/ag 0 ag"
 echo "It is better to use absolute path."
+echo "Please pay attention that the dataset should corresponds to dataset_name"
 echo "=============================================================================================================="
+if [[ $# -lt 3 ]]; then
+  echo "Usage: bash run_standalone_train.sh [DATA_PATH] [DEVICE_ID] [DATANAME]
+  DATANAME can choose from [ag, dbpedia, yelp_p]"
+exit 1
+fi
+
+if [ $3 != "ag" ] && [ $3 != "dbpedia" ] && [ $3 != "yelp_p" ]
+then
+  echo "Unrecognized dataset name, the name can choose from [ag, dbpedia, yelp_p]"
+exit 1
+fi
 
 get_real_path(){
   if [ "${1:0:1}" == "/" ]; then
@@ -29,10 +41,20 @@ get_real_path(){
 }
 
 DATASET=$(get_real_path $1)
-echo $DATASET
 DATANAME=$(basename $DATASET)
-echo $DATANAME
 DEVICEID=$2
+if [ $# -ge 1 ]; then
+  if [ $3 == 'ag' ]; then
+    DATANAME='ag'
+  elif [ $3 == 'dbpedia' ]; then
+    DATANAME='dbpedia'
+  elif [ $3 == 'yelp_p' ]; then
+    DATANAME='yelp_p'
+  else
+    echo "Unrecognized dataset name"
+    exit 1
+  fi
+fi
 
 config_path="./${DATANAME}_config.yaml"
 echo "config path is : ${config_path}"
diff --git a/model_zoo/official/nlp/fasttext/src/fasttext_train.py b/model_zoo/official/nlp/fasttext/src/fasttext_train.py
index cddd78227f0..0bfaeb792d1 100644
--- a/model_zoo/official/nlp/fasttext/src/fasttext_train.py
+++ b/model_zoo/official/nlp/fasttext/src/fasttext_train.py
@@ -137,6 +137,4 @@ class FastTextTrainOneStepCell(nn.Cell):
         if self.reducer_flag:
             # apply grad reducer on grads
             grads = self.grad_reducer(grads)
-
-        self.optimizer(grads)
-        return loss
+        return F.depend(loss, self.optimizer(grads))
diff --git a/model_zoo/official/nlp/gnmt_v2/README.md b/model_zoo/official/nlp/gnmt_v2/README.md
index 9a907b79d7e..d273ff3d237 100644
--- a/model_zoo/official/nlp/gnmt_v2/README.md
+++ b/model_zoo/official/nlp/gnmt_v2/README.md
@@ -91,6 +91,23 @@ After dataset preparation, you can start training and evaluation as follows:
       VOCAB_ADDR BPE_CODE_ADDR TEST_TARGET
     ```
 
+- running on GPU
+
+    ```bash
+    # run training example
+    cd ./scripts
+    bash run_standalone_train_gpu.sh PRE_TRAIN_DATASET DEVICE_ID
+
+    # run distributed training example
+    cd ./scripts
+    bash run_distributed_train_gpu.sh PRE_TRAIN_DATASET
+
+    # run evaluation example
+    cd ./scripts
+    bash run_standalone_eval_gpu.sh TEST_DATASET EXISTED_CKPT_PATH \
+      VOCAB_ADDR BPE_CODE_ADDR TEST_TARGET DEVICE_ID
+    ```
+
 - ModelArts (If you want to run in modelarts, please check the official documentation of [modelarts](https://support.huaweicloud.com/modelarts/), and you can start training as follows)
 
     ```bash
@@ -206,10 +223,15 @@ The GNMT network script and code result are as follows:
   │      ├──optimizer.py                     // Optimizer.
   ├── scripts
   │   ├──run_distributed_train_ascend.sh     // Shell script for distributed train on ascend.
+  │   ├──run_distributed_train_gpu.sh        // Shell script for distributed train on GPU.  
   │   ├──run_standalone_eval_ascend.sh       // Shell script for standalone eval on ascend.
+  │   ├──run_standalone_eval_gpu.sh          // Shell script for standalone eval on GPU.
   │   ├──run_standalone_train_ascend.sh      // Shell script for standalone eval on ascend.
-  ├── default_config.yaml                    // Configurations for train
-  ├── default_test_config.yaml               // Configurations for eval
+  │   ├──run_standalone_train_gpu.sh         // Shell script for standalone eval on GPU.
+  ├── default_config.yaml                    // Configurations for train on ascend.
+  ├── default_config_gpu.yaml                // Configurations for train on GPU.
+  ├── default_test_config.yaml               // Configurations for eval on ascend.
+  ├── default_test_config_gpu.yaml           // Configurations for eval on GPU.
   ├── create_dataset.py                      // Dataset preparation.
   ├── eval.py                                // Infer API entry.
   ├── export.py                              // Export checkpoint file into air models.
@@ -262,49 +284,96 @@ For more configuration details, please refer the script `./default_config.yaml`
 
 ## Training Process
 
-For a pre-trained model, configure the following options in the `./default_config.yaml` file:
+- running on Ascend
 
-- Select an optimizer ('momentum/adam/lamb' is available).
-- Specify `ckpt_prefix` and `ckpt_path` in `checkpoint_path` to save the model file.
-- Set other parameters, including dataset configuration and network configuration.
-- If a pre-trained model exists, assign `existed_ckpt` to the path of the existing model during fine-tuning.
+    For a pre-trained model, configure the following options in the `./default_config.yaml` file:
 
-Start task training on a single device and run the shell script `scripts/run_standalone_train_ascend.sh`:
+    - Select an optimizer ('momentum/adam/lamb' is available).
+    - Specify `ckpt_prefix` and `ckpt_path` in `checkpoint_path` to save the model file.
+    - Set other parameters, including dataset configuration and network configuration.
+    - If a pre-trained model exists, assign `existed_ckpt` to the path of the existing model during fine-tuning.
 
-```bash
-cd ./scripts
-bash run_standalone_train_ascend.sh PRE_TRAIN_DATASET
-```
+    Start task training on a single device and run the shell script `scripts/run_standalone_train_ascend.sh`:
 
-In this script, the `PRE_TRAIN_DATASET` is the dataset address.
+    ```bash
+    cd ./scripts
+    bash run_standalone_train_ascend.sh PRE_TRAIN_DATASET
+    ```
 
-Run `scripts/run_distributed_train_ascend.sh` for distributed training of GNMTv2 model.
-Task training on multiple devices and run the following command in bash to be executed in `scripts/`.:
+    In this script, the `PRE_TRAIN_DATASET` is the dataset address.
 
-```bash
-cd ./scripts
-bash run_distributed_train_ascend.sh RANK_TABLE_ADDR PRE_TRAIN_DATASET
-```
+    Run `scripts/run_distributed_train_ascend.sh` for distributed training of GNMTv2 model.
+    Task training on multiple devices and run the following command in bash to be executed in `scripts/`.:
 
-Note: the `RANK_TABLE_ADDR` is the hccl_json file assigned when distributed training is running.
-Currently, inconsecutive device IDs are not supported in `scripts/run_distributed_train_ascend.sh`. The device ID must start from 0 in the `RANK_TABLE_ADDR` file.
+    ```bash
+    cd ./scripts
+    bash run_distributed_train_ascend.sh RANK_TABLE_ADDR PRE_TRAIN_DATASET
+    ```
+
+    Note: the `RANK_TABLE_ADDR` is the hccl_json file assigned when distributed training is running.
+    Currently, inconsecutive device IDs are not supported in `scripts/run_distributed_train_ascend.sh`. The device ID must start from 0 in the `RANK_TABLE_ADDR` file.
+
+- running on GPU
+
+    For a pre-trained model, configure the following options in the `./default_config_gpu.yaml` file:
+
+    - Select an optimizer ('momentum/adam/lamb' is available).
+    - Specify `ckpt_prefix` and `ckpt_path` in `checkpoint_path` to save the model file.
+    - Set other parameters, including dataset configuration and network configuration.
+    - If a pre-trained model exists, assign `existed_ckpt` to the path of the existing model during fine-tuning.
+
+    Start task training on a single device and run the shell script `scripts/run_standalone_train_gpu.sh`:
+
+    ```bash
+    cd ./scripts
+    bash run_standalone_train_gpu.sh PRE_TRAIN_DATASET DEVICE_ID
+    ```
+
+    In this script, the `PRE_TRAIN_DATASET` is the dataset address.
+
+    Run `scripts/run_distributed_train_gpu.sh` for distributed training of GNMTv2 model.
+    Task training on multiple devices and run the following command in bash to be executed in `scripts/`.:
+
+    ```bash
+    cd ./scripts
+    bash run_distributed_train_ascend.sh PRE_TRAIN_DATASET
+    ```
+
+    Currently, inconsecutive device IDs are not supported in `scripts/run_distributed_train_gpu.sh`. The device ID must start from 0 to 7.
 
 ## Inference Process
 
-For inference using a trained model on multiple hardware platforms, such as Ascend 910.
-Set options in `./default_config.yaml`.
+- running on Ascend
 
-Run the shell script `scripts/run_standalone_eval_ascend.sh` to process the output token ids to get the BLEU scores.
+    For inference using a trained model on multiple hardware platforms, such as Ascend 910.
+    Set options in `./default_test_config.yaml`.
 
-```bash
-cd ./scripts
-bash run_standalone_eval_ascend.sh
-bash run_standalone_eval_ascend.sh TEST_DATASET EXISTED_CKPT_PATH \
-  VOCAB_ADDR BPE_CODE_ADDR TEST_TARGET
-```
+    Run the shell script `scripts/run_standalone_eval_ascend.sh` to process the output token ids to get the BLEU scores.
 
-The `TEST_DATASET` is the address of inference dataset, and `EXISTED_CKPT_PATH` is the path of the model file generated during training process.
-The `VOCAB_ADDR` is the vocabulary address, `BPE_CODE_ADDR` is the bpe code address and the `TEST_TARGET` are the path of answers.
+    ```bash
+    cd ./scripts
+    bash run_standalone_eval_ascend.sh TEST_DATASET EXISTED_CKPT_PATH \
+      VOCAB_ADDR BPE_CODE_ADDR TEST_TARGET
+    ```
+
+    The `TEST_DATASET` is the address of inference dataset, and `EXISTED_CKPT_PATH` is the path of the model file generated during training process.
+    The `VOCAB_ADDR` is the vocabulary address, `BPE_CODE_ADDR` is the bpe code address and the `TEST_TARGET` are the path of answers.
+
+- running on GPU
+
+    For inference using a trained model on GPU.
+    Set options in `./default_test_config_gpu.yaml`.
+
+    Run the shell script `scripts/run_standalone_eval_gpu.sh` to process the output token ids to get the BLEU scores.
+
+    ```bash
+    cd ./scripts
+    bash run_standalone_eval_gpu.sh TEST_DATASET EXISTED_CKPT_PATH \
+      VOCAB_ADDR BPE_CODE_ADDR TEST_TARGET DEVICE_ID
+    ```
+
+    The `TEST_DATASET` is the address of inference dataset, and `EXISTED_CKPT_PATH` is the path of the model file generated during training process.
+    The `VOCAB_ADDR` is the vocabulary address, `BPE_CODE_ADDR` is the bpe code address and the `TEST_TARGET` are the path of answers.
 
 # [Model Description](#contents)
 
@@ -312,36 +381,36 @@ The `VOCAB_ADDR` is the vocabulary address, `BPE_CODE_ADDR` is the bpe code addr
 
 ### Training Performance
 
-| Parameters                 | Ascend                                                         |
-| -------------------------- | -------------------------------------------------------------- |
-| Resource                   | Ascend 910; OS Euler2.8                                                      |
-| uploaded Date              | 11/06/2020 (month/day/year)                                    |
-| MindSpore Version          | 1.0.0                                                          |
-| Dataset                    | WMT English-German for training                                |
-| Training Parameters        | epoch=6, batch_size=128                                        |
-| Optimizer                  | Adam                                                           |
-| Loss Function              | Softmax Cross Entropy                                          |
-| outputs                    | probability                                                    |
-| Speed                      | 344ms/step (8pcs)                                              |
-| Total Time                 | 7800s (8pcs)                                                   |
-| Loss                       | 63.35                                                          |
-| Params (M)                 | 613                                                            |
-| Checkpoint for inference   | 1.8G (.ckpt file)                                              |
-| Scripts                    | [gnmt_v2](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/nlp/gnmt_v2) |
+| Parameters                 | Ascend                                                         |GPU                                                         |
+| -------------------------- | -------------------------------------------------------------- | -------------------------------------------------------------- |
+| Resource                   | Ascend 910; OS Euler2.8                                                      | NV SMX2 V100-32G                                                      |
+| uploaded Date              | 11/06/2020 (month/day/year)                                    | 08/05/2021 (month/day/year)                                    |
+| MindSpore Version          | 1.0.0                                                          | 1.3.0                                                          |
+| Dataset                    | WMT English-German for training                                | WMT English-German for training                                |
+| Training Parameters        | epoch=6, batch_size=128                                        | epoch=8, batch_size=128                                        |
+| Optimizer                  | Adam                                                           | Adam                                                           |
+| Loss Function              | Softmax Cross Entropy                                          | Softmax Cross Entropy                                          |
+| outputs                    | probability                                                    | probability                                                    |
+| Speed                      | 344ms/step (8pcs)                                              | 620 ms/step (1pcs)                                              |
+| Total Time                 | 7800s (8pcs)                                                   | 17079s (1pcs)                                                   |
+| Loss                       | 63.35                                                          | 55.42                                                         |
+| Params (M)                 | 613                                                            | 613                                                           |
+| Checkpoint for inference   | 1.8G (.ckpt file)                                              | 1.8G (.ckpt file)                                              |
+| Scripts                    | [gnmt_v2](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/nlp/gnmt_v2) | [gnmt_v2](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/nlp/gnmt_v2) |
 
 ### Inference Performance
 
-| Parameters          | Ascend                      |
-| ------------------- | --------------------------- |
-| Resource            | Ascend 910; OS Euler2.8                   |
-| Uploaded Date       | 11/06/2020 (month/day/year) |
-| MindSpore Version   | 1.0.0                       |
-| Dataset             | WMT newstest2014            |
-| batch_size          | 128                         |
-| Total Time          | 1560s                       |
-| outputs             | probability                 |
-| Accuracy            | BLEU Score= 24.05           |
-| Model for inference | 1.8G (.ckpt file)           |
+| Parameters          | Ascend                      | GPU                      |
+| ------------------- | --------------------------- | --------------------------- |
+| Resource            | Ascend 910; OS Euler2.8                   | NV SMX2 V100-32G                   |
+| Uploaded Date       | 11/06/2020 (month/day/year) | 08/05/2021 (month/day/year) |
+| MindSpore Version   | 1.0.0                       | 1.3.0                       |
+| Dataset             | WMT newstest2014            | WMT newstest2014            |
+| batch_size          | 128                         | 128                         |
+| Total Time          | 1560s                       | 180s                       |
+| outputs             | probability                 | probability                 |
+| Accuracy            | BLEU Score= 24.05           | BLEU Score= 24.4           |
+| Model for inference | 1.8G (.ckpt file)           | 1.8G (.ckpt file)           |
 
 # [Random Situation Description](#contents)
 
diff --git a/model_zoo/official/nlp/gnmt_v2/default_config.yaml b/model_zoo/official/nlp/gnmt_v2/default_config.yaml
index f4b765f34fb..6fdffe7820e 100644
--- a/model_zoo/official/nlp/gnmt_v2/default_config.yaml
+++ b/model_zoo/official/nlp/gnmt_v2/default_config.yaml
@@ -9,6 +9,7 @@ data_path: "/cache/data"
 output_path: "/cache/train"
 load_path: "/cache/checkpoint_path"
 device_target: "Ascend"
+device_id: 0
 need_modelarts_dataset_unzip: False
 modelarts_dataset_unzip_name: ""
 
diff --git a/model_zoo/official/nlp/gnmt_v2/default_test_config.yaml b/model_zoo/official/nlp/gnmt_v2/default_test_config.yaml
index 7cbce1405d5..8dffef4204a 100644
--- a/model_zoo/official/nlp/gnmt_v2/default_test_config.yaml
+++ b/model_zoo/official/nlp/gnmt_v2/default_test_config.yaml
@@ -9,6 +9,7 @@ data_path: "/cache/data"
 output_path: "/cache/train"
 load_path: "/cache/checkpoint_path"
 device_target: "Ascend"
+device_id: 0
 need_modelarts_dataset_unzip: False
 modelarts_dataset_unzip_name: ""
 
diff --git a/model_zoo/official/nlp/gnmt_v2/eval.py b/model_zoo/official/nlp/gnmt_v2/eval.py
index 62b3cdb83cd..fb07277fd54 100644
--- a/model_zoo/official/nlp/gnmt_v2/eval.py
+++ b/model_zoo/official/nlp/gnmt_v2/eval.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2020-2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 import pickle
 import os
 import time
+from mindspore import context
 
 from src.gnmt_model import infer
 from src.gnmt_model.bleu_calculate import bleu_calculate
@@ -83,6 +84,12 @@ def run_eval():
     '''run eval.'''
     _config = get_config(default_config)
     result = infer(_config)
+    context.set_context(
+        mode=context.GRAPH_MODE,
+        save_graphs=False,
+        device_target=_config.device_target,
+        device_id=_config.device_id,
+        reserve_class_name_in_scope=False)
 
     with open(_config.output, "wb") as f:
         pickle.dump(result, f, 1)
diff --git a/model_zoo/official/nlp/gnmt_v2/scripts/run_distributed_train_ascend.sh b/model_zoo/official/nlp/gnmt_v2/scripts/run_distributed_train_ascend.sh
index cf5c9efda40..66b1baafbe4 100644
--- a/model_zoo/official/nlp/gnmt_v2/scripts/run_distributed_train_ascend.sh
+++ b/model_zoo/official/nlp/gnmt_v2/scripts/run_distributed_train_ascend.sh
@@ -47,12 +47,12 @@ do
     cp -r ../../src .
     cp -r ../../model_utils .
     export RANK_ID=$i
-    export DEVICE_ID=$i
     config_path="${current_exec_path}/device${i}/default_config.yaml"
     echo "config path is : ${config_path}"
-  python ../../train.py \
-    --config_path=$config_path \
-    --pre_train_dataset=$PRE_TRAIN_DATASET > log_gnmt_network${i}.log 2>&1 &
-    cd ${current_exec_path} || exit
+    python ../../train.py \
+      --config_path=$config_path \
+      --pre_train_dataset=$PRE_TRAIN_DATASET \
+      --device_id=$i > log_gnmt_network${i}.log 2>&1 &
+      cd ${current_exec_path} || exit
 done
 cd ${current_exec_path} || exit
diff --git a/model_zoo/official/nlp/gnmt_v2/src/gnmt_model/dynamic_rnn.py b/model_zoo/official/nlp/gnmt_v2/src/gnmt_model/dynamic_rnn.py
index 4f7daaa3554..5d512fe909a 100644
--- a/model_zoo/official/nlp/gnmt_v2/src/gnmt_model/dynamic_rnn.py
+++ b/model_zoo/official/nlp/gnmt_v2/src/gnmt_model/dynamic_rnn.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2020-2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@ import numpy as np
 import mindspore.ops.operations as P
 import mindspore.common.dtype as mstype
 import mindspore.nn as nn
+from mindspore import context
 from mindspore.common.parameter import Parameter
 from mindspore.common.tensor import Tensor
 
@@ -41,7 +42,6 @@ class DynamicRNNCell(nn.Cell):
                  hidden_size=1024,
                  initializer_range=0.1):
         super(DynamicRNNCell, self).__init__()
-        self.rnn = P.DynamicRNN()
         self.num_step = num_setp
         self.batch_size = batch_size
         self.input_size = word_embed_dim
@@ -57,15 +57,32 @@ class DynamicRNNCell(nn.Cell):
         self.dynamicRNN_h = Tensor(np.zeros((1, self.batch_size, self.hidden_size)), mstype.float32)
         self.dynamicRNN_c = Tensor(np.zeros((1, self.batch_size, self.hidden_size)), mstype.float32)
         self.cast = P.Cast()
+        self.is_ascend = context.get_context("device_target") == "Ascend"
+        if self.is_ascend:
+            self.compute_type = mstype.float16
+            self.rnn = P.DynamicRNN()
+        else:
+            self.compute_type = mstype.float32
+            self.lstm = nn.LSTM(self.input_size,
+                                self.hidden_size,
+                                num_layers=1,
+                                has_bias=True,
+                                batch_first=False,
+                                dropout=0,
+                                bidirectional=False)
 
     def construct(self, x, init_h=None, init_c=None):
-        w = self.cast(self.dynamicRNN_w, mstype.float16)
-        b = self.cast(self.dynamicRNN_b, mstype.float16)
+        """DynamicRNNCell Network."""
         if init_h is None or init_c is None:
-            init_h = self.cast(self.dynamicRNN_h, mstype.float16)
-            init_c = self.cast(self.dynamicRNN_c, mstype.float16)
-        out = self.rnn(x, w, b, None, init_h, init_c)
-        return out[0], out[1], out[2]
+            init_h = self.cast(self.dynamicRNN_h, self.compute_type)
+            init_c = self.cast(self.dynamicRNN_c, self.compute_type)
+        if self.is_ascend:
+            w = self.cast(self.dynamicRNN_w, self.compute_type)
+            b = self.cast(self.dynamicRNN_b, self.compute_type)
+            output, hn, cn = self.rnn(x, w, b, None, init_h, init_c)
+        else:
+            output, (hn, cn) = self.lstm(x, (init_h, init_c))
+        return output, hn, cn
 
 
 class DynamicRNNNet(nn.Cell):
@@ -94,13 +111,18 @@ class DynamicRNNNet(nn.Cell):
                                   batch_size=batchsize,
                                   word_embed_dim=word_embed_dim,
                                   hidden_size=hidden_size)
+        self.is_ascend = context.get_context("device_target") == "Ascend"
+        if self.is_ascend:
+            self.compute_type = mstype.float16
+        else:
+            self.compute_type = mstype.float32
 
     def construct(self, inputs, init_state=None):
         """DynamicRNN Network."""
-        inputs = self.cast(inputs, mstype.float16)
+        inputs = self.cast(inputs, self.compute_type)
         if init_state is not None:
-            init_h = self.cast(init_state[0:1, :, :], mstype.float16)
-            init_c = self.cast(init_state[-1:, :, :], mstype.float16)
+            init_h = self.cast(init_state[0:1, :, :], self.compute_type)
+            init_c = self.cast(init_state[-1:, :, :], self.compute_type)
             out, state_h, state_c = self.net(inputs, init_h, init_c)
         else:
             out, state_h, state_c = self.net(inputs)
diff --git a/model_zoo/official/nlp/gnmt_v2/src/gnmt_model/gnmt_for_infer.py b/model_zoo/official/nlp/gnmt_v2/src/gnmt_model/gnmt_for_infer.py
index 902df8eba4c..28207d55779 100644
--- a/model_zoo/official/nlp/gnmt_v2/src/gnmt_model/gnmt_for_infer.py
+++ b/model_zoo/official/nlp/gnmt_v2/src/gnmt_model/gnmt_for_infer.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2020-2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,14 +14,13 @@
 # ============================================================================
 """Infer api."""
 import time
-
 import numpy as np
 
 import mindspore.nn as nn
 import mindspore.common.dtype as mstype
 from mindspore.common.tensor import Tensor
 from mindspore.ops import operations as P
-from mindspore import context, Parameter
+from mindspore import Parameter
 from mindspore.train.model import Model
 
 from src.dataset import load_dataset
@@ -29,13 +28,6 @@ from .gnmt import GNMT
 from ..utils import zero_weight
 from ..utils.load_weights import load_infer_weights
 
-context.set_context(
-    mode=context.GRAPH_MODE,
-    save_graphs=False,
-    device_target="Ascend",
-    reserve_class_name_in_scope=False)
-
-
 class GNMTInferCell(nn.Cell):
     """
     Encapsulation class of GNMT network infer.
diff --git a/model_zoo/official/nlp/gnmt_v2/train.py b/model_zoo/official/nlp/gnmt_v2/train.py
index 0cde9ed5c2d..0437f26a0ce 100644
--- a/model_zoo/official/nlp/gnmt_v2/train.py
+++ b/model_zoo/official/nlp/gnmt_v2/train.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2020-2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -26,7 +26,7 @@ from mindspore.train.loss_scale_manager import DynamicLossScaleManager
 from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, SummaryCollector, TimeMonitor
 from mindspore import context, Parameter
 from mindspore.context import ParallelMode
-from mindspore.communication import management as MultiAscend
+from mindspore.communication import management as MultiDevice
 from mindspore.train.serialization import load_checkpoint
 from mindspore.common import set_seed
 
@@ -63,7 +63,7 @@ def _train(model, config,
         epoch_size = pre_training_dataset.get_repeat_count()
         print("epoch size ", epoch_size)
         if os.getenv("RANK_SIZE") is not None and int(os.getenv("RANK_SIZE")) > 1:
-            print(f" | Rank {MultiAscend.get_rank()} Call model train.")
+            print(f" | Rank {MultiDevice.get_rank()} Call model train.")
         model.train(config.epochs, pre_training_dataset,
                     callbacks=callbacks, dataset_sink_mode=config.dataset_sink_mode)
 
@@ -203,10 +203,10 @@ def _build_training_pipeline(config,
 
     rank_size = os.getenv('RANK_SIZE')
     callbacks = [time_cb, loss_monitor]
-    if rank_size is not None and int(rank_size) > 1 and MultiAscend.get_rank() % 8 == 0:
+    if rank_size is not None and int(rank_size) > 1 and MultiDevice.get_rank() % 8 == 0:
         ckpt_callback = ModelCheckpoint(
             prefix=config.ckpt_prefix,
-            directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
+            directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(MultiDevice.get_rank())),
             config=ckpt_config)
         callbacks.append(ckpt_callback)
         summary_callback = SummaryCollector(summary_dir="./summary", collect_freq=50)
@@ -215,7 +215,7 @@ def _build_training_pipeline(config,
     if rank_size is None or int(rank_size) == 1:
         ckpt_callback = ModelCheckpoint(
             prefix=config.ckpt_prefix,
-            directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
+            directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(config.device_id)),
             config=ckpt_config)
         callbacks.append(ckpt_callback)
         summary_callback = SummaryCollector(summary_dir="./summary", collect_freq=50)
@@ -231,10 +231,10 @@ def _build_training_pipeline(config,
 
 def _setup_parallel_env():
     context.reset_auto_parallel_context()
-    MultiAscend.init()
+    MultiDevice.init()
     context.set_auto_parallel_context(
         parallel_mode=ParallelMode.DATA_PARALLEL,
-        device_num=MultiAscend.get_group_size(),
+        device_num=MultiDevice.get_group_size(),
         gradients_mean=True
     )
 
@@ -253,22 +253,22 @@ def train_parallel(config):
         data_files=config.pre_train_dataset,
         batch_size=config.batch_size,
         sink_mode=config.dataset_sink_mode,
-        rank_size=MultiAscend.get_group_size(),
-        rank_id=MultiAscend.get_rank()
+        rank_size=MultiDevice.get_group_size(),
+        rank_id=MultiDevice.get_rank()
     ) if config.pre_train_dataset else None
     fine_tune_dataset = load_dataset(
         data_files=config.fine_tune_dataset,
         batch_size=config.batch_size,
         sink_mode=config.dataset_sink_mode,
-        rank_size=MultiAscend.get_group_size(),
-        rank_id=MultiAscend.get_rank()
+        rank_size=MultiDevice.get_group_size(),
+        rank_id=MultiDevice.get_rank()
     ) if config.fine_tune_dataset else None
     test_dataset = load_dataset(
         data_files=config.test_dataset,
         batch_size=config.batch_size,
         sink_mode=config.dataset_sink_mode,
-        rank_size=MultiAscend.get_group_size(),
-        rank_id=MultiAscend.get_rank()
+        rank_size=MultiDevice.get_group_size(),
+        rank_id=MultiDevice.get_rank()
     ) if config.test_dataset else None
 
     _build_training_pipeline(config=config,
@@ -359,17 +359,12 @@ def modelarts_pre_process():
 @moxing_wrapper(pre_process=modelarts_pre_process)
 def run_train():
     '''run train.'''
-    device_id = os.getenv('DEVICE_ID', None)
-    if device_id is None:
-        raise RuntimeError("`DEVICE_ID` can not be None.")
-
-    device_id = int(device_id)
-    context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend",
-                        reserve_class_name_in_scope=True, device_id=device_id)
-    _rank_size = os.getenv('RANK_SIZE')
-
     _config = get_config(default_config)
     _config.pre_train_dataset = default_config.pre_train_dataset
+
+    context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target=_config.device_target,
+                        reserve_class_name_in_scope=True, device_id=_config.device_id)
+    _rank_size = os.getenv('RANK_SIZE')
     set_seed(_config.random_seed)
     if _rank_size is not None and int(_rank_size) > 1:
         train_parallel(_config)
diff --git a/model_zoo/official/nlp/gru/requirements.txt b/model_zoo/official/nlp/gru/requirements.txt
deleted file mode 100644
index 0ba33e85625..00000000000
--- a/model_zoo/official/nlp/gru/requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-nltk
-numpy
-pyyaml
diff --git a/model_zoo/official/nlp/gru/scripts/run_eval.sh b/model_zoo/official/nlp/gru/scripts/run_eval.sh
deleted file mode 100644
index a4f8869175c..00000000000
--- a/model_zoo/official/nlp/gru/scripts/run_eval.sh
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/bin/bash
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-if [ $# -ne 2 ]
-then
-    echo "Usage: sh run_eval.sh [CKPT_FILE] [DATASET_PATH]"
-exit 1
-fi
-ulimit -u unlimited
-export DEVICE_NUM=1
-export DEVICE_ID=0
-export RANK_ID=0
-export RANK_SIZE=1
-get_real_path(){
-  if [ "${1:0:1}" == "/" ]; then
-    echo "$1"
-  else
-    echo "$(realpath -m $PWD/$1)"
-  fi
-}
-
-CKPT_FILE=$(get_real_path $1)
-echo $CKPT_FILE
-if [ ! -f $CKPT_FILE ]
-then
-    echo "error: CKPT_FILE=$CKPT_FILE is not a file"
-exit 1
-fi
-
-DATASET_PATH=$(get_real_path $2)
-echo $DATASET_PATH
-if [ ! -f $DATASET_PATH ]
-then
-    echo "error: DATASET_PATH=$DATASET_PATH is not a file"
-exit 1
-fi
-rm -rf ./eval
-mkdir ./eval
-cp ../*.py ./eval
-cp ../*.yaml ./eval
-cp *.sh ./eval
-cp -r ../src ./eval
-cp -r ../model_utils ./eval
-cd ./eval || exit
-echo "start eval for device $DEVICE_ID"
-env > env.log
-python eval.py --ckpt_file=$CKPT_FILE --dataset_path=$DATASET_PATH &> log &
-cd ..
diff --git a/model_zoo/official/nlp/gru/scripts/run_standalone_train.sh b/model_zoo/official/nlp/gru/scripts/run_standalone_train.sh
deleted file mode 100644
index 66e7893f9f0..00000000000
--- a/model_zoo/official/nlp/gru/scripts/run_standalone_train.sh
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/bin/bash
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-if [ $# -ne 1 ]
-then
-    echo "Usage: sh run_distribute_train_ascend.sh [DATASET_PATH]"
-exit 1
-fi
-ulimit -u unlimited
-export DEVICE_NUM=1
-export DEVICE_ID=4
-export RANK_ID=0
-export RANK_SIZE=1
-get_real_path(){
-  if [ "${1:0:1}" == "/" ]; then
-    echo "$1"
-  else
-    echo "$(realpath -m $PWD/$1)"
-  fi
-}
-
-DATASET_PATH=$(get_real_path $1)
-echo $DATASET_PATH
-if [ ! -f $DATASET_PATH ]
-then
-    echo "error: DATASET_PATH=$DATASET_PATH is not a file"
-exit 1
-fi
-
-rm -rf ./train
-mkdir ./train
-cp ../*.py ./train
-cp ../*.yaml ./train
-cp *.sh ./train
-cp -r ../src ./train
-cp -r ../model_utils ./train
-cd ./train || exit
-echo "start training for device $DEVICE_ID"
-env > env.log
-python train.py --dataset_path=$DATASET_PATH &> log &
-cd ..
diff --git a/model_zoo/official/nlp/gru/src/gru.py b/model_zoo/official/nlp/gru/src/gru.py
deleted file mode 100644
index 08199c43ccc..00000000000
--- a/model_zoo/official/nlp/gru/src/gru.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""GRU cell"""
-import mindspore.nn as nn
-import mindspore.ops.operations as P
-import mindspore.common.dtype as mstype
-from src.weight_init import gru_default_state
-
-class BidirectionGRU(nn.Cell):
-    '''
-    BidirectionGRU model
-
-    Args:
-        config: config of network
-    '''
-    def __init__(self, config, is_training=True):
-        super(BidirectionGRU, self).__init__()
-        if is_training:
-            self.batch_size = config.batch_size
-        else:
-            self.batch_size = config.eval_batch_size
-        self.embedding_size = config.encoder_embedding_size
-        self.hidden_size = config.hidden_size
-        self.weight_i, self.weight_h, self.bias_i, self.bias_h, self.init_h = gru_default_state(self.batch_size,
-                                                                                                self.embedding_size,
-                                                                                                self.hidden_size)
-        self.weight_bw_i, self.weight_bw_h, self.bias_bw_i, self.bias_bw_h, self.init_bw_h = \
-            gru_default_state(self.batch_size, self.embedding_size, self.hidden_size)
-        self.reverse = P.ReverseV2(axis=[1])
-        self.concat = P.Concat(axis=2)
-        self.squeeze = P.Squeeze(axis=0)
-        self.rnn = P.DynamicGRUV2()
-        self.text_len = config.max_length
-        self.cast = P.Cast()
-
-    def construct(self, x):
-        '''
-        BidirectionGRU construction
-
-        Args:
-            x(Tensor): BidirectionGRU input
-
-        Returns:
-            output(Tensor): rnn output
-            hidden(Tensor): hidden state
-        '''
-        x = self.cast(x, mstype.float16)
-        y1, _, _, _, _, _ = self.rnn(x, self.weight_i, self.weight_h, self.bias_i, self.bias_h, None, self.init_h)
-        bw_x = self.reverse(x)
-        y1_bw, _, _, _, _, _ = self.rnn(bw_x, self.weight_bw_i,
-                                        self.weight_bw_h, self.bias_bw_i, self.bias_bw_h, None, self.init_bw_h)
-        y1_bw = self.reverse(y1_bw)
-        output = self.concat((y1, y1_bw))
-        hidden = self.concat((y1[self.text_len-1:self.text_len:1, ::, ::],
-                              y1_bw[self.text_len-1:self.text_len:1, ::, ::]))
-        hidden = self.squeeze(hidden)
-        return output, hidden
-
-class GRU(nn.Cell):
-    '''
-    GRU model
-
-    Args:
-        config: config of network
-    '''
-    def __init__(self, config, is_training=True):
-        super(GRU, self).__init__()
-        if is_training:
-            self.batch_size = config.batch_size
-        else:
-            self.batch_size = config.eval_batch_size
-        self.embedding_size = config.encoder_embedding_size
-        self.hidden_size = config.hidden_size
-        self.weight_i, self.weight_h, self.bias_i, self.bias_h, self.init_h = \
-            gru_default_state(self.batch_size, self.embedding_size + self.hidden_size*2, self.hidden_size)
-        self.rnn = P.DynamicGRUV2()
-        self.cast = P.Cast()
-
-    def construct(self, x):
-        '''
-        GRU construction
-
-        Args:
-            x(Tensor): GRU input
-
-        Returns:
-            output(Tensor): rnn output
-            hidden(Tensor): hidden state
-        '''
-        x = self.cast(x, mstype.float16)
-        y1, h1, _, _, _, _ = self.rnn(x, self.weight_i, self.weight_h, self.bias_i, self.bias_h, None, self.init_h)
-        return y1, h1
diff --git a/model_zoo/official/nlp/pangu_alpha/README.md b/model_zoo/official/nlp/pangu_alpha/README.md
index 3aa82373b85..b5018d2700e 100644
--- a/model_zoo/official/nlp/pangu_alpha/README.md
+++ b/model_zoo/official/nlp/pangu_alpha/README.md
@@ -179,12 +179,13 @@ https:gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools.
 The script will launch the GPU training through `mpirun`, the user can run the following command on any machine to start training.
 
 ```bash
-bash scripts/run_distributed_train_gpu.sh RANK_SIZE HOSTFILE DATASET MOD
+bash scripts/run_distributed_train_gpu.sh RANK_SIZE HOSTFILE DATASET PER_BATCH MOD
 ```
 
 - RANK_SIZE: The device number. This can be your total device numbers. For example, 8, 16, 32 ...
 - HOSTFILE:  It's a text file describes the host ip and its devices. Please see our [tutorial](https://www.mindspore.cn/docs/programming_guide/en/master/distributed_training_gpu.html) or [OpenMPI](https://www.open-mpi.org/) for more details.
 - DATASET: The path to the mindrecord files's parent directory . For example: `/home/work/mindrecord/`.
+- PER_BATCH: The batch size for each data parallel-way.
 - MODE: Can be `2.6B`, `13B` and `200B`.
 
 ### Incremental Training
diff --git a/model_zoo/official/nlp/pangu_alpha/scripts/run_distribute_train_gpu.sh b/model_zoo/official/nlp/pangu_alpha/scripts/run_distribute_train_gpu.sh
index 3fd78dd5d90..fe1dee3abfd 100644
--- a/model_zoo/official/nlp/pangu_alpha/scripts/run_distribute_train_gpu.sh
+++ b/model_zoo/official/nlp/pangu_alpha/scripts/run_distribute_train_gpu.sh
@@ -16,8 +16,8 @@
 
 echo "=============================================================================================================="
 echo "Please run the script as: "
-echo "bash run_distributed_train_gpu.sh RANK_SIZE HOSTFILE DATASET MODE"
-echo "for example: bash run_distributed_train_gpu.sh 16 hostfile_16p /mass_dataset/train_data/ 2.6B"
+echo "bash run_distributed_train_gpu.sh RANK_SIZE HOSTFILE DATASET PER_BATCH MODE"
+echo "for example: bash run_distributed_train_gpu.sh 16 hostfile_16p /mass_dataset/train_data/ 16 2.6B"
 echo "It is better to use absolute path."
 echo "=============================================================================================================="
 
@@ -26,7 +26,8 @@ self_path=$(dirname "${script_self}")
 RANK_SIZE=$1
 HOSTFILE=$2
 DATASET=$3
-MODE=$4
+PER_BATCH=$4
+MODE=$5
 
 mpirun --allow-run-as-root -x PATH -x LD_LIBRARY_PATH -x PYTHONPATH -x NCCL_DEBUG -x GLOG_v -n $RANK_SIZE --hostfile $HOSTFILE --output-filename log_output --merge-stderr-to-stdout \
     python -s ${self_path}/../train.py  \
@@ -35,4 +36,5 @@ mpirun --allow-run-as-root -x PATH -x LD_LIBRARY_PATH -x PYTHONPATH -x NCCL_DEBU
       --device_target="GPU"             \
       --data_url=$DATASET               \
       --mode=$MODE                      \
+      --per_batch_size=$PER_BATCH       \
       --run_type=train > train_log.txt 2>&1 &
diff --git a/model_zoo/official/nlp/pangu_alpha/train.py b/model_zoo/official/nlp/pangu_alpha/train.py
index e184260cc7a..989f648995e 100644
--- a/model_zoo/official/nlp/pangu_alpha/train.py
+++ b/model_zoo/official/nlp/pangu_alpha/train.py
@@ -100,8 +100,6 @@ def run_train(args_opt):
     # Set model property
     model_parallel_num = args_opt.op_level_model_parallel_num
     data_parallel_num = int(device_num / model_parallel_num)
-    if data_parallel_num <= 1 and args_opt.optimizer_shard == 1:
-        raise ValueError("The dp must large than 1 when applying optimizer shard.")
     batch_size = args_opt.per_batch_size * data_parallel_num
     config = PANGUALPHAConfig(
         data_parallel_num=data_parallel_num, model_parallel_num=model_parallel_num,
@@ -253,8 +251,19 @@ def run_train_pipeline(args_opt):
     pangu_alpha_with_grads = PanguAlphaTrainPipelineWithLossScaleCell(
         pangu_alpha_with_loss, optimizer=optimizer, config=config, scale_update_cell=update_cell)
     if args_opt.train_and_eval_mode:
-        raise ValueError("The pipeline train_and_eval_mode is not supported yet")
-    model = Model(pangu_alpha_with_grads)
+        ds_eval = create_dataset(config.batch_size // config.micro_size, data_path=eval_cache_url,
+                                 device_num=stage_device_num, rank=rank_id % stage_device_num, eod_reset=True,
+                                 data_start_index=0, full_batch=bool(args_opt.full_batch),
+                                 column_name=args_opt.data_column_name,
+                                 num_samples=args_opt.eval_steps * config.batch_size)
+        ppl_metric = PPLMetric(config.seq_length)
+        pangu_alpha_with_loss_eval_net = _VirtualDatasetCell(PanguAlphaWithLoss(config, pangu_alpha, loss))
+        model = Model(pangu_alpha_with_grads, eval_network=pangu_alpha_with_loss_eval_net, metrics={"ppl": ppl_metric})
+        model.build(ds, ds_eval, sink_size=callback_size)
+        eval_callback = EvalCallBack(model, ds_eval, ppl_metric)
+        callback.append(eval_callback)
+    else:
+        model = Model(pangu_alpha_with_grads)
     model.train(actual_epoch_num, ds, callbacks=callback,
                 sink_size=callback_size, dataset_sink_mode=True)
 
diff --git a/model_zoo/official/nlp/prophetnet/README.md b/model_zoo/official/nlp/prophetnet/README.md
index 97018d75979..7fa77f02b6a 100644
--- a/model_zoo/official/nlp/prophetnet/README.md
+++ b/model_zoo/official/nlp/prophetnet/README.md
@@ -1,658 +1,3 @@
 # Contents
 
-- [MASS: Masked Sequence to Sequence Pre-training for Language Generation Description](#googlenet-description)
-- [Model architecture](#model-architecture)
-- [Dataset](#dataset)
-- [Features](#features)
-- [Script description](#script-description)
-    - [Data Preparation](#Data-Preparation)
-        - [Tokenization](#Tokenization)
-        - [Byte Pair Encoding](#Byte-Pair-Encoding)
-        - [Build Vocabulary](#Build-Vocabulary)
-        - [Generate Dataset](#Generate-Dataset)
-            - [News Crawl Corpus](#News-Crawl-Corpus)
-            - [Gigaword Corpus](#Gigaword-Corpus)
-            - [Cornell Movie Dialog Corpus](#Cornell-Movie-Dialog-Corpus)
-    - [Configuration](#Configuration)
-    - [Training & Evaluation process](#Training-&-Evaluation-process)
-    - [Weights average](#Weights-average)
-    - [Learning rate scheduler](#Learning-rate-scheduler)
-- [Model description](#model-description)
-    - [Performance](#performance)
-        - [Results](#results)
-            - [Training Performance](#training-performance)
-            - [Inference Performance](#inference-performance)
-- [Environment Requirements](#environment-requirements)
-    - [Platform](#Platform)
-    - [Requirements](#Requirements)
-- [Get started](#get-started)
-    - [Pre-training](#Pre-training)
-    - [Fine-tuning](#Fine-tuning)
-    - [Inference](#Inference)
-- [Description of random situation](#description-of-random-situation)
-- [others](#others)
-- [ModelZoo Homepage](#modelzoo-homepage)
-
-# MASS: Masked Sequence to Sequence Pre-training for Language Generation Description
-
-[MASS: Masked Sequence to Sequence Pre-training for Language Generation](https://www.microsoft.com/en-us/research/uploads/prod/2019/06/MASS-paper-updated-002.pdf) was released by MicroSoft in June 2019.
-
-BERT(Devlin et al., 2018) have achieved SOTA in natural language understanding area by pre-training the encoder part of Transformer(Vaswani et al., 2017) with masked rich-resource text. Likewise, GPT(Raddford et al., 2018) pre-trains the decoder part of Transformer with masked(encoder inputs are masked) rich-resource text. Both of them build a robust language model by pre-training with masked rich-resource text.
-
-Inspired by BERT, GPT and other language models, MicroSoft addressed [MASS: Masked Sequence to Sequence Pre-training for Language Generation](https://www.microsoft.com/en-us/research/uploads/prod/2019/06/MASS-paper-updated-002.pdf) which combines BERT's and GPT's idea. MASS has an important parameter k, which controls the masked fragment length. BERT and GPT are specicl case when k equals to 1 and sentence length.
-
-[Introducing MASS – A pre-training method that outperforms BERT and GPT in sequence to sequence language generation tasks](https://www.microsoft.com/en-us/research/blog/introducing-mass-a-pre-training-method-that-outperforms-bert-and-gpt-in-sequence-to-sequence-language-generation-tasks/)
-
-[Paper](https://www.microsoft.com/en-us/research/uploads/prod/2019/06/MASS-paper-updated-002.pdf): Song, Kaitao, Xu Tan, Tao Qin, Jianfeng Lu and Tie-Yan Liu. “MASS: Masked Sequence to Sequence Pre-training for Language Generation.” ICML (2019).
-
-# Model architecture
-
-The overall network architecture of MASS is shown below, which is Transformer(Vaswani et al., 2017):
-
-MASS is consisted of 6-layer encoder and 6-layer decoder with 1024 embedding/hidden size, and 4096 intermediate size between feed forward network which has two full connection layers.
-
-# Dataset
-
-Dataset used:
-
-- monolingual English data from News Crawl dataset(WMT 2019) for pre-training.
-- Gigaword Corpus(Graff et al., 2003) for Text Summarization.
-- Cornell movie dialog corpus(DanescuNiculescu-Mizil & Lee, 2011).
-
-Details about those dataset could be found in [MASS: Masked Sequence to Sequence Pre-training for Language Generation](https://www.microsoft.com/en-us/research/uploads/prod/2019/06/MASS-paper-updated-002.pdf).
-
-# Features
-
-Mass is designed to jointly pre train encoder and decoder to complete the task of language generation.
-First of all, through a sequence to sequence framework, mass only predicts the blocked token, which forces the encoder to understand the meaning of the unshielded token, and encourages the decoder to extract useful information from the encoder.
-Secondly, by predicting the continuous token of the decoder, the decoder can build better language modeling ability than only predicting discrete token.
-Third, by further shielding the input token of the decoder which is not shielded in the encoder, the decoder is encouraged to extract more useful information from the encoder side, rather than using the rich information in the previous token.
-
-# Script description
-
-MASS script and code structure are as follow:
-
-```text
-├── mass
-  ├── README.md                              // Introduction of MASS model.
-  ├── config
-  │   ├──config.py                           // Configuration instance definition.
-  │   ├──config.json                         // Configuration file.
-  ├── src
-  │   ├──dataset
-  │      ├──bi_data_loader.py                // Dataset loader for fine-tune or inferring.
-  │      ├──mono_data_loader.py              // Dataset loader for pre-training.
-  │   ├──language_model
-  │      ├──noise_channel_language_model.p   // Noisy channel language model for dataset generation.
-  │      ├──mass_language_model.py           // MASS language model according to MASS paper.
-  │      ├──loose_masked_language_model.py   // MASS language model according to MASS released code.
-  │      ├──masked_language_model.py         // Masked language model according to MASS paper.
-  │   ├──transformer
-  │      ├──create_attn_mask.py              // Generate mask matrix to remove padding positions.
-  │      ├──transformer.py                   // Transformer model architecture.
-  │      ├──encoder.py                       // Transformer encoder component.
-  │      ├──decoder.py                       // Transformer decoder component.
-  │      ├──self_attention.py                // Self-Attention block component.
-  │      ├──multi_head_attention.py          // Multi-Head Self-Attention component.
-  │      ├──embedding.py                     // Embedding component.
-  │      ├──positional_embedding.py          // Positional embedding component.
-  │      ├──feed_forward_network.py          // Feed forward network.
-  │      ├──residual_conn.py                 // Residual block.
-  │      ├──beam_search.py                   // Beam search decoder for inferring.
-  │      ├──transformer_for_infer.py         // Use Transformer to infer.
-  │      ├──transformer_for_train.py         // Use Transformer to train.
-  │   ├──utils
-  │      ├──byte_pair_encoding.py            // Apply BPE with subword-nmt.
-  │      ├──dictionary.py                    // Dictionary.
-  │      ├──loss_moniter.py                  // Callback of monitering loss during training step.
-  │      ├──lr_scheduler.py                  // Learning rate scheduler.
-  │      ├──ppl_score.py                     // Perplexity score based on N-gram.
-  │      ├──rouge_score.py                   // Calculate ROUGE score.
-  │      ├──load_weights.py                  // Load weights from a checkpoint or NPZ file.
-  │      ├──initializer.py                   // Parameters initializer.
-  ├── vocab
-  │   ├──all.bpe.codes                       // BPE codes table(this file should be generated by user).
-  │   ├──all_en.dict.bin                     // Learned vocabulary file(this file should be generated by user).
-  ├── scripts
-  │   ├──run_ascend.sh                       // Ascend train & evaluate model script.
-  │   ├──run_gpu.sh                          // GPU train & evaluate model script.
-  │   ├──learn_subword.sh                    // Learn BPE codes.
-  │   ├──stop_training.sh                    // Stop training.
-  ├── requirements.txt                       // Requirements of third party package.
-  ├── train.py                               // Train API entry.
-  ├── eval.py                                // Infer API entry.
-  ├── tokenize_corpus.py                     // Corpus tokenization.
-  ├── apply_bpe_encoding.py                  // Applying bpe encoding.
-  ├── weights_average.py                     // Average multi model checkpoints to NPZ format.
-  ├── news_crawl.py                          // Create News Crawl dataset for pre-training.
-  ├── gigaword.py                            // Create Gigaword Corpus.
-  ├── cornell_dialog.py                      // Create Cornell Movie Dialog dataset for conversation response.
-
-```
-
-## Data Preparation
-
-The data preparation of a natural language processing task contains data cleaning, tokenization, encoding and vocabulary generation steps.
-
-In our experiments, using [Byte Pair Encoding(BPE)](https://arxiv.org/abs/1508.07909) could reduce size of vocabulary, and relieve the OOV influence effectively.
-
-Vocabulary could be created using `src/utils/dictionary.py` with text dictionary which is learnt from BPE.
-For more detail about BPE, please refer to [Subword-nmt lib](https://www.cnpython.com/pypi/subword-nmt) or [paper](https://arxiv.org/abs/1508.07909).
-
-In our experiments, vocabulary was learned based on 1.9M sentences from News Crawl Dataset, size of vocabulary is 45755.
-
-Here, we have a brief introduction of data preparation scripts.
-
-### Tokenization
-
-Using `tokenize_corpus.py` could tokenize corpus whose text files are in format of `.txt`.
-
-Major parameters in `tokenize_corpus.py`:
-
-```bash
---corpus_folder:     Corpus folder path, if multi-folders are provided, use ',' split folders.
---output_folder:     Output folder path.
---tokenizer:         Tokenizer to be used, nltk or jieba, if nltk is not installed fully, use jieba instead.
---pool_size:         Processes pool size.
-```
-
-Sample code:
-
-```bash
-python tokenize_corpus.py --corpus_folder /{path}/corpus --output_folder /{path}/tokenized_corpus --tokenizer {nltk|jieba} --pool_size 16
-```
-
-### Byte Pair Encoding
-
-After tokenization, BPE is applied to tokenized corpus with provided `all.bpe.codes`.
-
-Apply BPE script can be found in `apply_bpe_encoding.py`.
-
-Major parameters in `apply_bpe_encoding.py`:
-
-```bash
---codes:            BPE codes file.
---src_folder:       Corpus folders.
---output_folder:    Output files folder.
---prefix:           Prefix of text file in `src_folder`.
---vocab_path:       Generated vocabulary output path.
---threshold:        Filter out words that frequency is lower than threshold.
---processes:        Size of process pool (to accelerate). Default: 2.
-```
-
-Sample code:
-
-```bash
-python tokenize_corpus.py --codes /{path}/all.bpe.codes \
-    --src_folder /{path}/tokenized_corpus \
-    --output_folder /{path}/tokenized_corpus/bpe \
-    --prefix tokenized \
-    --vocab_path /{path}/vocab_en.dict.bin
-    --processes 32
-```
-
-### Build Vocabulary
-
-Support that you want to create a new vocabulary, there are two options:
-
-1. Learn BPE codes from scratch, and create vocabulary with multi vocabulary files from `subword-nmt`.
-2. Create from an existing vocabulary file which lines in the format of `word frequency`.
-3. *Optional*, Create a small vocabulary based on `vocab/all_en.dict.bin` with method of `shink` from `src/utils/dictionary.py`.
-4. Persistent vocabulary to `vocab` folder with method `persistence()`.
-
-Major interface of `src/utils/dictionary.py` are as follow:
-
-1. `shrink(self, threshold=50)`: Shrink the size of vocabulary by filter out words frequency is lower than threshold. It returns a new vocabulary.
-2. `load_from_text(cls, filepaths: List[str])`: Load existed text vocabulary which lines in the format of `word frequency`.  
-3. `load_from_persisted_dict(cls, filepath)`: Load from a persisted binary vocabulary which was saved by calling `persistence()` method.
-4. `persistence(self, path)`: Save vocabulary object to binary file.
-
-Sample code:
-
-```python
-from src.utils import Dictionary
-
-vocabulary = Dictionary.load_from_persisted_dict("vocab/all_en.dict.bin")
-tokens = [1, 2, 3, 4, 5]
-# Convert ids to symbols.
-print([vocabulary[t] for t in tokens])
-
-sentence = ["Hello", "world"]
-# Convert symbols to ids.
-print([vocabulary.index[s] for s in sentence])
-```
-
-For more detail, please refer to the source file.
-
-### Generate Dataset
-
-As mentioned above, three corpus are used in MASS mode, dataset generation scripts for them are provided.
-
-#### News Crawl Corpus
-
-Script can be found in `news_crawl.py`.
-
-Major parameters in `news_crawl.py`:
-
-```bash
-Note that please provide `--existed_vocab` or `--dict_folder` at least one.
-A new vocabulary would be created in `output_folder` when pass `--dict_folder`.
-
---src_folder:       Corpus folders.
---existed_vocab:    Optional, persisted vocabulary file.
---mask_ratio:       Ratio of mask.
---output_folder:    Output dataset files folder path.
---max_len:          Maximum sentence length. If a sentence longer than `max_len`, then drop it.
---suffix:           Optional, suffix of generated dataset files.
---processes:        Optional, size of process pool (to accelerate). Default: 2.
-```
-
-Sample code:
-
-```bash
-python news_crawl.py --src_folder /{path}/news_crawl \
-    --existed_vocab /{path}/mass/vocab/all_en.dict.bin \
-    --mask_ratio 0.5 \
-    --output_folder /{path}/news_crawl_dataset \
-    --max_len 32 \
-    --processes 32
-```
-
-#### Gigaword Corpus
-
-Script can be found in `gigaword.py`.
-
-Major parameters in `gigaword.py`:
-
-```bash
---train_src:        Train source file path.
---train_ref:        Train reference file path.
---test_src:         Test source file path.
---test_ref:         Test reference file path.
---existed_vocab:    Persisted vocabulary file.
---output_folder:    Output dataset files folder path.
---noise_prob:       Optional, add noise prob. Default: 0.
---max_len:          Optional, maximum sentence length. If a sentence longer than `max_len`, then drop it. Default: 64.
---format:           Optional, dataset format, "mindrecord" or "tfrecord". Default: "tfrecord".
-```
-
-Sample code:
-
-```bash
-python gigaword.py --train_src /{path}/gigaword/train_src.txt \
-    --train_ref /{path}/gigaword/train_ref.txt \
-    --test_src /{path}/gigaword/test_src.txt \
-    --test_ref /{path}/gigaword/test_ref.txt \
-    --existed_vocab /{path}/mass/vocab/all_en.dict.bin \
-    --noise_prob 0.1 \
-    --output_folder /{path}/gigaword_dataset \
-    --max_len 64
-```
-
-#### Cornell Movie Dialog Corpus
-
-Script can be found in `cornell_dialog.py`.
-
-Major parameters in `cornell_dialog.py`:
-
-```bash
---src_folder:       Corpus folders.
---existed_vocab:    Persisted vocabulary file.
---train_prefix:     Train source and target file prefix. Default: train.
---test_prefix:      Test source and target file prefix. Default: test.
---output_folder:    Output dataset files folder path.
---max_len:          Maximum sentence length. If a sentence longer than `max_len`, then drop it.
---valid_prefix:     Optional, Valid source and target file prefix. Default: valid.
-```
-
-Sample code:
-
-```bash
-python cornell_dialog.py --src_folder /{path}/cornell_dialog \
-    --existed_vocab /{path}/mass/vocab/all_en.dict.bin \
-    --train_prefix train \
-    --test_prefix test \
-    --noise_prob 0.1 \
-    --output_folder /{path}/cornell_dialog_dataset \
-    --max_len 64
-```
-
-## Configuration
-
-Json file under the path `config/` is the template configuration file.
-Almost all of the options and arguments needed could be assigned conveniently, including the training platform, configurations of dataset and model, arguments of optimizer etc. Optional features such as loss scale and checkpoint are also available by setting the options correspondingly.
-For more detailed information about the attributes, refer to the file `config/config.py`.
-
-## Training & Evaluation process
-
-For training a model, the shell script `run_ascend.sh` or `run_gpu.sh` is all you need. In this scripts, the environment variable is set and the training script `train.py` under `mass` is executed.
-You may start a task training with single device or multiple devices by assigning the options and run the command in bash:
-
-Ascend:
-
-```ascend
-bash run_ascend.sh [--options]
-```
-
-GPU:
-
-```gpu
-bash run_gpu.sh [--options]
-```
-
-The usage of `run_ascend.sh` is shown as below:
-
-```text
-Usage: run_ascend.sh [-h, --help] [-t, --task <CHAR>] [-n, --device_num <N>]
-                     [-i, --device_id <N>] [-j, --hccl_json <FILE>]
-                     [-c, --config <FILE>] [-o, --output <FILE>]
-                     [-v, --vocab <FILE>]
-
-options:
-    -h, --help               show usage
-    -t, --task               select task: CHAR, 't' for train and 'i' for inference".
-    -n, --device_num         device number used for training: N, default is 1.
-    -i, --device_id          device id used for training with single device: N, 0<=N<=7, default is 0.
-    -j, --hccl_json          rank table file used for training with multiple devices: FILE.
-    -c, --config             configuration file as shown in the path 'mass/config': FILE.
-    -o, --output             assign output file of inference: FILE.
-    -v, --vocab              set the vocabulary.
-    -m, --metric             set the metric.
-```
-
-Notes: Be sure to assign the hccl_json file while running a distributed-training.
-
-The usage of `run_gpu.sh` is shown as below:
-
-```text
-Usage: run_gpu.sh [-h, --help] [-t, --task <CHAR>] [-n, --device_num <N>]
-                     [-i, --device_id <N>] [-c, --config <FILE>]
-                     [-o, --output <FILE>] [-v, --vocab <FILE>]
-
-options:
-    -h, --help               show usage
-    -t, --task               select task: CHAR, 't' for train and 'i' for inference".
-    -n, --device_num         device number used for training: N, default is 1.
-    -i, --device_id          device id used for training with single device: N, 0<=N<=7, default is 0.
-    -c, --config             configuration file as shown in the path 'mass/config': FILE.
-    -o, --output             assign output file of inference: FILE.
-    -v, --vocab              set the vocabulary.
-    -m, --metric             set the metric.
-```
-
-The command followed shows a example for training with 2 devices.
-Ascend:
-
-```ascend
-bash run_ascend.sh --task t --device_num 2 --hccl_json /{path}/rank_table.json --config /{path}/config.json
-```
-
-ps. Discontinuous device id is not supported in `run_ascend.sh` at present, device id in `rank_table.json` must start from 0.
-
-GPU:
-
-```gpu
-bash run_gpu.sh --task t --device_num 2 --config /{path}/config.json
-```
-
-If use a single chip, it would be like this:
-Ascend:
-
-```ascend
-bash run_ascend.sh --task t --device_num 1 --device_id 0 --config /{path}/config.json
-```
-
-GPU:
-
-```gpu
-bash run_gpu.sh --task t --device_num 1 --device_id 0 --config /{path}/config.json
-```
-
-## Weights average
-
-```python
-python weights_average.py --input_files your_checkpoint_list --output_file model.npz
-```
-
-The input_files is a list of you checkpoints file. To use model.npz as the weights, add its path in config.json at "existed_ckpt".
-
-```json
-{
-  ...
-  "checkpoint_options": {
-    "existed_ckpt": "/xxx/xxx/model.npz",
-    "save_ckpt_steps": 1000,
-    ...
-  },
-  ...
-}
-```
-
-## Learning rate scheduler
-
-Two learning rate scheduler are provided in our model:
-
-1. [Polynomial decay scheduler](https://towardsdatascience.com/learning-rate-schedules-and-adaptive-learning-rate-methods-for-deep-learning-2c8f433990d1).
-2. [Inverse square root scheduler](https://ece.uwaterloo.ca/~dwharder/aads/Algorithms/Inverse_square_root/).
-
-LR scheduler could be config in `config/config.json`.
-
-For Polynomial decay scheduler, config could be like:
-
-```json
-{
-  ...
-  "learn_rate_config": {
-    "optimizer": "adam",
-    "lr": 1e-4,
-    "lr_scheduler": "poly",
-    "poly_lr_scheduler_power": 0.5,
-    "decay_steps": 10000,
-    "warmup_steps": 2000,
-    "min_lr": 1e-6
-  },
-  ...
-}
-```
-
-For Inverse square root scheduler, config could be like:
-
-```json
-{
-  ...
-  "learn_rate_config": {
-    "optimizer": "adam",
-    "lr": 1e-4,
-    "lr_scheduler": "isr",
-    "decay_start_step": 12000,
-    "warmup_steps": 2000,
-    "min_lr": 1e-6
-  },
-  ...
-}
-```
-
-More detail about LR scheduler could be found in `src/utils/lr_scheduler.py`.
-
-# Model description
-
-The MASS network is implemented by Transformer, which has multi-encoder layers and multi-decoder layers.
-For pre-training, we use the Adam optimizer and loss-scale to get the pre-trained model.
-During fine-turning, we fine-tune this pre-trained model with different dataset according to different tasks.
-During testing, we use the fine-turned model to predict the result, and adopt a beam search algorithm to
-get the most possible prediction results.
-
-## Performance
-
-### Results
-
-#### Fine-Tuning on Text Summarization
-
-The comparisons between MASS and two other pre-training methods in terms of ROUGE score on the text summarization task
-with 3.8M training data are as follows:
-
-| Method         |  RG-1(F)      | RG-2(F)      | RG-L(F)      |
-|:---------------|:--------------|:-------------|:-------------|
-| MASS           | Ongoing       | Ongoing      | Ongoing      |
-
-#### Fine-Tuning on Conversational ResponseGeneration
-
-The comparisons between MASS and other baseline methods in terms of PPL on Cornell Movie Dialog corpus are as follows:
-
-| Method             | Data = 10K       |  Data = 110K    |
-|--------------------|------------------|-----------------|
-| MASS               | Ongoing          | Ongoing         |
-
-#### Training Performance
-
-| Parameters                 | Masked Sequence to Sequence Pre-training for Language Generation          |
-|:---------------------------|:--------------------------------------------------------------------------|
-| Model Version              | v1                                                                        |
-| Resource                   |  Ascend 910; cpu 2.60GHz, 56cores; memory 314G; OS Euler2.8                           |
-| uploaded Date              | 05/24/2020                                                                |
-| MindSpore Version          | 0.2.0                                                                     |
-| Dataset                    | News Crawl 2007-2017 English monolingual corpus, Gigaword corpus, Cornell Movie Dialog corpus |
-| Training Parameters        | Epoch=50, steps=XXX, batch_size=192, lr=1e-4                              |
-| Optimizer                  | Adam                                                                      |
-| Loss Function              | Label smoothed cross-entropy criterion                                    |
-| outputs                    | Sentence and probability                                                  |
-| Loss                       | Lower than 2                                                              |
-| Accuracy                   | For conversation response, ppl=23.52, for text summarization, RG-1=29.79. |
-| Speed                      | 611.45 sentences/s                                                        |
-| Total time                 | --/--                                                                     |
-| Params (M)                 | 44.6M                                                                     |
-| Checkpoint for Fine tuning | ---Mb, --, [A link]()                                                     |
-| Model for inference        | ---Mb, --, [A link]()                                                     |
-| Scripts                    | [A link]()                                                                |
-
-#### Inference Performance
-
-| Parameters                 | Masked Sequence to Sequence Pre-training for Language Generation |
-|:---------------------------|:-----------------------------------------------------------|
-| Model Version              | V1                                                         |
-| Resource                   | Huawei 910                                                 |
-| uploaded Date              | 05/24/2020                                                 |
-| MindSpore Version          | 0.2.0                                                      |
-| Dataset                    | Gigaword corpus, Cornell Movie Dialog corpus               |
-| batch_size                 | ---                                                        |
-| outputs                    | Sentence and probability                                   |
-| Accuracy                   | ppl=23.52 for conversation response, RG-1=29.79 for text summarization. |
-| Speed                      | ---- sentences/s                                           |
-| Total time                 | --/--                                                      |
-| Model for inference        | ---Mb, --, [A link]()                                      |
-
-# Environment Requirements
-
-## Platform
-
-- Hardware(Ascend)
-    - Prepare hardware environment with Ascend processor.
-- Framework
-    - [MindSpore](https://www.mindspore.cn/install/en)
-- For more information, please check the resources below：
-    - [MindSpore tutorials](https://www.mindspore.cn/tutorials/en/master/index.html)
-    - [MindSpore Python API](https://www.mindspore.cn/docs/api/en/master/index.html)
-
-## Requirements
-
-```txt
-nltk
-numpy
-subword-nmt
-rouge
-```
-
-<https://www.mindspore.cn/docs/programming_guide/en/master/multi_platform_inference.html>
-
-# Get started
-
-MASS pre-trains a sequence to sequence model by predicting the masked fragments in an input sequence. After this, downstream tasks including text summarization and conversation response are candidated for fine-tuning the model and for inference.
-Here we provide a practice example to demonstrate the basic usage of MASS for pre-training, fine-tuning a model, and the inference process. The overall process is as follows:
-
-1. Download and process the dataset.
-2. Modify the `config.json` to config the network.
-3. Run a task for pre-training and fine-tuning.
-4. Perform inference and validation.
-
-## Pre-training
-
-For pre-training a model, config the options in `config.json` firstly:
-
-- Assign the `pre_train_dataset` under `dataset_config` node to the dataset path.
-- Choose the optimizer('momentum/adam/lamb' is available).
-- Assign the 'ckpt_prefix' and 'ckpt_path' under `checkpoint_path` to save the model files.
-- Set other arguments including dataset configurations and network configurations.
-- If you have a trained model already, assign the `existed_ckpt` to the checkpoint file.
-
-If you use the ascend chip, run the shell script `run_ascend.sh` as followed:
-
-```ascend
-bash run_ascend.sh -t t -n 1 -i 1 -c /mass/config/config.json
-```
-
-You can also run the shell script `run_gpu.sh` on gpu as followed:
-
-```gpu
-bash run_gpu.sh -t t -n 1 -i 1 -c /mass/config/config.json
-```
-
-Get the log and output files under the path `./train_mass_*/`, and the model file under the path assigned in the `config/config.json` file.
-
-## Fine-tuning
-
-For fine-tuning a model, config the options in `config.json` firstly:
-
-- Assign the `fine_tune_dataset` under `dataset_config` node to the dataset path.
-- Assign the `existed_ckpt` under `checkpoint_path` node to the existed model file generated by pre-training.
-- Choose the optimizer('momentum/adam/lamb' is available).
-- Assign the `ckpt_prefix` and `ckpt_path` under `checkpoint_path` node to save the model files.
-- Set other arguments including dataset configurations and network configurations.
-
-If you use the ascend chip, run the shell script `run_ascend.sh` as followed:
-
-```ascend
-bash run_ascend.sh -t t -n 1 -i 1 -c config/config.json
-```
-
-You can also run the shell script `run_gpu.sh` on gpu as followed:
-
-```gpu
-bash run_gpu.sh -t t -n 1 -i 1 -c config/config.json
-```
-
-Get the log and output files under the path `./train_mass_*/`, and the model file under the path assigned in the `config/config.json` file.
-
-## Inference
-
-If you need to use the trained model to perform inference on multiple hardware platforms, such as GPU, Ascend 910 or Ascend 310, you can refer to this [Link](https://www.mindspore.cn/docs/programming_guide/en/master/multi_platform_inference.html).
-For inference, config the options in `config.json` firstly:
-
-- Assign the `test_dataset` under `dataset_config` node to the dataset path.
-- Assign the `existed_ckpt` under `checkpoint_path` node to the model file produced by fine-tuning.
-- Choose the optimizer('momentum/adam/lamb' is available).
-- Assign the `ckpt_prefix` and `ckpt_path` under `checkpoint_path` node to save the model files.
-- Set other arguments including dataset configurations and network configurations.
-
-If you use the ascend chip, run the shell script `run_ascend.sh` as followed:
-
-```bash
-bash run_ascend.sh -t i -n 1 -i 1 -c config/config.json -o {outputfile}
-```
-
-You can also run the shell script `run_gpu.sh` on gpu as followed:
-
-```gpu
-bash run_gpu.sh -t i -n 1 -i 1 -c config/config.json -o {outputfile}
-```
-
-# Description of random situation
-
-MASS model contains dropout operations, if you want to disable dropout, please set related dropout_rate to 0 in `config/config.json`.
-
-# others
-
-The model has been validated on Ascend environment, not validated on CPU and GPU.
-
-# ModelZoo Homepage  
-
- [Link](https://gitee.com/mindspore/mindspore/tree/master/model_zoo)
+The prophetnet is under development. It will be released soon.
diff --git a/model_zoo/official/recommend/ncf/src/ncf.py b/model_zoo/official/recommend/ncf/src/ncf.py
index c48af973ca7..6a9bb21059f 100644
--- a/model_zoo/official/recommend/ncf/src/ncf.py
+++ b/model_zoo/official/recommend/ncf/src/ncf.py
@@ -20,6 +20,7 @@ from mindspore.nn.layer.activation import get_activation
 import mindspore.common.dtype as mstype
 from mindspore.ops import operations as P
 from mindspore.common.initializer import initializer
+from mindspore.ops import functional as F
 from mindspore.ops import composite as C
 from mindspore.context import ParallelMode
 from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
@@ -260,8 +261,7 @@ class TrainStepWrap(nn.Cell):
         if self.reducer_flag:
             # apply grad reducer on grads
             grads = self.grad_reducer(grads)
-        self.optimizer(grads)
-        return loss
+        return F.depend(loss, self.optimizer(grads))
 
 
 class PredictWithSigmoid(nn.Cell):
diff --git a/model_zoo/official/rl/dqn/README.md b/model_zoo/official/rl/dqn/README.md
index 5731d24741b..bd8b165de3c 100644
--- a/model_zoo/official/rl/dqn/README.md
+++ b/model_zoo/official/rl/dqn/README.md
@@ -34,8 +34,8 @@ The overall network architecture of DQN is show below:
 - Framework
     - [MindSpore](https://www.mindspore.cn/install/en)
 - For more information, please check the resources below：
-    - [MindSpore Tutorials](https://www.mindspore.cn/tutorial/training/en/master/index.html)
-    - [MindSpore Python API](https://www.mindspore.cn/doc/api_python/en/master/index.html)
+    - [MindSpore Tutorials](https://www.mindspore.cn/tutorials/en/master/index.html)
+    - [MindSpore Python API](https://www.mindspore.cn/docs/api/en/master/index.html)
 
 - third-party libraries
 
@@ -118,7 +118,7 @@ pip install gym
 | Loss Function              | MSELoss                                        |MSELoss                                                     |
 | outputs                    | Reward                                                 | Reward                                                 |
 | Params (M)                 | 7.3k                                                       | 7.3k                                                       |
-| Scripts                    | https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/rl/dqn | https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/rl/dqn |
+| Scripts                    | <<<https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/rl/dqn>>> | https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/rl/dqn |
 
 ## [Description of Random Situation](#content)
 
@@ -126,4 +126,4 @@ We use random seed in train.py.
 
 ## [ModeZoo Homepage](#contents)  
 
-Please check the official [homepage](https://gitee.com/mindspore/mindspore/tree/master/model_zoo).
\ No newline at end of file
+Please check the official [homepage](https://gitee.com/mindspore/mindspore/tree/master/model_zoo).
diff --git a/model_zoo/official/rl/dqn/README_CN.md b/model_zoo/official/rl/dqn/README_CN.md
index 6fca820d493..8e014d50cd5 100644
--- a/model_zoo/official/rl/dqn/README_CN.md
+++ b/model_zoo/official/rl/dqn/README_CN.md
@@ -35,10 +35,10 @@ DQN网络的模型结构见论文：
 - 硬件
     - Ascend或GPU处理器
 - 框架
-    - [MindSpore](https://www.mindspore.cn/install/en)
+    - [MindSpore](https://www.mindspore.cn/install/)
 - 通过下面网址可以获得更多信息：
-    - [MindSpore Tutorials](https://www.mindspore.cn/tutorial/training/en/master/index.html)
-    - [MindSpore Python API](https://www.mindspore.cn/doc/api_python/en/master/index.html)
+    - [MindSpore Tutorials](https://www.mindspore.cn/tutorials/zh-CN/master/index.html)
+    - [MindSpore Python API](https://www.mindspore.cn/docs/api/zh-CN/master/index.html)
 
 - 第三方库
 
@@ -115,7 +115,7 @@ pip install gym
 | 损失函数              | MSELoss                                                | MSELoss                                                |
 | 输出                    | 游戏得分值                                                 | 游戏得分值                                                 |
 | 参数量(M)                 | 7.3k                                                       | 7.3k                                                       |
-| 脚本 | https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/rl/dqn | https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/rl/dqn |
+| 脚本 | <<<<https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/rl/dqn>>>> | https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/rl/dqn |
 
 # 随机情况描述
 
diff --git a/model_zoo/official/rl/dqn/train.py b/model_zoo/official/rl/dqn/train.py
index 40a1234028a..d6e193d2878 100644
--- a/model_zoo/official/rl/dqn/train.py
+++ b/model_zoo/official/rl/dqn/train.py
@@ -49,7 +49,6 @@ if __name__ == "__main__":
     context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
     if args.device_target == 'GPU':
         cfg = cfg_gpu
-        context.set_context(device_id=1)
 
     env = gym.make(cfg.game)
     env = env.unwrapped
@@ -105,4 +104,3 @@ if __name__ == "__main__":
     times_numpy = np.array(times)
 
     print(rewards_numpy.mean(), times_numpy.mean())
-    
\ No newline at end of file
diff --git a/model_zoo/research/audio/fcn-4/README.md b/model_zoo/research/audio/fcn-4/README.md
index 29778cb5b39..8341df7c6e6 100644
--- a/model_zoo/research/audio/fcn-4/README.md
+++ b/model_zoo/research/audio/fcn-4/README.md
@@ -188,6 +188,8 @@ SLOG_PRINT_TO_STDOUT=1 python eval.py --device_id 0
         │   ├──run_train.sh             // shell script for distributed on Ascend
         │   ├──run_eval.sh              // shell script for evaluation on Ascend
         │   ├──run_process_data.sh      // shell script for convert audio clips to mindrecord
+        │   ├──run_train_gpu.sh         // shell script for distributed on GPU
+        │   ├──run_eval_gpu.sh          // shell script for evaluation on GPU
         ├── src
         │   ├──dataset.py                     // creating dataset
         │   ├──pre_process_data.py            // pre-process dataset
@@ -253,7 +255,13 @@ Parameters for both training and evaluation can be set in default_config.yaml
 - running on Ascend
 
   ```shell
-  python train.py > train.log 2>&1 &
+  python train.py --device_target Ascend > train.log 2>&1 &
+  ```
+
+- running on GPU
+
+  ```shell
+  python train.py --device_target GPU --data_dir [dataset dir path]  --checkpoint_path [chekpoint save dir]  > train.log 2>&1 &
   ```
 
   The python command above will run in the background, you can view the results through the file `train.log`.
@@ -310,21 +318,21 @@ AUC: 0.90995
 
 #### Evaluation Performance
 
-| Parameters                 | Ascend                                                      |
-| -------------------------- | ----------------------------------------------------------- |
-| Model Version              | FCN-4                                                       |
-| Resource                   | Ascend 910; CPU 2.60GHz, 56cores; Memory 314G; OS Euler2.8            |
-| uploaded Date              | 07/05/2021 (month/day/year)                                 |
-| MindSpore Version          | 1.3.0                                                |
-| Training Parameters        | epoch=10, steps=534, batch_size = 32, lr=0.005              |
-| Optimizer                  | Adam                                                        |
-| Loss Function              | Binary cross entropy                                        |
-| outputs                    | probability                                                 |
-| Loss                       | AUC 0.909                                                  |
-| Speed                      | 1pc: 160 samples/sec;                                       |
-| Total time                 | 1pc: 20 mins;                                               |
-| Checkpoint for Fine tuning | 198.73M(.ckpt file)                                         |
-| Scripts                    | [music_auto_tagging script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/research/audio/fcn-4) |
+| Parameters                 | Ascend                                                      | GPU                                                         |
+| -------------------------- | ----------------------------------------------------------- | ----------------------------------------------------------- |
+| Model Version              | FCN-4                                                       | FCN-4                                                       |
+| Resource                   | Ascend 910; CPU 2.60GHz, 56cores; Memory 314G; OS Euler2.8  | Tesla V100-PICE-32G                                         |
+| uploaded Date              | 07/05/2021 (month/day/year)                                 | 07/26/2021 (month/day/year)                                 |
+| MindSpore Version          | 1.3.0                                                       | 1.3.0                                                       |
+| Training Parameters        | epoch=10, steps=534, batch_size = 32, lr=0.005              | epoch=10, steps=534, batch_size = 32, lr=0.005              |
+| Optimizer                  | Adam                                                        | Adam                                                        |
+| Loss Function              | Binary cross entropy                                        | Binary cross entropy                                        |
+| outputs                    | probability                                                 | probability                                                 |
+| Loss                       | AUC 0.909                                                   | AUC 0.909                                                   |
+| Speed                      | 1pc: 160 samples/sec;                                       | 1pc: 160 samples/sec;                                       |
+| Total time                 | 1pc: 20 mins;                                               | 1pc: 20 mins;                                               |
+| Checkpoint for Fine tuning | 198.73M(.ckpt file)                                         | 198.73M(.ckpt file)                                         |
+| Scripts                    | [music_auto_tagging script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/research/audio/fcn-4)             |
 
 ## [ModelZoo Homepage](#contents)  
 
diff --git a/model_zoo/research/audio/fcn-4/default_config.yaml b/model_zoo/research/audio/fcn-4/default_config.yaml
index 3e0a2fa7602..ea9c77e1b4a 100644
--- a/model_zoo/research/audio/fcn-4/default_config.yaml
+++ b/model_zoo/research/audio/fcn-4/default_config.yaml
@@ -6,7 +6,7 @@ checkpoint_url: ""
 data_path: "/cache/data"
 output_path: "/cache/train"
 load_path: "/cache/checkpoint_path"
-device_target: Ascend
+device_target: "Ascend"
 enable_profiling: False
 
 # ==============================================================================
diff --git a/model_zoo/research/audio/fcn-4/eval.py b/model_zoo/research/audio/fcn-4/eval.py
index ee8811ded88..f517469ce03 100644
--- a/model_zoo/research/audio/fcn-4/eval.py
+++ b/model_zoo/research/audio/fcn-4/eval.py
@@ -18,13 +18,11 @@ python eval.py
 '''
 
 import numpy as np
-
 from src.model_utils.config import config
 from src.model_utils.moxing_adapter import moxing_wrapper
 from src.model_utils.device_adapter import get_device_id
 from src.musictagger import MusicTaggerCNN
 from src.dataset import create_dataset
-
 import mindspore.common.dtype as mstype
 from mindspore import context
 from mindspore import Tensor
@@ -113,12 +111,15 @@ def validation(net, model_path, data_dir, filename, num_consumer, batch):
 def modelarts_process():
     pass
 
+
 @moxing_wrapper(pre_process=modelarts_process)
 def fcn4_eval():
     """
     eval network
     """
-    context.set_context(device_target=config.device_target, mode=context.GRAPH_MODE, device_id=get_device_id())
+    context.set_context(device_target=config.device_target, mode=context.GRAPH_MODE)
+    if config.device_target == 'Ascend':
+        context.set_context(device_id=get_device_id())
 
     network = MusicTaggerCNN(in_classes=[1, 128, 384, 768, 2048],
                              kernel_size=[3, 3, 3, 3, 3],
diff --git a/model_zoo/research/audio/fcn-4/src/model_utils/config.py b/model_zoo/research/audio/fcn-4/src/model_utils/config.py
index 7f1ff6e2b8d..4c37e398014 100644
--- a/model_zoo/research/audio/fcn-4/src/model_utils/config.py
+++ b/model_zoo/research/audio/fcn-4/src/model_utils/config.py
@@ -124,4 +124,5 @@ def get_config():
     final_config = merge(args, default)
     return Config(final_config)
 
+
 config = get_config()
diff --git a/model_zoo/research/audio/fcn-4/train.py b/model_zoo/research/audio/fcn-4/train.py
index 7b79011794f..72747fb0578 100644
--- a/model_zoo/research/audio/fcn-4/train.py
+++ b/model_zoo/research/audio/fcn-4/train.py
@@ -16,7 +16,7 @@
 ##############train models#################
 python train.py
 '''
-
+import os
 from mindspore import context, nn
 from mindspore.train import Model
 from mindspore.common import set_seed
@@ -35,6 +35,7 @@ from src.loss import BCELoss
 def modelarts_pre_process():
     pass
 
+
 @moxing_wrapper(pre_process=modelarts_pre_process)
 def train(model, dataset_direct, filename, columns_list, num_consumer=4,
           batch=16, epoch=50, save_checkpoint_steps=2172, keep_checkpoint_max=50,
@@ -58,8 +59,12 @@ def train(model, dataset_direct, filename, columns_list, num_consumer=4,
 if __name__ == "__main__":
     set_seed(1)
 
-    context.set_context(device_target='Ascend', mode=context.GRAPH_MODE, device_id=get_device_id())
+    config.checkpoint_path = os.path.abspath(config.checkpoint_path)
+    context.set_context(device_target=config.device_target, mode=context.GRAPH_MODE)
     context.set_context(enable_auto_mixed_precision=config.mixed_precision)
+    if config.device_target == 'Ascend':
+        context.set_context(device_id=get_device_id())
+
     network = MusicTaggerCNN(in_classes=[1, 128, 384, 768, 2048],
                              kernel_size=[3, 3, 3, 3, 3],
                              padding=[0] * 5,
diff --git a/model_zoo/research/cv/FaceRecognition/README.md b/model_zoo/research/cv/FaceRecognition/README.md
index a9d67d587e5..b5de6dd628c 100644
--- a/model_zoo/research/cv/FaceRecognition/README.md
+++ b/model_zoo/research/cv/FaceRecognition/README.md
@@ -13,7 +13,7 @@
 
 # [Face Recognition Description](#contents)
 
-This is a face recognition network based on Resnet, with support for training and evaluation on Ascend910.
+This is a face recognition network based on Resnet, with support for training and evaluation on Ascend910, CPU or GPU.
 
 ResNet (residual neural network) was proposed by Kaiming He and other four Chinese of Microsoft Research Institute. Through the use of ResNet unit, it successfully trained 152 layers of neural network, and won the championship in ilsvrc2015. The error rate on top 5 was 3.57%, and the parameter quantity was lower than vggnet, so the effect was very outstanding. Traditional convolution network or full connection network will have more or less information loss. At the same time, it will lead to the disappearance or explosion of gradient, which leads to the failure of deep network training. ResNet solves this problem to a certain extent. By passing the input information to the output, the integrity of the information is protected. The whole network only needs to learn the part of the difference between input and output, which simplifies the learning objectives and difficulties.The structure of ResNet can accelerate the training of neural network very quickly, and the accuracy of the model is also greatly improved. At the same time, ResNet is very popular, even can be directly used in the concept net network.
 
@@ -55,8 +55,8 @@ The directory structure is as follows:
 
 # [Environment Requirements](#contents)
 
-- Hardware（Ascend, CPU）
-    - Prepare hardware environment with Ascend processor. It also supports the use of CPU processor to prepare the
+- Hardware（Ascend, CPU, GPU）
+    - Prepare hardware environment with Ascend processor. It also supports the use of CPU or GPU processor to prepare the
     hardware environment.
 - Framework
     - [MindSpore](https://www.mindspore.cn/install/en)
@@ -71,16 +71,20 @@ The directory structure is as follows:
 The entire code structure is as following:
 
 ```python
-└─ face_recognition
+└─ FaceRecognition
+  ├── ascend310_infer
   ├── README.md                             // descriptions about face_recognition
   ├── scripts
   │   ├── run_distribute_train_base.sh      // shell script for distributed training on Ascend
   │   ├── run_distribute_train_beta.sh      // shell script for distributed training on Ascend
+  │   ├── run_distribute_train_for_gpu.sh   // shell script for distributed training on GPU
   │   ├── run_eval.sh                       // shell script for evaluation on Ascend
   │   ├── run_eval_cpu.sh                   // shell script for evaluation on CPU
+  │   ├── run_eval_gpu.sh                   // shell script for evaluation on gpu
   │   ├── run_export.sh                     // shell script for exporting air model
   │   ├── run_standalone_train_base.sh      // shell script for standalone training on Ascend
   │   ├── run_standalone_train_beta.sh      // shell script for standalone training on Ascend
+  │   ├── run_standalone_train_for_gpu.sh   // shell script for standalone training on GPU
   │   ├── run_train_base_cpu.sh             // shell script for training on CPU
   │   ├── run_train_btae_cpu.sh             // shell script for training on CPU
   ├── src
@@ -97,7 +101,7 @@ The entire code structure is as following:
   │   ├── lrsche_factory.py                 // learning rate schedule
   │   ├── me_init.py                        // network parameter init method
   │   ├── metric_factory.py                 // metric fc layer
-  ── utils
+  ── model_utils
   │   ├── __init__.py                       // init file
   │   ├── config.py                         // parameter analysis
   │   ├── device_adapter.py                 // device adapter
@@ -124,58 +128,98 @@ The entire code structure is as following:
 
       ```bash
       cd ./scripts
-      sh run_standalone_train_base.sh [USE_DEVICE_ID]
+      bash run_standalone_train_base.sh [USE_DEVICE_ID]
       ```
 
       for example:
 
       ```bash
       cd ./scripts
-      sh run_standalone_train_base.sh 0
+      bash run_standalone_train_base.sh 0
       ```
 
     - beta model
 
       ```bash
       cd ./scripts
-      sh run_standalone_train_beta.sh [USE_DEVICE_ID]
+      bash run_standalone_train_beta.sh [USE_DEVICE_ID]
       ```
 
       for example:
 
       ```bash
       cd ./scripts
-      sh run_standalone_train_beta.sh 0
+      bash run_standalone_train_beta.sh 0
       ```
 
-- Distribute mode (recommended)
+- Stand alone mode(GPU)
+
+    - base/beta model
+
+      ```bash
+      cd ./scripts
+      bash run_standalone_train_for_gpu.sh [base/beta] [DEVICE_ID](optional)
+      ```
+
+      for example:
+
+      ```bash
+      #base
+      cd ./scripts
+      bash run_standalone_train_for_gpu.sh base 3
+      #beta
+      cd ./scripts
+      bash run_standalone_train_for_gpu.sh beta 3
+      ```
+
+- Distribute mode (Ascend, recommended)
 
     - base model
 
       ```bash
       cd ./scripts
-      sh run_distribute_train_base.sh [RANK_TABLE]
+      bash run_distribute_train_base.sh [RANK_TABLE]
       ```
 
       for example:
 
       ```bash
       cd ./scripts
-      sh run_distribute_train_base.sh ./rank_table_8p.json
+      bash run_distribute_train_base.sh ./rank_table_8p.json
       ```
 
     - beta model
 
       ```bash
       cd ./scripts
-      sh run_distribute_train_beta.sh [RANK_TABLE]
+      bash run_distribute_train_beta.sh [RANK_TABLE]
       ```
 
       for example:
 
       ```bash
       cd ./scripts
-      sh run_distribute_train_beta.sh ./rank_table_8p.json
+      bash run_distribute_train_beta.sh ./rank_table_8p.json
+      ```
+
+- Distribute mode (GPU)
+
+    - base model
+
+      ```bash
+      cd ./scripts
+      bash run_distribute_train_for_gpu.sh [RANK_SIZE] [base/beta] [CONFIG_PATH](optional)
+      ```
+
+      for example:
+
+      ```bash
+      #base
+      cd ./scripts
+      bash run_distribute_train_for_gpu.sh 8 base
+      #beta
+      cd ./scripts
+      bash run_distribute_train_for_gpu.sh 8 beta
       ```
 
 - Stand alone mode(CPU)
@@ -184,28 +228,28 @@ The entire code structure is as following:
 
       ```bash
       cd ./scripts
-      sh run_train_base_cpu.sh
+      bash run_train_base_cpu.sh
       ```
 
       for example:
 
       ```bash
       cd ./scripts
-      sh run_train_base_cpu.sh
+      bash run_train_base_cpu.sh
       ```
 
     - beta model
 
       ```bash
       cd ./scripts
-      sh run_train_beta_cpu.sh
+      bash run_train_beta_cpu.sh
       ```
 
       for example:
 
       ```bash
       cd ./scripts
-      sh run_train_beta_cpu.sh
+      bash run_train_beta_cpu.sh
       ```
 
 - ModelArts (If you want to run in modelarts, please check the official documentation of [modelarts](https://support.huaweicloud.com/modelarts/), and you can start training as follows)
@@ -352,34 +396,34 @@ You will get the result as following in "./scripts/acc.log" if 'dis_dataset' ran
 
 ### Training Performance
 
-| Parameters                 | Face Recognition                                            |
-| -------------------------- | ----------------------------------------------------------- |
-| Model Version              | V1                                                          |
-| Resource                   | Ascend 910; CPU 2.60GHz, 192cores; Memory 755G; OS Euler2.8                |
-| uploaded Date              | 09/30/2020 (month/day/year)                                 |
-| MindSpore Version          | 1.0.0                                                       |
-| Dataset                    | 4.7 million images                                          |
-| Training Parameters        | epoch=100, batch_size=192, momentum=0.9                     |
-| Optimizer                  | Momentum                                                    |
-| Loss Function              | Cross Entropy                                               |
-| outputs                    | probability                                                 |
-| Speed                      | 1pc: 350-600 fps; 8pcs: 2500-4500 fps                       |
-| Total time                 | 1pc: NA hours; 8pcs: 10 hours                               |
-| Checkpoint for Fine tuning | 584M (.ckpt file)                                           |
+| Parameters                 | Face Recognition                                            | Face Recognition   |
+| -------------------------- | ----------------------------------------------------------- | ------------------ |
+| Model Version              | V1                                                          | V1                 |
+| Resource                   | Ascend 910; CPU 2.60GHz, 192cores; Memory 755G; OS Euler2.8 | NV SMX2 V100-32G   |
+| uploaded Date              | 09/30/2020 (month/day/year)                        | 29/07/2021 (month/day/year) |
+| MindSpore Version          | 1.0.0                                                       | 1.3.0              |
+| Dataset                    | 4.7 million images                                          | 4.7 million images |
+| Training Parameters        | epoch=100, batch_size=192, momentum=0.9 | epoch=18(base:9, beta:9), batch_size=192, momentum=0.9 |
+| Optimizer                  | Momentum                                                    | Momentum           |
+| Loss Function              | Cross Entropy                                               | Cross Entropy      |
+| outputs                    | probability                                                 | probability        |
+| Speed                      | 1pc: 350-600 fps; 8pcs: 2500-4500 fps    | base: 1pc: 310-360 fps, 8pcs: 2000-2500 fps; beta: 1pc: 420-470 fps, 8pcs: 3000-3500 fps; |
+| Total time                 | 1pc: NA hours; 8pcs: 10 hours   | 1pc: NA hours; 8pcs: 5.5(base) + 3.7(beta) hours |
+| Checkpoint for Fine tuning | 584M (.ckpt file)            | 768M (.ckpt file, base), 582M (.ckpt file, beta)  |
 
 ### Evaluation Performance
 
-| Parameters          |Face Recognition For Tracking|
-| ------------------- | --------------------------- |
-| Model Version       | V1                          |
-| Resource            | Ascend 910; OS Euler2.8                      |
-| Uploaded Date       | 09/30/2020 (month/day/year) |
-| MindSpore Version   | 1.0.0                       |
-| Dataset             | 1.1 million images          |
-| batch_size          | 512                         |
-| outputs             | ACC                         |
-| ACC                 | 0.9                         |
-| Model for inference | 584M (.ckpt file)           |
+| Parameters          | Face Recognition            | Face Recognition            |
+| ------------------- | --------------------------- | --------------------------- |
+| Model Version       | V1                          | V1                          |
+| Resource            | Ascend 910; OS Euler2.8     | NV SMX2 V100-32G            |
+| Uploaded Date       | 09/30/2020 (month/day/year) | 29/07/2021 (month/day/year) |
+| MindSpore Version   | 1.0.0                       | 1.3.0                       |
+| Dataset             | 1.1 million images          | 1.1 million images          |
+| batch_size          | 512                         | 512                         |
+| outputs             | ACC                         | ACC                         |
+| ACC                 | 0.9                         | 0.9                         |
+| Model for inference | 584M (.ckpt file)           | 582M (.ckpt file)           |
 
 # [ModelZoo Homepage](#contents)
 
diff --git a/model_zoo/research/cv/FaceRecognition/eval.py b/model_zoo/research/cv/FaceRecognition/eval.py
index a63df93ef6e..85e3b505a99 100644
--- a/model_zoo/research/cv/FaceRecognition/eval.py
+++ b/model_zoo/research/cv/FaceRecognition/eval.py
@@ -20,6 +20,7 @@ from pprint import pformat
 import numpy as np
 import cv2
 
+from mindspore.common import dtype as mstype
 import mindspore.dataset.transforms.py_transforms as transforms
 import mindspore.dataset.vision.py_transforms as vision
 import mindspore.dataset as de
@@ -127,7 +128,6 @@ def get_model(args):
     net = get_backbone(args)
     if args.fp16:
         net.add_flags_recursive(fp16=True)
-
     if args.weight.endswith('.ckpt'):
         param_dict = load_checkpoint(args.weight)
         param_dict_new = {}
@@ -143,6 +143,8 @@ def get_model(args):
     else:
         args.logger.info('ERROR, not support file:{}, please check weight in config.py'.format(args.weight))
         return 0
+    if args.device_target == 'GPU':
+        net.to_float(mstype.float32)
     net.set_train(False)
     return net
 
diff --git a/model_zoo/research/cv/FaceRecognition/export.py b/model_zoo/research/cv/FaceRecognition/export.py
index e0f02cfb668..94bb715d40d 100644
--- a/model_zoo/research/cv/FaceRecognition/export.py
+++ b/model_zoo/research/cv/FaceRecognition/export.py
@@ -23,7 +23,7 @@ from mindspore.train.serialization import export, load_checkpoint, load_param_in
 from src.backbone.resnet import get_backbone
 from model_utils.config import config
 from model_utils.moxing_adapter import moxing_wrapper
-
+from model_utils.device_adapter import get_device_id
 
 def modelarts_pre_process():
     '''modelarts pre process function.'''
@@ -41,8 +41,8 @@ def run_export():
     config.backbone = config.export_backbone
     config.use_drop = config.export_use_drop
 
-    devid = 0
-    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False, device_id=devid)
+    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False,
+                        device_id=get_device_id())
 
     network = get_backbone(config)
 
diff --git a/model_zoo/research/cv/FaceRecognition/scripts/run_export.sh b/model_zoo/research/cv/FaceRecognition/scripts/run_export.sh
index 7c70371fcce..44c590545cb 100644
--- a/model_zoo/research/cv/FaceRecognition/scripts/run_export.sh
+++ b/model_zoo/research/cv/FaceRecognition/scripts/run_export.sh
@@ -14,9 +14,9 @@
 # limitations under the License.
 # ============================================================================
 
-if [ $# != 3 ]
+if [ $# != 3 ]  && [ $# != 2 ]
 then
-    echo "Usage: sh run_export.sh [BATCH_SIZE] [USE_DEVICE_ID] [PRETRAINED_BACKBONE]"
+    echo "Usage: sh run_export.sh [PRETRAINED_BACKBONE] [DEVICE_TARGET] [USE_DEVICE_ID](optional)"
 exit 1
 fi
 
@@ -42,9 +42,13 @@ SCRIPT_NAME='export.py'
 
 ulimit -c unlimited
 
-BATCH_SIZE=$1
-USE_DEVICE_ID=$2
-PRETRAINED_BACKBONE=$(get_real_path $3)
+PRETRAINED_BACKBONE=$(get_real_path $1)
+DEVICE_TARGET=$2
+if [ $# = 3 ]; then
+  USE_DEVICE_ID=$3
+else
+  USE_DEVICE_ID=0
+fi
 
 if [ ! -f $PRETRAINED_BACKBONE ]
     then
@@ -52,7 +56,6 @@ if [ ! -f $PRETRAINED_BACKBONE ]
 exit 1
 fi
 
-echo $BATCH_SIZE
 echo $USE_DEVICE_ID
 echo $PRETRAINED_BACKBONE
 
@@ -65,7 +68,8 @@ cd ${current_exec_path}/device$USE_DEVICE_ID || exit
 dev=`expr $USE_DEVICE_ID + 0`
 export DEVICE_ID=$dev
 python ${dirname_path}/${SCRIPT_NAME} \
+    --config_path=${dirname_path}/beta_config.yaml \
     --pretrained=$PRETRAINED_BACKBONE \
-    --batch_size=$BATCH_SIZE > convert.log  2>&1 &
+    --device_target=$DEVICE_TARGET > convert.log  2>&1 &
 
 echo 'running'
diff --git a/model_zoo/research/cv/FaceRecognition/src/custom_dataset.py b/model_zoo/research/cv/FaceRecognition/src/custom_dataset.py
index 031ab0ed314..3c7a5e09dc3 100644
--- a/model_zoo/research/cv/FaceRecognition/src/custom_dataset.py
+++ b/model_zoo/research/cv/FaceRecognition/src/custom_dataset.py
@@ -162,6 +162,8 @@ class ImageFolderDataset:
                     with open(cache_path, 'wb') as fw:
                         pickle.dump(cache, fw)
                     print('local dump cache:{}'.format(cache_path))
+                    with open(cache_path[:cache_path.rfind('.')] + 'txt', 'w') as _f:
+                        _f.write("Rank 0 dump data to cache_path:'{}' successfully!".format(cache_path))
             else:
                 with open(cache_path, 'wb') as fw:
                     pickle.dump(cache, fw)
diff --git a/model_zoo/research/cv/FaceRecognition/src/dataset_factory.py b/model_zoo/research/cv/FaceRecognition/src/dataset_factory.py
index 8e04874a153..37f50c76348 100644
--- a/model_zoo/research/cv/FaceRecognition/src/dataset_factory.py
+++ b/model_zoo/research/cv/FaceRecognition/src/dataset_factory.py
@@ -21,18 +21,16 @@ import mindspore.dataset as de
 import mindspore.dataset.vision.py_transforms as F
 import mindspore.dataset.transforms.py_transforms as F2
 
-from model_utils.config import config
 from src.custom_dataset import DistributedCustomSampler, CustomDataset
 
 __all__ = ['get_de_dataset']
 
-
 def get_de_dataset(args):
     '''get_de_dataset'''
     lbl_transforms = [F.ToType(np.int32)]
     transform_label = F2.Compose(lbl_transforms)
 
-    drop_remainder = False
+    drop_remainder = True
 
     transforms = [F.ToPIL(),
                   F.RandomHorizontalFlip(),
@@ -40,16 +38,21 @@ def get_de_dataset(args):
                   F.Normalize(mean=[0.5], std=[0.5])]
     transform = F2.Compose(transforms)
     cache_path = os.path.join('cache', os.path.basename(args.data_dir), 'data_cache.pkl')
-    print(cache_path)
+    if args.device_target == 'GPU' and args.local_rank != 0:
+        while True:
+            if os.path.exists(cache_path) and os.path.exists(cache_path[:cache_path.rfind('.')] + 'txt'):
+                break
+        with open(cache_path[:cache_path.rfind('.')] + 'txt') as _f:
+            args.logger.info(_f.readline())
     if not os.path.exists(os.path.dirname(cache_path)):
         os.makedirs(os.path.dirname(cache_path))
     dataset = CustomDataset(args.data_dir, cache_path, args.is_distributed)
     args.logger.info("dataset len:{}".format(dataset.__len__()))
-    if config.device_target == 'Ascend':
+    if args.device_target in ('Ascend', 'GPU'):
         sampler = DistributedCustomSampler(dataset, num_replicas=args.world_size, rank=args.local_rank,
                                            is_distributed=args.is_distributed)
         de_dataset = de.GeneratorDataset(dataset, ["image", "label"], sampler=sampler)
-    elif config.device_target == 'CPU':
+    elif args.device_target == 'CPU':
         de_dataset = de.GeneratorDataset(dataset, ["image", "label"])
     args.logger.info("after sampler de_dataset datasize :{}".format(de_dataset.get_dataset_size()))
     de_dataset = de_dataset.map(input_columns="image", operations=transform)
diff --git a/model_zoo/research/cv/FaceRecognition/train.py b/model_zoo/research/cv/FaceRecognition/train.py
index 409fc557a74..b4868ed83c7 100644
--- a/model_zoo/research/cv/FaceRecognition/train.py
+++ b/model_zoo/research/cv/FaceRecognition/train.py
@@ -20,7 +20,7 @@ import mindspore
 from mindspore.nn import Cell
 from mindspore import context
 from mindspore.context import ParallelMode
-from mindspore.communication.management import init
+from mindspore.communication.management import init, get_group_size, get_rank
 from mindspore.nn.optim import Momentum
 from mindspore.train.model import Model
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig
@@ -42,7 +42,11 @@ from model_utils.device_adapter import get_device_id, get_device_num, get_rank_i
 
 mindspore.common.seed.set_seed(1)
 context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False,
-                    device_id=get_device_id(), reserve_class_name_in_scope=False, enable_auto_mixed_precision=False)
+                    reserve_class_name_in_scope=False, enable_graph_kernel=config.device_target == "GPU")
+if config.device_target == 'Ascend':
+    context.set_context(enable_auto_mixed_precision=False)
+if config.device_target != 'GPU' or not config.is_distributed:
+    context.set_context(device_id=get_device_id())
 
 class DistributedHelper(Cell):
     '''DistributedHelper'''
@@ -175,15 +179,38 @@ def modelarts_pre_process():
 
     config.ckpt_path = os.path.join(config.output_path, str(get_rank_id()), config.ckpt_path)
 
+def model_context():
+    """set context for facerecognition"""
+    if config.is_distributed:
+        parallel_mode = ParallelMode.HYBRID_PARALLEL if config.device_target == 'Ascend' else ParallelMode.DATA_PARALLEL
+    else:
+        parallel_mode = ParallelMode.STAND_ALONE
+    if config.is_distributed:
+        if config.device_target == 'Ascend':
+            context.set_auto_parallel_context(parallel_mode=parallel_mode,
+                                              device_num=config.world_size, gradients_mean=True)
+            init()
+            config.local_rank = get_rank_id()
+            config.world_size = get_device_num()
+        elif config.device_target == 'GPU':
+            init()
+            device_num = get_group_size()
+            context.reset_auto_parallel_context()
+            context.set_auto_parallel_context(device_num=device_num,
+                                              parallel_mode=parallel_mode,
+                                              gradients_mean=True)
+            config.world_size = get_group_size()
+            config.local_rank = get_rank()
+        else:
+            pass
+
 
 @moxing_wrapper(pre_process=modelarts_pre_process)
 def run_train():
     '''run train function.'''
-    config.local_rank = get_rank_id()
-    config.world_size = get_device_num()
+    model_context()
     log_path = os.path.join(config.ckpt_path, 'logs')
     config.logger = get_logger(log_path, config.local_rank)
-
     support_train_stage = ['base', 'beta']
     if config.train_stage.lower() not in support_train_stage:
         config.logger.info('your train stage is not support.')
@@ -192,13 +219,6 @@ def run_train():
     if not os.path.exists(config.data_dir):
         config.logger.info('ERROR, data_dir is not exists, please set data_dir in config.py')
         raise ValueError('ERROR, data_dir is not exists, please set data_dir in config.py')
-
-    parallel_mode = ParallelMode.HYBRID_PARALLEL if config.is_distributed else ParallelMode.STAND_ALONE
-    context.set_auto_parallel_context(parallel_mode=parallel_mode,
-                                      device_num=config.world_size, gradients_mean=True)
-    if config.is_distributed:
-        init()
-
     if config.local_rank % 8 == 0:
         if not os.path.exists(config.ckpt_path):
             os.makedirs(config.ckpt_path)
@@ -260,7 +280,7 @@ def run_train():
                                             scale_window=2000)
     if config.device_target == "Ascend":
         model = Model(train_net, optimizer=opt, metrics=None, loss_scale_manager=scale_manager)
-    elif config.device_target == "CPU":
+    elif config.device_target in ("CPU", "GPU"):
         model = Model(train_net, optimizer=opt, metrics=None, loss_scale_manager=None)
 
     save_checkpoint_steps = config.ckpt_steps
diff --git a/model_zoo/research/cv/ICNet/README.md b/model_zoo/research/cv/ICNet/README.md
index 8b330c1d874..de7842d4787 100644
--- a/model_zoo/research/cv/ICNet/README.md
+++ b/model_zoo/research/cv/ICNet/README.md
@@ -42,8 +42,8 @@ It contains 5,000 finely annotated images split into training, validation and te
 - frame:
     - [Mindspore](https://www.mindspore.cn/install)
 - For details, please refer to the following resources:
-    - [MindSpore course](https://www.mindspore.cn/tutorial/training/zh-CN/master/index.html)
-    - [MindSpore Python API](https://www.mindspore.cn/doc/api_python/zh-CN/master/index.html)
+    - [MindSpore course](https://www.mindspore.cn/tutorials/en/master/index.html)
+    - [MindSpore Python API](https://www.mindspore.cn/docs/api/zh-CN/master/index.html)
 
 # [Scription Description](#Content)
 
diff --git a/model_zoo/research/cv/LightCNN/README_CN.md b/model_zoo/research/cv/LightCNN/README_CN.md
index 2109793ac33..b7dc957f487 100644
--- a/model_zoo/research/cv/LightCNN/README_CN.md
+++ b/model_zoo/research/cv/LightCNN/README_CN.md
@@ -103,11 +103,11 @@ LightCNN适用于有大量噪声的人脸识别数据集，提出了maxout 的
 - 框架
     - [MindSpore](https://www.mindspore.cn/install)
 - 如需查看详情，请参见如下资源：
-    - [MindSpore教程](https://www.mindspore.cn/tutorial/training/zh-CN/master/index.html)
-    - [MindSpore Python API](https://www.mindspore.cn/doc/api_python/zh-CN/master/index.html)
+    - [MindSpore教程](https://www.mindspore.cn/tutorials/zh-CN/master/index.html)
+    - [MindSpore Python API](https://www.mindspore.cn/docs/api/zh-CN/master/index.html)
 - 生成config json文件用于8卡训练。
     - [简易教程](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools)
-    - 详细配置方法请参照[官网教程](https://www.mindspore.cn/tutorial/training/zh-CN/r1.2/advanced_use/distributed_training_ascend.html#id4)。
+    - 详细配置方法请参照[官网教程](https://www.mindspore.cn/tutorials/zh-CN/master/intermediate/distributed_training/distributed_training_ascend.html#id3)。
 
 # 快速入门
 
@@ -439,7 +439,7 @@ python3 eval_blfur.py \
 [3]: https://drive.google.com/file/d/0ByNaVHFekDPRbFg1YTNiMUxNYXc/view?usp=sharing
 [4]: https://hyper.ai/datasets/5543
 [5]: https://pan.baidu.com/s/1eR6vHFO
-[6]: https://www.mindspore.cn/tutorial/training/zh-CN/master/advanced_use/enable_mixed_precision.html
+[6]: https://www.mindspore.cn/docs/programming_guide/zh-CN/master/enable_mixed_precision.html
 [7]: http://www.cbsr.ia.ac.cn/users/scliao/projects/blufr/BLUFR.zip
 [8]: https://github.com/AlfredXiangWu/face_verification_experiment/blob/master/code/lfw_pairs.mat
 [9]: https://github.com/AlfredXiangWu/face_verification_experiment/blob/master/results/LightenedCNN_B_lfw.mat
diff --git a/model_zoo/research/cv/LightCNN/scripts/eval_blufr.sh b/model_zoo/research/cv/LightCNN/scripts/eval_blufr.sh
index c82226b4748..17034deab5d 100644
--- a/model_zoo/research/cv/LightCNN/scripts/eval_blufr.sh
+++ b/model_zoo/research/cv/LightCNN/scripts/eval_blufr.sh
@@ -16,7 +16,7 @@
 export DEVICE_ID=$1
 ckpt_file=$2
 
-python3 eval_blfur.py \
+python3 eval_blufr.py \
           --device_target Ascend \
           --device_id "${DEVICE_ID}" \
           --resume "${ckpt_file}" > eval_blfur.log 2>&1 &
diff --git a/model_zoo/research/cv/Pix2Pix/eval.py b/model_zoo/research/cv/Pix2Pix/eval.py
index 40477aef26b..f073fed7657 100644
--- a/model_zoo/research/cv/Pix2Pix/eval.py
+++ b/model_zoo/research/cv/Pix2Pix/eval.py
@@ -17,6 +17,7 @@
     Evaluate Pix2Pix Model.
 """
 
+import os
 from mindspore import Tensor, nn
 from mindspore.train.serialization import load_checkpoint
 from mindspore.train.serialization import load_param_into_net
@@ -63,6 +64,9 @@ if __name__ == '__main__':
     param_G = load_checkpoint(ckpt_url)
     load_param_into_net(netG, param_G)
 
+    if not os.path.isdir(args.predict_dir):
+        os.makedirs(args.predict_dir)
+
     data_loader_val = ds_val.create_dict_iterator(output_numpy=True, num_epochs=args.epoch_num)
     print("=======Starting evaluating Loop=======")
     for i, data in enumerate(data_loader_val):
diff --git a/model_zoo/research/cv/Pix2Pix/train.py b/model_zoo/research/cv/Pix2Pix/train.py
index 42ed87a1294..a4720bd655d 100644
--- a/model_zoo/research/cv/Pix2Pix/train.py
+++ b/model_zoo/research/cv/Pix2Pix/train.py
@@ -59,6 +59,13 @@ if __name__ == '__main__':
     train_net = TrainOneStepCell(loss_netD=d_loss_net, loss_netG=g_loss_net, optimizerD=d_opt, optimizerG=g_opt, sens=1)
     train_net.set_train()
 
+    if not os.path.isdir(args.train_fakeimg_dir):
+        os.makedirs(args.train_fakeimg_dir)
+    if not os.path.isdir(args.loss_show_dir):
+        os.makedirs(args.loss_show_dir)
+    if not os.path.isdir(args.ckpt_dir):
+        os.makedirs(args.ckpt_dir)
+
     # Training loop
     G_losses = []
     D_losses = []
diff --git a/model_zoo/research/cv/ProtoNet/README.md b/model_zoo/research/cv/ProtoNet/README.md
index e424789b6a6..07558b25353 100644
--- a/model_zoo/research/cv/ProtoNet/README.md
+++ b/model_zoo/research/cv/ProtoNet/README.md
@@ -29,7 +29,7 @@ Proto-Net contains 2 parts named Encoder and Relation. The former one has 4 conv
 
 Note that you can run the scripts based on the dataset mentioned in original paper or widely used in relevant domain/network architecture. In the following sections, we will introduce how to run the scripts using the related dataset below.
 
-The dataset omniglot can be obtained from (https://github.com/orobix/Prototypical-Networks-for-Few-shot-Learning-PyTorch/blob/master/). You can obtain the dataset after running the scripts.
+The dataset omniglot can be obtained from (<https://github.com/orobix/Prototypical-Networks-for-Few-shot-Learning-PyTorch/blob/master/>). You can obtain the dataset after running the scripts.
 
 ```bash
 cd src
@@ -65,8 +65,8 @@ python train.py
 - Framework
     - [MindSpore](https://www.mindspore.cn/install/en)
 - For more information, please check the resources below：
-  - [MindSpore Tutorials](https://www.mindspore.cn/tutorials/en/master/index.html)
-  - [MindSpore Python API](https://www.mindspore.cn/docs/api/en/master/index.html)
+    - [MindSpore Tutorials](https://www.mindspore.cn/tutorials/en/master/index.html)
+    - [MindSpore Python API](https://www.mindspore.cn/docs/api/en/master/index.html)
 
 # [Quick Start](#contents)
 
@@ -165,7 +165,7 @@ Test Acc: 0.9954400658607483  Loss: 0.02102319709956646
 | Speed                      | 215 ms/step                          |
 | Total time                 | 3 h 23m (8p)                |
 | Checkpoint for Fine tuning | 440 KB (.ckpt file)                                         |
-| Scripts                    | https://gitee.com/mindspore/mindspore/tree/r1.1/model_zoo/research/cv/protonet |
+| Scripts                    | <https://gitee.com/mindspore/mindspore/tree/master/model_zoo/research/cv/ProtoNet> |
 
 # [ModelZoo Homepage](#contents)
 
diff --git a/model_zoo/research/cv/SE-Net/README.md b/model_zoo/research/cv/SE-Net/README.md
index 9927bcc9c33..3ab272fbbdd 100644
--- a/model_zoo/research/cv/SE-Net/README.md
+++ b/model_zoo/research/cv/SE-Net/README.md
@@ -42,7 +42,7 @@
 
 ## Description
 
-something should be written here.
+"Squeeze-and-Excitation" (SE) block adaptively recalibrates channel-wise feature responses by explicitly modelling interdependencies between channels.
 
 ## Paper
 
diff --git a/model_zoo/research/cv/SiamFC/readme.md b/model_zoo/research/cv/SiamFC/readme.md
deleted file mode 100644
index 21026f95241..00000000000
--- a/model_zoo/research/cv/SiamFC/readme.md
+++ /dev/null
@@ -1,195 +0,0 @@
-# Contents
-
-- [SiamFC Description](#SiamFC-Description)
-- [Model Architecture](#SiamFC-Architecture)
-- [Dataset](#SiamFC-dataset)
-- [Environmental requirements](#Environmental)
-- [Quick Start](#quick-start)
-- [Script Description](#script-description)
-    - [Script and Sample Code](#script-and-sample-code)
-    - [Script Parameters](#script-parameters)
-    - [Training Process](#training-process)
-        - [Training](#training)
-    - [Evaluation Process](#evaluation-process)
-        - [Evaluation](#evaluation)
-- [Model Description](#model-description)
-    - [Performance](#performance)
-        - [Evaluation Performance](#evaluation-performance)
-
-# [SiamFC Description](#Contents)
-
-Siamfc proposes a new full convolution twin network as the basic tracking algorithm, which is trained end-to-end on ilsvrc15 target tracking video data set. Our tracker exceeds the real-time requirement in frame rate. Although it is very simple, it achieves the best performance on multiple benchmarks.
-
-[paper](https://arxiv.org/pdf/1606.09549.pdf)  Luca Bertinetto Jack Valmadre Jo˜ao F. Henriques Andrea Vedaldi Philip H. S. Torr
-Department of Engineering Science, University of Oxford
-
-# [Model Architecture](#Contents)
-
-Siamfc first uses full convolution alexnet for feature extraction online and offline, and uses twin network to train the template and background respectively. On line, after getting the box of the first frame, it carries out centrrop, and then loads checkpoint to track the subsequent frames. In order to find the box, it needs to carry out a series of penalties on the score graph, Finally, the final prediction point is obtained by twice trilinear interpolation.
-
-# [Dataset](#Contents)
-
-used Dataset :[ILSVRC2015-VID](http://bvisionweb1.cs.unc.edu/ilsvrc2015/ILSVRC2015_VID.tar.gz)
-
-- Dataset size : 85GB ,total 30 type
-    - Training set: a total of 3862 videos and their corresponding frame pictures and box positions
-    - Verification set: 555 videos and corresponding pictures and box locations
-    - Test set: a total of 973 videos and corresponding pictures and box locations
-- Data format: the image is in h*w*C format, the box position includes the coordinates of the lower left corner and the upper right corner, the format is XML, and the XML needs to be parsed
-
-# [Environmental requirements](#Contents)
-
-- Hardware :(Ascend)
-    - Prepare ascend processor to build hardware environment
-- frame:
-    - [Mindspore](https://www.mindspore.cn/install)
-- For details, please refer to the following resources:
-    - [MindSpore course](https://www.mindspore.cn/tutorial/training/zh-CN/master/index.html)
-    - [MindSpore Python API](https://www.mindspore.cn/doc/api_python/zh-CN/master/index.html)
-- more API
-    - got10k toolkit
-    - opencv
-    - lmdb
-
-# [quick start](#Contents)
-
-After installing mindspree through the official website, you can follow the following steps to train and evaluate:
-
-- Run the python script to preprocess the data set
-
-  python src/create_dataset_ILSVRC.py -d data_dir -o output_dir
-
-- Run Python script to create LMDB
-
-  python src/create_lmdb.py -d data_dir -o output_dir
-
-  for example：
-  data_dir = '/data/VID/ILSVRC_VID_CURATION_train'
-  output_dir = '/data/VID/ILSVRC_VID_CURATION_train.lmdb'
-
-  __Remarks:The encrypted pathname is used as the index.Therefore,you cannot change the location of the dataset
-  after creating it, because you need to find the corresponding image according to the index.__
-
-- Run the script for training
-
-  bash run_standalone_train_ascend.sh [Device_ID] [Dataset_path]
-  Remarks:For the training set position after preprocessing
-
-- more
-
-  This example is single card training.
-
-- Run the script for evaluation
-
-  python eval.py,need got10k toolkit,the dataset is OTB2013(50) or OTB2015(100)
-
-# [Script description](#Contents)
-
-## Script and sample code
-
-```python
-    ├── SiamFC
-        ├── README.md                    // Notes on siamfc
-        ├── scripts
-        │   ├──ma-pre-start.sh          // Create environment before modelarts training
-        │   ├──run_standalone_train_ascend.sh             // Single card training in ascend
-        │   ├──run_distribution_ascend.sh          // Multi card distributed training in ascend
-        ├── src
-        │   ├──alexnet.py             // Create dataset
-        │   ├──config.py              // Alexnet architecture
-        │   ├──custom_transforms.py   //Data set processing
-        │   ├──dataset.py            //GeneratorDataset
-        │   ├──Groupconv.py        //Mindpore does not support group convolution at present. This is an alternative
-        │   ├──lr_generator.py       //Dynamic learning rate
-        │   ├──tracker.py           //Trace script
-        │   ├──utils.py             // utils
-        │   ├──create_dataset_ILSVRC.py     // Create dataset
-        │   ├──create_lmdb.py               //Create LMDB
-        ├── train.py               // Training script
-        ├── eval.py               //  Evaluation script
-```
-
-## Script parameters
-
-python train.py and config.py The main parameters are as follows:
-
-- data_path：An absolutely complete path to training and evaluation data sets.
-- epoch_size：Total training rounds
-- batch_size：Training batch size.
-- image_height：The image height is used as the model input.
-- image_width：The image width is used as the model input.
-- exemplar_size：Template size
-- instance_size：Sample size.
-- lr：Learning rate.
-- frame_range：Select the frame interval of the template and sample.
-- response_scale：Scaling factor of score chart.
-
-## Training process
-
-### Training
-
-- Running in ascend processor environment
-
-```python
-  python train.py  --device_id=${DEVICE_ID} --data_path=${DATASET_PATH}
-```
-
-- After training, the loss value is as follows:
-
-```bash
-  grep "loss is " log
-  epoch: 1 step: 1, loss is 1.14123213
-  ...
-  epoch: 1 step: 1536, loss is 0.5234123
-  epoch: 1 step: 1537, loss is 0.4523326
-  epoch: 1 step: 1538, loss is 0.6235748
- ...
-```
-
-- Model checkpoints are saved in the current directory.
-
-- After training, the loss value is as follows:
-
-```bash
-  grep "loss is " log:
-  epoch: 30 step: 1, loss is 0.12534634
-  ...
-  epoch: 30 step: 1560, loss is 0.2364573
-  epoch: 30 step: 1561, loss is 0.156347
-  epoch: 30 step: 1561, loss is 0.173423
-```
-
-## Evaluation process
-
-Check the checkpoint path used for evaluation before running the following command.
-
-- Running in ascend processor environment
-
-```bash
-  python eval.py  --device_id=${DEVICE_ID} --model_path=${MODEL_PATH}
-```
-
-  The results were as follows:
-
-```bash
-  SiamFC_159_50_6650.ckpt -prec_score:0.777 -succ_score:0.589 _succ_rate:0.754
-```
-
-# [Model description](#Contents)
-
-## performance
-
-### Evaluate performance
-
-|parameter   | Ascend        |
-| -------------------------- | ---------------------------------------------- |
-|resources     | Ascend 910；CPU 2.60GHz, 192core；memory：755G |
-|Upload date   |2021.5.20         |
-|mindspore version   |mindspore1.2.0     |
-|training parameter | epoch=50,step=6650,batch_size=8,lr_init=1e-2,lr_endl=1e-5   |
-|optimizer     |SGD optimizer，momentum=0.0,weight_decay=0.0    |
-|loss function     |BCEWithLogits   |
-|training speed    | epoch time：285693.557 ms per step time :42.961 ms |
-|total time        |about 5 hours    |
-|Script URL        |https://gitee.com/mindspore/mindspore/tree/master/model_zoo/research/cv/SiamFC  |
-|Random number seed         |set_seed = 1234     |
diff --git a/model_zoo/research/cv/StarGAN/export.py b/model_zoo/research/cv/StarGAN/export.py
index 79bfa385922..b465d213cbd 100644
--- a/model_zoo/research/cv/StarGAN/export.py
+++ b/model_zoo/research/cv/StarGAN/export.py
@@ -38,4 +38,4 @@ if __name__ == '__main__':
     input_array = Tensor(np.random.uniform(-1.0, 1.0, size=(1, 3, 128, 128)).astype(np.float32))
     input_label = Tensor(np.random.uniform(-1.0, 1.0, size=(1, 5)).astype(np.float32))
     G_file = f"StarGAN_Generator"
-    export(G, input_array, file_name=G_file, file_format=config.file_format)
+    export(G, input_array, input_label, file_name=G_file, file_format=config.file_format)
diff --git a/model_zoo/research/cv/StarGAN/scripts/eval_ascend.sh b/model_zoo/research/cv/StarGAN/scripts/eval_ascend.sh
deleted file mode 100644
index a434f21103b..00000000000
--- a/model_zoo/research/cv/StarGAN/scripts/eval_ascend.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-
-
-export DEVICE_NUM=1
-export DEVICE_ID=0
-export MODE='test'
-echo "start training for device $DEVICE_ID"
-env > env.log
-python eval.py --run_distribute=0 --device_num=$DEVICE_NUM --device_id=$DEVICE_ID --mode=$MODE> log_eval.txt 2>&1 &
-
-cd ..
diff --git a/model_zoo/research/cv/StarGAN/scripts/eval_standalone_ascend.sh b/model_zoo/research/cv/StarGAN/scripts/eval_standalone_ascend.sh
deleted file mode 100644
index a434f21103b..00000000000
--- a/model_zoo/research/cv/StarGAN/scripts/eval_standalone_ascend.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-
-
-export DEVICE_NUM=1
-export DEVICE_ID=0
-export MODE='test'
-echo "start training for device $DEVICE_ID"
-env > env.log
-python eval.py --run_distribute=0 --device_num=$DEVICE_NUM --device_id=$DEVICE_ID --mode=$MODE> log_eval.txt 2>&1 &
-
-cd ..
diff --git a/model_zoo/research/cv/StarGAN/scripts/run_distribute_train_ascend.sh b/model_zoo/research/cv/StarGAN/scripts/run_distribute_train_ascend.sh
deleted file mode 100644
index c02d5d2053d..00000000000
--- a/model_zoo/research/cv/StarGAN/scripts/run_distribute_train_ascend.sh
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/bin/bash
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-
-if [ $# != 3 ]
-then
-    echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [DISTRIBUTE] [RANK_TABLE_FILE]"
-    exit 1
-fi
-
-echo "After running the script, the network runs in the background. The log will be generated in LOGx/log.txt"
-
-export RANK_SIZE=$1
-DISTRIBUTE=$2
-export RANK_TABLE_FILE=$3
-
-for((i=0;i<RANK_SIZE;i++))
-do
-        export DEVICE_ID=$i
-        rm -rf LOG$i
-        mkdir ./LOG$i
-        cp ./*.json ./LOG$i
-        cp ./*.py ./LOG$i
-        cp -r ./src ./LOG$i
-        cp -r ./scripts ./LOG$i
-        cd ./LOG$i || exit
-        export RANK_ID=$i
-        echo "start training for rank $i, device $DEVICE_ID"
-        env > env.log
-        if [ $# == 3 ]
-        then
-                python train.py \
-                --run_distribute=$DISTRIBUTE \
-                --device_num=$RANK_SIZE \
-                --device_id=$DEVICE_ID > log.txt 2>&1 &
-        fi
-        cd ../
-done
diff --git a/model_zoo/research/cv/StarGAN/scripts/run_standalone_train_ascend.sh b/model_zoo/research/cv/StarGAN/scripts/run_standalone_train_ascend.sh
deleted file mode 100644
index fe96b624e13..00000000000
--- a/model_zoo/research/cv/StarGAN/scripts/run_standalone_train_ascend.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-
-
-export DEVICE_NUM=1
-export DEVICE_ID=0
-
-echo "start training for device $DEVICE_ID"
-env > env.log
-python train.py --run_distribute=0 --device_num=$DEVICE_NUM --device_id=$DEVICE_ID > log.txt 2>&1 &
-
-cd ..
diff --git a/model_zoo/research/cv/StarGAN/src/config.py b/model_zoo/research/cv/StarGAN/src/config.py
index 42385de9a23..aeab0c8152a 100644
--- a/model_zoo/research/cv/StarGAN/src/config.py
+++ b/model_zoo/research/cv/StarGAN/src/config.py
@@ -67,8 +67,8 @@ def get_config():
 
 
     # Directories.
-    parser.add_argument('--celeba_image_dir', type=str, default=r'/root/wcy/StarGAN_copy/celeba/images')
-    parser.add_argument('--attr_path', type=str, default=r'/root/wcy/StarGAN_copy/celeba/list_attr_celeba.txt')
+    parser.add_argument('--celeba_image_dir', type=str, default=r'/home/data/celeba/images')
+    parser.add_argument('--attr_path', type=str, default=r'/home/data/celeba/list_attr_celeba.txt')
     parser.add_argument('--rafd_image_dir', type=str, default='data/RaFD/train')
     parser.add_argument('--log_dir', type=str, default='stargan/logs')
     parser.add_argument('--model_save_dir', type=str, default='./models/')
diff --git a/model_zoo/research/cv/StarGAN/src/utils.py b/model_zoo/research/cv/StarGAN/src/utils.py
index e0527f7eb59..e8796736226 100644
--- a/model_zoo/research/cv/StarGAN/src/utils.py
+++ b/model_zoo/research/cv/StarGAN/src/utils.py
@@ -69,7 +69,7 @@ class DistributedSampler:
 def resume_model(config, G, D):
     """Restore the trained generator and discriminator."""
     print('Loading the trained models from step {}...'.format(config.resume_iters))
-    G_path = os.path.join(config.model_save_dir, f"Generator_2-0_%d.ckpt" % config.resume_iters)
+    G_path = os.path.join(config.model_save_dir, f"Generator-0_%d.ckpt" % config.resume_iters)
     # D_path = os.path.join(config.model_save_dir, f"Net_D_%d.ckpt" % config.resume_iters)
     param_G = load_checkpoint(G_path, G)
     # param_D = load_checkpoint(D_path, D)
diff --git a/model_zoo/research/cv/StarGAN/train.py b/model_zoo/research/cv/StarGAN/train.py
index dd395f71905..daf2ac4f15a 100644
--- a/model_zoo/research/cv/StarGAN/train.py
+++ b/model_zoo/research/cv/StarGAN/train.py
@@ -66,7 +66,7 @@ if __name__ == '__main__':
         # unzip data
         path = os.getcwd()
         print("cwd: %s" % path)
-        data_url = 'obs://hit-wcy/data/CelebA/'
+        data_url = 'obs://data/CelebA/'
 
         data_name = '/celeba.zip'
         print('listdir1: %s' % os.listdir('./'))
diff --git a/model_zoo/research/cv/centernet_resnet50_v1/readme.md b/model_zoo/research/cv/centernet_resnet50_v1/readme.md
index 98e11601ddc..fb06595d576 100644
--- a/model_zoo/research/cv/centernet_resnet50_v1/readme.md
+++ b/model_zoo/research/cv/centernet_resnet50_v1/readme.md
@@ -390,7 +390,7 @@ overall performance on coco2017 validation dataset
 
 ### Convert
 
-If you want to infer the network on Ascend 310, you should convert the model to AIR:
+If you want to infer the network on Ascend 310, you should convert the model to MINDIR. What you need to do before is to specify the `ckpt_file` that needs to be converted in the `export_config` section of the `src/config.py` file.
 
 ```python
 python export.py [DEVICE_ID]
diff --git a/model_zoo/research/cv/centernet_resnet50_v1/scripts/ascend_distributed_launcher/hyper_parameter_config.ini b/model_zoo/research/cv/centernet_resnet50_v1/scripts/ascend_distributed_launcher/hyper_parameter_config.ini
index 5e29aed8ea7..b24733db50e 100644
--- a/model_zoo/research/cv/centernet_resnet50_v1/scripts/ascend_distributed_launcher/hyper_parameter_config.ini
+++ b/model_zoo/research/cv/centernet_resnet50_v1/scripts/ascend_distributed_launcher/hyper_parameter_config.ini
@@ -5,7 +5,7 @@ enable_save_ckpt=true
 do_shuffle=true
 enable_data_sink=true
 data_sink_steps=-1
-save_checkpoint_path=/root/centernet_50/model_zoo/1.0
+save_checkpoint_path=./
 save_checkpoint_steps=4580
 save_checkpoint_num=30
 mindrecord_prefix="coco_det.train.mind"
diff --git a/model_zoo/research/cv/inception_resnet_v2/README.md b/model_zoo/research/cv/inception_resnet_v2/README.md
index 950c441243c..00c0e5a7a42 100644
--- a/model_zoo/research/cv/inception_resnet_v2/README.md
+++ b/model_zoo/research/cv/inception_resnet_v2/README.md
@@ -50,6 +50,8 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil
 
 # [Environment Requirements](#contents)
 
+- Hardware（Ascend）
+    - Prepare hardware environment with Ascend processor. If you want to try Ascend  , please send the [application form](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/file/other/Ascend%20Model%20Zoo%E4%BD%93%E9%AA%8C%E8%B5%84%E6%BA%90%E7%94%B3%E8%AF%B7%E8%A1%A8.docx) to ascend@huawei.com. Once approved, you can get the resources.
 - Framework
     - [MindSpore](https://www.mindspore.cn/install)
 - For more information, please check the resources below：
@@ -69,13 +71,13 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil
     ├─run_distribute_train_ascend.sh    # launch distributed training with ascend platform(8p)
     └─run_eval_ascend.sh                # launch evaluating with ascend platform
   ├─src
-    ├─config.py                         # parameter configuration
-    ├─dataset.py                        # data preprocessing
-    ├─inception_resnet_v2.py.py         # network definition
-    └─callback.py                       # eval callback function
-  ├─eval.py                             # eval net
-  ├─export.py                           # export checkpoint, surpport .onnx, .air, .mindir convert
-  └─train.py                            # train net
+    ├─config.py                       # parameter configuration
+    ├─dataset.py                      # data preprocessing
+    ├─inception_resnet_v2.py.py       # network definition
+    └─callback.py                     # eval callback function
+  ├─eval.py                           # eval net
+  ├─export.py                         # export checkpoint, surpport .onnx, .air, .mindir convert
+  └─train.py                          # train net
 ```
 
 ## [Script Parameters](#contents)
@@ -113,13 +115,13 @@ You can start training using python or shell scripts. The usage of shell scripts
 
 ```bash
 # distribute training example(8p)
-bash scripts/run_distribute_train_ascend.sh RANK_TABLE_FILE DATA_DIR
+bash scripts/run_distribute_train_ascend.sh RANK_TABLE_FILE DATA_PATH DATA_DIR
 # standalone training
 bash scripts/run_standalone_train_ascend.sh DEVICE_ID DATA_DIR
 ```
 
 > Notes:
-> RANK_TABLE_FILE can refer to [Link](https://www.mindspore.cn/tutorials/zh-CN/master/intermediate/distributed_training/distributed_training_ascend.html) , and the device_ip can be got as [Link](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools). For large models like InceptionV4, it's better to export an external environment variable `export HCCL_CONNECT_TIMEOUT=600` to extend hccl connection checking time from the default 120 seconds to 600 seconds. Otherwise, the connection could be timeout since compiling time increases with the growth of model size.
+> RANK_TABLE_FILE can refer to [Link](https://www.mindspore.cn/tutorial/training/en/master/advanced_use/distributed_training_ascend.html) , and the device_ip can be got as [Link](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools). For large models like InceptionV4, it's better to export an external environment variable `export HCCL_CONNECT_TIMEOUT=600` to extend hccl connection checking time from the default 120 seconds to 600 seconds. Otherwise, the connection could be timeout since compiling time increases with the growth of model size.
 >
 > This is processor cores binding operation regarding the `device_num` and total processor numbers. If you are not expect to do it, remove the operations `taskset` in `scripts/run_distribute_train.sh`
 
@@ -130,7 +132,7 @@ bash scripts/run_standalone_train_ascend.sh DEVICE_ID DATA_DIR
   shell:
       Ascend:
       # distribute training example(8p)
-      bash scripts/run_distribute_train_ascend.sh RANK_TABLE_FILE DATA_DIR
+      bash scripts/run_distribute_train_ascend.sh RANK_TABLE_FILE DATA_PATH DATA_DIR
       # standalone training
       bash scripts/run_standalone_train_ascend.sh
 ```
@@ -188,8 +190,8 @@ metric: {'Loss': 1.0413, 'Top1-Acc':0.79955, 'Top5-Acc':0.9439}
 | Optimizer           | RMSProp                                       |
 | Loss Function       | SoftmaxCrossEntropyWithLogits                 |
 | Outputs             | probability                                   |
-| Speed               | 1pc: 556 img/s; 8pcs: 4430 img/s              |
-| Total time          | 8pcs: 24h                                     |
+| Total time (8p)     | 24h                                           |
+| performance         | 1p: 556 img/s / 8p: 4430 img/s                |
 
 #### Inference Performance
 
diff --git a/model_zoo/research/cv/inception_resnet_v2/README_CN.md b/model_zoo/research/cv/inception_resnet_v2/README_CN.md
index 403d6b9d286..ddd778dc3be 100644
--- a/model_zoo/research/cv/inception_resnet_v2/README_CN.md
+++ b/model_zoo/research/cv/inception_resnet_v2/README_CN.md
@@ -1,5 +1,8 @@
 # 目录
 
+<!-- TOC -->
+
+- [目录](#目录)
 - [Inception_ResNet_v2描述](#Inception_ResNet_v2描述)
 - [模型架构](#模型架构)
 - [数据集](#数据集)
@@ -24,6 +27,8 @@
 - [随机情况说明](#随机情况说明)
 - [ModelZoo主页](#modelzoo主页)
 
+<!-- /TOC -->
+
 # Inception_ResNet_v2描述
 
 Inception_ResNet_v2是Google的深度学习卷积架构系列的一个版本。Inception_ResNet_v2主要通过修改以前的Inception架构来减少计算资源的消耗。该方法在2016年出版的Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning一文中提出的。
@@ -57,12 +62,12 @@ Inception_ResNet_v2的总体网络架构如下：
 # 环境要求
 
 - 硬件（Ascend）
-    - 使用Ascend来搭建硬件环境。
+- 使用Ascend来搭建硬件环境。
 - 框架
-    - [MindSpore](https://www.mindspore.cn/install)
+- [MindSpore](https://www.mindspore.cn/install)
 - 如需查看详情，请参见如下资源：
-    - [MindSpore教程](https://www.mindspore.cn/tutorials/zh-CN/master/index.html)
-    - [MindSpore Python API](https://www.mindspore.cn/docs/api/zh-CN/master/index.html)
+- [MindSpore教程](https://www.mindspore.cn/tutorials/zh-CN/master/index.html)
+- [MindSpore Python API](https://www.mindspore.cn/docs/api/zh-CN/master/index.html)
 
 # 脚本说明
 
@@ -77,13 +82,13 @@ Inception_ResNet_v2的总体网络架构如下：
     ├─run_distribute_train_ascend.sh    # launch distributed training with ascend platform(8p)
     └─run_eval_ascend.sh                # launch evaluating with ascend platform
   ├─src
-    ├─config.py                         # parameter configuration
-    ├─dataset.py                        # data preprocessing
-    ├─inception_resnet_v2.py.py         # network definition
-    └─callback.py                       # eval callback function
-  ├─eval.py                             # eval net
-  ├─export.py                           # export checkpoint, surpport .onnx, .air, .mindir convert
-  └─train.py                            # train net
+    ├─config.py                       # parameter configuration
+    ├─dataset.py                      # data preprocessing
+    ├─inception_resnet_v2.py.py       # network definition
+    └─callback.py                     # eval callback function
+  ├─eval.py                           # eval net
+  ├─export.py                         # export checkpoint, surpport .onnx, .air, .mindir convert
+  └─train.py                          # train net
 ```
 
 ## 脚本参数
@@ -121,12 +126,12 @@ Major parameters in train.py and config.py are:
 
     ```bash
     # distribute training example(8p)
-    bash scripts/run_distribute_train_ascend.sh RANK_TABLE_FILE DATA_DIR
+    bash scripts/run_distribute_train_ascend.sh RANK_TABLE_FILE DATA_PATH DATA_DIR
     # standalone training
     bash scripts/run_standalone_train_ascend.sh DEVICE_ID DATA_DIR
     ```
 
-> 注：RANK_TABLE_FILE可参考[链接](https://www.mindspore.cn/tutorials/zh-CN/master/intermediate/distributed_training/distributed_training_ascend.html)。device_ip可以通过[链接](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools)获取
+> 注：RANK_TABLE_FILE可参考[链接](https://www.mindspore.cn/tutorial/training/zh-CN/master/advanced_use/distributed_training_ascend.html)。device_ip可以通过[链接](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools)获取
 
 ### 结果
 
@@ -191,8 +196,7 @@ python export.py --ckpt_file [CKPT_PATH] --device_target [DEVICE_TARGET] --file_
 | 损失函数              | Softmax交叉熵                            |
 | 输出                    | 概率                                    |
 | 损失                       | 1.98                                           |
-| 速度 | 1卡：556 img/秒；8卡：4430 img/秒 |
-| 总时长            | 8卡：24小时                                         |
+| 总时长（8卡）            | 24小时                                            |
 
 #### 推理性能
 
diff --git a/model_zoo/research/cv/inception_resnet_v2/src/config.py b/model_zoo/research/cv/inception_resnet_v2/src/config.py
index 4f5f0bac408..cc2e39ccf22 100644
--- a/model_zoo/research/cv/inception_resnet_v2/src/config.py
+++ b/model_zoo/research/cv/inception_resnet_v2/src/config.py
@@ -40,6 +40,5 @@ config_ascend = edict({
     'lr_end': 0.000004,
     'lr_max': 0.4,
     'warmup_epochs': 1,
-    'start_epoch': 1,
-
+    'start_epoch': 1
 })
diff --git a/model_zoo/research/cv/inception_resnet_v2/src/dataset.py b/model_zoo/research/cv/inception_resnet_v2/src/dataset.py
index 89f4d016090..bb8b3421abc 100644
--- a/model_zoo/research/cv/inception_resnet_v2/src/dataset.py
+++ b/model_zoo/research/cv/inception_resnet_v2/src/dataset.py
@@ -20,8 +20,9 @@ import mindspore.dataset.vision.c_transforms as C
 import mindspore.dataset.transforms.c_transforms as C2
 from src.config import config_ascend as config
 
-DEVICE_ID = 1
-DEVICE_NUM = 1
+
+device_id = int(os.getenv('DEVICE_ID'))
+device_num = int(os.getenv('RANK_SIZE'))
 
 
 def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32):
@@ -39,8 +40,6 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32):
     """
 
     do_shuffle = bool(do_train)
-    device_id = int(os.getenv('DEVICE_ID')) if os.getenv('DEVICE_ID') else DEVICE_ID
-    device_num = int(os.getenv('RANK_SIZE')) if os.getenv('RANK_SIZE') else DEVICE_NUM
 
     if device_num == 1 or not do_train:
         ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=config.work_nums, shuffle=do_shuffle)
@@ -72,7 +71,9 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32):
     ds = ds.map(input_columns="label", operations=type_cast_op, num_parallel_workers=config.work_nums)
     ds = ds.map(input_columns="image", operations=trans, num_parallel_workers=config.work_nums)
 
+    # apply batch operations
     ds = ds.batch(batch_size, drop_remainder=True)
 
+    # apply dataset repeat operation
     ds = ds.repeat(repeat_num)
     return ds
diff --git a/model_zoo/research/cv/mnasnet/scripts/run_distribute_train.sh b/model_zoo/research/cv/mnasnet/scripts/run_distribute_train.sh
index 338e713907d..9f8303dd334 100644
--- a/model_zoo/research/cv/mnasnet/scripts/run_distribute_train.sh
+++ b/model_zoo/research/cv/mnasnet/scripts/run_distribute_train.sh
@@ -21,6 +21,7 @@ ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
 export RANK_TABLE_FILE=$PATH1
+export HCCL_CONNECT_TIMEOUT=1200
 
 for ((i = 0; i < ${DEVICE_NUM}; i++)); do
   let deviceID=$i
@@ -37,4 +38,4 @@ for ((i = 0; i < ${DEVICE_NUM}; i++)); do
   env >env.log
   python -u train.py --run_distribute=True --dataset_path=$PATH2 > log.txt 2>&1 &
   cd ..
-done
\ No newline at end of file
+done
diff --git a/model_zoo/research/cv/ntsnet/src/network.py b/model_zoo/research/cv/ntsnet/src/network.py
index 87c9bad1601..7cf4080f096 100644
--- a/model_zoo/research/cv/ntsnet/src/network.py
+++ b/model_zoo/research/cv/ntsnet/src/network.py
@@ -16,20 +16,12 @@
 import math
 import os
 import time
-import threading
 import numpy as np
 from mindspore import ops, load_checkpoint, load_param_into_net, Tensor, nn
 from mindspore.ops import functional as F
 from mindspore.ops import operations as P
-import mindspore.context as context
 import mindspore.common.dtype as mstype
-from mindspore.train.callback import Callback
-from mindspore.train.callback._callback import set_cur_net
-from mindspore.train.callback._checkpoint import _check_file_name_prefix, _cur_dir, CheckpointConfig, CheckpointManager, \
-    _chg_ckpt_file_name_if_same_exist
-from mindspore.train._utils import _make_directory
-from mindspore.train.serialization import save_checkpoint, _save_graph
-from mindspore.parallel._ps_context import _is_role_pserver, _get_ps_mode_rank
+from mindspore.train.callback import Callback, ModelCheckpoint
 from src.resnet import resnet50
 from src.config import config
 
@@ -321,7 +313,7 @@ class WithLossCell(nn.Cell):
         return self._backbone
 
 
-class ModelCheckpoint(Callback):
+class NtsnetModelCheckpoint(ModelCheckpoint):
     """
     The checkpoint callback class.
     It is called to combine with train process and save the model and network parameters after training.
@@ -339,142 +331,17 @@ class ModelCheckpoint(Callback):
 
     def __init__(self, prefix='CKP', directory=None, ckconfig=None,
                  device_num=1, device_id=0, args=None, run_modelart=False):
-        super(ModelCheckpoint, self).__init__()
-        self._latest_ckpt_file_name = ""
-        self._init_time = time.time()
-        self._last_time = time.time()
-        self._last_time_for_keep = time.time()
-        self._last_triggered_step = 0
+        super(NtsnetModelCheckpoint, self).__init__(prefix, directory, ckconfig)
         self.run_modelart = run_modelart
-        if _check_file_name_prefix(prefix):
-            self._prefix = prefix
-        else:
-            raise ValueError("Prefix {} for checkpoint file name invalid, "
-                             "please check and correct it and then continue.".format(prefix))
-        if directory is not None:
-            self._directory = _make_directory(directory)
-        else:
-            self._directory = _cur_dir
-        if ckconfig is None:
-            self._config = CheckpointConfig()
-        else:
-            if not isinstance(ckconfig, CheckpointConfig):
-                raise TypeError("ckconfig should be CheckpointConfig type.")
-            self._config = ckconfig
-        # get existing checkpoint files
-        self._manager = CheckpointManager()
-        self._prefix = _chg_ckpt_file_name_if_same_exist(self._directory, self._prefix)
-        self._graph_saved = False
-        self._need_flush_from_cache = True
         self.device_num = device_num
         self.device_id = device_id
         self.args = args
 
-    def step_end(self, run_context):
-        """
-        Save the checkpoint at the end of step.
-        Args:
-            run_context (RunContext): Context of the train running.
-        """
-        if _is_role_pserver():
-            self._prefix = "PServer_" + str(_get_ps_mode_rank()) + "_" + self._prefix
-        cb_params = run_context.original_args()
-        _make_directory(self._directory)
-        # save graph (only once)
-        if not self._graph_saved:
-            graph_file_name = os.path.join(self._directory, self._prefix + '-graph.meta')
-            if os.path.isfile(graph_file_name) and context.get_context("mode") == context.GRAPH_MODE:
-                os.remove(graph_file_name)
-            _save_graph(cb_params.train_network, graph_file_name)
-            self._graph_saved = True
-        thread_list = threading.enumerate()
-        for thread in thread_list:
-            if thread.getName() == "asyn_save_ckpt":
-                thread.join()
-        self._save_ckpt(cb_params)
-
-    def end(self, run_context):
-        """
-        Save the last checkpoint after training finished.
-        Args:
-            run_context (RunContext): Context of the train running.
-        """
-        cb_params = run_context.original_args()
-        _to_save_last_ckpt = True
-        self._save_ckpt(cb_params, _to_save_last_ckpt)
-        thread_list = threading.enumerate()
-        for thread in thread_list:
-            if thread.getName() == "asyn_save_ckpt":
-                thread.join()
-        from mindspore.parallel._cell_wrapper import destroy_allgather_cell
-        destroy_allgather_cell()
-
-    def _check_save_ckpt(self, cb_params, force_to_save):
-        """Check whether save checkpoint files or not."""
-        if self._config.save_checkpoint_steps and self._config.save_checkpoint_steps > 0:
-            if cb_params.cur_step_num >= self._last_triggered_step + self._config.save_checkpoint_steps \
-                    or force_to_save is True:
-                return True
-        elif self._config.save_checkpoint_seconds and self._config.save_checkpoint_seconds > 0:
-            self._cur_time = time.time()
-            if (self._cur_time - self._last_time) > self._config.save_checkpoint_seconds or force_to_save is True:
-                self._last_time = self._cur_time
-                return True
-        return False
-
     def _save_ckpt(self, cb_params, force_to_save=False):
-        """Save checkpoint files."""
-        if cb_params.cur_step_num == self._last_triggered_step:
-            return
-        save_ckpt = self._check_save_ckpt(cb_params, force_to_save)
-        step_num_in_epoch = int((cb_params.cur_step_num - 1) % cb_params.batch_num + 1)
-        if save_ckpt:
-            cur_ckpoint_file = self._prefix + "-" + str(cb_params.cur_epoch_num) + "_" \
-                               + str(step_num_in_epoch) + ".ckpt"
-            # update checkpoint file list.
-            self._manager.update_ckpoint_filelist(self._directory, self._prefix)
-            # keep checkpoint files number equal max number.
-            if self._config.keep_checkpoint_max and \
-                    0 < self._config.keep_checkpoint_max <= self._manager.ckpoint_num:
-                self._manager.remove_oldest_ckpoint_file()
-            elif self._config.keep_checkpoint_per_n_minutes and \
-                    self._config.keep_checkpoint_per_n_minutes > 0:
-                self._cur_time_for_keep = time.time()
-                if (self._cur_time_for_keep - self._last_time_for_keep) \
-                        < self._config.keep_checkpoint_per_n_minutes * 60:
-                    self._manager.keep_one_ckpoint_per_minutes(self._config.keep_checkpoint_per_n_minutes,
-                                                               self._cur_time_for_keep)
-            # generate the new checkpoint file and rename it.
-            cur_file = os.path.join(self._directory, cur_ckpoint_file)
-            self._last_time_for_keep = time.time()
-            self._last_triggered_step = cb_params.cur_step_num
-            if context.get_context("enable_ge"):
-                set_cur_net(cb_params.train_network)
-                cb_params.train_network.exec_checkpoint_graph()
-            network = self._config.saved_network if self._config.saved_network is not None \
-                else cb_params.train_network
-            save_checkpoint(network, cur_file, self._config.integrated_save,
-                            self._config.async_save)
-            self._latest_ckpt_file_name = cur_file
-            if self.run_modelart and (self.device_num == 1 or self.device_id == 0):
-                import moxing as mox
-                mox.file.copy_parallel(src_url=cur_file, dst_url=os.path.join(self.args.train_url, cur_ckpoint_file))
-
-    def _flush_from_cache(self, cb_params):
-        """Flush cache data to host if tensor is cache enable."""
-        has_cache_params = False
-        params = cb_params.train_network.get_parameters()
-        for param in params:
-            if param.cache_enable:
-                has_cache_params = True
-                Tensor(param).flush_from_cache()
-        if not has_cache_params:
-            self._need_flush_from_cache = False
-
-    @property
-    def latest_ckpt_file_name(self):
-        """Return the latest checkpoint path and file name."""
-        return self._latest_ckpt_file_name
+        super()._save_ckpt(cb_params, force_to_save)
+        if self.run_modelart and (self.device_num == 1 or self.device_id == 0):
+            import moxing as mox
+            mox.file.copy_parallel(src_url=cur_file, dst_url=os.path.join(self.args.train_url, cur_ckpoint_file))
 
 
 class LossCallBack(Callback):
diff --git a/model_zoo/research/cv/ntsnet/train.py b/model_zoo/research/cv/ntsnet/train.py
index 117dc7e00a9..87af3d5d9c2 100644
--- a/model_zoo/research/cv/ntsnet/train.py
+++ b/model_zoo/research/cv/ntsnet/train.py
@@ -24,7 +24,7 @@ from mindspore.communication.management import init, get_rank, get_group_size
 from src.config import config
 from src.dataset import create_dataset_train
 from src.lr_generator import get_lr
-from src.network import NTS_NET, WithLossCell, LossCallBack, ModelCheckpoint
+from src.network import NTS_NET, WithLossCell, LossCallBack, NtsnetModelCheckpoint
 
 parser = argparse.ArgumentParser(description='ntsnet train running')
 parser.add_argument("--run_modelart", type=ast.literal_eval, default=False, help="Run on modelArt, default is false.")
@@ -113,8 +113,9 @@ if __name__ == '__main__':
                                       keep_checkpoint_max=config.keep_checkpoint_max)
         save_checkpoint_path = os.path.join(local_output_url, "ckpt_" + str(rank) + "/")
 
-        ckpoint_cb = ModelCheckpoint(prefix=config.prefix, directory=save_checkpoint_path, ckconfig=ckptconfig,
-                                     device_num=device_num, device_id=device_id, args=args, run_modelart=run_modelart)
+        ckpoint_cb = NtsnetModelCheckpoint(prefix=config.prefix, directory=save_checkpoint_path, ckconfig=ckptconfig,
+                                           device_num=device_num, device_id=device_id, args=args,
+                                           run_modelart=run_modelart)
         cb += [ckpoint_cb]
 
     model = Model(oneStepNTSNet, amp_level="O3", keep_batchnorm_fp32=False)
diff --git a/model_zoo/research/cv/resnext152_64x4d/README.md b/model_zoo/research/cv/resnext152_64x4d/README.md
index 1e099732d3f..f06051c8ba4 100644
--- a/model_zoo/research/cv/resnext152_64x4d/README.md
+++ b/model_zoo/research/cv/resnext152_64x4d/README.md
@@ -37,8 +37,8 @@ The overall network architecture of ResNeXt is show below:
 Dataset used: [imagenet](http://www.image-net.org/)
 
 - Dataset size: ~125G, 1.2W colorful images in 1000 classes
-- Train: 120G, 1.2W images
-- Test: 5G, 50000 images
+    - Train: 120G, 1.2W images
+    - Test: 5G, 50000 images
 - Data format: RGB images
 - Note: Data will be processed in src/dataset.py
 
@@ -46,19 +46,19 @@ Dataset used: [imagenet](http://www.image-net.org/)
 
 ## [Mixed Precision](#contents)
 
-The [mixed precision](https://www.mindspore.cn/tutorial/training/en/master/advanced_use/enable_mixed_precision.html) training method accelerates the deep learning neural network training process by using both the single-precision and half-precision data formats, and maintains the network precision achieved by the single-precision training at the same time. Mixed precision training can accelerate the computation process, reduce memory usage, and enable a larger model or batch size to be trained on specific hardware.
+The [mixed precision](https://www.mindspore.cn/docs/programming_guide/en/master/enable_mixed_precision.html) training method accelerates the deep learning neural network training process by using both the single-precision and half-precision data formats, and maintains the network precision achieved by the single-precision training at the same time. Mixed precision training can accelerate the computation process, reduce memory usage, and enable a larger model or batch size to be trained on specific hardware.
 
 For FP16 operators, if the input data type is FP32, the backend of MindSpore will automatically handle it with reduced precision. Users could check the reduced-precision operators by enabling INFO log and then searching ‘reduce precision’.
 
 # [Environment Requirements](#contents)
 
 - Hardware（Ascend）
-- Prepare hardware environment with Ascend processor. If you want to try Ascend, please send the [application form](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/file/other/Ascend%20Model%20Zoo%E4%BD%93%E9%AA%8C%E8%B5%84%E6%BA%90%E7%94%B3%E8%AF%B7%E8%A1%A8.docx) to ascend@huawei.com. Once approved, you can get the resources.
+    - Prepare hardware environment with Ascend  processor.
 - Framework
-- [MindSpore](https://www.mindspore.cn/install/en)
+    - [MindSpore](https://www.mindspore.cn/install/en)
 - For more information, please check the resources below：
-- [MindSpore Tutorials](https://www.mindspore.cn/tutorial/training/en/master/index.html)
-- [MindSpore Python API](https://www.mindspore.cn/doc/api_python/en/master/index.html)
+    - [MindSpore Tutorials](https://www.mindspore.cn/tutorials/en/master/index.html)
+    - [MindSpore Python API](https://www.mindspore.cn/docs/api/en/master/index.html)
 
 # [Script description](#contents)
 
@@ -145,18 +145,18 @@ or shell script:
 ```script
 Ascend:
     # distribute training example(8p)
-    sh run_distribute_train.sh RANK_TABLE_FILE DATA_PATH
+    bash run_distribute_train.sh RANK_TABLE_FILE DATA_PATH
     # standalone training
-    sh run_standalone_train.sh DEVICE_ID DATA_PATH
+    bash run_standalone_train.sh DEVICE_ID DATA_PATH
 ```
 
 #### Launch
 
 ```bash
 # distributed training example(8p) for Ascend
-sh scripts/run_distribute_train.sh RANK_TABLE_FILE /dataset/train
+bash scripts/run_distribute_train.sh RANK_TABLE_FILE DATA_PATH
 # standalone training example for Ascend
-sh scripts/run_standalone_train.sh 0 /dataset/train
+bash scripts/run_standalone_train.sh DEVICE_ID DATA_PATH
 ```
 
 You can find checkpoint file together with result in log.
@@ -175,7 +175,7 @@ or shell script:
 
 ```script
 # Evaluation
-sh run_eval.sh DEVICE_ID DATA_PATH PRETRAINED_CKPT_PATH PLATFORM
+bash run_eval.sh DEVICE_ID DATA_PATH PRETRAINED_CKPT_PATH PLATFORM
 ```
 
 PLATFORM is Ascend, default is Ascend.
@@ -184,10 +184,10 @@ PLATFORM is Ascend, default is Ascend.
 
 ```bash
 # Evaluation with checkpoint
-sh scripts/run_eval.sh 0 /opt/npu/datasets/classification/val /resnext152_100.ckpt Ascend
+bash scripts/run_eval.sh DEVICE_ID PRETRAINED_CKPT_PATH PLATFORM
 
-#Directly use the script to run
-python eval.py --data_dir /opt/npu/pvc/dataset/storage/imagenet/val/ --platform Ascend --pretrained /root/test/resnext152_64x4d/outputs_demo/best_acc_4.ckpt
+# Directly use the script to run
+python eval.py --data_dir ~/imagenet/val/ --platform Ascend --pretrained ~/best_acc_4.ckpt
 ```
 
 #### Result
@@ -213,31 +213,31 @@ python export.py --device_target [PLATFORM] --ckpt_file [CKPT_PATH] --file_forma
 
 ### Training Performance
 
-| Parameters                 | ResNeXt152                                    |      |
-| -------------------------- | --------------------------------------------- | ---- |
-| Resource                   | Ascend 910, cpu:2.60GHz 192cores, memory:755G |      |
-| uploaded Date              | 06/30/2021                                    |      |
-| MindSpore Version          | 1.2                                           |      |
-| Dataset                    | ImageNet                                      |      |
-| Training Parameters        | src/config.py                                 |      |
-| Optimizer                  | Momentum                                      |      |
-| Loss Function              | SoftmaxCrossEntropy                           |      |
-| Loss                       | 1.28923                                       |      |
-| Accuracy                   | 80.08%(TOP1)                                  |      |
-| Total time                 | 7.8 h 8ps                                     |      |
-| Checkpoint for Fine tuning | 192 M(.ckpt file)                             |      |
+| Parameters                 | ResNeXt152                                    |
+| -------------------------- | --------------------------------------------- |
+| Resource                   | Ascend 910, cpu:2.60GHz 192cores, memory:755G |
+| uploaded Date              | 06/30/2021                                    |
+| MindSpore Version          | 1.2                                           |
+| Dataset                    | ImageNet                                      |
+| Training Parameters        | src/config.py                                 |
+| Optimizer                  | Momentum                                      |
+| Loss Function              | SoftmaxCrossEntropy                           |
+| Loss                       | 1.28923                                       |
+| Accuracy                   | 80.08%(TOP1)                                  |
+| Total time                 | 7.8 h 8ps                                     |
+| Checkpoint for Fine tuning | 192 M(.ckpt file)                             |
 
 #### Inference Performance
 
-| Parameters        |      |      |                  |
-| ----------------- | ---- | ---- | ---------------- |
-| Resource          |      |      | Ascend 910       |
-| uploaded Date     |      |      | 06/20/2021       |
-| MindSpore Version |      |      | 1.2              |
-| Dataset           |      |      | ImageNet, 1.2W   |
-| batch_size        |      |      | 1                |
-| outputs           |      |      | probability      |
-| Accuracy          |      |      | acc=80.08%(TOP1) |
+| Parameters        |                  |
+| ----------------- | ---------------- |
+| Resource          | Ascend 910       |
+| uploaded Date     | 06/20/2021       |
+| MindSpore Version | 1.2              |
+| Dataset           | ImageNet, 1.2W   |
+| batch_size        | 1                |
+| outputs           | probability      |
+| Accuracy          | acc=80.08%(TOP1) |
 
 # [Description of Random Situation](#contents)
 
diff --git a/model_zoo/research/cv/resnext152_64x4d/README_CN.md b/model_zoo/research/cv/resnext152_64x4d/README_CN.md
index 28fe5d76433..b0ee1604e6e 100644
--- a/model_zoo/research/cv/resnext152_64x4d/README_CN.md
+++ b/model_zoo/research/cv/resnext152_64x4d/README_CN.md
@@ -51,19 +51,19 @@ ResNeXt整体网络架构如下：
 
 ## 混合精度
 
-采用[混合精度](https://www.mindspore.cn/tutorial/training/zh-CN/master/advanced_use/enable_mixed_precision.html)的训练方法使用支持单精度和半精度数据来提高深度学习神经网络的训练速度，同时保持单精度训练所能达到的网络精度。混合精度训练提高计算速度、减少内存使用的同时，支持在特定硬件上训练更大的模型或实现更大批次的训练。
+采用[混合精度](https://www.mindspore.cn/docs/programming_guide/zh-CN/master/enable_mixed_precision.html)的训练方法使用支持单精度和半精度数据来提高深度学习神经网络的训练速度，同时保持单精度训练所能达到的网络精度。混合精度训练提高计算速度、减少内存使用的同时，支持在特定硬件上训练更大的模型或实现更大批次的训练。
 
 以FP16算子为例，如果输入数据类型为FP32，MindSpore后台会自动降低精度来处理数据。用户可打开INFO日志，搜索“reduce precision”查看精度降低的算子。
 
 # 环境要求
 
 - 硬件（Ascend）
-    - 准备Ascend处理器搭建硬件环境。如需试用昇腾处理器，请发送[申请表](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/file/other/Ascend%20Model%20Zoo%E4%BD%93%E9%AA%8C%E8%B5%84%E6%BA%90%E7%94%B3%E8%AF%B7%E8%A1%A8.docx)至ascend@huawei.com，审核通过即可获得资源。
+    - 使用Ascend处理器来搭建硬件环境。
 - 框架
     - [MindSpore](https://www.mindspore.cn/install)
 - 如需查看详情，请参见如下资源：
-    - [MindSpore教程](https://www.mindspore.cn/tutorial/training/zh-CN/master/index.html)
-    - [MindSpore Python API](https://www.mindspore.cn/doc/api_python/zh-CN/master/index.html)
+    - [MindSpore教程](https://www.mindspore.cn/tutorials/zh-CN/master/index.html)
+    - [MindSpore Python API](https://www.mindspore.cn/docs/api/zh-CN/master/index.html)
 
 # 脚本说明
 
@@ -149,18 +149,18 @@ python train.py --data_dir ~/imagenet/train/ --platform Ascend --is_distributed
 ```shell
 Ascend:
     # 分布式训练示例（8卡）
-    sh run_distribute_train.sh RANK_TABLE_FILE DATA_PATH
+    bash run_distribute_train.sh RANK_TABLE_FILE DATA_PATH
     # 单机训练
-    sh run_standalone_train.sh DEVICE_ID DATA_PATH
+    bash run_standalone_train.sh DEVICE_ID DATA_PATH
 ```
 
 ### 样例
 
 ```shell
 # Ascend分布式训练示例（8卡）
-sh scripts/run_distribute_train.sh RANK_TABLE_FILE /dataset/train
+bash scripts/run_distribute_train.sh RANK_TABLE_FILE DATA_PATH
 # Ascend单机训练示例
-sh scripts/run_standalone_train.sh 0 /dataset/train
+bash scripts/run_standalone_train.sh DEVICE_ID DATA_PATH
 ```
 
 您可以在日志中找到检查点文件和结果。
@@ -179,7 +179,7 @@ python eval.py --data_dir ~/imagenet/val/ --platform Ascend --pretrained resnext
 
 ```shell
 # 评估
-sh run_eval.sh DEVICE_ID DATA_PATH PRETRAINED_CKPT_PATH PLATFORM
+bash run_eval.sh DEVICE_ID DATA_PATH PRETRAINED_CKPT_PATH PLATFORM
 ```
 
 PLATFORM is Ascend, default is Ascend.
@@ -188,10 +188,10 @@ PLATFORM is Ascend, default is Ascend.
 
 ```shell
 # 检查点评估
-sh scripts/run_eval.sh 0 /opt/npu/datasets/classification/val /resnext152_100.ckpt Ascend
+bash scripts/run_eval.sh DEVICE_ID PRETRAINED_CKPT_PATH PLATFORM
 
 #或者直接使用脚本运行
-python eval.py --data_dir /opt/npu/pvc/dataset/storage/imagenet/val/ --platform Ascend --pretrained /root/test/resnext152_64x4d/outputs_demo/best_acc_0.ckpt
+python eval.py --data_dir ~/imagenet/val/ --platform Ascend --pretrained ~/best_acc_0.ckpt
 ```
 
 #### 结果
@@ -217,31 +217,31 @@ python export.py --device_target [PLATFORM] --ckpt_file [CKPT_PATH] --file_forma
 
 ### 训练性能
 
-| 参数 | ResNeXt152 | |
-| -------------------------- | ---------------------------------------------------------- | ------------------------- |
-| 资源                   | Ascend 910；CPU：2.60GHz，192核；内存：755GB              |           |
-| 上传日期              | 2021-6-30                                          |       |
-| MindSpore版本          | 1.2                                                    |                      |
-| 数据集 | ImageNet |  |
-| 训练参数        | src/config.py                                           |           |
-| 优化器                  | Momentum                                                        |                  |
-| 损失函数             | Softmax交叉熵 |  |
-| 损失                       | 1.2892 |  |
-| 准确率 | 80.08%(TOP1)                                          |      |
-| 总时长                 | 7.8小时 （8卡） |  |
-| 调优检查点 | 192 M（.ckpt文件） |      |
+| 参数 | ResNeXt152 |
+| -------------------------- | ---------------------------------------------------------- |
+| 资源                   | Ascend 910；CPU：2.60GHz，192核；内存：755GB              |
+| 上传日期              | 2021-6-30                                          |
+| MindSpore版本          | 1.2                                                    |
+| 数据集 | ImageNet |
+| 训练参数        | src/config.py                                           |
+| 优化器                  | Momentum                                                        |
+| 损失函数             | Softmax交叉熵 |
+| 损失                       | 1.2892 |
+| 准确率 | 80.08%(TOP1)                                          |
+| 总时长                 | 7.8小时 （8卡） |
+| 调优检查点 | 192 M（.ckpt文件） |
 
 #### 推理性能
 
-| 参数                 |                               |                           |                      |
-| -------------------------- | ----------------------------- | ------------------------- | -------------------- |
-| 资源                   |                     |  | Ascend 910          |
-| 上传日期              |                                            |    | 2021-6-20 |
-| MindSpore版本         |      |                      | 1.2             |
-| 数据集 |      |      | ImageNet， 1.2万 |
-| batch_size                 |      |      | 1                    |
-| 输出 |      |      | 概率 |
-| 准确率 |               |           | acc=80.08%(TOP1) |
+| 参数                 |                      |
+| -------------------------- | -------------------- |
+| 资源                   | Ascend 910          |
+| 上传日期              | 2021-6-20 |
+| MindSpore版本         | 1.2             |
+| 数据集 | ImageNet， 1.2万 |
+| batch_size                 | 1                    |
+| 输出 | 概率 |
+| 准确率 | acc=80.08%(TOP1) |
 
 # 随机情况说明
 
diff --git a/model_zoo/research/cv/resnext152_64x4d/scripts/run_distribute_train.sh b/model_zoo/research/cv/resnext152_64x4d/scripts/run_distribute_train.sh
index 2cfc0045d1e..e0b10e8f0b1 100644
--- a/model_zoo/research/cv/resnext152_64x4d/scripts/run_distribute_train.sh
+++ b/model_zoo/research/cv/resnext152_64x4d/scripts/run_distribute_train.sh
@@ -52,6 +52,7 @@ do
     --is_distribute=1 \
     --device_id=$DEVICE_ID \
     --pretrained=$PATH_CHECKPOINT \
-    --data_dir=$DATA_DIR > log_less.txt 2>&1 &
+    --data_dir=$DATA_DIR \
+    --run_eval=False > log_less.txt 2>&1 &
     cd ../
 done
diff --git a/model_zoo/research/cv/resnext152_64x4d/scripts/run_standalone_train.sh b/model_zoo/research/cv/resnext152_64x4d/scripts/run_standalone_train.sh
index 6f96801064f..07cb60cfe6d 100644
--- a/model_zoo/research/cv/resnext152_64x4d/scripts/run_standalone_train.sh
+++ b/model_zoo/research/cv/resnext152_64x4d/scripts/run_standalone_train.sh
@@ -26,5 +26,6 @@ python train.py  \
     --is_distribute=0 \
     --device_id=$DEVICE_ID \
     --pretrained=$PATH_CHECKPOINT \
-    --data_dir=$DATA_DIR > log.txt 2>&1 &
+    --data_dir=$DATA_DIR \
+    --run_eval=False > log.txt 2>&1 &
 
diff --git a/model_zoo/research/cv/resnext152_64x4d/train.py b/model_zoo/research/cv/resnext152_64x4d/train.py
index 6e8436e7aef..90586184fd6 100644
--- a/model_zoo/research/cv/resnext152_64x4d/train.py
+++ b/model_zoo/research/cv/resnext152_64x4d/train.py
@@ -146,7 +146,7 @@ def parse_args(cloud_args=None):
     #dataset of eval dataset
     parser.add_argument('--eval_data_dir',
                         type=str,
-                        default='/opt/npu/pvc/dataset/storage/imagenet/val',
+                        default='',
                         help='eval data dir')
     parser.add_argument('--eval_per_batch_size',
                         default=32,
@@ -289,9 +289,6 @@ def train(cloud_args=None):
     # checkpoint save
     progress_cb = ProgressMonitor(args)
     callbacks = [progress_cb,]
-    #eval dataset
-    if args.eval_data_dir is None or (not os.path.isdir(args.eval_data_dir)):
-        raise ValueError("{} is not a existing path.".format(args.eval_data_dir))
     #code like eval.py
     #if run eval
     if args.run_eval:
diff --git a/model_zoo/research/cv/retinanet_resnet101/README_CN.md b/model_zoo/research/cv/retinanet_resnet101/README_CN.md
index c5efe8f3b27..617861582bd 100644
--- a/model_zoo/research/cv/retinanet_resnet101/README_CN.md
+++ b/model_zoo/research/cv/retinanet_resnet101/README_CN.md
@@ -313,3 +313,9 @@ mAP: 0.3710347196613514
 # [ModelZoo 主页](#内容)
 
 请核对官方 [主页](https://gitee.com/mindspore/mindspore/tree/master/model_zoo).
+
+# FAQ
+
+优先参考[ModelZoo FAQ](https://gitee.com/mindspore/mindspore/tree/master/model_zoo#FAQ)来查找一些常见的公共问题。
+
+- **Q: 使用PYNATIVE_MODE发生内存溢出怎么办？** **A**：内存溢出通常是因为PYNATIVE_MODE需要更多的内存， 将batch size设置为16降低内存消耗，可进行网络训练。
diff --git a/model_zoo/research/cv/simple_baselines/README.md b/model_zoo/research/cv/simple_baselines/README.md
deleted file mode 100644
index fe453b8027c..00000000000
--- a/model_zoo/research/cv/simple_baselines/README.md
+++ /dev/null
@@ -1,263 +0,0 @@
-# 目录
-
-<!-- TOC -->
-
-- [simple_baselines描述](#simple_baselines描述)
-- [模型架构](#模型架构)
-- [数据集](#数据集)
-- [特性](#特性)
-    - [混合精度](#混合精度)
-- [环境要求](#环境要求)
-- [快速入门](#快速入门)
-- [脚本说明](#脚本说明)
-    - [脚本及样例代码](#脚本及样例代码)
-    - [脚本参数](#脚本参数)
-    - [训练过程](#训练过程)
-    - [评估过程](#评估过程)
-- [模型描述](#模型描述)
-    - [性能](#性能)
-        - [评估性能](#评估性能)
-- [随机情况说明](#随机情况说明)
-- [ModelZoo主页](#ModelZoo主页)
-
-<!-- /TOC -->
-
-# simple baselines描述
-
-## 概述
-
-simple_baselines模型网络由微软亚洲研究院Bin Xiao等人提出，作者认为当前流行的人体姿态估计和追踪方法都过于复杂，已有的关于人体姿势估计和姿势追踪模型在结构上看似差异较大，但在性能方面确又接近。作者提出了一种简单有效的基线方法，通过在主干网络ResNet上添加反卷积层，这恰恰是从高和低分辨率特征图中估计热图的最简单方法，从而有助于激发和评估该领域的新想法。
-
-simple_baselines模型网络具体细节可参考[论文1](https://arxiv.org/pdf/1804.06208.pdf)，simple_baselines模型网络Mindspore实现基于原微软亚洲研究院发布的Pytorch版本实现，具体可参考(<https://github.com/microsoft/human-pose-estimation.pytorch>)。
-
-## 论文
-
-1. [论文](https://arxiv.org/pdf/1804.06208.pdf)：Bin Xiao, Haiping Wu, Yichen Wei."Simple baselines for human pose estimation and tracking"
-
-# 模型架构
-
-simple_baselines的总体网络架构如下：
-[链接](https://arxiv.org/pdf/1804.06208.pdf)
-
-# 数据集
-
-使用的数据集：[COCO2017]
-
-- 数据集大小：
-    - 训练集：19.56G, 118,287个图像
-    - 测试集：825MB, 5,000个图像
-- 数据格式：JPG文件
-    - 注：数据在src/dataset.py中处理
-
-# 特性
-
-## 混合精度
-
-采用[混合精度](https://www.mindspore.cn/tutorial/training/en/master/advanced_use/enable_mixed_precision.html)的训练方法使用支持单精度和半精度数据来提高深度学习神经网络的训练速度，同时保持单精度训练所能达到的网络精度。混合精度训练提高计算速度、减少内存使用的同时，支持在特定硬件上训练更大的模型或实现更大批次的训练。
-以FP16算子为例，如果输入数据类型为FP32，MindSpore后台会自动降低精度来处理数据。用户可打开INFO日志，搜索“reduce precision”查看精度降低的算子。
-
-# 环境要求
-
-- 硬件(Ascend)
-    - 准备Ascend处理器搭建硬件环境。
-- 框架
-    - [MindSpore](https://www.mindspore.cn/install/en)
-- 如需查看详情，请参见如下资源：
-    - [MindSpore教程](https://www.mindspore.cn/tutorial/training/zh-CN/master/index.html)
-    - [MindSpore Python API](https://www.mindspore.cn/doc/api_python/zh-CN/master/index.html)
-
-# 快速入门
-
-通过官方网站安装MindSpore后，您可以按照如下步骤进行训练和评估：
-
-- 预训练模型
-
-  当开始训练之前需要获取mindspore图像网络预训练模型，可通过在[official model zoo](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/resnet)中运行Resnet训练脚本来获取模型权重文件，预训练文件名称为resnet50.ckpt。
-
-- 数据集准备
-
-  simple_baselines网络模型使用COCO2017数据集用于训练和推理，数据集可通过[official website](https://cocodataset.org/)官方网站下载使用。
-
-- Ascend处理器环境运行
-
-```text
-# 分布式训练
-用法：bash run_distribute_train.sh RANK_TABLE
-
-# 单机训练
-用法：bash run_standalone_train.sh DEVICE_ID
-
-# 运行评估示例
-用法：bash run_eval.sh
-```
-
-# 脚本说明
-
-## 脚本及样例代码
-
-```shell
-
-└──simple_baselines
-  ├── README.md
-  ├── scripts
-    ├── run_distribute_train.sh            # 启动Ascend分布式训练（8卡）
-    ├── run_eval.sh                        # 启动Ascend评估
-    ├── run_standalone_train.sh            # 启动Ascend单机训练（单卡）
-  ├── src
-    ├── utils
-        ├── coco.py                        # COCO数据集评估结果
-        ├── inference.py                   # 热图关键点预测
-        ├── nms.py                         # nms
-        ├── transforms.py                  # 图像处理转换
-    ├── config.py                          # 参数配置
-    ├── dataset.py                         # 数据预处理
-    ├── network_with_loss.py               # 损失函数定义
-    └── pose_resnet.py                     # 主干网络定义
-  ├── eval.py                              # 评估网络
-  └── train.py                             # 训练网络
-```
-
-## 脚本参数
-
-在src/config.py中配置相关参数。
-
-- 配置模型相关参数：
-
-```python
-config.MODEL.INIT_WEIGHTS = True                                 # 初始化模型权重
-config.MODEL.PRETRAINED = 'resnet50.ckpt'                        # 预训练模型
-config.MODEL.NUM_JOINTS = 17                                     # 关键点数量
-config.MODEL.IMAGE_SIZE = [192, 256]                             # 图像大小
-```
-
-- 配置网络相关参数：
-
-```python
-config.NETWORK.NUM_LAYERS = 50                                   # resnet主干网络层数
-config.NETWORK.DECONV_WITH_BIAS = False                          # 网络反卷积偏差
-config.NETWORK.NUM_DECONV_LAYERS = 3                             # 网络反卷积层数
-config.NETWORK.NUM_DECONV_FILTERS = [256, 256, 256]              # 反卷积层过滤器尺寸
-config.NETWORK.NUM_DECONV_KERNELS = [4, 4, 4]                    # 反卷积层内核大小
-config.NETWORK.FINAL_CONV_KERNEL = 1                             # 最终卷积层内核大小
-config.NETWORK.HEATMAP_SIZE = [48, 64]                           # 热图尺寸
-```
-
-- 配置训练相关参数：
-
-```python
-config.TRAIN.SHUFFLE = True                                      # 训练数据随机排序
-config.TRAIN.BATCH_SIZE = 64                                     # 训练批次大小
-config.TRAIN.BEGIN_EPOCH = 0                                     # 测试数据集文件名
-config.DATASET.FLIP = True                                       # 数据集随机翻转
-config.DATASET.SCALE_FACTOR = 0.3                                # 数据集随机规模因数
-config.DATASET.ROT_FACTOR = 40                                   # 数据集随机旋转因数
-config.TRAIN.BEGIN_EPOCH = 0                                     # 初始周期数
-config.TRAIN.END_EPOCH = 140                                     # 最终周期数
-config.TRAIN.LR = 0.001                                          # 初始学习率
-config.TRAIN.LR_FACTOR = 0.1                                     # 学习率降低因子
-```
-
-- 配置验证相关参数：
-
-```python
-config.TEST.BATCH_SIZE = 32                                      # 验证批次大小
-config.TEST.FLIP_TEST = True                                     # 翻转验证
-config.TEST.USE_GT_BBOX = False                                  # 使用标注框
-```
-
-- 配置nms相关参数：
-
-```python
-config.TEST.OKS_THRE = 0.9                                       # OKS阈值
-config.TEST.IN_VIS_THRE = 0.2                                    # 可视化阈值
-config.TEST.BBOX_THRE = 1.0                                      # 候选框阈值
-config.TEST.IMAGE_THRE = 0.0                                     # 图像阈值
-config.TEST.NMS_THRE = 1.0                                       # nms阈值
-```
-
-## 训练过程
-
-### 用法
-
-#### Ascend处理器环境运行
-
-```text
-# 分布式训练
-用法：bash run_distribute_train.sh RANK_TABLE
-
-# 单机训练
-用法：bash run_standalone_train.sh DEVICE_ID
-
-# 运行评估示例
-用法：bash run_eval.sh
-```
-
-### 结果
-
-- 使用COCO2017数据集训练simple_baselines
-
-```text
-分布式训练结果（8P）
-epoch:1 step:2340, loss is 0.0008106
-epoch:2 step:2340, loss is 0.0006160
-epoch:3 step:2340, loss is 0.0006480
-epoch:4 step:2340, loss is 0.0005620
-epoch:5 step:2340, loss is 0.0005207
-...
-epoch:138 step:2340, loss is 0.0003183
-epoch:139 step:2340, loss is 0.0002866
-epoch:140 step:2340, loss is 0.0003393
-```
-
-## 评估过程
-
-### 用法
-
-#### Ascend处理器环境运行
-
-可通过改变config.py文件中的"config.TEST.MODEL_FILE"文件进行相应模型推理。
-
-```bash
-# 评估
-bash eval.sh
-```
-
-### 结果
-
-使用COCO2017数据集文件夹中val2017进行评估simple_baselines,如下所示：
-
-```text
-coco eval results saved to /cache/train_output/multi_train_poseresnet_v5_2-140_2340/keypoints_results.pkl
-AP: 0.704
-```
-
-# 模型描述
-
-## 性能
-
-### 评估性能
-
-#### COCO2017上性能参数
-
-| Parameters          | Ascend 910                   |
-| ------------------- | --------------------------- |
-| 模型版本       | simple_baselines               |
-| 资源            | Ascend 910；CPU：2.60GHz，192核；内存：755G                  |
-| 上传日期       | 2021-03-29 |
-| MindSpore版本   | 1.1.0                       |
-| 数据集             | COCO2017                    |
-| 训练参数 | epoch=140, batch_size=64   |
-| 优化器           | Adam                        |
-| 损失函数       | Mean Squared Error          |
-| 输出             | heatmap                     |
-| 输出             | heatmap                     |
-| 速度               | 1pc: 251.4 ms/step        |
-| 训练性能   | AP: 0.704          |
-
-# 随机情况说明
-
-dataset.py中设置了“create_dataset”函数内的种子，同时在model.py中使用了初始化网络权重。
-
-# ModelZoo主页
-
- 请浏览官网[主页](https://gitee.com/mindspore/mindspore/tree/master/model_zoo)。
diff --git a/model_zoo/research/cv/wideresnet/README_CN.md b/model_zoo/research/cv/wideresnet/README_CN.md
index 5bdbdcb888f..1e2cbb99b10 100644
--- a/model_zoo/research/cv/wideresnet/README_CN.md
+++ b/model_zoo/research/cv/wideresnet/README_CN.md
@@ -276,3 +276,9 @@ dataset.py中设置了“create_dataset”函数内的种子，同时还使用
 # ModelZoo主页
 
 请浏览官网[主页](https://gitee.com/mindspore/mindspore/tree/master/model_zoo)。
+
+# FAQ
+
+优先参考[ModelZoo FAQ](https://gitee.com/mindspore/mindspore/tree/master/model_zoo#FAQ)来查找一些常见的公共问题。
+
+- **Q: 使用PYNATIVE_MODE发生内存溢出怎么办？** **A**：内存溢出通常是因为PYNATIVE_MODE需要更多的内存， 将batch size设置为16降低内存消耗，可进行网络训练。
diff --git a/model_zoo/research/nlp/seq2seq/README_CN.md b/model_zoo/research/nlp/seq2seq/README_CN.md
index 032e7f2404e..7f2a8cd7f51 100644
--- a/model_zoo/research/nlp/seq2seq/README_CN.md
+++ b/model_zoo/research/nlp/seq2seq/README_CN.md
@@ -33,7 +33,7 @@ bash wmt14_en_fr.sh
 
 ## 混合精度
 
-采用[混合精度](https://www.mindspore.cn/tutorial/training/zh-CN/master/advanced_use/enable_mixed_precision.html)的训练方法使用支持单精度和半精度数据来提高深度学习神经网络的训练速度，同时保持单精度训练所能达到的网络精度。混合精度训练提高计算速度、减少内存使用的同时，支持在特定硬件上训练更大的模型或实现更大批次的训练。
+采用[混合精度](https://www.mindspore.cn/docs/programming_guide/zh-CN/master/enable_mixed_precision.html))的训练方法使用支持单精度和半精度数据来提高深度学习神经网络的训练速度，同时保持单精度训练所能达到的网络精度。混合精度训练提高计算速度、减少内存使用的同时，支持在特定硬件上训练更大的模型或实现更大批次的训练。
 以FP16算子为例，如果输入数据类型为FP32，MindSpore后台会自动降低精度来处理数据。用户可打开INFO日志，搜索“reduce precision”查看精度降低的算子。
 
 # 环境要求
@@ -41,10 +41,10 @@ bash wmt14_en_fr.sh
 - 硬件（Ascend）
     - 使用Ascend处理器来搭建硬件环境。
 - 框架
-    - [MindSpore](https://www.mindspore.cn/install/en)
+    - [MindSpore](https://www.mindspore.cn/install/)
 - 如需查看详情，请参见如下资源：
-    - [MindSpore教程](https://www.mindspore.cn/tutorial/training/zh-CN/master/index.html)
-    - [MindSpore Python API](https://www.mindspore.cn/doc/api_python/en/master/index.html)
+    - [MindSpore教程](https://www.mindspore.cn/tutorials/zh-CN/master/index.html)
+    - [MindSpore Python API](https://www.mindspore.cn/docs/api/en/master/index.html)
 
 # 快速入门
 
diff --git a/model_zoo/research/nlp/seq2seq/config/config.json b/model_zoo/research/nlp/seq2seq/config/config.json
index 6a7d92f63c9..61c1455d13d 100644
--- a/model_zoo/research/nlp/seq2seq/config/config.json
+++ b/model_zoo/research/nlp/seq2seq/config/config.json
@@ -22,9 +22,9 @@
     "max_decode_length": 50
   },
   "loss_scale_config": {
-    "init_loss_scale": 65536,
+    "init_loss_scale": 64,
     "loss_scale_factor": 2,
-    "scale_window": 1000
+    "scale_window": 5000
   },
   "learn_rate_config": {
     "optimizer": "adam",
diff --git a/model_zoo/research/nlp/seq2seq/eval.py b/model_zoo/research/nlp/seq2seq/eval.py
index 6f10eeb5a0c..060d75c9314 100644
--- a/model_zoo/research/nlp/seq2seq/eval.py
+++ b/model_zoo/research/nlp/seq2seq/eval.py
@@ -16,10 +16,9 @@
 import os
 # os.system("pip3 install subword-nmt")
 # os.system("pip3 install sacremoses")
-
+import ast
 import argparse
 import pickle
-import moxing as mox
 from mindspore.common import dtype as mstype
 from mindspore import context
 
@@ -30,19 +29,14 @@ from src.dataset.tokenizer import Tokenizer
 
 is_modelarts = False
 
-if is_modelarts:
-    parser = argparse.ArgumentParser(description='seq2seq')
-    parser.add_argument("--config", type=str, required=True,
-                        help="model config json file path.")
-    parser.add_argument("--data_url", type=str, required=True,
-                        help="data address.")
-    parser.add_argument("--train_url", type=str, required=True,
-                        help="output address.")
-
 
 parser = argparse.ArgumentParser(description='seq2seq')
 parser.add_argument("--config", type=str, required=True,
                     help="model config json file path.")
+parser.add_argument("--data_url", type=str, default=None,
+                    help="data address.")
+parser.add_argument("--train_url", type=str, default=None,
+                    help="output address.")
 parser.add_argument("--test_dataset", type=str, required=True,
                     help="test dataset address.")
 parser.add_argument("--existed_ckpt", type=str, required=True,
@@ -57,6 +51,11 @@ parser.add_argument("--test_tgt", type=str, required=True,
 parser.add_argument("--output", type=str, required=False,
                     default="./output.npz",
                     help="result file path.")
+parser.add_argument("--is_modelarts", type=ast.literal_eval, default=False,
+                    help="running on modelarts")
+args, _ = parser.parse_known_args()
+if args.is_modelarts:
+    import moxing as mox
 
 context.set_context(
     mode=context.GRAPH_MODE,
@@ -78,11 +77,10 @@ def _check_args(config):
 
 
 if __name__ == '__main__':
-    args, _ = parser.parse_known_args()
     _check_args(args.config)
     _config = get_config(args.config)
 
-    if is_modelarts:
+    if args.is_modelarts:
         mox.file.copy_parallel(src_url=args.data_url, dst_url='/cache/dataset_menu/')
         _config.test_dataset = '/cache/dataset_menu/newstest2014.en.mindrecord'
         _config.existed_ckpt = '/cache/dataset_menu/seq2seq-7_1642.ckpt'
@@ -103,7 +101,7 @@ if __name__ == '__main__':
     scores = bleu_calculate(tokenizer, result_npy_addr, test_tgt)
     print(f"BLEU scores is :{scores}")
 
-    if is_modelarts:
+    if args.is_modelarts:
         result_npy_addr = output
         vocab = '/cache/dataset_menu/vocab.bpe.32000'
         bpe_codes = '/cache/dataset_menu/bpe.32000'
diff --git a/model_zoo/research/nlp/seq2seq/src/seq2seq_model/beam_search.py b/model_zoo/research/nlp/seq2seq/src/seq2seq_model/beam_search.py
index 7d142cec666..14ac445c7e8 100644
--- a/model_zoo/research/nlp/seq2seq/src/seq2seq_model/beam_search.py
+++ b/model_zoo/research/nlp/seq2seq/src/seq2seq_model/beam_search.py
@@ -34,7 +34,7 @@ class LengthPenalty(nn.Cell):
     def __init__(self, weight=1.0, compute_type=mstype.float32):
         super(LengthPenalty, self).__init__()
         self.weight = weight
-        self.add = P.TensorAdd()
+        self.add = P.Add()
         self.pow = P.Pow()
         self.div = P.RealDiv()
         self.five = Tensor(5.0, mstype.float32)
@@ -183,7 +183,7 @@ class BeamSearchDecoder(nn.Cell):
         self.decoder = decoder
         self.is_using_while = is_using_while
 
-        self.add = P.TensorAdd()
+        self.add = P.Add()
         self.expand = P.ExpandDims()
         self.reshape = P.Reshape()
         self.shape_flat = (-1,)
diff --git a/model_zoo/research/nlp/seq2seq/src/seq2seq_model/dynamic_rnn.py b/model_zoo/research/nlp/seq2seq/src/seq2seq_model/dynamic_rnn.py
index 9d956816109..014c40287ee 100644
--- a/model_zoo/research/nlp/seq2seq/src/seq2seq_model/dynamic_rnn.py
+++ b/model_zoo/research/nlp/seq2seq/src/seq2seq_model/dynamic_rnn.py
@@ -90,7 +90,6 @@ class DynamicRNNNet(nn.Cell):
         self.cast = P.Cast()
         self.concat = P.Concat(axis=0)
         self.get_shape = P.Shape()
-        self.print = P.Print()
         self.net = DynamicRNNCell(num_setp=seq_length,
                                   batch_size=batchsize,
                                   word_embed_dim=word_embed_dim,
diff --git a/model_zoo/research/nlp/seq2seq/src/seq2seq_model/embedding.py b/model_zoo/research/nlp/seq2seq/src/seq2seq_model/embedding.py
index 68202c18b41..a56ba2a3c89 100644
--- a/model_zoo/research/nlp/seq2seq/src/seq2seq_model/embedding.py
+++ b/model_zoo/research/nlp/seq2seq/src/seq2seq_model/embedding.py
@@ -49,7 +49,7 @@ class EmbeddingLookup(nn.Cell):
         init_weight = np.random.normal(-initializer_range, initializer_range, size=[vocab_size, embed_dim])
         self.embedding_table = Parameter(Tensor(init_weight, mstype.float32), name="embedding_table")
         self.expand = P.ExpandDims()
-        self.gather = P.GatherV2()
+        self.gather = P.Gather()
         self.one_hot = P.OneHot()
         self.on_value = Tensor(1.0, mstype.float32)
         self.off_value = Tensor(0.0, mstype.float32)
diff --git a/model_zoo/research/nlp/seq2seq/src/seq2seq_model/seq2seq_for_train.py b/model_zoo/research/nlp/seq2seq/src/seq2seq_model/seq2seq_for_train.py
index c1edff1ada2..3f6bd3b9a01 100644
--- a/model_zoo/research/nlp/seq2seq/src/seq2seq_model/seq2seq_for_train.py
+++ b/model_zoo/research/nlp/seq2seq/src/seq2seq_model/seq2seq_for_train.py
@@ -23,8 +23,7 @@ from mindspore.common.tensor import Tensor
 from mindspore import Parameter
 from mindspore.common import dtype as mstype
 from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
-from mindspore.context import ParallelMode
-from mindspore.parallel._utils import _get_device_num, _get_parallel_mode, _get_gradients_mean
+from mindspore.communication.management import get_group_size
 
 from .seq2seq import Seq2seqModel
 
@@ -32,43 +31,31 @@ from .seq2seq import Seq2seqModel
 GRADIENT_CLIP_TYPE = 1
 GRADIENT_CLIP_VALUE = 5.0
 
-class ClipGradients(nn.Cell):
+clip_grad = C.MultitypeFuncGraph("clip_grad")
+
+
+@clip_grad.register("Number", "Number", "Tensor")
+def _clip_grad(clip_type, clip_value, grad):
     """
     Clip gradients.
 
-    Args:
-        grads (list): List of gradient tuples.
-        clip_type (Tensor): The way to clip, 'value' or 'norm'.
-        clip_value (Tensor): Specifies how much to clip.
+    Inputs:
+        clip_type (int): The way to clip, 0 for 'value', 1 for 'norm'.
+        clip_value (float): Specifies how much to clip.
+        grad (tuple[Tensor]): Gradients.
 
-    Returns:
-        List, a list of clipped_grad tuples.
+    Outputs:
+        tuple[Tensor], clipped gradients.
     """
-    def __init__(self):
-        super(ClipGradients, self).__init__()
-        self.clip_by_norm = nn.ClipByNorm()
-        self.cast = P.Cast()
-        self.dtype = P.DType()
-
-    def construct(self,
-                  grads,
-                  clip_type,
-                  clip_value):
-        """Defines the gradients clip."""
-        if clip_type not in (0, 1):
-            return grads
-
-        new_grads = ()
-        for grad in grads:
-            dt = self.dtype(grad)
-            if clip_type == 0:
-                t = C.clip_by_value(grad, self.cast(F.tuple_to_array((-clip_value,)), dt),
-                                    self.cast(F.tuple_to_array((clip_value,)), dt))
-            else:
-                t = self.clip_by_norm(grad, self.cast(F.tuple_to_array((clip_value,)), dt))
-            new_grads = new_grads + (t,)
-
-        return new_grads
+    if clip_type not in (0, 1):
+        return grad
+    dt = F.dtype(grad)
+    if clip_type == 0:
+        new_grad = C.clip_by_value(grad, F.cast(F.tuple_to_array((-clip_value,)), dt),
+                                   F.cast(F.tuple_to_array((clip_value,)), dt))
+    else:
+        new_grad = nn.ClipByNorm()(grad, F.cast(F.tuple_to_array((clip_value,)), dt))
+    return new_grad
 
 class PredLogProbs(nn.Cell):
     """
@@ -238,8 +225,7 @@ grad_overflow = P.FloatStatus()
 def _tensor_grad_overflow(grad):
     return grad_overflow(grad)
 
-
-class Seq2seqTrainOneStepWithLossScaleCell(nn.Cell):
+class Seq2seqTrainOneStepWithLossScaleCell(nn.TrainOneStepWithLossScaleCell):
     """
     Encapsulation class of seq2seq network training.
 
@@ -254,48 +240,18 @@ class Seq2seqTrainOneStepWithLossScaleCell(nn.Cell):
     Returns:
         Tuple[Tensor, Tensor, Tensor], loss, overflow, sen.
     """
-
     def __init__(self, network, optimizer, scale_update_cell=None):
-
-        super(Seq2seqTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
-        self.network = network
-        self.network.set_grad()
-        self.network.add_flags(defer_inline=True)
-        self.weights = optimizer.parameters
-        self.optimizer = optimizer
-        self.grad = C.GradOperation(get_by_list=True,
-                                    sens_param=True)
-        self.reducer_flag = False
-        self.all_reduce = P.AllReduce()
-
-        self.parallel_mode = _get_parallel_mode()
-        if self.parallel_mode not in ParallelMode.MODE_LIST:
-            raise ValueError("Parallel mode does not support: ", self.parallel_mode)
-        if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
-            self.reducer_flag = True
-        self.grad_reducer = None
-        if self.reducer_flag:
-            mean = _get_gradients_mean()
-            degree = _get_device_num()
-            self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
-        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
-        self.clip_gradients = ClipGradients()
+        super(Seq2seqTrainOneStepWithLossScaleCell, self).__init__(network, optimizer, scale_update_cell)
         self.cast = P.Cast()
-        self.alloc_status = P.NPUAllocFloatStatus()
-        self.get_status = P.NPUGetFloatStatus()
-        self.clear_before_grad = P.NPUClearFloatStatus()
-        self.reduce_sum = P.ReduceSum(keep_dims=False)
-        self.base = Tensor(1, mstype.float32)
-        self.less_equal = P.LessEqual()
-        self.hyper_map = C.HyperMap()
+        self.degree = 1
+        if self.reducer_flag:
+            self.degree = get_group_size()
+            self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
 
         self.loss_scale = None
         self.loss_scaling_manager = scale_update_cell
         if scale_update_cell:
-            self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(),
-                                               dtype=mstype.float32), name="loss_scale")
-        self.add_flags(has_effect=True)
-
+            self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32))
 
     def construct(self,
                   source_eos_ids,
@@ -330,14 +286,13 @@ class Seq2seqTrainOneStepWithLossScaleCell(nn.Cell):
                             target_ids,
                             label_ids,
                             label_weights)
-        # Alloc status.
-        init = self.alloc_status()
-        # Clear overflow buffer.
-        self.clear_before_grad(init)
         if sens is None:
             scaling_sens = self.loss_scale
         else:
             scaling_sens = sens
+
+        status, scaling_sens = self.start_overflow_check(loss, scaling_sens)
+
         grads = self.grad(self.network, weights)(source_ids,
                                                  source_mask,
                                                  target_ids,
@@ -345,26 +300,18 @@ class Seq2seqTrainOneStepWithLossScaleCell(nn.Cell):
                                                  label_weights,
                                                  self.cast(scaling_sens,
                                                            mstype.float32))
+        # apply grad reducer on grads
+        grads = self.grad_reducer(grads)
+        grads = self.hyper_map(F.partial(grad_scale, scaling_sens * self.degree), grads)
+        grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
 
-        grads = self.hyper_map(F.partial(grad_scale, scaling_sens), grads)
-        grads = self.clip_gradients(grads, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE)
-        if self.reducer_flag:
-            # Apply grad reducer on grads.
-            grads = self.grad_reducer(grads)
-        self.get_status(init)
-        flag_sum = self.reduce_sum(init, (0,))
-
-        if self.is_distributed:
-            # Sum overflow flag over devices.
-            flag_reduce = self.all_reduce(flag_sum)
-            cond = self.less_equal(self.base, flag_reduce)
-        else:
-            cond = self.less_equal(self.base, flag_sum)
-
+        cond = self.get_overflow_status(status, grads)
         overflow = cond
         if sens is None:
             overflow = self.loss_scaling_manager(self.loss_scale, cond)
-        if not overflow:
-            self.optimizer(grads)
-
-        return (loss, cond, scaling_sens)
+        if overflow:
+            succ = False
+        else:
+            succ = self.optimizer(grads)
+        ret = (loss, cond, scaling_sens)
+        return F.depend(ret, succ)
diff --git a/model_zoo/research/nlp/seq2seq/train.py b/model_zoo/research/nlp/seq2seq/train.py
index 23c9e0fb9d8..ef39d22e8fe 100644
--- a/model_zoo/research/nlp/seq2seq/train.py
+++ b/model_zoo/research/nlp/seq2seq/train.py
@@ -25,7 +25,7 @@ from mindspore.nn.optim import Lamb
 from mindspore.train.model import Model
 from mindspore.train.loss_scale_manager import DynamicLossScaleManager
 from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor
-from mindspore.train.callback import LossMonitor, SummaryCollector
+from mindspore.train.callback import LossMonitor
 from mindspore import context, Parameter
 from mindspore.context import ParallelMode
 from mindspore.communication import management as MultiAscend
@@ -52,7 +52,7 @@ if args.is_modelarts:
     import moxing as mox
 context.set_context(
     mode=context.GRAPH_MODE,
-    save_graphs=True,
+    save_graphs=False,
     device_target="Ascend",
     reserve_class_name_in_scope=True)
 
@@ -221,12 +221,12 @@ def _build_training_pipeline(config: Seq2seqConfig,
     loss_monitor = LossCallBack(config)
     dataset_size = dataset.get_dataset_size()
     time_cb = TimeMonitor(data_size=dataset_size)
-    ckpt_config = CheckpointConfig(save_checkpoint_steps=config.save_ckpt_steps,
+    ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset.get_dataset_size(),
                                    keep_checkpoint_max=config.keep_ckpt_max)
 
     rank_size = os.getenv('RANK_SIZE')
     callbacks = [time_cb, loss_monitor]
-    callbacks.append(LossMonitor(1642))
+    callbacks.append(LossMonitor())
 
     if rank_size is not None and int(rank_size) > 1 and MultiAscend.get_rank() % 8 == 0:
         ckpt_callback = ModelCheckpoint(
@@ -234,8 +234,6 @@ def _build_training_pipeline(config: Seq2seqConfig,
             directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
             config=ckpt_config)
         callbacks.append(ckpt_callback)
-        summary_callback = SummaryCollector(summary_dir="./summary", collect_freq=50)
-        callbacks.append(summary_callback)
 
     if rank_size is None or int(rank_size) == 1:
         ckpt_callback = ModelCheckpoint(
@@ -243,8 +241,6 @@ def _build_training_pipeline(config: Seq2seqConfig,
             directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
             config=ckpt_config)
         callbacks.append(ckpt_callback)
-        summary_callback = SummaryCollector(summary_dir="./summary", collect_freq=50)
-        callbacks.append(summary_callback)
 
     print(f" | ALL SET, PREPARE TO TRAIN.")
     _train(model=model, config=config,
diff --git a/model_zoo/research/recommend/Fat-DeepFFM/src/fat_deepffm.py b/model_zoo/research/recommend/Fat-DeepFFM/src/fat_deepffm.py
index 715c02ff1bf..3de30f1a3b3 100644
--- a/model_zoo/research/recommend/Fat-DeepFFM/src/fat_deepffm.py
+++ b/model_zoo/research/recommend/Fat-DeepFFM/src/fat_deepffm.py
@@ -21,6 +21,7 @@ from mindspore.common.initializer import initializer
 
 import mindspore.ops as P
 from mindspore.ops import composite as C
+from mindspore.ops import functional as F
 
 from mindspore import Parameter, ParameterTuple
 from mindspore import Tensor
@@ -350,8 +351,7 @@ class TrainStepWrap(nn.Cell):
         grads = self.grad(self.network, weights)(cats_vals, num_vals, label, sens)
         if self.reducer_flag:
             grads = self.grad_reducer(grads)
-        self.optimizer(grads)
-        return loss
+        return F.depend(loss, self.optimizer(grads))
 
 
 class ModelBuilder:
diff --git a/model_zoo/research/recommend/Fat-DeepFFM/src/preprocess_data.py b/model_zoo/research/recommend/Fat-DeepFFM/src/preprocess_data.py
index 8bed00d339b..70ef11d76be 100644
--- a/model_zoo/research/recommend/Fat-DeepFFM/src/preprocess_data.py
+++ b/model_zoo/research/recommend/Fat-DeepFFM/src/preprocess_data.py
@@ -176,8 +176,8 @@ def random_split_trans2mindrecord(input_file_path, output_file_path, recommendat
     dense_list = []
     label_list = []
 
-    writer_train = FileWriter(os.path.join(output_file_path, "train_input_part.mindrecord"), 1)
-    writer_test = FileWriter(os.path.join(output_file_path, "test_input_part.mindrecord"), 1)
+    writer_train = FileWriter(os.path.join(output_file_path, "train_input_part.mindrecord"), 21)
+    writer_test = FileWriter(os.path.join(output_file_path, "test_input_part.mindrecord"), 3)
 
     schema = {"label": {"type": "float32", "shape": [-1]}, "num_vals": {"type": "float32", "shape": [-1]},
               "cats_vals": {"type": "int32", "shape": [-1]}}
diff --git a/model_zoo/research/recommend/autodis/src/autodis.py b/model_zoo/research/recommend/autodis/src/autodis.py
index 57c775d8f57..17289864006 100644
--- a/model_zoo/research/recommend/autodis/src/autodis.py
+++ b/model_zoo/research/recommend/autodis/src/autodis.py
@@ -19,6 +19,7 @@ import numpy as np
 from sklearn.metrics import roc_auc_score
 import mindspore.common.dtype as mstype
 from mindspore.ops import composite as C
+from mindspore.ops import functional as F
 from mindspore.ops import operations as P
 from mindspore.nn import Dropout
 from mindspore.nn.optim import Adam
@@ -332,8 +333,7 @@ class TrainStepWrap(nn.Cell):
         loss = self.network(batch_ids, batch_wts, label)
         sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens) #
         grads = self.grad(self.network, weights)(batch_ids, batch_wts, label, sens)
-        self.optimizer(grads)
-        return loss
+        return F.depend(loss, self.optimizer(grads))
 
 
 class PredictWithSigmoid(nn.Cell):
diff --git a/tests/st/auto_monad/test_auto_monad.py b/tests/st/auto_monad/test_auto_monad.py
index d79b5ba7eff..9acf136eeb0 100644
--- a/tests/st/auto_monad/test_auto_monad.py
+++ b/tests/st/auto_monad/test_auto_monad.py
@@ -21,7 +21,7 @@ import mindspore as ms
 import mindspore.ops.operations as P
 import mindspore.nn as nn
 from mindspore.nn import Cell
-from mindspore.nn import ReLU, BatchNorm2d, Conv2d, Dense, PReLU, ParameterUpdate
+from mindspore.nn import ReLU, BatchNorm2d, Conv2d, ParameterUpdate
 from mindspore.nn import Momentum, SoftmaxCrossEntropyWithLogits
 from mindspore import context, Tensor
 from mindspore.common.parameter import Parameter
@@ -1042,7 +1042,7 @@ def test_variable_from_outer_graph():
     np.testing.assert_array_equal(out.asnumpy(), expect.asnumpy())
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_arm_ascend_training
 @pytest.mark.platform_x86_ascend_training
 @pytest.mark.env_onecard
@@ -1079,7 +1079,7 @@ def test_ctrl_while_by_while_and_if_in_first_while():
     net(input_me_a)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_arm_ascend_training
 @pytest.mark.platform_x86_ascend_training
 @pytest.mark.env_onecard
@@ -1214,33 +1214,12 @@ def find_newest_validateir_file(folder_path):
 
 
 def read_file():
-    filename = find_newest_validateir_file('./')
+    filename = find_newest_validateir_file('./rank_0/ir_dump')
     with open((os.path.join(filename)), 'r') as f:
         content = f.read()
     return content
 
 
-# Net contain Prelu,BN,Conv,Dense which have weight value
-class NetRrelu(Cell):
-    def __init__(self, in_channel, out_channel):
-        super().__init__()
-        self.relu = PReLU(channel=in_channel, w=0.25)
-        self.bn = BatchNorm2d(num_features=in_channel)
-        self.conv = Conv2d(in_channels=in_channel, out_channels=out_channel, kernel_size=2, stride=1, has_bias=False,
-                           weight_init='ones', pad_mode='same')
-        self.mean = P.ReduceMean(keep_dims=False)
-        self.fc = Dense(in_channels=out_channel, out_channels=out_channel,
-                        weight_init='ones', bias_init='zeros', has_bias=True)
-
-    def construct(self, x):
-        x = self.relu(x)
-        x = self.bn(x)
-        x = self.conv(x)
-        x = self.mean(x, (2, 3))
-        x = self.fc(x)
-        return x
-
-
 def check_keep_batchnorm_fp32_false(kwargs, level):
     if ms.context.get_context("device_target") == "GPU":
         if level == "O2":
@@ -1274,13 +1253,6 @@ def use_build_train_network_check_cast_num(network, level, inputs, label, cast_n
     return out_me
 
 
-def test_auto_mixed_precision_train_prelunet(with_save_graphs):
-    net2 = NetRrelu(3, 12)
-    input32 = Tensor(np.ones([1, 3, 2, 2]).astype(np.float32))
-    label32 = Tensor(np.zeros([1, 12]).astype(np.float32))
-    use_build_train_network_check_cast_num(net2, "O2", input32, label32, 16)
-
-
 class AssignNet(Cell):
     def __init__(self):
         super().__init__()
diff --git a/tests/st/auto_monad/test_auto_monad_gpu.py b/tests/st/auto_monad/test_auto_monad_gpu.py
index 685d686128a..e61da3048ab 100644
--- a/tests/st/auto_monad/test_auto_monad_gpu.py
+++ b/tests/st/auto_monad/test_auto_monad_gpu.py
@@ -136,7 +136,7 @@ def test_side_effect_castall():
     inputs1 = np.random.randn(5, 5)
     inputs2 = np.random.randn(5, 5)
     net(Tensor(inputs1, ms.float32), Tensor(inputs2, ms.float32))
-    result = find_files('hwopt*cast_all*.ir', 'CastAll')
+    result = find_files('./rank_0/ir_dump/hwopt*cast_all*.ir', 'CastAll')
     assert result == '2'
 
 
@@ -226,7 +226,7 @@ class SideEffectTwoAssignTwoAddnDependencyNet(Cell):
         return grad_out
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
 def test_ctrl_while_by_while_and_if_in_first_while():
@@ -262,7 +262,7 @@ def test_ctrl_while_by_while_and_if_in_first_while():
     net(input_me_a)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
 def test_ctrl_while_by_while_and_while_in_first_while():
@@ -348,9 +348,9 @@ def test_ir_fusion_inplace_bn_conv_conv():
                                   keep_batchnorm_fp32=False)
     net.set_train()
     net(Tensor(input_np), Tensor(label))
-    find_accum = find_files("hwopt*cudnn_inplace*ir",
+    find_accum = find_files("./rank_0/ir_dump/hwopt*cudnn_inplace*ir",
                             "inplace_algo: accumulation")
-    find_cover = find_files("hwopt*cudnn_inplace*ir",
+    find_cover = find_files("./rank_0/ir_dump/hwopt*cudnn_inplace*ir",
                             "inplace_algo: cover")
     assert find_accum == '1'
     assert find_cover == '1'
@@ -372,7 +372,7 @@ def find_newest_validateir_file(folder_path):
 
 
 def read_file():
-    filename = find_newest_validateir_file('./')
+    filename = find_newest_validateir_file('./rank_0/ir_dump/')
     with open((os.path.join(filename)), 'r') as f:
         content = f.read()
     clean_all_ir_files('./')
diff --git a/tests/st/auto_monad/test_auto_monad_mindtester.py b/tests/st/auto_monad/test_auto_monad_mindtester.py
index 8dc7af94920..a5a4857d98c 100644
--- a/tests/st/auto_monad/test_auto_monad_mindtester.py
+++ b/tests/st/auto_monad/test_auto_monad_mindtester.py
@@ -507,7 +507,7 @@ class SideEffectControlFlowAssignDependTwoIfNet(Cell):
         return grad_out
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_arm_ascend_training
 @pytest.mark.platform_x86_ascend_training
 @pytest.mark.env_onecard
diff --git a/tests/st/auto_monad/test_auto_monad_momentum_loss.py b/tests/st/auto_monad/test_auto_monad_momentum_loss.py
index e86a8f2590a..1f0ef4301be 100644
--- a/tests/st/auto_monad/test_auto_monad_momentum_loss.py
+++ b/tests/st/auto_monad/test_auto_monad_momentum_loss.py
@@ -61,7 +61,7 @@ class MSELoss(Cell):
         return self.reduce_mean(self.square(diff), get_axis(diff))
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_arm_ascend_training
 @pytest.mark.platform_x86_ascend_training
 @pytest.mark.env_onecard
diff --git a/tests/st/control/inner/test_000_single_if.py b/tests/st/control/inner/test_000_single_if.py
index 0b172f6a7c2..ea47677e061 100644
--- a/tests/st/control/inner/test_000_single_if.py
+++ b/tests/st/control/inner/test_000_single_if.py
@@ -12,13 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
+import pytest
 from mindspore import context
 from mindspore import Tensor, nn
 from mindspore.ops import composite as C
 from mindspore.common import dtype as mstype
 
 grad_all = C.GradOperation(get_all=True)
-context.set_context(device_target="Ascend")
 
 
 class SingleIfNet(nn.Cell):
@@ -62,26 +62,38 @@ def control_flow_single_if(input_net, x, y):
     context.set_context(mode=context.GRAPH_MODE)
     net = input_net()
     grad_net = GradNet(net)
-    graph_forward_res = net(x, y)
+
+    forward_net = input_net()
+    graph_forward_res = forward_net(x, y)
     graph_backward_res = grad_net(x, y)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     net = input_net()
     grad_net = GradNet(net)
-    pynative_forward_res = net(x, y)
+
+    forward_net = input_net()
+    pynative_forward_res = forward_net(x, y)
     pynative_backward_res = grad_net(x, y)
 
     assert graph_forward_res == pynative_forward_res
     assert graph_backward_res == pynative_backward_res
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_single_if():
     x = Tensor(2, mstype.int32)
     y = Tensor(5, mstype.int32)
     control_flow_single_if(SingleIfNet, x, y)
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_single_if_01():
     x = Tensor(2, mstype.int32)
     y = Tensor(5, mstype.int32)
diff --git a/tests/st/control/inner/test_001_single_while.py b/tests/st/control/inner/test_001_single_while.py
index 1f626f45e95..5f669a18c1d 100644
--- a/tests/st/control/inner/test_001_single_while.py
+++ b/tests/st/control/inner/test_001_single_while.py
@@ -12,13 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
+import pytest
 from mindspore.common import dtype as mstype
 from mindspore import nn
 from mindspore import Tensor
 from mindspore.ops import composite as C
 from mindspore import context
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=True, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=True)
 
 
 class ForwardNet(nn.Cell):
@@ -41,7 +42,11 @@ class BackwardNet(nn.Cell):
         grads = self.grad(self.forward_net)(*inputs)
         return grads
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward():
     c1 = Tensor([0], mstype.int32)
     c2 = Tensor([0], mstype.int32)
@@ -50,7 +55,11 @@ def test_forward():
     output = forward_net(c1, c2)
     assert expect == output
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_backward():
     c1 = Tensor([0], mstype.int32)
     c2 = Tensor([0], mstype.int32)
diff --git a/tests/st/control/inner/test_002_single_for.py b/tests/st/control/inner/test_002_single_for.py
index 2f8a49e92a1..ca4f7762119 100644
--- a/tests/st/control/inner/test_002_single_for.py
+++ b/tests/st/control/inner/test_002_single_for.py
@@ -19,11 +19,17 @@ from mindspore import Tensor, nn
 from mindspore.common.parameter import Parameter
 from mindspore.ops import composite as C
 from mindspore.ops import operations as P
+from mindspore.ops import functional as F
 from mindspore.common import dtype as mstype
 
 grad_all = C.GradOperation(get_all=True)
-context.set_context(device_target="Ascend")
 
+
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_single_for_01():
     class SingleForNet(nn.Cell):
         def __init__(self):
@@ -52,22 +58,31 @@ def test_single_for_01():
 
     # graph mode
     context.set_context(mode=context.GRAPH_MODE)
+    for_net_foward = SingleForNet()
+    graph_forward_res = for_net_foward(x, y, z)
+
     for_net = SingleForNet()
     net = GradNet(for_net)
-    graph_forward_res = for_net(x, y, z)
     graph_backward_res = net(x, y, z)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
+    for_net_foward = SingleForNet()
+    pynative_forward_res = for_net_foward(x, y, z)
+
     for_net = SingleForNet()
     net = GradNet(for_net)
-    pynative_forward_res = for_net(x, y, z)
     pynative_backward_res = net(x, y, z)
 
     assert graph_forward_res == pynative_forward_res
     assert graph_backward_res == pynative_backward_res
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_single_for_02():
     class SingleForNet(nn.Cell):
         def __init__(self):
@@ -98,20 +113,29 @@ def test_single_for_02():
     context.set_context(mode=context.GRAPH_MODE)
     for_net = SingleForNet()
     net = GradNet(for_net)
-    graph_forward_res = for_net(x, y, z)
+
+    for_net_forward = SingleForNet()
+    graph_forward_res = for_net_forward(x, y, z)
     graph_backward_res = net(x, y, z)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     for_net = SingleForNet()
     net = GradNet(for_net)
-    pynative_forward_res = for_net(x, y, z)
+
+    for_net_forward = SingleForNet()
+    pynative_forward_res = for_net_forward(x, y, z)
     pynative_backward_res = net(x, y, z)
 
     assert graph_forward_res == pynative_forward_res
     assert graph_backward_res == pynative_backward_res
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_single_for_03():
     class SingleForNet(nn.Cell):
         def __init__(self):
@@ -153,20 +177,29 @@ def test_single_for_03():
     context.set_context(mode=context.GRAPH_MODE)
     single_for_net = SingleForNet()
     net = GradNet(single_for_net)
-    graph_forward_res = single_for_net(x, y)
+
+    for_net_forward = SingleForNet()
+    graph_forward_res = for_net_forward(x, y)
     graph_backward_res = net(x, y)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     single_for_net = SingleForNet()
     net = GradNet(single_for_net)
-    pynative_forward_res = single_for_net(x, y)
+
+    for_net_forward = SingleForNet()
+    pynative_forward_res = for_net_forward(x, y)
     pynative_backward_res = net(x, y)
 
     assert graph_forward_res == pynative_forward_res
     assert graph_backward_res == pynative_backward_res
 
-@pytest.mark.skip(reason="not supported side effect")
+
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_single_for_04():
     class SingleForNet(nn.Cell):
         def __init__(self):
@@ -183,7 +216,7 @@ def test_single_for_04():
         def construct(self, x):
             self.assign(self.param_a, x + self.param_a)
             for _ in range(1):
-                self.param_b = x - self.param_a
+                F.assign(self.param_b, x - self.param_a)
             return self.param_b
 
     class GradNet(nn.Cell):
@@ -200,20 +233,29 @@ def test_single_for_04():
     context.set_context(mode=context.GRAPH_MODE)
     single_for_net = SingleForNet()
     net = GradNet(single_for_net)
-    graph_forward_res = single_for_net(x)
+
+    for_net_forward = SingleForNet()
+    graph_forward_res = for_net_forward(x)
     graph_backward_res = net(x)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     single_for_net = SingleForNet()
     net = GradNet(single_for_net)
-    pynative_forward_res = single_for_net(x)
+
+    for_net_forward = SingleForNet()
+    pynative_forward_res = for_net_forward(x)
     pynative_backward_res = net(x)
 
     assert graph_forward_res == pynative_forward_res
     assert graph_backward_res == pynative_backward_res
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_single_for_05():
     class SingleForNet(nn.Cell):
         def __init__(self):
@@ -245,14 +287,18 @@ def test_single_for_05():
     context.set_context(mode=context.GRAPH_MODE)
     single_for_net = SingleForNet()
     net = GradNet(single_for_net)
-    graph_forward_res = single_for_net(x)
+
+    for_net_forward = SingleForNet()
+    graph_forward_res = for_net_forward(x)
     graph_backward_res = net(x)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     single_for_net = SingleForNet()
     net = GradNet(single_for_net)
-    pynative_forward_res = single_for_net(x)
+
+    for_net_forward = SingleForNet()
+    pynative_forward_res = for_net_forward(x)
     pynative_backward_res = net(x)
 
     assert graph_forward_res == pynative_forward_res
diff --git a/tests/st/control/inner/test_010_if_in_if.py b/tests/st/control/inner/test_010_if_in_if.py
index a4fc529581b..ae94b9e020b 100644
--- a/tests/st/control/inner/test_010_if_in_if.py
+++ b/tests/st/control/inner/test_010_if_in_if.py
@@ -20,7 +20,6 @@ from mindspore.common import dtype as mstype
 from mindspore.common.parameter import Parameter
 
 grad_all = C.GradOperation(get_all=True)
-context.set_context(device_target="Ascend")
 
 
 class IfInIfNet(nn.Cell):
@@ -111,6 +110,23 @@ class IfInIfNet3(nn.Cell):
         return x
 
 
+# add a while to test if_in_if run with vm.Only should run in ascend.
+class IfInIfNet4(nn.Cell):
+    def __init__(self):
+        super().__init__()
+        self.param_a = Parameter(Tensor(5, mstype.int32), name='a')
+        self.param_b = Parameter(Tensor(4, mstype.int32), name='b')
+
+    def construct(self, x):
+        while x < 1:
+            x = x + 1
+        if self.param_a > self.param_b:
+            out = self.func(x)
+        else:
+            out = self.func(self.param_a)
+        out += self.param_b
+        return out
+
 class GradNet(nn.Cell):
     def __init__(self, net):
         super(GradNet, self).__init__()
@@ -125,37 +141,65 @@ def control_flow_if_in_if(input_net, x):
     context.set_context(mode=context.GRAPH_MODE)
     net = input_net()
     grad_net = GradNet(net)
-    graph_forward_res = net(x)
+
+    forward_net = input_net()
+    graph_forward_res = forward_net(x)
     graph_backward_res = grad_net(x)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     net = input_net()
     grad_net = GradNet(net)
-    pynative_forward_res = net(x)
+
+    forward_net = input_net()
+    pynative_forward_res = forward_net(x)
     pynative_backward_res = grad_net(x)
 
     assert graph_forward_res == pynative_forward_res
     assert graph_backward_res == pynative_backward_res
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_if_in_if():
     x = Tensor(2, mstype.int32)
     control_flow_if_in_if(IfInIfNet, x)
 
-
-@pytest.mark.skip(reason="not supported side effect")
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_if_in_if_01():
     x = Tensor(2, mstype.int32)
     control_flow_if_in_if(IfInIfNet1, x)
 
-
-@pytest.mark.skip(reason="not supported side effect")
+@pytest.mark.skip(reason="Ascend compile error in multigraph sink.")
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_if_in_if_02():
     x = Tensor(2, mstype.int32)
     control_flow_if_in_if(IfInIfNet2, x)
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_if_in_if_03():
     x = Tensor(2, mstype.int32)
     control_flow_if_in_if(IfInIfNet3, x)
+
+@pytest.mark.skip(reason="Result not correct in ascend vm")
+@pytest.mark.level1
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+def test_if_in_if_04():
+    x = Tensor(2, mstype.int32)
+    control_flow_if_in_if(IfInIfNet4, x)
diff --git a/tests/st/control/inner/test_011_if_in_while.py b/tests/st/control/inner/test_011_if_in_while.py
index 561a0a97b8a..9c4b2ca7427 100644
--- a/tests/st/control/inner/test_011_if_in_while.py
+++ b/tests/st/control/inner/test_011_if_in_while.py
@@ -22,7 +22,7 @@ from mindspore import context
 from mindspore.ops import functional as F
 from mindspore.common.parameter import Parameter
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -56,7 +56,11 @@ class BackwardNet(nn.Cell):
         grads = self.grad(self.forward_net)(*inputs)
         return grads
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward():
     # Graph Mode
     context.set_context(mode=context.GRAPH_MODE)
@@ -72,6 +76,7 @@ def test_forward():
 
 
 @pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
 @pytest.mark.platform_arm_ascend_training
 @pytest.mark.platform_x86_ascend_training
 @pytest.mark.env_onecard
diff --git a/tests/st/control/inner/test_011_if_in_while_break.py b/tests/st/control/inner/test_011_if_in_while_break.py
index 0ce06bd5ba2..5f20c2b7a49 100644
--- a/tests/st/control/inner/test_011_if_in_while_break.py
+++ b/tests/st/control/inner/test_011_if_in_while_break.py
@@ -20,7 +20,7 @@ from mindspore import Tensor
 from mindspore.ops import composite as C
 from mindspore import context
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -79,12 +79,22 @@ class BackwardNetReplaceBreak(nn.Cell):
         return grads
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward():
+    context.set_context(mode=context.GRAPH_MODE)
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
     forward_net = ForwardNet(max_cycles=10)
-    out = forward_net(x, y)
-    print("forward out:", out)
+    graph_mode_out = forward_net(x, y)
+
+    context.set_context(mode=context.PYNATIVE_MODE)
+    pynative_forward_net = ForwardNet(max_cycles=10)
+    pynative_mode_out = pynative_forward_net(x, y)
+    assert graph_mode_out == pynative_mode_out
 
 
 # Problem: Exceed function call depth limit 1000.
@@ -93,27 +103,58 @@ def test_forward():
 @pytest.mark.platform_x86_ascend_training
 @pytest.mark.env_onecard
 def test_backward():
+    context.set_context(mode=context.GRAPH_MODE)
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
     forward_net = ForwardNet(max_cycles=10)
     backward_net = BackwardNet(forward_net)
-    grads = backward_net(x, y)
-    print("grads:", grads)
+    graph_grads = backward_net(x, y)
+
+    context.set_context(mode=context.PYNATIVE_MODE)
+    forward_net = ForwardNet(max_cycles=10)
+    backward_net = BackwardNet(forward_net)
+    pynative_grads = backward_net(x, y)
+    assert graph_grads == pynative_grads
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward_replace_break():
+    context.set_context(mode=context.GRAPH_MODE)
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
     forward_net = ForwardNetReplaceBreak(max_cycles=10)
-    out = forward_net(x, y)
-    print("forward out:", out)
+    graph_out = forward_net(x, y)
+
+    context.set_context(mode=context.PYNATIVE_MODE)
+    x = Tensor(np.array(1), mstype.int32)
+    y = Tensor(np.array(3), mstype.int32)
+    forward_net = ForwardNetReplaceBreak(max_cycles=10)
+    pynative_out = forward_net(x, y)
+    assert graph_out == pynative_out
 
 
 # Problem: Exceed function call depth limit 1000.
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_backward_replace_break():
+    context.set_context(mode=context.GRAPH_MODE)
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
     forward_net = ForwardNetReplaceBreak(max_cycles=10)
     backward_net = BackwardNetReplaceBreak(forward_net)
-    grads = backward_net(x, y)
-    print("grads:", grads)
+    graph_grads = backward_net(x, y)
+
+    context.set_context(mode=context.PYNATIVE_MODE)
+    x = Tensor(np.array(1), mstype.int32)
+    y = Tensor(np.array(3), mstype.int32)
+    forward_net = ForwardNetReplaceBreak(max_cycles=10)
+    backward_net = BackwardNetReplaceBreak(forward_net)
+    pynative_grads = backward_net(x, y)
+    assert graph_grads == pynative_grads
diff --git a/tests/st/control/inner/test_012_if_in_for.py b/tests/st/control/inner/test_012_if_in_for.py
index aca6bb0e4eb..0546c04f259 100644
--- a/tests/st/control/inner/test_012_if_in_for.py
+++ b/tests/st/control/inner/test_012_if_in_for.py
@@ -22,7 +22,7 @@ from mindspore import context
 from mindspore.common.parameter import Parameter
 from mindspore.ops import functional as F
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -52,8 +52,10 @@ class BackwardNet(nn.Cell):
         grads = self.grad(self.forward_net)(*inputs)
         return grads
 
-
-@pytest.mark.skip(reason="not supported side effect")
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
 def test_forward():
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
@@ -62,13 +64,16 @@ def test_forward():
     graph_forward_net = ForwardNet(max_cycles=3)
     graph_mode_out = graph_forward_net(x, y)
     # Pynative Mode
-    context.set_context(mode=context.PYNATIVE_MODE)
-    pynative_forward_net = ForwardNet(max_cycles=3)
-    pynative_mode_out = pynative_forward_net(x, y)
-    assert graph_mode_out == pynative_mode_out
+    # context.set_context(mode=context.PYNATIVE_MODE)
+    # pynative_forward_net = ForwardNet(max_cycles=3)
+    # pynative_mode_out = pynative_forward_net(x, y)
+    expect = (Tensor(np.array(9), mstype.int32), Tensor(np.array(2), mstype.int32))
+    assert graph_mode_out == expect
 
-
-@pytest.mark.skip(reason="not supported side effect")
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
 def test_backward():
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
@@ -78,8 +83,9 @@ def test_backward():
     graph_backward_net = BackwardNet(graph_forward_net)
     graph_mode_grads = graph_backward_net(x, y)
     # Pynative Mode
-    context.set_context(mode=context.PYNATIVE_MODE)
-    pynative_forward_net = ForwardNet(max_cycles=3)
-    pynative_backward_net = BackwardNet(pynative_forward_net)
-    pynative_mode_grads = pynative_backward_net(x, y)
-    assert graph_mode_grads == pynative_mode_grads
+    # context.set_context(mode=context.PYNATIVE_MODE)
+    # pynative_forward_net = ForwardNet(max_cycles=3)
+    # pynative_backward_net = BackwardNet(pynative_forward_net)
+    # pynative_mode_grads = pynative_backward_net(x, y)
+    expect = (Tensor(np.array(9), mstype.int32), Tensor(np.array(3), mstype.int32))
+    assert graph_mode_grads == expect
diff --git a/tests/st/control/inner/test_012_if_in_for_break.py b/tests/st/control/inner/test_012_if_in_for_break.py
index 93736f1e5fd..a1afce63669 100644
--- a/tests/st/control/inner/test_012_if_in_for_break.py
+++ b/tests/st/control/inner/test_012_if_in_for_break.py
@@ -13,13 +13,14 @@
 # limitations under the License.
 # ============================================================================
 import numpy as np
+import pytest
 from mindspore.common import dtype as mstype
 from mindspore import nn
 from mindspore import Tensor
 from mindspore.ops import composite as C
 from mindspore import context
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -48,18 +49,43 @@ class BackwardNet(nn.Cell):
         return grads
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward():
+    context.set_context(mode=context.GRAPH_MODE)
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
     forward_net = ForwardNet(max_cycles=3)
-    out = forward_net(x, y)
-    print("forward out:", out)
+    graph_out = forward_net(x, y)
+
+    context.set_context(mode=context.PYNATIVE_MODE)
+    x = Tensor(np.array(1), mstype.int32)
+    y = Tensor(np.array(3), mstype.int32)
+    forward_net = ForwardNet(max_cycles=3)
+    pynative_out = forward_net(x, y)
+    assert graph_out == pynative_out
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_backward():
+    context.set_context(mode=context.GRAPH_MODE)
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
     forward_net = ForwardNet(max_cycles=3)
     backward_net = BackwardNet(forward_net)
-    grads = backward_net(x, y)
-    print("grads:", grads)
+    graph_grads = backward_net(x, y)
+
+    context.set_context(mode=context.PYNATIVE_MODE)
+    x = Tensor(np.array(1), mstype.int32)
+    y = Tensor(np.array(3), mstype.int32)
+    forward_net = ForwardNet(max_cycles=3)
+    backward_net = BackwardNet(forward_net)
+    pynative_grads = backward_net(x, y)
+    assert graph_grads == pynative_grads
diff --git a/tests/st/control/inner/test_020_while_in_if.py b/tests/st/control/inner/test_020_while_in_if.py
index 27553792de1..5f4312bca4f 100644
--- a/tests/st/control/inner/test_020_while_in_if.py
+++ b/tests/st/control/inner/test_020_while_in_if.py
@@ -14,6 +14,7 @@
 # ============================================================================
 
 import numpy as np
+import pytest
 from mindspore.common import dtype as mstype
 from mindspore import nn
 from mindspore import Tensor
@@ -22,7 +23,7 @@ from mindspore import context
 from mindspore.common.parameter import Parameter
 from mindspore.ops import functional as F
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -56,6 +57,11 @@ class BackwardNet(nn.Cell):
         return grads
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward():
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
@@ -70,6 +76,11 @@ def test_forward():
     assert graph_mode_out == pynative_mode_out
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_backward():
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
diff --git a/tests/st/control/inner/test_021_while_while_normal.py b/tests/st/control/inner/test_021_while_while_normal.py
index 45a07a578fb..9e6ef44329c 100644
--- a/tests/st/control/inner/test_021_while_while_normal.py
+++ b/tests/st/control/inner/test_021_while_while_normal.py
@@ -14,13 +14,14 @@
 # ============================================================================
 
 import numpy as np
+import pytest
 from mindspore.common import dtype as mstype
 from mindspore import nn
 from mindspore import Tensor
 from mindspore.ops import composite as C
 from mindspore import context
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -53,18 +54,43 @@ class BackwardNet(nn.Cell):
         return grads
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward():
+    context.set_context(mode=context.GRAPH_MODE)
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
     forward_net = ForwardNet(max_cycles=3)
-    out = forward_net(x, y)
-    print("forward out:", out)
+    graph_out = forward_net(x, y)
+
+    context.set_context(mode=context.GRAPH_MODE)
+    x = Tensor(np.array(1), mstype.int32)
+    y = Tensor(np.array(3), mstype.int32)
+    forward_net = ForwardNet(max_cycles=3)
+    pynative_out = forward_net(x, y)
+    assert graph_out == pynative_out
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_backward():
+    context.set_context(mode=context.GRAPH_MODE)
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
     forward_net = ForwardNet(max_cycles=3)
     backward_net = BackwardNet(forward_net)
-    grads = backward_net(x, y)
-    print("grads:", grads)
+    graph_grads = backward_net(x, y)
+
+    context.set_context(mode=context.PYNATIVE_MODE)
+    x = Tensor(np.array(1), mstype.int32)
+    y = Tensor(np.array(3), mstype.int32)
+    forward_net = ForwardNet(max_cycles=3)
+    backward_net = BackwardNet(forward_net)
+    pynative_grads = backward_net(x, y)
+    assert graph_grads == pynative_grads
diff --git a/tests/st/control/inner/test_022_for_while_normal.py b/tests/st/control/inner/test_022_for_while_normal.py
index 904c8ead8cf..e51e64b0446 100644
--- a/tests/st/control/inner/test_022_for_while_normal.py
+++ b/tests/st/control/inner/test_022_for_while_normal.py
@@ -14,13 +14,14 @@
 # ============================================================================
 
 import numpy as np
+import pytest
 from mindspore.common import dtype as mstype
 from mindspore import nn
 from mindspore import Tensor
 from mindspore.ops import composite as C
 from mindspore import context
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -51,18 +52,43 @@ class BackwardNet(nn.Cell):
         return grads
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward():
+    context.set_context(mode=context.GRAPH_MODE)
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
     forward_net = ForwardNet(max_cycles=3)
-    out = forward_net(x, y)
-    print("forward out:", out)
+    graph_out = forward_net(x, y)
+
+    context.set_context(mode=context.PYNATIVE_MODE)
+    x = Tensor(np.array(1), mstype.int32)
+    y = Tensor(np.array(3), mstype.int32)
+    forward_net = ForwardNet(max_cycles=3)
+    pynative_out = forward_net(x, y)
+    assert graph_out == pynative_out
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_backward():
+    context.set_context(mode=context.GRAPH_MODE)
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
     forward_net = ForwardNet(max_cycles=3)
     backward_net = BackwardNet(forward_net)
-    grads = backward_net(x, y)
-    print("grads:", grads)
+    graph_grads = backward_net(x, y)
+
+    context.set_context(mode=context.PYNATIVE_MODE)
+    x = Tensor(np.array(1), mstype.int32)
+    y = Tensor(np.array(3), mstype.int32)
+    forward_net = ForwardNet(max_cycles=3)
+    backward_net = BackwardNet(forward_net)
+    pynative_grads = backward_net(x, y)
+    assert graph_grads == pynative_grads
diff --git a/tests/st/control/inner/test_030_for_in_if.py b/tests/st/control/inner/test_030_for_in_if.py
index bbf2948b856..126c1e418de 100644
--- a/tests/st/control/inner/test_030_for_in_if.py
+++ b/tests/st/control/inner/test_030_for_in_if.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ============================================================================
 import numpy as np
+import pytest
 from mindspore import context
 from mindspore import Tensor, nn
 from mindspore.common.parameter import Parameter
@@ -21,8 +22,12 @@ from mindspore.ops import operations as P
 from mindspore.common import dtype as mstype
 
 grad_all = C.GradOperation(get_all=True)
-context.set_context(device_target="Ascend")
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_for_in_if_01():
     class ForInIfNet(nn.Cell):
         def __init__(self):
@@ -56,19 +61,28 @@ def test_for_in_if_01():
     context.set_context(mode=context.GRAPH_MODE)
     for_in_if_net = ForInIfNet()
     net = GradNet(for_in_if_net)
-    graph_forward_res = for_in_if_net(x)
+
+    forward_net = ForInIfNet()
+    graph_forward_res = forward_net(x)
     graph_backward_res = net(x)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     for_in_if_net = ForInIfNet()
     net = GradNet(for_in_if_net)
-    pynative_forward_res = for_in_if_net(x)
+
+    forward_net = ForInIfNet()
+    pynative_forward_res = forward_net(x)
     pynative_backward_res = net(x)
 
     assert graph_forward_res == pynative_forward_res
     assert graph_backward_res == pynative_backward_res
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_for_in_if_02():
     class ForInIfNet(nn.Cell):
         def __init__(self):
@@ -100,26 +114,34 @@ def test_for_in_if_02():
         def construct(self, *inputs):
             return grad_all(self.net)(*inputs)
 
-    x = Tensor([10], mstype.int32)
+    x = Tensor([10], mstype.float32)
 
     # graph mode
     context.set_context(mode=context.GRAPH_MODE)
     for_in_if_net = ForInIfNet()
     net = GradNet(for_in_if_net)
-    graph_forward_res = for_in_if_net(x)
+
+    forward_net = ForInIfNet()
+    graph_forward_res = forward_net(x)
     graph_backward_res = net(x)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     for_in_if_net = ForInIfNet()
     net = GradNet(for_in_if_net)
-    pynative_forward_res = for_in_if_net(x)
+
+    forward_net = ForInIfNet()
+    pynative_forward_res = forward_net(x)
     pynative_backward_res = net(x)
 
     assert graph_forward_res == pynative_forward_res
     assert graph_backward_res == pynative_backward_res
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_for_in_if_03():
     class ForInIfNet(nn.Cell):
         def __init__(self):
@@ -152,26 +174,35 @@ def test_for_in_if_03():
         def construct(self, *inputs):
             return grad_all(self.net)(*inputs)
 
-    x = Tensor([10], mstype.int32)
+    x = Tensor([10], mstype.float32)
 
     # graph mode
     context.set_context(mode=context.GRAPH_MODE)
     for_in_if_net = ForInIfNet()
     net = GradNet(for_in_if_net)
-    graph_forward_res = for_in_if_net(x)
+
+    forward_net = ForInIfNet()
+    graph_forward_res = forward_net(x)
     graph_backward_res = net(x)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     for_in_if_net = ForInIfNet()
     net = GradNet(for_in_if_net)
-    pynative_forward_res = for_in_if_net(x)
+
+    forward_net = ForInIfNet()
+    pynative_forward_res = forward_net(x)
     pynative_backward_res = net(x)
 
     assert graph_forward_res == pynative_forward_res
     assert graph_backward_res == pynative_backward_res
 
-
+@pytest.mark.skip(reason="Ascend control multi sink result error")
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_for_in_if_04():
     class ForInIfNet(nn.Cell):
         def __init__(self):
@@ -207,20 +238,28 @@ def test_for_in_if_04():
     context.set_context(mode=context.GRAPH_MODE)
     for_in_if_net = ForInIfNet()
     net = GradNet(for_in_if_net)
-    graph_forward_res = for_in_if_net(x)
+
+    forward_net = ForInIfNet()
+    graph_forward_res = forward_net(x)
     graph_backward_res = net(x)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
+    forward_net = ForInIfNet()
+    pynative_forward_res = forward_net(x)
     for_in_if_net = ForInIfNet()
     net = GradNet(for_in_if_net)
-    pynative_forward_res = for_in_if_net(x)
-    pynative_backward_res = net(x)
+    expect_backward_res = net(x)
 
     assert graph_forward_res == pynative_forward_res
-    assert graph_backward_res == pynative_backward_res
-
+    assert graph_backward_res == expect_backward_res
 
+@pytest.mark.skip(reason="Ascend control multi sink result error")
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_for_in_if_05():
     class ForInIfNet(nn.Cell):
         def __init__(self):
@@ -258,15 +297,19 @@ def test_for_in_if_05():
     context.set_context(mode=context.GRAPH_MODE)
     for_in_if_net = ForInIfNet()
     net = GradNet(for_in_if_net)
-    graph_forward_res = for_in_if_net(x)
+
+    forward_net = ForInIfNet()
+    graph_forward_res = forward_net(x)
     graph_backward_res = net(x)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     for_in_if_net = ForInIfNet()
-    net = GradNet(for_in_if_net)
+
     pynative_forward_res = for_in_if_net(x)
-    pynative_backward_res = net(x)
+    for_in_if_net = ForInIfNet()
+    net = GradNet(for_in_if_net)
+    expect_backward_res = net(x)
 
     assert graph_forward_res == pynative_forward_res
-    assert graph_backward_res == pynative_backward_res
+    assert graph_backward_res == expect_backward_res
diff --git a/tests/st/control/inner/test_031_for_in_while.py b/tests/st/control/inner/test_031_for_in_while.py
index cb4e3b7956d..0f65cd8a034 100644
--- a/tests/st/control/inner/test_031_for_in_while.py
+++ b/tests/st/control/inner/test_031_for_in_while.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ============================================================================
 import numpy as np
+import pytest
 from mindspore import context
 from mindspore import Tensor, nn
 from mindspore.common.parameter import Parameter
@@ -21,8 +22,8 @@ from mindspore.ops import operations as P
 from mindspore.common import dtype as mstype
 
 grad_all = C.GradOperation(get_all=True)
-context.set_context(device_target="Ascend")
 
+@pytest.mark.skip(reason="not supported for in while")
 def test_for_in_while_01():
     class ForInWhileNet(nn.Cell):
         def __init__(self):
@@ -60,21 +61,25 @@ def test_for_in_while_01():
     # graph mode
     context.set_context(mode=context.GRAPH_MODE)
     for_in_while_net = ForInWhileNet()
-    net = GradNet(for_in_while_net)
-    graph_forward_res = for_in_while_net(x)
-    graph_backward_res = net(x)
+    backward_net = GradNet(for_in_while_net)
+
+    forward_net = ForInWhileNet()
+    graph_forward_res = forward_net(x)
+    graph_backward_res = backward_net(x)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     for_in_while_net = ForInWhileNet()
-    net = GradNet(for_in_while_net)
-    pynative_forward_res = for_in_while_net(x)
-    pynative_backward_res = net(x)
+    backward_net = GradNet(for_in_while_net)
+
+    forward_net = ForInWhileNet()
+    pynative_forward_res = forward_net(x)
+    pynative_backward_res = backward_net(x)
 
     assert graph_forward_res == pynative_forward_res
     assert graph_backward_res == pynative_backward_res
 
-
+@pytest.mark.skip(reason="not supported for in while")
 def test_for_in_while_02():
     class ForInWhileNet(nn.Cell):
         def __init__(self):
diff --git a/tests/st/control/inner/test_032_for_in_for.py b/tests/st/control/inner/test_032_for_in_for.py
index dd7094e54aa..9d14e253e23 100644
--- a/tests/st/control/inner/test_032_for_in_for.py
+++ b/tests/st/control/inner/test_032_for_in_for.py
@@ -22,9 +22,12 @@ from mindspore.ops import operations as P
 from mindspore.common import dtype as mstype
 
 grad_all = C.GradOperation(get_all=True)
-context.set_context(device_target="GPU")
-
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_for_in_for_01():
     class ForInForNet(nn.Cell):
         def __init__(self):
@@ -64,14 +67,18 @@ def test_for_in_for_01():
     context.set_context(mode=context.GRAPH_MODE)
     for_in_for_net = ForInForNet()
     net = GradNet(for_in_for_net)
-    graph_forward_res = for_in_for_net(x)
+
+    forward_net = ForInForNet()
+    graph_forward_res = forward_net(x)
     graph_backward_res = net(x)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     for_in_for_net = ForInForNet()
     net = GradNet(for_in_for_net)
-    pynative_forward_res = for_in_for_net(x)
+
+    forward_net = ForInForNet()
+    pynative_forward_res = forward_net(x)
     pynative_backward_res = net(x)
 
     assert graph_forward_res == pynative_forward_res
@@ -79,6 +86,8 @@ def test_for_in_for_01():
 
 @pytest.mark.level0
 @pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
 @pytest.mark.env_onecard
 def test_for_in_for_02():
     class ForInForNet(nn.Cell):
@@ -114,14 +123,18 @@ def test_for_in_for_02():
     context.set_context(mode=context.GRAPH_MODE)
     for_in_for_net = ForInForNet()
     net = GradNet(for_in_for_net)
-    graph_forward_res = for_in_for_net(x)
+
+    forward_net = ForInForNet()
+    graph_forward_res = forward_net(x)
     graph_backward_res = net(x)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     for_in_for_net = ForInForNet()
     net = GradNet(for_in_for_net)
-    pynative_forward_res = for_in_for_net(x)
+
+    forward_net = ForInForNet()
+    pynative_forward_res = forward_net(x)
     pynative_backward_res = net(x)
 
     assert graph_forward_res == pynative_forward_res
diff --git a/tests/st/control/inner/test_100_if_after_if.py b/tests/st/control/inner/test_100_if_after_if.py
index f68af8cd58e..2703ac1e203 100644
--- a/tests/st/control/inner/test_100_if_after_if.py
+++ b/tests/st/control/inner/test_100_if_after_if.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
+import pytest
 from mindspore import context
 from mindspore import Tensor, nn
 from mindspore.ops import composite as C
@@ -19,7 +20,6 @@ from mindspore.common import dtype as mstype
 from mindspore.common.parameter import Parameter
 
 grad_all = C.GradOperation(get_all=True)
-context.set_context(device_target="Ascend")
 
 
 class IfAfterIfNet(nn.Cell):
@@ -93,6 +93,28 @@ class IfAfterIfNet3(nn.Cell):
         return x
 
 
+# Add a while to run with vm in ascend
+class IfAfterIfNet4(nn.Cell):
+    def __init__(self):
+        super().__init__()
+        self.param_a = Parameter(Tensor(5, mstype.int32), name='a')
+        self.param_b = Parameter(Tensor(4, mstype.int32), name='b')
+
+    def construct(self, x, y):
+        while x < 0:
+            x = x + 1
+        out = x * y + self.func(self.param_b)
+        if self.param_a > self.param_b:
+            out += 5
+        return out
+
+    def func(self, x):
+        if self.param_a > self.param_b:
+            x += 5
+        self.param_b += 4
+        return x
+
+
 class GradNet(nn.Cell):
     def __init__(self, net):
         super(GradNet, self).__init__()
@@ -107,39 +129,75 @@ def control_flow_if_after_if(input_net, x, y):
     context.set_context(mode=context.GRAPH_MODE)
     net = input_net()
     grad_net = GradNet(net)
-    graph_forward_res = net(x, y)
+
+    forward_net = input_net()
+    graph_forward_res = forward_net(x, y)
     graph_backward_res = grad_net(x, y)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     net = input_net()
     grad_net = GradNet(net)
-    pynative_forward_res = net(x, y)
+
+    forward_net = input_net()
+    pynative_forward_res = forward_net(x, y)
     pynative_backward_res = grad_net(x, y)
 
     assert graph_forward_res == pynative_forward_res
     assert graph_backward_res == pynative_backward_res
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_if_after_if():
     x = Tensor(2, mstype.int32)
     y = Tensor(5, mstype.int32)
     control_flow_if_after_if(IfAfterIfNet, x, y)
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_if_after_if_01():
     x = Tensor(2, mstype.int32)
     y = Tensor(5, mstype.int32)
     control_flow_if_after_if(IfAfterIfNet1, x, y)
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_if_after_if_02():
     x = Tensor(2, mstype.int32)
     y = Tensor(5, mstype.int32)
     control_flow_if_after_if(IfAfterIfNet2, x, y)
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+# Now in ascend result is not correct
+# @pytest.mark.platform_arm_ascend_training
+# @pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_if_after_if_03():
     x = Tensor(2, mstype.int32)
     y = Tensor(5, mstype.int32)
     control_flow_if_after_if(IfAfterIfNet3, x, y)
+
+
+@pytest.mark.skip(reason="Result is not correct in multigraph sink.")
+@pytest.mark.level1
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+def test_if_after_if_04():
+    x = Tensor(2, mstype.int32)
+    y = Tensor(5, mstype.int32)
+    control_flow_if_after_if(IfAfterIfNet4, x, y)
diff --git a/tests/st/control/inner/test_101_if_after_while.py b/tests/st/control/inner/test_101_if_after_while.py
index 3b322db3d1e..afc24531b39 100644
--- a/tests/st/control/inner/test_101_if_after_while.py
+++ b/tests/st/control/inner/test_101_if_after_while.py
@@ -22,7 +22,7 @@ from mindspore import context
 from mindspore.common.parameter import Parameter
 from mindspore.ops import functional as F
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -43,7 +43,6 @@ class ForwardNet(nn.Cell):
             i = i + 1
         if out >= 20:
             F.assign(self.weight, out)
-            self.weight = out
             out = out - 20
         return out, self.weight
 
@@ -58,7 +57,11 @@ class BackwardNet(nn.Cell):
         grads = self.grad(self.forward_net)(*inputs)
         return grads
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward():
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
@@ -73,8 +76,11 @@ def test_forward():
     pynative_mode_out = pynative_forward_net(x, y)
     assert graph_mode_out == pynative_mode_out
 
-
-@pytest.mark.skip(reason="not supported side effect")
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_backward():
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
diff --git a/tests/st/control/inner/test_102_if_after_for.py b/tests/st/control/inner/test_102_if_after_for.py
index e1faf845472..5cf9e7da0ce 100644
--- a/tests/st/control/inner/test_102_if_after_for.py
+++ b/tests/st/control/inner/test_102_if_after_for.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ============================================================================
 import numpy as np
+import pytest
 from mindspore import context
 from mindspore import Tensor, nn
 from mindspore.common.parameter import Parameter
@@ -21,9 +22,12 @@ from mindspore.ops import operations as P
 from mindspore.common import dtype as mstype
 
 grad_all = C.GradOperation(get_all=True)
-context.set_context(device_target="Ascend")
-
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_if_after_for_01():
     class IfAfterForNet(nn.Cell):
         def __init__(self):
@@ -64,20 +68,28 @@ def test_if_after_for_01():
     context.set_context(mode=context.GRAPH_MODE)
     if_after_for_net = IfAfterForNet()
     net = GradNet(if_after_for_net)
-    graph_forward_res = if_after_for_net(x)
+
+    forward_net = IfAfterForNet()
+    graph_forward_res = forward_net(x)
     graph_backward_res = net(x)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     if_after_for_net = IfAfterForNet()
     net = GradNet(if_after_for_net)
-    pynative_forward_res = if_after_for_net(x)
+
+    forward_net = IfAfterForNet()
+    pynative_forward_res = forward_net(x)
     pynative_backward_res = net(x)
 
     assert graph_forward_res == pynative_forward_res
     assert graph_backward_res == pynative_backward_res
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_if_after_for_02():
     class IfAfterForNet(nn.Cell):
         def __init__(self):
@@ -118,14 +130,18 @@ def test_if_after_for_02():
     context.set_context(mode=context.GRAPH_MODE)
     if_after_for_net = IfAfterForNet()
     net = GradNet(if_after_for_net)
-    graph_forward_res = if_after_for_net(x)
+
+    forward_net = IfAfterForNet()
+    graph_forward_res = forward_net(x)
     graph_backward_res = net(x)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     if_after_for_net = IfAfterForNet()
     net = GradNet(if_after_for_net)
-    pynative_forward_res = if_after_for_net(x)
+
+    forward_net = IfAfterForNet()
+    pynative_forward_res = forward_net(x)
     pynative_backward_res = net(x)
 
     assert graph_forward_res == pynative_forward_res
diff --git a/tests/st/control/inner/test_110_if_after_if_in_if.py b/tests/st/control/inner/test_110_if_after_if_in_if.py
index e0ce1edab70..a0e3ad893ad 100644
--- a/tests/st/control/inner/test_110_if_after_if_in_if.py
+++ b/tests/st/control/inner/test_110_if_after_if_in_if.py
@@ -20,7 +20,6 @@ from mindspore.common import dtype as mstype
 from mindspore.common.parameter import Parameter
 
 grad_all = C.GradOperation(get_all=True)
-context.set_context(device_target="GPU")
 
 
 class IfAfterIfInIfNet(nn.Cell):
@@ -133,14 +132,18 @@ def control_flow_if_after_if_in_if(input_net, x):
     context.set_context(mode=context.GRAPH_MODE)
     net = input_net()
     grad_net = GradNet(net)
-    graph_forward_res = net(x)
+
+    forward_net = input_net()
+    graph_forward_res = forward_net(x)
     graph_backward_res = grad_net(x)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     net = input_net()
     grad_net = GradNet(net)
-    pynative_forward_res = net(x)
+
+    forward_net = input_net()
+    pynative_forward_res = forward_net(x)
     pynative_backward_res = grad_net(x)
 
     assert graph_forward_res == pynative_forward_res
@@ -154,7 +157,9 @@ def test_if_after_if_in_if():
     control_flow_if_after_if_in_if(IfAfterIfInIfNet, x)
 
 
-@pytest.mark.skip(reason="not supported side effect")
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
 def test_if_after_if_in_if_01():
     x = Tensor(2, mstype.int32)
     control_flow_if_after_if_in_if(IfAfterIfInIfNet1, x)
diff --git a/tests/st/control/inner/test_111_if_after_if_in_while.py b/tests/st/control/inner/test_111_if_after_if_in_while.py
index 7910839c57a..bb09aae70d9 100644
--- a/tests/st/control/inner/test_111_if_after_if_in_while.py
+++ b/tests/st/control/inner/test_111_if_after_if_in_while.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ============================================================================
 import numpy as np
+import pytest
 from mindspore.common import dtype as mstype
 from mindspore import nn
 from mindspore import Tensor
@@ -20,7 +21,7 @@ from mindspore.ops import composite as C
 from mindspore import context
 from mindspore.common.parameter import Parameter
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -56,7 +57,11 @@ class BackwardNet(nn.Cell):
         grads = self.grad(self.forward_net)(*inputs)
         return grads
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward():
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
@@ -70,7 +75,11 @@ def test_forward():
     pynative_mode_out = pynative_forward_net(x, y)
     assert graph_mode_out == pynative_mode_out
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_backward():
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
diff --git a/tests/st/control/inner/test_112_if_after_if_in_for.py b/tests/st/control/inner/test_112_if_after_if_in_for.py
index 9a05c0ba472..30d6729db89 100644
--- a/tests/st/control/inner/test_112_if_after_if_in_for.py
+++ b/tests/st/control/inner/test_112_if_after_if_in_for.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
+import pytest
 from mindspore import context
 from mindspore import Tensor, nn
 from mindspore.ops import composite as C
@@ -19,7 +20,6 @@ from mindspore.common import dtype as mstype
 from mindspore.common.parameter import Parameter
 
 grad_all = C.GradOperation(get_all=True)
-context.set_context(device_target="Ascend")
 
 
 class IfAfterIfInForNet(nn.Cell):
@@ -124,35 +124,56 @@ def control_flow_if_after_if_in_for(input_net, x):
     context.set_context(mode=context.GRAPH_MODE)
     net = input_net()
     grad_net = GradNet(net)
-    graph_forward_res = net(x)
+
+    forward_net = input_net()
+    graph_forward_res = forward_net(x)
     graph_backward_res = grad_net(x)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     net = input_net()
     grad_net = GradNet(net)
-    pynative_forward_res = net(x)
+
+    forward_net = input_net()
+    pynative_forward_res = forward_net(x)
     pynative_backward_res = grad_net(x)
 
     assert graph_forward_res == pynative_forward_res
     assert graph_backward_res == pynative_backward_res
 
-
+@pytest.mark.skip(reason="ME EvalCNode error")
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_if_after_if_in_for():
     x = Tensor(2, mstype.int32)
     control_flow_if_after_if_in_for(IfAfterIfInForNet, x)
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_if_after_if_in_for_01():
     x = Tensor(2, mstype.int32)
     control_flow_if_after_if_in_for(IfAfterIfInForNet1, x)
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_if_after_if_in_for_02():
     x = Tensor(2, mstype.int32)
     control_flow_if_after_if_in_for(IfAfterIfInForNet2, x)
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_if_after_if_in_for_03():
     x = Tensor(2, mstype.int32)
     control_flow_if_after_if_in_for(IfAfterIfInForNet3, x)
diff --git a/tests/st/control/inner/test_120_if_after_while_in_if.py b/tests/st/control/inner/test_120_if_after_while_in_if.py
index f3db6ab5e7d..4ca941c6193 100644
--- a/tests/st/control/inner/test_120_if_after_while_in_if.py
+++ b/tests/st/control/inner/test_120_if_after_while_in_if.py
@@ -14,6 +14,7 @@
 # ============================================================================
 
 import numpy as np
+import pytest
 from mindspore.common import dtype as mstype
 from mindspore import nn
 from mindspore import Tensor
@@ -21,7 +22,7 @@ from mindspore.ops import composite as C
 from mindspore import context
 from mindspore.common.parameter import Parameter
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -56,7 +57,11 @@ class BackwardNet(nn.Cell):
         grads = self.grad(self.forward_net)(*inputs)
         return grads
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward():
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
@@ -70,7 +75,11 @@ def test_forward():
     pynative_mode_out = pynative_forward_net(x, y)
     assert graph_mode_out == pynative_mode_out
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_backward():
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
diff --git a/tests/st/control/inner/test_121_if_after_while_in_while.py b/tests/st/control/inner/test_121_if_after_while_in_while.py
index 9f3feb6a16c..f2c1d6092bd 100644
--- a/tests/st/control/inner/test_121_if_after_while_in_while.py
+++ b/tests/st/control/inner/test_121_if_after_while_in_while.py
@@ -19,10 +19,11 @@ from mindspore.common import dtype as mstype
 from mindspore import nn
 from mindspore import Tensor
 from mindspore.ops import composite as C
+from mindspore.ops import functional as F
 from mindspore import context
 from mindspore.common.parameter import Parameter
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="GPU")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -37,14 +38,14 @@ class ForwardNet(nn.Cell):
         out = self.zero
         i = self.i
         while x < y:
-            self.weight = x
+            F.assign(self.weight, out)
             while i < self.max_cycles:
                 out = x * y + out
                 i = i + 1
-                self.weight = i
+                F.assign(self.weight, i)
             x = x + 1
         if out < 20:
-            self.weight = out
+            F.assign(self.weight, out)
             out = out - 20
         return out, self.weight
 
@@ -59,7 +60,11 @@ class BackwardNet(nn.Cell):
         grads = self.grad(self.forward_net)(*inputs)
         return grads
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward():
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
@@ -74,7 +79,11 @@ def test_forward():
     assert graph_mode_out == pynative_mode_out
 
 
-@pytest.mark.skip(reason="not supported side effect")
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_backward():
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
@@ -126,6 +135,8 @@ class BackwardNetNoAssign(nn.Cell):
 # This test case has a problem of evaluator endless loop.
 @pytest.mark.level0
 @pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
 @pytest.mark.env_onecard
 def test_backward_no_assign():
     x = Tensor(np.array(1), mstype.int32)
diff --git a/tests/st/control/inner/test_122_if_after_while_in_for.py b/tests/st/control/inner/test_122_if_after_while_in_for.py
index 5c572faeb85..dd874ba6c7e 100644
--- a/tests/st/control/inner/test_122_if_after_while_in_for.py
+++ b/tests/st/control/inner/test_122_if_after_while_in_for.py
@@ -19,10 +19,11 @@ from mindspore.common import dtype as mstype
 from mindspore import nn
 from mindspore import Tensor
 from mindspore.ops import composite as C
+from mindspore.ops import functional as F
 from mindspore import context
 from mindspore.common.parameter import Parameter
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="GPU")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -38,9 +39,9 @@ class ForwardNet(nn.Cell):
             while x < y:
                 out = x * y + out
                 x = x + 1
-                self.weight = x
+                F.assign(self.weight, x)
         if out > 20:
-            self.weight = out
+            F.assign(self.weight, out)
             out = out - 20
         return out, self.weight
 
@@ -55,7 +56,11 @@ class BackwardNet(nn.Cell):
         grads = self.grad(self.forward_net)(*inputs)
         return grads
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward():
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
@@ -70,7 +75,11 @@ def test_forward():
     assert graph_mode_out == pynative_mode_out
 
 
-@pytest.mark.skip(reason="not supported side effect")
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_backward():
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
@@ -84,6 +93,7 @@ def test_backward():
     pynative_forward_net = ForwardNet(max_cycles=3)
     pynative_backward_net = BackwardNet(pynative_forward_net)
     pynative_mode_grads = pynative_backward_net(x, y)
+    #expect = (Tensor(np.array(6), mstype.int32), Tensor(np.array(3), mstype.int32))
     assert graph_mode_grads == pynative_mode_grads
 
 
@@ -100,7 +110,7 @@ class ForwardNetNoAssign(nn.Cell):
             while x < y:
                 out = x * y + out
                 x = x + 1
-                #self.weight = x
+                # self.weight = x
         if out > 20:
             self.weight = out
             out = out - 20
@@ -119,6 +129,8 @@ class BackwardNetNoAssign(nn.Cell):
 
 @pytest.mark.level0
 @pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
 @pytest.mark.env_onecard
 def test_backward_no_assign():
     x = Tensor(np.array(1), mstype.int32)
diff --git a/tests/st/control/inner/test_130_if_after_for_in_if.py b/tests/st/control/inner/test_130_if_after_for_in_if.py
index 9adb67d7ee5..8c9874c6c21 100644
--- a/tests/st/control/inner/test_130_if_after_for_in_if.py
+++ b/tests/st/control/inner/test_130_if_after_for_in_if.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
+import pytest
 from mindspore import context
 from mindspore import Tensor, nn
 from mindspore.ops import composite as C
@@ -19,8 +20,11 @@ from mindspore.common import dtype as mstype
 from mindspore.common.parameter import Parameter
 
 grad_all = C.GradOperation(get_all=True)
-context.set_context(device_target="Ascend")
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_if_after_for_in_if():
     class IfAfterForInIfNet(nn.Cell):
         def __init__(self):
@@ -53,14 +57,18 @@ def test_if_after_for_in_if():
     context.set_context(mode=context.GRAPH_MODE)
     if_after_for_in_if_net = IfAfterForInIfNet()
     net = GradNet(if_after_for_in_if_net)
-    graph_forward_res = if_after_for_in_if_net(x)
+
+    forward_net = IfAfterForInIfNet()
+    graph_forward_res = forward_net(x)
     graph_backward_res = net(x)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     if_after_for_in_if_net = IfAfterForInIfNet()
     net = GradNet(if_after_for_in_if_net)
-    pynative_forward_res = if_after_for_in_if_net(x)
+
+    forward_net = IfAfterForInIfNet()
+    pynative_forward_res = forward_net(x)
     pynative_backward_res = net(x)
 
     assert graph_forward_res == pynative_forward_res
diff --git a/tests/st/control/inner/test_131_if_after_for_in_while.py b/tests/st/control/inner/test_131_if_after_for_in_while.py
index 7bb07615a8a..7d64b995b3b 100644
--- a/tests/st/control/inner/test_131_if_after_for_in_while.py
+++ b/tests/st/control/inner/test_131_if_after_for_in_while.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
+import pytest
 from mindspore import context
 from mindspore import Tensor, nn
 from mindspore.ops import composite as C
@@ -19,8 +20,8 @@ from mindspore.common import dtype as mstype
 from mindspore.common.parameter import Parameter
 
 grad_all = C.GradOperation(get_all=True)
-context.set_context(device_target="Ascend")
 
+@pytest.mark.skip(reason="not supported for in while")
 def test_if_after_for_in_while():
     class IfAfterForInWhileNet(nn.Cell):
         def __init__(self):
@@ -53,14 +54,18 @@ def test_if_after_for_in_while():
     context.set_context(mode=context.GRAPH_MODE)
     if_after_for_in_while_net = IfAfterForInWhileNet()
     net = GradNet(if_after_for_in_while_net)
-    graph_forward_res = if_after_for_in_while_net(x)
+
+    forward_net = IfAfterForInWhileNet()
+    graph_forward_res = forward_net(x)
     graph_backward_res = net(x)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     if_after_for_in_while_net = IfAfterForInWhileNet()
     net = GradNet(if_after_for_in_while_net)
-    pynative_forward_res = if_after_for_in_while_net(x)
+
+    forward_net = IfAfterForInWhileNet()
+    pynative_forward_res = forward_net(x)
     pynative_backward_res = net(x)
 
     assert graph_forward_res == pynative_forward_res
diff --git a/tests/st/control/inner/test_132_if_after_for_in_for.py b/tests/st/control/inner/test_132_if_after_for_in_for.py
index 7e178a891c7..0dcb0f56b31 100644
--- a/tests/st/control/inner/test_132_if_after_for_in_for.py
+++ b/tests/st/control/inner/test_132_if_after_for_in_for.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
+import pytest
 from mindspore import context
 from mindspore import Tensor, nn
 from mindspore.ops import composite as C
@@ -19,8 +20,11 @@ from mindspore.common import dtype as mstype
 from mindspore.common.parameter import Parameter
 
 grad_all = C.GradOperation(get_all=True)
-context.set_context(device_target="Ascend")
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_if_after_for_in_for():
     class IfAfterForInForNet(nn.Cell):
         def __init__(self):
@@ -53,14 +57,18 @@ def test_if_after_for_in_for():
     context.set_context(mode=context.GRAPH_MODE)
     if_after_for_in_for_net = IfAfterForInForNet()
     net = GradNet(if_after_for_in_for_net)
-    graph_forward_res = if_after_for_in_for_net(x)
+
+    forward_net = IfAfterForInForNet()
+    graph_forward_res = forward_net(x)
     graph_backward_res = net(x)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     if_after_for_in_for_net = IfAfterForInForNet()
     net = GradNet(if_after_for_in_for_net)
-    pynative_forward_res = if_after_for_in_for_net(x)
+
+    forward_net = IfAfterForInForNet()
+    pynative_forward_res = forward_net(x)
     pynative_backward_res = net(x)
 
     assert graph_forward_res == pynative_forward_res
diff --git a/tests/st/control/inner/test_200_while_after_if.py b/tests/st/control/inner/test_200_while_after_if.py
index fb0802d9c81..ec937e1180e 100644
--- a/tests/st/control/inner/test_200_while_after_if.py
+++ b/tests/st/control/inner/test_200_while_after_if.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ============================================================================
 import numpy as np
+import pytest
 from mindspore.common import dtype as mstype
 from mindspore import nn
 from mindspore import Tensor
@@ -20,7 +21,7 @@ from mindspore.ops import composite as C
 from mindspore import context
 from mindspore.common.parameter import Parameter
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -54,7 +55,11 @@ class BackwardNet(nn.Cell):
         grads = self.grad(self.forward_net)(*inputs)
         return grads
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward():
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
@@ -68,7 +73,11 @@ def test_forward():
     pynative_mode_out = pynative_forward_net(x, y)
     assert graph_mode_out == pynative_mode_out
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_backward():
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
diff --git a/tests/st/control/inner/test_201_for_n_while.py b/tests/st/control/inner/test_201_for_n_while.py
index ea0f3e80a6c..7e166144d52 100644
--- a/tests/st/control/inner/test_201_for_n_while.py
+++ b/tests/st/control/inner/test_201_for_n_while.py
@@ -14,13 +14,14 @@
 # ============================================================================
 
 import numpy as np
+import pytest
 from mindspore.common import dtype as mstype
 from mindspore import nn
 from mindspore import Tensor
 from mindspore.ops import composite as C
 from mindspore import context
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -52,18 +53,43 @@ class BackwardNet(nn.Cell):
         return grads
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward():
+    context.set_context(mode=context.GRAPH_MODE)
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
     forward_net = ForwardNet(max_cycles=3)
-    out = forward_net(x, y)
-    print("forward out:", out)
+    graph_out = forward_net(x, y)
 
+    context.set_context(mode=context.PYNATIVE_MODE)
+    x = Tensor(np.array(1), mstype.int32)
+    y = Tensor(np.array(3), mstype.int32)
+    forward_net = ForwardNet(max_cycles=3)
+    pynative_out = forward_net(x, y)
+    assert graph_out == pynative_out
 
+@pytest.mark.skip(reason="Ascend kernel compiler error!")
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_backward():
+    context.set_context(mode=context.GRAPH_MODE)
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
     forward_net = ForwardNet(max_cycles=3)
     backward_net = BackwardNet(forward_net)
-    grads = backward_net(x, y)
-    print("grads:", grads)
+    graph_grads = backward_net(x, y)
+
+    context.set_context(mode=context.PYNATIVE_MODE)
+    x = Tensor(np.array(1), mstype.int32)
+    y = Tensor(np.array(3), mstype.int32)
+    forward_net = ForwardNet(max_cycles=3)
+    backward_net = BackwardNet(forward_net)
+    pynative_grads = backward_net(x, y)
+    assert graph_grads == pynative_grads
diff --git a/tests/st/control/inner/test_202_while_n_while.py b/tests/st/control/inner/test_202_while_n_while.py
index f0978012195..d656b1d37d8 100644
--- a/tests/st/control/inner/test_202_while_n_while.py
+++ b/tests/st/control/inner/test_202_while_n_while.py
@@ -14,13 +14,14 @@
 # ============================================================================
 
 import numpy as np
+import pytest
 from mindspore.common import dtype as mstype
 from mindspore import nn
 from mindspore import Tensor
 from mindspore.ops import composite as C
 from mindspore import context
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -54,18 +55,43 @@ class BackwardNet(nn.Cell):
         return grads
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward():
+    context.set_context(mode=context.GRAPH_MODE)
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
     forward_net = ForwardNet(max_cycles=3)
-    out = forward_net(x, y)
-    print("forward out:", out)
+    graph_out = forward_net(x, y)
+
+    context.set_context(mode=context.PYNATIVE_MODE)
+    x = Tensor(np.array(1), mstype.int32)
+    y = Tensor(np.array(3), mstype.int32)
+    forward_net = ForwardNet(max_cycles=3)
+    pynative_out = forward_net(x, y)
+    assert graph_out == pynative_out
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_backward():
+    context.set_context(mode=context.GRAPH_MODE)
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
     forward_net = ForwardNet(max_cycles=3)
     backward_net = BackwardNet(forward_net)
-    grads = backward_net(x, y)
-    print("grads:", grads)
+    graph_grads = backward_net(x, y)
+
+    context.set_context(mode=context.PYNATIVE_MODE)
+    x = Tensor(np.array(1), mstype.int32)
+    y = Tensor(np.array(3), mstype.int32)
+    forward_net = ForwardNet(max_cycles=3)
+    backward_net = BackwardNet(forward_net)
+    pynative_grads = backward_net(x, y)
+    assert graph_grads == pynative_grads
diff --git a/tests/st/control/inner/test_210_while_after_if_in_if.py b/tests/st/control/inner/test_210_while_after_if_in_if.py
index 47151cb5331..4556fb7dd8d 100644
--- a/tests/st/control/inner/test_210_while_after_if_in_if.py
+++ b/tests/st/control/inner/test_210_while_after_if_in_if.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ============================================================================
 import numpy as np
+import pytest
 from mindspore.common import dtype as mstype
 from mindspore import nn
 from mindspore import Tensor
@@ -20,7 +21,7 @@ from mindspore.ops import composite as C
 from mindspore import context
 from mindspore.common.parameter import Parameter
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -55,7 +56,11 @@ class BackwardNet(nn.Cell):
         grads = self.grad(self.forward_net)(*inputs)
         return grads
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward():
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
@@ -69,7 +74,11 @@ def test_forward():
     pynative_mode_out = pynative_forward_net(x, y)
     assert graph_mode_out == pynative_mode_out
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_backward():
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
diff --git a/tests/st/control/inner/test_211_while_after_if_in_while.py b/tests/st/control/inner/test_211_while_after_if_in_while.py
index 617e43c9914..741e5a1f3d9 100644
--- a/tests/st/control/inner/test_211_while_after_if_in_while.py
+++ b/tests/st/control/inner/test_211_while_after_if_in_while.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ============================================================================
 import numpy as np
+import pytest
 from mindspore.common import dtype as mstype
 from mindspore import nn
 from mindspore import Tensor
@@ -20,7 +21,7 @@ from mindspore.ops import composite as C
 from mindspore import context
 from mindspore.common.parameter import Parameter
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -58,7 +59,11 @@ class BackwardNet(nn.Cell):
         grads = self.grad(self.forward_net)(*inputs)
         return grads
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward():
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
@@ -72,7 +77,11 @@ def test_forward():
     pynative_mode_out = pynative_forward_net(x, y)
     assert graph_mode_out == pynative_mode_out
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_backward():
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
diff --git a/tests/st/control/inner/test_212_while_after_if_in_for.py b/tests/st/control/inner/test_212_while_after_if_in_for.py
index d2f6ebb3443..2bfa118d2a4 100644
--- a/tests/st/control/inner/test_212_while_after_if_in_for.py
+++ b/tests/st/control/inner/test_212_while_after_if_in_for.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ============================================================================
 import numpy as np
+import pytest
 from mindspore.common import dtype as mstype
 from mindspore import nn
 from mindspore import Tensor
@@ -20,7 +21,7 @@ from mindspore.ops import composite as C
 from mindspore import context
 from mindspore.common.parameter import Parameter
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -55,7 +56,11 @@ class BackwardNet(nn.Cell):
         grads = self.grad(self.forward_net)(*inputs)
         return grads
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward():
     x = Tensor(np.array(3), mstype.int32)
     y = Tensor(np.array(5), mstype.int32)
@@ -69,7 +74,11 @@ def test_forward():
     pynative_mode_out = pynative_forward_net(x, y)
     assert graph_mode_out == pynative_mode_out
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_backward():
     x = Tensor(np.array(3), mstype.int32)
     y = Tensor(np.array(5), mstype.int32)
diff --git a/tests/st/control/inner/test_220_while_after_while_in_if.py b/tests/st/control/inner/test_220_while_after_while_in_if.py
index b91dd48c98b..3fbb3948a0a 100644
--- a/tests/st/control/inner/test_220_while_after_while_in_if.py
+++ b/tests/st/control/inner/test_220_while_after_while_in_if.py
@@ -14,6 +14,7 @@
 # ============================================================================
 
 import numpy as np
+import pytest
 from mindspore.common import dtype as mstype
 from mindspore import nn
 from mindspore import Tensor
@@ -21,7 +22,7 @@ from mindspore.ops import composite as C
 from mindspore import context
 from mindspore.common.parameter import Parameter
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -56,7 +57,11 @@ class BackwardNet(nn.Cell):
         grads = self.grad(self.forward_net)(*inputs)
         return grads
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward():
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
@@ -69,7 +74,11 @@ def test_forward():
     pynative_mode_out = forward_net(x, y)
     assert graph_mode_out == pynative_mode_out
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_backward():
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
diff --git a/tests/st/control/inner/test_221_while_while_while.py b/tests/st/control/inner/test_221_while_while_while.py
index 48130e564fc..025daec86a2 100644
--- a/tests/st/control/inner/test_221_while_while_while.py
+++ b/tests/st/control/inner/test_221_while_while_while.py
@@ -14,13 +14,14 @@
 # ============================================================================
 
 import numpy as np
+import pytest
 from mindspore.common import dtype as mstype
 from mindspore import nn
 from mindspore import Tensor
 from mindspore.ops import composite as C
 from mindspore import context
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -57,18 +58,43 @@ class BackwardNet(nn.Cell):
         return grads
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward():
+    context.set_context(mode=context.GRAPH_MODE)
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
     forward_net = ForwardNet(max_cycles=3)
-    out = forward_net(x, y)
-    print("forward out:", out)
+    graph_out = forward_net(x, y)
+
+    context.set_context(mode=context.PYNATIVE_MODE)
+    x = Tensor(np.array(1), mstype.int32)
+    y = Tensor(np.array(3), mstype.int32)
+    forward_net = ForwardNet(max_cycles=3)
+    pynative_out = forward_net(x, y)
+    assert graph_out == pynative_out
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_backward():
+    context.set_context(mode=context.GRAPH_MODE)
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
     forward_net = ForwardNet(max_cycles=3)
     backward_net = BackwardNet(forward_net)
-    grads = backward_net(x, y)
-    print("grads:", grads)
+    graph_grads = backward_net(x, y)
+
+    context.set_context(mode=context.PYNATIVE_MODE)
+    x = Tensor(np.array(1), mstype.int32)
+    y = Tensor(np.array(3), mstype.int32)
+    forward_net = ForwardNet(max_cycles=3)
+    backward_net = BackwardNet(forward_net)
+    pynative_grads = backward_net(x, y)
+    assert graph_grads == pynative_grads
diff --git a/tests/st/control/inner/test_222_for_while_while.py b/tests/st/control/inner/test_222_for_while_while.py
index 9dc5e134c92..ca1dbcc49a4 100644
--- a/tests/st/control/inner/test_222_for_while_while.py
+++ b/tests/st/control/inner/test_222_for_while_while.py
@@ -14,13 +14,14 @@
 # ============================================================================
 
 import numpy as np
+import pytest
 from mindspore.common import dtype as mstype
 from mindspore import nn
 from mindspore import Tensor
 from mindspore.ops import composite as C
 from mindspore import context
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -55,18 +56,43 @@ class BackwardNet(nn.Cell):
         return grads
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward():
+    context.set_context(mode=context.GRAPH_MODE)
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
     forward_net = ForwardNet(max_cycles=3)
-    out = forward_net(x, y)
-    print("forward out:", out)
+    graph_out = forward_net(x, y)
+
+    context.set_context(mode=context.PYNATIVE_MODE)
+    x = Tensor(np.array(1), mstype.int32)
+    y = Tensor(np.array(3), mstype.int32)
+    forward_net = ForwardNet(max_cycles=3)
+    pynative_out = forward_net(x, y)
+    assert graph_out == pynative_out
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_backward():
+    context.set_context(mode=context.GRAPH_MODE)
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
     forward_net = ForwardNet(max_cycles=3)
     backward_net = BackwardNet(forward_net)
-    grads = backward_net(x, y)
-    print("grads:", grads)
+    graph_grads = backward_net(x, y)
+
+    context.set_context(mode=context.PYNATIVE_MODE)
+    x = Tensor(np.array(1), mstype.int32)
+    y = Tensor(np.array(3), mstype.int32)
+    forward_net = ForwardNet(max_cycles=3)
+    backward_net = BackwardNet(forward_net)
+    pynative_grads = backward_net(x, y)
+    assert graph_grads == pynative_grads
diff --git a/tests/st/control/inner/test_230_while_after_for_in_if.py b/tests/st/control/inner/test_230_while_after_for_in_if.py
index 33b2f930feb..901124ffd2c 100644
--- a/tests/st/control/inner/test_230_while_after_for_in_if.py
+++ b/tests/st/control/inner/test_230_while_after_for_in_if.py
@@ -14,6 +14,7 @@
 # ============================================================================
 
 import numpy as np
+import pytest
 from mindspore.common import dtype as mstype
 from mindspore import nn
 from mindspore import Tensor
@@ -21,7 +22,7 @@ from mindspore.ops import composite as C
 from mindspore import context
 from mindspore.common.parameter import Parameter
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -53,7 +54,11 @@ class BackwardNet(nn.Cell):
         grads = self.grad(self.forward_net)(*inputs)
         return grads
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward():
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
@@ -67,7 +72,11 @@ def test_forward():
     pynative_mode_out = pynative_forward_net(x, y)
     assert graph_mode_out == pynative_mode_out
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_backward():
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
diff --git a/tests/st/control/inner/test_231_while_for_while.py b/tests/st/control/inner/test_231_while_for_while.py
index 2894d5a021c..90787a96880 100644
--- a/tests/st/control/inner/test_231_while_for_while.py
+++ b/tests/st/control/inner/test_231_while_for_while.py
@@ -14,13 +14,14 @@
 # ============================================================================
 
 import numpy as np
+import pytest
 from mindspore.common import dtype as mstype
 from mindspore import nn
 from mindspore import Tensor
 from mindspore.ops import composite as C
 from mindspore import context
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -54,7 +55,7 @@ class BackwardNet(nn.Cell):
         grads = self.grad(self.forward_net)(*inputs)
         return grads
 
-
+@pytest.mark.skip(reason="not supported for in while")
 def test_forward():
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
@@ -62,7 +63,7 @@ def test_forward():
     out = forward_net(x, y)
     print("forward out:", out)
 
-
+@pytest.mark.skip(reason="not supported for in while")
 def test_backward():
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
diff --git a/tests/st/control/inner/test_232_for_for_while.py b/tests/st/control/inner/test_232_for_for_while.py
index 55739ee2bd7..651d9fe09f0 100644
--- a/tests/st/control/inner/test_232_for_for_while.py
+++ b/tests/st/control/inner/test_232_for_for_while.py
@@ -14,13 +14,14 @@
 # ============================================================================
 
 import numpy as np
+import pytest
 from mindspore.common import dtype as mstype
 from mindspore import nn
 from mindspore import Tensor
 from mindspore.ops import composite as C
 from mindspore import context
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -54,18 +55,43 @@ class BackwardNet(nn.Cell):
         return grads
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward():
+    context.set_context(mode=context.GRAPH_MODE)
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
     forward_net = ForwardNet(max_cycles=3)
-    out = forward_net(x, y)
-    print("forward out:", out)
+    graph_out = forward_net(x, y)
 
+    context.set_context(mode=context.PYNATIVE_MODE)
+    x = Tensor(np.array(1), mstype.int32)
+    y = Tensor(np.array(3), mstype.int32)
+    forward_net = ForwardNet(max_cycles=3)
+    pynative_out = forward_net(x, y)
+    assert graph_out == pynative_out
 
+@pytest.mark.skip(reason="Ascend kernel compiler error!")
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_backward():
+    context.set_context(mode=context.GRAPH_MODE)
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
     forward_net = ForwardNet(max_cycles=3)
     backward_net = BackwardNet(forward_net)
-    grads = backward_net(x, y)
-    print("grads:", grads)
+    graph_grads = backward_net(x, y)
+
+    context.set_context(mode=context.PYNATIVE_MODE)
+    x = Tensor(np.array(1), mstype.int32)
+    y = Tensor(np.array(3), mstype.int32)
+    forward_net = ForwardNet(max_cycles=3)
+    backward_net = BackwardNet(forward_net)
+    pynative_grads = backward_net(x, y)
+    assert graph_grads == pynative_grads
diff --git a/tests/st/control/inner/test_300_for_after_if.py b/tests/st/control/inner/test_300_for_after_if.py
index 9001a62be76..ce8abdecc7f 100644
--- a/tests/st/control/inner/test_300_for_after_if.py
+++ b/tests/st/control/inner/test_300_for_after_if.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
+import pytest
 from mindspore import context
 from mindspore import Tensor, nn
 from mindspore.ops import composite as C
@@ -19,8 +20,11 @@ from mindspore.common import dtype as mstype
 from mindspore.common.parameter import Parameter
 
 grad_all = C.GradOperation(get_all=True)
-context.set_context(device_target="Ascend")
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_for_after_if():
     class ForAfterIfNet(nn.Cell):
         def __init__(self):
@@ -52,14 +56,18 @@ def test_for_after_if():
     context.set_context(mode=context.GRAPH_MODE)
     for_after_if_net = ForAfterIfNet()
     net = GradNet(for_after_if_net)
-    graph_forward_res = for_after_if_net(x)
+
+    forward_net = ForAfterIfNet()
+    graph_forward_res = forward_net(x)
     graph_backward_res = net(x)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     for_after_if_net = ForAfterIfNet()
     net = GradNet(for_after_if_net)
-    pynative_forward_res = for_after_if_net(x)
+
+    forward_net = ForAfterIfNet()
+    pynative_forward_res = forward_net(x)
     pynative_backward_res = net(x)
 
     assert graph_forward_res == pynative_forward_res
diff --git a/tests/st/control/inner/test_301_while_normal_for.py b/tests/st/control/inner/test_301_while_normal_for.py
index 77ddd3753da..f93ddfef4eb 100644
--- a/tests/st/control/inner/test_301_while_normal_for.py
+++ b/tests/st/control/inner/test_301_while_normal_for.py
@@ -14,13 +14,14 @@
 # ============================================================================
 
 import numpy as np
+import pytest
 from mindspore.common import dtype as mstype
 from mindspore import nn
 from mindspore import Tensor
 from mindspore.ops import composite as C
 from mindspore import context
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -52,18 +53,43 @@ class BackwardNet(nn.Cell):
         return grads
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward():
+    context.set_context(mode=context.GRAPH_MODE)
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
     forward_net = ForwardNet(max_cycles=3)
-    out = forward_net(x, y)
-    print("forward out:", out)
+    graph_out = forward_net(x, y)
+
+    context.set_context(mode=context.PYNATIVE_MODE)
+    x = Tensor(np.array(1), mstype.int32)
+    y = Tensor(np.array(3), mstype.int32)
+    forward_net = ForwardNet(max_cycles=3)
+    pynative_out = forward_net(x, y)
+    assert graph_out == pynative_out
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_backward():
+    context.set_context(mode=context.GRAPH_MODE)
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
     forward_net = ForwardNet(max_cycles=3)
     backward_net = BackwardNet(forward_net)
-    grads = backward_net(x, y)
-    print("grads:", grads)
+    graph_grads = backward_net(x, y)
+
+    context.set_context(mode=context.PYNATIVE_MODE)
+    x = Tensor(np.array(1), mstype.int32)
+    y = Tensor(np.array(3), mstype.int32)
+    forward_net = ForwardNet(max_cycles=3)
+    backward_net = BackwardNet(forward_net)
+    pynative_grads = backward_net(x, y)
+    assert graph_grads == pynative_grads
diff --git a/tests/st/control/inner/test_302_for_after_for.py b/tests/st/control/inner/test_302_for_after_for.py
index e7fbb37ffa2..281f69103f0 100644
--- a/tests/st/control/inner/test_302_for_after_for.py
+++ b/tests/st/control/inner/test_302_for_after_for.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ============================================================================
 import numpy as np
+import pytest
 from mindspore import context
 from mindspore import Tensor, nn
 from mindspore.common.parameter import Parameter
@@ -21,8 +22,11 @@ from mindspore.ops import operations as P
 from mindspore.common import dtype as mstype
 
 grad_all = C.GradOperation(get_all=True)
-context.set_context(device_target="Ascend")
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_for_after_for_01():
     class ForAfterForNet(nn.Cell):
         def __init__(self):
@@ -65,20 +69,28 @@ def test_for_after_for_01():
     context.set_context(mode=context.GRAPH_MODE)
     for_after_for_net = ForAfterForNet()
     net = GradNet(for_after_for_net)
-    graph_forward_res = for_after_for_net(x)
+
+    forward_net = ForAfterForNet()
+    graph_forward_res = forward_net(x)
     graph_backward_res = net(x)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     for_after_for_net = ForAfterForNet()
     net = GradNet(for_after_for_net)
-    pynative_forward_res = for_after_for_net(x)
+
+    forward_net = ForAfterForNet()
+    pynative_forward_res = forward_net(x)
     pynative_backward_res = net(x)
 
     assert graph_forward_res == pynative_forward_res
     assert graph_backward_res == pynative_backward_res
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_for_after_for_02():
     class ForAfterForNet(nn.Cell):
         def __init__(self):
@@ -118,14 +130,18 @@ def test_for_after_for_02():
     context.set_context(mode=context.GRAPH_MODE)
     for_after_for_net = ForAfterForNet()
     net = GradNet(for_after_for_net)
-    graph_forward_res = for_after_for_net(x)
+
+    forward_net = ForAfterForNet()
+    graph_forward_res = forward_net(x)
     graph_backward_res = net(x)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     for_after_for_net = ForAfterForNet()
     net = GradNet(for_after_for_net)
-    pynative_forward_res = for_after_for_net(x)
+
+    forward_net = ForAfterForNet()
+    pynative_forward_res = forward_net(x)
     pynative_backward_res = net(x)
 
     assert graph_forward_res == pynative_forward_res
diff --git a/tests/st/control/inner/test_310_for_after_if_in_if.py b/tests/st/control/inner/test_310_for_after_if_in_if.py
index 78d70db1138..5ff748b9fc4 100644
--- a/tests/st/control/inner/test_310_for_after_if_in_if.py
+++ b/tests/st/control/inner/test_310_for_after_if_in_if.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
+import pytest
 from mindspore import context
 from mindspore import Tensor, nn
 from mindspore.ops import composite as C
@@ -19,8 +20,11 @@ from mindspore.common import dtype as mstype
 from mindspore.common.parameter import Parameter
 
 grad_all = C.GradOperation(get_all=True)
-context.set_context(device_target="Ascend")
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_for_after_if_in_if():
     class ForAfterIfInIfNet(nn.Cell):
         def __init__(self):
@@ -55,14 +59,18 @@ def test_for_after_if_in_if():
     context.set_context(mode=context.GRAPH_MODE)
     for_after_if_in_if_net = ForAfterIfInIfNet()
     net = GradNet(for_after_if_in_if_net)
-    graph_forward_res = for_after_if_in_if_net(x)
+
+    forward_net = ForAfterIfInIfNet()
+    graph_forward_res = forward_net(x)
     graph_backward_res = net(x)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     for_after_if_in_if_net = ForAfterIfInIfNet()
     net = GradNet(for_after_if_in_if_net)
-    pynative_forward_res = for_after_if_in_if_net(x)
+
+    forward_net = ForAfterIfInIfNet()
+    pynative_forward_res = forward_net(x)
     pynative_backward_res = net(x)
 
     assert graph_forward_res == pynative_forward_res
diff --git a/tests/st/control/inner/test_311_while_if_for.py b/tests/st/control/inner/test_311_while_if_for.py
index 5b99a7b8337..835e082d1c5 100644
--- a/tests/st/control/inner/test_311_while_if_for.py
+++ b/tests/st/control/inner/test_311_while_if_for.py
@@ -14,13 +14,14 @@
 # ============================================================================
 
 import numpy as np
+import pytest
 from mindspore.common import dtype as mstype
 from mindspore import nn
 from mindspore import Tensor
 from mindspore.ops import composite as C
 from mindspore import context
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -56,18 +57,43 @@ class BackwardNet(nn.Cell):
         return grads
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward():
+    context.set_context(mode=context.GRAPH_MODE)
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
     forward_net = ForwardNet(max_cycles=3)
-    out = forward_net(x, y)
-    print("forward out:", out)
+    graph_out = forward_net(x, y)
+
+    context.set_context(mode=context.PYNATIVE_MODE)
+    x = Tensor(np.array(1), mstype.int32)
+    y = Tensor(np.array(3), mstype.int32)
+    forward_net = ForwardNet(max_cycles=3)
+    pynative_out = forward_net(x, y)
+    assert graph_out == pynative_out
 
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_backward():
+    context.set_context(mode=context.GRAPH_MODE)
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
     forward_net = ForwardNet(max_cycles=3)
     backward_net = BackwardNet(forward_net)
-    grads = backward_net(x, y)
-    print("grads:", grads)
+    graph_grads = backward_net(x, y)
+
+    context.set_context(mode=context.PYNATIVE_MODE)
+    x = Tensor(np.array(1), mstype.int32)
+    y = Tensor(np.array(3), mstype.int32)
+    forward_net = ForwardNet(max_cycles=3)
+    backward_net = BackwardNet(forward_net)
+    pynative_grads = backward_net(x, y)
+    assert graph_grads == pynative_grads
diff --git a/tests/st/control/inner/test_320_for_after_while_in_if.py b/tests/st/control/inner/test_320_for_after_while_in_if.py
index 5d44a581fdb..279ec7049c8 100644
--- a/tests/st/control/inner/test_320_for_after_while_in_if.py
+++ b/tests/st/control/inner/test_320_for_after_while_in_if.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ============================================================================
 import numpy as np
+import pytest
 from mindspore import context
 from mindspore import Tensor, nn
 from mindspore.common.parameter import Parameter
@@ -21,8 +22,11 @@ from mindspore.ops import operations as P
 from mindspore.common import dtype as mstype
 
 grad_all = C.GradOperation(get_all=True)
-context.set_context(device_target="Ascend")
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_for_after_while_in_if_01():
     class ForAfterWhileInIfNet(nn.Cell):
         def __init__(self):
@@ -78,20 +82,28 @@ def test_for_after_while_in_if_01():
     context.set_context(mode=context.GRAPH_MODE)
     for_after_while_in_if_net = ForAfterWhileInIfNet()
     net = GradNet(for_after_while_in_if_net)
-    graph_forward_res = for_after_while_in_if_net(x, y)
+
+    forward_net = ForAfterWhileInIfNet()
+    graph_forward_res = forward_net(x, y)
     graph_backward_res = net(x, y)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     for_after_while_in_if_net = ForAfterWhileInIfNet()
     net = GradNet(for_after_while_in_if_net)
-    pynative_forward_res = for_after_while_in_if_net(x, y)
+
+    forward_net = ForAfterWhileInIfNet()
+    pynative_forward_res = forward_net(x, y)
     pynative_backward_res = net(x, y)
 
     assert graph_forward_res == pynative_forward_res
     assert graph_backward_res == pynative_backward_res
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_for_after_while_in_if_02():
     class ForAfterWhileInIfNet(nn.Cell):
         def __init__(self):
@@ -138,14 +150,18 @@ def test_for_after_while_in_if_02():
     context.set_context(mode=context.GRAPH_MODE)
     for_after_while_in_if_net = ForAfterWhileInIfNet()
     net = GradNet(for_after_while_in_if_net)
-    graph_forward_res = for_after_while_in_if_net(x, y)
+
+    forward_net = ForAfterWhileInIfNet()
+    graph_forward_res = forward_net(x, y)
     graph_backward_res = net(x, y)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     for_after_while_in_if_net = ForAfterWhileInIfNet()
     net = GradNet(for_after_while_in_if_net)
-    pynative_forward_res = for_after_while_in_if_net(x, y)
+
+    forward_net = ForAfterWhileInIfNet()
+    pynative_forward_res = forward_net(x, y)
     pynative_backward_res = net(x, y)
 
     assert graph_forward_res == pynative_forward_res
diff --git a/tests/st/control/inner/test_321_while_while_in_while.py b/tests/st/control/inner/test_321_while_while_in_while.py
index 7f3b8663a93..9cd338593f0 100644
--- a/tests/st/control/inner/test_321_while_while_in_while.py
+++ b/tests/st/control/inner/test_321_while_while_in_while.py
@@ -14,13 +14,14 @@
 # ============================================================================
 
 import numpy as np
+import pytest
 from mindspore.common import dtype as mstype
 from mindspore import nn
 from mindspore import Tensor
 from mindspore.ops import composite as C
 from mindspore import context
 
-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
 
 
 class ForwardNet(nn.Cell):
@@ -56,19 +57,42 @@ class BackwardNet(nn.Cell):
         grads = self.grad(self.forward_net)(*inputs)
         return grads
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_forward():
+    context.set_context(mode=context.GRAPH_MODE)
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
     forward_net = ForwardNet(max_cycles=3)
-    out = forward_net(x, y)
-    print("forward out:", out)
+    graph_out = forward_net(x, y)
 
+    context.set_context(mode=context.PYNATIVE_MODE)
+    x = Tensor(np.array(1), mstype.int32)
+    y = Tensor(np.array(3), mstype.int32)
+    forward_net = ForwardNet(max_cycles=3)
+    pynative_out = forward_net(x, y)
+    assert graph_out == pynative_out
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_backward():
+    context.set_context(mode=context.GRAPH_MODE)
     x = Tensor(np.array(1), mstype.int32)
     y = Tensor(np.array(3), mstype.int32)
     forward_net = ForwardNet(max_cycles=3)
     backward_net = BackwardNet(forward_net)
-    grads = backward_net(x, y)
-    print("grads:", grads)
+    graph_grads = backward_net(x, y)
+
+    context.set_context(mode=context.PYNATIVE_MODE)
+    x = Tensor(np.array(1), mstype.int32)
+    y = Tensor(np.array(3), mstype.int32)
+    forward_net = ForwardNet(max_cycles=3)
+    backward_net = BackwardNet(forward_net)
+    pynative_grads = backward_net(x, y)
+    assert graph_grads == pynative_grads
diff --git a/tests/st/control/inner/test_322_for_after_while_in_for.py b/tests/st/control/inner/test_322_for_after_while_in_for.py
index d18a070213d..05b03438dcb 100644
--- a/tests/st/control/inner/test_322_for_after_while_in_for.py
+++ b/tests/st/control/inner/test_322_for_after_while_in_for.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ============================================================================
 import numpy as np
+import pytest
 from mindspore import context
 from mindspore import Tensor, nn
 from mindspore.common.parameter import Parameter
@@ -21,8 +22,11 @@ from mindspore.ops import operations as P
 from mindspore.common import dtype as mstype
 
 grad_all = C.GradOperation(get_all=True)
-context.set_context(device_target="Ascend")
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_for_after_while_in_for_01():
     class ForAfterWhileInForNet(nn.Cell):
         def __init__(self):
@@ -79,20 +83,28 @@ def test_for_after_while_in_for_01():
     context.set_context(mode=context.GRAPH_MODE)
     for_after_while_in_for_net = ForAfterWhileInForNet()
     net = GradNet(for_after_while_in_for_net)
-    graph_forward_res = for_after_while_in_for_net(x, y)
+
+    forward_net = ForAfterWhileInForNet()
+    graph_forward_res = forward_net(x, y)
     graph_backward_res = net(x, y)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     for_after_while_in_for_net = ForAfterWhileInForNet()
     net = GradNet(for_after_while_in_for_net)
-    pynative_forward_res = for_after_while_in_for_net(x, y)
+
+    forward_net = ForAfterWhileInForNet()
+    pynative_forward_res = forward_net(x, y)
     pynative_backward_res = net(x, y)
 
     assert graph_forward_res == pynative_forward_res
     assert graph_backward_res == pynative_backward_res
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_for_after_while_in_for_02():
     class ForAfterWhileInForNet(nn.Cell):
         def __init__(self):
@@ -139,14 +151,18 @@ def test_for_after_while_in_for_02():
     context.set_context(mode=context.GRAPH_MODE)
     for_after_while_in_for_net = ForAfterWhileInForNet()
     net = GradNet(for_after_while_in_for_net)
-    graph_forward_res = for_after_while_in_for_net(x, y)
+
+    forward_net = ForAfterWhileInForNet()
+    graph_forward_res = forward_net(x, y)
     graph_backward_res = net(x, y)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     for_after_while_in_for_net = ForAfterWhileInForNet()
     net = GradNet(for_after_while_in_for_net)
-    pynative_forward_res = for_after_while_in_for_net(x, y)
+
+    forward_net = ForAfterWhileInForNet()
+    pynative_forward_res = forward_net(x, y)
     pynative_backward_res = net(x, y)
 
     assert graph_forward_res == pynative_forward_res
diff --git a/tests/st/control/inner/test_330_for_after_for_in_if.py b/tests/st/control/inner/test_330_for_after_for_in_if.py
index c05d387fc34..b85016f8ad9 100644
--- a/tests/st/control/inner/test_330_for_after_for_in_if.py
+++ b/tests/st/control/inner/test_330_for_after_for_in_if.py
@@ -20,9 +20,13 @@ from mindspore.common import dtype as mstype
 from mindspore.common.parameter import Parameter
 
 grad_all = C.GradOperation(get_all=True)
-context.set_context(device_target="Ascend")
 
 @pytest.mark.skip(reason="not supported side effect")
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_for_after_for_in_if():
     class ForAfterForInIfNet(nn.Cell):
         def __init__(self):
@@ -56,14 +60,18 @@ def test_for_after_for_in_if():
     context.set_context(mode=context.GRAPH_MODE)
     for_after_for_in_if_net = ForAfterForInIfNet()
     net = GradNet(for_after_for_in_if_net)
-    graph_forward_res = for_after_for_in_if_net(x)
+
+    forward_net = ForAfterForInIfNet()
+    graph_forward_res = forward_net(x)
     graph_backward_res = net(x)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     for_after_for_in_if_net = ForAfterForInIfNet()
     net = GradNet(for_after_for_in_if_net)
-    pynative_forward_res = for_after_for_in_if_net(x)
+
+    forward_net = ForAfterForInIfNet()
+    pynative_forward_res = forward_net(x)
     pynative_backward_res = net(x)
 
     assert graph_forward_res == pynative_forward_res
diff --git a/tests/st/control/inner/test_331_for_after_for_in_while.py b/tests/st/control/inner/test_331_for_after_for_in_while.py
index 18ece685a30..1e47e8b13c4 100644
--- a/tests/st/control/inner/test_331_for_after_for_in_while.py
+++ b/tests/st/control/inner/test_331_for_after_for_in_while.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ============================================================================
 import numpy as np
+import pytest
 from mindspore import context
 from mindspore import Tensor, nn
 from mindspore.common.parameter import Parameter
@@ -21,8 +22,7 @@ from mindspore.ops import operations as P
 from mindspore.common import dtype as mstype
 
 grad_all = C.GradOperation(get_all=True)
-context.set_context(device_target="Ascend")
-
+@pytest.mark.skip(reason="not supported for in while")
 def test_for_after_for_in_while_01():
     class ForAfterForInWhileNet(nn.Cell):
         def __init__(self):
@@ -74,20 +74,24 @@ def test_for_after_for_in_while_01():
     context.set_context(mode=context.GRAPH_MODE)
     for_after_for_in_while_net = ForAfterForInWhileNet()
     net = GradNet(for_after_for_in_while_net)
-    graph_forward_res = for_after_for_in_while_net(x, y)
+
+    forward_net = ForAfterForInWhileNet()
+    graph_forward_res = forward_net(x, y)
     graph_backward_res = net(x, y)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     for_after_for_in_while_net = ForAfterForInWhileNet()
     net = GradNet(for_after_for_in_while_net)
-    pynative_forward_res = for_after_for_in_while_net(x, y)
+
+    forward_net = ForAfterForInWhileNet()
+    pynative_forward_res = forward_net(x, y)
     pynative_backward_res = net(x, y)
 
     assert graph_forward_res == pynative_forward_res
     assert graph_backward_res == pynative_backward_res
 
-
+@pytest.mark.skip(reason="not supported for in while")
 def test_for_after_for_in_while_02():
     class ForAfterForInWhileNet(nn.Cell):
         def __init__(self):
@@ -127,14 +131,18 @@ def test_for_after_for_in_while_02():
     context.set_context(mode=context.GRAPH_MODE)
     for_after_for_in_while_net = ForAfterForInWhileNet()
     net = GradNet(for_after_for_in_while_net)
-    graph_forward_res = for_after_for_in_while_net(x, y)
+
+    forward_net = ForAfterForInWhileNet()
+    graph_forward_res = forward_net(x, y)
     graph_backward_res = net(x, y)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     for_after_for_in_while_net = ForAfterForInWhileNet()
     net = GradNet(for_after_for_in_while_net)
-    pynative_forward_res = for_after_for_in_while_net(x, y)
+
+    forward_net = ForAfterForInWhileNet()
+    pynative_forward_res = forward_net(x, y)
     pynative_backward_res = net(x, y)
 
     assert graph_forward_res == pynative_forward_res
diff --git a/tests/st/control/inner/test_332_for_after_for_in_for.py b/tests/st/control/inner/test_332_for_after_for_in_for.py
index 989655818a8..21479cff7e1 100644
--- a/tests/st/control/inner/test_332_for_after_for_in_for.py
+++ b/tests/st/control/inner/test_332_for_after_for_in_for.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ============================================================================
 import numpy as np
+import pytest
 from mindspore import context
 from mindspore import Tensor, nn
 from mindspore.common.parameter import Parameter
@@ -21,8 +22,12 @@ from mindspore.ops import operations as P
 from mindspore.common import dtype as mstype
 
 grad_all = C.GradOperation(get_all=True)
-context.set_context(device_target="Ascend")
 
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_for_after_for_in_for_01():
     class ForAfterForInForNet(nn.Cell):
         def __init__(self):
@@ -70,20 +75,28 @@ def test_for_after_for_in_for_01():
     context.set_context(mode=context.GRAPH_MODE)
     for_after_for_in_for_net = ForAfterForInForNet()
     net = GradNet(for_after_for_in_for_net)
-    graph_forward_res = for_after_for_in_for_net(x, y)
+
+    forward_net = ForAfterForInForNet()
+    graph_forward_res = forward_net(x, y)
     graph_backward_res = net(x, y)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     for_after_for_in_for_net = ForAfterForInForNet()
     net = GradNet(for_after_for_in_for_net)
-    pynative_forward_res = for_after_for_in_for_net(x, y)
+
+    forward_net = ForAfterForInForNet()
+    pynative_forward_res = forward_net(x, y)
     pynative_backward_res = net(x, y)
 
     assert graph_forward_res == pynative_forward_res
     assert graph_backward_res == pynative_backward_res
 
-
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
 def test_for_after_for_in_for_02():
     class ForAfterForInForNet(nn.Cell):
         def __init__(self):
@@ -127,14 +140,18 @@ def test_for_after_for_in_for_02():
     context.set_context(mode=context.GRAPH_MODE)
     for_after_for_in_for_net = ForAfterForInForNet()
     net = GradNet(for_after_for_in_for_net)
-    graph_forward_res = for_after_for_in_for_net(x, y)
+
+    forward_net = ForAfterForInForNet()
+    graph_forward_res = forward_net(x, y)
     graph_backward_res = net(x, y)
 
     # pynative mode
     context.set_context(mode=context.PYNATIVE_MODE)
     for_after_for_in_for_net = ForAfterForInForNet()
     net = GradNet(for_after_for_in_for_net)
-    pynative_forward_res = for_after_for_in_for_net(x, y)
+
+    forward_net = ForAfterForInForNet()
+    pynative_forward_res = forward_net(x, y)
     pynative_backward_res = net(x, y)
 
     assert graph_forward_res == pynative_forward_res
diff --git a/tests/st/control/test_cont_grad.py b/tests/st/control/test_cont_grad.py
index 45ccc095f67..b41acaccc0c 100644
--- a/tests/st/control/test_cont_grad.py
+++ b/tests/st/control/test_cont_grad.py
@@ -1484,7 +1484,7 @@ def test_if_by_if_forward_all_const_branch():
     assert np.allclose(graph_output.asnumpy(), pynative_output.asnumpy(), 0.0001, 0.0001)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_cpu
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
@@ -1520,7 +1520,7 @@ def test_if_const_grad():
     net(a, b)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_cpu
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
@@ -1560,7 +1560,7 @@ def test_if_by_if_const_grad():
     net(a, b)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_cpu
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
@@ -1594,7 +1594,7 @@ def test_while_const_grad():
     net(a, b)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_cpu
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
diff --git a/tests/st/dump/test_data_dump.py b/tests/st/dump/test_data_dump.py
index f1b637084d7..5ee893a88e4 100644
--- a/tests/st/dump/test_data_dump.py
+++ b/tests/st/dump/test_data_dump.py
@@ -15,6 +15,7 @@
 import os
 import json
 import sys
+import tempfile
 import time
 import shutil
 import glob
@@ -46,12 +47,11 @@ x = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32)
 y = np.array([[7, 8, 9], [10, 11, 12]]).astype(np.float32)
 
 
-def change_current_dump_json(file_name, dump_path):
+def change_current_dump_json(file_name, dump_path, dump_config_path):
     with open(file_name, 'r+') as f:
         data = json.load(f)
-
     data["common_dump_settings"]["path"] = dump_path
-    with open(file_name, 'w') as f:
+    with open(dump_config_path, 'w') as f:
         json.dump(data, f)
 
 
@@ -62,52 +62,49 @@ def change_current_dump_json(file_name, dump_path):
 def test_async_dump():
     context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
     pwd = os.getcwd()
-    dump_path = pwd + "/async_dump"
-    change_current_dump_json('async_dump.json', dump_path)
-    os.environ['MINDSPORE_DUMP_CONFIG'] = pwd + "/async_dump.json"
-    dump_file_path = dump_path + '/rank_0/Net/0/0/'
-    if os.path.isdir(dump_path):
-        shutil.rmtree(dump_path)
-    add = Net()
-    add(Tensor(x), Tensor(y))
-    time.sleep(5)
-    assert len(os.listdir(dump_file_path)) == 1
-
-    # Delete generated dump data
-    os.system("rm -rf {}".format(dump_path))
+    with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir:
+        dump_path = os.path.join(tmp_dir, 'async_dump')
+        dump_config_path = os.path.join(tmp_dir, 'async_dump.json')
+        change_current_dump_json('async_dump.json', dump_path, dump_config_path)
+        os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
+        dump_file_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
+        if os.path.isdir(dump_path):
+            shutil.rmtree(dump_path)
+        add = Net()
+        add(Tensor(x), Tensor(y))
+        time.sleep(5)
+        assert len(os.listdir(dump_file_path)) == 1
 
 
 def run_e2e_dump():
     if sys.platform != 'linux':
         return
     pwd = os.getcwd()
-    dump_path = pwd + '/e2e_dump'
-    change_current_dump_json('e2e_dump.json', dump_path)
-    os.environ['MINDSPORE_DUMP_CONFIG'] = pwd + '/e2e_dump.json'
-    dump_file_path = dump_path + '/rank_0/Net/0/0/'
-    if os.path.isdir(dump_path):
-        shutil.rmtree(dump_path)
-    add = Net()
-    add(Tensor(x), Tensor(y))
-    time.sleep(5)
-    if context.get_context("device_target") == "Ascend":
-        assert len(os.listdir(dump_file_path)) == 5
-        output_name = "Add.Add-op1.0.0.*.output.0.DefaultFormat.npy"
-    elif context.get_context("device_target") == "CPU":
-        assert len(os.listdir(dump_file_path)) == 5
-        output_name = "Add.Add-op3.0.0.*.output.0.DefaultFormat.npy"
-    else:
-        assert len(os.listdir(dump_file_path)) == 3
-        output_name = "Add.Add-op3.0.0.*.output.0.DefaultFormat.npy"
-    output_path = glob.glob(dump_file_path + output_name)[0]
-    real_path = os.path.realpath(output_path)
-    output = np.load(real_path)
-    expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32)
-    assert output.dtype == expect.dtype
-    assert np.array_equal(output, expect)
-
-    # Delete generated dump data
-    os.system("rm -rf {}".format(dump_path))
+    with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir:
+        dump_path = os.path.join(tmp_dir, 'e2e_dump')
+        dump_config_path = os.path.join(tmp_dir, 'e2e_dump.json')
+        change_current_dump_json('e2e_dump.json', dump_path, dump_config_path)
+        os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
+        dump_file_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
+        if os.path.isdir(dump_path):
+            shutil.rmtree(dump_path)
+        add = Net()
+        add(Tensor(x), Tensor(y))
+        if context.get_context("device_target") == "Ascend":
+            assert len(os.listdir(dump_file_path)) == 5
+            output_name = "Add.Add-op1.0.0.*.output.0.DefaultFormat.npy"
+        elif context.get_context("device_target") == "CPU":
+            assert len(os.listdir(dump_file_path)) == 5
+            output_name = "Add.Add-op3.0.0.*.output.0.DefaultFormat.npy"
+        else:
+            assert len(os.listdir(dump_file_path)) == 3
+            output_name = "Add.Add-op3.0.0.*.output.0.DefaultFormat.npy"
+        output_path = glob.glob(os.path.join(dump_file_path, output_name))[0]
+        real_path = os.path.realpath(output_path)
+        output = np.load(real_path)
+        expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32)
+        assert output.dtype == expect.dtype
+        assert np.array_equal(output, expect)
 
 
 @pytest.mark.level0
@@ -119,6 +116,17 @@ def test_e2e_dump():
     run_e2e_dump()
 
 
+@pytest.mark.level0
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+def test_e2e_dump_with_hccl_env():
+    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
+    os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
+    os.environ["RANK_ID"] = "4"
+    run_e2e_dump()
+
+
 @pytest.mark.level0
 @pytest.mark.platform_x86_cpu
 @pytest.mark.env_onecard
@@ -126,6 +134,17 @@ def test_cpu_e2e_dump():
     context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
     run_e2e_dump()
 
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+@pytest.mark.env_onecard
+def test_cpu_e2e_dump_with_hccl_set():
+    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
+    os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
+    os.environ["RANK_ID"] = "4"
+    run_e2e_dump()
+
+
 @pytest.mark.level0
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
@@ -133,6 +152,17 @@ def test_gpu_e2e_dump():
     context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
     run_e2e_dump()
 
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_gpu_e2e_dump_with_hccl_set():
+    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
+    os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
+    os.environ["RANK_ID"] = "4"
+    run_e2e_dump()
+
+
 class ReluReduceMeanDenseRelu(Cell):
     def __init__(self, kernel, bias, in_channel, num_class):
         super().__init__()
@@ -224,16 +254,15 @@ def test_dump_with_diagnostic_path():
     """
     context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
     pwd = os.getcwd()
-    change_current_dump_json('e2e_dump.json', '')
-    os.environ['MINDSPORE_DUMP_CONFIG'] = pwd + "/e2e_dump.json"
-    diagnose_path = pwd + "/e2e_dump"
-    os.environ['MS_DIAGNOSTIC_DATA_PATH'] = diagnose_path
-    dump_file_path = diagnose_path + '/debug_dump/rank_0/Net/0/0/'
-    if os.path.isdir(diagnose_path):
-        shutil.rmtree(diagnose_path)
-    add = Net()
-    add(Tensor(x), Tensor(y))
-    assert len(os.listdir(dump_file_path)) == 5
-
-    # Delete generated dump data
-    os.system("rm -rf {}".format(diagnose_path))
+    with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir:
+        dump_config_path = os.path.join(tmp_dir, 'e2e_dump.json')
+        change_current_dump_json('e2e_dump.json', '', dump_config_path)
+        os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
+        diagnose_path = os.path.join(tmp_dir, 'e2e_dump')
+        os.environ['MS_DIAGNOSTIC_DATA_PATH'] = diagnose_path
+        dump_file_path = os.path.join(diagnose_path, 'debug_dump', 'rank_0', 'Net', '0', '0')
+        if os.path.isdir(diagnose_path):
+            shutil.rmtree(diagnose_path)
+        add = Net()
+        add(Tensor(x), Tensor(y))
+        assert len(os.listdir(dump_file_path)) == 5
diff --git a/tests/st/dynamic_shape/test_ftrl.py b/tests/st/dynamic_shape/test_ftrl.py
index bc1ce5b5143..d063283a0de 100644
--- a/tests/st/dynamic_shape/test_ftrl.py
+++ b/tests/st/dynamic_shape/test_ftrl.py
@@ -56,7 +56,7 @@ def test_ftrl_net():
                                                  [[0.6821311, 0.6821311]],
                                                  [[0.6821311, 0.6821311]]]))
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_arm_ascend_training
 @pytest.mark.platform_x86_ascend_training
 @pytest.mark.env_onecard
diff --git a/tests/st/fusion/test_tbe_eltwise_fusion_1.py b/tests/st/fusion/test_tbe_eltwise_fusion_1.py
index 4f4494c49e6..dbdd7dd4784 100644
--- a/tests/st/fusion/test_tbe_eltwise_fusion_1.py
+++ b/tests/st/fusion/test_tbe_eltwise_fusion_1.py
@@ -36,7 +36,7 @@ class Net(nn.Cell):
         return x
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_arm_ascend_training
 @pytest.mark.platform_x86_ascend_training
 @pytest.mark.env_onecard
diff --git a/tests/st/fusion/test_tbe_eltwise_fusion_2.py b/tests/st/fusion/test_tbe_eltwise_fusion_2.py
index ee74a214b70..41bec156548 100644
--- a/tests/st/fusion/test_tbe_eltwise_fusion_2.py
+++ b/tests/st/fusion/test_tbe_eltwise_fusion_2.py
@@ -42,7 +42,7 @@ class Net(nn.Cell):
         return x
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_arm_ascend_training
 @pytest.mark.platform_x86_ascend_training
 @pytest.mark.env_onecard
diff --git a/tests/st/fusion/test_tbe_reduce_eltwise_fusion.py b/tests/st/fusion/test_tbe_reduce_eltwise_fusion.py
index 9b7328fd9b2..00fc98adc61 100644
--- a/tests/st/fusion/test_tbe_reduce_eltwise_fusion.py
+++ b/tests/st/fusion/test_tbe_reduce_eltwise_fusion.py
@@ -42,7 +42,7 @@ class Net(nn.Cell):
         return x
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_arm_ascend_training
 @pytest.mark.platform_x86_ascend_training
 @pytest.mark.env_onecard
diff --git a/tests/st/host_device/test_host_device_lenet.py b/tests/st/host_device/test_host_device_lenet.py
index 80bf7b578a4..a24bdcfa8b1 100644
--- a/tests/st/host_device/test_host_device_lenet.py
+++ b/tests/st/host_device/test_host_device_lenet.py
@@ -78,7 +78,7 @@ def train(net, data, label):
     assert np.all(diff < 1.e-6)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_arm_ascend_training
 @pytest.mark.platform_x86_ascend_training
 @pytest.mark.env_onecard
diff --git a/tests/st/mix_precision/test_mix_precision.py b/tests/st/mix_precision/test_mix_precision.py
index 00714222c99..302c90554f7 100644
--- a/tests/st/mix_precision/test_mix_precision.py
+++ b/tests/st/mix_precision/test_mix_precision.py
@@ -126,15 +126,15 @@ def test_sit_auto_mix_precision_model_o0():
     loss = nn.SoftmaxCrossEntropyWithLogits(sparse=False)
     model = Model(net, loss, opt, amp_level="O0")
     model.train(1, dataset1, dataset_sink_mode=False)
-    contend = read_validateir_file('./test_amp_o0')
+    contend = read_validateir_file('./test_amp_o0/rank_0/ir_dump')
     castnum = re.findall(r"Cast\(", contend)
     assert len(castnum) == 5
     clean_all_ir_files('./test_amp_o0')
     model.predict(Tensor(input_data))
-    contend = read_validateir_file('./test_amp_o0')
+    contend = read_validateir_file('./test_amp_o0/rank_0/ir_dump')
     castnum = re.findall(r"Cast\(", contend)
     assert len(castnum) == 11
-    clean_all_ir_files('./test_amp_o0')
+    clean_all_ir_files('./test_amp_o0/rank_0/ir_dump')
 
 
 @pytest.mark.level0
@@ -162,10 +162,10 @@ def test_sit_auto_mix_precision_model_o2():
     loss = nn.SoftmaxCrossEntropyWithLogits(sparse=False)
     model = Model(net, loss, opt, amp_level="O2")
     model.train(1, dataset1, dataset_sink_mode=False)
-    contend = read_validateir_file('./test_amp_o2')
+    contend = read_validateir_file('./test_amp_o2/rank_0/ir_dump')
     castnum = re.findall(r"Cast\(", contend)
     assert len(castnum) == 14
-    clean_all_ir_files('./test_amp_o2')
+    clean_all_ir_files('./test_amp_o2/rank_0/ir_dump')
     out_graph = model.predict(Tensor(input_data))
 
     # pynative mode
diff --git a/tests/st/model_zoo_tests/yolov3/src/yolov3.py b/tests/st/model_zoo_tests/yolov3/src/yolov3.py
index 643fe0be1d7..7ddf3ae695f 100644
--- a/tests/st/model_zoo_tests/yolov3/src/yolov3.py
+++ b/tests/st/model_zoo_tests/yolov3/src/yolov3.py
@@ -671,8 +671,7 @@ class TrainingWrapper(nn.Cell):
         if self.reducer_flag:
             # apply grad reducer on grads
             grads = self.grad_reducer(grads)
-        self.optimizer(grads)
-        return loss
+        return F.depend(loss, self.optimizer(grads))
 
 
 class YoloBoxScores(nn.Cell):
diff --git a/tests/st/networks/test_cpu_lenet.py b/tests/st/networks/test_cpu_lenet.py
index 6d25e6a4713..8b917d99d15 100644
--- a/tests/st/networks/test_cpu_lenet.py
+++ b/tests/st/networks/test_cpu_lenet.py
@@ -71,7 +71,7 @@ def train(net, data, label):
     assert res
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_cpu
 @pytest.mark.env_onecard
 def test_lenet():
diff --git a/tests/st/networks/test_gpu_lenet.py b/tests/st/networks/test_gpu_lenet.py
index c732ad44bfe..c2e25ff5b21 100644
--- a/tests/st/networks/test_gpu_lenet.py
+++ b/tests/st/networks/test_gpu_lenet.py
@@ -187,7 +187,7 @@ def create_dataset(data_path, batch_size=32, repeat_size=1,
     return mnist_ds
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
 def test_train_and_eval_lenet():
diff --git a/tests/st/ops/ascend/test_tbe_ops/Initialize.info b/tests/st/ops/ascend/test_tbe_ops/Initialize.info
index 47581920842..2e0c6330dbd 100644
--- a/tests/st/ops/ascend/test_tbe_ops/Initialize.info
+++ b/tests/st/ops/ascend/test_tbe_ops/Initialize.info
@@ -12,9 +12,7 @@
       "offlineTune": false,
       "op_bank_path": "",
       "op_bank_update": false,
-      "op_compiler_cache_dir": "",
-      "op_compiler_cache_mode": 0,
-      "op_debug_dir": "./",
+      "op_debug_dir": "./rank_0/",
       "op_debug_level": "0",
       "op_impl_mode": "",
       "op_impl_mode_list": [],
diff --git a/tests/st/ops/cpu/test_cpu_type.py b/tests/st/ops/cpu/test_cpu_type.py
index 55dfd5564cd..e28d7618945 100644
--- a/tests/st/ops/cpu/test_cpu_type.py
+++ b/tests/st/ops/cpu/test_cpu_type.py
@@ -57,7 +57,7 @@ class Net2(nn.Cell):
         return self.bias_add1(self.bias_add(x, b), c)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_cpu
 @pytest.mark.env_onecard
 def test_bias_add2():
diff --git a/tests/st/ops/cpu/test_dropout_op.py b/tests/st/ops/cpu/test_dropout_op.py
index 4fc1be596f1..06b0155fe66 100644
--- a/tests/st/ops/cpu/test_dropout_op.py
+++ b/tests/st/ops/cpu/test_dropout_op.py
@@ -33,7 +33,7 @@ class Net(nn.Cell):
         return self.dropout(x)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_cpu
 @pytest.mark.env_onecard
 def test_net():
@@ -54,7 +54,7 @@ class Net1(nn.Cell):
         return self.dropout(x)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_cpu
 @pytest.mark.env_onecard
 def test_net1():
@@ -75,7 +75,7 @@ class Net2(nn.Cell):
         return self.dropout(x)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_cpu
 @pytest.mark.env_onecard
 def test_net2():
diff --git a/tests/st/ops/cpu/test_gather_d_grad_op.py b/tests/st/ops/cpu/test_gather_d_grad_op.py
index 3260ad5da10..0a19a91b22c 100644
--- a/tests/st/ops/cpu/test_gather_d_grad_op.py
+++ b/tests/st/ops/cpu/test_gather_d_grad_op.py
@@ -46,7 +46,7 @@ class NetGatherDGrad(nn.Cell):
         return self.grad(self.network)(inputx, index, output_grad)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_cpu
 @pytest.mark.env_onecard
 def test_gatherd_grad_fp32():
@@ -64,7 +64,7 @@ def test_gatherd_grad_fp32():
     print(output_grad.asnumpy())
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_cpu
 @pytest.mark.env_onecard
 def test_gatherd_grad_fp16():
@@ -82,7 +82,7 @@ def test_gatherd_grad_fp16():
     print(output_grad.asnumpy())
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_cpu
 @pytest.mark.env_onecard
 def test_gatherd_grad_int32():
diff --git a/tests/st/ops/cpu/test_lstm_op.py b/tests/st/ops/cpu/test_lstm_op.py
index 52f61dfbbc8..e7687e4f3e4 100644
--- a/tests/st/ops/cpu/test_lstm_op.py
+++ b/tests/st/ops/cpu/test_lstm_op.py
@@ -254,7 +254,7 @@ class MultiLayerBiLstmNet(nn.Cell):
         return self.lstm(self.x, (self.h, self.c))
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_cpu
 @pytest.mark.env_onecard
 def test_multi_layer_bilstm():
@@ -345,7 +345,7 @@ class Net(nn.Cell):
         return self.lstm(self.x, (self.h, self.c))[0]
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_cpu
 @pytest.mark.env_onecard
 def test_grad():
diff --git a/tests/st/ops/cpu/test_minimum_grad_op.py b/tests/st/ops/cpu/test_minimum_grad_op.py
index 95e2ec071cc..d4731046f9f 100644
--- a/tests/st/ops/cpu/test_minimum_grad_op.py
+++ b/tests/st/ops/cpu/test_minimum_grad_op.py
@@ -63,7 +63,7 @@ def gen_data(inputA_np, inputB_np, grad_=None):
     return output
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_cpu
 @pytest.mark.env_onecard
 def test_min_tensor_grad_4d():
diff --git a/tests/st/ops/cpu/test_momentum_op.py b/tests/st/ops/cpu/test_momentum_op.py
index b35ec5da4ed..4d7e39c4da8 100644
--- a/tests/st/ops/cpu/test_momentum_op.py
+++ b/tests/st/ops/cpu/test_momentum_op.py
@@ -42,7 +42,7 @@ class MomentumNet(nn.Cell):
         return output
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_cpu
 @pytest.mark.env_onecard
 def test_momentum():
diff --git a/tests/st/ops/cpu/test_random_choice_with_mask_op.py b/tests/st/ops/cpu/test_random_choice_with_mask_op.py
index 47a4ac200a9..5ab7f77756a 100644
--- a/tests/st/ops/cpu/test_random_choice_with_mask_op.py
+++ b/tests/st/ops/cpu/test_random_choice_with_mask_op.py
@@ -109,8 +109,8 @@ def test_RCWM_1D():
     context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
     input_tensor = Tensor(
         np.array([1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1]).astype(np.bool))
-    expect_index = np.array([[0], [7], [9], [8], [8], [0],
-                             [2], [7], [0], [0]]).astype(np.int32)
+    expect_index = np.array([[11], [0], [8], [2], [9], [7],
+                             [10], [15], [0], [0]]).astype(np.int32)
     expect_mask = np.array(
         [True, True, True, True, True, True, True, True, False, False])
     rcwm = RCWM_1D()
diff --git a/tests/st/ops/cpu/test_softplus_grad_op.py b/tests/st/ops/cpu/test_softplus_grad_op.py
index 76879689960..5dc8cc5a3e9 100644
--- a/tests/st/ops/cpu/test_softplus_grad_op.py
+++ b/tests/st/ops/cpu/test_softplus_grad_op.py
@@ -48,7 +48,7 @@ class Grad(nn.Cell):
 @pytest.mark.level0
 @pytest.mark.platform_x86_cpu
 @pytest.mark.env_onecard
-def test_softplus_grad():
+def test_softplus_grad_1d_fp32():
     x = np.array([0.58401114, 0.68800163, 0.9760397, 0.14702141, 0.46563736, 0.9607501,
                   0.14567593, 0.12261796, 0.37054458, 0.46421242]).astype(np.float32)
     dy = np.array([0.5559598, 0.96994054, 0.24770357, 0.34646875, 0.2984393, 0.03287048,
@@ -67,7 +67,7 @@ def test_softplus_grad():
 @pytest.mark.level0
 @pytest.mark.platform_x86_cpu
 @pytest.mark.env_onecard
-def test_softplus_grad_fp16():
+def test_softplus_grad_3d_fp16():
     np.random.seed(42)
     x_np = np.random.randn(5, 3, 6).astype(np.float16)
     dy_np = np.random.randn(5, 3, 6).astype(np.float16)
@@ -76,3 +76,17 @@ def test_softplus_grad_fp16():
     output = grad(Tensor(x_np), Tensor(dy_np))
     expect = dy_np * np.exp(x_np) / (1 + np.exp(x_np))
     assert np.allclose(output[0].asnumpy(), expect, rtol=1e-2)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+@pytest.mark.env_onecard
+def test_softplus_grad_7d_fp32():
+    np.random.seed(20)
+    x_np = np.random.randn(5, 3, 6, 3, 4, 5, 6).astype(np.float32)
+    dy_np = np.random.randn(5, 3, 6, 3, 4, 5, 6).astype(np.float32)
+    net = SoftplusNet()
+    grad = Grad(net)
+    output = grad(Tensor(x_np), Tensor(dy_np))
+    expect = dy_np * np.exp(x_np) / (1 + np.exp(x_np))
+    assert np.allclose(output[0].asnumpy(), expect, rtol=1e-2)
diff --git a/tests/st/ops/cpu/test_softplus_op.py b/tests/st/ops/cpu/test_softplus_op.py
index 19af2a20762..87aada0feb8 100644
--- a/tests/st/ops/cpu/test_softplus_op.py
+++ b/tests/st/ops/cpu/test_softplus_op.py
@@ -40,7 +40,21 @@ def SoftplusCompute(x):
 @pytest.mark.level0
 @pytest.mark.platform_x86_cpu
 @pytest.mark.env_onecard
-def test_softplus_1d():
+def test_softplus_0d_fp32():
+    x_np = np.array(1.2, np.float32)
+    y_np = SoftplusCompute(x_np)
+
+    x_ms = Tensor(x_np)
+    net = SoftplusNet()
+    y_ms = net(x_ms)
+
+    assert np.allclose(y_np, y_ms.asnumpy())
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+@pytest.mark.env_onecard
+def test_softplus_1d_fp32():
     x_np = np.random.random((50,)).astype(np.float32)
     y_np = SoftplusCompute(x_np)
 
@@ -54,7 +68,7 @@ def test_softplus_1d():
 @pytest.mark.level0
 @pytest.mark.platform_x86_cpu
 @pytest.mark.env_onecard
-def test_softplus_2d():
+def test_softplus_2d_fp32():
     x_np = np.random.random((50, 40)).astype(np.float32)
     y_np = SoftplusCompute(x_np)
 
@@ -68,7 +82,7 @@ def test_softplus_2d():
 @pytest.mark.level0
 @pytest.mark.platform_x86_cpu
 @pytest.mark.env_onecard
-def test_softplus_4d():
+def test_softplus_4d_fp32():
     x_np = np.random.random((32, 3, 224, 224)).astype(np.float32)
     y_np = SoftplusCompute(x_np)
 
@@ -105,3 +119,17 @@ def test_softplus_4d_fp16():
     y_ms = net(x_ms)
 
     assert np.allclose(y_np, y_ms.asnumpy(), rtol=5e-3)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+@pytest.mark.env_onecard
+def test_softplus_7d_fp32():
+    x_np = np.random.random((32, 3, 20, 20, 20, 10, 10)).astype(np.float32)
+    y_np = SoftplusCompute(x_np)
+
+    x_ms = Tensor(x_np)
+    net = SoftplusNet()
+    y_ms = net(x_ms)
+
+    assert np.allclose(y_np, y_ms.asnumpy(), rtol=5e-3)
diff --git a/tests/st/ops/cpu/test_tile_op.py b/tests/st/ops/cpu/test_tile_op.py
index deafd8e5ef3..2568609fca6 100644
--- a/tests/st/ops/cpu/test_tile_op.py
+++ b/tests/st/ops/cpu/test_tile_op.py
@@ -35,7 +35,7 @@ class Net(nn.Cell):
 arr_x = np.array([[0], [1], [2], [3]]).astype(np.int32)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_cpu
 @pytest.mark.env_onecard
 def test_net():
@@ -48,7 +48,7 @@ def test_net():
 arr_x = np.array([[0], [1], [2], [3]]).astype(np.float64)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_cpu
 @pytest.mark.env_onecard
 def test_net_float64():
@@ -61,7 +61,7 @@ def test_net_float64():
 arr_x = np.array([[0], [1], [2], [3]]).astype(np.bool_)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_cpu
 @pytest.mark.env_onecard
 def test_net_bool():
diff --git a/tests/st/ops/gpu/test_error_on_dynamic_shape_input_op.py b/tests/st/ops/gpu/test_error_on_dynamic_shape_input_op.py
index 7431220c968..0a77215bfd7 100644
--- a/tests/st/ops/gpu/test_error_on_dynamic_shape_input_op.py
+++ b/tests/st/ops/gpu/test_error_on_dynamic_shape_input_op.py
@@ -46,7 +46,7 @@ def test_error_on_dynamic_shape_input_is_dynamic():
         error_on_dynamic_shape_input.infer_shape([-1, -1, -1])
     assert "Input is dynamically shaped" in str(info.value)
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
 def test_error_on_dynamic_shape_input_not_dynamic():
diff --git a/tests/st/ops/gpu/test_momentum_op.py b/tests/st/ops/gpu/test_momentum_op.py
index 51ec0ffc7aa..ddf70c430f2 100644
--- a/tests/st/ops/gpu/test_momentum_op.py
+++ b/tests/st/ops/gpu/test_momentum_op.py
@@ -42,7 +42,7 @@ class NetMomentum(nn.Cell):
         return output
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
 def test_momentum():
diff --git a/tests/st/ops/gpu/test_print_op.py b/tests/st/ops/gpu/test_print_op.py
index e8b890bbd04..48c325ab29c 100644
--- a/tests/st/ops/gpu/test_print_op.py
+++ b/tests/st/ops/gpu/test_print_op.py
@@ -118,84 +118,84 @@ def test_print_multiple_types():
     net(x, y, z)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
 def test_print_bool():
     print_testcase(np.bool)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
 def test_print_int8():
     print_testcase(np.int8)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
 def test_print_int16():
     print_testcase(np.int16)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
 def test_print_int32():
     print_testcase(np.int32)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
 def test_print_int64():
     print_testcase(np.int64)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
 def test_print_uint8():
     print_testcase(np.uint8)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
 def test_print_uint16():
     print_testcase(np.uint16)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
 def test_print_uint32():
     print_testcase(np.uint32)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
 def test_print_uint64():
     print_testcase(np.uint64)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
 def test_print_float16():
     print_testcase(np.float16)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
 def test_print_float32():
     print_testcase(np.float32)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
 def test_print_string():
diff --git a/tests/st/probability/distribution/test_categorical_gpu.py b/tests/st/probability/distribution/test_categorical_gpu.py
index 0ec57bcf4b7..fcaefe2a2e7 100644
--- a/tests/st/probability/distribution/test_categorical_gpu.py
+++ b/tests/st/probability/distribution/test_categorical_gpu.py
@@ -52,7 +52,7 @@ class CategoricalProb(nn.Cell):
 
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
 def test_probability_categorical_prob_cdf_probs_none():
diff --git a/tests/st/probability/distribution/test_cauchy_pynative.py b/tests/st/probability/distribution/test_cauchy_pynative.py
index 24b626c3f76..c99053c2d2a 100644
--- a/tests/st/probability/distribution/test_cauchy_pynative.py
+++ b/tests/st/probability/distribution/test_cauchy_pynative.py
@@ -36,7 +36,7 @@ class CauchyMean(nn.Cell):
 
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_arm_ascend_training
 @pytest.mark.env_onecard
 def test_probability_cauchy_mean_loc_scale_rand_2_ndarray():
@@ -61,7 +61,7 @@ class CauchyProb(nn.Cell):
         return out1, out2, out3, out4, out5, out6
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_arm_ascend_training
 @pytest.mark.env_onecard
 def test_probability_cauchy_prob_cdf_loc_scale_rand_4_ndarray():
diff --git a/tests/st/pynative/loss_scale/test_loss_scale.py b/tests/st/pynative/loss_scale/test_loss_scale.py
index 1c5a4a7a93f..3cbbaa819e7 100644
--- a/tests/st/pynative/loss_scale/test_loss_scale.py
+++ b/tests/st/pynative/loss_scale/test_loss_scale.py
@@ -193,7 +193,7 @@ def test_loss_scale_fp16_lr_overflow_set_sense_scale():
     assert output_1[0].asnumpy() == output_2[0].asnumpy()
     assert output_1[1].asnumpy() == output_2[1].asnumpy() == True
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_arm_ascend_training
 @pytest.mark.platform_x86_ascend_training
 @pytest.mark.env_onecard
diff --git a/tests/st/pynative/test_graph_param_transform.py b/tests/st/pynative/test_graph_param_transform.py
index d30bf32d10e..a6975fafefa 100644
--- a/tests/st/pynative/test_graph_param_transform.py
+++ b/tests/st/pynative/test_graph_param_transform.py
@@ -179,7 +179,7 @@ def test_parser_switch_layer_inputs_tuple():
     assert np.allclose(goodout.asnumpy(), netout.asnumpy(), 0, 0)
 
 
-@pytest.mark.level0
+@pytest.mark.level1
 @pytest.mark.platform_arm_ascend_training
 @pytest.mark.platform_x86_ascend_training
 @pytest.mark.env_onecard
diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt
index 86d21eef618..02960a70d38 100644
--- a/tests/ut/cpp/CMakeLists.txt
+++ b/tests/ut/cpp/CMakeLists.txt
@@ -184,6 +184,7 @@ list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/backend/optimizer/
 list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/backend/optimizer/gpu/batch_norm_add_relu_grad_fusion.cc")
 list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/backend/optimizer/gpu/batch_norm_relu_fusion.cc")
 list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/backend/optimizer/gpu/batch_norm_relu_grad_fusion.cc")
+list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/backend/kernel_compiler/tbe/ascend_kernel_compile.cc")
 
 add_library(_ut_mindspore_obj OBJECT ${MINDSPORE_SRC_LIST})
 add_library(_ut_ut_obj OBJECT ${UT_SRCS})
diff --git a/tests/ut/cpp/dataset/CMakeLists.txt b/tests/ut/cpp/dataset/CMakeLists.txt
index 1f33a1b4e3f..5211277faa9 100644
--- a/tests/ut/cpp/dataset/CMakeLists.txt
+++ b/tests/ut/cpp/dataset/CMakeLists.txt
@@ -16,7 +16,6 @@ SET(DE_UT_SRCS
         c_api_audio_r_to_z_test.cc
         c_api_cache_test.cc
         c_api_dataset_album_test.cc
-        c_api_audio_a_to_q_test.cc
         c_api_dataset_cifar_test.cc
         c_api_dataset_clue_test.cc
         c_api_dataset_coco_test.cc
diff --git a/tests/ut/cpp/dataset/c_api_audio_a_to_q_test.cc b/tests/ut/cpp/dataset/c_api_audio_a_to_q_test.cc
index 089029ffd13..42933e7f1ad 100644
--- a/tests/ut/cpp/dataset/c_api_audio_a_to_q_test.cc
+++ b/tests/ut/cpp/dataset/c_api_audio_a_to_q_test.cc
@@ -13,12 +13,14 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 #include "common/common.h"
 #include "include/api/types.h"
 #include "utils/log_adapter.h"
 
 #include "minddata/dataset/include/dataset/audio.h"
 #include "minddata/dataset/include/dataset/datasets.h"
+#include "minddata/dataset/include/dataset/transforms.h"
 
 using namespace mindspore::dataset;
 using mindspore::LogStream;
@@ -31,7 +33,7 @@ class MindDataTestPipeline : public UT::DatasetOpTesting {
 };
 
 TEST_F(MindDataTestPipeline, TestAmplitudeToDBPipeline) {
-  MS_LOG(INFO) << "Basic Function Test";
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAmplitudeToDBPipeline.";
   // Original waveform
   std::shared_ptr<SchemaObj> schema = Schema();
   ASSERT_OK(schema->add_column("inputData", mindspore::DataType::kNumberTypeFloat32, {2, 200}));
@@ -69,7 +71,7 @@ TEST_F(MindDataTestPipeline, TestAmplitudeToDBPipeline) {
 }
 
 TEST_F(MindDataTestPipeline, TestAmplitudeToDBWrongArgs) {
-  MS_LOG(INFO) << "Basic Function Test";
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAmplitudeToDBWrongArgs.";
   // Original waveform
   std::shared_ptr<SchemaObj> schema = Schema();
   ASSERT_OK(schema->add_column("inputData", mindspore::DataType::kNumberTypeFloat32, {2, 200}));
@@ -89,8 +91,8 @@ TEST_F(MindDataTestPipeline, TestAmplitudeToDBWrongArgs) {
   EXPECT_EQ(iter, nullptr);
 }
 
-TEST_F(MindDataTestPipeline, Level0_TestBandBiquad001) {
-  MS_LOG(INFO) << "Basic Function Test";
+TEST_F(MindDataTestPipeline, TestBandBiquadBasic) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBandBiquadBasic.";
   // Original waveform
   std::shared_ptr<SchemaObj> schema = Schema();
   ASSERT_OK(schema->add_column("inputData", mindspore::DataType::kNumberTypeFloat32, {2, 200}));
@@ -128,8 +130,8 @@ TEST_F(MindDataTestPipeline, Level0_TestBandBiquad001) {
   iter->Stop();
 }
 
-TEST_F(MindDataTestPipeline, Level0_TestBandBiquad002) {
-  MS_LOG(INFO) << "Wrong Arg.";
+TEST_F(MindDataTestPipeline, TestBandBiquadParamCheck) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBandBiquadParamCheck.";
   std::shared_ptr<SchemaObj> schema = Schema();
   // Original waveform
   ASSERT_OK(schema->add_column("inputData", mindspore::DataType::kNumberTypeFloat32, {2, 2}));
@@ -157,8 +159,8 @@ TEST_F(MindDataTestPipeline, Level0_TestBandBiquad002) {
   EXPECT_EQ(iter02, nullptr);
 }
 
-TEST_F(MindDataTestPipeline, Level0_TestAllpassBiquad001) {
-  MS_LOG(INFO) << "Basic Function Test";
+TEST_F(MindDataTestPipeline, TestAllpassBiquadBasic) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAllpassBiquadBasic.";
   // Original waveform
   std::shared_ptr<SchemaObj> schema = Schema();
   ASSERT_OK(schema->add_column("inputData", mindspore::DataType::kNumberTypeFloat32, {2, 200}));
@@ -196,8 +198,8 @@ TEST_F(MindDataTestPipeline, Level0_TestAllpassBiquad001) {
   iter->Stop();
 }
 
-TEST_F(MindDataTestPipeline, Level0_TestAllpassBiquad002) {
-  MS_LOG(INFO) << "Wrong Arg.";
+TEST_F(MindDataTestPipeline, TestAllpassBiquadParamCheck) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAllpassBiquadParamCheck.";
   std::shared_ptr<SchemaObj> schema = Schema();
   // Original waveform
   ASSERT_OK(schema->add_column("inputData", mindspore::DataType::kNumberTypeFloat32, {2, 2}));
@@ -225,8 +227,8 @@ TEST_F(MindDataTestPipeline, Level0_TestAllpassBiquad002) {
   EXPECT_EQ(iter02, nullptr);
 }
 
-TEST_F(MindDataTestPipeline, Level0_TestBandpassBiquad001) {
-  MS_LOG(INFO) << "Basic Function Test";
+TEST_F(MindDataTestPipeline, TestBandpassBiquadBasic) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBandpassBiquadBasic.";
   // Original waveform
   std::shared_ptr<SchemaObj> schema = Schema();
   ASSERT_OK(schema->add_column("inputData", mindspore::DataType::kNumberTypeFloat32, {2, 200}));
@@ -264,8 +266,8 @@ TEST_F(MindDataTestPipeline, Level0_TestBandpassBiquad001) {
   iter->Stop();
 }
 
-TEST_F(MindDataTestPipeline, Level0_TestBandpassBiquad002) {
-  MS_LOG(INFO) << "Wrong Arg.";
+TEST_F(MindDataTestPipeline, TestBandpassBiquadParamCheck) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBandpassBiquadParamCheck.";
   std::shared_ptr<SchemaObj> schema = Schema();
   // Original waveform
   ASSERT_OK(schema->add_column("inputData", mindspore::DataType::kNumberTypeFloat32, {2, 2}));
@@ -293,8 +295,8 @@ TEST_F(MindDataTestPipeline, Level0_TestBandpassBiquad002) {
   EXPECT_EQ(iter02, nullptr);
 }
 
-TEST_F(MindDataTestPipeline, Level0_TestBandrejectBiquad001) {
-  MS_LOG(INFO) << "Basic Function Test";
+TEST_F(MindDataTestPipeline, TestBandrejectBiquadBasic) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBandrejectBiquadBasic.";
   // Original waveform
   std::shared_ptr<SchemaObj> schema = Schema();
   ASSERT_OK(schema->add_column("inputData", mindspore::DataType::kNumberTypeFloat32, {2, 200}));
@@ -332,8 +334,8 @@ TEST_F(MindDataTestPipeline, Level0_TestBandrejectBiquad001) {
   iter->Stop();
 }
 
-TEST_F(MindDataTestPipeline, Level0_TestBandrejectBiquad002) {
-  MS_LOG(INFO) << "Wrong Arg.";
+TEST_F(MindDataTestPipeline, TestBandrejectBiquadParamCheck) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBandrejectBiquadParamCheck.";
   std::shared_ptr<SchemaObj> schema = Schema();
   // Original waveform
   ASSERT_OK(schema->add_column("inputData", mindspore::DataType::kNumberTypeFloat32, {2, 2}));
@@ -361,8 +363,8 @@ TEST_F(MindDataTestPipeline, Level0_TestBandrejectBiquad002) {
   EXPECT_EQ(iter02, nullptr);
 }
 
-TEST_F(MindDataTestPipeline, Level0_TestBassBiquad001) {
-  MS_LOG(INFO) << "Basic Function Test";
+TEST_F(MindDataTestPipeline, TestBassBiquadBasic) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBassBiquadBasic.";
   // Original waveform
   std::shared_ptr<SchemaObj> schema = Schema();
   ASSERT_OK(schema->add_column("inputData", mindspore::DataType::kNumberTypeFloat32, {2, 200}));
@@ -400,8 +402,8 @@ TEST_F(MindDataTestPipeline, Level0_TestBassBiquad001) {
   iter->Stop();
 }
 
-TEST_F(MindDataTestPipeline, Level0_TestBassBiquad002) {
-  MS_LOG(INFO) << "Wrong Arg.";
+TEST_F(MindDataTestPipeline, TestBassBiquadParamCheck) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBassBiquadParamCheck.";
   std::shared_ptr<SchemaObj> schema = Schema();
   // Original waveform
   ASSERT_OK(schema->add_column("inputData", mindspore::DataType::kNumberTypeFloat32, {2, 2}));
@@ -430,7 +432,7 @@ TEST_F(MindDataTestPipeline, Level0_TestBassBiquad002) {
 }
 
 TEST_F(MindDataTestPipeline, TestAnglePipeline) {
-  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAnglePipeline";
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAnglePipeline.";
 
   std::shared_ptr<SchemaObj> schema = Schema();
   ASSERT_OK(schema->add_column("complex", mindspore::DataType::kNumberTypeFloat32, {2, 2}));
@@ -468,7 +470,7 @@ TEST_F(MindDataTestPipeline, TestAnglePipeline) {
 }
 
 TEST_F(MindDataTestPipeline, TestAnglePipelineError) {
-  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAnglePipelineError";
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAnglePipelineError.";
 
   std::shared_ptr<SchemaObj> schema = Schema();
   ASSERT_OK(schema->add_column("complex", mindspore::DataType::kNumberTypeFloat32, {3, 2, 1}));
@@ -487,3 +489,121 @@ TEST_F(MindDataTestPipeline, TestAnglePipelineError) {
   std::unordered_map<std::string, mindspore::MSTensor> row;
   EXPECT_ERROR(iter->GetNextRow(&row));
 }
+
+TEST_F(MindDataTestPipeline, TestFrequencyMaskingPipeline) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFrequencyMaskingPipeline.";
+  // Original waveform
+  std::shared_ptr<SchemaObj> schema = Schema();
+  ASSERT_OK(schema->add_column("inputData", mindspore::DataType::kNumberTypeFloat32, {200, 200}));
+  std::shared_ptr<Dataset> ds = RandomData(50, schema);
+  EXPECT_NE(ds, nullptr);
+
+  ds = ds->SetNumWorkers(4);
+  EXPECT_NE(ds, nullptr);
+
+  auto frequencymasking = audio::FrequencyMasking(true, 6);
+
+  ds = ds->Map({frequencymasking});
+  EXPECT_NE(ds, nullptr);
+
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(ds, nullptr);
+
+  std::unordered_map<std::string, mindspore::MSTensor> row;
+  ASSERT_OK(iter->GetNextRow(&row));
+
+  std::vector<int64_t> expected = {200, 200};
+
+  int i = 0;
+  while (row.size() != 0) {
+    auto col = row["inputData"];
+    ASSERT_EQ(col.Shape(), expected);
+    ASSERT_EQ(col.Shape().size(), 2);
+    ASSERT_EQ(col.DataType(), mindspore::DataType::kNumberTypeFloat32);
+    ASSERT_OK(iter->GetNextRow(&row));
+    i++;
+  }
+  EXPECT_EQ(i, 50);
+
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestFrequencyMaskingWrongArgs) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFrequencyMaskingWrongArgs.";
+  // Original waveform
+  std::shared_ptr<SchemaObj> schema = Schema();
+  ASSERT_OK(schema->add_column("inputData", mindspore::DataType::kNumberTypeFloat32, {20, 20}));
+  std::shared_ptr<Dataset> ds = RandomData(50, schema);
+  EXPECT_NE(ds, nullptr);
+
+  ds = ds->SetNumWorkers(4);
+  EXPECT_NE(ds, nullptr);
+
+  auto frequencymasking = audio::FrequencyMasking(true, -100);
+
+  ds = ds->Map({frequencymasking});
+  EXPECT_NE(ds, nullptr);
+
+  // Filtered waveform by bandbiquad
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  // Expect failure
+  EXPECT_EQ(iter, nullptr);
+}
+
+TEST_F(MindDataTestPipeline, TestComplexNormBasic) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestComplexNormBasic.";
+
+  // Original waveform
+  std::shared_ptr<SchemaObj> schema = Schema();
+  ASSERT_OK(schema->add_column("inputData", mindspore::DataType::kNumberTypeInt64, {3, 2, 4, 2}));
+  std::shared_ptr<Dataset> ds = RandomData(50, schema);
+  EXPECT_NE(ds, nullptr);
+
+  ds = ds->SetNumWorkers(4);
+  EXPECT_NE(ds, nullptr);
+
+  auto ComplexNormOp = audio::ComplexNorm(3.0);
+
+  ds = ds->Map({ComplexNormOp});
+  EXPECT_NE(ds, nullptr);
+
+  // Filtered waveform by ComplexNorm
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(ds, nullptr);
+
+  std::unordered_map<std::string, mindspore::MSTensor> row;
+  ASSERT_OK(iter->GetNextRow(&row));
+
+  std::vector<int64_t> expected = {3, 2, 2};
+
+  int i = 0;
+  while (row.size() != 0) {
+    auto col = row["inputData"];
+    ASSERT_EQ(col.Shape(), expected);
+    ASSERT_EQ(col.DataType(), mindspore::DataType::kNumberTypeFloat32);
+    ASSERT_OK(iter->GetNextRow(&row));
+    i++;
+  }
+  EXPECT_EQ(i, 50);
+
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestComplexNormWrongArgs) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestComplexNormWrongArgs.";
+
+  // Original waveform
+  std::shared_ptr<SchemaObj> schema = Schema();
+  ASSERT_OK(schema->add_column("inputData", mindspore::DataType::kNumberTypeInt64, {3, 2, 4, 2}));
+  std::shared_ptr<Dataset> ds = RandomData(50, schema);
+  EXPECT_NE(ds, nullptr);
+
+  ds = ds->SetNumWorkers(4);
+  EXPECT_NE(ds, nullptr);
+
+  auto ComplexNormOp = audio::ComplexNorm(-10);
+
+  ds = ds->Map({ComplexNormOp});
+  std::shared_ptr<Iterator> iter1 = ds->CreateIterator();
+  EXPECT_EQ(iter1, nullptr);
+}
diff --git a/tests/ut/cpp/dataset/c_api_audio_r_to_z_test.cc b/tests/ut/cpp/dataset/c_api_audio_r_to_z_test.cc
index 902f906a5c2..cc833a53654 100644
--- a/tests/ut/cpp/dataset/c_api_audio_r_to_z_test.cc
+++ b/tests/ut/cpp/dataset/c_api_audio_r_to_z_test.cc
@@ -13,6 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 #include "common/common.h"
 #include "minddata/dataset/core/tensor.h"
 #include "minddata/dataset/include/dataset/datasets.h"
@@ -23,12 +24,73 @@ using mindspore::LogStream;
 using mindspore::ExceptionType::NoExceptionType;
 using mindspore::MsLogLevel::INFO;
 
-class MindDataTestPipeline : public UT::Common {
- public:
+class MindDataTestPipeline : public UT::DatasetOpTesting {
+ protected:
 };
 
+TEST_F(MindDataTestPipeline, TestTimeMaskingPipeline) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTimeMaskingPipeline.";
+  // Original waveform
+  std::shared_ptr<SchemaObj> schema = Schema();
+  ASSERT_OK(schema->add_column("inputData", mindspore::DataType::kNumberTypeFloat32, {2, 200}));
+  std::shared_ptr<Dataset> ds = RandomData(50, schema);
+  EXPECT_NE(ds, nullptr);
+
+  ds = ds->SetNumWorkers(4);
+  EXPECT_NE(ds, nullptr);
+
+  auto timemasking = audio::TimeMasking(true, 6);
+
+  ds = ds->Map({timemasking});
+  EXPECT_NE(ds, nullptr);
+
+  // Filtered waveform by bandbiquad
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(ds, nullptr);
+
+  std::unordered_map<std::string, mindspore::MSTensor> row;
+  ASSERT_OK(iter->GetNextRow(&row));
+
+  std::vector<int64_t> expected = {2, 200};
+
+  int i = 0;
+  while (row.size() != 0) {
+    auto col = row["inputData"];
+    ASSERT_EQ(col.Shape(), expected);
+    ASSERT_EQ(col.Shape().size(), 2);
+    ASSERT_EQ(col.DataType(), mindspore::DataType::kNumberTypeFloat32);
+    ASSERT_OK(iter->GetNextRow(&row));
+    i++;
+  }
+  EXPECT_EQ(i, 50);
+
+  iter->Stop();
+}
+
+TEST_F(MindDataTestPipeline, TestTimeMaskingWrongArgs) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTimeMaskingWrongArgs.";
+  // Original waveform
+  std::shared_ptr<SchemaObj> schema = Schema();
+  ASSERT_OK(schema->add_column("inputData", mindspore::DataType::kNumberTypeFloat32, {2, 20}));
+  std::shared_ptr<Dataset> ds = RandomData(50, schema);
+  EXPECT_NE(ds, nullptr);
+
+  ds = ds->SetNumWorkers(4);
+  EXPECT_NE(ds, nullptr);
+
+  auto timemasking = audio::TimeMasking(true, -100);
+
+  ds = ds->Map({timemasking});
+  EXPECT_NE(ds, nullptr);
+
+  // Filtered waveform by bandbiquad
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  // Expect failure
+  EXPECT_EQ(iter, nullptr);
+}
+
 TEST_F(MindDataTestPipeline, TestTimeStretchPipeline) {
-  MS_LOG(INFO) << "Doing test TimeStretchOp with custom param value. Pipeline.";
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTimeStretchPipeline.";
   // op param
   int freq = 1025;
   int hop_length = 512;
@@ -54,7 +116,7 @@ TEST_F(MindDataTestPipeline, TestTimeStretchPipeline) {
   std::unordered_map<std::string, mindspore::MSTensor> row;
   ASSERT_OK(iter->GetNextRow(&row));
 
-  std::vector<int64_t> expected = {2, freq, int(std::ceil(400 / rate)), 2};
+  std::vector<int64_t> expected = {2, freq, static_cast<int64_t>(std::ceil(400 / rate)), 2};
 
   int i = 0;
   while (row.size() != 0) {
@@ -70,7 +132,7 @@ TEST_F(MindDataTestPipeline, TestTimeStretchPipeline) {
 }
 
 TEST_F(MindDataTestPipeline, TestTimeStretchPipelineWrongArgs) {
-  MS_LOG(INFO) << "Doing test TimeStretchOp with wrong param value. Pipeline.";
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTimeStretchPipelineWrongArgs.";
   // op param
   int freq = 1025;
   int hop_length = 512;
@@ -93,4 +155,4 @@ TEST_F(MindDataTestPipeline, TestTimeStretchPipelineWrongArgs) {
   std::shared_ptr<Iterator> iter = ds->CreateIterator();
   // Expect failure
   EXPECT_EQ(iter, nullptr);
-}
\ No newline at end of file
+}
diff --git a/tests/ut/cpp/dataset/c_api_vision_a_to_q_test.cc b/tests/ut/cpp/dataset/c_api_vision_a_to_q_test.cc
index 0647ae7bdd0..2d2678eb46f 100644
--- a/tests/ut/cpp/dataset/c_api_vision_a_to_q_test.cc
+++ b/tests/ut/cpp/dataset/c_api_vision_a_to_q_test.cc
@@ -27,9 +27,8 @@ class MindDataTestPipeline : public UT::DatasetOpTesting {
 
 // Tests for vision C++ API A to Q TensorTransform Operations (in alphabetical order)
 
-TEST_F(MindDataTestPipeline, TestAdjustGammaSuccess1) {
-  // pipeline 3-channel
-  MS_LOG(INFO) << "Pipeline Test.";
+TEST_F(MindDataTestPipeline, TestAdjustGamma3Channel) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAdjustGamma3Channel.";
   std::string MindDataPath = "data/dataset";
   std::string folder_path = MindDataPath + "/testImageNetData/train/";
   std::shared_ptr<Dataset> ds1 = ImageFolder(folder_path, true, std::make_shared<RandomSampler>(false, 2));
@@ -65,9 +64,8 @@ TEST_F(MindDataTestPipeline, TestAdjustGammaSuccess1) {
   iter2->Stop();
 }
 
-TEST_F(MindDataTestPipeline, TestAdjustGammaSuccess2) {
-  // pipeline 1-channel
-  MS_LOG(INFO) << "Pipeline Test.";
+TEST_F(MindDataTestPipeline, TestAdjustGamma1Channel) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAdjustGamma1Channel.";
   std::string MindDataPath = "data/dataset";
   std::string folder_path = MindDataPath + "/testImageNetData/train/";
   std::shared_ptr<Dataset> ds1 = ImageFolder(folder_path, true, std::make_shared<RandomSampler>(false, 2));
@@ -105,8 +103,7 @@ TEST_F(MindDataTestPipeline, TestAdjustGammaSuccess2) {
 }
 
 TEST_F(MindDataTestPipeline, TestAdjustGammaParamCheck) {
-  // pipeline 3-channel
-  MS_LOG(INFO) << "Pipeline Test.";
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAdjustGammaParamCheck.";
   std::string MindDataPath = "data/dataset";
   std::string folder_path = MindDataPath + "/testImageNetData/train/";
   std::shared_ptr<Dataset> ds = ImageFolder(folder_path, true, std::make_shared<RandomSampler>(false, 2));
diff --git a/tests/ut/cpp/dataset/c_api_vision_r_to_z_test.cc b/tests/ut/cpp/dataset/c_api_vision_r_to_z_test.cc
index ebffd3807ee..33630b22e1b 100644
--- a/tests/ut/cpp/dataset/c_api_vision_r_to_z_test.cc
+++ b/tests/ut/cpp/dataset/c_api_vision_r_to_z_test.cc
@@ -312,3 +312,41 @@ TEST_F(MindDataTestPipeline, TestRotatePass) {
   // Manually terminate the pipeline
   iter->Stop();
 }
+
+TEST_F(MindDataTestPipeline, TestRGB2BGR) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRGB2BGR.";
+  // create two imagenet dataset
+  std::string MindDataPath = "data/dataset";
+  std::string folder_path = MindDataPath + "/testImageNetData/train/";
+  std::shared_ptr<Dataset> ds1 = ImageFolder(folder_path, true, std::make_shared<RandomSampler>(false, 2));
+  EXPECT_NE(ds1, nullptr);
+  std::shared_ptr<Dataset> ds2 = ImageFolder(folder_path, true, std::make_shared<RandomSampler>(false, 2));
+  EXPECT_NE(ds2, nullptr);
+
+  auto rgb2bgr_op = vision::RGB2BGR();
+
+  ds1 = ds1->Map({rgb2bgr_op});
+  EXPECT_NE(ds1, nullptr);
+
+  std::shared_ptr<Iterator> iter1 = ds1->CreateIterator();
+  EXPECT_NE(iter1, nullptr);
+  std::unordered_map<std::string, mindspore::MSTensor> row1;
+  iter1->GetNextRow(&row1);
+
+  std::shared_ptr<Iterator> iter2 = ds2->CreateIterator();
+  EXPECT_NE(iter2, nullptr);
+  std::unordered_map<std::string, mindspore::MSTensor> row2;
+  iter2->GetNextRow(&row2);
+
+  uint64_t i = 0;
+  while (row1.size() != 0) {
+    i++;
+    auto image =row1["image"];
+    iter1->GetNextRow(&row1);
+    iter2->GetNextRow(&row2);
+  }
+  EXPECT_EQ(i, 2);
+
+  iter1->Stop();
+  iter2->Stop();
+}
diff --git a/tests/ut/cpp/dataset/cmu_arctic_test.cc b/tests/ut/cpp/dataset/cmu_arctic_test.cc
deleted file mode 100644
index f799ebc897e..00000000000
--- a/tests/ut/cpp/dataset/cmu_arctic_test.cc
+++ /dev/null
@@ -1,145 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include <fstream>
-#include <iostream>
-#include <memory>
-#include <string>
-
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h> 
-
-#include "utils/ms_utils.h"
-#include "common/common.h"
-#include "minddata/dataset/core/client.h"
-#include "minddata/dataset/core/global_context.h"
-#include "minddata/dataset/engine/datasetops/source/cmu_arctic_op.h"
-#include "minddata/dataset/engine/datasetops/source/sampler/distributed_sampler.h"
-#include "minddata/dataset/engine/datasetops/source/sampler/pk_sampler.h"
-#include "minddata/dataset/engine/datasetops/source/sampler/random_sampler.h"
-#include "minddata/dataset/engine/datasetops/source/sampler/sampler.h"
-#include "minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h"
-#include "minddata/dataset/engine/datasetops/source/sampler/subset_random_sampler.h"
-#include "minddata/dataset/engine/datasetops/source/sampler/weighted_random_sampler.h"
-#include "minddata/dataset/include/dataset/datasets.h"
-#include "minddata/dataset/util/path.h"
-#include "minddata/dataset/util/status.h"
-#include "gtest/gtest.h"
-#include "utils/log_adapter.h"
-#include "securec.h"
-
-namespace common = mindspore::common;
-using namespace mindspore::dataset;
-using mindspore::LogStream;
-using mindspore::ExceptionType::NoExceptionType;
-using mindspore::MsLogLevel::ERROR;
-
-std::shared_ptr<RepeatOp> Repeat(int repeat_cnt);
-
-std::shared_ptr<ExecutionTree> Build(std::vector<std::shared_ptr<DatasetOp>> ops);
-
-class MindDataTestCmuArcticSampler : public UT::DatasetOpTesting {
- protected:
-};
-
-TEST_F(MindDataTestCmuArcticSampler, TestSequentialCmuArcticWithRepeat) {
-  std::string folder_path = datasets_root_path_ + "/testCmuArcticData/";
-  int64_t num_samples = 10;
-  int64_t start_index = 0;
-  std::shared_ptr<Dataset> ds =
-    CmuArctic(folder_path, "aew", std::make_shared<SequentialSampler>(start_index, num_samples));
-  EXPECT_NE(ds, nullptr);
-  ds = ds->Repeat(2);
-  EXPECT_NE(ds, nullptr);
-  std::shared_ptr<Iterator> iter = ds->CreateIterator();
-  EXPECT_NE(iter, nullptr);
-  std::unordered_map<std::string, mindspore::MSTensor> row;
-  ASSERT_OK(iter->GetNextRow(&row));
-
-  std::string_view utterance;
-  std::string_view utterance_id;
-  uint32_t rate;
-  
-  uint64_t i = 0;
-  while (row.size() != 0) {
-
-    auto waveform = row["waveform"];
-    auto sample_rate = row["sample_rate"];
-    auto utterance_ = row["utterance"];
-    auto utterance_id_ = row["utterance_id"];
-
-    MS_LOG(ERROR) << "Tensor image shape: " << waveform.Shape();
-
-    std::shared_ptr<Tensor> t_rate;
-    ASSERT_OK(Tensor::CreateFromMSTensor(sample_rate, &t_rate));
-    ASSERT_OK(t_rate->GetItemAt<uint32_t>(&rate, {}));
-    MS_LOG(ERROR) << "Tensor rate: " << rate;
-
-    std::shared_ptr<Tensor> t_utterance;
-    ASSERT_OK(Tensor::CreateFromMSTensor(utterance_, &t_utterance));
-    ASSERT_OK(t_utterance->GetItemAt(&utterance, {}));
-    MS_LOG(ERROR) << "Tensor utterance value: " << utterance;
-
-    std::shared_ptr<Tensor> t_utterance_id;
-    ASSERT_OK(Tensor::CreateFromMSTensor(utterance_id_, &t_utterance_id));
-    ASSERT_OK(t_utterance_id->GetItemAt(&utterance_id, {}));
-    MS_LOG(ERROR) << "Tensor utterance_id value: " << utterance_id;
-
-
-    ASSERT_OK(iter->GetNextRow(&row));
-    i++;
-  }
-
-  EXPECT_EQ(i, 20);
-  iter->Stop();
-}
-
-// TEST_F(MindDataTestMnistSampler, TestSequentialImageFolderWithRepeatBatch) {
-//   std::string folder_path = datasets_root_path_ + "/testMnistData/";
-//   int64_t num_samples = 10;
-//   int64_t start_index = 0;
-//   std::shared_ptr<Dataset> ds =
-//     Mnist(folder_path, "all", std::make_shared<SequentialSampler>(start_index, num_samples));
-//   EXPECT_NE(ds, nullptr);
-//   ds = ds->Repeat(2);
-//   EXPECT_NE(ds, nullptr);
-//   ds = ds->Batch(5);
-//   EXPECT_NE(ds, nullptr);
-//   std::shared_ptr<Iterator> iter = ds->CreateIterator();
-//   EXPECT_NE(iter, nullptr);
-//   std::vector<std::vector<uint32_t>> expected = {{0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}};
-//   std::unordered_map<std::string, mindspore::MSTensor> row;
-//   ASSERT_OK(iter->GetNextRow(&row));
-//   uint64_t i = 0;
-//   while (row.size() != 0) {
-//     auto image = row["image"];
-//     auto label = row["label"];
-//     MS_LOG(INFO) << "Tensor image shape: " << image.Shape();
-//     TEST_MS_LOG_MSTENSOR(INFO, "Tensor label: ", label);
-//     std::shared_ptr<Tensor> de_expected_label;
-//     ASSERT_OK(Tensor::CreateFromVector(expected[i % 4], &de_expected_label));
-//     mindspore::MSTensor expected_label =
-//       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_label));
-//     EXPECT_MSTENSOR_EQ(label, expected_label);
-//     ASSERT_OK(iter->GetNextRow(&row));
-//     i++;
-//   }
-//   EXPECT_EQ(i, 4);
-//   iter->Stop();
-// }
-
-
diff --git a/tests/ut/cpp/dataset/common/bboxop_common.cc b/tests/ut/cpp/dataset/common/bboxop_common.cc
index 70c794856c0..18819b9a88f 100644
--- a/tests/ut/cpp/dataset/common/bboxop_common.cc
+++ b/tests/ut/cpp/dataset/common/bboxop_common.cc
@@ -67,8 +67,8 @@ void BBoxOpCommon::GetInputImagesAndAnnotations(const std::string &dir, std::siz
     EXPECT_TRUE(dir_path.Exists());
   }
   // get image file paths
-  while (image_dir_itr->hasNext()) {
-    Path image_path = image_dir_itr->next();
+  while (image_dir_itr->HasNext()) {
+    Path image_path = image_dir_itr->Next();
     if (image_path.Extension() == std::string(kImageExt)) {
       paths_to_fetch.push_back(image_path.toString());
     }
diff --git a/tests/ut/cpp/dataset/data_helper_test.cc b/tests/ut/cpp/dataset/data_helper_test.cc
index b1ffefe6b71..5600e479a0f 100644
--- a/tests/ut/cpp/dataset/data_helper_test.cc
+++ b/tests/ut/cpp/dataset/data_helper_test.cc
@@ -50,7 +50,7 @@ TEST_F(MindDataTestDataHelper, MindDataTestHelper) {
   std::string file_path = datasets_root_path_ + "/testAlbum/images/1.json";
   DataHelper dh; 
   std::vector<std::string> new_label = {"3", "4"};
-  Status rc = dh.UpdateArray(file_path, "label", new_label); 
+  Status rc = dh.UpdateArray(file_path, "label", new_label);
   if (rc.IsError()) {
     MS_LOG(ERROR) << "Return code error detected during label update: "  << ".";
     EXPECT_TRUE(false);
diff --git a/tests/ut/cpp/dataset/deserialize_test.cc b/tests/ut/cpp/dataset/deserialize_test.cc
index 61b8ada1371..a929d373579 100644
--- a/tests/ut/cpp/dataset/deserialize_test.cc
+++ b/tests/ut/cpp/dataset/deserialize_test.cc
@@ -462,6 +462,7 @@ TEST_F(MindDataTestDeserialize, TestDeserializeFill) {
   std::shared_ptr<TensorOperation> operation2 = std::make_shared<text::ToNumberOperation>("int32_t");
   std::vector<std::shared_ptr<TensorOperation>> ops = {operation1, operation2};
   ds = std::make_shared<MapNode>(ds, ops);
+  ds = std::make_shared<TransferNode>(ds, "queue", "type", 1, true, 10, true);
   compare_dataset(ds);
 }
 
@@ -482,3 +483,19 @@ TEST_F(MindDataTestDeserialize, TestDeserializeTensor) {
   json_ss1 << json_obj1;
   EXPECT_EQ(json_ss.str(), json_ss1.str());
 }
+
+// Helper function to get the session id from SESSION_ID env variable
+Status GetSessionFromEnv(session_id_type *session_id);
+
+TEST_F(MindDataTestDeserialize, DISABLED_TestDeserializeCache) {
+  MS_LOG(INFO) << "Doing MindDataTestDeserialize-Cache.";
+  std::string data_dir = "./data/dataset/testCache";
+  std::string usage = "all";
+  session_id_type env_session;
+  ASSERT_TRUE(GetSessionFromEnv(&env_session));
+  std::shared_ptr<DatasetCache> some_cache = CreateDatasetCache(env_session, 0, false, "127.0.0.1", 50052, 1, 1);
+
+  std::shared_ptr<SamplerObj> sampler = std::make_shared<SequentialSamplerObj>(0, 10);
+  std::shared_ptr<DatasetNode> ds = std::make_shared<Cifar10Node>(data_dir, usage, sampler, some_cache);
+  compare_dataset(ds);
+}
\ No newline at end of file
diff --git a/tests/ut/cpp/dataset/execute_test.cc b/tests/ut/cpp/dataset/execute_test.cc
index c7069a5b2f2..6aef3af925a 100644
--- a/tests/ut/cpp/dataset/execute_test.cc
+++ b/tests/ut/cpp/dataset/execute_test.cc
@@ -35,7 +35,7 @@ class MindDataTestExecute : public UT::DatasetOpTesting {
 };
 
 TEST_F(MindDataTestExecute, TestAllpassBiquadWithEager) {
-  MS_LOG(INFO) << "Basic Function Test With Eager.";
+  MS_LOG(INFO) << "Doing MindDataTestExecute-TestAllpassBiquadWithEager.";
   // Original waveform
   std::vector<float> labels = {
     2.716064453125000000e-03, 6.347656250000000000e-03, 9.246826171875000000e-03, 1.089477539062500000e-02,
@@ -54,7 +54,7 @@ TEST_F(MindDataTestExecute, TestAllpassBiquadWithEager) {
 }
 
 TEST_F(MindDataTestExecute, TestAllpassBiquadWithWrongArg) {
-  MS_LOG(INFO) << "Wrong Arg.";
+  MS_LOG(INFO) << "Doing MindDataTestExecute-TestAllpassBiquadWithWrongArg.";
   std::vector<double> labels = {
     2.716064453125000000e-03, 6.347656250000000000e-03, 9.246826171875000000e-03, 1.089477539062500000e-02,
     1.138305664062500000e-02, 1.156616210937500000e-02, 1.394653320312500000e-02, 1.550292968750000000e-02,
@@ -72,9 +72,8 @@ TEST_F(MindDataTestExecute, TestAllpassBiquadWithWrongArg) {
   EXPECT_FALSE(s01.IsOk());
 }
 
-TEST_F(MindDataTestExecute, TestAdjustGammaEager1) {
-  // 3-channel eager
-  MS_LOG(INFO) << "3-channel image test";
+TEST_F(MindDataTestExecute, TestAdjustGammaEager3Channel) {
+  MS_LOG(INFO) << "Doing MindDataTestExecute-TestAdjustGammaEager3Channel.";
   // Read images
   auto image = ReadFileToTensor("data/dataset/apple.jpg");
 
@@ -87,9 +86,8 @@ TEST_F(MindDataTestExecute, TestAdjustGammaEager1) {
   EXPECT_EQ(rc, Status::OK());
 }
 
-TEST_F(MindDataTestExecute, TestAdjustGammaEager2) {
-  // 1-channel eager
-  MS_LOG(INFO) << "1-channel image test";
+TEST_F(MindDataTestExecute, TestAdjustGammaEager1Channel) {
+  MS_LOG(INFO) << "Doing MindDataTestExecute-TestAdjustGammaEager1Channel.";
   auto m1 = ReadFileToTensor("data/dataset/apple.jpg");
   // Transform params
   auto decode = vision::Decode();
@@ -102,7 +100,7 @@ TEST_F(MindDataTestExecute, TestAdjustGammaEager2) {
 }
 
 TEST_F(MindDataTestExecute, TestAmplitudeToDB) {
-  MS_LOG(INFO) << "Basic Function Test With Eager.";
+  MS_LOG(INFO) << "Doing MindDataTestExecute-TestAmplitudeToDB.";
   // Original waveform
   std::vector<float> labels = {
     2.716064453125000000e-03, 6.347656250000000000e-03, 9.246826171875000000e-03, 1.089477539062500000e-02,
@@ -122,7 +120,7 @@ TEST_F(MindDataTestExecute, TestAmplitudeToDB) {
 }
 
 TEST_F(MindDataTestExecute, TestAmplitudeToDBWrongArgs) {
-  MS_LOG(INFO) << "Wrong Arg.";
+  MS_LOG(INFO) << "Doing MindDataTestExecute-TestAmplitudeToDBWrongArgs.";
   // Original waveform
   std::vector<float> labels = {
     2.716064453125000000e-03, 6.347656250000000000e-03, 9.246826171875000000e-03, 1.089477539062500000e-02,
@@ -142,7 +140,7 @@ TEST_F(MindDataTestExecute, TestAmplitudeToDBWrongArgs) {
 }
 
 TEST_F(MindDataTestExecute, TestAmplitudeToDBWrongInput) {
-  MS_LOG(INFO) << "Wrong Input.";
+  MS_LOG(INFO) << "Doing MindDataTestExecute-TestAmplitudeToDBWrongInput.";
   // Original waveform
   std::vector<float> labels = {
     2.716064453125000000e-03, 6.347656250000000000e-03, 9.246826171875000000e-03, 1.089477539062500000e-02,
@@ -161,7 +159,7 @@ TEST_F(MindDataTestExecute, TestAmplitudeToDBWrongInput) {
 }
 
 TEST_F(MindDataTestExecute, TestComposeTransforms) {
-  MS_LOG(INFO) << "Doing TestComposeTransforms.";
+  MS_LOG(INFO) << "Doing MindDataTestExecute-TestComposeTransforms.";
 
   // Read images
   auto image = ReadFileToTensor("data/dataset/apple.jpg");
@@ -197,8 +195,34 @@ TEST_F(MindDataTestExecute, TestCrop) {
   EXPECT_EQ(image.Shape()[1], 15);
 }
 
+TEST_F(MindDataTestExecute, TestFrequencyMasking) {
+  MS_LOG(INFO) << "Doing MindDataTestExecute-TestFrequencyMasking.";
+  std::shared_ptr<Tensor> input_tensor_;
+  TensorShape s = TensorShape({6, 2});
+  ASSERT_OK(Tensor::CreateFromVector(
+    std::vector<float>({1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f}), s, &input_tensor_));
+  auto input_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(input_tensor_));
+  std::shared_ptr<TensorTransform> frequency_masking_op = std::make_shared<audio::FrequencyMasking>(true, 2);
+  mindspore::dataset::Execute transform({frequency_masking_op});
+  Status status = transform(input_tensor, &input_tensor);
+  EXPECT_TRUE(status.IsOk());
+}
+
+TEST_F(MindDataTestExecute, TestTimeMasking) {
+  MS_LOG(INFO) << "Doing MindDataTestExecute-TestTimeMasking.";
+  std::shared_ptr<Tensor> input_tensor_;
+  TensorShape s = TensorShape({2, 6});
+  ASSERT_OK(Tensor::CreateFromVector(
+    std::vector<float>({1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f}), s, &input_tensor_));
+  auto input_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(input_tensor_));
+  std::shared_ptr<TensorTransform> time_masking_op = std::make_shared<audio::TimeMasking>(true, 2);
+  mindspore::dataset::Execute transform({time_masking_op});
+  Status status = transform(input_tensor, &input_tensor);
+  EXPECT_TRUE(status.IsOk());
+}
+
 TEST_F(MindDataTestExecute, TestTimeStretchEager) {
-  MS_LOG(INFO) << "Doing test TimeStretchOp with custom param value. Eager.";
+  MS_LOG(INFO) << "Doing MindDataTestExecute-TestTimeStretchEager.";
   std::shared_ptr<Tensor> input_tensor_;
   // op param
   int freq = 4;
@@ -222,8 +246,8 @@ TEST_F(MindDataTestExecute, TestTimeStretchEager) {
   EXPECT_TRUE(status.IsOk());
 }
 
-TEST_F(MindDataTestExecute, TestTimeStretchParamCheck1) {
-  MS_LOG(INFO) << "Doing MindDataTestTimeStretch-TestTimeStretchParamCheck with invalid parameters.";
+TEST_F(MindDataTestExecute, TestTimeStretchParamCheck) {
+  MS_LOG(INFO) << "Doing MindDataTestTimeStretch-TestTimeStretchParamCheck.";
   // Create an input
   std::shared_ptr<Tensor> input_tensor_;
   std::shared_ptr<Tensor> output_tensor;
@@ -233,26 +257,15 @@ TEST_F(MindDataTestExecute, TestTimeStretchParamCheck1) {
                         1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f}),
     s, &input_tensor_));
   auto input_ms = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(input_tensor_));
-  std::shared_ptr<TensorTransform> timestretch = std::make_shared<audio::TimeStretch>(4, 512, -2);
-  mindspore::dataset::Execute Transform({timestretch});
-  Status status = Transform(input_ms, &input_ms);
-  EXPECT_FALSE(status.IsOk());
-}
 
-TEST_F(MindDataTestExecute, TestTimeStretchParamCheck2) {
-  MS_LOG(INFO) << "Doing MindDataTestTimeStretch-TestTimeStretchParamCheck with invalid parameters.";
-  // Create an input
-  std::shared_ptr<Tensor> input_tensor_;
-  std::shared_ptr<Tensor> output_tensor;
-  TensorShape s = TensorShape({1, 4, 3, 2});
-  ASSERT_OK(Tensor::CreateFromVector(
-    std::vector<float>({1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f,
-                        1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f}),
-    s, &input_tensor_));
-  auto input_ms = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(input_tensor_));
-  std::shared_ptr<TensorTransform> timestretch = std::make_shared<audio::TimeStretch>(4, -512, 2);
-  mindspore::dataset::Execute Transform({timestretch});
-  Status status = Transform(input_ms, &input_ms);
+  std::shared_ptr<TensorTransform> time_stretch1 = std::make_shared<audio::TimeStretch>(4, 512, -2);
+  mindspore::dataset::Execute Transform1({time_stretch1});
+  Status status = Transform1(input_ms, &input_ms);
+  EXPECT_FALSE(status.IsOk());
+
+  std::shared_ptr<TensorTransform> time_stretch2 = std::make_shared<audio::TimeStretch>(4, -512, 2);
+  mindspore::dataset::Execute Transform2({time_stretch2});
+  status = Transform2(input_ms, &input_ms);
   EXPECT_FALSE(status.IsOk());
 }
 
@@ -485,7 +498,7 @@ TEST_F(MindDataTestExecute, TestResizeWithBBox) {
 }
 
 TEST_F(MindDataTestExecute, TestBandBiquadWithEager) {
-  MS_LOG(INFO) << "Basic Function Test With Eager.";
+  MS_LOG(INFO) << "Doing MindDataTestExecute-TestBandBiquadWithEager.";
   // Original waveform
   std::vector<float> labels = {
     2.716064453125000000e-03, 6.347656250000000000e-03, 9.246826171875000000e-03, 1.089477539062500000e-02,
@@ -504,7 +517,7 @@ TEST_F(MindDataTestExecute, TestBandBiquadWithEager) {
 }
 
 TEST_F(MindDataTestExecute, TestBandBiquadWithWrongArg) {
-  MS_LOG(INFO) << "Wrong Arg.";
+  MS_LOG(INFO) << "Doing MindDataTestExecute-TestBandBiquadWithWrongArg.";
   std::vector<double> labels = {
     2.716064453125000000e-03, 6.347656250000000000e-03, 9.246826171875000000e-03, 1.089477539062500000e-02,
     1.138305664062500000e-02, 1.156616210937500000e-02, 1.394653320312500000e-02, 1.550292968750000000e-02,
@@ -523,7 +536,7 @@ TEST_F(MindDataTestExecute, TestBandBiquadWithWrongArg) {
 }
 
 TEST_F(MindDataTestExecute, TestBandpassBiquadWithEager) {
-  MS_LOG(INFO) << "Basic Function Test With Eager.";
+  MS_LOG(INFO) << "Doing MindDataTestExecute-TestBandpassBiquadWithEager.";
   // Original waveform
   std::vector<float> labels = {
     2.716064453125000000e-03, 6.347656250000000000e-03, 9.246826171875000000e-03, 1.089477539062500000e-02,
@@ -542,7 +555,7 @@ TEST_F(MindDataTestExecute, TestBandpassBiquadWithEager) {
 }
 
 TEST_F(MindDataTestExecute, TestBandpassBiquadWithWrongArg) {
-  MS_LOG(INFO) << "Wrong Arg.";
+  MS_LOG(INFO) << "Doing MindDataTestExecute-TestBandpassBiquadWithWrongArg.";
   std::vector<double> labels = {
     2.716064453125000000e-03, 6.347656250000000000e-03, 9.246826171875000000e-03, 1.089477539062500000e-02,
     1.138305664062500000e-02, 1.156616210937500000e-02, 1.394653320312500000e-02, 1.550292968750000000e-02,
@@ -561,7 +574,7 @@ TEST_F(MindDataTestExecute, TestBandpassBiquadWithWrongArg) {
 }
 
 TEST_F(MindDataTestExecute, TestBandrejectBiquadWithEager) {
-  MS_LOG(INFO) << "Basic Function Test With Eager.";
+  MS_LOG(INFO) << "Doing MindDataTestExecute-TestBandrejectBiquadWithEager.";
   // Original waveform
   std::vector<float> labels = {
     2.716064453125000000e-03, 6.347656250000000000e-03, 9.246826171875000000e-03, 1.089477539062500000e-02,
@@ -580,7 +593,7 @@ TEST_F(MindDataTestExecute, TestBandrejectBiquadWithEager) {
 }
 
 TEST_F(MindDataTestExecute, TestBandrejectBiquadWithWrongArg) {
-  MS_LOG(INFO) << "Wrong Arg.";
+  MS_LOG(INFO) << "Doing MindDataTestExecute-TestBandrejectBiquadWithWrongArg.";
   std::vector<double> labels = {
     2.716064453125000000e-03, 6.347656250000000000e-03, 9.246826171875000000e-03, 1.089477539062500000e-02,
     1.138305664062500000e-02, 1.156616210937500000e-02, 1.394653320312500000e-02, 1.550292968750000000e-02,
@@ -599,7 +612,7 @@ TEST_F(MindDataTestExecute, TestBandrejectBiquadWithWrongArg) {
 }
 
 TEST_F(MindDataTestExecute, TestAngleEager) {
-  MS_LOG(INFO) << "Doing MindDataTestExecute-TestAngleEager";
+  MS_LOG(INFO) << "Doing MindDataTestExecute-TestAngleEager.";
   std::vector<double> origin = {1.143, 1.3123, 2.632, 2.554, -1.213, 1.3, 0.456, 3.563};
   TensorShape input_shape({4, 2});
   std::shared_ptr<Tensor> de_tensor;
@@ -612,3 +625,34 @@ TEST_F(MindDataTestExecute, TestAngleEager) {
 
   ASSERT_TRUE(s.IsOk());
 }
+
+TEST_F(MindDataTestExecute, TestRGB2BGREager) {
+  MS_LOG(INFO) << "Doing MindDataTestExecute-TestRGB2BGREager.";
+
+  // Read images
+  auto image = ReadFileToTensor("data/dataset/apple.jpg");
+
+  // Transform params
+  auto decode = vision::Decode();
+  auto rgb2bgr_op = vision::RGB2BGR();
+
+  auto transform = Execute({decode, rgb2bgr_op});
+  Status rc = transform(image, &image);
+
+  EXPECT_EQ(rc, Status::OK());
+}
+
+TEST_F(MindDataTestExecute, TestComplexNormEager) {
+  MS_LOG(INFO) << "Doing MindDataTestExecute-TestComplexNormEager.";
+  // testing
+  std::shared_ptr<Tensor> input_tensor_;
+  Tensor::CreateFromVector(std::vector<float>({1.0, 1.0, 2.0, 3.0, 4.0, 4.0}), TensorShape({3, 2}), &input_tensor_);
+
+  auto input_02 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(input_tensor_));
+  std::shared_ptr<TensorTransform> complex_norm_01 = std::make_shared<audio::ComplexNorm>(4.0);
+
+  // Filtered waveform by complexnorm
+  mindspore::dataset::Execute Transform01({complex_norm_01});
+  Status s01 = Transform01(input_02, &input_02);
+  EXPECT_TRUE(s01.IsOk());
+}
diff --git a/tests/ut/cpp/dataset/libri_speech_test.cc b/tests/ut/cpp/dataset/libri_speech_test.cc
deleted file mode 100644
index f5f7a737310..00000000000
--- a/tests/ut/cpp/dataset/libri_speech_test.cc
+++ /dev/null
@@ -1,162 +0,0 @@
-/**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include <fstream>
-#include <iostream>
-#include <memory>
-#include <string>
-
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h> 
-
-#include "utils/ms_utils.h"
-#include "common/common.h"
-#include "minddata/dataset/core/client.h"
-#include "minddata/dataset/core/global_context.h"
-#include "minddata/dataset/engine/datasetops/source/libri_speech_op.h"
-#include "minddata/dataset/engine/datasetops/source/sampler/distributed_sampler.h"
-#include "minddata/dataset/engine/datasetops/source/sampler/pk_sampler.h"
-#include "minddata/dataset/engine/datasetops/source/sampler/random_sampler.h"
-#include "minddata/dataset/engine/datasetops/source/sampler/sampler.h"
-#include "minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h"
-#include "minddata/dataset/engine/datasetops/source/sampler/subset_random_sampler.h"
-#include "minddata/dataset/engine/datasetops/source/sampler/weighted_random_sampler.h"
-#include "minddata/dataset/include/dataset/datasets.h"
-#include "minddata/dataset/util/path.h"
-#include "minddata/dataset/util/status.h"
-#include "gtest/gtest.h"
-#include "utils/log_adapter.h"
-#include "securec.h"
-
-namespace common = mindspore::common;
-using namespace mindspore::dataset;
-using mindspore::LogStream;
-using mindspore::ExceptionType::NoExceptionType;
-using mindspore::MsLogLevel::ERROR;
-
-std::shared_ptr<RepeatOp> Repeat(int repeat_cnt);
-
-std::shared_ptr<ExecutionTree> Build(std::vector<std::shared_ptr<DatasetOp>> ops);
-
-class MindDataTestLibriSpeechSampler : public UT::DatasetOpTesting {
- protected:
-};
-
-TEST_F(MindDataTestLibriSpeechSampler, TestSequentialLibriSpeechWithRepeat) {
-  std::string folder_path = "/home/user06/zjm/data/libri_speech/LibriSpeech/";
-  int64_t num_samples = 10;
-  int64_t start_index = 0;
-  std::shared_ptr<Dataset> ds =
-    LibriSpeech(folder_path, "dev-clean", std::make_shared<SequentialSampler>(start_index, num_samples));
-  EXPECT_NE(ds, nullptr);
-  ds = ds->Repeat(2);
-  EXPECT_NE(ds, nullptr);
-  std::shared_ptr<Iterator> iter = ds->CreateIterator();
-  EXPECT_NE(iter, nullptr);
-  std::unordered_map<std::string, mindspore::MSTensor> row;
-  ASSERT_OK(iter->GetNextRow(&row));
-
-  std::string_view utterance;
-  uint32_t rate;
-  uint32_t utterance_id;
-  uint32_t speaker_id;
-  uint32_t chapter_id;
-
-  
-  uint64_t i = 0;
-  while (row.size() != 0) {
-
-    auto waveform = row["waveform"];
-    auto sample_rate = row["sample_rate"];
-    auto utterance_ = row["utterance"];
-    auto utterance_id_ = row["utterance_id"];
-    auto speaker_id_ = row["speaker_id"];
-    auto chapter_id_ = row["chapter_id"];
-
-    MS_LOG(ERROR) << "Tensor image shape: " << waveform.Shape();
-
-    std::shared_ptr<Tensor> t_rate;
-    ASSERT_OK(Tensor::CreateFromMSTensor(sample_rate, &t_rate));
-    ASSERT_OK(t_rate->GetItemAt<uint32_t>(&rate, {}));
-    MS_LOG(ERROR) << "Tensor rate: " << rate;
-
-    std::shared_ptr<Tensor> t_utterance;
-    ASSERT_OK(Tensor::CreateFromMSTensor(utterance_, &t_utterance));
-    ASSERT_OK(t_utterance->GetItemAt(&utterance, {}));
-    MS_LOG(ERROR) << "Tensor utterance value: " << utterance;
-
-    std::shared_ptr<Tensor> t_speaker_id;
-    ASSERT_OK(Tensor::CreateFromMSTensor(speaker_id_, &t_speaker_id));
-    ASSERT_OK(t_speaker_id->GetItemAt<uint32_t>(&speaker_id, {}));
-    MS_LOG(ERROR) << "Tensor speaker_id value: " << speaker_id;
-
-    std::shared_ptr<Tensor> t_chapter_id;
-    ASSERT_OK(Tensor::CreateFromMSTensor(chapter_id_, &t_chapter_id));
-    ASSERT_OK(t_chapter_id->GetItemAt<uint32_t>(&chapter_id, {}));
-    MS_LOG(ERROR) << "Tensor chapter_id value: " << chapter_id;
-
-
-    std::shared_ptr<Tensor> t_utterance_id;
-    ASSERT_OK(Tensor::CreateFromMSTensor(utterance_id_, &t_utterance_id));
-    ASSERT_OK(t_utterance_id->GetItemAt<uint32_t>(&utterance_id, {}));
-    MS_LOG(ERROR) << "Tensor utterance_id value: " << utterance_id;
-
-
-
-    ASSERT_OK(iter->GetNextRow(&row));
-    i++;
-  }
-
-  EXPECT_EQ(i, 20);
-  iter->Stop();
-}
-
-// TEST_F(MindDataTestMnistSampler, TestSequentialImageFolderWithRepeatBatch) {
-//   std::string folder_path = datasets_root_path_ + "/testMnistData/";
-//   int64_t num_samples = 10;
-//   int64_t start_index = 0;
-//   std::shared_ptr<Dataset> ds =
-//     Mnist(folder_path, "all", std::make_shared<SequentialSampler>(start_index, num_samples));
-//   EXPECT_NE(ds, nullptr);
-//   ds = ds->Repeat(2);
-//   EXPECT_NE(ds, nullptr);
-//   ds = ds->Batch(5);
-//   EXPECT_NE(ds, nullptr);
-//   std::shared_ptr<Iterator> iter = ds->CreateIterator();
-//   EXPECT_NE(iter, nullptr);
-//   std::vector<std::vector<uint32_t>> expected = {{0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}};
-//   std::unordered_map<std::string, mindspore::MSTensor> row;
-//   ASSERT_OK(iter->GetNextRow(&row));
-//   uint64_t i = 0;
-//   while (row.size() != 0) {
-//     auto image = row["image"];
-//     auto label = row["label"];
-//     MS_LOG(INFO) << "Tensor image shape: " << image.Shape();
-//     TEST_MS_LOG_MSTENSOR(INFO, "Tensor label: ", label);
-//     std::shared_ptr<Tensor> de_expected_label;
-//     ASSERT_OK(Tensor::CreateFromVector(expected[i % 4], &de_expected_label));
-//     mindspore::MSTensor expected_label =
-//       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_label));
-//     EXPECT_MSTENSOR_EQ(label, expected_label);
-//     ASSERT_OK(iter->GetNextRow(&row));
-//     i++;
-//   }
-//   EXPECT_EQ(i, 4);
-//   iter->Stop();
-// }
-
-
diff --git a/tests/ut/cpp/dataset/path_test.cc b/tests/ut/cpp/dataset/path_test.cc
index b36b38bbc70..9c215f3632d 100644
--- a/tests/ut/cpp/dataset/path_test.cc
+++ b/tests/ut/cpp/dataset/path_test.cc
@@ -35,8 +35,8 @@ TEST_F(MindDataTestPath, Test1) {
   auto dir_it = Path::DirIterator::OpenDirectory(&f);
   ASSERT_NE(dir_it.get(), nullptr);
   int i = 0;
-  while (dir_it->hasNext()) {
-    Path v = dir_it->next();
+  while (dir_it->HasNext()) {
+    Path v = dir_it->Next();
     MS_LOG(DEBUG) << v.toString() << "\n";
     i++;
     if (i == 10) {
diff --git a/tests/ut/cpp/dataset/rgb_to_bgr_test_op.cc b/tests/ut/cpp/dataset/rgb_to_bgr_test_op.cc
deleted file mode 100644
index 9c93ea788b3..00000000000
--- a/tests/ut/cpp/dataset/rgb_to_bgr_test_op.cc
+++ /dev/null
@@ -1,100 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <opencv2/imgcodecs.hpp>
-#include <opencv2/opencv.hpp>
-#include "common/common.h"
-#include "common/cvop_common.h"
-#include "include/dataset/datasets.h"
-#include "include/dataset/transforms.h"
-#include "include/dataset/vision.h"
-#include "include/dataset/execute.h"
-#include "minddata/dataset/kernels/image/image_utils.h"
-#include "minddata/dataset/kernels/image/rgb_to_bgr_op.h"
-#include "minddata/dataset/core/cv_tensor.h"
-#include "utils/log_adapter.h"
-
-using namespace std;
-using namespace mindspore::dataset;
-using mindspore::dataset::CVTensor;
-using mindspore::dataset::BorderType;
-using mindspore::dataset::Tensor;
-using mindspore::LogStream;
-using mindspore::ExceptionType::NoExceptionType;
-using mindspore::MsLogLevel::INFO;
-
-
-class MindDataTestRgbToBgrOp : public UT::DatasetOpTesting {
- protected:
-};
-
-
-TEST_F(MindDataTestRgbToBgrOp, TestOp1) {
-  // Eager
-  MS_LOG(INFO) << "Doing MindDataTestGaussianBlur-TestGaussianBlurEager.";
-
-  // Read images
-  auto image = ReadFileToTensor("data/dataset/apple.jpg");
-
-  // Transform params
-  auto decode = vision::Decode();
-  auto rgb2bgr_op = vision::RGB2BGR();
-
-  auto transform = Execute({decode, rgb2bgr_op});
-  Status rc = transform(image, &image);
-
-  EXPECT_EQ(rc, Status::OK());
-}
-
-
-TEST_F(MindDataTestRgbToBgrOp, TestOp2) {
-  // pipeline
-  MS_LOG(INFO) << "Basic Function Test.";
-  // create two imagenet dataset
-  std::string MindDataPath = "data/dataset";
-  std::string folder_path = MindDataPath + "/testImageNetData/train/";
-  std::shared_ptr<Dataset> ds1 = ImageFolder(folder_path, true, std::make_shared<RandomSampler>(false, 2));
-  EXPECT_NE(ds1, nullptr);
-  std::shared_ptr<Dataset> ds2 = ImageFolder(folder_path, true, std::make_shared<RandomSampler>(false, 2));
-  EXPECT_NE(ds2, nullptr);
-
-  auto rgb2bgr_op = vision::RGB2BGR();
-  
-  ds1 = ds1->Map({rgb2bgr_op});
-  EXPECT_NE(ds1, nullptr);
-
-  std::shared_ptr<Iterator> iter1 = ds1->CreateIterator();
-  EXPECT_NE(iter1, nullptr);
-  std::unordered_map<std::string, mindspore::MSTensor> row1;
-  iter1->GetNextRow(&row1);
-
-  std::shared_ptr<Iterator> iter2 = ds2->CreateIterator();
-  EXPECT_NE(iter2, nullptr);
-  std::unordered_map<std::string, mindspore::MSTensor> row2;
-  iter2->GetNextRow(&row2);
-
-  uint64_t i = 0;
-  while (row1.size() != 0) {
-    i++;
-    auto image =row1["image"];
-    iter1->GetNextRow(&row1);
-    iter2->GetNextRow(&row2);
-  }
-  EXPECT_EQ(i, 2);
-
-  iter1->Stop();
-  iter2->Stop();
-}
diff --git a/tests/ut/cpp/device/hccl_adapter_test.cc b/tests/ut/cpp/device/hccl_adapter_test.cc
index 6c3b6fdeb56..7095f148df0 100644
--- a/tests/ut/cpp/device/hccl_adapter_test.cc
+++ b/tests/ut/cpp/device/hccl_adapter_test.cc
@@ -83,6 +83,12 @@ class TestHcclAdapter : public UT::Common {
   std::string format_ = "NCHW";
 };
 
+/// Feature: AllToAllvCalcParam
+/// Description: on 2p, send to rank 1, and recv nothing
+/// Expectation: send count 0 1
+///             send offset 0 0
+///              recv count 0 0
+///             recv offset 0 0
 TEST_F(TestHcclAdapter, test_all_to_all_v_calc_param_2p_only_send) {
   auto graph = std::make_shared<FuncGraph>();
   ASSERT_TRUE(graph != nullptr);
@@ -100,6 +106,12 @@ TEST_F(TestHcclAdapter, test_all_to_all_v_calc_param_2p_only_send) {
   EXPECT_EQ(calc.GetRecvDispls(), std::vector<int64_t>({0, 0}));
 }
 
+/// Feature: AllToAllvCalcParam
+/// Description: on 2p, send nothing, and recv from rank 0 and rank 1
+/// Expectation: send count 0 0
+///             send offset 0 0
+///              recv count 1 1
+///             recv offset 0 128
 TEST_F(TestHcclAdapter, test_all_to_all_v_calc_param_2p_only_recv) {
   auto graph = std::make_shared<FuncGraph>();
   ASSERT_TRUE(graph != nullptr);
@@ -117,6 +129,12 @@ TEST_F(TestHcclAdapter, test_all_to_all_v_calc_param_2p_only_recv) {
   EXPECT_EQ(calc.GetRecvDispls(), std::vector<int64_t>({0, 128}));
 }
 
+/// Feature: AllToAllvCalcParam
+/// Description: on 4p, send to rank1,2,3, and recv nothing
+/// Expectation: send count 0 1 1 1
+///             send offset 0 0 128 256
+///              recv count 0 0 0 0
+///             recv offset 0 0 0 0
 TEST_F(TestHcclAdapter, test_all_to_all_v_calc_param_4p_only_send) {
   auto graph = std::make_shared<FuncGraph>();
   ASSERT_TRUE(graph != nullptr);
@@ -135,6 +153,12 @@ TEST_F(TestHcclAdapter, test_all_to_all_v_calc_param_4p_only_send) {
   EXPECT_EQ(calc.GetRecvDispls(), std::vector<int64_t>({0, 0, 0, 0}));
 }
 
+/// Feature: AllToAllvCalcParam
+/// Description: on 4p, send to rank1,3, and recv nothing
+/// Expectation: send count 0 1 0 1
+///             send offset 0 0 128 128
+///              recv count 0 0 0 0
+///             recv offset 0 0 0 0
 TEST_F(TestHcclAdapter, test_all_to_all_v_calc_param_4p_only_send_2) {
   auto graph = std::make_shared<FuncGraph>();
   ASSERT_TRUE(graph != nullptr);
@@ -153,6 +177,12 @@ TEST_F(TestHcclAdapter, test_all_to_all_v_calc_param_4p_only_send_2) {
   EXPECT_EQ(calc.GetRecvDispls(), std::vector<int64_t>({0, 0, 0, 0}));
 }
 
+/// Feature: AllToAllvCalcParam
+/// Description: on 2p, send to rank1, and recv from rank1
+/// Expectation: send count 0 1
+///             send offset 0 0
+///              recv count 0 1
+///             recv offset 0 0
 TEST_F(TestHcclAdapter, test_all_to_all_v_calc_param_2p_exchange) {
   auto graph = std::make_shared<FuncGraph>();
   ASSERT_TRUE(graph != nullptr);
@@ -170,6 +200,12 @@ TEST_F(TestHcclAdapter, test_all_to_all_v_calc_param_2p_exchange) {
   EXPECT_EQ(calc.GetRecvDispls(), std::vector<int64_t>({0, 0}));
 }
 
+/// Feature: AllToAllvCalcParam
+/// Description: on 2p, send to rank0, and recv from rank0
+/// Expectation: send count 1 0
+///             send offset 0 128
+///              recv count 1 0
+///             recv offset 0 128
 TEST_F(TestHcclAdapter, test_all_to_all_v_calc_param_2p_send_to_self) {
   auto graph = std::make_shared<FuncGraph>();
   ASSERT_TRUE(graph != nullptr);
@@ -187,6 +223,12 @@ TEST_F(TestHcclAdapter, test_all_to_all_v_calc_param_2p_send_to_self) {
   EXPECT_EQ(calc.GetRecvDispls(), std::vector<int64_t>({0, 128}));
 }
 
+/// Feature: AllToAllvCalcParam
+/// Description: on 4p, send to rank0123, and recv from rank0123
+/// Expectation: send count 1 1 1 1
+///             send offset 0 128 256 384
+///              recv count 1 1 1 1
+///             recv offset 0 128 256 384
 TEST_F(TestHcclAdapter, test_all_to_all_v_calc_param_4p_all_to_all) {
   auto graph = std::make_shared<FuncGraph>();
   ASSERT_TRUE(graph != nullptr);
@@ -205,6 +247,12 @@ TEST_F(TestHcclAdapter, test_all_to_all_v_calc_param_4p_all_to_all) {
   EXPECT_EQ(calc.GetRecvDispls(), std::vector<int64_t>({0, 128, 256, 384}));
 }
 
+/// Feature: AllToAllvCalcParam
+/// Description: on 4p, send to rank0123, and recv from rank0123, but recv order is wrong
+/// Expectation: send count 1 1 1 1
+///             send offset 0 128 256 384
+///              recv count 1 1 1 1
+///             recv offset 256 128 384 0
 TEST_F(TestHcclAdapter, test_all_to_all_v_calc_param_4p_all_in_all_in_wrong_order) {
   auto graph = std::make_shared<FuncGraph>();
   ASSERT_TRUE(graph != nullptr);
@@ -223,6 +271,12 @@ TEST_F(TestHcclAdapter, test_all_to_all_v_calc_param_4p_all_in_all_in_wrong_orde
   EXPECT_EQ(calc.GetRecvDispls(), std::vector<int64_t>({256, 128, 384, 0}));
 }
 
+/// Feature: AllToAllvCalcParam
+/// Description: on 4p, send to rank123, and recv from nothing, but send order is wrong
+/// Expectation: send count 0 1 1 1
+///             send offset 0 128 256 0
+///              recv count 0 0 0 0
+///             recv offset 0 0 0 0
 TEST_F(TestHcclAdapter, test_all_to_all_v_calc_param_4p_only_send_in_wrong_order) {
   auto graph = std::make_shared<FuncGraph>();
   ASSERT_TRUE(graph != nullptr);
@@ -241,6 +295,9 @@ TEST_F(TestHcclAdapter, test_all_to_all_v_calc_param_4p_only_send_in_wrong_order
   EXPECT_EQ(calc.GetRecvDispls(), std::vector<int64_t>({0, 0, 0, 0}));
 }
 
+/// Feature: AllToAllvCalcParam
+/// Description: on 2p, rank id over valid range
+/// Expectation: throw exception
 TEST_F(TestHcclAdapter, test_all_to_all_v_calc_param_2p_invalid_rank_id) {
   auto graph = std::make_shared<FuncGraph>();
   ASSERT_TRUE(graph != nullptr);
@@ -254,6 +311,9 @@ TEST_F(TestHcclAdapter, test_all_to_all_v_calc_param_2p_invalid_rank_id) {
   ASSERT_ANY_THROW(calc.CalcOpParam());
 }
 
+/// Feature: AllToAllvCalcParam
+/// Description: on 2p, has 2 outputs but only 1 recv_rank_ids is set
+/// Expectation: throw exception
 TEST_F(TestHcclAdapter, test_all_to_all_v_calc_param_2p_invalid_rank_id_2) {
   auto graph = std::make_shared<FuncGraph>();
   ASSERT_TRUE(graph != nullptr);
@@ -267,6 +327,9 @@ TEST_F(TestHcclAdapter, test_all_to_all_v_calc_param_2p_invalid_rank_id_2) {
   ASSERT_ANY_THROW(calc.CalcOpParam());
 }
 
+/// Feature: AllToAllvCalcParam
+/// Description: on 2p, rank id over valid range
+/// Expectation: throw exception
 TEST_F(TestHcclAdapter, test_all_to_all_v_calc_param_2p_wrong_order_and_invalid_rank_id) {
   auto graph = std::make_shared<FuncGraph>();
   ASSERT_TRUE(graph != nullptr);
diff --git a/tests/ut/cpp/pre_activate/ascend/mindir/all_to_all_unify_mindir_test.cc b/tests/ut/cpp/pre_activate/ascend/mindir/all_to_all_unify_mindir_test.cc
index 9338e293e09..b5460d1eb3a 100644
--- a/tests/ut/cpp/pre_activate/ascend/mindir/all_to_all_unify_mindir_test.cc
+++ b/tests/ut/cpp/pre_activate/ascend/mindir/all_to_all_unify_mindir_test.cc
@@ -38,7 +38,7 @@ class TestAllToAllUnifyMindIr : public BackendCommon {
 TEST_F(TestAllToAllUnifyMindIr, test_neighbor_exchange) {
   FuncGraphPtr g = getPyFun_.CallAndParseRet("test_neighbor_exchange", "before");
   ASSERT_TRUE(g != nullptr);
-  std::vector<int64_t> shp_x{2, 3};
+  std::vector<int64_t> shp_x{2, 2};
   auto x_abstract = std::make_shared<abstract::AbstractTuple>(
     AbstractBasePtrList{std::make_shared<abstract::AbstractTensor>(kFloat32, shp_x)});
   AbstractBasePtrList args_spec_list{x_abstract};
diff --git a/tests/ut/cpp/python_input/gtest_input/pre_activate/all_to_all_unify_mindir_test.py b/tests/ut/cpp/python_input/gtest_input/pre_activate/all_to_all_unify_mindir_test.py
index 08d49cee307..f65fbf6003f 100644
--- a/tests/ut/cpp/python_input/gtest_input/pre_activate/all_to_all_unify_mindir_test.py
+++ b/tests/ut/cpp/python_input/gtest_input/pre_activate/all_to_all_unify_mindir_test.py
@@ -13,8 +13,15 @@
 # limitations under the License.
 # ============================================================================
 import mindspore as ms
+from mindspore import context
 from mindspore.ops.operations._inner_ops import NeighborExchange
 from mindspore.ops.operations.comm_ops import _AlltoAll
+from mindspore.communication.management import GlobalComm, init
+
+context.set_context(device_target="Ascend")
+GlobalComm.CHECK_ENVS = False
+init("hccl")
+GlobalComm.CHECK_ENVS = True
 
 class FnDict:
     def __init__(self):
@@ -28,7 +35,7 @@ class FnDict:
 
 def test_neighbor_exchange(tag):
     fns = FnDict()
-    neighbor = NeighborExchange(send_rank_ids=[0], recv_rank_ids=[1], recv_shapes=([2, 3],), send_shapes=([2, 2],),
+    neighbor = NeighborExchange(send_rank_ids=[0], recv_rank_ids=[1], recv_shapes=([2, 2],), send_shapes=([2, 2],),
                                 recv_type=ms.float32)
     @fns
     def before(x):
@@ -37,6 +44,7 @@ def test_neighbor_exchange(tag):
     return fns[tag]
 
 def test_all_to_all(tag):
+    context.set_auto_parallel_context(device_num=8, global_rank=0)
     fns = FnDict()
     altoall = _AlltoAll(split_count=8, split_dim=2, concat_dim=3)
     @fns
diff --git a/tests/ut/cpp/runtest.sh b/tests/ut/cpp/runtest.sh
index df1f81e9bd2..29a9b2a7a97 100755
--- a/tests/ut/cpp/runtest.sh
+++ b/tests/ut/cpp/runtest.sh
@@ -29,7 +29,7 @@ cd ${BUILD_PATH}/mindspore/tests/ut/cpp
 export LD_LIBRARY_PATH=${BUILD_PATH}/mindspore/googletest/googlemock/gtest:${PROJECT_PATH}/mindspore:\
 ${PROJECT_PATH}/mindspore/lib:${PROJECT_PATH}/graphengine/third_party/prebuild/x86_64:\
 ${PROJECT_PATH}/graphengine/third_party/prebuild/aarch64:${LD_LIBRARY_PATH}
-export PYTHONPATH=${PROJECT_PATH}/tests/ut/cpp/python_input:$PYTHONPATH:${PROJECT_PATH}
+export PYTHONPATH=${PROJECT_PATH}/tests/ut/cpp/python_input:$PYTHONPATH:${PROJECT_PATH}:${PROJECT_PATH}/tests/ut/python
 export GLOG_v=2
 export GC_COLLECT_IN_CELL=1
 ## set op info config path
diff --git a/tests/ut/cpp/stub/ge/ge_task_launch_stub.cc b/tests/ut/cpp/stub/ge/ge_task_launch_stub.cc
index 7be74ba8d73..9dcd67640c1 100644
--- a/tests/ut/cpp/stub/ge/ge_task_launch_stub.cc
+++ b/tests/ut/cpp/stub/ge/ge_task_launch_stub.cc
@@ -23,12 +23,14 @@ HcclAdapter &HcclAdapter::GetInstance() {
   return instance;
 }
 bool HcclAdapter::InitHccl() { return true; }
-bool HcclAdapter::InitHccl(uint32_t, std::string_view, std::string_view) { return true; }
+bool HcclAdapter::InitHccl(uint32_t, std::string_view, std::string_view, bool) { return true; }
 bool HcclAdapter::FinalizeHccl() { return true; }
 HcclResult HcclAdapter::HcclCreateGroup(const std::string &, uint32_t, uint32_t *) const { return HCCL_SUCCESS; }
 HcclResult HcclAdapter::HcclDestroyGroup(const std::string &) const { return HCCL_SUCCESS; }
 HcclResult HcclAdapter::HcclGetRankId(const std::string &, uint32_t *) const { return HCCL_SUCCESS; }
 HcclResult HcclAdapter::HcclGetRankSize(const std::string &, uint32_t *) const { return HCCL_SUCCESS; }
+HcclResult HcclAdapter::HcclGetRankId(uint32_t *rank_id) const { return HCCL_SUCCESS; }
+HcclResult HcclAdapter::HcclGetRankSize(uint32_t *rank_size) const { return HCCL_SUCCESS; }
 bool HcclAdapter::GenTask(const AnfNodePtr &, HcclDataType, std::vector<HcclTaskInfo> *) const { return true; }
 int64_t HcclAdapter::CalcWorkspaceSize(const AnfNodePtr &, HcclDataType) const { return 0; }
 void *HcclAdapter::GetHcclOpsKernelInfoStore() const { return nullptr; }
diff --git a/tests/ut/cpp/stub/hccl/hccl_stub.cc b/tests/ut/cpp/stub/hccl/hccl_stub.cc
index 9778acc09ff..716b1afab76 100644
--- a/tests/ut/cpp/stub/hccl/hccl_stub.cc
+++ b/tests/ut/cpp/stub/hccl/hccl_stub.cc
@@ -131,6 +131,24 @@ HcclResult HcclCommInitRootInfo(uint32_t nRanks, const HcclRootInfo *rootInfo, u
   return HCCL_SUCCESS;
 }
 
+/**
+ * @brief Get the rank size of this comm.
+ *
+ * @param comm A pointer identifying the communication resource based on.
+ * @param rankSize  A pointer identifying the rank size.
+ * @return HcclResult
+ */
+HcclResult HcclGetRankSize(HcclComm comm, uint32_t *rankSize) { return HCCL_SUCCESS; }
+
+/**
+ * @brief Get the rank id of this comm.
+ *
+ * @param comm A pointer identifying the communication resource based on.
+ * @param rankSize  A pointer identifying the rank id.
+ * @return HcclResult
+ */
+HcclResult HcclGetRankId(HcclComm comm, uint32_t *rank) { return HCCL_SUCCESS; }
+
 HcclResult HcclAllReduce(void *sendBuf, void *recvBuf, uint64_t count, HcclDataType dataType, HcclReduceOp op,
                                 HcclComm comm, aclrtStream stream) {
   return HCCL_SUCCESS;
diff --git a/tests/ut/cpp/stub/kernel/kernel_fusion_stub.cc b/tests/ut/cpp/stub/kernel/kernel_fusion_stub.cc
index 13bd5208e4e..5da6755d1ac 100755
--- a/tests/ut/cpp/stub/kernel/kernel_fusion_stub.cc
+++ b/tests/ut/cpp/stub/kernel/kernel_fusion_stub.cc
@@ -15,6 +15,7 @@
  */
 #include "backend/kernel_compiler/kernel_fusion.h"
 #include "backend/kernel_compiler/tbe/tbe_kernel_mod.h"
+#include "backend/kernel_compiler/tbe/ascend_kernel_compile.h"
 #include "utils/ms_utils.h"
 
 namespace mindspore {
@@ -26,5 +27,26 @@ std::map<int64_t, KernelModPtr> KernelFusion(const std::vector<FusionScopeInfo>
   }
   return kernel_mod_ret;
 }
+namespace ascend {
+std::string AscendKernelCompileManager::AscendOpSelectFormat(const AnfNodePtr &node) { return std::string(); }
+bool AscendKernelCompileManager::AscendOpCheckSupported(const AnfNodePtr &node) { return true; }
+AscendKernelCompileManager::~AscendKernelCompileManager() {}
+bool AscendKernelCompileManager::tbe_init_flag_ = true;
+
+void AscendKernelCompileManager::TbeInitialize() {}
+// pre build
+void AscendKernelCompileManager::AscendPreBuild(const std::shared_ptr<session::KernelGraph> &kernel_graph) {}
+// single op compile
+bool AscendKernelCompileManager::AscendSingleOpCompile(const std::vector<AnfNodePtr> &anf_nodes) { return true; }
+// fusion op compile
+KernelModMap AscendKernelCompileManager::AscendFusionOpCompile(const std::vector<FusionScopeInfo> &fusion_scopes) {
+  std::map<int64_t, KernelModPtr> kernel_mod_ret;
+  for (const auto &fusion_scope_iter : fusion_scopes) {
+    kernel_mod_ret[fusion_scope_iter.scope_id] = std::make_shared<TbeKernelMod>(nullptr);
+  }
+  return kernel_mod_ret;
+}
+void AscendKernelCompileManager::ResetOldTask() {}
+}  // namespace ascend
 }  // namespace kernel
 }  // namespace mindspore
diff --git a/tests/ut/cpp/stub/profiling/profiling_stub.cc b/tests/ut/cpp/stub/profiling/profiling_stub.cc
index 6678225390e..144dae83b78 100644
--- a/tests/ut/cpp/stub/profiling/profiling_stub.cc
+++ b/tests/ut/cpp/stub/profiling/profiling_stub.cc
@@ -15,6 +15,7 @@
  */
 #include <string>
 #include "prof_mgr_core.h"
+#include "prof_callback.h"
 
 namespace Msprof {
 namespace Engine {
@@ -54,3 +55,21 @@ int ProfMgrStop(void* handle) { return 0; }
 namespace Analysis::Dvvp::ProfilerSpecial {
 uint32_t MsprofilerInit() { return 0; }
 }
+
+/*
+ * @name  MsprofInit
+ * @brief Profiling module init
+ * @param [in] dataType: profiling type: ACL Env/ACL Json/GE Option
+ * @param [in] data: profiling switch data
+ * @param [in] dataLen: Length of data
+ * @return 0:SUCCESS, >0:FAILED
+ */
+int32_t MsprofInit(uint32_t dataType, void *data, uint32_t dataLen) { return 0; }
+
+/*
+ * @name AscendCL
+ * @brief Finishing Profiling
+ * @param NULL
+ * @return 0:SUCCESS, >0:FAILED
+ */
+int32_t MsprofFinalize() { return 0; }
\ No newline at end of file
diff --git a/tests/ut/cpp/stub/runtime/runtime_stub.cc b/tests/ut/cpp/stub/runtime/runtime_stub.cc
index 0682ce3e7f8..4a47bbac262 100644
--- a/tests/ut/cpp/stub/runtime/runtime_stub.cc
+++ b/tests/ut/cpp/stub/runtime/runtime_stub.cc
@@ -25,6 +25,10 @@
 
 rtError_t rtEventSynchronize(rtEvent_t event) { return RT_ERROR_NONE; }
 
+rtError_t rtEventCreateWithFlag(rtEvent_t *event, uint32_t flag) { return RT_ERROR_NONE; }
+
+rtError_t rtEventElapsedTime(float *time, rtEvent_t start, rtEvent_t end) { return RT_ERROR_NONE; }
+
 rtError_t rtMalloc(void **devPtr, uint64_t size, rtMemType_t type) { return RT_ERROR_NONE; }
 
 rtError_t rtMemcpy(void *dst, uint64_t destMax, const void *src, uint64_t count, rtMemcpyKind_t kind) {
@@ -197,3 +201,5 @@ RTS_API rtError_t rtKernelLaunchWithFlag(const void *stubFunc, uint32_t blockDim
 }
 
 RTS_API rtError_t rtMemGetInfoEx(rtMemInfoType_t memInfoType, size_t *free, size_t *total) { return RT_ERROR_NONE; }
+
+RTS_API rtError_t rtProfRegisterCtrlCallback(uint32_t moduleId, rtProfCtrlHandle callback) { return RT_ERROR_NONE; }
diff --git a/tests/ut/python/dataset/test_adjustgamma.py b/tests/ut/python/dataset/test_adjustgamma.py
index 61e91fdc5f5..32363f76b84 100644
--- a/tests/ut/python/dataset/test_adjustgamma.py
+++ b/tests/ut/python/dataset/test_adjustgamma.py
@@ -31,8 +31,6 @@ MNIST_DATA_DIR = "../data/dataset/testMnistData"
 DATA_DIR_2 = ["../data/dataset/test_tf_file_3_images/train-0000-of-0001.data"]
 SCHEMA_DIR = "../data/dataset/test_tf_file_3_images/datasetSchema.json"
 
-GENERATE_GOLDEN = False
-
 
 def generate_numpy_random_rgb(shape):
     """
@@ -90,26 +88,20 @@ def test_adjust_gamma_invalid_gamma_param_c():
     logger.info("Test AdjustGamma C Op with invalid ignore parameter")
     try:
         data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
-        data_set = data_set.map(operations=[C.Decode(),
-                                            C.Resize((224, 224)),
-                                            lambda img: np.array(img[:, :, 0])],
+        data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224)), lambda img: np.array(img[:, :, 0])],
                                 input_columns=["image"])
         # invalid gamma
-        data_set = data_set.map(operations=C.AdjustGamma(gamma=-10.0,
-                                                         gain=1.0),
+        data_set = data_set.map(operations=C.AdjustGamma(gamma=-10.0, gain=1.0),
                                 input_columns="image")
     except ValueError as error:
         logger.info("Got an exception in AdjustGamma: {}".format(str(error)))
         assert "Input is not within the required interval of " in str(error)
     try:
         data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
-        data_set = data_set.map(operations=[C.Decode(),
-                                            C.Resize((224, 224)),
-                                            lambda img: np.array(img[:, :, 0])],
+        data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224)), lambda img: np.array(img[:, :, 0])],
                                 input_columns=["image"])
         # invalid gamma
-        data_set = data_set.map(operations=C.AdjustGamma(gamma=[1, 2],
-                                                         gain=1.0),
+        data_set = data_set.map(operations=C.AdjustGamma(gamma=[1, 2], gain=1.0),
                                 input_columns="image")
     except TypeError as error:
         logger.info("Got an exception in AdjustGamma: {}".format(str(error)))
@@ -129,8 +121,7 @@ def test_adjust_gamma_invalid_gamma_param_py():
             F.AdjustGamma(gamma=-10.0),
             F.ToTensor()
         ])
-        data_set = data_set.map(operations=[trans],
-                                input_columns=["image"])
+        data_set = data_set.map(operations=[trans], input_columns=["image"])
     except ValueError as error:
         logger.info("Got an exception in AdjustGamma: {}".format(str(error)))
         assert "Input is not within the required interval of " in str(error)
@@ -142,8 +133,7 @@ def test_adjust_gamma_invalid_gamma_param_py():
             F.AdjustGamma(gamma=[1, 2]),
             F.ToTensor()
         ])
-        data_set = data_set.map(operations=[trans],
-                                input_columns=["image"])
+        data_set = data_set.map(operations=[trans], input_columns=["image"])
     except TypeError as error:
         logger.info("Got an exception in AdjustGamma: {}".format(str(error)))
         assert "is not of type [<class 'float'>, <class 'int'>], but got" in str(error)
@@ -156,13 +146,10 @@ def test_adjust_gamma_invalid_gain_param_c():
     logger.info("Test AdjustGamma C Op with invalid gain parameter")
     try:
         data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
-        data_set = data_set.map(operations=[C.Decode(),
-                                            C.Resize((224, 224)),
-                                            lambda img: np.array(img[:, :, 0])],
+        data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224)), lambda img: np.array(img[:, :, 0])],
                                 input_columns=["image"])
         # invalid gain
-        data_set = data_set.map(operations=C.AdjustGamma(gamma=10.0,
-                                                         gain=[1, 10]),
+        data_set = data_set.map(operations=C.AdjustGamma(gamma=10.0, gain=[1, 10]),
                                 input_columns="image")
     except TypeError as error:
         logger.info("Got an exception in AdjustGamma: {}".format(str(error)))
@@ -182,8 +169,7 @@ def test_adjust_gamma_invalid_gain_param_py():
             F.AdjustGamma(gamma=10.0, gain=[1, 10]),
             F.ToTensor()
         ])
-        data_set = data_set.map(operations=[trans],
-                                input_columns=["image"])
+        data_set = data_set.map(operations=[trans], input_columns=["image"])
     except TypeError as error:
         logger.info("Got an exception in AdjustGamma: {}".format(str(error)))
         assert "is not of type [<class 'float'>, <class 'int'>], but got " in str(error)
diff --git a/tests/ut/python/dataset/test_allpass_biquad.py b/tests/ut/python/dataset/test_allpass_biquad.py
index e3cadece4f5..29805ab6df3 100644
--- a/tests/ut/python/dataset/test_allpass_biquad.py
+++ b/tests/ut/python/dataset/test_allpass_biquad.py
@@ -19,16 +19,14 @@ import mindspore.dataset.audio.transforms as audio
 from mindspore import log as logger
 
 
-def _count_unequal_element(data_expected, data_me, rtol, atol):
-
+def count_unequal_element(data_expected, data_me, rtol, atol):
     assert data_expected.shape == data_me.shape
     total_count = len(data_expected.flatten())
     error = np.abs(data_expected - data_me)
     greater = np.greater(error, atol + np.abs(data_expected) * rtol)
     loss_count = np.count_nonzero(greater)
-    assert (loss_count / total_count) < rtol, \
-        "\ndata_expected_std:{0}\ndata_me_error:{1}\nloss:{2}". \
-        format(data_expected[greater], data_me[greater], error[greater])
+    assert (loss_count / total_count) < rtol, "\ndata_expected_std:{0}\ndata_me_error:{1}\nloss:{2}".format(
+        data_expected[greater], data_me[greater], error[greater])
 
 
 def test_func_allpass_biquad_eager():
@@ -37,12 +35,11 @@ def test_func_allpass_biquad_eager():
     # Original waveform
     waveform = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float64)
     # Expect waveform
-    expect_waveform = np.array([[0.96049707, 1.0, 1.0],
-                                [1.0, 1.0, 1.0]], dtype=np.float64)
+    expect_waveform = np.array([[0.96049707, 1.0, 1.0], [1.0, 1.0, 1.0]], dtype=np.float64)
     allpass_biquad_op = audio.AllpassBiquad(44100, 200.0, 0.707)
     # Filtered waveform by allpassbiquad
     output = allpass_biquad_op(waveform)
-    _count_unequal_element(expect_waveform, output, 0.0001, 0.0001)
+    count_unequal_element(expect_waveform, output, 0.0001, 0.0001)
 
 
 def test_func_allpass_biquad_pipeline():
@@ -51,56 +48,57 @@ def test_func_allpass_biquad_pipeline():
     # Original waveform
     waveform = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float64)
     # Expect waveform
-    expect_waveform = np.array([[0.96049707, 1.0, 1.0],
-                                [1.0, 1.0, 1.0]], dtype=np.float64)
+    expect_waveform = np.array([[0.96049707, 1.0, 1.0], [1.0, 1.0, 1.0]], dtype=np.float64)
     label = np.random.sample((2, 1))
     data = (waveform, label)
     dataset = ds.NumpySlicesDataset(data, ["channel", "sample"], shuffle=False)
     allpass_biquad_op = audio.AllpassBiquad(44100, 200.0)
     # Filtered waveform by allpassbiquad
-    dataset = dataset.map(
-        input_columns=["channel"], operations=allpass_biquad_op, num_parallel_workers=8)
+    dataset = dataset.map(input_columns=["channel"], operations=allpass_biquad_op, num_parallel_workers=8)
     i = 0
-    for _ in dataset.create_dict_iterator(output_numpy=True):
-        _count_unequal_element(expect_waveform[i, :],
-                               _['channel'], 0.0001, 0.0001)
+    for item in dataset.create_dict_iterator(output_numpy=True):
+        count_unequal_element(expect_waveform[i, :], item['channel'], 0.0001, 0.0001)
         i += 1
 
+
 def test_invalid_input_all():
     waveform = np.random.rand(2, 1000)
+
     def test_invalid_input(test_name, sample_rate, central_freq, Q, error, error_msg):
         logger.info("Test Allpassallpassiquad with bad input: {0}".format(test_name))
         with pytest.raises(error) as error_info:
             audio.AllpassBiquad(sample_rate, central_freq, Q)(waveform)
         assert error_msg in str(error_info.value)
+
     test_invalid_input("invalid sample_rate parameter type as a float", 44100.5, 200, 0.707, TypeError,
                        "Argument sample_rate with value 44100.5 is not of type [<class 'int'>],"
-                       +" but got <class 'float'>.")
+                       + " but got <class 'float'>.")
     test_invalid_input("invalid sample_rate parameter type as a String", "44100", 200, 0.707, TypeError,
-                       "Argument sample_rate with value 44100 is not of type [<class 'int'>],"+
+                       "Argument sample_rate with value 44100 is not of type [<class 'int'>]," +
                        " but got <class 'str'>.")
     test_invalid_input("invalid contral_freq parameter type as a String", 44100, "200", 0.707, TypeError,
                        "Argument central_freq with value 200 is not of type [<class 'float'>, <class 'int'>],"
-                       +" but got <class 'str'>.")
+                       + " but got <class 'str'>.")
     test_invalid_input("invalid Q parameter type as a String", 44100, 200, "0.707", TypeError,
                        "Argument Q with value 0.707 is not of type [<class 'float'>, <class 'int'>],"
-                       +" but got <class 'str'>.")
+                       + " but got <class 'str'>.")
     test_invalid_input("invalid sample_rate parameter value", 441324343243242342345300, 200, 0.707, ValueError,
-                       "Input sample_rate is not within the required interval of [-2147483648, 2147483647].")
+                       "Input sample_rate is not within the required interval of [-2147483648, 0) and (0, 2147483647].")
     test_invalid_input("invalid contral_freq parameter value", 44100, 32434324324234321, 0.707, ValueError,
                        "Input central_freq is not within the required interval of [-16777216, 16777216].")
     test_invalid_input("invalid sample_rate parameter value", None, 200, 0.707, TypeError,
                        "Argument sample_rate with value None is not of type [<class 'int'>],"
-                       +" but got <class 'NoneType'>.")
+                       + " but got <class 'NoneType'>.")
     test_invalid_input("invalid central_rate parameter value", 44100, None, 0.707, TypeError,
                        "Argument central_freq with value None is not of type [<class 'float'>, <class 'int'>],"
-                       +" but got <class 'NoneType'>.")
+                       + " but got <class 'NoneType'>.")
     test_invalid_input("invalid sample_rate parameter value", 0, 200, 0.707, ValueError,
-                       "Input sample_rate can not be 0.")
+                       "Input sample_rate is not within the required interval of [-2147483648, 0) and (0, 2147483647].")
     test_invalid_input("invalid Q parameter value", 44100, 200, 1.707, ValueError,
                        "Input Q is not within the required interval of (0, 1].")
 
+
 if __name__ == '__main__':
-    test_eager_allpassbiquad_mindspore_001()
-    test_pipeline_allpass_biquad_001()
+    test_func_allpass_biquad_eager()
+    test_func_allpass_biquad_pipeline()
     test_invalid_input_all()
diff --git a/tests/ut/python/dataset/test_amplitude_to_db.py b/tests/ut/python/dataset/test_amplitude_to_db.py
index 448b8b09ef4..9fba2ed07b9 100644
--- a/tests/ut/python/dataset/test_amplitude_to_db.py
+++ b/tests/ut/python/dataset/test_amplitude_to_db.py
@@ -23,7 +23,6 @@ import mindspore.dataset.audio.transforms as c_audio
 from mindspore import log as logger
 from mindspore.dataset.audio.utils import ScaleType
 
-
 CHANNEL = 1
 FREQ = 20
 TIME = 15
@@ -32,19 +31,18 @@ TIME = 15
 def gen(shape):
     np.random.seed(0)
     data = np.random.random(shape)
-    yield(np.array(data, dtype=np.float32),)
+    yield (np.array(data, dtype=np.float32),)
 
 
-def _count_unequal_element(data_expected, data_me, rtol, atol):
+def count_unequal_element(data_expected, data_me, rtol, atol):
     """ Precision calculation func """
     assert data_expected.shape == data_me.shape
     total_count = len(data_expected.flatten())
     error = np.abs(data_expected - data_me)
     greater = np.greater(error, atol + np.abs(data_expected) * rtol)
     loss_count = np.count_nonzero(greater)
-    assert (loss_count / total_count) < rtol, \
-        "\ndata_expected_std:{0}\ndata_me_error:{1}\nloss:{2}". \
-        format(data_expected[greater], data_me[greater], error[greater])
+    assert (loss_count / total_count) < rtol, "\ndata_expected_std:{0}\ndata_me_error:{1}\nloss:{2}".format(
+        data_expected[greater], data_me[greater], error[greater])
 
 
 def allclose_nparray(data_expected, data_me, rtol, atol, equal_nan=True):
@@ -52,9 +50,7 @@ def allclose_nparray(data_expected, data_me, rtol, atol, equal_nan=True):
     if np.any(np.isnan(data_expected)):
         assert np.allclose(data_me, data_expected, rtol, atol, equal_nan=equal_nan)
     elif not np.allclose(data_me, data_expected, rtol, atol, equal_nan=equal_nan):
-        _count_unequal_element(data_expected, data_me, rtol, atol)
-    else:
-        assert True
+        count_unequal_element(data_expected, data_me, rtol, atol)
 
 
 def test_func_amplitude_to_db_eager():
@@ -91,9 +87,7 @@ def test_func_amplitude_to_db_pipeline():
 
     data1 = ds.GeneratorDataset(source=generator, column_names=["multi_dimensional_data"])
 
-    transforms = [
-        c_audio.AmplitudeToDB()
-    ]
+    transforms = [c_audio.AmplitudeToDB()]
     data1 = data1.map(operations=transforms, input_columns=["multi_dimensional_data"])
 
     for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
@@ -102,7 +96,6 @@ def test_func_amplitude_to_db_pipeline():
 
 
 def test_amplitude_to_db_invalid_input():
-
     def test_invalid_input(test_name, stype, ref_value, amin, top_db, error, error_msg):
         logger.info("Test AmplitudeToDB with bad input: {0}".format(test_name))
         with pytest.raises(error) as error_info:
diff --git a/tests/ut/python/dataset/test_angle.py b/tests/ut/python/dataset/test_angle.py
index 6c366b6a41e..1de8e2fd0a2 100755
--- a/tests/ut/python/dataset/test_angle.py
+++ b/tests/ut/python/dataset/test_angle.py
@@ -19,28 +19,28 @@ import pytest
 import mindspore.dataset as ds
 import mindspore.dataset.audio.transforms as a_c_trans
 
-def _count_unequal_element(data_expected, data_me, rtol, atol):
 
+def count_unequal_element(data_expected, data_me, rtol, atol):
     assert data_expected.shape == data_me.shape
     total_count = len(data_expected.flatten())
     error = np.abs(data_expected - data_me)
     greater = np.greater(error, atol + np.abs(data_expected) * rtol)
     loss_count = np.count_nonzero(greater)
-    assert (loss_count / total_count) < rtol, \
-        "\ndata_expected_std:{0}\ndata_me_error:{1}\nloss:{2}". \
-        format(data_expected[greater], data_me[greater], error[greater])
+    assert (loss_count / total_count) < rtol, "\ndata_expected_std:{0}\ndata_me_error:{1}\nloss:{2}".format(
+        data_expected[greater], data_me[greater], error[greater])
+
 
 def test_func_angle_001():
     """
     Eager Test
     """
-    arr = np.array([[73.04, -13.00], [57.49, 13.20], [-57.64, 6.51], [-52.25, 30.67], [-30.11, -18.34], \
+    arr = np.array([[73.04, -13.00], [57.49, 13.20], [-57.64, 6.51], [-52.25, 30.67], [-30.11, -18.34],
                     [-63.32, 99.33], [95.82, -24.76]], dtype=np.double)
-    expected = np.array([-0.17614017, 0.22569334, 3.02912684, 2.6107975, -2.59450886, 2.13831337, -0.25286988], \
+    expected = np.array([-0.17614017, 0.22569334, 3.02912684, 2.6107975, -2.59450886, 2.13831337, -0.25286988],
                         dtype=np.double)
     angle_op = a_c_trans.Angle()
     output = angle_op(arr)
-    _count_unequal_element(expected, output, 0.0001, 0.0001)
+    count_unequal_element(expected, output, 0.0001, 0.0001)
 
 
 def test_func_angle_002():
@@ -48,9 +48,9 @@ def test_func_angle_002():
     Pipeline Test
     """
     np.random.seed(6)
-    arr = np.array([[[84.25, -85.92], [-92.23, 23.06], [-7.33, -44.17], [-62.95, -14.73]], \
+    arr = np.array([[[84.25, -85.92], [-92.23, 23.06], [-7.33, -44.17], [-62.95, -14.73]],
                     [[93.09, 38.18], [-81.94, 71.34], [71.33, -39.00], [95.25, -32.94]]], dtype=np.double)
-    expected = np.array([[-0.79521156, 2.89658848, -1.73524737, -2.91173309], \
+    expected = np.array([[-0.79521156, 2.89658848, -1.73524737, -2.91173309],
                          [0.3892177, 2.42523905, -0.50034807, -0.33295219]], dtype=np.double)
     label = np.random.sample((2, 4, 1))
     data = (arr, label)
@@ -58,7 +58,8 @@ def test_func_angle_002():
     angle_op = a_c_trans.Angle()
     dataset = dataset.map(operations=angle_op, input_columns=["col1"])
     for item1, item2 in zip(dataset.create_dict_iterator(output_numpy=True), expected):
-        _count_unequal_element(item2, item1['col1'], 0.0001, 0.0001)
+        count_unequal_element(item2, item1['col1'], 0.0001, 0.0001)
+
 
 def test_func_angle_003():
     """
@@ -72,7 +73,7 @@ def test_func_angle_003():
     angle_op = a_c_trans.Angle()
     dataset = dataset.map(operations=angle_op, input_columns=["col1"])
     num_itr = 0
-    with pytest.raises(RuntimeError, match="The input type should be numbers"):
+    with pytest.raises(RuntimeError, match="input tensor type should be int, float or double"):
         for _ in dataset.create_dict_iterator(output_numpy=True):
             num_itr += 1
 
diff --git a/tests/ut/python/dataset/test_band_biquad.py b/tests/ut/python/dataset/test_band_biquad.py
index a554a4df36c..6136159cbd7 100644
--- a/tests/ut/python/dataset/test_band_biquad.py
+++ b/tests/ut/python/dataset/test_band_biquad.py
@@ -19,16 +19,14 @@ import mindspore.dataset.audio.transforms as audio
 from mindspore import log as logger
 
 
-def _count_unequal_element(data_expected, data_me, rtol, atol):
-
+def count_unequal_element(data_expected, data_me, rtol, atol):
     assert data_expected.shape == data_me.shape
     total_count = len(data_expected.flatten())
     error = np.abs(data_expected - data_me)
     greater = np.greater(error, atol + np.abs(data_expected) * rtol)
     loss_count = np.count_nonzero(greater)
-    assert (loss_count / total_count) < rtol, \
-        "\ndata_expected_std:{0}\ndata_me_error:{1}\nloss:{2}". \
-        format(data_expected[greater], data_me[greater], error[greater])
+    assert (loss_count / total_count) < rtol, "\ndata_expected_std:{0}\ndata_me_error:{1}\nloss:{2}".format(
+        data_expected[greater], data_me[greater], error[greater])
 
 
 def test_func_band_biquad_eager():
@@ -42,7 +40,7 @@ def test_func_band_biquad_eager():
     band_biquad_op = audio.BandBiquad(44100, 200.0, 0.707, False)
     # Filtered waveform by bandbiquad
     output = band_biquad_op(waveform)
-    _count_unequal_element(expect_waveform, output, 0.0001, 0.0001)
+    count_unequal_element(expect_waveform, output, 0.0001, 0.0001)
 
 
 def test_func_band_biquad_pipeline():
@@ -61,9 +59,9 @@ def test_func_band_biquad_pipeline():
     dataset = dataset.map(
         input_columns=["channel"], operations=band_biquad_op, num_parallel_workers=8)
     i = 0
-    for _ in dataset.create_dict_iterator(output_numpy=True):
-        _count_unequal_element(expect_waveform[i, :],
-                               _['channel'], 0.0001, 0.0001)
+    for item in dataset.create_dict_iterator(output_numpy=True):
+        count_unequal_element(expect_waveform[i, :],
+                              item['channel'], 0.0001, 0.0001)
         i += 1
 
 
@@ -83,7 +81,7 @@ def test_band_biquad_invalid_input():
                        "Argument central_freq with value 200 is not of type [<class 'float'>, <class 'int'>],"
                        " but got <class 'str'>.")
     test_invalid_input("invalid sample_rate parameter value", 0, 200, 0.707, True, ValueError,
-                       "Input sample_rate can not be 0.")
+                       "Input sample_rate is not within the required interval of [-2147483648, 0) and (0, 2147483647].")
     test_invalid_input("invalid contral_freq parameter value", 44100, 32434324324234321, 0.707, True, ValueError,
                        "Input central_freq is not within the required interval of [-16777216, 16777216].")
     test_invalid_input("invalid Q parameter type as a String", 44100, 200, "0.707", True, TypeError,
@@ -94,7 +92,7 @@ def test_band_biquad_invalid_input():
     test_invalid_input("invalid Q parameter value", 44100, 200, 0, True, ValueError,
                        "Input Q is not within the required interval of (0, 1].")
     test_invalid_input("invalid sample_rate parameter value", 441324343243242342345300, 200, 0.707, True, ValueError,
-                       "Input sample_rate is not within the required interval of [-2147483648, 2147483647].")
+                       "Input sample_rate is not within the required interval of [-2147483648, 0) and (0, 2147483647].")
     test_invalid_input("invalid sample_rate parameter value", None, 200, 0.707, True, TypeError,
                        "Argument sample_rate with value None is not of type [<class 'int'>],"
                        " but got <class 'NoneType'>.")
diff --git a/tests/ut/python/dataset/test_bandpass_biquad.py b/tests/ut/python/dataset/test_bandpass_biquad.py
index 90a8ddc78b1..caa8277dc35 100644
--- a/tests/ut/python/dataset/test_bandpass_biquad.py
+++ b/tests/ut/python/dataset/test_bandpass_biquad.py
@@ -19,16 +19,14 @@ import mindspore.dataset.audio.transforms as audio
 from mindspore import log as logger
 
 
-def _count_unequal_element(data_expected, data_me, rtol, atol):
-
+def count_unequal_element(data_expected, data_me, rtol, atol):
     assert data_expected.shape == data_me.shape
     total_count = len(data_expected.flatten())
     error = np.abs(data_expected - data_me)
     greater = np.greater(error, atol + np.abs(data_expected) * rtol)
     loss_count = np.count_nonzero(greater)
-    assert (loss_count / total_count) < rtol, \
-        "\ndata_expected_std:{0}\ndata_me_error:{1}\nloss:{2}". \
-        format(data_expected[greater], data_me[greater], error[greater])
+    assert (loss_count / total_count) < rtol, "\ndata_expected_std:{0}\ndata_me_error:{1}\nloss:{2}".format(
+        data_expected[greater], data_me[greater], error[greater])
 
 
 def test_func_bandpass_biquad_eager():
@@ -42,7 +40,7 @@ def test_func_bandpass_biquad_eager():
     bandpass_biquad_op = audio.BandpassBiquad(44000, 200.0, 0.707, False)
     # Filtered waveform by bandpassbiquad
     output = bandpass_biquad_op(waveform)
-    _count_unequal_element(expect_waveform, output, 0.0001, 0.0001)
+    count_unequal_element(expect_waveform, output, 0.0001, 0.0001)
 
 
 def test_func_bandpass_biquad_pipeline():
@@ -58,12 +56,10 @@ def test_func_bandpass_biquad_pipeline():
     dataset = ds.NumpySlicesDataset(data, ["channel", "sample"], shuffle=False)
     bandpass_biquad_op = audio.BandpassBiquad(44000, 200.0)
     # Filtered waveform by bandpassbiquad
-    dataset = dataset.map(
-        input_columns=["channel"], operations=bandpass_biquad_op, num_parallel_workers=8)
+    dataset = dataset.map(input_columns=["channel"], operations=bandpass_biquad_op, num_parallel_workers=8)
     i = 0
-    for _ in dataset.create_dict_iterator(output_numpy=True):
-        _count_unequal_element(expect_waveform[i, :],
-                               _['channel'], 0.0001, 0.0001)
+    for item in dataset.create_dict_iterator(output_numpy=True):
+        count_unequal_element(expect_waveform[i, :], item['channel'], 0.0001, 0.0001)
         i += 1
 
 
@@ -72,8 +68,7 @@ def test_bandpass_biquad_invalid_input():
         logger.info(
             "Test BandpassBiquad with bad input: {0}".format(test_name))
         with pytest.raises(error) as error_info:
-            audio.BandpassBiquad(
-                sample_rate, central_freq, Q, const_skirt_gain)
+            audio.BandpassBiquad(sample_rate, central_freq, Q, const_skirt_gain)
         assert error_msg in str(error_info.value)
 
     test_invalid_input("invalid sample_rate parameter type as a float", 44100.5, 200, 0.707, True, TypeError,
@@ -85,7 +80,7 @@ def test_bandpass_biquad_invalid_input():
                        "Argument central_freq with value 200 is not of type [<class 'float'>, <class 'int'>],"
                        " but got <class 'str'>.")
     test_invalid_input("invalid sample_rate parameter value", 0, 200, 0.707, True, ValueError,
-                       "Input sample_rate can not be 0.")
+                       "Input sample_rate is not within the required interval of [-2147483648, 0) and (0, 2147483647].")
     test_invalid_input("invalid contral_freq parameter value", 44100, 32434324324234321, 0.707, True, ValueError,
                        "Input central_freq is not within the required interval of [-16777216, 16777216].")
     test_invalid_input("invalid Q parameter type as a String", 44100, 200, "0.707", True, TypeError,
@@ -96,7 +91,7 @@ def test_bandpass_biquad_invalid_input():
     test_invalid_input("invalid Q parameter value", 44100, 200, 0, True, ValueError,
                        "Input Q is not within the required interval of (0, 1].")
     test_invalid_input("invalid sample_rate parameter value", 441324343243242342345300, 200, 0.707, True, ValueError,
-                       "Input sample_rate is not within the required interval of [-2147483648, 2147483647].")
+                       "Input sample_rate is not within the required interval of [-2147483648, 0) and (0, 2147483647].")
     test_invalid_input("invalid sample_rate parameter value", None, 200, 0.707, True, TypeError,
                        "Argument sample_rate with value None is not of type [<class 'int'>],"
                        " but got <class 'NoneType'>.")
diff --git a/tests/ut/python/dataset/test_bandreject_biquad.py b/tests/ut/python/dataset/test_bandreject_biquad.py
index 3c799c6f827..af04d34de25 100644
--- a/tests/ut/python/dataset/test_bandreject_biquad.py
+++ b/tests/ut/python/dataset/test_bandreject_biquad.py
@@ -19,16 +19,14 @@ import mindspore.dataset.audio.transforms as audio
 from mindspore import log as logger
 
 
-def _count_unequal_element(data_expected, data_me, rtol, atol):
-
+def count_unequal_element(data_expected, data_me, rtol, atol):
     assert data_expected.shape == data_me.shape
     total_count = len(data_expected.flatten())
     error = np.abs(data_expected - data_me)
     greater = np.greater(error, atol + np.abs(data_expected) * rtol)
     loss_count = np.count_nonzero(greater)
-    assert (loss_count / total_count) < rtol, \
-        "\ndata_expected_std:{0}\ndata_me_error:{1}\nloss:{2}". \
-        format(data_expected[greater], data_me[greater], error[greater])
+    assert (loss_count / total_count) < rtol, "\ndata_expected_std:{0}\ndata_me_error:{1}\nloss:{2}".format(
+        data_expected[greater], data_me[greater], error[greater])
 
 
 def test_func_bandreject_biquad_eager():
@@ -43,7 +41,7 @@ def test_func_bandreject_biquad_eager():
     bandreject_biquad_op = audio.BandrejectBiquad(44100, 200.0, 0.707)
     # Filtered waveform by bandrejectbiquad
     output = bandreject_biquad_op(waveform)
-    _count_unequal_element(expect_waveform, output, 0.0001, 0.0001)
+    count_unequal_element(expect_waveform, output, 0.0001, 0.0001)
 
 
 def test_func_bandreject_biquad_pipeline():
@@ -63,9 +61,9 @@ def test_func_bandreject_biquad_pipeline():
     dataset = dataset.map(
         input_columns=["channel"], operations=bandreject_biquad_op, num_parallel_workers=8)
     i = 0
-    for _ in dataset.create_dict_iterator(output_numpy=True):
-        _count_unequal_element(expect_waveform[i, :],
-                               _['channel'], 0.0001, 0.0001)
+    for item in dataset.create_dict_iterator(output_numpy=True):
+        count_unequal_element(expect_waveform[i, :],
+                              item['channel'], 0.0001, 0.0001)
         i += 1
 
 
@@ -76,6 +74,7 @@ def test_bandreject_biquad_invalid_input():
         with pytest.raises(error) as error_info:
             audio.BandrejectBiquad(sample_rate, central_freq, Q)
         assert error_msg in str(error_info.value)
+
     test_invalid_input("invalid sample_rate parameter type as a float", 44100.5, 200, 0.707, TypeError,
                        "Argument sample_rate with value 44100.5 is not of type [<class 'int'>],"
                        " but got <class 'float'>.")
@@ -85,7 +84,7 @@ def test_bandreject_biquad_invalid_input():
                        "Argument central_freq with value 200 is not of type [<class 'float'>, <class 'int'>],"
                        " but got <class 'str'>.")
     test_invalid_input("invalid sample_rate parameter value", 0, 200, 0.707, ValueError,
-                       "Input sample_rate can not be 0.")
+                       "Input sample_rate is not within the required interval of [-2147483648, 0) and (0, 2147483647].")
     test_invalid_input("invalid contral_freq parameter value", 44100, 32434324324234321, 0.707, ValueError,
                        "Input central_freq is not within the required interval of [-16777216, 16777216].")
     test_invalid_input("invalid Q parameter type as a String", 44100, 200, "0.707", TypeError,
@@ -96,7 +95,7 @@ def test_bandreject_biquad_invalid_input():
     test_invalid_input("invalid Q parameter value", 44100, 200, 0, ValueError,
                        "Input Q is not within the required interval of (0, 1].")
     test_invalid_input("invalid sample_rate parameter value", 441324343243242342345300, 200, 0.707, ValueError,
-                       "Input sample_rate is not within the required interval of [-2147483648, 2147483647].")
+                       "Input sample_rate is not within the required interval of [-2147483648, 0) and (0, 2147483647].")
     test_invalid_input("invalid sample_rate parameter value", None, 200, 0.707, TypeError,
                        "Argument sample_rate with value None is not of type [<class 'int'>],"
                        " but got <class 'NoneType'>.")
@@ -106,6 +105,6 @@ def test_bandreject_biquad_invalid_input():
 
 
 if __name__ == "__main__":
-    test_func_band_biquad_eager()
-    test_func_band_biquad_pipeline()
-    test_band_biquad_invalid_input()
+    test_func_bandreject_biquad_eager()
+    test_func_bandreject_biquad_pipeline()
+    test_bandreject_biquad_invalid_input()
diff --git a/tests/ut/python/dataset/test_bass_biquad.py b/tests/ut/python/dataset/test_bass_biquad.py
index c06470db271..41f1e7c87cf 100644
--- a/tests/ut/python/dataset/test_bass_biquad.py
+++ b/tests/ut/python/dataset/test_bass_biquad.py
@@ -19,16 +19,14 @@ import mindspore.dataset.audio.transforms as audio
 from mindspore import log as logger
 
 
-def _count_unequal_element(data_expected, data_me, rtol, atol):
-
+def count_unequal_element(data_expected, data_me, rtol, atol):
     assert data_expected.shape == data_me.shape
     total_count = len(data_expected.flatten())
     error = np.abs(data_expected - data_me)
     greater = np.greater(error, atol + np.abs(data_expected) * rtol)
     loss_count = np.count_nonzero(greater)
-    assert (loss_count / total_count) < rtol, \
-        "\ndata_expected_std:{0}\ndata_me_error:{1}\nloss:{2}". \
-        format(data_expected[greater], data_me[greater], error[greater])
+    assert (loss_count / total_count) < rtol, "\ndata_expected_std:{0}\ndata_me_error:{1}\nloss:{2}".format(
+        data_expected[greater], data_me[greater], error[greater])
 
 
 def test_func_bass_biquad_eager():
@@ -42,7 +40,7 @@ def test_func_bass_biquad_eager():
     bass_biquad_op = audio.BassBiquad(44100, 50.0, 100.0, 0.707)
     # Filtered waveform by bassbiquad
     output = bass_biquad_op(waveform)
-    _count_unequal_element(expect_waveform, output, 0.0001, 0.0001)
+    count_unequal_element(expect_waveform, output, 0.0001, 0.0001)
 
 
 def test_func_bass_biquad_pipeline():
@@ -61,9 +59,9 @@ def test_func_bass_biquad_pipeline():
     dataset = dataset.map(
         input_columns=["channel"], operations=bass_biquad_op, num_parallel_workers=8)
     i = 0
-    for _ in dataset.create_dict_iterator(output_numpy=True):
-        _count_unequal_element(expect_waveform[i, :],
-                               _['channel'], 0.0001, 0.0001)
+    for item in dataset.create_dict_iterator(output_numpy=True):
+        count_unequal_element(expect_waveform[i, :],
+                              item['channel'], 0.0001, 0.0001)
         i += 1
 
 
@@ -73,6 +71,7 @@ def test_invalid_invalid_input():
         with pytest.raises(error) as error_info:
             audio.BassBiquad(sample_rate, gain, central_freq, Q)
         assert error_msg in str(error_info.value)
+
     test_invalid_input("invalid sample_rate parameter type as a float", 44100.5, 50.0, 200, 0.707, TypeError,
                        "Argument sample_rate with value 44100.5 is not of type [<class 'int'>],"
                        " but got <class 'float'>.")
@@ -90,7 +89,7 @@ def test_invalid_invalid_input():
                        " but got <class 'str'>.")
 
     test_invalid_input("invalid sample_rate parameter value", 441324343243242342345300, 50.0, 200, 0.707, ValueError,
-                       "Input sample_rate is not within the required interval of [-2147483648, 2147483647].")
+                       "Input sample_rate is not within the required interval of [-2147483648, 0) and (0, 2147483647].")
     test_invalid_input("invalid gain parameter value", 44100, 32434324324234321, 200, 0.707, ValueError,
                        "Input gain is not within the required interval of [-16777216, 16777216].")
     test_invalid_input("invalid contral_freq parameter value", 44100, 50, 32434324324234321, 0.707, ValueError,
@@ -107,10 +106,11 @@ def test_invalid_invalid_input():
                        " but got <class 'NoneType'>.")
 
     test_invalid_input("invalid sample_rate parameter value", 0, 50.0, 200, 0.707, ValueError,
-                       "Input sample_rate can not be 0.")
+                       "Input sample_rate is not within the required interval of [-2147483648, 0) and (0, 2147483647].")
     test_invalid_input("invalid Q parameter value", 44100, 50.0, 200, 1.707, ValueError,
                        "Input Q is not within the required interval of (0, 1].")
 
+
 if __name__ == '__main__':
     test_func_bass_biquad_eager()
     test_func_bass_biquad_pipeline()
diff --git a/tests/ut/python/dataset/test_batch.py b/tests/ut/python/dataset/test_batch.py
index 692c3f640ef..7044de4cec0 100644
--- a/tests/ut/python/dataset/test_batch.py
+++ b/tests/ut/python/dataset/test_batch.py
@@ -238,6 +238,23 @@ def test_batch_12():
     save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
 
 
+def test_batch_13():
+    """
+    Test batch: python_multiprocessing is True and does not work for per_batch_map is None
+    """
+    logger.info("test_batch_12")
+    # define parameters
+    batch_size = True
+
+    # apply dataset operations
+    data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
+    data1 = data1.batch(batch_size=batch_size, python_multiprocessing=True)
+
+    assert sum([1 for _ in data1]) == 12
+    filename = "batch_12_result.npz"
+    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
+
+
 def test_batch_exception_01():
     """
     Test batch exception: num_parallel_workers=0
@@ -493,6 +510,7 @@ if __name__ == '__main__':
     test_batch_10()
     test_batch_11()
     test_batch_12()
+    test_batch_13()
     test_batch_exception_01()
     test_batch_exception_02()
     test_batch_exception_03()
diff --git a/tests/ut/python/dataset/test_config.py b/tests/ut/python/dataset/test_config.py
index 08b20a28fe9..7a12eff3139 100644
--- a/tests/ut/python/dataset/test_config.py
+++ b/tests/ut/python/dataset/test_config.py
@@ -21,6 +21,7 @@ import glob
 import numpy as np
 
 import mindspore.dataset as ds
+import mindspore.dataset.engine.iterators as it
 import mindspore.dataset.transforms.py_transforms
 import mindspore.dataset.vision.c_transforms as c_vision
 import mindspore.dataset.vision.py_transforms as py_vision
@@ -311,6 +312,10 @@ def test_deterministic_python_seed_multi_thread():
     """
     logger.info("test_deterministic_python_seed_multi_thread")
 
+    # Sometimes there are some ITERATORS left in ITERATORS_LIST when run all UTs together,
+    # and cause core dump and blocking in this UT. Add cleanup() here to fix it.
+    it._cleanup()  # pylint: disable=W0212
+
     # Save original configuration values
     num_parallel_workers_original = ds.config.get_num_parallel_workers()
     seed_original = ds.config.get_seed()
diff --git a/tests/ut/python/dataset/test_datasets_cmuarctic.py b/tests/ut/python/dataset/test_datasets_cmuarctic.py
deleted file mode 100644
index 8dc36bddd91..00000000000
--- a/tests/ut/python/dataset/test_datasets_cmuarctic.py
+++ /dev/null
@@ -1,203 +0,0 @@
-"""
-Test CmuArctic dataset operators
-"""
-import os
-import pytest
-import numpy as np
-import matplotlib.pyplot as plt
-import mindspore.dataset as ds
-import mindspore.dataset.vision.c_transforms as vision
-from mindspore import log as logger
-
-DATA_DIR = "/home/user06/zjm/data/cmu_arctic/"
-
-def test_cmuarctic_basic():
-    """
-    Validate CmuarcticDataset
-    """
-    logger.info("Test CmuArcticDataset Op")
-
-    # case 1: test loading fault dataset
-    data1 = ds.CmuArcticDataset(DATA_DIR)
-    num_iter1 = 0
-    for _ in data1.create_dict_iterator( output_numpy=True,num_epochs=1):
-        num_iter1 += 1
-    assert num_iter1 == 1132
-
-    # case 2: test num_samples
-    data2 = ds.CmuArcticDataset(DATA_DIR, num_samples=500)
-    num_iter2 = 0
-    for _ in data2.create_dict_iterator( output_numpy=True,num_epochs=1):
-        num_iter2 += 1
-    assert num_iter2 == 500
-
-    # case 3: test repeat
-    data3 = ds.CmuArcticDataset(DATA_DIR, num_samples=200)
-    data3 = data3.repeat(5)
-    num_iter3 = 0
-    for _ in data3.create_dict_iterator( output_numpy=True,num_epochs=1):
-        num_iter3 += 1
-    assert num_iter3 == 1000
-
-    # case 4: test batch with drop_remainder=False
-    data4 = ds.CmuArcticDataset(DATA_DIR, num_samples=100)
-    assert data4.get_dataset_size() == 100
-    assert data4.get_batch_size() == 1
-    data4 = data4.batch(batch_size=7)  # drop_remainder is default to be False
-    assert data4.get_dataset_size() == 15
-    assert data4.get_batch_size() == 7
-    # num_iter4 = 0
-    # for _ in data4.create_dict_iterator( output_numpy=True,num_epochs=1):
-    #     num_iter4 += 1
-    # assert num_iter4 == 15
-
-    # case 5: test batch with drop_remainder=True
-    data5 = ds.CmuArcticDataset(DATA_DIR, num_samples=100)
-    assert data5.get_dataset_size() == 100
-    assert data5.get_batch_size() == 1
-    data5 = data5.batch(batch_size=7, drop_remainder=True)  # the rest of incomplete batch will be dropped
-    assert data5.get_dataset_size() == 14
-    assert data5.get_batch_size() == 7
-    # num_iter5 = 0
-    # for _ in data5.create_dict_iterator( output_numpy=True,num_epochs=1):
-    #     num_iter5 += 1
-    # assert num_iter5 == 14
-
-
-
-def test_cmu_arctic_sequential_sampler():
-    """
-    Test CmuArcticDataset with SequentialSampler
-    """
-    logger.info("Test CmuArcticDataset Op with SequentialSampler")
-    num_samples = 50
-    sampler = ds.SequentialSampler(num_samples=num_samples)
-    data1 = ds.CmuArcticDataset(DATA_DIR, sampler=sampler)
-    data2 = ds.CmuArcticDataset(DATA_DIR, shuffle=False, num_samples=num_samples)
-    label_list1, label_list2 = [], []
-    num_iter = 0
-    for item1, item2 in zip(data1.create_dict_iterator( output_numpy=True,num_epochs=1), data2.create_dict_iterator( output_numpy=True,num_epochs=1)):
-        label_list1.append(item1["utterance"])
-        label_list2.append(item2["utterance"])
-        num_iter += 1
-    np.testing.assert_array_equal(label_list1, label_list2)
-    assert num_iter == num_samples
-
-
-def test_cmu_arctic_exception():
-    """
-    Test error cases for CmuArcticDataset
-    """
-    logger.info("Test error cases for CmuArcticDataset")
-    error_msg_1 = "sampler and shuffle cannot be specified at the same time"
-    with pytest.raises(RuntimeError, match=error_msg_1):
-        ds.CmuArcticDataset(DATA_DIR, shuffle=False, sampler=ds.PKSampler(3))
-
-    error_msg_2 = "sampler and sharding cannot be specified at the same time"
-    with pytest.raises(RuntimeError, match=error_msg_2):
-        ds.CmuArcticDataset(DATA_DIR, sampler=ds.PKSampler(3), num_shards=2, shard_id=0)
-
-    error_msg_3 = "num_shards is specified and currently requires shard_id as well"
-    with pytest.raises(RuntimeError, match=error_msg_3):
-        ds.CmuArcticDataset(DATA_DIR, num_shards=10)
-
-    error_msg_4 = "shard_id is specified but num_shards is not"
-    with pytest.raises(RuntimeError, match=error_msg_4):
-        ds.CmuArcticDataset(DATA_DIR, shard_id=0)
-
-    error_msg_5 = "Input shard_id is not within the required interval"
-    with pytest.raises(ValueError, match=error_msg_5):
-        ds.CmuArcticDataset(DATA_DIR, num_shards=5, shard_id=-1)
-    with pytest.raises(ValueError, match=error_msg_5):
-        ds.CmuArcticDataset(DATA_DIR, num_shards=5, shard_id=5)
-    with pytest.raises(ValueError, match=error_msg_5):
-        ds.CmuArcticDataset(DATA_DIR, num_shards=2, shard_id=5)
-
-    error_msg_6 = "num_parallel_workers exceeds"
-    with pytest.raises(ValueError, match=error_msg_6):
-        ds.CmuArcticDataset(DATA_DIR, shuffle=False, num_parallel_workers=0)
-    with pytest.raises(ValueError, match=error_msg_6):
-        ds.CmuArcticDataset(DATA_DIR, shuffle=False, num_parallel_workers=256)
-    with pytest.raises(ValueError, match=error_msg_6):
-        ds.CmuArcticDataset(DATA_DIR, shuffle=False, num_parallel_workers=-2)
-
-    error_msg_7 = "Argument shard_id"
-    with pytest.raises(TypeError, match=error_msg_7):
-        ds.CmuArcticDataset(DATA_DIR, num_shards=2, shard_id="0")
-
-    def exception_func(item):
-        raise Exception("Error occur!")
-
-    error_msg_8 = "The corresponding data files"
-    with pytest.raises(RuntimeError, match=error_msg_8):
-        data = ds.CmuArcticDataset(DATA_DIR)
-        data = data.map(operations=exception_func, input_columns=["waveform"], num_parallel_workers=1)
-        for _ in data.__iter__():
-            pass
-    with pytest.raises(RuntimeError, match=error_msg_8):
-        data = ds.CmuArcticDataset(DATA_DIR)
-        data = data.map(operations=vision.Decode(), input_columns=["waveform"], num_parallel_workers=1)
-        data = data.map(operations=exception_func, input_columns=["waveform"], num_parallel_workers=1)
-        for _ in data.__iter__():
-            pass
-    with pytest.raises(RuntimeError, match=error_msg_8):
-        data = ds.CmuArcticDataset(DATA_DIR)
-        data = data.map(operations=exception_func, input_columns=["waveform"], num_parallel_workers=1)
-        for _ in data.__iter__():
-            pass
-
-
-def test_cmu_arctic_visualize(plot=False):
-    """
-    Visualize CmuArcticDataset results
-    """
-    logger.info("Test CmuArcticDataset visualization")
-
-    data1 = ds.CmuArcticDataset(DATA_DIR, num_samples=10, shuffle=False)
-    num_iter = 0
-    for item in data1.create_dict_iterator( num_epochs=1, output_numpy=True):
-        audio = item["waveform"]
-        sample_rate = item["sample_rate"]
-        assert isinstance(audio, np.ndarray)
-        assert audio.dtype == np.float64
-        assert sample_rate.dtype == np.uint32
-        num_iter += 1
-    assert num_iter == 10
-
-
-def test_cmu_arctic_usage():
-    """
-    Validate CmuArcticDataset audio readings
-    """
-    logger.info("Test CmuArcticDataset usage flag")
-
-    def test_config(usage, cmu_arctic_path=None):
-        cmu_arctic_path = DATA_DIR if cmu_arctic_path is None else cmu_arctic_path
-        try:
-            data = ds.CmuArcticDataset(cmu_arctic_path, usage=usage, shuffle=False)
-            num_rows = 0
-            for _ in data.create_dict_iterator(num_epochs=1, output_numpy=True):
-                num_rows += 1
-        except (ValueError, TypeError, RuntimeError) as e:
-            return str(e)
-        return num_rows
-
-    assert test_config("aew") == 1132
-    assert test_config("ahw") == 593
-    assert "Input usage is not within the valid set of ['aew', 'ahw', 'aup', 'awb', 'axb', 'bdl', 'clb', 'eey', 'fem', 'gka', 'jmk', 'ksp', 'ljm', 'lnh', 'rms', 'rxr', 'slp', 'slt']." in test_config("invalid")
-    assert "Argument usage with value ['list'] is not of type [<class 'str'>]" in test_config(["list"])
-
-    all_files_path = None
-    if all_files_path is not None:
-        assert test_config("aew", all_files_path) == 1132
-        assert test_config("ahw", all_files_path) == 593
-        assert ds.cmu_arcticDataset(all_files_path, usage="aew").get_dataset_size() == 1132
-        assert ds.cmu_arcticDataset(all_files_path, usage="ahw").get_dataset_size() == 593
-
-
-if __name__ == '__main__':
-    test_cmuarctic_basic()
-    test_cmu_arctic_sequential_sampler()
-    test_cmu_arctic_exception()
-    test_cmu_arctic_visualize(plot=True)
-    test_cmu_arctic_usage()
diff --git a/tests/ut/python/dataset/test_datasets_librispeech.py b/tests/ut/python/dataset/test_datasets_librispeech.py
deleted file mode 100644
index 0a12dc0601a..00000000000
--- a/tests/ut/python/dataset/test_datasets_librispeech.py
+++ /dev/null
@@ -1,209 +0,0 @@
-"""
-Test Librispeech dataset operators
-"""
-import pytest
-import numpy as np
-import matplotlib.pyplot as plt
-import mindspore.dataset as ds
-import mindspore.dataset.vision.c_transforms as vision
-from mindspore import log as logger
-
-DATA_DIR = "/home/user06/zjm/data/libri_speech/LibriSpeech/"
-
-
-def test_librispeech_basic():
-    """
-    Validate LibriSpeechDataset
-    """
-    logger.info("Test LibriSpeechDataset Op")
-
-    # case 1: test loading fault dataset
-    data1 = ds.LibriSpeechDataset(DATA_DIR)
-    num_iter1 = 0
-    for _ in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
-        num_iter1 += 1
-    assert num_iter1 == 2939
-
-    # case 2: test num_samples
-    data2 = ds.LibriSpeechDataset(DATA_DIR, num_samples=500)
-    num_iter2 = 0
-    for _ in data2.create_dict_iterator(num_epochs=1, output_numpy=True):
-        num_iter2 += 1
-    assert num_iter2 == 500
-
-    # case 3: test repeat
-    data3 = ds.LibriSpeechDataset(DATA_DIR, num_samples=200)
-    data3 = data3.repeat(5)
-    num_iter3 = 0
-    for _ in data3.create_dict_iterator(num_epochs=1, output_numpy=True):
-        num_iter3 += 1
-    assert num_iter3 == 1000
-
-    # case 4: test batch with drop_remainder=False
-    data4 = ds.LibriSpeechDataset(DATA_DIR, num_samples=100)
-    assert data4.get_dataset_size() == 100
-    assert data4.get_batch_size() == 1
-    data4 = data4.batch(batch_size=7)  # drop_remainder is default to be False
-    assert data4.get_dataset_size() == 15
-    assert data4.get_batch_size() == 7
-    # num_iter4 = 0
-    # for _ in data4.create_dict_iterator(num_epochs=1,output_numpy=True):
-    #     num_iter4 += 1
-    # assert num_iter4 == 15
-
-    # case 5: test batch with drop_remainder=True
-    data5 = ds.LibriSpeechDataset(DATA_DIR, num_samples=100)
-    assert data5.get_dataset_size() == 100
-    assert data5.get_batch_size() == 1
-    data5 = data5.batch(batch_size=7, drop_remainder=True)  # the rest of incomplete batch will be dropped
-    assert data5.get_dataset_size() == 14
-    assert data5.get_batch_size() == 7
-    # num_iter5 = 0
-    # for _ in data5.create_dict_iterator(num_epochs=1,output_numpy=True):
-    #     num_iter5 += 1
-    # assert num_iter5 == 14
-
-
-def test_librispeech_sequential_sampler():
-    """
-    Test LibriSpeechDataset with SequentialSampler
-    """
-    logger.info("Test LibriSpeechDataset Op with SequentialSampler")
-    num_samples = 50
-    sampler = ds.SequentialSampler(num_samples=num_samples)
-    data1 = ds.LibriSpeechDataset(DATA_DIR, sampler=sampler)
-    data2 = ds.LibriSpeechDataset(DATA_DIR, shuffle=False, num_samples=num_samples)
-    label_list1, label_list2 = [], []
-    num_iter = 0
-    for item1, item2 in zip(data1.create_dict_iterator(num_epochs=1, output_numpy=True),
-                            data2.create_dict_iterator(num_epochs=1, output_numpy=True)):
-        label_list1.append(item1["utterance"])
-        label_list2.append(item2["utterance"])
-        num_iter += 1
-    np.testing.assert_array_equal(label_list1, label_list2)
-    assert num_iter == num_samples
-
-
-def test_librispeech_exception():
-    """
-    Test error cases for LibriSpeechDataset
-    """
-    logger.info("Test error cases for LibriSpeechDataset")
-    error_msg_1 = "sampler and shuffle cannot be specified at the same time"
-    with pytest.raises(RuntimeError, match=error_msg_1):
-        ds.LibriSpeechDataset(DATA_DIR, shuffle=False, sampler=ds.PKSampler(3))
-
-    error_msg_2 = "sampler and sharding cannot be specified at the same time"
-    with pytest.raises(RuntimeError, match=error_msg_2):
-        ds.LibriSpeechDataset(DATA_DIR, sampler=ds.PKSampler(3), num_shards=2, shard_id=0)
-
-    error_msg_3 = "num_shards is specified and currently requires shard_id as well"
-    with pytest.raises(RuntimeError, match=error_msg_3):
-        ds.LibriSpeechDataset(DATA_DIR, num_shards=10)
-
-    error_msg_4 = "shard_id is specified but num_shards is not"
-    with pytest.raises(RuntimeError, match=error_msg_4):
-        ds.LibriSpeechDataset(DATA_DIR, shard_id=0)
-
-    error_msg_5 = "Input shard_id is not within the required interval"
-    with pytest.raises(ValueError, match=error_msg_5):
-        ds.LibriSpeechDataset(DATA_DIR, num_shards=5, shard_id=-1)
-    with pytest.raises(ValueError, match=error_msg_5):
-        ds.LibriSpeechDataset(DATA_DIR, num_shards=5, shard_id=5)
-    with pytest.raises(ValueError, match=error_msg_5):
-        ds.LibriSpeechDataset(DATA_DIR, num_shards=2, shard_id=5)
-
-    error_msg_6 = "num_parallel_workers exceeds"
-    with pytest.raises(ValueError, match=error_msg_6):
-        ds.LibriSpeechDataset(DATA_DIR, shuffle=False, num_parallel_workers=0)
-    with pytest.raises(ValueError, match=error_msg_6):
-        ds.LibriSpeechDataset(DATA_DIR, shuffle=False, num_parallel_workers=256)
-    with pytest.raises(ValueError, match=error_msg_6):
-        ds.LibriSpeechDataset(DATA_DIR, shuffle=False, num_parallel_workers=-2)
-
-    error_msg_7 = "Argument shard_id"
-    with pytest.raises(TypeError, match=error_msg_7):
-        ds.LibriSpeechDataset(DATA_DIR, num_shards=2, shard_id="0")
-
-    def exception_func(item):
-        raise Exception("Error occur!")
-
-    error_msg_8 = "The corresponding data files"
-    with pytest.raises(RuntimeError, match=error_msg_8):
-        data = ds.LibriSpeechDataset(DATA_DIR)
-        data = data.map(operations=exception_func, input_columns=["waveform"], num_parallel_workers=1)
-        for _ in data.__iter__():
-            pass
-    with pytest.raises(RuntimeError, match=error_msg_8):
-        data = ds.LibriSpeechDataset(DATA_DIR)
-        data = data.map(operations=vision.Decode(), input_columns=["waveform"], num_parallel_workers=1)
-        data = data.map(operations=exception_func, input_columns=["waveform"], num_parallel_workers=1)
-        for _ in data.__iter__():
-            pass
-    with pytest.raises(RuntimeError, match=error_msg_8):
-        data = ds.LibriSpeechDataset(DATA_DIR)
-        data = data.map(operations=exception_func, input_columns=["waveform"], num_parallel_workers=1)
-        for _ in data.__iter__():
-            pass
-
-
-def test_librispeech_visualize(plot=False):
-    """
-    Visualize LibriSpeechDataset results
-    """
-    logger.info("Test LibriSpeechDataset visualization")
-
-    data1 = ds.LibriSpeechDataset(DATA_DIR, num_samples=10, shuffle=False)
-    num_iter = 0
-    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
-        audio = item["waveform"]
-        sample_rate = item["sample_rate"]
-        speaker_id = item["speaker_id"];
-        chapter_id = item["chapter_id"];
-        utterance_id = item["utterance_id"];
-        assert isinstance(audio, np.ndarray)
-        assert audio.dtype == np.float64
-        assert sample_rate.dtype == np.uint32
-        assert speaker_id.dtype == np.uint32
-        assert chapter_id.dtype == np.uint32
-        assert utterance_id.dtype == np.uint32
-        num_iter += 1
-    assert num_iter == 10
-
-
-def test_librispeech_usage():
-    """
-    Validate LibriSpeechDataset audio readings
-    """
-    logger.info("Test LibriSpeechDataset usage flag")
-
-    def test_config(usage, librispeech_path=None):
-        librispeech_path = DATA_DIR if librispeech_path is None else librispeech_path
-        try:
-            data = ds.LibriSpeechDataset(librispeech_path, usage=usage, shuffle=False)
-            num_rows = 0
-            for _ in data.create_dict_iterator(num_epochs=1, output_numpy=True):
-                num_rows += 1
-        except (ValueError, TypeError, RuntimeError) as e:
-            return str(e)
-        return num_rows
-
-    assert test_config("dev-clean") == 2703
-    assert test_config("dev-other") == 2864
-    assert "Input usage is not within the valid set of ['dev-clean', 'dev-other', 'test-clean', 'test-other', 'train-clean-100', 'train-clean-360', 'train-other-500']." in test_config("invalid")
-    assert "Argument usage with value ['list'] is not of type [<class 'str'>]" in test_config(["list"])
-
-    all_files_path = None
-    if all_files_path is not None:
-        assert test_config("dev-clean", all_files_path) == 2703
-        assert test_config("dev-other", all_files_path) == 2864
-        assert ds.LibriSpeechDataset(all_files_path, usage="dev-clean").get_dataset_size() == 2703
-        assert ds.LibrispeechDataset(all_files_path, usage="dev-other").get_dataset_size() == 2864
-
-
-if __name__ == '__main__':
-    test_librispeech_basic()#pass
-    test_librispeech_sequential_sampler()#pass
-    test_librispeech_exception()#pass
-    test_librispeech_visualize(plot=True)#pass
-    test_librispeech_usage()#pass
diff --git a/tests/ut/python/dataset/test_datasets_sbd.py b/tests/ut/python/dataset/test_datasets_sbd.py
index 3801cfa669b..db7c3b9fd05 100644
--- a/tests/ut/python/dataset/test_datasets_sbd.py
+++ b/tests/ut/python/dataset/test_datasets_sbd.py
@@ -22,7 +22,6 @@ import mindspore.dataset as ds
 from mindspore import log as logger
 import mindspore.dataset.vision.c_transforms as c_vision
 
-
 DATASET_DIR = "../data/dataset/testSBData/sbd"
 
 
@@ -193,6 +192,7 @@ def test_sbd_usage():
     """
     Validate SBDataset image readings
     """
+
     def test_config(usage):
         try:
             data = ds.SBDataset(DATASET_DIR, task='Segmentation', usage=usage)
diff --git a/tests/ut/python/dataset/test_minddataset.py b/tests/ut/python/dataset/test_minddataset.py
index 9c470c56b54..8dce7bb2ec3 100644
--- a/tests/ut/python/dataset/test_minddataset.py
+++ b/tests/ut/python/dataset/test_minddataset.py
@@ -2568,6 +2568,60 @@ def test_distributed_shuffle_with_multi_epochs(create_multi_mindrecord_files):
     assert datas_epoch2 not in (datas_epoch1, datas_epoch3)
     assert datas_epoch3 not in (datas_epoch2, datas_epoch1)
 
+def test_field_is_null_numpy():
+    """add/remove nlp file"""
+    paths = ["{}{}".format(NLP_FILE_NAME, str(x).rjust(1, '0'))
+             for x in range(FILES_NUM)]
+    for x in paths:
+        if os.path.exists("{}".format(x)):
+            os.remove("{}".format(x))
+        if os.path.exists("{}.db".format(x)):
+            os.remove("{}.db".format(x))
+
+    writer = FileWriter(NLP_FILE_NAME, FILES_NUM)
+    data = []
+    # field array_d is null
+    for row_id in range(16):
+        data.append({
+            "label": row_id,
+            "array_a": np.reshape(np.array([0, 1, -1, 127, -128, 128, -129,
+                                            255, 256, -32768, 32767, -32769, 32768, -2147483648,
+                                            2147483647], dtype=np.int32), [-1]),
+            "array_b": np.reshape(np.array([0, 1, -1, 127, -128, 128, -129, 255,
+                                            256, -32768, 32767, -32769, 32768,
+                                            -2147483648, 2147483647, -2147483649, 2147483649,
+                                            -922337036854775808, 9223372036854775807]), [1, -1]),
+            "array_d": np.array([], dtype=np.int64)
+        })
+    nlp_schema_json = {"label": {"type": "int32"},
+                       "array_a": {"type": "int32",
+                                   "shape": [-1]},
+                       "array_b": {"type": "int64",
+                                   "shape": [1, -1]},
+                       "array_d": {"type": "int64",
+                                   "shape": [-1]}
+                       }
+    writer.set_header_size(1 << 14)
+    writer.set_page_size(1 << 15)
+    writer.add_schema(nlp_schema_json, "nlp_schema")
+    writer.write_raw_data(data)
+    writer.commit()
+
+    data_set = ds.MindDataset(dataset_file=NLP_FILE_NAME + "0",
+                              columns_list=["label", "array_a", "array_b", "array_d"],
+                              num_parallel_workers=2,
+                              shuffle=False)
+    assert data_set.get_dataset_size() == 16
+    assert data_set.output_shapes() == [[], [15], [1, 19], []]
+    assert data_set.output_types()[0] == np.int32
+    assert data_set.output_types()[1] == np.int32
+    assert data_set.output_types()[2] == np.int64
+    assert data_set.output_types()[3] == np.int64
+
+    for x in paths:
+        os.remove("{}".format(x))
+        os.remove("{}.db".format(x))
+
 if __name__ == '__main__':
     test_nlp_compress_data(add_and_remove_nlp_compress_file)
     test_nlp_compress_data_old_version(add_and_remove_nlp_compress_file)
@@ -2603,3 +2657,4 @@ if __name__ == '__main__':
     test_shuffle_with_global_infile_files(create_multi_mindrecord_files)
     test_distributed_shuffle_with_global_infile_files(create_multi_mindrecord_files)
     test_distributed_shuffle_with_multi_epochs(create_multi_mindrecord_files)
+    test_field_is_null_numpy()
diff --git a/tests/ut/python/dataset/test_rgb_bgr.py b/tests/ut/python/dataset/test_rgb_bgr.py
index b6c93a64a97..6b1fd20ef59 100644
--- a/tests/ut/python/dataset/test_rgb_bgr.py
+++ b/tests/ut/python/dataset/test_rgb_bgr.py
@@ -24,8 +24,6 @@ import mindspore.dataset.vision.c_transforms as vision
 import mindspore.dataset.vision.py_transforms as py_vision
 import mindspore.dataset.vision.py_transforms_util as util
 
-GENERATE_GOLDEN = False
-
 DATA_DIR = ["../data/dataset/test_tf_file_3_images/train-0000-of-0001.data"]
 SCHEMA_DIR = "../data/dataset/test_tf_file_3_images/datasetSchema.json"
 
diff --git a/tests/ut/python/dataset/test_time_stretch.py b/tests/ut/python/dataset/test_time_stretch.py
index 52a796c7ad6..577c40ebdbf 100644
--- a/tests/ut/python/dataset/test_time_stretch.py
+++ b/tests/ut/python/dataset/test_time_stretch.py
@@ -31,27 +31,24 @@ COMPLEX = 2
 def gen(shape):
     np.random.seed(0)
     data = np.random.random(shape)
-    yield(np.array(data, dtype=np.float32),)
+    yield (np.array(data, dtype=np.float32),)
 
 
-def _count_unequal_element(data_expected, data_me, rtol, atol):
+def count_unequal_element(data_expected, data_me, rtol, atol):
     assert data_expected.shape == data_me.shape
     total_count = len(data_expected.flatten())
     error = np.abs(data_expected - data_me)
     greater = np.greater(error, atol + np.abs(data_expected) * rtol)
     loss_count = np.count_nonzero(greater)
-    assert (loss_count / total_count) < rtol, \
-        "\ndata_expected_std:{0}\ndata_me_error:{1}\nloss:{2}". \
-        format(data_expected[greater], data_me[greater], error[greater])
+    assert (loss_count / total_count) < rtol, "\ndata_expected_std:{0}\ndata_me_error:{1}\nloss:{2}".format(
+        data_expected[greater], data_me[greater], error[greater])
 
 
 def allclose_nparray(data_expected, data_me, rtol, atol, equal_nan=True):
     if np.any(np.isnan(data_expected)):
         assert np.allclose(data_me, data_expected, rtol, atol, equal_nan=equal_nan)
     elif not np.allclose(data_me, data_expected, rtol, atol, equal_nan=equal_nan):
-        _count_unequal_element(data_expected, data_me, rtol, atol)
-    else:
-        assert True
+        count_unequal_element(data_expected, data_me, rtol, atol)
 
 
 def test_time_stretch_pipeline():
@@ -60,18 +57,14 @@ def test_time_stretch_pipeline():
     """
     logger.info("test TimeStretch op")
     generator = gen([CHANNEL_NUM, FREQ, FRAME_NUM, COMPLEX])
-    data1 = ds.GeneratorDataset(source=generator, column_names=[
-        "multi_dimensional_data"])
+    data1 = ds.GeneratorDataset(source=generator, column_names=["multi_dimensional_data"])
 
-    transforms = [
-        c_audio.TimeStretch(512, FREQ, 1.3)
-    ]
-    data1 = data1.map(operations=transforms, input_columns=[
-        "multi_dimensional_data"])
+    transforms = [c_audio.TimeStretch(512, FREQ, 1.3)]
+    data1 = data1.map(operations=transforms, input_columns=["multi_dimensional_data"])
 
     for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
         out_put = item["multi_dimensional_data"]
-    assert out_put.shape == (CHANNEL_NUM, FREQ, np.ceil(FRAME_NUM/1.3), COMPLEX)
+    assert out_put.shape == (CHANNEL_NUM, FREQ, np.ceil(FRAME_NUM / 1.3), COMPLEX)
 
 
 def test_time_stretch_pipeline_invalid_param():
@@ -80,19 +73,15 @@ def test_time_stretch_pipeline_invalid_param():
     """
     logger.info("test TimeStretch op with invalid values")
     generator = gen([CHANNEL_NUM, FREQ, FRAME_NUM, COMPLEX])
-    data1 = ds.GeneratorDataset(source=generator, column_names=[
-        "multi_dimensional_data"])
+    data1 = ds.GeneratorDataset(source=generator, column_names=["multi_dimensional_data"])
 
     with pytest.raises(ValueError, match=r"Input fixed_rate is not within the required interval of \(0, 16777216\]."):
-        transforms = [
-            c_audio.TimeStretch(512, FREQ, -1.3)
-        ]
-        data1 = data1.map(operations=transforms, input_columns=[
-            "multi_dimensional_data"])
+        transforms = [c_audio.TimeStretch(512, FREQ, -1.3)]
+        data1 = data1.map(operations=transforms, input_columns=["multi_dimensional_data"])
 
         for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
             out_put = item["multi_dimensional_data"]
-        assert out_put.shape == (CHANNEL_NUM, FREQ, np.ceil(FRAME_NUM/1.3), COMPLEX)
+        assert out_put.shape == (CHANNEL_NUM, FREQ, np.ceil(FRAME_NUM / 1.3), COMPLEX)
 
 
 def test_time_stretch_eager():
@@ -102,7 +91,7 @@ def test_time_stretch_eager():
     logger.info("test TimeStretch op with customized parameter values")
     spectrogram = next(gen([CHANNEL_NUM, FREQ, FRAME_NUM, COMPLEX]))[0]
     out_put = c_audio.TimeStretch(512, FREQ, 1.3)(spectrogram)
-    assert out_put.shape == (CHANNEL_NUM, FREQ, np.ceil(FRAME_NUM/1.3), COMPLEX)
+    assert out_put.shape == (CHANNEL_NUM, FREQ, np.ceil(FRAME_NUM / 1.3), COMPLEX)
 
 
 def test_percision_time_stretch_eager():
diff --git a/tests/ut/python/ir/test_dtype.py b/tests/ut/python/ir/test_dtype.py
index 49f834092e0..42da96ccb52 100644
--- a/tests/ut/python/ir/test_dtype.py
+++ b/tests/ut/python/ir/test_dtype.py
@@ -35,6 +35,8 @@ def test_dtype_to_nptype():
     assert ms.dtype_to_nptype(ms.float16) == np.float16
     assert ms.dtype_to_nptype(ms.float32) == np.float32
     assert ms.dtype_to_nptype(ms.float64) == np.float64
+    assert ms.dtype_to_nptype(ms.complex64) == np.complex64
+    assert ms.dtype_to_nptype(ms.complex128) == np.complex128
 
 
 def test_dtype_to_pytype():
@@ -51,6 +53,8 @@ def test_dtype_to_pytype():
     assert ms.dtype_to_pytype(ms.float16) == float
     assert ms.dtype_to_pytype(ms.float32) == float
     assert ms.dtype_to_pytype(ms.float64) == float
+    assert ms.dtype_to_pytype(ms.complex64) == complex
+    assert ms.dtype_to_pytype(ms.complex128) == complex
     assert ms.dtype_to_pytype(ms.list_) == list
     assert ms.dtype_to_pytype(ms.tuple_) == tuple
     assert ms.dtype_to_pytype(ms.string) == str
@@ -94,6 +98,12 @@ def test_dtype():
     me_type = dtype.get_py_obj_dtype(x)
     assert me_type == ms.bool_
 
+    x = 0.1+3j
+    me_type = dtype.get_py_obj_dtype(type(x))
+    assert me_type == ms.complex128
+    me_type = dtype.get_py_obj_dtype(x)
+    assert me_type == ms.complex128
+
     # support str
     # x = "string type"
 
diff --git a/tests/ut/python/ir/test_tensor.py b/tests/ut/python/ir/test_tensor.py
index 2ec8bff3600..4f2e29c0a1e 100644
--- a/tests/ut/python/ir/test_tensor.py
+++ b/tests/ut/python/ir/test_tensor.py
@@ -74,6 +74,45 @@ def test_tensor_type_float16():
     assert t_float16.shape == (2, 3)
     assert t_float16.dtype == ms.float16
 
+def test_tensor_type_complex64():
+    np_input = np.array(
+        [[1+0.1j, 2j, 3+0.3j], [4-0.4j, 5, 6]], dtype=np.complex64)
+    t_complex64 = ms.Tensor(np_input)
+    assert isinstance(t_complex64, ms.Tensor)
+    assert t_complex64.shape == (2, 3)
+    assert t_complex64.dtype == ms.complex64
+    assert np.all(t_complex64.asnumpy() == np_input)
+
+
+def test_tensor_type_complex64_user_define():
+    np_input = np.zeros([1, 2, 3])
+    t_complex64 = ms.Tensor(np_input, ms.complex64)
+    assert isinstance(t_complex64, ms.Tensor)
+    assert t_complex64.shape == (1, 2, 3)
+    assert t_complex64.dtype == ms.complex64
+    assert np.all(t_complex64.asnumpy() == np_input)
+
+
+def test_tensor_type_complex128():
+    np_input = np.array(
+        [[1+0.1j, 2j, 3+0.3j], [4-0.4j, 5, 6]], dtype=np.complex128)
+    t_complex128 = ms.Tensor(np_input)
+    assert isinstance(t_complex128, ms.Tensor)
+    assert t_complex128.shape == (2, 3)
+    assert t_complex128.dtype == ms.complex128
+    assert np.all(t_complex128.asnumpy() == np_input)
+    np_input = (1, 2.22222222j, 3)
+    t_complex128 = ms.Tensor(np_input)
+    assert np.all(t_complex128.asnumpy() == np_input)
+
+
+def test_tensor_type_complex128_user_define():
+    np_input = np.zeros([1, 2, 3])
+    t_complex128 = ms.Tensor(np_input, ms.complex128)
+    assert isinstance(t_complex128, ms.Tensor)
+    assert t_complex128.shape == (1, 2, 3)
+    assert t_complex128.dtype == ms.complex128
+    assert np.all(t_complex128.asnumpy() == np_input)
 
 def test_tensor_type_float32():
     t_float32 = ms.Tensor(np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32))
@@ -332,13 +371,6 @@ def test_tensor_input_ndarray_bool():
     inp = np.array([False, 2, 4])
     ms.Tensor(inp)
 
-
-def test_tensor_input_ndarray_complex():
-    with pytest.raises(TypeError):
-        inp = np.array([20j, 2, 4])
-        ms.Tensor(inp)
-
-
 def test_tensor_input_ndarray_none():
     with pytest.raises(TypeError):
         inp = np.array([None, 2, 4])
@@ -445,6 +477,19 @@ def test_tensor_dtype_fp64_to_uint8():
     assert t.shape == (2, 3)
     assert t.dtype == ms.uint8
 
+def test_tensor_dtype_complex64_to_float32():
+    array = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.complex64)
+    t = ms.Tensor(array, ms.float32)
+    assert isinstance(t, ms.Tensor)
+    assert t.shape == (2, 3)
+    assert t.dtype == ms.float32
+
+def test_tensor_dtype_float32_to_complex64():
+    array = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
+    t = ms.Tensor(array, ms.complex64)
+    assert isinstance(t, ms.Tensor)
+    assert t.shape == (2, 3)
+    assert t.dtype == ms.complex64
 
 def test_tensor_operation():
     x = Tensor(np.ones((3, 3)) * 4)
diff --git a/tests/ut/python/nn/test_parameter.py b/tests/ut/python/nn/test_parameter.py
index 893a605b2b9..b794e7165dd 100644
--- a/tests/ut/python/nn/test_parameter.py
+++ b/tests/ut/python/nn/test_parameter.py
@@ -200,6 +200,12 @@ def test_parameter_lazy_init():
     assert isinstance(para.data, Tensor)
     assert np.array_equal(para.data.asnumpy(), np.ones((1, 2, 3)))
 
+    para = Parameter(initializer('ones', [1, 2, 3], mstype.complex64), 'test1')
+    assert isinstance(para.data, Tensor)
+    para = para.init_data()
+    assert isinstance(para.data, Tensor)
+    assert np.array_equal(para.data.asnumpy(), np.ones((1, 2, 3)))
+
     # Call init_data() after set_data is set.
     para = Parameter(initializer('ones', [1, 2, 3], mstype.float32), 'test2')
     assert isinstance(para.data, Tensor)
diff --git a/tests/ut/python/nn/test_transformer.py b/tests/ut/python/nn/test_transformer.py
index 0c1596747a2..8731a5ea7b3 100644
--- a/tests/ut/python/nn/test_transformer.py
+++ b/tests/ut/python/nn/test_transformer.py
@@ -14,41 +14,93 @@
 # ============================================================================
 """ test transformer"""
 import numpy as np
+import pytest
 from mindspore import Tensor
 from mindspore.common import dtype
-from mindspore.nn.parallel import MultiHeadAttention, FeedForward, TransformerEncoderLayer, TransformerEncoder, \
-    TransformerDecoder, TransformerDecoderLayer, Transformer
+from mindspore.parallel.nn import MultiHeadAttention, FeedForward, TransformerEncoderLayer, TransformerEncoder, \
+    TransformerDecoder, TransformerDecoderLayer, Transformer, CrossEntropyLoss, AttentionMask
 from mindspore.common.api import _executor
 
 
 def test_transformer_encoder_only():
-    model = Transformer(encoder_layers=2,
+    model = Transformer(batch_size=2,
+                        src_seq_length=20,
+                        tgt_seq_length=0,
+                        encoder_layers=2,
                         decoder_layers=0,
                         hidden_size=64,
-                        ffn_hidden_size=64,
-                        src_seq_length=16,
-                        tgt_seq_length=32)
+                        ffn_hidden_size=64)
 
     encoder_input_value = Tensor(np.ones((2, 20, 64)), dtype.float32)
-    encoder_input_mask = Tensor(np.ones((2, 1, 20, 20)), dtype.float16)
+    encoder_input_mask = Tensor(np.ones((2, 20, 20)), dtype.float16)
+
+    _executor.compile(model, encoder_input_value, encoder_input_mask)
+
+
+def test_transformer_encoder_log_softmax():
+    with pytest.raises(ValueError):
+        model = Transformer(batch_size=2,
+                            src_seq_length=20,
+                            tgt_seq_length=0,
+                            encoder_layers=2,
+                            decoder_layers=0,
+                            hidden_act='logsoftmax',
+                            hidden_size=64,
+                            ffn_hidden_size=64)
+
+        encoder_input_value = Tensor(np.ones((2, 20, 64)), dtype.float32)
+        encoder_input_mask = Tensor(np.ones((2, 20, 20)), dtype.float16)
+
+        _executor.compile(model, encoder_input_value, encoder_input_mask)
+
+
+def test_transformer_encoder_leakyrelu():
+    model = Transformer(batch_size=2,
+                        src_seq_length=20,
+                        tgt_seq_length=0,
+                        encoder_layers=2,
+                        decoder_layers=0,
+                        hidden_act='leakyrelu',
+                        hidden_size=64,
+                        ffn_hidden_size=64)
+
+    encoder_input_value = Tensor(np.ones((2, 20, 64)), dtype.float32)
+    encoder_input_mask = Tensor(np.ones((2, 20, 20)), dtype.float16)
+
+    _executor.compile(model, encoder_input_value, encoder_input_mask)
+
+
+def test_transformer_encoder_logsigmoid():
+    model = Transformer(batch_size=2,
+                        src_seq_length=20,
+                        tgt_seq_length=0,
+                        encoder_layers=2,
+                        decoder_layers=0,
+                        hidden_act='logsigmoid',
+                        hidden_size=64,
+                        ffn_hidden_size=64)
+
+    encoder_input_value = Tensor(np.ones((2, 20, 64)), dtype.float32)
+    encoder_input_mask = Tensor(np.ones((2, 20, 20)), dtype.float16)
 
     _executor.compile(model, encoder_input_value, encoder_input_mask)
 
 
 def test_encoder_and_decoder():
-    model = Transformer(encoder_layers=1,
+    model = Transformer(batch_size=2,
+                        src_seq_length=20,
+                        tgt_seq_length=10,
+                        encoder_layers=1,
                         decoder_layers=2,
                         hidden_size=64,
-                        ffn_hidden_size=64,
-                        src_seq_length=20,
-                        tgt_seq_length=20)
+                        ffn_hidden_size=64)
 
     encoder_input_value = Tensor(np.ones((2, 20, 64)), dtype.float32)
-    encoder_input_mask = Tensor(np.ones((2, 1, 20, 20)), dtype.float16)
+    encoder_input_mask = Tensor(np.ones((2, 20, 20)), dtype.float16)
 
     decoder_input_value = Tensor(np.ones((2, 10, 64)), dtype.float32)
-    decoder_input_mask = Tensor(np.ones((2, 1, 10, 10)), dtype.float16)
-    memory_mask = Tensor(np.ones((2, 1, 10, 20)), dtype.float16)
+    decoder_input_mask = Tensor(np.ones((2, 10, 10)), dtype.float16)
+    memory_mask = Tensor(np.ones((2, 10, 20)), dtype.float16)
 
     _executor.compile(model, encoder_input_value, encoder_input_mask,
                       decoder_input_value,
@@ -57,14 +109,15 @@ def test_encoder_and_decoder():
 
 
 def test_transformer_encoder():
-    model = TransformerEncoder(num_layers=2,
+    model = TransformerEncoder(batch_size=2,
+                               seq_length=16,
+                               num_layers=2,
                                hidden_size=8,
                                ffn_hidden_size=64,
-                               seq_length=16,
                                num_heads=2)
 
     encoder_input_value = Tensor(np.ones((2, 16, 8)), dtype.float32)
-    encoder_input_mask = Tensor(np.ones((2, 1, 16, 16)), dtype.float16)
+    encoder_input_mask = Tensor(np.ones((2, 16, 16)), dtype.float16)
 
     _executor.compile(model,
                       encoder_input_value,
@@ -72,11 +125,11 @@ def test_transformer_encoder():
 
 
 def test_transformer_encoder_layer():
-    model = TransformerEncoderLayer(hidden_size=8, ffn_hidden_size=64, seq_length=16,
+    model = TransformerEncoderLayer(batch_size=2, hidden_size=8, ffn_hidden_size=64, seq_length=16,
                                     num_heads=2)
 
     encoder_input_value = Tensor(np.ones((2, 16, 8)), dtype.float32)
-    encoder_input_mask = Tensor(np.ones((2, 1, 16, 16)), dtype.float16)
+    encoder_input_mask = Tensor(np.ones((2, 16, 16)), dtype.float16)
 
     _executor.compile(model,
                       encoder_input_value,
@@ -84,11 +137,13 @@ def test_transformer_encoder_layer():
 
 
 def test_transformer_encoder_layer_post_ture():
-    model = TransformerEncoderLayer(hidden_size=8, ffn_hidden_size=64, seq_length=16,
+    model = TransformerEncoderLayer(batch_size=2,
+                                    seq_length=16,
+                                    hidden_size=8, ffn_hidden_size=64,
                                     num_heads=2, post_layernorm_residual=True)
 
     encoder_input_value = Tensor(np.ones((2, 16, 8)), dtype.float32)
-    encoder_input_mask = Tensor(np.ones((2, 1, 16, 16)), dtype.float16)
+    encoder_input_mask = Tensor(np.ones((2, 16, 16)), dtype.float16)
 
     _executor.compile(model,
                       encoder_input_value,
@@ -97,16 +152,18 @@ def test_transformer_encoder_layer_post_ture():
 
 def test_transformer_decoder():
     model = TransformerDecoder(num_layers=1,
+                               batch_size=2,
+                               src_seq_length=20,
+                               tgt_seq_length=10,
                                hidden_size=64,
                                ffn_hidden_size=64,
-                               num_heads=2,
-                               seq_length=10)
+                               num_heads=2)
 
     encoder_input_value = Tensor(np.ones((2, 20, 64)), dtype.float32)
 
     decoder_input_value = Tensor(np.ones((2, 10, 64)), dtype.float32)
-    decoder_input_mask = Tensor(np.ones((2, 1, 10, 10)), dtype.float16)
-    memory_mask = Tensor(np.ones((2, 1, 10, 20)), dtype.float16)
+    decoder_input_mask = Tensor(np.ones((2, 10, 10)), dtype.float16)
+    memory_mask = Tensor(np.ones((2, 10, 20)), dtype.float16)
 
     _executor.compile(model, decoder_input_value, decoder_input_mask,
                       encoder_input_value,
@@ -115,16 +172,18 @@ def test_transformer_decoder():
 
 def test_transformer_decoder_layer():
     model = TransformerDecoderLayer(
+        batch_size=2,
+        src_seq_length=20,
+        tgt_seq_length=10,
         hidden_size=64,
         ffn_hidden_size=64,
-        num_heads=2,
-        seq_length=10)
+        num_heads=2)
 
     encoder_input_value = Tensor(np.ones((2, 20, 64)), dtype.float32)
 
     decoder_input_value = Tensor(np.ones((2, 10, 64)), dtype.float32)
-    decoder_input_mask = Tensor(np.ones((2, 1, 10, 10)), dtype.float16)
-    memory_mask = Tensor(np.ones((2, 1, 10, 20)), dtype.float16)
+    decoder_input_mask = Tensor(np.ones((2, 10, 10)), dtype.float16)
+    memory_mask = Tensor(np.ones((2, 10, 20)), dtype.float16)
 
     _executor.compile(model, decoder_input_value, decoder_input_mask,
                       encoder_input_value,
@@ -133,12 +192,15 @@ def test_transformer_decoder_layer():
 
 def test_multihead_attention():
     model = MultiHeadAttention(hidden_size=15,
+                               src_seq_length=20,
+                               tgt_seq_length=20,
+                               batch_size=2,
                                num_heads=3)
     from_tensor = Tensor(np.ones((2, 20, 15)), dtype.float32)
     to_tensor = Tensor(np.ones((2, 20, 15)), dtype.float16)
-    attention_mask = Tensor(np.ones((2, 1, 20, 20)), dtype.float16)
+    attention_mask = Tensor(np.ones((2, 20, 20)), dtype.float16)
 
-    _executor.compile(model, from_tensor, to_tensor, attention_mask)
+    _executor.compile(model, from_tensor, to_tensor, to_tensor, attention_mask)
 
 
 def test_feedforward_layer():
@@ -149,3 +211,18 @@ def test_feedforward_layer():
     tensor = Tensor(np.ones((2, 20, 15)), dtype.float32)
 
     _executor.compile(model, tensor)
+
+
+def test_cross_entroy():
+    model = CrossEntropyLoss()
+    logits = Tensor(np.array([[3, 5, 6, 9, 12, 33, 42, 12, 32, 72]]), dtype.float32)
+    labels_np = np.array([1]).astype(np.int32)
+    input_mask = Tensor(np.ones(1).astype(np.float32))
+    labels = Tensor(labels_np)
+    _executor.compile(model, logits, labels, input_mask)
+
+
+def test_attention_mask():
+    model = AttentionMask(seq_length=19)
+    inputs = Tensor(np.ones((2, 19)), dtype.float32)
+    _executor.compile(model, inputs)
diff --git a/tests/ut/python/ops/test_control_ops.py b/tests/ut/python/ops/test_control_ops.py
index 4144547f0e6..880698e4980 100644
--- a/tests/ut/python/ops/test_control_ops.py
+++ b/tests/ut/python/ops/test_control_ops.py
@@ -1015,3 +1015,23 @@ def test_recursive_call():
         net(input_data)
     os.environ['ENV_RECURSIVE_EVAL'] = '0'
     context.set_context(max_call_depth=old_max_call_depth)
+
+
+# grad for Tensor(Bool) input and eliminate AddN(MakeTuple(Xs, zeros_like(Bool)))
+def test_grad_tensor_bool():
+    class Net(nn.Cell):
+        def __init__(self):
+            super(Net, self).__init__()
+
+        def construct(self, x, y, z):
+            out = z
+            while x:
+                out = out + z
+                x = y
+            return out
+
+    x = Tensor(np.array(False).astype(np.bool))
+    y = Tensor(np.array(False).astype(np.bool))
+    z = Tensor(np.ones([2, 3], dtype=np.float32))
+    net = grad_all(Net())
+    net(x, y, z)
diff --git a/tests/ut/python/ops/test_ops.py b/tests/ut/python/ops/test_ops.py
index b21e85500bc..c352d76969c 100755
--- a/tests/ut/python/ops/test_ops.py
+++ b/tests/ut/python/ops/test_ops.py
@@ -2189,6 +2189,10 @@ test_case_nn_ops = [
                         Tensor(np.zeros((1, 1, 2, 2)), mstype.uint16)],
         'desc_bprop': [],
         'skip': ['backward']}),
+    ('Roll', {
+        'block': nn.Roll(shift=[1, -2], axis=[0, 1]),
+        'desc_inputs': [Tensor([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]], mstype.float32)],
+        'desc_bprop': [Tensor([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]], mstype.float32)]}),
     ('SoftShrink', {
         'block': P.SoftShrink(),
         'desc_inputs': [Tensor(np.array([[0.5297, 0.7871, 1.1754], [0.7836, 0.6218, -1.1542]]), mstype.float32)],
diff --git a/tests/ut/python/optimizer/test_recompute.py b/tests/ut/python/optimizer/test_recompute.py
index 28bbb38de8d..0e35c7f22a7 100644
--- a/tests/ut/python/optimizer/test_recompute.py
+++ b/tests/ut/python/optimizer/test_recompute.py
@@ -38,7 +38,7 @@ def test_set_recompute_true():
 
 def test_set_recompute_false():
     net = Net()
-    net.pool.recompute(False)
+    net.pool.recompute(mode=False)
     assert net.pool.get_scope() is None
 
 
@@ -51,32 +51,32 @@ def test_set_recompute_true_twice():
 
 def test_set_recompute_false_twice():
     net = Net()
-    net.pool.recompute(False)
-    net.pool.recompute(False)
+    net.pool.recompute(mode=False)
+    net.pool.recompute(mode=False)
     assert net.pool.get_scope() is None
 
 
 def test_reset_recompute1():
     net = Net()
-    net.pool.recompute(True)
-    net.pool.recompute(False)
+    net.pool.recompute(mode=True)
+    net.pool.recompute(mode=False)
     assert net.pool.get_scope() == ""
 
 
 def test_reset_recompute2():
     net = Net()
-    net.pool.recompute(False)
-    net.pool.recompute(True)
+    net.pool.recompute(mode=False)
+    net.pool.recompute(mode=True)
     assert net.pool.get_scope() == recompute_prefix
 
 
 def test_set_scope_and_set_recompute_repeatedly():
     net = Net()
-    net.pool.recompute(True)
+    net.pool.recompute(mode=True)
     assert net.pool.get_scope() == recompute_prefix
-    net.pool.recompute(False)
+    net.pool.recompute(mode=False)
     assert net.pool.get_scope() == ""
-    net.pool.recompute(True)
+    net.pool.recompute(mode=True)
     assert net.pool.get_scope() == recompute_prefix
-    net.pool.recompute(False)
+    net.pool.recompute(mode=False)
     assert net.pool.get_scope() == ""
diff --git a/tests/ut/python/parallel/test_alltoall.py b/tests/ut/python/parallel/test_alltoall.py
index 8cc29cfa0a5..df7537342d8 100644
--- a/tests/ut/python/parallel/test_alltoall.py
+++ b/tests/ut/python/parallel/test_alltoall.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import re
+import pytest
 import numpy as np
 
 import mindspore as ms
@@ -24,11 +25,20 @@ from mindspore.common.parameter import Parameter
 from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
 from mindspore.nn.optim.momentum import Momentum
 from mindspore.ops import operations as P
+from mindspore.ops.operations.comm_ops import _AlltoAll
 from mindspore.parallel._utils import _reset_op_id
 from mindspore.train import Model
 from mindspore.context import ParallelMode
+from mindspore.communication.management import GlobalComm, init
 from tests.dataset_mock import MindData
 
+context.set_context(device_target="Ascend")
+GlobalComm.CHECK_ENVS = False
+init("hccl")
+GlobalComm.CHECK_ENVS = True
+
+_x1 = Tensor(np.ones([64, 3, 224, 224]), dtype=ms.float32)
+
 
 class Dataset(MindData):
     def __init__(self, predict, label, length=3):
@@ -109,5 +119,202 @@ def test_all_to_all():
     context.set_context(save_graphs=False)
 
 
+def test_all_to_all_success():
+    """
+    Feature: AlltoAll
+    Description: on 8p, a 4d tensor split at dim 2 and concat at dim 3
+    Expectation: success
+    """
+    context.set_auto_parallel_context(device_num=8, global_rank=0)
+
+    class Net(nn.Cell):
+        def __init__(self):
+            super(Net, self).__init__()
+            self.alltoallv = _AlltoAll(split_count=8, split_dim=2, concat_dim=3)
+
+        def construct(self, x1):
+            out = self.alltoallv(x1)
+            return out
+
+    net = Net()
+    _executor.compile(net, _x1)
+
+
+def test_all_to_all_invalid_split_count_value_failed():
+    """
+    Feature: AlltoAll
+    Description: split_count should be equal to rank size, but not
+    Expectation: throw ValueError
+    """
+    context.set_auto_parallel_context(device_num=8, global_rank=0)
+
+    class Net(nn.Cell):
+        def __init__(self):
+            super(Net, self).__init__()
+            self.alltoallv = _AlltoAll(split_count=7, split_dim=2, concat_dim=3)
+
+        def construct(self, x1):
+            out = self.alltoallv(x1)
+            return out
+
+    with pytest.raises(ValueError):
+        net = Net()
+        _executor.compile(net, _x1)
+
+
+def test_all_to_all_invalid_split_count_type_failed():
+    """
+    Feature: AlltoAll
+    Description: split_count should be int, but a list is given
+    Expectation: throw TypeError
+    """
+    context.set_auto_parallel_context(device_num=8, global_rank=0)
+
+    class Net(nn.Cell):
+        def __init__(self):
+            super(Net, self).__init__()
+            self.alltoallv = _AlltoAll(split_count=[8], split_dim=2, concat_dim=3)
+
+        def construct(self, x1):
+            out = self.alltoallv(x1)
+            return out
+
+    with pytest.raises(TypeError):
+        net = Net()
+        _executor.compile(net, _x1)
+
+
+def test_all_to_all_invalid_split_dim_value_failed():
+    """
+    Feature: AlltoAll
+    Description: split_dim over input shape
+    Expectation: throw IndexError
+    """
+    context.set_auto_parallel_context(device_num=8, global_rank=0)
+
+    class Net(nn.Cell):
+        def __init__(self):
+            super(Net, self).__init__()
+            self.alltoallv = _AlltoAll(split_count=8, split_dim=4, concat_dim=3)
+
+        def construct(self, x1):
+            out = self.alltoallv(x1)
+            return out
+
+    with pytest.raises(IndexError):
+        net = Net()
+        _executor.compile(net, _x1)
+
+
+def test_all_to_all_invalid_split_dim_type_failed():
+    """
+    Feature: AlltoAll
+    Description: split_dim should be int, but a tuple is given
+    Expectation: throw TypeError
+    """
+    context.set_auto_parallel_context(device_num=8, global_rank=0)
+
+    class Net(nn.Cell):
+        def __init__(self):
+            super(Net, self).__init__()
+            self.alltoallv = _AlltoAll(split_count=8, split_dim=(3,), concat_dim=3)
+
+        def construct(self, x1):
+            out = self.alltoallv(x1)
+            return out
+
+    with pytest.raises(TypeError):
+        net = Net()
+        _executor.compile(net, _x1)
+
+
+def test_all_to_all_invalid_concat_dim_value_failed():
+    """
+    Feature: AlltoAll
+    Description: concat_dim over input shape
+    Expectation: throw IndexError
+    """
+    context.set_auto_parallel_context(device_num=8, global_rank=0)
+
+    class Net(nn.Cell):
+        def __init__(self):
+            super(Net, self).__init__()
+            self.alltoallv = _AlltoAll(split_count=8, split_dim=3, concat_dim=4)
+
+        def construct(self, x1):
+            out = self.alltoallv(x1)
+            return out
+
+    with pytest.raises(IndexError):
+        net = Net()
+        _executor.compile(net, _x1)
+
+
+def test_all_to_all_invalid_concat_dim_type_failed():
+    """
+    Feature: AlltoAll
+    Description: concat_dim should be int, but a tuple is given
+    Expectation: throw TypeError
+    """
+    context.set_auto_parallel_context(device_num=8, global_rank=0)
+
+    class Net(nn.Cell):
+        def __init__(self):
+            super(Net, self).__init__()
+            self.alltoallv = _AlltoAll(split_count=8, split_dim=3, concat_dim=([3],))
+
+        def construct(self, x1):
+            out = self.alltoallv(x1)
+            return out
+
+    with pytest.raises(TypeError):
+        net = Net()
+        _executor.compile(net, _x1)
+
+
+def test_all_to_all_invalid_split_count_cannot_be_divisible_failed():
+    """
+    Feature: AlltoAll
+    Description: shape at split_dim should be divisible by split_count, but not
+    Expectation: throw ValueError
+    """
+    context.set_auto_parallel_context(device_num=3, global_rank=0)
+
+    class Net(nn.Cell):
+        def __init__(self):
+            super(Net, self).__init__()
+            self.alltoallv = _AlltoAll(split_count=3, split_dim=3, concat_dim=3)
+
+        def construct(self, x1):
+            out = self.alltoallv(x1)
+            return out
+
+    with pytest.raises(ValueError):
+        net = Net()
+        _executor.compile(net, _x1)
+
+
+def test_all_to_all_invalid_group_type_failed():
+    """
+    Feature: AlltoAll
+    Description: group should be str, but a tuple is given
+    Expectation: throw TypeError
+    """
+    context.set_auto_parallel_context(device_num=8, global_rank=0)
+
+    class Net(nn.Cell):
+        def __init__(self):
+            super(Net, self).__init__()
+            self.alltoallv = _AlltoAll(split_count=8, split_dim=3, concat_dim=3, group=3)
+
+        def construct(self, x1):
+            out = self.alltoallv(x1)
+            return out
+
+    with pytest.raises(TypeError):
+        net = Net()
+        _executor.compile(net, _x1)
+
+
 if __name__ == '__main__':
     test_all_to_all()
diff --git a/tests/ut/python/parallel/test_auto_parallel_reshape.py b/tests/ut/python/parallel/test_auto_parallel_reshape.py
index 479c7274756..8707ca01b30 100644
--- a/tests/ut/python/parallel/test_auto_parallel_reshape.py
+++ b/tests/ut/python/parallel/test_auto_parallel_reshape.py
@@ -323,3 +323,57 @@ def test_reshape_auto_7():
     net.set_auto_parallel()
     net.set_train()
     _executor.compile(net, x)
+
+def test_reshape_depend_reshape():
+    class Net(nn.Cell):
+        def __init__(self):
+            super().__init__()
+            self.reshape1 = P.Reshape()
+            self.reshape2 = P.Reshape()
+            self.relu = P.ReLU()
+            self.depend = P.Depend()
+            self.mul = P.Mul().shard(((2, 4), (2, 4)))
+            self.mul_weight = Parameter(Tensor(np.ones([128, 96]), dtype=ms.float32), name="weight")
+            self.add = P.Add().shard(((4, 2), (4, 2)))
+
+        def construct(self, x, y):
+            out1 = self.mul(x, self.mul_weight)
+            y = self.relu(y)
+            out2 = self.reshape1(y, (96, 32, 4))
+            out3 = self.depend(out2, out1)
+            out3 = self.reshape2(out3, (128, 96))
+            out = out1 + out3
+            return out
+
+    class NetWithLoss1(nn.Cell):
+        def __init__(self, network):
+            super(NetWithLoss1, self).__init__()
+            self.mean = P.ReduceMean(keep_dims=False)
+            self.network = network
+
+        def construct(self, x, y):
+            predict = self.network(x, y)
+            return self.mean(predict, ())
+
+    class GradWrap1(nn.Cell):
+        def __init__(self, network):
+            super(GradWrap1, self).__init__()
+            self.network = network
+
+        def construct(self, x, y):
+            return grad_all(self.network)(x, y)
+
+    size = 8
+    context.set_auto_parallel_context(device_num=size, global_rank=0)
+    x = Tensor(np.ones([128, 96]), dtype=ms.float32)
+    y = Tensor(np.ones([256, 48]), dtype=ms.float32)
+    net = GradWrap1(NetWithLoss1(Net()))
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
+    net.set_auto_parallel()
+    net.set_train()
+    _executor.compile(net, x, y)
+    net_auto = GradWrap1(NetWithLoss1(Net()))
+    context.set_auto_parallel_context(parallel_mode="auto_parallel")
+    net_auto.set_auto_parallel()
+    net_auto.set_train()
+    _executor.compile(net_auto, x, y)
diff --git a/tests/ut/python/parallel/test_conv2d.py b/tests/ut/python/parallel/test_conv2d.py
index 1ef971a0587..08086e030bb 100644
--- a/tests/ut/python/parallel/test_conv2d.py
+++ b/tests/ut/python/parallel/test_conv2d.py
@@ -38,18 +38,20 @@ class Net(Cell):
 
 
 _x = Tensor(np.ones([32, 16, 8, 8]), dtype=ms.float32)
+_x2 = Tensor(np.ones([32, 16, 10, 10]), dtype=ms.float32)
+_w0 = Tensor(np.ones([8, 16, 1, 1]), dtype=ms.float32)
 _w1 = Tensor(np.ones([8, 16, 2, 2]), dtype=ms.float32)
 _w2 = Tensor(np.ones([8, 16, 3, 3]), dtype=ms.float32)
 _w3 = Tensor(np.ones([8, 16, 5, 5]), dtype=ms.float32)
 _b = Tensor(np.ones([32, 16, 8, 8]), dtype=ms.float32)
 
 
-def compile_net(net):
+def compile_net(net, input_x=_x):
     optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
     train_net = TrainOneStepCell(net, optimizer)
     train_net.set_auto_parallel()
     train_net.set_train()
-    _executor.compile(train_net, _x, _b)
+    _executor.compile(train_net, input_x, _b)
     context.reset_auto_parallel_context()
 
 
@@ -85,6 +87,12 @@ def test_conv2d_model_parallel3():
     compile_net(net)
 
 
+def test_conv2d_auto_parallel():
+    context.set_auto_parallel_context(parallel_mode="auto_parallel", device_num=8, global_rank=0)
+    net = Net(_w2, out_channel=8, kernel_size=3, pad_mode="same", stride=1)
+    compile_net(net)
+
+
 def test_conv2d_model_parallel4():
     context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=32, global_rank=0)
     strategy1 = ((2, 2, 1, 4), (2, 2, 1, 1))
@@ -102,6 +110,24 @@ def test_conv2d_left_and_right_no_need_to_send():
         compile_net(net)
 
 
+def test_conv2d_kernel_size_larger_than_stride_and_split_h():
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=32, global_rank=0)
+    strategy1 = ((2, 2, 4, 1), (2, 2, 1, 1))
+    strategy2 = ((2, 2, 4, 1),)
+    net = Net(_w2, out_channel=8, kernel_size=3, pad_mode="same", stride=1, strategy1=strategy1, strategy2=strategy2)
+    with pytest.raises(RuntimeError):
+        compile_net(net)
+
+
+def test_conv2d_valid_mode_kernel_size_larger_than_stride():
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0)
+    strategy1 = ((2, 1, 1, 2), (1, 1, 1, 1))
+    strategy2 = ((2, 1, 1, 4),)
+    net = Net(_w2, out_channel=8, kernel_size=3, pad_mode="valid", stride=1, strategy1=strategy1, strategy2=strategy2)
+    with pytest.raises(RuntimeError):
+        compile_net(net)
+
+
 def test_conv2d_output_can_not_divisible_by_strategy():
     context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0)
     strategy1 = ((1, 1, 1, 8), (1, 1, 1, 1))
@@ -109,3 +135,57 @@ def test_conv2d_output_can_not_divisible_by_strategy():
     net = Net(_w1, out_channel=8, kernel_size=2, pad_mode="same", stride=2, strategy1=strategy1, strategy2=strategy2)
     with pytest.raises(RuntimeError):
         compile_net(net)
+
+
+def test_split_kernel():
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0)
+    strategy1 = ((1, 1, 1, 1), (1, 1, 2, 2))
+    strategy2 = ((1, 1, 1, 8),)
+    net = Net(_w1, out_channel=8, kernel_size=2, pad_mode="same", stride=2, strategy1=strategy1, strategy2=strategy2)
+    with pytest.raises(RuntimeError):
+        compile_net(net)
+
+
+def test_kernel_size_smaller_than_stride_and_slice_can_not_divisible_by_stride_same_mode():
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0)
+    strategy1 = ((1, 1, 1, 2), (1, 1, 1, 1))
+    strategy2 = ((1, 1, 1, 8),)
+    net = Net(_w0, out_channel=8, kernel_size=1, pad_mode="same", stride=3, strategy1=strategy1, strategy2=strategy2)
+    with pytest.raises(RuntimeError):
+        compile_net(net, _x2)
+
+
+def test_kernel_size_smaller_than_stride_and_slice_can_not_divisible_by_stride_valid_mode():
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0)
+    strategy1 = ((1, 1, 1, 2), (1, 1, 1, 1))
+    strategy2 = ((1, 1, 1, 8),)
+    net = Net(_w0, out_channel=8, kernel_size=1, pad_mode="valid", stride=3, strategy1=strategy1, strategy2=strategy2)
+    with pytest.raises(RuntimeError):
+        compile_net(net, _x2)
+
+
+def test_kernel_size_larger_than_stride_and_input_can_not_divisible_by_stride():
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0)
+    strategy1 = ((1, 1, 1, 2), (1, 1, 1, 1))
+    strategy2 = ((1, 1, 1, 8),)
+    net = Net(_w3, out_channel=8, kernel_size=5, pad_mode="same", stride=3, strategy1=strategy1, strategy2=strategy2)
+    with pytest.raises(RuntimeError):
+        compile_net(net, _x2)
+
+
+def test_kernel_size_larger_than_stride_and_slice_too_small():
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0)
+    strategy1 = ((1, 1, 1, 8), (1, 1, 1, 1))
+    strategy2 = ((1, 1, 1, 8),)
+    net = Net(_w3, out_channel=8, kernel_size=5, pad_mode="same", stride=1, strategy1=strategy1, strategy2=strategy2)
+    with pytest.raises(RuntimeError):
+        compile_net(net)
+
+
+def test_kernel_size_larger_than_stride_and_left_pad_is_0():
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0)
+    strategy1 = ((1, 1, 1, 4), (1, 1, 1, 1))
+    strategy2 = ((1, 1, 1, 8),)
+    net = Net(_w1, out_channel=8, kernel_size=2, pad_mode="same", stride=1, strategy1=strategy1, strategy2=strategy2)
+    with pytest.raises(RuntimeError):
+        compile_net(net)
diff --git a/tests/ut/python/parallel/test_conv2d_transpose.py b/tests/ut/python/parallel/test_conv2d_transpose.py
index 46b65a2ea86..9e6316d4ca5 100644
--- a/tests/ut/python/parallel/test_conv2d_transpose.py
+++ b/tests/ut/python/parallel/test_conv2d_transpose.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import numpy as np
+import pytest
 
 import mindspore as ms
 from mindspore import context, Tensor, Parameter
@@ -54,6 +55,8 @@ class Net2(Cell):
 _x = Tensor(np.ones([32, 8, 8, 8]), dtype=ms.float32)
 _w1 = Tensor(np.ones([8, 16, 2, 2]), dtype=ms.float32)
 _w2 = Tensor(np.ones([8, 16, 4, 4]), dtype=ms.float32)
+_w3 = Tensor(np.ones([8, 16, 10, 10]), dtype=ms.float32)
+_w4 = Tensor(np.ones([8, 16, 3, 3]), dtype=ms.float32)
 _b = Tensor(np.ones([32, 16, 8, 8]), dtype=ms.float32)
 
 
@@ -98,3 +101,33 @@ def test_conv2d_transpose_model_parallel3():
     net = Net2(_w2, out_channel=8, kernel_size=(4, 4), pad_mode="same", stride=2,
                strategy1=strategy1, strategy2=strategy2)
     compile_net(net)
+
+
+def test_conv2d_transpose_all_rank_no_need_overlap():
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=16, global_rank=0)
+    strategy1 = ((2, 2, 1, 4), (2, 1, 1, 1))
+    strategy2 = ((2, 2, 1, 4),)
+    net = Net2(_w1, out_channel=8, kernel_size=(2, 2), pad_mode="same", stride=2,
+               strategy1=strategy1, strategy2=strategy2)
+    compile_net(net)
+
+
+def test_conv2d_transpose_overlap_size_too_large():
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0)
+    strategy1 = ((1, 1, 1, 8), (1, 1, 1, 1))
+    strategy2 = ((1, 1, 1, 8),)
+    net = Net2(_w3, out_channel=8, kernel_size=(10, 10), pad_mode="same", stride=2,
+               strategy1=strategy1, strategy2=strategy2)
+    with pytest.raises(RuntimeError):
+        compile_net(net)
+
+
+def test_conv2d_transpose_rank0_no_need_overlap():
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=16, global_rank=0)
+    strategy1 = ((2, 2, 1, 4), (2, 1, 1, 1))
+    strategy2 = ((2, 2, 1, 4),)
+    net = Net2(_w4, out_channel=8, kernel_size=(3, 3), pad_mode="same", stride=2,
+               strategy1=strategy1, strategy2=strategy2)
+    with pytest.raises(RuntimeError):
+        compile_net(net)
+    
\ No newline at end of file
diff --git a/tests/ut/python/parallel/test_full_batch.py b/tests/ut/python/parallel/test_full_batch.py
index dc82cb04a25..6b5e3c65987 100644
--- a/tests/ut/python/parallel/test_full_batch.py
+++ b/tests/ut/python/parallel/test_full_batch.py
@@ -71,7 +71,8 @@ def all_to_all_common(strategy1):
 
     context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
     context.reset_auto_parallel_context()
-    context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=8, full_batch=True)
+    context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=8,
+                                      dataset_strategy="full_batch")
     predict = Tensor(np.ones([256, 128]), dtype=ms.float32)
     label = Tensor(np.ones([256]), dtype=ms.int32)
     dataset = Dataset(predict, label, 2)
diff --git a/tests/ut/python/parallel/test_maxpool_avgpool.py b/tests/ut/python/parallel/test_maxpool_avgpool.py
index 637161eedb4..9604282d4a2 100644
--- a/tests/ut/python/parallel/test_maxpool_avgpool.py
+++ b/tests/ut/python/parallel/test_maxpool_avgpool.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import numpy as np
+import pytest
 
 import mindspore as ms
 from mindspore import context, Tensor, Parameter
@@ -98,6 +99,16 @@ def test_maxpool_auto_parallel():
     compile_net(net)
 
 
+def test_maxpool_output_can_not_divisible_by_strategy():
+    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0)
+    strategy1 = ((8, 1, 1, 1), (1, 1, 1, 1))
+    strategy2 = ((1, 1, 1, 8),)
+    net = Net(_w1, out_channel=8, kernel_size=2, pad_mode="same", stride=1, pool_kernel_size=2, pool_strides=2,
+              strategy1=strategy1, strategy2=strategy2)
+    with pytest.raises(RuntimeError):
+        compile_net(net)
+
+
 def test_avgpool_data_parallel():
     context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0)
     strategy1 = ((8, 1, 1, 1), (1, 1, 1, 1))
diff --git a/tests/ut/python/parallel/test_neighborexchange.py b/tests/ut/python/parallel/test_neighborexchange.py
index 787dd86704a..f1d0003f51e 100644
--- a/tests/ut/python/parallel/test_neighborexchange.py
+++ b/tests/ut/python/parallel/test_neighborexchange.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
+import pytest
 import numpy as np
 import mindspore as ms
 import mindspore.context as context
@@ -22,39 +23,6 @@ from mindspore.nn import TrainOneStepCell, Momentum
 from mindspore.ops import operations as P
 from mindspore.ops.operations._inner_ops import NeighborExchange
 
-
-class MatMulNet(nn.Cell):
-    def __init__(self, weight1):
-        super(MatMulNet, self).__init__()
-        self.matmul = P.MatMul()
-        self.mul = P.Mul()
-        self.alltoallv = NeighborExchange(send_rank_ids=[0], recv_rank_ids=[1, 2], recv_shapes=([32, 32], [32, 64]),
-                                          send_shapes=([32, 32], [32, 16]), recv_type=ms.float32)
-        self.weight1 = Parameter(weight1, "w1")
-
-    def construct(self, x1, x2):
-        out = self.matmul(x1, x2)
-        out = self.mul(out, self.weight1)
-        out = self.alltoallv((out, x1))
-        return out[0]
-
-
-class MatMulNet2(nn.Cell):
-    def __init__(self, weight1):
-        super(MatMulNet2, self).__init__()
-        self.matmul = P.MatMul()
-        self.mul = P.Mul()
-        self.alltoallv = NeighborExchange(send_rank_ids=[0], recv_rank_ids=[1, 2], recv_shapes=([32, 32], [32, 64]),
-                                          send_shapes=([32, 32],), recv_type=ms.float32)
-        self.weight1 = Parameter(weight1, "w1")
-
-    def construct(self, x1, x2):
-        out = self.matmul(x1, x2)
-        out = self.mul(out, self.weight1)
-        out = self.alltoallv((out,))
-        return out[0]
-
-
 _w1 = Tensor(np.ones([32, 32]), dtype=ms.float32)
 _x1 = Tensor(np.ones([32, 16]), dtype=ms.float32)
 _x2 = Tensor(np.ones([16, 32]), dtype=ms.float32)
@@ -68,13 +36,361 @@ def compile_net(net):
     _executor.compile(train_net, _x1, _x2)
 
 
-def test_NeighborExchange_two_inputs():
+def test_NeighborExchange_two_inputs_success():
+    """
+    Feature: NeighborExchange
+    Description: two inputs and two outputs, with valid arguments
+    Expectation: success
+    """
     context.set_auto_parallel_context(device_num=8, global_rank=0)
+
+    class MatMulNet(nn.Cell):
+        def __init__(self, weight1):
+            super(MatMulNet, self).__init__()
+            self.matmul = P.MatMul()
+            self.mul = P.Mul()
+            self.alltoallv = NeighborExchange(send_rank_ids=[0, 1], recv_rank_ids=[1, 2],
+                                              recv_shapes=([32, 32], [32, 64]),
+                                              send_shapes=([32, 32], [32, 16]), recv_type=ms.float32)
+            self.weight1 = Parameter(weight1, "w1")
+
+        def construct(self, x1, x2):
+            out = self.matmul(x1, x2)
+            out = self.mul(out, self.weight1)
+            out = self.alltoallv((out, x1))
+            return out[0]
+
     net = MatMulNet(_w1)
     compile_net(net)
 
 
-def test_NeighborExchange_single_input():
+def test_NeighborExchange_single_input_success():
+    """
+    Feature: NeighborExchange
+    Description: one inputs and two outputs, with valid arguments
+    Expectation: success
+    """
     context.set_auto_parallel_context(device_num=8, global_rank=0)
+
+    class MatMulNet2(nn.Cell):
+        def __init__(self, weight1):
+            super(MatMulNet2, self).__init__()
+            self.matmul = P.MatMul()
+            self.mul = P.Mul()
+            self.alltoallv = NeighborExchange(send_rank_ids=[0], recv_rank_ids=[1, 2], recv_shapes=([32, 32], [32, 64]),
+                                              send_shapes=([32, 32],), recv_type=ms.float32)
+            self.weight1 = Parameter(weight1, "w1")
+
+        def construct(self, x1, x2):
+            out = self.matmul(x1, x2)
+            out = self.mul(out, self.weight1)
+            out = self.alltoallv((out,))
+            return out[0]
+
     net = MatMulNet2(_w1)
     compile_net(net)
+
+
+def test_NeighborExchage_empty_send_empty_recv_success():
+    """
+    Feature: NeighborExchange
+    Description: empty inputs and empty outputs, with valid arguments
+    Expectation: success
+    """
+    context.set_auto_parallel_context(device_num=8, global_rank=0)
+
+    class Net(nn.Cell):
+        def __init__(self):
+            super(Net, self).__init__()
+            self.alltoallv = NeighborExchange(send_rank_ids=[], recv_rank_ids=[],
+                                              recv_shapes=(),
+                                              send_shapes=(), recv_type=ms.float32, group=("str",))
+
+        def construct(self, x1):
+            self.alltoallv()
+            return x1
+
+    net = Net()
+    with pytest.raises(TypeError):
+        _executor.compile(net, _x1)
+
+
+def test_NeighborExchage_recv_shape_num_diff_with_recv_rank_size_failed():
+    """
+    Feature: NeighborExchange
+    Description: send_rank_ids and send_shapes are set as 1 input, but gives 2
+    Expectation: throw ValueError
+    """
+    context.set_auto_parallel_context(device_num=8, global_rank=0)
+
+    class Net(nn.Cell):
+        def __init__(self, weight1):
+            super(Net, self).__init__()
+            self.matmul = P.MatMul()
+            self.mul = P.Mul()
+            self.alltoallv = NeighborExchange(send_rank_ids=[0], recv_rank_ids=[1, 2], recv_shapes=([32, 32],),
+                                              send_shapes=([32, 32],), recv_type=ms.float32)
+            self.weight1 = Parameter(weight1, "w1")
+
+        def construct(self, x1, x2):
+            out = self.matmul(x1, x2)
+            out = self.mul(out, self.weight1)
+            out = self.alltoallv((out,))
+            return out[0]
+
+    net = Net(_w1)
+    with pytest.raises(ValueError):
+        compile_net(net)
+
+
+def test_NeighborExchage_send_shape_num_diff_with_send_rank_size_failed():
+    """
+    Feature: NeighborExchange
+    Description: send_rank_ids is set as 2 inputs, but send_shapes are set as 1 input
+    Expectation: throw ValueError
+    """
+    context.set_auto_parallel_context(device_num=8, global_rank=0)
+
+    class Net(nn.Cell):
+        def __init__(self, weight1):
+            super(Net, self).__init__()
+            self.matmul = P.MatMul()
+            self.mul = P.Mul()
+            self.alltoallv = NeighborExchange(send_rank_ids=[0, 1], recv_rank_ids=[1, 2],
+                                              recv_shapes=([32, 32], [32, 32]),
+                                              send_shapes=([32, 32],), recv_type=ms.float32)
+            self.weight1 = Parameter(weight1, "w1")
+
+        def construct(self, x1, x2):
+            out = self.matmul(x1, x2)
+            out = self.mul(out, self.weight1)
+            out = self.alltoallv((out,))
+            return out[0]
+
+    net = Net(_w1)
+    with pytest.raises(ValueError):
+        compile_net(net)
+
+
+def test_NeighborExchage_send_shape_num_diff_with_input_num_failed():
+    """
+    Feature: NeighborExchange
+    Description: send_rank_ids and send_shapes are set as 2 inputs, but has only 1 input
+    Expectation: throw Exception
+    """
+    context.set_auto_parallel_context(device_num=8, global_rank=0)
+
+    class Net(nn.Cell):
+        def __init__(self, weight1):
+            super(Net, self).__init__()
+            self.matmul = P.MatMul()
+            self.mul = P.Mul()
+            self.alltoallv = NeighborExchange(send_rank_ids=[0, 1], recv_rank_ids=[1, 2],
+                                              recv_shapes=([32, 32], [32, 32]),
+                                              send_shapes=([32, 32], [32, 32]), recv_type=ms.float32)
+            self.weight1 = Parameter(weight1, "w1")
+
+        def construct(self, x1, x2):
+            out = self.matmul(x1, x2)
+            out = self.mul(out, self.weight1)
+            out = self.alltoallv((out,))
+            return out[0]
+
+    net = Net(_w1)
+    with pytest.raises(Exception):
+        compile_net(net)
+
+
+def test_NeighborExchage_send_shape_diff_with_input_shape_failed():
+    """
+    Feature: NeighborExchange
+    Description: send_shapes is set as [16, 16], but input is [32, 32]
+    Expectation: throw Exception
+    """
+    context.set_auto_parallel_context(device_num=8, global_rank=0)
+
+    class Net(nn.Cell):
+        def __init__(self, weight1):
+            super(Net, self).__init__()
+            self.matmul = P.MatMul()
+            self.mul = P.Mul()
+            self.alltoallv = NeighborExchange(send_rank_ids=[0], recv_rank_ids=[1, 2], recv_shapes=([32, 32], [32, 64]),
+                                              send_shapes=([16, 16],), recv_type=ms.float32)
+            self.weight1 = Parameter(weight1, "w1")
+
+        def construct(self, x1, x2):
+            out = self.matmul(x1, x2)
+            out = self.mul(out, self.weight1)
+            out = self.alltoallv((out,))
+            return out[0]
+
+    net = Net(_w1)
+    with pytest.raises(Exception):
+        compile_net(net)
+
+
+def test_NeighborExchage_attr_check_send_rank_ids_is_tuple_failed():
+    """
+    Feature: NeighborExchange
+    Description: send_rank_ids should be list, but a tuple is given
+    Expectation: throw TypeError
+    """
+    context.set_auto_parallel_context(device_num=8, global_rank=0)
+
+    class Net(nn.Cell):
+        def __init__(self):
+            super(Net, self).__init__()
+            self.alltoallv = NeighborExchange(send_rank_ids=(0), recv_rank_ids=[1, 2], recv_shapes=([32, 32], [32, 64]),
+                                              send_shapes=([32, 16],), recv_type=ms.float32)
+
+        def construct(self, x1):
+            out = self.alltoallv((x1,))
+            return out[0]
+
+    net = Net()
+    with pytest.raises(TypeError):
+        _executor.compile(net, _x1)
+
+
+def test_NeighborExchage_attr_check_send_rank_ids_is_float_failed():
+    """
+    Feature: NeighborExchange
+    Description: send_rank_ids should be int, but a float is given
+    Expectation: throw TypeError
+    """
+    context.set_auto_parallel_context(device_num=8, global_rank=0)
+
+    class Net(nn.Cell):
+        def __init__(self):
+            super(Net, self).__init__()
+            self.alltoallv = NeighborExchange(send_rank_ids=[1.0], recv_rank_ids=[1, 2],
+                                              recv_shapes=([32, 32], [32, 64]),
+                                              send_shapes=([32, 16],), recv_type=ms.float32)
+
+        def construct(self, x1):
+            out = self.alltoallv((x1,))
+            return out[0]
+
+    net = Net()
+    with pytest.raises(TypeError):
+        _executor.compile(net, _x1)
+
+
+def test_NeighborExchage_attr_check_recv_rank_ids_is_tuple_failed():
+    """
+    Feature: NeighborExchange
+    Description: recv_rank_ids should be list, but a tuple is given
+    Expectation: throw TypeError
+    """
+    context.set_auto_parallel_context(device_num=8, global_rank=0)
+
+    class Net(nn.Cell):
+        def __init__(self):
+            super(Net, self).__init__()
+            self.alltoallv = NeighborExchange(send_rank_ids=[0], recv_rank_ids=([1, 2],),
+                                              recv_shapes=([32, 32], [32, 64]),
+                                              send_shapes=([32, 16],), recv_type=ms.float32)
+
+        def construct(self, x1):
+            out = self.alltoallv((x1,))
+            return out[0]
+
+    net = Net()
+    with pytest.raises(TypeError):
+        _executor.compile(net, _x1)
+
+
+def test_NeighborExchage_attr_check_recv_rank_ids_is_float_failed():
+    """
+    Feature: NeighborExchange
+    Description: recv_rank_ids should be int, but a float is given
+    Expectation: throw TypeError
+    """
+    context.set_auto_parallel_context(device_num=8, global_rank=0)
+
+    class Net(nn.Cell):
+        def __init__(self):
+            super(Net, self).__init__()
+            self.alltoallv = NeighborExchange(send_rank_ids=[1], recv_rank_ids=[1, 2.0],
+                                              recv_shapes=([32, 32], [32, 64]),
+                                              send_shapes=([32, 16],), recv_type=ms.float32)
+
+        def construct(self, x1):
+            out = self.alltoallv((x1,))
+            return out[0]
+
+    net = Net()
+    with pytest.raises(TypeError):
+        _executor.compile(net, _x1)
+
+
+def test_NeighborExchage_attr_check_send_shape_not_tuple_failed():
+    """
+    Feature: NeighborExchange
+    Description: send_shapes should be tuple(list), but a list is given
+    Expectation: throw TypeError
+    """
+    context.set_auto_parallel_context(device_num=8, global_rank=0)
+
+    class Net(nn.Cell):
+        def __init__(self):
+            super(Net, self).__init__()
+            self.alltoallv = NeighborExchange(send_rank_ids=[1], recv_rank_ids=[1, 2],
+                                              recv_shapes=([32, 32], [32, 64]),
+                                              send_shapes=([32, 16]), recv_type=ms.float32)
+
+        def construct(self, x1):
+            out = self.alltoallv((x1,))
+            return out[0]
+
+    net = Net()
+    with pytest.raises(TypeError):
+        _executor.compile(net, _x1)
+
+
+def test_NeighborExchage_attr_check_recv_type_numpy_failed():
+    """
+    Feature: NeighborExchange
+    Description: recv_type should be mindspore type, but a numpy type is given
+    Expectation: throw TypeError
+    """
+    context.set_auto_parallel_context(device_num=8, global_rank=0)
+
+    class Net(nn.Cell):
+        def __init__(self):
+            super(Net, self).__init__()
+            self.alltoallv = NeighborExchange(send_rank_ids=[1], recv_rank_ids=[1, 2],
+                                              recv_shapes=([32, 32], [32, 64]),
+                                              send_shapes=([32, 16],), recv_type=np.float32)
+
+        def construct(self, x1):
+            out = self.alltoallv((x1,))
+            return out[0]
+
+    net = Net()
+    with pytest.raises(TypeError):
+        _executor.compile(net, _x1)
+
+
+def test_NeighborExchage_attr_invalid_grpup_failed():
+    """
+    Feature: NeighborExchange
+    Description: group should be str, but a tuple is given
+    Expectation: throw TypeError
+    """
+    context.set_auto_parallel_context(device_num=8, global_rank=0)
+
+    class Net(nn.Cell):
+        def __init__(self):
+            super(Net, self).__init__()
+            self.alltoallv = NeighborExchange(send_rank_ids=[1], recv_rank_ids=[1, 2],
+                                              recv_shapes=([32, 32], [32, 64]),
+                                              send_shapes=([32, 16],), recv_type=ms.float32, group=("str",))
+
+        def construct(self, x1):
+            out = self.alltoallv((x1,))
+            return out[0]
+
+    net = Net()
+    with pytest.raises(TypeError):
+        _executor.compile(net, _x1)
diff --git a/tests/ut/python/parallel/test_parallel_transformer.py b/tests/ut/python/parallel/test_parallel_transformer.py
index 5192ed9bb37..bc3c97ef509 100644
--- a/tests/ut/python/parallel/test_parallel_transformer.py
+++ b/tests/ut/python/parallel/test_parallel_transformer.py
@@ -13,14 +13,21 @@
 # limitations under the License.
 
 import numpy as np
-
+import pytest
 import mindspore.common.dtype as mstype
 import mindspore.nn as nn
 from mindspore import Tensor
 from mindspore.context import set_auto_parallel_context, ParallelMode
 from mindspore.ops import composite as C
-from mindspore.nn.parallel import TransformerEncoder, TransformerDecoder, Transformer, TransformerParallelConfig,\
-    VocabEmbedding
+from mindspore.ops import functional as F
+import mindspore.ops as P
+from mindspore.parallel.nn import TransformerEncoder, TransformerDecoder, Transformer, TransformerOpParallelConfig, \
+    VocabEmbedding, CrossEntropyLoss, OpParallelConfig, EmbeddingOpParallelConfig
+from mindspore.nn import Dense as Linear
+from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell
+from mindspore.nn.optim import AdamWeightDecay
+from mindspore.nn.wrap.cell_wrapper import PipelineCell, _VirtualDatasetCell, TrainOneStepCell
+from mindspore.nn.wrap.loss_scale import _TrainPipelineWithLossScaleCell
 from mindspore.train import Model
 from tests.dataset_mock import MindData
 from tests.ut.python.ops.test_math_ops import VirtualLoss
@@ -48,39 +55,159 @@ class Dataset(MindData):
         self.index = 0
 
 
-def test_transformer_model():
-    class NetWithLoss(nn.Cell):
-        def __init__(self, network):
-            super(NetWithLoss, self).__init__()
-            self.loss = VirtualLoss()
-            self.network = network
+config = TransformerOpParallelConfig(data_parallel=1, model_parallel=8, vocab_emb_dp=False)
+pipeline_config = TransformerOpParallelConfig(data_parallel=1, model_parallel=8, pipeline_stage=4,
+                                              micro_batch_num=4, vocab_emb_dp=False)
 
-        def construct(self, x1, x2, x3, x4, x5):
+
+class NetWithLossFiveInputs(nn.Cell):
+    def __init__(self, network):
+        super(NetWithLossFiveInputs, self).__init__()
+        self.loss = VirtualLoss()
+        self.network = network
+
+    def construct(self, x1, x2, x3, x4, x5):
+        predict, _, _ = self.network(x1, x2, x3, x4, x5)
+        return self.loss(predict)
+
+
+def run_total_transformer_model_head(e_layer,
+                                     d_layer,
+                                     arg_parallel_config):
+    dp = arg_parallel_config.data_parallel
+    mp = arg_parallel_config.model_parallel
+    pp = arg_parallel_config.pipeline_stage
+    if dp * mp * pp != 1:
+        set_auto_parallel_context(device_num=8,
+                                  full_batch=True,
+                                  global_rank=0, parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL)
+
+    class Net(nn.Cell):
+        def __init__(self, en_layer, de_layer, parallel_config):
+            super(Net, self).__init__()
+            self.embedding = VocabEmbedding(vocab_size=240, embedding_size=20,
+                                            parallel_config=config.embedding_dp_mp_config)
+            self.network = Transformer(encoder_layers=en_layer,
+                                       decoder_layers=de_layer,
+                                       batch_size=2,
+                                       src_seq_length=20,
+                                       tgt_seq_length=10,
+                                       hidden_size=64,
+                                       num_heads=8,
+                                       ffn_hidden_size=64,
+                                       parallel_config=parallel_config)
+            self.head = Linear(in_channels=64, out_channels=200)
+            self.loss = CrossEntropyLoss(parallel_config=config.dp_mp_config)
+
+        def construct(self, x1, x2, x3, x4, x5, y, mask):
             predict, _, _ = self.network(x1, x2, x3, x4, x5)
-            return self.loss(predict)
+            predict = P.Reshape()(predict, (-1, F.shape(predict)[-1]))
+            return self.loss(predict, y, mask)
 
-    config = TransformerParallelConfig(dp=1, mp=8)
-    set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL)
+    encoder_input_value = Tensor(np.ones((2, 20, 64)), mstype.float32)
+    encoder_input_mask = Tensor(np.ones((2, 20, 20)), mstype.float16)
+    decoder_input_value = Tensor(np.ones((2, 10, 64)), mstype.float32)
+    decoder_input_mask = Tensor(np.ones((2, 10, 10)), mstype.float16)
+    memory_mask = Tensor(np.ones((2, 10, 20)), mstype.float16)
+    seq = 20
+    if d_layer > 0:
+        seq = 10
+    label = Tensor(np.ones((2 * seq,)), mstype.int32)
+    input_mask = Tensor(np.ones((2 * seq,)), mstype.float32)
+    net = Net(en_layer=e_layer, de_layer=d_layer, parallel_config=arg_parallel_config)
+    params = net.trainable_params()
+    optimizer = AdamWeightDecay(params)
+    dataset = Dataset(encoder_input_value, encoder_input_mask, decoder_input_value, decoder_input_mask,
+                      memory_mask, label, input_mask)
+    net_with_grad = TrainOneStepCell(net, optimizer=optimizer)
+    model = Model(net_with_grad)
+
+    model.train(1, dataset, dataset_sink_mode=False)
+
+
+def test_transformer_model():
+    set_auto_parallel_context(device_num=8, global_rank=0,
+                              full_batch=True,
+                              parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL)
     net = Transformer(encoder_layers=1,
+                      decoder_layers=2,
+                      batch_size=2,
+                      src_seq_length=20,
+                      tgt_seq_length=10,
+                      hidden_size=64,
+                      num_heads=8,
+                      ffn_hidden_size=64,
+                      parallel_config=config)
+
+    encoder_input_value = Tensor(np.ones((2, 20, 64)), mstype.float32)
+    encoder_input_mask = Tensor(np.ones((2, 20, 20)), mstype.float16)
+    decoder_input_value = Tensor(np.ones((2, 10, 64)), mstype.float32)
+    decoder_input_mask = Tensor(np.ones((2, 10, 10)), mstype.float16)
+    memory_mask = Tensor(np.ones((2, 10, 20)), mstype.float16)
+    net = NetWithLossFiveInputs(net)
+    params = net.trainable_params()
+    optimizer = AdamWeightDecay(params)
+    dataset = Dataset(encoder_input_value, encoder_input_mask, decoder_input_value, decoder_input_mask,
+                      memory_mask)
+    net_with_grad = TrainOneStepCell(net, optimizer=optimizer)
+    model = Model(net_with_grad)
+
+    model.train(1, dataset, dataset_sink_mode=False)
+
+
+def test_transformer_model_head_parallel_only_encoder():
+    local_config = TransformerOpParallelConfig(data_parallel=1, model_parallel=8)
+    run_total_transformer_model_head(e_layer=2, d_layer=0, arg_parallel_config=local_config)
+
+
+def test_transformer_model_head_parallel():
+    local_config = TransformerOpParallelConfig(data_parallel=1, model_parallel=8)
+    run_total_transformer_model_head(e_layer=1, d_layer=1, arg_parallel_config=local_config)
+
+
+def test_transformer_model_head_parallel_decoder():
+    local_config = TransformerOpParallelConfig(data_parallel=1, model_parallel=8)
+    with pytest.raises(ValueError):
+        run_total_transformer_model_head(e_layer=0, d_layer=1, arg_parallel_config=local_config)
+
+
+def test_transformer_model_head_stand_alone():
+    local_config = TransformerOpParallelConfig(data_parallel=1, model_parallel=1)
+    run_total_transformer_model_head(e_layer=2, d_layer=2, arg_parallel_config=local_config)
+
+
+def test_pipeline_single_transformer():
+    set_auto_parallel_context(device_num=32,
+                              full_batch=True,
+                              pipeline_stages=pipeline_config.pipeline_stage, global_rank=0,
+                              parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL)
+
+    net = Transformer(batch_size=4 // pipeline_config.micro_batch_num,
+                      src_seq_length=20,
+                      tgt_seq_length=10,
+                      encoder_layers=2,
                       decoder_layers=2,
                       hidden_size=64,
                       num_heads=8,
                       ffn_hidden_size=64,
-                      src_seq_length=20,
-                      tgt_seq_length=20,
-                      parallel_config=config)
-
-    encoder_input_value = Tensor(np.ones((2, 20, 64)), mstype.float32)
-    encoder_input_mask = Tensor(np.ones((2, 1, 20, 20)), mstype.float16)
-    decoder_input_value = Tensor(np.ones((2, 10, 64)), mstype.float32)
-    decoder_input_mask = Tensor(np.ones((2, 1, 10, 10)), mstype.float16)
-    memory_mask = Tensor(np.ones((2, 1, 10, 20)), mstype.float16)
-    net = NetWithLoss(net)
+                      parallel_config=pipeline_config)
 
+    encoder_input_value = Tensor(np.ones((4, 20, 64)), mstype.float32)
+    encoder_input_mask = Tensor(np.ones((4, 20, 20)), mstype.float16)
+    decoder_input_value = Tensor(np.ones((4, 10, 64)), mstype.float32)
+    decoder_input_mask = Tensor(np.ones((4, 10, 10)), mstype.float16)
+    memory_mask = Tensor(np.ones((4, 10, 20)), mstype.float16)
+    net = NetWithLossFiveInputs(net)
+    net = PipelineCell(net, pipeline_config.micro_batch_num)
+    net = _VirtualDatasetCell(net)
+    params = net.infer_param_pipeline_stage()
+    optimizer = AdamWeightDecay(params)
     dataset = Dataset(encoder_input_value, encoder_input_mask, decoder_input_value, decoder_input_mask,
                       memory_mask)
-
-    model = Model(net)
+    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=1024, scale_factor=2, scale_window=1000)
+    net_with_grad = _TrainPipelineWithLossScaleCell(net, optimizer=optimizer,
+                                                    scale_sense=update_cell)
+    model = Model(net_with_grad)
 
     model.train(1, dataset, dataset_sink_mode=False)
 
@@ -96,17 +223,19 @@ def test_encoder():
             predict, _ = self.network(x1, x2)
             return self.loss(predict)
 
-    config = TransformerParallelConfig(dp=1, mp=8)
-    set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL)
+    set_auto_parallel_context(device_num=8,
+                              full_batch=True,
+                              global_rank=0, parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL)
     net = TransformerEncoder(num_layers=2,
+                             batch_size=2,
+                             seq_length=16,
                              hidden_size=8,
                              ffn_hidden_size=64,
-                             seq_length=16,
                              num_heads=8,
                              parallel_config=config)
 
     encoder_input_value = Tensor(np.ones((2, 16, 8)), mstype.float32)
-    encoder_input_mask = Tensor(np.ones((2, 1, 16, 16)), mstype.float16)
+    encoder_input_mask = Tensor(np.ones((2, 16, 16)), mstype.float16)
 
     net = NetWithLoss(net)
 
@@ -128,19 +257,22 @@ def test_decoder():
             predict, _, _ = self.network(x1, x2, x3, x4)
             return self.loss(predict)
 
-    config = TransformerParallelConfig(dp=1, mp=8)
-    set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL)
+    set_auto_parallel_context(device_num=8,
+                              full_batch=True,
+                              global_rank=0, parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL)
     net = TransformerDecoder(num_layers=1,
+                             batch_size=8,
                              hidden_size=16,
                              ffn_hidden_size=8,
                              num_heads=8,
-                             seq_length=10,
+                             src_seq_length=20,
+                             tgt_seq_length=10,
                              parallel_config=config)
 
-    encoder_input_value = Tensor(np.ones((2, 20, 16)), mstype.float32)
-    decoder_input_value = Tensor(np.ones((2, 10, 16)), mstype.float32)
-    decoder_input_mask = Tensor(np.ones((2, 1, 10, 10)), mstype.float16)
-    memory_mask = Tensor(np.ones((2, 1, 10, 20)), mstype.float16)
+    encoder_input_value = Tensor(np.ones((8, 20, 16)), mstype.float32)
+    decoder_input_value = Tensor(np.ones((8, 10, 16)), mstype.float32)
+    decoder_input_mask = Tensor(np.ones((8, 10, 10)), mstype.float16)
+    memory_mask = Tensor(np.ones((8, 10, 20)), mstype.float16)
 
     net = NetWithLoss(net)
 
@@ -151,7 +283,6 @@ def test_decoder():
 
 
 def test_vocabembedding_dp_true():
-    config = TransformerParallelConfig(dp=1, mp=8)
     set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL)
 
     class NetWithLoss(nn.Cell):
@@ -164,15 +295,7 @@ def test_vocabembedding_dp_true():
             predict, _ = self.network(x1)
             return self.loss(predict)
 
-    class GradWrap(nn.Cell):
-        def __init__(self, network):
-            super(GradWrap, self).__init__()
-            self.network = network
-
-        def construct(self, x1):
-            return grad_all(self.network)(x1)
-
-    net = VocabEmbedding(vocab_size=100, embedding_size=16, parallel_config=config)
+    net = VocabEmbedding(vocab_size=160, embedding_size=16, parallel_config=config.embedding_dp_mp_config)
     net = NetWithLoss(net)
     encoder_input_value = Tensor(np.ones((2, 64)), mstype.int32)
     dataset = Dataset(encoder_input_value)
@@ -182,7 +305,6 @@ def test_vocabembedding_dp_true():
 
 
 def test_vocabembedding_dp_false():
-    config = TransformerParallelConfig(dp=1, mp=8, vocab_emb_dp=False)
     set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL)
 
     class NetWithLoss(nn.Cell):
@@ -195,18 +317,109 @@ def test_vocabembedding_dp_false():
             predict, _ = self.network(x1)
             return self.loss(predict)
 
-    class GradWrap(nn.Cell):
-        def __init__(self, network):
-            super(GradWrap, self).__init__()
-            self.network = network
-
-        def construct(self, x1):
-            return grad_all(self.network)(x1)
-
-    net = VocabEmbedding(vocab_size=160, embedding_size=16, parallel_config=config)
+    net = VocabEmbedding(vocab_size=160, embedding_size=16, parallel_config=config.embedding_dp_mp_config)
     net = NetWithLoss(net)
     encoder_input_value = Tensor(np.ones((2, 64)), mstype.int32)
     dataset = Dataset(encoder_input_value)
 
     model = Model(net)
     model.train(1, dataset, dataset_sink_mode=False)
+
+
+def test_parallel_cross_entroy_loss_semi_auto_parallel():
+    set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL)
+
+    class NetWithLoss(nn.Cell):
+        def __init__(self, network, config_setting):
+            super(NetWithLoss, self).__init__()
+            self.loss = CrossEntropyLoss(config_setting)
+            self.network = network
+
+        def construct(self, x1, x2, x3):
+            predict, _ = self.network(x1)
+            predict = P.Reshape()(predict, (-1, 16))
+            return self.loss(predict, x2, x3)
+
+    net = VocabEmbedding(vocab_size=160, embedding_size=16, parallel_config=config.embedding_dp_mp_config)
+    net = NetWithLoss(net, config.dp_mp_config)
+    embed_ids = Tensor(np.ones((2, 64)), mstype.int32)
+    labels = Tensor(np.ones((2 * 64,)), mstype.int32)
+    input_mask = Tensor(np.ones((2 * 64,)), mstype.float32)
+    dataset = Dataset(embed_ids, labels, input_mask)
+
+    model = Model(net)
+    model.train(1, dataset, dataset_sink_mode=False)
+
+
+def test_transformer_parallel_config():
+    parallel_test_config = TransformerOpParallelConfig(data_parallel=1, model_parallel=3)
+
+    with pytest.raises(TypeError):
+        parallel_test_config.data_parallel = False
+
+    with pytest.raises(ValueError):
+        parallel_test_config.data_parallel = 0
+
+    with pytest.raises(TypeError):
+        parallel_test_config.model_parallel = False
+
+    with pytest.raises(ValueError):
+        parallel_test_config.model_parallel = 0
+
+    with pytest.raises(TypeError):
+        parallel_test_config.pipeline_stage = False
+
+    with pytest.raises(ValueError):
+        parallel_test_config.pipeline_stage = 0
+
+    with pytest.raises(TypeError):
+        parallel_test_config.micro_batch_num = False
+
+    with pytest.raises(ValueError):
+        parallel_test_config.micro_batch_num = 0
+
+    with pytest.raises(TypeError):
+        parallel_test_config.gradient_aggregation_group = False
+
+    with pytest.raises(ValueError):
+        parallel_test_config.gradient_aggregation_group = 0
+
+    with pytest.raises(TypeError):
+        parallel_test_config.recompute = 1
+
+    parallel_test_config.recompute = False
+
+    assert not parallel_test_config.recompute
+
+
+def test_parallel_config():
+    parallel_test_config = OpParallelConfig(data_parallel=1, model_parallel=3)
+
+    with pytest.raises(ValueError):
+        parallel_test_config.data_parallel = 0
+
+    with pytest.raises(TypeError):
+        parallel_test_config.model_parallel = False
+
+    with pytest.raises(ValueError):
+        parallel_test_config.model_parallel = 0
+
+    assert parallel_test_config.model_parallel == 3
+
+
+def test_embedding_parallel_config():
+    parallel_test_config = EmbeddingOpParallelConfig(data_parallel=1, model_parallel=3, vocab_emb_dp=False)
+
+    with pytest.raises(ValueError):
+        parallel_test_config.data_parallel = 0
+
+    with pytest.raises(TypeError):
+        parallel_test_config.model_parallel = False
+
+    with pytest.raises(ValueError):
+        parallel_test_config.model_parallel = 0
+
+    with pytest.raises(TypeError):
+        parallel_test_config.vocab_emb_dp = 0
+
+    assert not parallel_test_config.vocab_emb_dp
diff --git a/tests/ut/python/parallel/test_virtual_output.py b/tests/ut/python/parallel/test_virtual_output.py
index 834dc1906f8..3d2067cc287 100644
--- a/tests/ut/python/parallel/test_virtual_output.py
+++ b/tests/ut/python/parallel/test_virtual_output.py
@@ -132,7 +132,8 @@ def compile_graph_two_input(x, y, net):
 
 def test_dense_relu_semi_auto():
     context.reset_auto_parallel_context()
-    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="semi_auto_parallel", full_batch=False)
+    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="semi_auto_parallel",
+                                      dataset_strategy="data_parallel")
     net = DenseMutMulNet()
     x = Tensor(np.ones([32, 128]).astype(np.float32) * 0.01)
     strategies = compile_graph(x, net)
@@ -142,7 +143,8 @@ def test_dense_relu_semi_auto():
 
 def test_dense_relu_semi_auto_full_batch():
     context.reset_auto_parallel_context()
-    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="semi_auto_parallel", full_batch=True)
+    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="semi_auto_parallel",
+                                      dataset_strategy="full_batch")
     net = DenseMutMulNet()
     x = Tensor(np.ones([32, 128]).astype(np.float32) * 0.01)
     strategies = compile_graph(x, net)
@@ -152,7 +154,8 @@ def test_dense_relu_semi_auto_full_batch():
 
 def test_dense_relu_auto():
     context.reset_auto_parallel_context()
-    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="auto_parallel", full_batch=False)
+    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="auto_parallel",
+                                      dataset_strategy="data_parallel")
     net = DenseMutMulNet()
     x = Tensor(np.ones([32, 128]).astype(np.float32) * 0.01)
     strategies = compile_graph(x, net)
@@ -162,7 +165,8 @@ def test_dense_relu_auto():
 
 def test_dense_relu_auto_full_batch():
     context.reset_auto_parallel_context()
-    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="auto_parallel", full_batch=True)
+    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="auto_parallel",
+                                      dataset_strategy="full_batch")
     net = DenseMutMulNet()
     x = Tensor(np.ones([32, 128]).astype(np.float32) * 0.01)
     strategies = compile_graph(x, net)
@@ -172,7 +176,8 @@ def test_dense_relu_auto_full_batch():
 
 def test_mul_neg_two_output_semi_auto():
     context.reset_auto_parallel_context()
-    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="semi_auto_parallel", full_batch=False)
+    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="semi_auto_parallel",
+                                      dataset_strategy="data_parallel")
     net = MulNegTwoOutputNet()
     x = Tensor(np.ones([32, 128]).astype(np.float32) * 0.01)
     strategies = compile_graph(x, net)
@@ -185,7 +190,8 @@ def test_mul_neg_two_output_semi_auto():
 
 def test_mul_neg_two_output_semi_auto_full_batch():
     context.reset_auto_parallel_context()
-    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="semi_auto_parallel", full_batch=True)
+    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="semi_auto_parallel",
+                                      dataset_strategy="full_batch")
     net = MulNegTwoOutputNet()
     x = Tensor(np.ones([32, 128]).astype(np.float32) * 0.01)
     strategies = compile_graph(x, net)
@@ -198,7 +204,8 @@ def test_mul_neg_two_output_semi_auto_full_batch():
 
 def test_mul_neg_two_output_auto():
     context.reset_auto_parallel_context()
-    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="auto_parallel", full_batch=False)
+    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="auto_parallel",
+                                      dataset_strategy="data_parallel")
     net = MulNegTwoOutputNet()
     x = Tensor(np.ones([32, 128]).astype(np.float32) * 0.01)
     strategies = compile_graph(x, net)
@@ -211,7 +218,8 @@ def test_mul_neg_two_output_auto():
 
 def test_mul_neg_two_output_full_batch():
     context.reset_auto_parallel_context()
-    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="auto_parallel", full_batch=True)
+    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="auto_parallel",
+                                      dataset_strategy="full_batch")
     net = MulNegTwoOutputNet()
     x = Tensor(np.ones([32, 128]).astype(np.float32) * 0.01)
     strategies = compile_graph(x, net)
@@ -224,7 +232,8 @@ def test_mul_neg_two_output_full_batch():
 
 def test_reshape_matmul_semi_auto():
     context.reset_auto_parallel_context()
-    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="semi_auto_parallel", full_batch=False)
+    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="semi_auto_parallel",
+                                      dataset_strategy="data_parallel")
     strategy1 = None
     strategy2 = ((1, 1), (1, 8))
     net = ReshapeMatMulNet(strategy1, strategy2)
@@ -236,7 +245,8 @@ def test_reshape_matmul_semi_auto():
 
 def test_reshape_matmul_auto():
     context.reset_auto_parallel_context()
-    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="auto_parallel", full_batch=False)
+    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="auto_parallel",
+                                      dataset_strategy="data_parallel")
     strategy1 = None
     strategy2 = ((1, 1), (1, 8))
     net = ReshapeMatMulNet(strategy1, strategy2)
@@ -248,7 +258,8 @@ def test_reshape_matmul_auto():
 
 def test_matmul_reshape_semi_auto():
     context.reset_auto_parallel_context()
-    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="semi_auto_parallel", full_batch=False)
+    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="semi_auto_parallel",
+                                      dataset_strategy="data_parallel")
     strategy2 = None
     strategy1 = ((1, 1), (1, 8))
     net = MatMulReshapeNet(strategy1, strategy2)
@@ -260,7 +271,8 @@ def test_matmul_reshape_semi_auto():
 
 def test_matmul_reshape_auto():
     context.reset_auto_parallel_context()
-    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="auto_parallel", full_batch=False)
+    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="auto_parallel",
+                                      dataset_strategy="data_parallel")
     strategy2 = None
     strategy1 = ((1, 1), (1, 8))
     net = MatMulReshapeNet(strategy1, strategy2)
@@ -272,7 +284,8 @@ def test_matmul_reshape_auto():
 
 def test_reshape_mul_semi_auto():
     context.reset_auto_parallel_context()
-    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="semi_auto_parallel", full_batch=True)
+    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="semi_auto_parallel",
+                                      dataset_strategy="full_batch")
     net = ReshapeMulNet()
     x = Tensor(np.ones([64, 4]), ms.float32)
     strategies = compile_graph(x, net)
@@ -282,7 +295,8 @@ def test_reshape_mul_semi_auto():
 
 def test_reshape_mul_auto():
     context.reset_auto_parallel_context()
-    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="auto_parallel", full_batch=True)
+    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="auto_parallel",
+                                      dataset_strategy="full_batch")
     net = ReshapeMulNet()
     x = Tensor(np.ones([64, 4]), ms.float32)
     strategies = compile_graph(x, net)
@@ -292,7 +306,8 @@ def test_reshape_mul_auto():
 
 def test_scalar_output_semi_auto():
     context.reset_auto_parallel_context()
-    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="semi_auto_parallel", full_batch=False)
+    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="semi_auto_parallel",
+                                      dataset_strategy="data_parallel")
     net = ParallelMulNet()
     loss_fn = nn.SoftmaxCrossEntropyWithLogits(reduction='mean')
     eval_net = nn.WithEvalCell(net, loss_fn)
@@ -308,7 +323,8 @@ def test_scalar_output_semi_auto():
 
 def test_scalar_output_auto():
     context.reset_auto_parallel_context()
-    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="auto_parallel", full_batch=False)
+    context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="auto_parallel",
+                                      dataset_strategy="data_parallel")
     net = ParallelMulNet()
     loss_fn = nn.SoftmaxCrossEntropyWithLogits(reduction='mean')
     eval_net = nn.WithEvalCell(net, loss_fn)
diff --git a/tests/ut/python/profiler/parser/test_minddata_analyzer.py b/tests/ut/python/profiler/parser/test_minddata_analyzer.py
index ea87eb73626..9297efecda3 100644
--- a/tests/ut/python/profiler/parser/test_minddata_analyzer.py
+++ b/tests/ut/python/profiler/parser/test_minddata_analyzer.py
@@ -125,7 +125,7 @@ def test_analyze_basic():
         # 1. returned dictionary
         # 2. JSON file
         # 3. CSV file
-        md_analyzer = MinddataProfilingAnalyzer(ANALYZE_FILE_PATH, "CPU", 0, ANALYZE_FILE_PATH)
+        md_analyzer = MinddataProfilingAnalyzer(ANALYZE_FILE_PATH, 0, ANALYZE_FILE_PATH)
         md_summary_dict = md_analyzer.analyze()
 
         # Confirm MindData Profiling analyze summary files are created
diff --git a/version.txt b/version.txt
index e21e727f96f..13175fdc437 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-1.4.0
\ No newline at end of file
+1.4.1
\ No newline at end of file