forked from mindspore-Ecosystem/mindspore

[feat] [assistant] [I3T96T] add new Dataset operator CMUARCTICDataset

This commit is contained in: parent c5bc9af49f, commit b077aa1cab
@@ -24,6 +24,9 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
         -Wno-return-std-move -Wno-unused-private-field -Wno-unused-lambda-capture -Wno-sign-compare \
         -Wno-overloaded-virtual -Wno-unneeded-internal-declaration -Wno-unused-variable -Wno-pessimizing-move \
         -Wno-inconsistent-missing-override -DHALF_ENABLE_CPP11_USER_LITERALS=0 -D_FORTIFY_SOURCE=2")
+elseif(ENABLE_SYM_FILE)
+    set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O2 -g -ggdb -Wl,--allow-shlib-undefined \
+        -DHALF_ENABLE_CPP11_USER_LITERALS=0 -D_FORTIFY_SOURCE=2")
 else()
     set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O2 -Wl,--allow-shlib-undefined \
         -DHALF_ENABLE_CPP11_USER_LITERALS=0 -D_FORTIFY_SOURCE=2")

build.sh (12 changes)

@@ -27,7 +27,7 @@ usage()
     echo " [-P on|off] [-z [on|off]] [-M on|off] [-V 10.1|11.1|310|910] [-I arm64|arm32|x86_64] [-K] \\"
     echo " [-B on|off] [-E] [-l on|off] [-n full|lite|off] [-H on|off] \\"
     echo " [-A on|off] [-S on|off] [-k on|off] [-W sse|neon|avx|avx512|off] \\"
-    echo " [-L Tensor-RT path] \\"
+    echo " [-L Tensor-RT path] [-y on|off] \\"
     echo ""
     echo "Options:"
     echo "    -d Debug mode"
@@ -64,6 +64,7 @@ usage()
     echo "    -W Enable x86_64 SSE or AVX instruction set, use [sse|neon|avx|avx512|off], default off for lite and avx for CPU"
     echo "    -H Enable hidden"
     echo "    -L Link and specify Tensor-RT library path, default disable Tensor-RT lib linking"
+    echo "    -y Compile the symbol table switch and save the symbol table to the directory output"
 }

 # check value of input is 'on' or 'off'
@@ -122,8 +123,9 @@ checkopts()
     TENSORRT_HOME=""
     USER_ENABLE_DUMP_IR=false
     USER_ENABLE_DEBUGGER=false
+    ENABLE_SYM_FILE="off"
     # Process the options
-    while getopts 'drvj:c:t:hb:s:a:g:p:ie:m:l:I:RP:D:zM:V:K:B:En:A:S:k:W:H:L:' opt
+    while getopts 'drvj:c:t:hb:s:a:g:p:ie:m:l:I:RP:D:zM:V:K:B:En:A:S:k:W:H:L:y' opt
     do
         CASE_SENSIVE_ARG=${OPTARG}
         OPTARG=$(echo ${OPTARG} | tr '[A-Z]' '[a-z]')
@@ -140,6 +142,9 @@ checkopts()
                 exit 1
             fi
             ;;
+        y)
+            ENABLE_SYM_FILE="on"
+            ;;
         r)
            DEBUG_MODE="off"
            ;;
@@ -442,6 +447,9 @@ build_mindspore()
     if [[ -n "$TRAIN_MODE" ]]; then
         CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_${TRAIN_MODE}=ON"
     fi
+    if [[ "X$ENABLE_SYM_FILE" = "Xon" ]]; then
+        CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_SYM_FILE=ON"
+    fi
     if [[ "X$ENABLE_ASAN" = "Xon" ]]; then
         CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_ASAN=ON"
     fi

@@ -1,10 +1,10 @@
 if(MSVC)
     set(flatbuffers_CXXFLAGS "${CMAKE_CXX_FLAGS}")
-    set(flatbuffers_CFLAGS "${CMAKE_CXX_FLAGS}")
+    set(flatbuffers_CFLAGS "${CMAKE_C_FLAGS}")
     set(flatbuffers_LDFLAGS "${CMAKE_SHARED_LINKER_FLAGS}")
 else()
-    set(flatbuffers_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2")
-    set(flatbuffers_CFLAGS "-D_FORTIFY_SOURCE=2 -O2")
+    set(flatbuffers_CXXFLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O2 -fstack-protector-strong")
+    set(flatbuffers_CFLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O2 -fstack-protector-strong")
 endif()

 if(WIN32)
@@ -89,7 +89,6 @@ if(ENABLE_MINDDATA)
     include(${CMAKE_SOURCE_DIR}/cmake/external_libs/tinyxml2.cmake)
     include(${CMAKE_SOURCE_DIR}/cmake/external_libs/cppjieba.cmake)
     include(${CMAKE_SOURCE_DIR}/cmake/external_libs/sentencepiece.cmake)
-    include(${CMAKE_SOURCE_DIR}/cmake/external_libs/ffmpeg.cmake)
 endif()

 if(ENABLE_MINDDATA)
@@ -25,6 +25,7 @@ option(ENABLE_ACL "enable acl" OFF)
 option(ENABLE_GLIBCXX "enable_glibcxx" OFF)
 option(MODE_ASCEND_ALL "supports all ascend platform" OFF)
 option(MODE_ASCEND_ACL "supports ascend acl mode only" OFF)
+option(ENABLE_SYM_FILE "enable sym file" OFF)

 if(NOT ENABLE_D AND NOT ENABLE_TESTCASES AND NOT ENABLE_ACL AND NOT ENABLE_GE)
     set(ENABLE_GLIBCXX ON)
@@ -12,6 +12,8 @@ set(CPACK_TEMPORARY_PACKAGE_FILE_NAME ${BUILD_PATH}/package/mindspore)
 set(CPACK_TEMPORARY_INSTALL_DIRECTORY ${BUILD_PATH}/package/mindspore)
 set(CPACK_PACK_ROOT_DIR ${BUILD_PATH}/package/)
 set(CPACK_CMAKE_SOURCE_DIR ${CMAKE_SOURCE_DIR})
+set(CPACK_ENABLE_SYM_FILE ${ENABLE_SYM_FILE})
+set(CPACK_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE})
 if(ENABLE_GE)
     set(CPACK_MS_BACKEND "ge")
     set(CPACK_MS_TARGET "ascend or cpu")
@@ -125,17 +127,6 @@ if(ENABLE_MINDDATA)
             DESTINATION ${INSTALL_LIB_DIR} RENAME libicudata.so.67 COMPONENT mindspore)
     install(FILES ${icu4c_LIBPATH}/libicui18n.so.67.1
             DESTINATION ${INSTALL_LIB_DIR} RENAME libicui18n.so.67 COMPONENT mindspore)
-
-    install(FILES ${ffmpeg_LIBPATH}/libavcodec.so.58.91.100
-            DESTINATION ${INSTALL_LIB_DIR} RENAME libavcodec.so.58 COMPONENT mindspore)
-    install(FILES ${ffmpeg_LIBPATH}/libavformat.so.58.45.100
-            DESTINATION ${INSTALL_LIB_DIR} RENAME libavformat.so.58 COMPONENT mindspore)
-    install(FILES ${ffmpeg_LIBPATH}/libavutil.so.56.51.100
-            DESTINATION ${INSTALL_LIB_DIR} RENAME libavutil.so.56 COMPONENT mindspore)
-    install(FILES ${ffmpeg_LIBPATH}/libswresample.so.3.7.100
-            DESTINATION ${INSTALL_LIB_DIR} RENAME libswresample.so.3 COMPONENT mindspore)
-    install(FILES ${ffmpeg_LIBPATH}/libswscale.so.5.7.100
-            DESTINATION ${INSTALL_LIB_DIR} RENAME libswscale.so.5 COMPONENT mindspore)
 endif()

 if(ENABLE_CPU)
@@ -77,6 +77,48 @@ set(ENV{BACKEND_TARGET} ${CPACK_MS_TARGET})
 set(ENV{MS_PACKAGE_NAME} ${CPACK_MS_PACKAGE_NAME})
 set(ENV{COMMIT_ID} ${GIT_COMMIT_ID})

+file(GLOB DEBUG_SYM
+    ${MS_PACK_ROOT_DIR}/mindspore/*.so
+    ${MS_PACK_ROOT_DIR}/mindspore/lib/*.so
+)
+
+file(GLOB DEBUG_STRIP_SYM
+    ${MS_PACK_ROOT_DIR}/mindspore/*.so
+    ${MS_PACK_ROOT_DIR}/mindspore/lib/*.so*
+)
+
+set(CMAKE_OBJCOPY $ENV{CROSS_COMPILE}objcopy)
+set(CMAKE_STRIP $ENV{CROSS_COMPILE}strip)
+
+if(CPACK_ENABLE_SYM_FILE)
+    foreach(schema ${DEBUG_SYM})
+        execute_process(
+            COMMAND ${CMAKE_OBJCOPY} "--only-keep-debug" ${schema} ${schema}.sym
+            WORKING_DIRECTORY ${MS_PACK_ROOT_DIR}
+        )
+    endforeach()
+endif()
+
+if("${CPACK_CMAKE_BUILD_TYPE}" STREQUAL "Release")
+    foreach(schema ${DEBUG_STRIP_SYM})
+        execute_process(
+            COMMAND ${CMAKE_STRIP} ${schema}
+            WORKING_DIRECTORY ${MS_PACK_ROOT_DIR}
+        )
+    endforeach()
+endif()
+
+file(GLOB DEBUG_SYM_FILE
+    ${MS_PACK_ROOT_DIR}/mindspore/*.sym
+    ${MS_PACK_ROOT_DIR}/mindspore/lib/*.sym
+)
+
+if(CPACK_ENABLE_SYM_FILE)
+    file(MAKE_DIRECTORY ${MS_ROOT_DIR}/debug_info)
+    file(COPY ${DEBUG_SYM_FILE} DESTINATION ${MS_ROOT_DIR}/debug_info/)
+    file(REMOVE_RECURSE ${DEBUG_SYM_FILE})
+endif()
+
 execute_process(
     COMMAND ${PYTHON} ${MS_ROOT_DIR}/setup.py "bdist_wheel"
     WORKING_DIRECTORY ${MS_PACK_ROOT_DIR}
@@ -104,3 +146,16 @@ file(COPY ${MS_PACK_ROOT_DIR}/${NEW_FILE_NAME} DESTINATION ${MS_ROOT_DIR}/output

 file(SHA256 ${MS_ROOT_DIR}/output/${NEW_FILE_NAME} SHA256_VAR)
 file(WRITE ${MS_ROOT_DIR}/output/${NEW_FILE_NAME}.sha256 ${SHA256_VAR} " " ${NEW_FILE_NAME})
+set(CMAKE_TAR $ENV{CROSS_COMPILE}tar)
+if(CPACK_ENABLE_SYM_FILE)
+    file(MAKE_DIRECTORY ${MS_ROOT_DIR}/output/${PACKAGE_NAME}-${VERSION}-${PY_TAGS}-${PLATFORM_TAG})
+    file(COPY ${MS_ROOT_DIR}/debug_info/ DESTINATION
+         ${MS_ROOT_DIR}/output/${PACKAGE_NAME}-${VERSION}-${PY_TAGS}-${PLATFORM_TAG}/)
+    execute_process(COMMAND
+        ${CMAKE_COMMAND} -E ${CMAKE_TAR} cfv
+        ${MS_ROOT_DIR}/output/${PACKAGE_NAME}-${VERSION}-${PY_TAGS}-${PLATFORM_TAG}.zip
+        ${MS_ROOT_DIR}/output/${PACKAGE_NAME}-${VERSION}-${PY_TAGS}-${PLATFORM_TAG}/ --format=zip
+        WORKING_DIRECTORY ${MS_ROOT_DIR})
+    file(REMOVE_RECURSE ${MS_ROOT_DIR}/debug_info)
+    file(REMOVE_RECURSE ${MS_ROOT_DIR}/output/${PACKAGE_NAME}-${VERSION}-${PY_TAGS}-${PLATFORM_TAG})
+endif()
@@ -91,18 +91,6 @@ if(ENABLE_MINDDATA)
             DESTINATION ${INSTALL_LIB_DIR}
             COMPONENT mindspore
     )
-    file(GLOB_RECURSE FFMPEG_LIB_LIST
-            ${ffmpeg_LIBPATH}/libavcodec*
-            ${ffmpeg_LIBPATH}/libavformat*
-            ${ffmpeg_LIBPATH}/libavutil*
-            ${ffmpeg_LIBPATH}/libswresample*
-            ${ffmpeg_LIBPATH}/libswscale*
-    )
-    install(
-        FILES ${FFMPEG_LIB_LIST}
-        DESTINATION ${INSTALL_LIB_DIR}
-        COMPONENT mindspore
-    )
 endif()

 # CPU mode
@@ -42,7 +42,6 @@ set(opencv_LIBPATH ${opencv_LIBPATH}/../bin/)
 set(jpeg_turbo_LIBPATH ${jpeg_turbo_LIBPATH}/../bin/)
 set(sqlite_LIBPATH ${sqlite_LIBPATH}/../bin/)
 set(tinyxml2_LIBPATH ${tinyxml2_LIBPATH}/../bin/)
-set(ffmpeg_LIBPATH ${ffmpeg_LIBPATH}/../bin/)

 message("offline debugger does not support windows system temporarily")

@@ -98,18 +97,6 @@ if(ENABLE_MINDDATA)
             DESTINATION ${INSTALL_LIB_DIR}
             COMPONENT mindspore
     )
-    file(GLOB_RECURSE FFMPEG_LIB_LIST
-            ${ffmpeg_LIBPATH}/libavcodec*
-            ${ffmpeg_LIBPATH}/libavformat*
-            ${ffmpeg_LIBPATH}/libavutil*
-            ${ffmpeg_LIBPATH}/libswresample*
-            ${ffmpeg_LIBPATH}/libswscale*
-    )
-    install(
-        FILES ${FFMPEG_LIB_LIST}
-        DESTINATION ${INSTALL_LIB_DIR}
-        COMPONENT mindspore
-    )
 endif()

 if(ENABLE_CPU)
@@ -1,2 +1,4 @@
 approvers:
 - zhoufeng54
+reviewers:
+- HW_KK
@@ -58,8 +58,11 @@ RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \
     && make install -j4 \
     && rm -f /usr/local/bin/python \
     && rm -f /usr/local/bin/pip \
+    && rm -f /usr/local/lib/libpython3.7m.so.1.0 \
     && ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \
     && ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \
+    && ln -s ${PYTHON_ROOT_PATH}/lib/libpython3.7m.so.1.0 /usr/local/lib/libpython3.7m.so.1.0 \
+    && ldconfig \
     && rm -rf /tmp/cpython-3.7.5 \
     && rm -f /tmp/v3.7.5.tar.gz

@@ -51,13 +51,16 @@ RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \
     && tar -xvf v3.7.5.tar.gz \
     && cd /tmp/cpython-3.7.5 \
     && mkdir -p ${PYTHON_ROOT_PATH} \
-    && ./configure --prefix=${PYTHON_ROOT_PATH} \
+    && ./configure --prefix=${PYTHON_ROOT_PATH} --enable-shared \
     && make -j4 \
     && make install -j4 \
     && rm -f /usr/local/bin/python \
     && rm -f /usr/local/bin/pip \
+    && rm -f /usr/local/lib/libpython3.7m.so.1.0 \
     && ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \
     && ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \
+    && ln -s ${PYTHON_ROOT_PATH}/lib/libpython3.7m.so.1.0 /usr/local/lib/libpython3.7m.so.1.0 \
+    && ldconfig \
     && rm -rf /tmp/cpython-3.7.5 \
     && rm -f /tmp/v3.7.5.tar.gz

@@ -1,4 +1,4 @@
-FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
+FROM nvidia/cuda:11.1-cudnn8-devel-ubuntu18.04

 MAINTAINER leonwanghui <leon.wanghui@huawei.com>

@@ -43,7 +43,7 @@ RUN DEBIAN_FRONTEND=noninteractive apt install -y \
     libnuma-dev

 # Configure cuDNN (v7.6.5)
-RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7.6.5 /usr/local/cuda/lib64/libcudnn.so
+RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.8.0.5 /usr/local/cuda/lib64/libcudnn.so

 # Set bash
 RUN echo "dash dash/sh boolean false" | debconf-set-selections
@@ -62,8 +62,11 @@ RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \
     && make install -j4 \
     && rm -f /usr/local/bin/python \
     && rm -f /usr/local/bin/pip \
+    && rm -f /usr/local/lib/libpython3.7m.so.1.0 \
     && ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \
     && ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \
+    && ln -s ${PYTHON_ROOT_PATH}/lib/libpython3.7m.so.1.0 /usr/local/lib/libpython3.7m.so.1.0 \
+    && ldconfig \
     && rm -rf /tmp/cpython-3.7.5 \
     && rm -f /tmp/v3.7.5.tar.gz

@@ -1,4 +1,4 @@
-FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
+FROM nvidia/cuda:11.1-cudnn8-devel-ubuntu18.04

 MAINTAINER leonwanghui <leon.wanghui@huawei.com>

@@ -53,13 +53,16 @@ RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \
     && tar -xvf v3.7.5.tar.gz \
     && cd /tmp/cpython-3.7.5 \
     && mkdir -p ${PYTHON_ROOT_PATH} \
-    && ./configure --prefix=${PYTHON_ROOT_PATH} \
+    && ./configure --prefix=${PYTHON_ROOT_PATH} --enable-shared \
     && make -j4 \
     && make install -j4 \
     && rm -f /usr/local/bin/python \
     && rm -f /usr/local/bin/pip \
+    && rm -f /usr/local/lib/libpython3.7m.so.1.0 \
     && ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \
     && ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \
+    && ln -s ${PYTHON_ROOT_PATH}/lib/libpython3.7m.so.1.0 /usr/local/lib/libpython3.7m.so.1.0 \
+    && ldconfig \
     && rm -rf /tmp/cpython-3.7.5 \
     && rm -f /tmp/v3.7.5.tar.gz

@@ -38,12 +38,19 @@ class Allocator;
 class Delegate;
 class DeviceInfoContext;

+/// \brief Context is used to store environment variables during execution.
 class MS_API Context {
  public:
   Context();
   ~Context() = default;

+  /// \brief Set the number of threads at runtime. This option is only valid for MindSpore Lite.
+  ///
+  /// \param[in] thread_num the number of threads at runtime.
   void SetThreadNum(int32_t thread_num);
+  /// \brief Get the current thread number setting.
+  ///
+  /// \return The current thread number setting.
   int32_t GetThreadNum() const;

   /// \brief Set the thread affinity to CPU cores.
@@ -60,6 +67,10 @@ class MS_API Context {
   void SetDelegate(const std::shared_ptr<Delegate> &delegate);
   std::shared_ptr<Delegate> GetDelegate() const;

+  /// \brief Get a mutable reference of DeviceInfoContext vector in this context. Only MindSpore Lite supports
+  /// heterogeneous scenarios with multiple members in the vector.
+  ///
+  /// \return Mutable reference of DeviceInfoContext vector in this context.
   std::vector<std::shared_ptr<DeviceInfoContext>> &MutableDeviceInfo();

  private:
@@ -67,14 +78,24 @@ class MS_API Context {
   std::shared_ptr<Data> data_;
 };

+/// \brief DeviceInfoContext defines different device contexts.
 class MS_API DeviceInfoContext : public std::enable_shared_from_this<DeviceInfoContext> {
  public:
   struct Data;

   DeviceInfoContext();
   virtual ~DeviceInfoContext() = default;

+  /// \brief Get the type of this DeviceInfoContext.
+  ///
+  /// \return Type of this DeviceInfoContext.
   virtual enum DeviceType GetDeviceType() const = 0;

+  /// \brief A similar function to RTTI is provided when the -fno-rtti compilation option is turned on, which converts
+  /// DeviceInfoContext to a shared pointer of type T, and returns nullptr if the conversion fails.
+  ///
+  /// \param T Type
+  /// \return A pointer of type T after conversion. If the conversion fails, it will be nullptr.
   template <class T>
   std::shared_ptr<T> Cast() {
     static_assert(std::is_base_of<DeviceInfoContext, T>::value, "Wrong cast type.");
@@ -98,27 +119,60 @@ class MS_API DeviceInfoContext : public std::enable_shared_from_this<DeviceInfoContext> {
   std::shared_ptr<Data> data_;
 };

+/// \brief Derived from DeviceInfoContext, the configuration of the model running on the CPU. This option is only valid
+/// for MindSpore Lite.
 class MS_API CPUDeviceInfo : public DeviceInfoContext {
  public:
+  /// \brief Get the type of this DeviceInfoContext.
+  ///
+  /// \return Type of this DeviceInfoContext.
   enum DeviceType GetDeviceType() const override { return DeviceType::kCPU; };

+  /// \brief Set whether to enable float16 inference.
+  ///
+  /// \param[in] is_fp16 Enable float16 inference or not.
   void SetEnableFP16(bool is_fp16);
+  /// \brief Get whether float16 inference is enabled.
+  ///
+  /// \return Whether float16 inference is enabled.
   bool GetEnableFP16() const;
 };

+/// \brief Derived from DeviceInfoContext, the configuration of the model running on the NPU. This option is only valid
+/// for MindSpore Lite.
 class MS_API KirinNPUDeviceInfo : public DeviceInfoContext {
  public:
+  /// \brief Get the type of this DeviceInfoContext.
+  ///
+  /// \return Type of this DeviceInfoContext.
   enum DeviceType GetDeviceType() const override { return DeviceType::kKirinNPU; };

+  /// \brief Set the NPU frequency.
+  ///
+  /// \param[in] frequency Can be set to 1 (low power consumption), 2 (balanced), 3 (high performance), 4 (extreme
+  /// performance), default as 3.
   void SetFrequency(int frequency);
+  /// \brief Get the NPU frequency.
+  ///
+  /// \return NPU frequency
   int GetFrequency() const;
 };

+/// \brief Derived from DeviceInfoContext, the configuration of the model running on the GPU.
 class MS_API GPUDeviceInfo : public DeviceInfoContext {
  public:
+  /// \brief Get the type of this DeviceInfoContext.
+  ///
+  /// \return Type of this DeviceInfoContext.
   enum DeviceType GetDeviceType() const override { return DeviceType::kGPU; };

+  /// \brief Set device id.
+  ///
+  /// \param[in] device_id The device id.
   void SetDeviceID(uint32_t device_id);
+  /// \brief Get the device id.
+  ///
+  /// \return The device id.
   uint32_t GetDeviceID() const;

   void SetGpuTrtInferMode(bool gpu_trt_infer_mode);
@@ -127,8 +181,15 @@ class MS_API GPUDeviceInfo : public DeviceInfoContext {
   inline void SetPrecisionMode(const std::string &precision_mode);
   inline std::string GetPrecisionMode() const;

+  /// \brief Set whether to enable float16 inference.
+  ///
+  /// \param[in] is_fp16 Enable float16 inference or not.
   void SetEnableFP16(bool is_fp16);
+  /// \brief Get whether float16 inference is enabled.
+  ///
+  /// \return Whether float16 inference is enabled.
   bool GetEnableFP16() const;

  private:
   void SetPrecisionMode(const std::vector<char> &precision_mode);
   std::vector<char> GetPrecisionModeChar() const;
@@ -139,52 +200,113 @@ void GPUDeviceInfo::SetPrecisionMode(const std::string &precision_mode) {
 }
 std::string GPUDeviceInfo::GetPrecisionMode() const { return CharToString(GetPrecisionModeChar()); }

+/// \brief Derived from DeviceInfoContext, the configuration of the model running on the Ascend910. This option is
+/// invalid for MindSpore Lite.
 class MS_API Ascend910DeviceInfo : public DeviceInfoContext {
  public:
+  /// \brief Get the type of this DeviceInfoContext.
+  ///
+  /// \return Type of this DeviceInfoContext.
   enum DeviceType GetDeviceType() const override { return DeviceType::kAscend910; };

+  /// \brief Set device id.
+  ///
+  /// \param[in] device_id The device id.
   void SetDeviceID(uint32_t device_id);
+  /// \brief Get the device id.
+  ///
+  /// \return The device id.
   uint32_t GetDeviceID() const;
 };

+/// \brief Derived from DeviceInfoContext, the configuration of the model running on the Ascend310. This option is
+/// invalid for MindSpore Lite.
 class MS_API Ascend310DeviceInfo : public DeviceInfoContext {
  public:
+  /// \brief Get the type of this DeviceInfoContext.
+  ///
+  /// \return Type of this DeviceInfoContext.
   enum DeviceType GetDeviceType() const override { return DeviceType::kAscend310; };

+  /// \brief Set device id.
+  ///
+  /// \param[in] device_id The device id.
   void SetDeviceID(uint32_t device_id);
+  /// \brief Get the device id.
+  ///
+  /// \return The device id.
   uint32_t GetDeviceID() const;

   inline void SetDumpConfigPath(const std::string &cfg_path);
   inline std::string GetDumpConfigPath() const;

   // aipp config file
+  /// \brief Set AIPP configuration file path.
+  ///
+  /// \param[in] cfg_path AIPP configuration file path.
   inline void SetInsertOpConfigPath(const std::string &cfg_path);
+  /// \brief Get AIPP configuration file path.
+  ///
+  /// \return AIPP configuration file path.
   inline std::string GetInsertOpConfigPath() const;

   // nchw or nhwc
+  /// \brief Set format of model inputs.
+  ///
+  /// \param[in] format Optional "NCHW", "NHWC", etc.
   inline void SetInputFormat(const std::string &format);
+  /// \brief Get format of model inputs.
+  ///
+  /// \return The format of model inputs.
   inline std::string GetInputFormat() const;

   // Mandatory while dynamic batch: e.g. "input_op_name1: 1,2,3,4;input_op_name2: 4,3,2,1"
+  /// \brief Set shape of model inputs.
+  ///
+  /// \param[in] shape e.g. "input_op_name1: 1,2,3,4;input_op_name2: 4,3,2,1".
   inline void SetInputShape(const std::string &shape);
+  /// \brief Get shape of model inputs.
+  ///
+  /// \return The shape of model inputs.
   inline std::string GetInputShape() const;

+  /// \brief Set shape of model inputs.
+  ///
+  /// \param[in] shape e.g. {{1, {1,2,3,4}}, {2, {4,3,2,1}}} means the first input shape 1,2,3,4 and the second input
+  /// shape 4,3,2,1.
   void SetInputShapeMap(const std::map<int, std::vector<int>> &shape);
+  /// \brief Get shape of model inputs.
+  ///
+  /// \return The shape of model inputs.
   std::map<int, std::vector<int>> GetInputShapeMap() const;

   void SetDynamicBatchSize(const std::vector<size_t> &dynamic_batch_size);
   inline std::string GetDynamicBatchSize() const;

   // FP32, UINT8 or FP16, default as FP32
+  /// \brief Set type of model outputs.
+  ///
+  /// \param[in] output_type FP32, UINT8 or FP16, default as FP32.
   void SetOutputType(enum DataType output_type);
+  /// \brief Get type of model outputs.
+  ///
+  /// \return The set type of model outputs.
   enum DataType GetOutputType() const;

   // "force_fp16", "allow_fp32_to_fp16", "must_keep_origin_dtype" or "allow_mix_precision", default as "force_fp16"
+  /// \brief Set precision mode of model.
+  ///
+  /// \param[in] precision_mode Optional "force_fp16", "allow_fp32_to_fp16", "must_keep_origin_dtype" and
+  /// "allow_mix_precision", "force_fp16" is set as default.
   inline void SetPrecisionMode(const std::string &precision_mode);
+  /// \brief Get precision mode of model.
+  ///
+  /// \return The set precision mode.
   inline std::string GetPrecisionMode() const;

   // Optional "high_performance" and "high_precision", "high_performance" is set as default
+  /// \brief Set op select implementation mode.
+  ///
+  /// \param[in] op_select_impl_mode Optional "high_performance" and "high_precision", "high_performance" is set as
+  /// default.
   inline void SetOpSelectImplMode(const std::string &op_select_impl_mode);
+  /// \brief Get op select implementation mode.
+  ///
+  /// \return The set op select implementation mode.
   inline std::string GetOpSelectImplMode() const;

   inline void SetFusionSwitchConfigPath(const std::string &cfg_path);

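Taken together, the intended pattern for these classes is: construct a Context, fill MutableDeviceInfo() with one or more DeviceInfoContext subclasses, and hand the context to Model::Build. A minimal sketch based only on the declarations above (the device id and FP16 choices are illustrative, not prescribed by the commit):

#include <memory>
#include "include/api/context.h"

std::shared_ptr<mindspore::Context> MakeGpuContext() {
  auto context = std::make_shared<mindspore::Context>();
  auto gpu_info = std::make_shared<mindspore::GPUDeviceInfo>();
  gpu_info->SetDeviceID(0);       // declared above
  gpu_info->SetEnableFP16(true);  // float16 inference, declared above
  // MutableDeviceInfo() returns the vector by reference, so push_back registers the device.
  context->MutableDeviceInfo().push_back(gpu_info);
  // Cast<T>() is the RTTI-free downcast documented above.
  auto back = context->MutableDeviceInfo()[0]->Cast<mindspore::GPUDeviceInfo>();
  return back == nullptr ? nullptr : context;
}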
@@ -37,32 +37,75 @@ class Metrics;
 namespace dataset {
 class Dataset;
 }  // namespace dataset

+/// \brief The Model class is used to define a MindSpore model, facilitating computational graph management.
 class MS_API Model {
  public:
   Model();
   ~Model();
   Model(const Model &) = delete;
   void operator=(const Model &) = delete;

+  /// \brief Builds a model so that it can run on a device.
+  ///
+  /// \param[in] graph GraphCell is a derivative of Cell. Cell is not available currently. GraphCell can be constructed
+  /// from Graph, for example, model.Build(GraphCell(graph), context).
+  /// \param[in] model_context A context used to store options during execution.
+  /// \param[in] train_cfg A config used by training.
+  ///
+  /// \return Status.
   Status Build(GraphCell graph, const std::shared_ptr<Context> &model_context = nullptr,
                const std::shared_ptr<TrainCfg> &train_cfg = nullptr);

+  /// \brief Resizes the shapes of inputs.
+  ///
+  /// \param[in] inputs A vector that includes all input tensors in order.
+  /// \param[in] dims Defines the new shapes of inputs, should be consistent with inputs.
+  ///
+  /// \return Status.
   Status Resize(const std::vector<MSTensor> &inputs, const std::vector<std::vector<int64_t>> &dims);

+  /// \brief Inference model.
+  ///
+  /// \param[in] inputs A vector where model inputs are arranged in sequence.
+  /// \param[out] outputs Which is a pointer to a vector. The model outputs are filled in the container in sequence.
+  /// \param[in] before CallBack before predict.
+  /// \param[in] after CallBack after predict.
+  ///
+  /// \return Status.
   Status Predict(const std::vector<MSTensor> &inputs, std::vector<MSTensor> *outputs,
                  const MSKernelCallBack &before = nullptr, const MSKernelCallBack &after = nullptr);

+  /// \brief Obtains all input tensors of the model.
+  ///
+  /// \return The vector that includes all input tensors.
   std::vector<MSTensor> GetInputs();
+  /// \brief Obtains the input tensor of the model by name.
+  ///
+  /// \return The input tensor with the given name, if the name is not found, an invalid tensor is returned.
   inline MSTensor GetInputByTensorName(const std::string &tensor_name);

   Status InitMetrics(std::vector<Metrics *> metrics);
   std::vector<Metrics *> GetMetrics();

+  /// \brief Obtains all output tensors of the model.
+  ///
+  /// \return The vector that includes all output tensors.
   std::vector<MSTensor> GetOutputs();
+  /// \brief Obtains names of all output tensors of the model.
+  ///
+  /// \return A vector that includes names of all output tensors.
   inline std::vector<std::string> GetOutputTensorNames();
+  /// \brief Obtains the output tensor of the model by name.
+  ///
+  /// \return The output tensor with the given name, if the name is not found, an invalid tensor is returned.
   inline MSTensor GetOutputByTensorName(const std::string &tensor_name);
   inline std::vector<MSTensor> GetOutputsByNodeName(const std::string &tensor_name);

+  /// \brief Check whether the given device supports the given model type.
+  ///
+  /// \param[in] device_type Device type, options are kGPU, kAscend910, etc.
+  /// \param[in] model_type The type of model file, options are ModelType::kMindIR, ModelType::kOM.
+  ///
+  /// \return Is supported or not.
   static bool CheckModelSupport(enum DeviceType device_type, ModelType model_type);

   Status SetTrainMode(bool train);

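The declarations above imply a fixed call order: Build once, then query tensors, then Predict. A sketch under the assumption that a Graph has already been obtained (see the Serialization hunk below) and that Status values compare against kSuccess as elsewhere in this API:

#include <vector>
#include "include/api/model.h"

mindspore::Status RunOnce(const mindspore::Graph &graph,
                          const std::shared_ptr<mindspore::Context> &context) {
  mindspore::Model model;
  // Build() wraps the graph in a GraphCell, exactly as its doc comment suggests.
  auto status = model.Build(mindspore::GraphCell(graph), context);
  if (status != mindspore::kSuccess) {
    return status;
  }
  auto inputs = model.GetInputs();  // ordered as the model expects
  // ... fill the input tensors here ...
  std::vector<mindspore::MSTensor> outputs;
  return model.Predict(inputs, &outputs);  // outputs are filled in sequence
}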
@@ -27,13 +27,43 @@
 #include "include/api/dual_abi_helper.h"

 namespace mindspore {

+/// \brief The Serialization class is used to summarize methods for reading and writing model files.
 class MS_API Serialization {
  public:
+  /// \brief Loads a model file from memory buffer.
+  ///
+  /// \param[in] model_data A buffer filled by model file.
+  /// \param[in] data_size The size of the buffer.
+  /// \param[in] model_type The type of model file, options are ModelType::kMindIR, ModelType::kOM.
+  /// \param[out] graph The output parameter, an object saves graph data.
+  /// \param[in] dec_key The decryption key, key length is 16, 24, or 32.
+  /// \param[in] dec_mode The decryption mode, optional options are AES-GCM, AES-CBC.
+  ///
+  /// \return Status.
   inline static Status Load(const void *model_data, size_t data_size, ModelType model_type, Graph *graph,
                             const Key &dec_key = {}, const std::string &dec_mode = kDecModeAesGcm);

+  /// \brief Loads a model file from path, is not supported on MindSpore Lite.
+  ///
+  /// \param[in] file The path of model file.
+  /// \param[in] model_type The type of model file, options are ModelType::kMindIR, ModelType::kOM.
+  /// \param[out] graph The output parameter, an object saves graph data.
+  /// \param[in] dec_key The decryption key, key length is 16, 24, or 32.
+  /// \param[in] dec_mode The decryption mode, optional options are AES-GCM, AES-CBC.
+  ///
+  /// \return Status.
   inline static Status Load(const std::string &file, ModelType model_type, Graph *graph, const Key &dec_key = {},
                             const std::string &dec_mode = kDecModeAesGcm);

+  /// \brief Load multiple models from multiple files, MindSpore Lite does not provide this feature.
+  ///
+  /// \param[in] files The path of model files.
+  /// \param[in] model_type The type of model file, options are ModelType::kMindIR, ModelType::kOM.
+  /// \param[out] graphs The output parameter, a vector that saves graph data.
+  /// \param[in] dec_key The decryption key, key length is 16, 24, or 32.
+  /// \param[in] dec_mode The decryption mode, optional options are AES-GCM, AES-CBC.
+  ///
+  /// \return Status.
   inline static Status Load(const std::vector<std::string> &files, ModelType model_type, std::vector<Graph> *graphs,
                             const Key &dec_key = {}, const std::string &dec_mode = kDecModeAesGcm);
   static Status SetParameters(const std::map<std::string, Buffer> &parameters, Model *model);

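The usual pairing with Model, sketched with a hypothetical file name; kMindIR matches the ModelType documented above, and dec_key/dec_mode keep their defaults:

#include "include/api/graph.h"
#include "include/api/serialization.h"

mindspore::Graph LoadGraph() {
  mindspore::Graph graph;
  // "model.mindir" is a placeholder path, not from the commit.
  (void)mindspore::Serialization::Load("model.mindir", mindspore::ModelType::kMindIR, &graph);
  return graph;
}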
@@ -25,11 +25,17 @@
 #include "include/api/dual_abi_helper.h"
 #include "include/api/format.h"

+#ifndef MS_API
 #ifdef _WIN32
+#ifdef BUILDING_DLL
 #define MS_API __declspec(dllexport)
+#else
+#define MS_API __declspec(dllimport)
+#endif
 #else
 #define MS_API __attribute__((visibility("default")))
 #endif
+#endif

 namespace mindspore {
 enum ModelType : uint32_t {
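This is the conventional Windows import/export guard: a translation unit that defines BUILDING_DLL (as the library build itself presumably does) exports the symbols, a plain consumer gets dllimport, and non-Windows builds fall back to default ELF visibility. A sketch of the consumer side (hypothetical translation unit; the null comparison relies on operator==(std::nullptr_t) declared later in this header):

// Consumer code: BUILDING_DLL is not defined here, so on Windows the
// class below is declared __declspec(dllimport) and resolved against the DLL.
#include "include/api/types.h"

int main() {
  mindspore::MSTensor tensor;        // default-constructed, assumed to be null
  return tensor == nullptr ? 0 : 1;  // operator==(std::nullptr_t) from this header
}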
@@ -64,18 +70,64 @@ struct QuantParam {
 };

 class Allocator;
+/// \brief The MSTensor class defines a tensor in MindSpore.
 class MS_API MSTensor {
  public:
   class Impl;

+  /// \brief Creates a MSTensor object, whose data need to be copied before accessed by Model, must be used in pairs
+  /// with DestroyTensorPtr.
+  ///
+  /// \param[in] name The name of the MSTensor.
+  /// \param[in] type The data type of the MSTensor.
+  /// \param[in] shape The shape of the MSTensor.
+  /// \param[in] data The data pointer that points to allocated memory.
+  /// \param[in] data_len The length of the memory, in bytes.
+  ///
+  /// \return A pointer of MSTensor.
   static inline MSTensor *CreateTensor(const std::string &name, DataType type, const std::vector<int64_t> &shape,
                                        const void *data, size_t data_len) noexcept;
+  /// \brief Creates a MSTensor object, whose data can be directly accessed by Model, must be used in pairs with
+  /// DestroyTensorPtr.
+  ///
+  /// \param[in] name The name of the MSTensor.
+  /// \param[in] type The data type of the MSTensor.
+  /// \param[in] shape The shape of the MSTensor.
+  /// \param[in] data The data pointer that points to allocated memory.
+  /// \param[in] data_len The length of the memory, in bytes.
+  ///
+  /// \return A pointer of MSTensor.
   static inline MSTensor *CreateRefTensor(const std::string &name, DataType type, const std::vector<int64_t> &shape,
                                           const void *data, size_t data_len) noexcept;
+  /// \brief Creates a MSTensor object, whose device data can be directly accessed by Model, must be used in pairs with
+  /// DestroyTensorPtr.
+  ///
+  /// \param[in] name The name of the MSTensor.
+  /// \param[in] type The data type of the MSTensor.
+  /// \param[in] shape The shape of the MSTensor.
+  /// \param[in] data The data pointer that points to device memory.
+  /// \param[in] data_len The length of the memory, in bytes.
+  ///
+  /// \return A pointer of MSTensor.
   static inline MSTensor *CreateDevTensor(const std::string &name, DataType type, const std::vector<int64_t> &shape,
                                           const void *data, size_t data_len) noexcept;
+  /// \brief Create a string type MSTensor object whose data can be accessed by Model only after being copied, must be
+  /// used in pair with DestroyTensorPtr.
+  ///
+  /// \param[in] name The name of the MSTensor.
+  /// \param[in] str A vector container containing several strings.
+  ///
+  /// \return A pointer of MSTensor.
   static inline MSTensor *StringsToTensor(const std::string &name, const std::vector<std::string> &str);
+  /// \brief Parse the string type MSTensor object into strings.
+  ///
+  /// \param[in] tensor A MSTensor object.
+  ///
+  /// \return A vector container containing several strings.
   static inline std::vector<std::string> TensorToStrings(const MSTensor &tensor);
+  /// \brief Destroy an object created by Clone, StringsToTensor, CreateRefTensor, CreateDevTensor or CreateTensor. Do
+  /// not use it to destroy MSTensor from other sources.
+  ///
+  /// \param[in] tensor A MSTensor object.
   static void DestroyTensorPtr(MSTensor *tensor) noexcept;

   MSTensor();
@@ -85,19 +137,51 @@ class MS_API MSTensor {
   explicit MSTensor(std::nullptr_t);
   ~MSTensor();

+  /// \brief Obtains the name of the MSTensor.
+  ///
+  /// \return The name of the MSTensor.
   inline std::string Name() const;
+  /// \brief Obtains the data type of the MSTensor.
+  ///
+  /// \return The data type of the MSTensor.
   enum DataType DataType() const;
+  /// \brief Obtains the shape of the MSTensor.
+  ///
+  /// \return The shape of the MSTensor.
   const std::vector<int64_t> &Shape() const;
+  /// \brief Obtains the number of elements of the MSTensor.
+  ///
+  /// \return The number of elements of the MSTensor.
   int64_t ElementNum() const;

+  /// \brief Obtains a shared pointer to the copy of data of the MSTensor. The data can be read on host.
+  ///
+  /// \return A shared pointer to the copy of data of the MSTensor.
   std::shared_ptr<const void> Data() const;
+  /// \brief Obtains the pointer to the data of the MSTensor. If the MSTensor is a device tensor, the data cannot be
+  /// accessed directly on host.
+  ///
+  /// \return A pointer to the data of the MSTensor.
   void *MutableData();
+  /// \brief Obtains the length of the data of the MSTensor, in bytes.
+  ///
+  /// \return The length of the data of the MSTensor, in bytes.
   size_t DataSize() const;

+  /// \brief Gets the boolean value that indicates whether the memory of MSTensor is on device.
+  ///
+  /// \return The boolean value that indicates whether the memory of MSTensor is on device.
   bool IsDevice() const;

+  /// \brief Gets a deep copy of the MSTensor, must be used in pair with DestroyTensorPtr.
+  ///
+  /// \return A pointer points to a deep copy of the MSTensor.
   MSTensor *Clone() const;
+  /// \brief Gets the boolean value that indicates whether the MSTensor is valid.
+  ///
+  /// \return The boolean value that indicates whether the MSTensor is valid.
   bool operator==(std::nullptr_t) const;
+  /// \brief Gets the boolean value that indicates whether the MSTensor is valid.
+  ///
+  /// \return The boolean value that indicates whether the MSTensor is valid.
   bool operator!=(std::nullptr_t) const;
   bool operator==(const MSTensor &tensor) const;

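Because the factory functions return raw pointers, the doc comments insist on pairing every CreateTensor, CreateRefTensor, CreateDevTensor, StringsToTensor, or Clone call with DestroyTensorPtr. A minimal sketch of that ownership contract (tensor name and values are illustrative):

#include <vector>
#include "include/api/types.h"

void TensorRoundTrip() {
  const std::vector<int64_t> shape = {2, 2};
  const float data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  // CreateTensor copies the buffer, so `data` may go out of scope afterwards.
  auto *tensor = mindspore::MSTensor::CreateTensor("t", mindspore::DataType::kNumberTypeFloat32,
                                                   shape, data, sizeof(data));
  if (tensor == nullptr) {
    return;
  }
  auto *copy = tensor->Clone();  // deep copy, same destroy contract
  mindspore::MSTensor::DestroyTensorPtr(tensor);
  mindspore::MSTensor::DestroyTensorPtr(copy);
}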
@@ -23,6 +23,7 @@ from itertools import repeat, zip_longest
 from collections import deque
 from collections.abc import Iterable
 import numpy as np
+from mindspore import context
 from mindspore import log as logger
 from mindspore.common import dtype as mstype
 from mindspore._c_expression import Tensor_
@@ -846,6 +847,10 @@ class Validator:
         """Returns an empty Tensor."""
         return Tensor_(dtype, shape)

+    @staticmethod
+    def check_type_support(dtype, device, supported_dtypes):
+        return dtype in supported_dtypes or not context.get_context('device_target') == device
+

 def check_input_format(input_param):
     """Judge input format."""
@@ -21,7 +21,7 @@ from . import model


 def estimate_ops(json_str: str):
-    """Call costmodel to estimate ops."""
+    """Call cost model to estimate ops."""
     try:
         json_obj = json.loads(json_str)
         graph_descs = json_obj["graph_desc"]
@@ -38,7 +38,7 @@ def estimate_ops(json_str: str):


 def estimate_calulation_amount(json_str: str):
-    """Call costmodel to estimate calculation amount of op."""
+    """Call cost model to estimate calculation amount of op."""
     try:
         graph_desc = json.loads(json_str)
         comp = model.load_composite(graph_desc)
@@ -24,7 +24,7 @@ from . import utils


 def split_with_json(json_str, flags_str):
-    """Call costmodel to split GraphKernel"""
+    """Call cost model to split GraphKernel"""
     try:
         graph_desc = json.loads(json_str)
         flags = json.loads(flags_str)
@@ -50,11 +50,6 @@ def _compile_akg_task_gpu(json_strs, attrs):
         if not res:
             raise ValueError("Compile error, args: {}! build attrs: {}".format(json_str, attrs))

-        pid_path = os.path.realpath("./cuda_meta_" + str(os.getpid()))
-        if os.path.exists(pid_path):
-            copy_json(pid_path, os.path.realpath("./cuda_meta_" + str(os.getppid())))
-            shutil.rmtree(pid_path)
-

 def _compile_akg_task_ascend(json_strs, attrs):
     """
@@ -159,12 +159,17 @@ def resolve_symbol(namespace, symbol):
         if getattr(resolve_, "__hash__") is None:
             return resolve_

+        # Raise NotImplementedError when parsing the numpy methods, but not the numpy constant.
+        if namespace.name == "numpy" and isinstance(resolve_, (types.FunctionType, types.MethodType, types.ModuleType)):
+            raise NotImplementedError(
+                f"MindSpore does not support to use the numpy methods in the function construct with the graph mode.")
+
         # If need trope the obj
         if resolve_ in convert_object_map:
             resolve_ = convert_object_map.get(resolve_)
             logger.debug("convert resolve = %r", resolve_)
             if resolve_ == NO_IMPLEMENT:
-                raise NotImplementedError(f"Not support for `{symbol}`")
+                raise NotImplementedError(f"Not support for `{symbol}`.")
     except Exception as e:
         if isinstance(e, NotImplementedError):
             raise e
@@ -1312,7 +1312,8 @@ def sum(x, axis=None, dtype=None, keepdims=False, initial=None):  # pylint: disab
     >>> print(input_x.sum(axis=1))
     [10. 35.]
     """
-    dtype = x.dtype if dtype is None else dtype
+    input_x = x.astype(mstype.int32) if x.dtype == mstype.bool_ else x
+    dtype = input_x.dtype if dtype is None else dtype
     if not isinstance(keepdims, int):
         const_utils.raise_type_error("integer argument expected")
     if initial is not None and not isinstance(initial, (int, float, bool)):
@@ -1322,14 +1323,14 @@ def sum(x, axis=None, dtype=None, keepdims=False, initial=None):  # pylint: disab
     else:
         axis = check_and_canonicalize_axes(axis, x.ndim)

-    if x.dtype == mstype.bool_:
-        x = x.astype("int32")
+    if not check_type_support(input_x.dtype, 'GPU', (mstype.float64, mstype.float32, mstype.float16)):
+        input_x = input_x.astype(mstype.float32)
     if 0 in x.shape:
         x = const_utils.make_tensor([0], x.dtype)
     if keepdims:
-        res = _reduce_sum_keepdims(x, axis)
+        res = _reduce_sum_keepdims(input_x, axis)
     else:
-        res = _reduce_sum_default(x, axis)
+        res = _reduce_sum_default(input_x, axis)
     if initial is not None:
         res += initial
     return res.astype(dtype)
@@ -1648,6 +1649,7 @@ get_log2_size = constexpr(validator.get_log2_size)
 check_axis_type = constexpr(validator.check_axis_type)
 check_and_canonicalize_axes = constexpr(validator.check_and_canonicalize_axes)
 empty_compile = constexpr(validator.empty_compile)
+check_type_support = constexpr(validator.check_type_support)


 def tensor_bool(x):
@@ -325,7 +325,7 @@ endif()
 set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
 set_property(SOURCE "pipeline/jit/init.cc" PROPERTY
              COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_PIPELINE)
-pybind11_add_module(_c_expression "pipeline/jit/init.cc")
+pybind11_add_module(_c_expression NO_EXTRAS "pipeline/jit/init.cc")

 MESSAGE(STATUS "operation system is ${CMAKE_SYSTEM}")
 if(CMAKE_SYSTEM_NAME MATCHES "Linux")
@@ -35,6 +35,7 @@ if(ENABLE_CPU)
         "cpu/fl/*.cc"
         "cpu/ps/*.cc"
         "cpu/quantum/*.cc"
+        "cpu/pyfunc/*.cc"
     )

     if(NOT ENABLE_MPI)
@@ -16,6 +16,11 @@

 #include "backend/kernel_compiler/akg/akg_kernel_build.h"

+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+
 #include <algorithm>
 #include <map>
 #include <memory>
@@ -23,6 +28,7 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+#include <iostream>
 #include "nlohmann/json.hpp"
 #include "ir/dtype.h"
 #include "ir/func_graph.h"
@@ -34,9 +40,320 @@

 namespace mindspore {
 namespace kernel {
+
+#define INIT_SET_FROM_2D_ARRAY(set_var, list_idx) \
+  std::set<size_t> set_var(kernel_lists_[list_idx], kernel_lists_[list_idx] + kernel_lists_[list_idx][kMaxKernelNum_]);
+
+#define LIST_BEGIN(list_idx) kernel_lists_[list_idx]
+#define LIST_END(list_idx) (kernel_lists_[list_idx] + kernel_lists_[list_idx][kMaxKernelNum_])
+#define RESET_LIST_SIZE(list_idx, val) kernel_lists_[list_idx][kMaxKernelNum_] = val
+
+#define INCREASE_LIST_SIZE(list_idx, val) kernel_lists_[list_idx][kMaxKernelNum_] += val
+
+constexpr int32_t PROCESS_NUM = 16;
+constexpr int32_t TIME_OUT = 300;
+
+bool AkgKernelPool::LockMng::TryLock() {
+  // Try to lock 100 times. Return errno if lock unsuccessfully
+  uint32_t trial = 100;
+
+  int32_t ret = -1;
+  while (trial > 0) {
+    ret = lockf(fd_, F_TLOCK, 0);
+    if (ret == 0 || (errno != EACCES && errno != EAGAIN)) {
+      break;
+    }
+
+    trial--;
+    usleep(5000);
+  }
+
+  if (ret == -1) {
+    MS_LOG(ERROR) << "Failed to acquire the lock, errno:" << strerror(errno) << ".";
+    return false;
+  }
+
+  return true;
+}
+
+void AkgKernelPool::LockMng::Unlock() {
+  auto ret = lockf(fd_, F_ULOCK, 0);
+  if (ret == -1) {
+    MS_LOG(ERROR) << "Failed to release the lock, errno:" << strerror(errno);
+  }
+}
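LockMng is an RAII wrapper over a POSIX advisory record lock. The essential protocol, isolated into a standalone sketch (file name hypothetical): F_TLOCK fails fast with EACCES or EAGAIN when another process holds the lock, which is why TryLock() above retries with a short sleep instead of blocking on F_LOCK.

#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

bool WithFileLock(const char *path) {
  int fd = open(path, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
  if (fd == -1) {
    return false;
  }
  if (lockf(fd, F_TLOCK, 0) != 0) {  // non-blocking attempt, as in TryLock()
    close(fd);
    return false;
  }
  // ... critical section: read/update the shared kernel lists ...
  (void)lockf(fd, F_ULOCK, 0);  // as in Unlock()
  close(fd);
  return true;
}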
+
+std::string AkgKernelPool::GetCurrentPath() {
+  char cwd[PATH_MAX];
+  char *ret = getcwd(cwd, sizeof(cwd));
+  if (ret == nullptr) {
+    MS_LOG(ERROR) << "Get current work directory failed, errno:" << strerror(errno);
+    return "";
+  }
+
+  char abspath[PATH_MAX];
+  char *res = realpath(cwd, abspath);
+  if (res == nullptr) {
+    MS_LOG(ERROR) << "Change to realpath failed, errno:" << strerror(errno);
+    return "";
+  }
+
+  return std::string(abspath);
+}
+
+void *AkgKernelPool::CreateSharedMem(const std::string &path) {
+  is_creator_ = false;
+
+  auto hash_id = std::hash<std::string>()(path);
+  auto key_id = static_cast<key_t>(hash_id);
+  auto mem_size = sizeof(size_t) * kListNum_ * (kMaxKernelNum_ + 1) + 512;
+
+  {
+    LockMng lock(fd_);
+    if (!lock.locked_) {
+      MS_LOG(ERROR) << "Failed to acquire lock.";
+      return nullptr;
+    }
+
+    // check if the shared memory exists or not.
+    // remove shared memory if exists and the nattach is 0
+    struct shmid_ds buf;
+    auto id = shmget(key_id, mem_size, 0);
+    if (id != -1) {
+      auto ret = shmctl(id, IPC_STAT, &buf);
+      if (ret == -1) {
+        MS_LOG(ERROR) << "Failed to get the info of shared memory, errno:" << strerror(errno);
+        return nullptr;
+      }
+
+      if (buf.shm_nattch == 0) {
+        ret = shmctl(id, IPC_RMID, nullptr);
+        if (ret < 0) {
+          MS_LOG(EXCEPTION) << "Release shared_mem failed, errno:" << strerror(errno);
+        }
+      }
+    }
+  }
+
+  LockMng lock(fd_);
+  if (!lock.locked_) {
+    MS_LOG(ERROR) << "Failed to acquire lock.";
+    return nullptr;
+  }
+
+  shm_id_ = shmget(key_id, mem_size, IPC_CREAT | IPC_EXCL | 0600);
+  if (shm_id_ == -1) {
+    if (errno == EEXIST) {
+      shm_id_ = shmget(key_id, mem_size, 0);
+    }
+
+    if (shm_id_ == -1) {
+      MS_LOG(ERROR) << "Create shared_mem failed, errno:" << strerror(errno);
+      return nullptr;
+    }
+  } else {
+    is_creator_ = true;
+  }
+
+  auto local_addr = shmat(shm_id_, nullptr, 0);
+  if (local_addr == reinterpret_cast<void *>(-1)) {
+    MS_LOG(ERROR) << "Attach to shared_mem failed, errno:" << strerror(errno);
+    return nullptr;
+  }
+
+  if (is_creator_) {
+    (void)memset(local_addr, 0, mem_size);
+  }
+
+  return local_addr;
+}
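CreateSharedMem combines several System V shared-memory calls. Their lifecycle, reduced to a sketch: IPC_CREAT | IPC_EXCL distinguishes the creator (who must zero the segment) from later attachers, and shmctl(IPC_RMID) marks a stale segment for removal once its attach count drops to zero.

#include <cstring>
#include <sys/ipc.h>
#include <sys/shm.h>

void *AttachSegment(key_t key, size_t size, bool *is_creator) {
  int id = shmget(key, size, IPC_CREAT | IPC_EXCL | 0600);
  *is_creator = (id != -1);
  if (id == -1) {
    id = shmget(key, size, 0);  // another process created it first
    if (id == -1) return nullptr;
  }
  void *addr = shmat(id, nullptr, 0);
  if (addr == reinterpret_cast<void *>(-1)) return nullptr;
  if (*is_creator) (void)std::memset(addr, 0, size);  // creator initializes
  return addr;
}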
+
+int32_t AkgKernelPool::Init(const std::vector<JsonNodePair> &build_args) {
+  auto cp = GetCurrentPath();
+  if (cp.empty()) {
+    return -1;
+  }
+
+  fd_ = open(kKeyName_, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
+  if (fd_ == -1) {
+    MS_LOG(ERROR) << "open file <" << kKeyName_ << "> failed, errno:" << strerror(errno);
+    return -1;
+  }
+
+  auto addr = CreateSharedMem(cp);
+  if (addr == nullptr) {
+    return -1;
+  }
+
+  InitKernelLists(addr);
+
+  auto ret = AddKernels(build_args);
+  if (ret != 0) {
+    MS_LOG(ERROR) << "AkgKernelPool AddKernels failed.";
+    return -1;
+  }
+
+  return 0;
+}
+
+AkgKernelPool::~AkgKernelPool() {
+  // Detach shared memory
+  auto ret = shmdt(reinterpret_cast<void *>(kernel_lists_[0]));
+  if (ret < 0) {
+    MS_LOG(EXCEPTION) << "Shared_mem detach failed, errno:" << strerror(errno);
+  }
+
+  // Release shared memory
+  if (is_creator_) {
+    ret = shmctl(shm_id_, IPC_RMID, nullptr);
+    if (ret < 0) {
+      MS_LOG(EXCEPTION) << "Release shared_mem failed, errno:" << strerror(errno);
+    }
+  }
+
+  // Close key file
+  if (fd_ != -1) {
+    (void)close(fd_);
+  }
+}
+
+int32_t AkgKernelPool::AddKernels(const std::vector<JsonNodePair> &build_args) {
+  LockMng lock(fd_);
+  if (!lock.locked_) {
+    MS_LOG(ERROR) << "Failed to acquire lock.";
+    return -1;
+  }
+
+  INIT_SET_FROM_2D_ARRAY(todo_list, kToDoIdx_);
+  INIT_SET_FROM_2D_ARRAY(doing_list, kDoingIdx_);
+  INIT_SET_FROM_2D_ARRAY(done_list, kDoneIdx_);
+
+  for (const auto &[json_generator, anf_node] : build_args) {
+    MS_EXCEPTION_IF_NULL(anf_node);
+    auto kernel_name = json_generator.kernel_name();
+
+    auto hash_id = std::hash<std::string>()(kernel_name);
+    if (self_kernel_ids_.count(hash_id) != 0) {
+      MS_LOG(ERROR) << "Duplicated hash_id in list.";
+      return -1;
+    }
+
+    self_kernel_ids_.emplace(hash_id);
+  }
+
+  std::set<size_t> diff_from_todo;
+  std::set<size_t> diff_from_doing;
+  std::set<size_t> diff_from_done;
+
+  // add the unique kernel only once, so need to check if it exists in todo_list, doing_list, or done_list
+  std::set_difference(self_kernel_ids_.begin(), self_kernel_ids_.end(), todo_list.begin(), todo_list.end(),
+                      std::inserter(diff_from_todo, diff_from_todo.begin()));
+  std::set_difference(diff_from_todo.begin(), diff_from_todo.end(), doing_list.begin(), doing_list.end(),
+                      std::inserter(diff_from_doing, diff_from_doing.begin()));
+  std::set_difference(diff_from_doing.begin(), diff_from_doing.end(), done_list.begin(), done_list.end(),
+                      std::inserter(diff_from_done, diff_from_done.begin()));
+
+  auto new_kernel_size = diff_from_done.size();
+  if (new_kernel_size + todo_list.size() > static_cast<size_t>(kMaxKernelNum_)) {
+    MS_LOG(ERROR) << "The size of kernels is " << new_kernel_size << ", while the left space of the pool is "
+                  << kMaxKernelNum_ - todo_list.size();
+    return -1;
+  }
+
+  std::copy(diff_from_done.begin(), diff_from_done.end(), LIST_END(kToDoIdx_));
+  INCREASE_LIST_SIZE(kToDoIdx_, new_kernel_size);
+
+  return 0;
+}
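AddKernels keeps the shared to-do list duplicate-free by chaining std::set_difference across the three state lists; only ids that appear in none of them are appended. The idea in isolation:

#include <algorithm>
#include <iterator>
#include <set>

// Keep only the ids present in none of the three state lists.
std::set<size_t> NewIds(const std::set<size_t> &mine, const std::set<size_t> &todo,
                        const std::set<size_t> &doing, const std::set<size_t> &done) {
  std::set<size_t> a, b, c;
  std::set_difference(mine.begin(), mine.end(), todo.begin(), todo.end(),
                      std::inserter(a, a.begin()));
  std::set_difference(a.begin(), a.end(), doing.begin(), doing.end(),
                      std::inserter(b, b.begin()));
  std::set_difference(b.begin(), b.end(), done.begin(), done.end(),
                      std::inserter(c, c.begin()));
  return c;
}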
+
+int32_t AkgKernelPool::FetchKernels(std::set<size_t> *out) {
+  LockMng lock(fd_);
+  if (!lock.locked_) {
+    MS_LOG(ERROR) << "Failed to acquire lock.";
+    return -1;
+  }
+
+  std::set<size_t> left_in_todo_list;
+
+  // filter out kernels which belong to other processes
+  auto FilterBySelfList = [&left_in_todo_list, &out, this](size_t id) {
+    if (this->self_kernel_ids_.count(id) != 0) {
+      out->emplace(id);
+    } else {
+      left_in_todo_list.emplace(id);
+    }
+  };
+
+  std::for_each(LIST_BEGIN(kToDoIdx_), LIST_END(kToDoIdx_), FilterBySelfList);
+
+  std::copy(out->begin(), out->end(), LIST_END(kDoingIdx_));
+  INCREASE_LIST_SIZE(kDoingIdx_, out->size());
+
+  std::copy(left_in_todo_list.begin(), left_in_todo_list.end(), LIST_BEGIN(kToDoIdx_));
+  RESET_LIST_SIZE(kToDoIdx_, left_in_todo_list.size());
+
+  return 0;
+}
+
+int32_t AkgKernelPool::UpdateAndWait(const std::set<size_t> &ids) {
+  if (!ids.empty()) {
+    LockMng lock(fd_);
+    if (!lock.locked_) {
+      MS_LOG(ERROR) << "Failed to acquire lock.";
+      return -1;
+    }
+
+    // update the state of finished kernels to `done`
+    std::copy(ids.begin(), ids.end(), LIST_END(kDoneIdx_));
+    INCREASE_LIST_SIZE(kDoneIdx_, ids.size());
+
+    // delete the finished kernels from doing_list
+    std::vector<size_t> left_in_doing_list;
+    INIT_SET_FROM_2D_ARRAY(doing_list, kDoingIdx_);
+    std::set_difference(doing_list.begin(), doing_list.end(), ids.begin(), ids.end(),
+                        std::inserter(left_in_doing_list, left_in_doing_list.begin()));
+
+    std::copy(left_in_doing_list.begin(), left_in_doing_list.end(), LIST_BEGIN(kDoingIdx_));
+    RESET_LIST_SIZE(kDoingIdx_, left_in_doing_list.size());
+  }
+
+  auto ret = Wait();
+  if (ret != 0) {
+    MS_LOG(ERROR) << "AkgKernelPool Wait failed.";
+    return -1;
+  }
+
+  return 0;
+}
+
+int32_t AkgKernelPool::Wait() {
+  // wait until all the kernels which belong to this process finish compiling
+  uint32_t trials = 1000;
+
+  while (trials > 0) {
+    {
+      LockMng lock(fd_);
+      if (!lock.locked_) {
+        MS_LOG(ERROR) << "Failed to acquire lock.";
+        return -1;
+      }
+
+      INIT_SET_FROM_2D_ARRAY(done_list, kDoneIdx_);
+
+      if (std::all_of(self_kernel_ids_.begin(), self_kernel_ids_.end(),
+                      [&done_list](size_t id) { return done_list.count(id) != 0; })) {
+        return 0;
+      }
+    }
+
+    usleep(1000000);
+    trials--;
+  }
+
+  MS_LOG(ERROR) << "Timed out while waiting for kernel compiling.";
+  return -1;
+}
+
 std::vector<std::string> AkgKernelBuilder::GetNotCachedKernelJsons(const std::vector<JsonNodePair> &build_args) {
   // Remove cached nodes, gather unique nodes, and collect repeated nodes which need postprocess.
   std::vector<std::string> jsons;
@@ -66,6 +383,31 @@ std::vector<std::string> AkgKernelBuilder::GetNotCachedKernelJsons(const std::vector<JsonNodePair> &build_args)
   return jsons;
 }

+std::vector<JsonNodePair> AkgKernelBuilder::GetNotCachedKernels(const std::vector<JsonNodePair> &build_args) {
+  std::unordered_set<std::string> kernel_name_set;
+  std::vector<JsonNodePair> new_build_args;
+  for (const auto &[json_generator, anf_node] : build_args) {
+    MS_EXCEPTION_IF_NULL(anf_node);
+    auto kernel_name = json_generator.kernel_name();
+
+    auto cached_kernel_pack = AkgSearchCache(kernel_name);
+    if (cached_kernel_pack != nullptr) {
+      MS_LOG(DEBUG) << "Use cached kernel, kernel_name[" << kernel_name << "], fullname_with_scope["
+                    << anf_node->fullname_with_scope() << "].";
+      AkgSetKernelMod(cached_kernel_pack, json_generator, anf_node);
+      continue;
+    }
+
+    if (kernel_name_set.count(kernel_name) != 0) {
+      repeat_nodes_.push_back({json_generator, anf_node});
+      continue;
+    }
+    kernel_name_set.insert(kernel_name);
+    new_build_args.push_back({json_generator, anf_node});
+  }
+  return new_build_args;
+}
+
 bool AkgKernelBuilder::InsertToCache(const std::vector<JsonNodePair> &build_args) {
   for (const auto &[json_generator, anf_node] : build_args) {
     auto kernel_name = json_generator.kernel_name();
@ -97,32 +439,77 @@ bool AkgKernelBuilder::HandleRepeatNodes() {
|
|||
return true;
|
||||
}
|
||||
|
||||
std::vector<std::string> AkgKernelBuilder::GetKernelJsonsByHashId(const std::vector<JsonNodePair> &build_args,
|
||||
std::set<size_t> fetched_ids) {
|
||||
std::vector<std::string> jsons;
|
||||
for (const auto &[json_generator, anf_node] : build_args) {
|
||||
MS_EXCEPTION_IF_NULL(anf_node);
|
||||
auto kernel_name = json_generator.kernel_name();
|
||||
|
||||
auto hash_id = std::hash<std::string>()(kernel_name);
|
||||
|
||||
if (fetched_ids.count(hash_id) == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto kernel_json = json_generator.kernel_json_str();
|
||||
AkgSaveJsonInfo(kernel_name, kernel_json);
|
||||
jsons.push_back(kernel_json);
|
||||
}
|
||||
return jsons;
|
||||
}
|
||||
|
||||
bool AkgKernelBuilder::AkgOpParallelBuild(const std::vector<JsonNodePair> &build_args) {
|
||||
repeat_nodes_.clear();
|
||||
auto jsons = GetNotCachedKernelJsons(build_args);
|
||||
if (jsons.empty()) {
|
||||
auto new_build_args = GetNotCachedKernels(build_args);
|
||||
if (new_build_args.empty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
auto client = GetClient();
|
||||
MS_EXCEPTION_IF_NULL(client);
|
||||
if (!client->AkgStart(PROCESS_NUM, TIME_OUT)) {
|
||||
MS_LOG(ERROR) << "Akg start failed.";
|
||||
AkgKernelPool kp;
|
||||
auto ret = kp.Init(new_build_args);
|
||||
if (ret != 0) {
|
||||
MS_LOG(ERROR) << "AkgKernelPool init failed.";
|
||||
return false;
|
||||
}
|
||||
auto attrs = CollectBuildAttrs();
|
||||
if (!attrs.empty() && !client->AkgSendAttr(attrs)) {
|
||||
MS_LOG(ERROR) << "Akg send attr failed.";
|
||||
|
||||
std::set<size_t> fetched_ids;
|
||||
ret = kp.FetchKernels(&fetched_ids);
|
||||
if (ret != 0) {
|
||||
MS_LOG(ERROR) << "AkgKernelPool FetchKernels failed.";
|
||||
return false;
|
||||
}
|
||||
if (!client->AkgSendData(jsons)) {
|
||||
MS_LOG(ERROR) << "Akg send data failed.";
|
||||
return false;
|
||||
}
|
||||
if (!client->AkgWait()) {
|
||||
MS_LOG(ERROR) << "Akg compile failed.";
|
||||
|
||||
if (!fetched_ids.empty()) {
|
||||
auto jsons = GetKernelJsonsByHashId(new_build_args, fetched_ids);
|
||||
|
||||
auto client = GetClient();
|
||||
MS_EXCEPTION_IF_NULL(client);
|
||||
if (!client->AkgStart(PROCESS_NUM, TIME_OUT)) {
|
||||
MS_LOG(ERROR) << "Akg start failed.";
|
||||
return false;
|
||||
}
|
||||
auto attrs = CollectBuildAttrs();
|
||||
if (!attrs.empty() && !client->AkgSendAttr(attrs)) {
|
||||
MS_LOG(ERROR) << "Akg send attr failed.";
|
||||
return false;
|
||||
}
|
||||
if (!client->AkgSendData(jsons)) {
|
||||
MS_LOG(ERROR) << "Akg send data failed.";
|
||||
return false;
|
||||
}
|
||||
if (!client->AkgWait()) {
|
||||
MS_LOG(ERROR) << "Akg compile failed.";
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
ret = kp.UpdateAndWait(fetched_ids);
|
||||
if (ret != 0) {
|
||||
MS_LOG(ERROR) << "AkgKernelPool UpdateAndWait failed.";
|
||||
return false;
|
||||
}
|
||||
|
||||
// All unique done here, cache them and set kernel.
|
||||
if (!InsertToCache(build_args)) {
|
||||
MS_LOG(ERROR) << "Insert cache failed.";

@ -17,10 +17,13 @@
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_BUILD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_BUILD_H_

#include <sys/shm.h>

#include <string>
#include <utility>
#include <vector>
#include <map>
#include <set>
#include "ir/anf.h"
#include "backend/kernel_compiler/kernel.h"
#include "backend/session/kernel_build_client.h"

@ -45,12 +48,84 @@ class AkgKernelBuilder {

private:
std::vector<std::string> GetNotCachedKernelJsons(const std::vector<JsonNodePair> &build_args);
std::vector<JsonNodePair> GetNotCachedKernels(const std::vector<JsonNodePair> &build_args);
std::vector<std::string> GetKernelJsonsByHashId(const std::vector<JsonNodePair> &build_args,
std::set<size_t> fetched_ids);
bool InsertToCache(const std::vector<JsonNodePair> &build_args);
bool HandleRepeatNodes();
bool AkgOpParallelBuild(const std::vector<JsonNodePair> &build_args);
std::vector<JsonNodePair> repeat_nodes_;
std::string CollectBuildAttrs();
};

class AkgKernelPool {
public:
class LockMng {
public:
explicit LockMng(int32_t fd) {
fd_ = fd;
locked_ = TryLock();
}

virtual ~LockMng() {
if (locked_) {
Unlock();
}
}

bool locked_{false};

private:
bool TryLock();
void Unlock();

int32_t fd_{-1};
};

public:
AkgKernelPool() = default;
virtual ~AkgKernelPool();

int32_t Init(const std::vector<JsonNodePair> &build_args);
int32_t FetchKernels(std::set<size_t> *out);
int32_t UpdateAndWait(const std::set<size_t> &ids);

constexpr inline static size_t kMaxKernelNum_{1000};
constexpr inline static key_t kSharedMemKey_{0x57565845};

// allocate memory for todo_list, doing_list, done_list
constexpr inline static size_t kListNum_{3};

constexpr inline static auto kKeyName_ = "./akg_build_tmp.key";

constexpr inline static int32_t kToDoIdx_ = 0;
constexpr inline static int32_t kDoingIdx_ = 1;
constexpr inline static int32_t kDoneIdx_ = 2;

private:
void *CreateSharedMem(const std::string &path);
std::string GetCurrentPath();

inline void InitKernelLists(void *addr) {
kernel_lists_[kToDoIdx_] = reinterpret_cast<size_t *>(addr);
kernel_lists_[kDoingIdx_] = kernel_lists_[kToDoIdx_] + kMaxKernelNum_ + 1;
kernel_lists_[kDoneIdx_] = kernel_lists_[kDoingIdx_] + kMaxKernelNum_ + 1;
}

int32_t AddKernels(const std::vector<JsonNodePair> &kernel_jsons);
int32_t Wait();

int32_t shm_id_{-1};
bool is_creator_{false};
int32_t fd_{-1};

// includes 3 lists: todo_list, doing_list, done_list.
// each list has kMaxKernelNum_ + 1 elements, and the count of elements in each list
// is stored in kernel_lists_[xx][kMaxKernelNum_]
size_t *kernel_lists_[kListNum_]{nullptr, nullptr, nullptr};

std::set<size_t> self_kernel_ids_;
};
} // namespace kernel
} // namespace mindspore
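TryLock() and Unlock() are only declared in this header. One plausible implementation over the pool's fd_ uses POSIX fcntl record locks; the sketch below is under that assumption and is not the committed code.

#include <fcntl.h>

// Sketch only: advisory whole-file write lock on fd_.
bool AkgKernelPool::LockMng::TryLock() {
  struct flock lock {};
  lock.l_type = F_WRLCK;    // exclusive lock
  lock.l_whence = SEEK_SET;
  lock.l_start = 0;
  lock.l_len = 0;           // 0 means "lock the whole file"
  return fcntl(fd_, F_SETLK, &lock) != -1;  // non-blocking attempt
}

void AkgKernelPool::LockMng::Unlock() {
  struct flock lock {};
  lock.l_type = F_UNLCK;
  lock.l_whence = SEEK_SET;
  lock.l_start = 0;
  lock.l_len = 0;
  (void)fcntl(fd_, F_SETLK, &lock);
}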

@ -44,8 +44,10 @@ KernelPackPtr AkgAscendKernelBuilder::AkgInsertCache(const std::string &kernel_n
void AkgAscendKernelBuilder::AkgSetKernelMod(const KernelPackPtr &kernel_pack,
const AkgKernelJsonGenerator &json_generator, const AnfNodePtr &anf_node) {
auto kernel_mod_ptr = std::make_shared<AkgKernelMod>(kernel_pack);
auto kernel_json_info = kernel_pack->kernel_json_info();
kernel_mod_ptr->SetInputSizeList(json_generator.input_size_list());
kernel_mod_ptr->SetOutputSizeList(json_generator.output_size_list());
kernel_mod_ptr->SetWorkspaceSizeList(kernel_json_info.workspaces);
AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get());
}

@ -49,7 +49,7 @@ const std::vector<size_t> &AkgKernelMod::GetOutputSizeList() const { return outp

const std::vector<size_t> &AkgKernelMod::GetWorkspaceSizeList() const { return workspace_size_list_; }

bool AkgKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
bool AkgKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
if (stream_ptr == nullptr) {
MS_LOG(ERROR) << "stream_ptr should not be nullptr.";

@ -74,6 +74,10 @@ bool AkgKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vect
[](const AddressPtr &input) -> void * { return input->addr; });
(void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(runtime_args),
[](const AddressPtr &output) -> void * { return output->addr; });
if (!workspace.empty()) {
(void)std::transform(std::begin(workspace), std::end(workspace), std::back_inserter(runtime_args),
[](const AddressPtr &addr) -> void * { return addr->addr; });
}
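// With this change runtime_args is laid out as inputs, then outputs, then any
// workspace buffers - presumably the argument order the generated akg kernel
// expects.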

rtL2Ctrl_t *l2ctrl = nullptr;
auto stream = static_cast<rtStream_t *>(stream_ptr);

@ -86,7 +90,8 @@ bool AkgKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vect
return true;
}

std::vector<TaskInfoPtr> AkgKernelMod::GenTask(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
std::vector<TaskInfoPtr> AkgKernelMod::GenTask(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, uint32_t stream_id) {
if (kernel_pack_ == nullptr) {
MS_LOG(EXCEPTION) << "kernel pack should not be nullptr.";

@ -107,6 +112,10 @@ std::vector<TaskInfoPtr> AkgKernelMod::GenTask(const std::vector<AddressPtr> &in
[](const AddressPtr &input) -> void * { return input->addr; });
(void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(output_data_addrs),
[](const AddressPtr &output) -> void * { return output->addr; });
if (!workspace.empty()) {
(void)std::transform(std::begin(workspace), std::end(workspace), std::back_inserter(workspace_addrs),
[](const AddressPtr &workspace) -> void * { return workspace->addr; });
}

uint32_t block_dim = DEFAULT_BLOCK_DIM; // default blockdim equal to 1.
auto func_stub = KernelManager::GenFuncStub(*kernel_pack_, false, &block_dim);

@ -39,8 +39,10 @@ KernelPackPtr AkgGpuKernelBuilder::AkgInsertCache(const std::string &kernel_name
void AkgGpuKernelBuilder::AkgSetKernelMod(const KernelPackPtr &kernel_pack,
const AkgKernelJsonGenerator &json_generator, const AnfNodePtr &anf_node) {
auto kernel_mod_ptr = std::make_shared<GpuKernelMod>(kernel_pack);
auto kernel_json_info = kernel_pack->kernel_json_info();
kernel_mod_ptr->SetInputSizeList(json_generator.input_size_list());
kernel_mod_ptr->SetOutputSizeList(json_generator.output_size_list());
kernel_mod_ptr->SetWorkspaceSizeList(kernel_json_info.workspaces);
AnfAlgo::SetKernelMod(kernel_mod_ptr, anf_node.get());
}

@ -92,13 +92,15 @@ void GpuKernelMod::SetInputSizeList(const std::vector<size_t> &size_list) { inpu

void GpuKernelMod::SetOutputSizeList(const std::vector<size_t> &size_list) { output_size_list_ = size_list; }

void GpuKernelMod::SetWorkspaceSizeList(const std::vector<size_t> &size_list) { workspace_size_list_ = size_list; }

const std::vector<size_t> &GpuKernelMod::GetInputSizeList() const { return input_size_list_; }

const std::vector<size_t> &GpuKernelMod::GetOutputSizeList() const { return output_size_list_; }

const std::vector<size_t> &GpuKernelMod::GetWorkspaceSizeList() const { return workspace_size_list_; }

bool GpuKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
bool GpuKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
if (stream_ptr == 0) {
MS_LOG(ERROR) << "stream_ptr should not be nullptr.";

@ -122,6 +124,10 @@ bool GpuKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vect
[](const AddressPtr &input) -> void * { return reinterpret_cast<void *>(&(input->addr)); });
(void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(runtimeargs),
[](const AddressPtr &output) -> void * { return reinterpret_cast<void *>(&(output->addr)); });
if (!workspace.empty()) {
(void)std::transform(std::begin(workspace), std::end(workspace), std::back_inserter(runtimeargs),
[](const AddressPtr &addr) -> void * { return addr->addr; });
}
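// Note the asymmetry: the input/output lambdas return &(x->addr), a pointer
// to the argument slot, which is what cuLaunchKernel's kernelParams array
// expects, while the workspace lambda returns addr->addr directly; whether
// that difference is intentional is not clear from the diff.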
result = cuLaunchKernel(kernel_addr, thread_info[0], thread_info[1], thread_info[2], thread_info[3], thread_info[4],
thread_info[5], 0, reinterpret_cast<CUstream>(stream_ptr),
reinterpret_cast<void **>(&runtimeargs[0]), 0);

@ -60,6 +60,7 @@ class GpuKernelMod : public KernelMod {

void SetInputSizeList(const std::vector<size_t> &size_list);
void SetOutputSizeList(const std::vector<size_t> &size_list);
void SetWorkspaceSizeList(const std::vector<size_t> &size_list);
const std::vector<size_t> &GetInputSizeList() const override;
const std::vector<size_t> &GetOutputSizeList() const override;
const std::vector<size_t> &GetWorkspaceSizeList() const override;

@ -141,14 +141,8 @@ FusionType GetFusionTypeByName(const std::string &name) {
return iter->first;
}

void KernelMeta::Initialize(int pid) {
if (pid == -1) {
kernel_meta_path_ = std::string(kGpuKernelMeta) + "_" + std::to_string(getpid()) + "/";
} else {
kernel_meta_path_ = std::string(kGpuKernelMeta) + "_" + std::to_string(pid) + "/";
}
// remove old kernel cache
RemoveKernelCache();
void KernelMeta::Initialize() {
kernel_meta_path_ = std::string(kGpuKernelMeta) + "/";

#if defined(_WIN32) || defined(_WIN64)
auto ret = mkdir(kernel_meta_path_.c_str());

@ -161,21 +155,6 @@ void KernelMeta::Initialize(int pid) {
initialized_ = true;
}

void KernelMeta::RemoveKernelCache() {
DIR *dir = opendir(kernel_meta_path_.c_str());
if (dir == nullptr) {
return;
}
struct dirent *entry;
while ((entry = readdir(dir)) != nullptr) {
std::string kernel_file = entry->d_name;
std::string kernel_file_realpath = kernel_meta_path_ + kernel_file;
(void)remove(kernel_file_realpath.c_str());
}
(void)closedir(dir);
(void)rmdir(kernel_meta_path_.c_str());
}

std::string KernelMeta::Search(const std::string &kernel_name) const {
if (!initialized_) {
return "";

@ -227,7 +206,7 @@ KernelPackPtr SearchCache(const std::string &kernel_name, const std::string &pro
KernelPackPtr kernel_pack = std::make_shared<KernelPack>();
// just a tmp solution.
if (!kernel_pack->ReadFromJsonFile(kernel_json, processor)) {
MS_LOG(DEBUG) << "Read cache json and bin file failed[" << kernel_json << "].";
MS_LOG(ERROR) << "Read cache json and bin file failed[" << kernel_json << "].";
return nullptr;
} else {
return kernel_pack;

@ -250,7 +229,7 @@ KernelPackPtr InsertCache(const std::string &kernel_name, const std::string &pro
(void)kernel_json.append(kernel_name).append(kJsonSuffix);
KernelPackPtr kernel_pack = std::make_shared<KernelPack>();
if (!kernel_pack->ReadFromJsonFile(kernel_json, processor)) {
MS_LOG(DEBUG) << "Read json and bin file failed[" << kernel_json << "].";
MS_LOG(ERROR) << "Read json and bin file failed[" << kernel_json << "].";
return nullptr;
}

@ -714,6 +693,9 @@ void GetFuncGraphOutputNodes(const FuncGraphPtr &func_graph, std::vector<AnfNode
for (size_t input_idx = 1; input_idx < cnode->inputs().size(); ++input_idx) {
auto input_node = cnode->input(input_idx);
MS_EXCEPTION_IF_NULL(input_node);
if (input_node->isa<CNode>() && AnfAlgo::GetInputTensorNum(input_node) == 0) {
continue;
}
output_list->push_back(AnfAlgo::VisitKernel(input_node, 0).first);
}
} else {

@ -55,8 +55,7 @@ using KernelMetaPtr = std::shared_ptr<KernelMetaInfo>;
class KernelMeta {
public:
KernelMeta() = default;
void Initialize(int pid);
void RemoveKernelCache();
void Initialize();
std::string Search(const std::string &kernel_name) const;
bool Insert(const std::string &kernel_name, const std::string &kernel_json);
std::string kernel_meta_path() const { return kernel_meta_path_; }

@ -26,46 +26,26 @@ namespace mindspore {
namespace kernel {
constexpr size_t kSizeFloat16 = sizeof(float16);
constexpr size_t kSizeFloat32 = sizeof(float);
constexpr size_t kScalarIndex = 0;
constexpr size_t kAdamWeightDecayInputSize = 9;
constexpr size_t kAdamWeightDecayOutputSize = 3;

void AdamWeightDecayCPUKernel::ParallelForAdam(const CTask &task, size_t count) {
auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
const float block_size = 128.0;
const float align_size = 16.0;
size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
std::vector<common::Task> tasks;
size_t start = 0;
size_t once_compute_size = align_size * std::ceil(count / (align_size * thread_num));
while (start < count) {
size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size);
auto block = [&, start, end]() {
task(start, end);
return common::SUCCESS;
};
tasks.emplace_back(block);
start += once_compute_size;
}
common::ThreadPool::GetInstance().SyncRun(tasks);
}

template <typename T, typename S>
void AdamWeightDecayCPUKernel::LaunchFusedAdam(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &outputs) {
auto var = reinterpret_cast<T *>(inputs[0]->addr);
auto m = reinterpret_cast<T *>(inputs[1]->addr);
auto v = reinterpret_cast<T *>(inputs[2]->addr);
auto lr = reinterpret_cast<T *>(inputs[3]->addr)[0];
auto beta1 = reinterpret_cast<T *>(inputs[4]->addr)[0];
auto beta2 = reinterpret_cast<T *>(inputs[5]->addr)[0];
auto epsilon = reinterpret_cast<T *>(inputs[6]->addr)[0];
auto decay = reinterpret_cast<T *>(inputs[7]->addr)[0];
auto gradient16 = reinterpret_cast<S *>(inputs[8]->addr);
void AdamWeightDecayCPUKernel::LaunchFusedAdam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &) {
auto var = reinterpret_cast<T *>(inputs[VAR]->addr);
auto m = reinterpret_cast<T *>(inputs[M]->addr);
auto v = reinterpret_cast<T *>(inputs[V]->addr);
auto lr = reinterpret_cast<T *>(inputs[LR]->addr)[kScalarIndex];
auto beta1 = reinterpret_cast<T *>(inputs[BETA1]->addr)[kScalarIndex];
auto beta2 = reinterpret_cast<T *>(inputs[BETA2]->addr)[kScalarIndex];
auto epsilon = reinterpret_cast<T *>(inputs[EPSILON]->addr)[kScalarIndex];
auto decay = reinterpret_cast<T *>(inputs[DECAY]->addr)[kScalarIndex];
auto gradient16 = reinterpret_cast<S *>(inputs[GRAD]->addr);
const auto beta1_minus = 1 - beta1;
const auto beta2_minus = 1 - beta2;

// multithreading
size_t lens = inputs[0]->size > 0 ? static_cast<size_t>(inputs[0]->size / sizeof(float)) : 1;
size_t lens = inputs[VAR]->size > 0 ? static_cast<size_t>(inputs[VAR]->size / sizeof(float)) : 1;
std::function<void(size_t, size_t)> task;

task = [&](size_t start, size_t end) {

@ -81,28 +61,27 @@ void AdamWeightDecayCPUKernel::LaunchFusedAdam(const std::vector<AddressPtr> &in
var[i] -= lr * update;
}
};
ParallelForAdam(task, lens);
CPUKernelUtils::ParallelFor(task, lens);
}

template <typename T>
void AdamWeightDecayCPUKernel::LaunchAdamWeightDecay(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &outputs) {
auto var = reinterpret_cast<T *>(inputs[0]->addr);
auto m = reinterpret_cast<T *>(inputs[1]->addr);
auto v = reinterpret_cast<T *>(inputs[2]->addr);
auto lr = reinterpret_cast<T *>(inputs[3]->addr)[0];
auto beta1 = reinterpret_cast<T *>(inputs[4]->addr)[0];
auto beta2 = reinterpret_cast<T *>(inputs[5]->addr)[0];
auto epsilon = reinterpret_cast<T *>(inputs[6]->addr)[0];
auto decay = reinterpret_cast<T *>(inputs[7]->addr)[0];
auto gradient = reinterpret_cast<T *>(inputs[8]->addr);
const std::vector<AddressPtr> &) {
auto var = reinterpret_cast<T *>(inputs[VAR]->addr);
auto m = reinterpret_cast<T *>(inputs[M]->addr);
auto v = reinterpret_cast<T *>(inputs[V]->addr);
auto lr = reinterpret_cast<T *>(inputs[LR]->addr)[kScalarIndex];
auto beta1 = reinterpret_cast<T *>(inputs[BETA1]->addr)[kScalarIndex];
auto beta2 = reinterpret_cast<T *>(inputs[BETA2]->addr)[kScalarIndex];
auto epsilon = reinterpret_cast<T *>(inputs[EPSILON]->addr)[kScalarIndex];
auto decay = reinterpret_cast<T *>(inputs[DECAY]->addr)[kScalarIndex];
auto gradient = reinterpret_cast<T *>(inputs[GRAD]->addr);
const auto beta1_minus = 1 - beta1;
const auto beta2_minus = 1 - beta2;

// multithreading
size_t lens = inputs[0]->size > 0 ? static_cast<size_t>(inputs[0]->size / sizeof(float)) : 1;
size_t lens = inputs[VAR]->size > 0 ? static_cast<size_t>(inputs[VAR]->size / sizeof(float)) : 1;
std::function<void(size_t, size_t)> task;

task = [&](size_t start, size_t end) {
size_t i = AdamWeightDecayFp32(var, m, v, lr, beta1, beta2, epsilon, decay, gradient, start, end);
// remaining

@ -114,14 +93,14 @@ void AdamWeightDecayCPUKernel::LaunchAdamWeightDecay(const std::vector<AddressPt
var[i] -= lr * update;
}
};
ParallelForAdam(task, lens);
CPUKernelUtils::ParallelFor(task, lens);
}
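For reference, the per-element step both launchers drive: AdamWeightDecayFp32 vectorizes the bulk and the lambdas above finish the scalar tail. The sketch below is the standard AdamWeightDecay update consistent with the beta1_minus/beta2_minus and var[i] -= lr * update lines in the diff; it is not copied from the commit.

#include <cmath>
#include <cstddef>

// Scalar sketch of one AdamWeightDecay element update over [start, end);
// parameter names mirror the launcher above.
void AdamWeightDecayScalar(float *var, float *m, float *v, const float *gradient, float lr, float beta1_minus,
                           float beta2_minus, float epsilon, float decay, size_t start, size_t end) {
  for (size_t i = start; i < end; ++i) {
    m[i] += (gradient[i] - m[i]) * beta1_minus;                // m = beta1*m + (1-beta1)*g
    v[i] += (gradient[i] * gradient[i] - v[i]) * beta2_minus;  // v = beta2*v + (1-beta2)*g^2
    float update = m[i] / (std::sqrt(v[i]) + epsilon) + decay * var[i];
    var[i] -= lr * update;
  }
}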

void AdamWeightDecayCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
std::vector<size_t> var_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
gradient_dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 8);
std::vector<size_t> var_shape = AnfAlgo::GetInputDeviceShape(kernel_node, VAR);
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, VAR);
gradient_dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, GRAD);
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != kAdamWeightDecayInputSize) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but AdamWeightDecay needs 9 inputs.";

@ -155,12 +134,12 @@ void AdamWeightDecayCPUKernel::CheckParam(const std::vector<kernel::AddressPtr>
}
size_t elem1_size = elem_num_ * kSizeFloat32;
size_t elem2_size = gradient_dtype_ == kNumberTypeFloat16 ? elem_num_ * kSizeFloat16 : elem1_size;
if (inputs[0]->size != elem1_size || inputs[1]->size != elem1_size || inputs[2]->size != elem1_size ||
inputs[8]->size != elem2_size) {
if (inputs[VAR]->size != elem1_size || inputs[M]->size != elem1_size || inputs[V]->size != elem1_size ||
inputs[GRAD]->size != elem2_size) {
MS_LOG(EXCEPTION) << "Error input data size!";
}
if (inputs[3]->size != kSizeFloat32 || inputs[4]->size != kSizeFloat32 || inputs[5]->size != kSizeFloat32 ||
inputs[6]->size != kSizeFloat32 || inputs[7]->size != kSizeFloat32) {
if (inputs[LR]->size != kSizeFloat32 || inputs[BETA1]->size != kSizeFloat32 || inputs[BETA2]->size != kSizeFloat32 ||
inputs[EPSILON]->size != kSizeFloat32 || inputs[DECAY]->size != kSizeFloat32) {
MS_LOG(EXCEPTION) << "The attribute beta, lr, epsilon and weight decay must be float!";
}
}

@ -32,7 +32,6 @@ class AdamWeightDecayCPUKernel : public CPUKernel {
const std::vector<AddressPtr> &outputs) override;

private:
void ParallelForAdam(const CTask &task, size_t count);
void CheckParam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
template <typename T, typename S>
void LaunchFusedAdam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);

@ -41,6 +40,7 @@ class AdamWeightDecayCPUKernel : public CPUKernel {
size_t elem_num_{0};
TypeId dtype_{kTypeUnknown};
TypeId gradient_dtype_{kTypeUnknown};
enum input_list_ { VAR, M, V, LR, BETA1, BETA2, EPSILON, DECAY, GRAD };
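// The enum order mirrors the operator's 9 inputs (var, m, v, lr, beta1, beta2,
// epsilon, decay, grad), so the named constants double as the input indices
// used by LaunchFusedAdam, LaunchAdamWeightDecay and CheckParam above.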
};

MS_REG_CPU_KERNEL(AdamWeightDecay,

@ -76,27 +76,10 @@ void ApplyAdagradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,

// multithreading
size_t length = inputs[0]->size / sizeof(T);
size_t max_thread_num = std::thread::hardware_concurrency();
size_t use_thread_num = length < 128 * max_thread_num ? std::ceil(length / 128.0) : max_thread_num;
std::vector<std::thread> threads;
threads.reserve(use_thread_num);
size_t start = 0;
const size_t batch_size = (length + use_thread_num - 1) / use_thread_num;

if (batch_size == 0) {
MS_LOG(EXCEPTION) << "Error occur in launch kernel";
return;
}
while (start < length) {
size_t end = (start + batch_size) > length ? length : (start + batch_size);
threads.emplace_back(
std::thread(&ApplyAdagradCPUKernel::LaunchApplyAdagrad<T *>, this, var, accum, lr, gradient, start, end));
start += batch_size;
}

for (auto &it : threads) {
it.join();
}
auto task = [this, &var, &accum, lr, gradient](size_t start, size_t end) {
LaunchApplyAdagrad(var, accum, lr, gradient, start, end);
};
CPUKernelUtils::ParallelForAutoSearch(task, length, &parallel_search_info_);
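// The lambda + ParallelForAutoSearch pair replaces the hand-rolled
// std::thread splitting above: the task runs [start, end) slices on the
// common thread pool, with parallel_search_info_ presumably caching the
// best-performing split instead of the fixed 128-element heuristic.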

// Copy result to output tensor
auto output_var = reinterpret_cast<T *>(outputs[0]->addr);

@ -13,10 +13,12 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/arithmetic_logic_cpu_kernel.h"
#include <cmath>
#include <string>
#include <map>
#include "backend/kernel_compiler/cpu/arithmetic_logic_cpu_kernel.h"
#include <functional>
#include "runtime/device/cpu/cpu_device_address.h"

namespace mindspore {

@ -29,7 +31,9 @@ void ArithmeticLogicCPUKernel<T>::Less(const T *input1, const T *input2, bool *o
auto iter = base_iter;
iter.SetPos(start);
for (size_t i = start; i < end; i++) {
out[i] = input1[iter.GetInputPosA()] < input2[iter.GetInputPosB()];
auto x = input1[iter.GetInputPosA()];
auto y = input2[iter.GetInputPosB()];
out[i] = std::less<T>()(x, y);
iter.GenNextPos();
}
};

@ -37,7 +41,9 @@ void ArithmeticLogicCPUKernel<T>::Less(const T *input1, const T *input2, bool *o
} else {
base_iter.SetPos(0);
for (size_t i = 0; i < output_size_; i++) {
out[i] = input1[base_iter.GetInputPosA()] < input2[base_iter.GetInputPosB()];
auto x = input1[base_iter.GetInputPosA()];
auto y = input2[base_iter.GetInputPosB()];
out[i] = std::less<T>()(x, y);
base_iter.GenNextPos();
}
}

@ -50,7 +56,9 @@ void ArithmeticLogicCPUKernel<T>::Equal(const T *input1, const T *input2, bool *
auto iter = base_iter;
iter.SetPos(start);
for (size_t i = start; i < end; i++) {
out[i] = input1[iter.GetInputPosA()] == input2[iter.GetInputPosB()];
auto x = input1[iter.GetInputPosA()];
auto y = input2[iter.GetInputPosB()];
out[i] = std::equal_to<T>()(x, y);
iter.GenNextPos();
}
};

@ -64,7 +72,9 @@ void ArithmeticLogicCPUKernel<T>::NotEqual(const T *input1, const T *input2, boo
auto iter = base_iter;
iter.SetPos(start);
for (size_t i = start; i < end; i++) {
out[i] = input1[iter.GetInputPosA()] != input2[iter.GetInputPosB()];
auto x = input1[iter.GetInputPosA()];
auto y = input2[iter.GetInputPosB()];
out[i] = std::not_equal_to<T>()(x, y);
iter.GenNextPos();
}
};

@ -106,7 +116,9 @@ void ArithmeticLogicCPUKernel<T>::Greater(const T *input1, const T *input2, bool
auto iter = base_iter;
iter.SetPos(start);
for (size_t i = start; i < end; i++) {
out[i] = input1[iter.GetInputPosA()] > input2[iter.GetInputPosB()];
auto x = input1[iter.GetInputPosA()];
auto y = input2[iter.GetInputPosB()];
out[i] = std::greater<T>()(x, y);
iter.GenNextPos();
}
};

@ -120,7 +132,9 @@ void ArithmeticLogicCPUKernel<T>::GreaterEqual(const T *input1, const T *input2,
auto iter = base_iter;
iter.SetPos(start);
for (size_t i = start; i < end; i++) {
out[i] = input1[iter.GetInputPosA()] >= input2[iter.GetInputPosB()];
auto x = input1[iter.GetInputPosA()];
auto y = input2[iter.GetInputPosB()];
out[i] = std::greater_equal<T>()(x, y);
iter.GenNextPos();
}
};

@ -134,7 +148,9 @@ void ArithmeticLogicCPUKernel<T>::LessEqual(const T *input1, const T *input2, bo
auto iter = base_iter;
iter.SetPos(start);
for (size_t i = start; i < end; i++) {
out[i] = input1[iter.GetInputPosA()] <= input2[iter.GetInputPosB()];
auto x = input1[iter.GetInputPosA()];
auto y = input2[iter.GetInputPosB()];
out[i] = std::less_equal<T>()(x, y);
iter.GenNextPos();
}
};
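// std::less<T>() and friends are the functor forms of the raw comparison
// operators they replace; the results are identical for the numeric types
// this kernel is registered for, so the rewrite is presumably for style/lint
// consistency across the comparison kernels.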

@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.

@ -21,6 +21,7 @@
#include <string>

#include "runtime/device/kernel_info.h"
#include "runtime/device/cpu/kernel_select_cpu.h"

namespace mindspore {
namespace kernel {

@ -111,6 +112,11 @@ std::pair<bool, size_t> CPUKernelFactory::CPUKernelAttrCheck(const std::string &
MS_LOG(INFO) << "Not registered CPU kernel: op[" << kernel_name << "]!";
return std::make_pair(false, 0);
}

if (device::cpu::IsDynamicParamKernel(kernel_name)) {
return std::make_pair(true, 0);
}

auto kernel_attrs = GetSupportedKernelAttrList(kernel_name);
if (kernel_attrs[0].GetInputSize() == 0 && kernel_attrs[0].GetOutputSize() == 0) {
auto op_info_ptr = mindspore::kernel::OpLib::FindOp(kernel_name, kernel::OpImplyType::kCPU);

@ -43,9 +43,9 @@ void DropoutGradCpuBwdKernel::InitKernel(const CNodePtr &kernel_node) {
bool DropoutGradCpuBwdKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs) {
if (dtype_ == kNumberTypeFloat16) {
DropoutBackwardKernel<float16>(inputs, outputs, num_count_, keep_prob_);
DropoutBackwardKernel<float16>(inputs, outputs, keep_prob_);
} else if (dtype_ == kNumberTypeFloat32) {
DropoutBackwardKernel<float>(inputs, outputs, num_count_, keep_prob_);
DropoutBackwardKernel<float>(inputs, outputs, keep_prob_);
} else {
MS_LOG(ERROR) << "Input data type: " << dtype_ << " is not supported for DropoutGrad kernel for CPU.";
}

@ -55,8 +55,7 @@ bool DropoutGradCpuBwdKernel::Launch(const std::vector<AddressPtr> &inputs, cons

template <typename T>
void DropoutGradCpuBwdKernel::DropoutBackwardKernel(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &outputs, size_t num_count,
float keep_prob) {
const std::vector<AddressPtr> &outputs, float keep_prob) {
auto *output = reinterpret_cast<T *>(outputs[0]->addr);
const auto *input = reinterpret_cast<T *>(inputs[0]->addr);
const auto *mask = reinterpret_cast<T *>(inputs[1]->addr);

@ -70,7 +69,7 @@ void DropoutGradCpuBwdKernel::DropoutBackwardKernel(const std::vector<AddressPtr
input_tmp[i] = static_cast<float>(input[i]);
mask_tmp[i] = static_cast<float>(mask[i]);
}
DropoutGrad(input_tmp, mask_tmp, output_tmp, num_count_, scale);
DropoutGrad(input_tmp, mask_tmp, output_tmp, SizeToInt(num_count_), scale);
for (size_t i = 0; i < num_count_; ++i) {
output[i] = static_cast<float16>(output_tmp[i]);
}

@ -78,7 +77,7 @@ void DropoutGradCpuBwdKernel::DropoutBackwardKernel(const std::vector<AddressPtr
delete[] output_tmp;
delete[] mask_tmp;
} else if constexpr (std::is_same_v<T, float>) {
DropoutGrad(input, mask, output, num_count_, scale);
DropoutGrad(input, mask, output, SizeToInt(num_count_), scale);
}
}
} // namespace kernel

@ -40,7 +40,7 @@ class DropoutGradCpuBwdKernel : public CPUKernel {
TypeId dtype_{kTypeUnknown};
template <typename T>
void DropoutBackwardKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs,
size_t num_count, float keep_prob);
float keep_prob);
};

MS_REG_CPU_KERNEL(DropoutGrad, KernelAttr(), DropoutGradCpuBwdKernel);

@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.

@ -13,8 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <map>

#include "backend/kernel_compiler/cpu/eltwise_grad_cpu_kernel.h"
#include <string>
#include <map>
#include "common/thread_pool.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "nnacl/fp32_grad/activation_grad.h"

@ -25,50 +27,50 @@ namespace mindspore {
namespace kernel {
template <typename T>
void EltWiseGradCPUKernel<T>::ReluGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const {
if constexpr (std::is_same_v<T, float>) {
int ret = ::ReluGrad(input1 + start, input2 + start, end - start, out + start);
if (ret == NNACL_ERR) {
MS_LOG(EXCEPTION) << "ReLUGrad failed.";
}
} else {
if constexpr (!std::is_same<T, float>::value) {
MS_LOG(EXCEPTION) << "ReLUGrad only support float";
}

int ret = ::ReluGrad(input1 + start, input2 + start, end - start, out + start);
if (ret == NNACL_ERR) {
MS_LOG(EXCEPTION) << "ReLUGrad execute failed.";
}
}

template <typename T>
void EltWiseGradCPUKernel<T>::ReLU6Grad(const T *input1, const T *input2, T *out, size_t start, size_t end) const {
if constexpr (std::is_same_v<T, float>) {
int ret = ::Relu6Grad(input1 + start, input2 + start, end - start, out + start);
if (ret == NNACL_ERR) {
MS_LOG(EXCEPTION) << "ReLU6Grad failed.";
}
} else {
if constexpr (!std::is_same<T, float>::value) {
MS_LOG(EXCEPTION) << "ReLU6Grad only support float";
}

int ret = ::Relu6Grad(input1 + start, input2 + start, end - start, out + start);
if (ret == NNACL_ERR) {
MS_LOG(EXCEPTION) << "ReLU6Grad execute failed.";
}
}

template <typename T>
void EltWiseGradCPUKernel<T>::AbsGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const {
if constexpr (std::is_same_v<T, float>) {
int ret = ::ElementAbsGrad(input1 + start, input2 + start, out + start, end - start);
if (ret == NNACL_ERR) {
MS_LOG(EXCEPTION) << "AbsGrad failed.";
}
} else {
if constexpr (!std::is_same<T, float>::value) {
MS_LOG(EXCEPTION) << "AbsGrad only support float";
}

int ret = ::ElementAbsGrad(input1 + start, input2 + start, out + start, end - start);
if (ret == NNACL_ERR) {
MS_LOG(EXCEPTION) << "AbsGrad execute failed.";
}
}

template <typename T>
void EltWiseGradCPUKernel<T>::SigmoidGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const {
if constexpr (std::is_same_v<T, float>) {
int ret = ::SigmoidGrad(input2 + start, input1 + start, end - start, out + start);
if (ret == NNACL_ERR) {
MS_LOG(EXCEPTION) << "SigmoidGrad failed.";
}
} else {
if constexpr (!std::is_same<T, float>::value) {
MS_LOG(EXCEPTION) << "SigmoidGrad only support float";
}

int ret = ::SigmoidGrad(input2 + start, input1 + start, end - start, out + start);
if (ret == NNACL_ERR) {
MS_LOG(EXCEPTION) << "SigmoidGrad execute failed.";
}
}

template <typename T>

@ -80,14 +82,14 @@ void EltWiseGradCPUKernel<T>::SqrtGrad(const T *input1, const T *input2, T *out,

template <typename T>
void EltWiseGradCPUKernel<T>::TanhGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const {
if constexpr (std::is_same_v<T, float>) {
int ret = ::TanhGrad(input2 + start, input1 + start, end - start, out + start);
if (ret == NNACL_ERR) {
MS_LOG(EXCEPTION) << "TanhGrad failed.";
}
} else {
if constexpr (!std::is_same<T, float>::value) {
MS_LOG(EXCEPTION) << "TanhGrad only support float";
}

int ret = ::TanhGrad(input2 + start, input1 + start, end - start, out + start);
if (ret == NNACL_ERR) {
MS_LOG(EXCEPTION) << "TanhGrad execute failed.";
}
}

template <typename T>

@ -207,6 +209,18 @@ void EltWiseGradCPUKernel<T>::AcoshGrad(const T *input1, const T *input2, T *out
}
}

template <typename T>
void EltWiseGradCPUKernel<T>::SoftplusGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const {
if constexpr (!std::is_same<T, float>::value) {
MS_LOG(EXCEPTION) << "SoftplusGrad only support float";
}

int ret = ::SoftplusGrad(input1 + start, input2 + start, end - start, out + start);
if (ret == NNACL_ERR) {
MS_LOG(EXCEPTION) << "SoftplusGrad execute failed.";
}
}
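// Assuming nnacl's ::SoftplusGrad computes dy * sigmoid(x) - the standard
// derivative of softplus(x) = ln(1 + e^x) - this wrapper follows the same
// pattern as the other grads above: float-only dispatch over [start, end)
// with NNACL_ERR surfaced as an exception.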

template <typename T>
void EltWiseGradCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);

@ -219,12 +233,19 @@ bool EltWiseGradCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inpu
const std::vector<kernel::AddressPtr> &outputs) {
static const std::map<std::string,
std::function<void(EltWiseGradCPUKernel *, const T *, const T *, T *, size_t, size_t)>>
elt_map{{"ReluGrad", &EltWiseGradCPUKernel<T>::ReluGrad}, {"ReLU6Grad", &EltWiseGradCPUKernel<T>::ReLU6Grad},
{"SigmoidGrad", &EltWiseGradCPUKernel<T>::SigmoidGrad}, {"AbsGrad", &EltWiseGradCPUKernel<T>::AbsGrad},
{"TanhGrad", &EltWiseGradCPUKernel<T>::TanhGrad}, {"SqrtGrad", &EltWiseGradCPUKernel<T>::SqrtGrad},
{"GeLUGrad", &EltWiseGradCPUKernel<T>::GeluGrad}, {"AsinGrad", &EltWiseGradCPUKernel<T>::AsinGrad},
{"ACosGrad", &EltWiseGradCPUKernel<T>::ACosGrad}, {"AtanGrad", &EltWiseGradCPUKernel<T>::AtanGrad},
{"AsinhGrad", &EltWiseGradCPUKernel<T>::AsinhGrad}, {"AcoshGrad", &EltWiseGradCPUKernel<T>::AcoshGrad}};
elt_map{{prim::kPrimReluGrad->name(), &EltWiseGradCPUKernel<T>::ReluGrad},
{prim::kPrimRelu6Grad->name(), &EltWiseGradCPUKernel<T>::ReLU6Grad},
{prim::kPrimSigmoidGrad->name(), &EltWiseGradCPUKernel<T>::SigmoidGrad},
{prim::kPrimAbsGrad->name(), &EltWiseGradCPUKernel<T>::AbsGrad},
{prim::kPrimTanhGrad->name(), &EltWiseGradCPUKernel<T>::TanhGrad},
{prim::kPrimSqrtGrad->name(), &EltWiseGradCPUKernel<T>::SqrtGrad},
{prim::kPrimGeLUGrad->name(), &EltWiseGradCPUKernel<T>::GeluGrad},
{prim::kPrimAsinGrad->name(), &EltWiseGradCPUKernel<T>::AsinGrad},
{prim::kPrimACosGrad->name(), &EltWiseGradCPUKernel<T>::ACosGrad},
{prim::kPrimAtanGrad->name(), &EltWiseGradCPUKernel<T>::AtanGrad},
{prim::kPrimAsinhGrad->name(), &EltWiseGradCPUKernel<T>::AsinhGrad},
{prim::kPrimAcoshGrad->name(), &EltWiseGradCPUKernel<T>::AcoshGrad},
{prim::kPrimSoftplusGrad->name(), &EltWiseGradCPUKernel<T>::SoftplusGrad}};
if (inputs.size() < 2 || outputs.size() != 1) {
MS_LOG(ERROR) << kernel_name_ << " requires at least 2 inputs and 1 output, but got " << inputs.size()
<< " inputs and " << outputs.size() << " output.";

@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.

@ -48,6 +48,7 @@ class EltWiseGradCPUKernel : public CPUKernel {
void AtanGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const;
void AsinhGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const;
void AcoshGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const;
void SoftplusGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) const;

std::string kernel_name_ = "";
};

@ -103,6 +104,10 @@ MS_REG_CPU_KERNEL_T(
AcoshGrad,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
EltWiseGradCPUKernel, float);
MS_REG_CPU_KERNEL_T(
SoftplusGrad,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
EltWiseGradCPUKernel, float);
} // namespace kernel
} // namespace mindspore

@ -13,39 +13,47 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/mkldnn/eltwise_cpu_kernel.h"
#include <string>
#include <unordered_map>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "utils/ms_utils.h"

namespace mindspore {
namespace kernel {
namespace {
struct DescParam {
dnnl::algorithm algorithm;
float alpha = 0.f;
float beta = 0.f;
};
} // namespace

dnnl::eltwise_forward::desc EltWiseCPUKernel::GetForwardEltwiseDesc(const CNodePtr &kernel_node,
const dnnl::memory::desc src_desc) {
static const std::unordered_map<std::string, DescParam> eltWiseOpDescMap{
{prim::kPrimRelu->name(), DescParam{dnnl::algorithm::eltwise_relu}},
{prim::kPrimRelu6->name(), DescParam{dnnl::algorithm::eltwise_clip, 0.f, 6.f}},
{prim::kPrimAbs->name(), DescParam{dnnl::algorithm::eltwise_abs}},
{prim::kPrimExp->name(), DescParam{dnnl::algorithm::eltwise_exp}},
{prim::kPrimLog->name(), DescParam{dnnl::algorithm::eltwise_log}},
{prim::kPrimSigmoid->name(), DescParam{dnnl::algorithm::eltwise_logistic}},
{prim::kPrimSqrt->name(), DescParam{dnnl::algorithm::eltwise_sqrt}},
{prim::kPrimSquare->name(), DescParam{dnnl::algorithm::eltwise_square}},
{prim::kPrimTanh->name(), DescParam{dnnl::algorithm::eltwise_tanh}},
{prim::kPrimElu->name(), DescParam{dnnl::algorithm::eltwise_elu, 1.f, 0.f}},
{prim::kPrimSoftplus->name(), DescParam{dnnl::algorithm::eltwise_soft_relu}},
};

std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
if (kernel_name == "ReLU") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_relu, src_desc, 0.0);
} else if (kernel_name == "ReLU6") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_clip, src_desc, 0.0, 6.0);
} else if (kernel_name == "Abs") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_abs, src_desc);
} else if (kernel_name == "Exp") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_exp, src_desc);
} else if (kernel_name == "Log") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_log, src_desc);
} else if (kernel_name == "Sigmoid") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_logistic, src_desc);
} else if (kernel_name == "Sqrt") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_sqrt, src_desc);
} else if (kernel_name == "Square") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_square, src_desc);
} else if (kernel_name == "Tanh") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_tanh, src_desc);
} else if (kernel_name == "Elu") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_elu, src_desc, 1.0);
} else {
MS_LOG(EXCEPTION) << "Eltwise operators don't support " << kernel_name;
const auto desc_pair = eltWiseOpDescMap.find(kernel_name);
if (desc_pair == eltWiseOpDescMap.end()) {
MS_LOG(EXCEPTION) << "EltWiseCPUKernel does not support " << kernel_name;
}
return dnnl::eltwise_forward::desc(DnnlForward, desc_pair->second.algorithm, src_desc, desc_pair->second.alpha,
desc_pair->second.beta);
}
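// DescParam bundles the dnnl algorithm with its (alpha, beta) arguments, so
// one table entry replaces each branch of the old if/else chain: ReLU6 is
// eltwise_clip with alpha=0/beta=6, Elu is eltwise_elu with alpha=1, Softplus
// maps to eltwise_soft_relu, and parameterless ops keep the 0.f defaults.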

void EltWiseCPUKernel::InitKernel(const CNodePtr &kernel_node) {

@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.

@ -56,6 +56,8 @@ MS_REG_CPU_KERNEL(Square, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutpu
EltWiseCPUKernel);
MS_REG_CPU_KERNEL(Tanh, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
EltWiseCPUKernel);
MS_REG_CPU_KERNEL(Softplus, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
EltWiseCPUKernel);
} // namespace kernel
} // namespace mindspore

@ -36,6 +36,24 @@ file(GLOB KERNEL_SRC
${NNACL_DIR}/fp32_grad/*.c
)

if(MSLITE_STRING_KERNEL)
file(GLOB KERNEL_SRC_INFER_STRING
${NNACL_DIR}/infer/string/*.c
)
set(KERNEL_SRC
${KERNEL_SRC}
${KERNEL_SRC_INFER_STRING}
)
endif()
if(MSLITE_CONTROL_TENSORLIST)
file(GLOB KERNEL_SRC_INFER_CONTROL_TENSORLIST
${NNACL_DIR}/infer/control/*.c
)
set(KERNEL_SRC
${KERNEL_SRC}
${KERNEL_SRC_INFER_CONTROL_TENSORLIST}
)
endif()
if(PLATFORM_ARM64)
file(GLOB ASSEMBLY_SRC ${NNACL_DIR}/assembly/arm64/*.S)
set_property(SOURCE ${ASSEMBLY_SRC} PROPERTY LANGUAGE C)

@ -5,7 +5,8 @@

//void MatmulInt8DpNeon64(const int8_t *a, const int8_t *b, int8_t *dst, int row8, int col8, int deep4,
// const int *a_sums, const int *bias, int act_min, int act_max, int out_zp,
// int *multiplier, int *left_shift, int *right_shift, int row, int col, int stride, int peroc);
// const int *multiplier, const int *left_shift, const int *right_shift, int row,
// int col, int stride, int peroc);

// x0: a(left matrix ptr)
// x1: b(right matrix ptr)

@ -4,8 +4,9 @@
.align 5

//void MatmulInt8DpOpt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int col, int deep4, const int *a_sums,
// const int *bias, int act_min, int act_max, int out_zp, int32_t *multiplier, int32_t *left_shift,
// int32_t *right_shift, size_t stride, size_t filter_peroc, int32_t *filter_zp)
// const int *bias, int act_min, int act_max, int out_zp, const int32_t *multiplier,
// const int32_t *left_shift, const int32_t *right_shift, size_t stride, size_t filter_peroc,
// const int32_t *filter_zp)

// x0: a(left matrix ptr)
// x1: b(right matrix ptr)

@ -23,19 +23,19 @@ void BatchToSpaceNoCropForNHWC(const void *input, void *output, const int *in_sh
int in_h = in_shape[1];
int in_w = in_shape[2];
int in_c = in_shape[3];
size_t stride_h = block_w * out_n;
size_t output_offset = 0;
size_t copy_size = in_c * data_size;
size_t in_stride_h = in_w * in_c;
size_t in_stride_n = in_stride_h * in_h;
int stride_h = block_w * out_n;
int output_offset = 0;
int copy_size = in_c * data_size;
int in_stride_h = in_w * in_c;
int in_stride_n = in_stride_h * in_h;
for (int n = 0; n < out_n; ++n) {
for (int h = 0; h < in_h; ++h) {
size_t h_offset = h * in_stride_h;
int h_offset = h * in_stride_h;
for (int bh = 0; bh < block_h; ++bh) {
for (int w = 0; w < in_w; ++w) {
size_t w_offset = w * in_c;
int w_offset = w * in_c;
for (int bw = 0; bw < block_w; ++bw) {
size_t in_offset = in_stride_n * (bh * stride_h + bw * out_n + n) + w_offset + h_offset;
int in_offset = in_stride_n * (bh * stride_h + bw * out_n + n) + w_offset + h_offset;
memcpy((int8_t *)output + output_offset, (int8_t *)input + in_offset * data_size, copy_size);
output_offset += copy_size;
}

@ -49,6 +49,9 @@ void BatchToSpaceForNHWC(const void *input, void *output, const int *in_shape, i
const int *crops, int data_size) {
int block_h = block[0];
int block_w = block[1];
if (block_h == 0 || block_w == 0) {
return;
}
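// Guards the divisions below: the h/w crop bounds are computed with
// / block_h and / block_w, so a zero block size would otherwise divide by
// zero.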
int in_h = in_shape[1];
int in_w = in_shape[2];
int in_c = in_shape[3];

@ -61,27 +64,27 @@ void BatchToSpaceForNHWC(const void *input, void *output, const int *in_shape, i
int w_end = MSMIN((in_w * block_w - crops[3]) / block_w + 1, in_w);
int w_valid_end = in_w * block_w - crops[3] - 1;

size_t stride_h = block_w * out_n;
size_t output_offset = 0;
size_t copy_size = in_c * data_size;
size_t in_stride_h = in_w * in_c;
size_t in_stride_n = in_stride_h * in_h;
int stride_h = block_w * out_n;
int output_offset = 0;
int copy_size = in_c * data_size;
int in_stride_h = in_w * in_c;
int in_stride_n = in_stride_h * in_h;
for (int n = 0; n < out_n; ++n) {
for (int h = h_start; h < h_end; ++h) {
size_t h_offset = h * in_stride_h;
int h_offset = h * in_stride_h;
for (int bh = 0; bh < block_h; ++bh) {
size_t h_index = h * block_h + bh;
int h_index = h * block_h + bh;
if (h_index < h_valid_begin || h_index > h_valid_end) {
continue;
}
for (int w = w_start; w < w_end; ++w) {
size_t w_offset = w * in_c;
int w_offset = w * in_c;
for (int bw = 0; bw < block_w; ++bw) {
size_t w_index = w * block_w + bw;
int w_index = w * block_w + bw;
if (w_index < w_valid_begin || w_index > w_valid_end) {
continue;
}
size_t in_offset = in_stride_n * (bh * stride_h + bw * out_n + n) + w_offset + h_offset;
int in_offset = in_stride_n * (bh * stride_h + bw * out_n + n) + w_offset + h_offset;
memcpy((int8_t *)output + output_offset, (int8_t *)input + in_offset * data_size, copy_size);
output_offset += copy_size;
}

@ -62,7 +62,7 @@ void pad_input_shape(int *input_shape, int input_shape_len, int output_shape_len
shape_info->input_shape_size_ = dim_max + 1; \
\
size_t before_dim_elements_num = accumulate(input_shape, 0, dim_max - 1); \
size_t after_dim_elements_num = input_shape[dim_max]; \
size_t after_dim_elements_num = (size_t)(input_shape[dim_max]); \
size_t dim_broadcast_rate = (size_t)(output_shape[dim_max] / input_shape[dim_max]); \
for (size_t i = 0; i < before_dim_elements_num; ++i) { \
const type *in_ptr = input + i * after_dim_elements_num; \

@ -24,15 +24,18 @@ void Concat(void **input, int input_num, int axis, int **inputs_output_shape, si
}

int after_axis_size = data_size;
for (size_t i = axis + 1; i < shape_size; ++i) {
for (size_t i = (size_t)(axis) + 1; i < shape_size; ++i) {
after_axis_size *= inputs_output_shape[0][i];
}
int axis_offset = 0;
uint8_t *dst_base = (output);
size_t output_stride = after_axis_size * inputs_output_shape[input_num][axis];
int output_stride = after_axis_size * inputs_output_shape[input_num][axis];
for (int i = 0; i < input_num; ++i) {
const uint8_t *src_base = (input[i]);
size_t input_stride = after_axis_size * inputs_output_shape[i][axis];
if (inputs_output_shape[i] == NULL) {
continue;
}
int input_stride = after_axis_size * inputs_output_shape[i][axis];
int offset = UP_DIV(input_stride, thread_num);
int count = input_stride - offset * task_id;
if (count <= 0) {

@ -22,17 +22,17 @@ void DepthToSpaceForNHWC(const void *input, void *output, const int *in_shape, c
int32_t in_shape_dim1 = in_shape[1];
size_t copy_size = block_size * param->out_stride_dim2_ * param->data_type_size_;
for (int i = 0; i < in_shape[0]; ++i) {
size_t in_offset_n = i * param->in_stride_dim0_;
size_t out_offset_n = i * param->out_stride_dim0_;
int in_offset_n = i * param->in_stride_dim0_;
int out_offset_n = i * param->out_stride_dim0_;
for (int j = 0; j < in_shape_dim1; ++j) {
size_t in_offset_h = in_offset_n + j * param->in_stride_dim1_;
size_t out_offset_h = out_offset_n + j * block_size * param->out_stride_dim1_;
int in_offset_h = in_offset_n + j * param->in_stride_dim1_;
int out_offset_h = out_offset_n + j * block_size * param->out_stride_dim1_;
for (int k = 0; k < in_shape_dim2; ++k) {
size_t in_offset_w = in_offset_h + k * param->in_stride_dim2_;
size_t out_offset_w = out_offset_h + k * block_size * param->out_stride_dim2_;
int in_offset_w = in_offset_h + k * param->in_stride_dim2_;
int out_offset_w = out_offset_h + k * block_size * param->out_stride_dim2_;
for (int l = 0; l < block_size; ++l) {
size_t out_offset = (out_offset_w + l * param->out_stride_dim1_) * param->data_type_size_;
size_t in_offset = (in_offset_w + l * block_size * param->out_stride_dim2_) * param->data_type_size_;
int out_offset = (out_offset_w + l * param->out_stride_dim1_) * param->data_type_size_;
int in_offset = (in_offset_w + l * block_size * param->out_stride_dim2_) * param->data_type_size_;
memcpy((int8_t *)output + out_offset, (int8_t *)input + in_offset, copy_size);
}
}

@ -118,7 +118,9 @@ int B(const float *poly_array, float *matrix_b, int in_unit) {
float matrix_t[MAX_LEN]; // n * in_unit

T(poly_array, matrix_t, n);
LT(poly_array, matrix_lt, n);
if (LT(poly_array, matrix_lt, n) != NNACL_OK) {
return NNACL_ERR;
}
MatrixTranspose(matrix_lt, matrix_l, n, n);
MatrixMultiply(matrix_l, matrix_t, matrix_b, n, n, in_unit);
matrix_b[in_unit * in_unit - 1] = 1;

@ -47,43 +47,43 @@ void DoSlice(const void *input, void *output, SliceParameter *param, int thread_
int8_t *int8_in = (int8_t *)input;
int8_t *int8_out = (int8_t *)output;

size_t out_stride[8];
int out_stride[8];
out_stride[7] = 1;
for (int i = 6; i >= 0; --i) {
out_stride[i] = out_stride[i + 1] * param->size_[i + 1];
}

size_t count_per_thread = UP_DIV(param->size_[5], param->op_parameter_.thread_num_);
size_t thread_begin = thread_id * count_per_thread;
size_t thread_end = MSMIN(param->size_[5], thread_begin + count_per_thread);
size_t copy_size = param->size_[7] * data_size;
size_t in_stride[8];
int count_per_thread = UP_DIV(param->size_[5], param->op_parameter_.thread_num_);
int thread_begin = thread_id * count_per_thread;
int thread_end = MSMIN(param->size_[5], thread_begin + count_per_thread);
int copy_size = param->size_[7] * data_size;
int in_stride[8];
in_stride[7] = 1;
for (int i = 6; i >= 0; --i) {
in_stride[i] = param->shape_[i + 1] * in_stride[i + 1];
}

for (int ii = 0; ii < param->size_[0]; ++ii) {
size_t out_offset0 = ii * out_stride[0];
size_t in_offset0 = (ii + param->begin_[0]) * in_stride[0] + param->begin_[7];
int out_offset0 = ii * out_stride[0];
int in_offset0 = (ii + param->begin_[0]) * in_stride[0] + param->begin_[7];
for (int jj = 0; jj < param->size_[1]; ++jj) {
size_t out_offset1 = jj * out_stride[1] + out_offset0;
size_t in_offset1 = (jj + param->begin_[1]) * in_stride[1] + in_offset0;
int out_offset1 = jj * out_stride[1] + out_offset0;
int in_offset1 = (jj + param->begin_[1]) * in_stride[1] + in_offset0;
for (int kk = 0; kk < param->size_[2]; ++kk) {
size_t out_offset2 = kk * out_stride[2] + out_offset1;
size_t in_offset2 = (kk + param->begin_[2]) * in_stride[2] + in_offset1;
int out_offset2 = kk * out_stride[2] + out_offset1;
int in_offset2 = (kk + param->begin_[2]) * in_stride[2] + in_offset1;
for (int ll = 0; ll < param->size_[3]; ++ll) {
size_t out_offset3 = ll * out_stride[3] + out_offset2;
size_t in_offset3 = (ll + param->begin_[3]) * in_stride[3] + in_offset2;
int out_offset3 = ll * out_stride[3] + out_offset2;
int in_offset3 = (ll + param->begin_[3]) * in_stride[3] + in_offset2;
for (int i = 0; i < param->size_[4]; ++i) {
size_t out_offset4 = i * out_stride[4] + out_offset3;
size_t in_offset4 = (i + param->begin_[4]) * in_stride[4] + in_offset3;
for (size_t j = thread_begin; j < thread_end; ++j) {
size_t out_offset5 = j * out_stride[5] + out_offset4;
size_t in_offset5 = (j + param->begin_[5]) * in_stride[5] + in_offset4;
int out_offset4 = i * out_stride[4] + out_offset3;
int in_offset4 = (i + param->begin_[4]) * in_stride[4] + in_offset3;
for (int j = thread_begin; j < thread_end; ++j) {
int out_offset5 = j * out_stride[5] + out_offset4;
int in_offset5 = (j + param->begin_[5]) * in_stride[5] + in_offset4;
for (int k = 0; k < param->size_[6]; ++k) {
size_t out_offset6 = k * out_stride[6] + out_offset5;
size_t in_offset6 = (k + param->begin_[6]) * in_stride[6] + in_offset5;
int out_offset6 = k * out_stride[6] + out_offset5;
int in_offset6 = (k + param->begin_[6]) * in_stride[6] + in_offset5;
memcpy(int8_out + out_offset6 * data_size, int8_in + in_offset6 * data_size, copy_size);
}
}
|
||||
|
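DoSlice parallelizes over dimension 5 only: the extent is split into UP_DIV-sized chunks and each task copies a disjoint band. A small runnable sketch of that partitioning, with made-up values:

    #include <stdio.h>

    #define UP_DIV(x, y) (((x) + (y) - 1) / (y))

    /* Each thread handles [begin, end) of dimension 5, mirroring the
     * count_per_thread / thread_begin / thread_end computation above. */
    int main(void) {
      int size5 = 10, thread_num = 4;
      int per = UP_DIV(size5, thread_num); /* 3 */
      for (int tid = 0; tid < thread_num; ++tid) {
        int begin = tid * per;
        int end = begin + per > size5 ? size5 : begin + per;
        printf("thread %d handles [%d, %d)\n", tid, begin, end);
      }
      return 0; /* bands: [0,3) [3,6) [6,9) [9,10) */
    }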
@@ -105,8 +105,8 @@ void DoSliceNoParallel(const void *input, void *output, SliceParameter *param, i
   int8_t *int8_in = (int8_t *)input;
   int8_t *int8_out = (int8_t *)output;

-  size_t copy_size = param->size_[7] * data_size;
-  size_t in_stride[8];
+  int copy_size = param->size_[7] * data_size;
+  int in_stride[8];
   in_stride[7] = 1;
   for (int i = 6; i >= 0; --i) {
     in_stride[i] = param->shape_[i + 1] * in_stride[i + 1];
@@ -115,9 +115,9 @@ void DoSliceNoParallel(const void *input, void *output, SliceParameter *param, i
   for (int i = 0; i < DIMENSION_8D; ++i) {
     axis_copy_flag[i] = WhetherCopyByAxis(param->begin_, param->end_, param->shape_, i);
   }
-  size_t out_offset = 0;
+  int out_offset = 0;
   for (int32_t dim0 = param->begin_[0]; dim0 < param->end_[0]; ++dim0) {
-    size_t in_offset0 = dim0 * in_stride[0] + param->begin_[7];
+    int in_offset0 = dim0 * in_stride[0] + param->begin_[7];
 #define FAST_COPY_IF_NEED(rank) \
   if (axis_copy_flag[rank]) { \
     int left_block_num = param->end_[rank] - dim##rank; \
@@ -128,24 +128,24 @@ void DoSliceNoParallel(const void *input, void *output, SliceParameter *param, i
     continue; \
   }
     FAST_COPY_IF_NEED(0);
-    for (size_t dim1 = param->begin_[1]; dim1 < param->end_[1]; ++dim1) {
-      size_t in_offset1 = dim1 * in_stride[1] + in_offset0;
+    for (int dim1 = param->begin_[1]; dim1 < param->end_[1]; ++dim1) {
+      int in_offset1 = dim1 * in_stride[1] + in_offset0;
       FAST_COPY_IF_NEED(1);
       for (int32_t dim2 = param->begin_[2]; dim2 < param->end_[2]; ++dim2) {
-        size_t in_offset2 = in_offset1 + dim2 * in_stride[2];
+        int in_offset2 = in_offset1 + dim2 * in_stride[2];
         FAST_COPY_IF_NEED(2);
         for (int32_t dim3 = param->begin_[3]; dim3 < param->end_[3]; ++dim3) {
-          size_t in_offset3 = in_offset2 + dim3 * in_stride[3];
+          int in_offset3 = in_offset2 + dim3 * in_stride[3];
           FAST_COPY_IF_NEED(3);
           for (int32_t dim4 = param->begin_[4]; dim4 < param->end_[4]; ++dim4) {
-            size_t in_offset4 = in_offset3 + dim4 * in_stride[4];
+            int in_offset4 = in_offset3 + dim4 * in_stride[4];
             FAST_COPY_IF_NEED(4);
             for (int32_t dim5 = param->begin_[5]; dim5 < param->end_[5]; ++dim5) {
-              size_t in_offset5 = in_offset4 + dim5 * in_stride[5];
+              int in_offset5 = in_offset4 + dim5 * in_stride[5];
               FAST_COPY_IF_NEED(5);
 #undef FAST_COPY_IF_NEED
               for (int32_t dim6 = param->begin_[6]; dim6 < param->end_[6]; ++dim6) {
-                size_t in_offset6 = in_offset5 + dim6 * in_stride[6];
+                int in_offset6 = in_offset5 + dim6 * in_stride[6];
                 memcpy(int8_out + out_offset * data_size, int8_in + in_offset6 * data_size, copy_size);
                 out_offset += param->size_[7];
               }
@@ -21,10 +21,6 @@

 int DoSplit(void *in_data, void **out_data, const int *input_shape, int offset, int num_unit,
             SplitParameter *split_param, int data_size) {
-  if (in_data == NULL || out_data == NULL) {
-    return NNACL_ERR;
-  }
-
   int8_t *int8_in = (int8_t *)in_data;

   int num_split = split_param->num_split_;
@@ -26,15 +26,15 @@ void DoCopyData(const uint8_t *input_data, uint8_t *output_data, size_t size, si
 }

 int DoTileOneDimension(uint8_t *input_data, uint8_t *output_data, size_t dim, const TileParameter *parameter) {
-  size_t src_dim_size = parameter->in_shape_[dim];
+  int src_dim_size = parameter->in_shape_[dim];
   if (dim == parameter->in_dim_ - 1) {
     DoCopyData(input_data, output_data, src_dim_size, parameter->data_size_, parameter->multiples_[dim]);
     return 0;
   }
-  for (size_t i = 0; i < src_dim_size; ++i) {
-    for (size_t j = 0; j < parameter->multiples_[dim]; ++j) {
-      size_t in_pos = parameter->in_strides_[dim] * i;
-      size_t out_pos = parameter->out_strides_[dim] * (i + j * src_dim_size);
+  for (int i = 0; i < src_dim_size; ++i) {
+    for (int j = 0; j < parameter->multiples_[dim]; ++j) {
+      int in_pos = parameter->in_strides_[dim] * i;
+      int out_pos = parameter->out_strides_[dim] * (i + j * src_dim_size);
       DoTileOneDimension(input_data + in_pos * parameter->data_size_, output_data + out_pos * parameter->data_size_,
                          dim + 1, parameter);
     }
@@ -18,20 +18,20 @@
 #define MINDSPORE_NNACL_BASE_TILE_H_

 #include "nnacl/op_base.h"

+#define MAX_TILE_DIM_SIZE 8
 typedef struct TileParameter {
   // primitive parameter
   OpParameter op_parameter_;
-  int multiples_[8];
-  int dims_[8];
+  int multiples_[MAX_TILE_DIM_SIZE];
+  int dims_[MAX_TILE_DIM_SIZE];
   size_t dims_size_;
   size_t multiples_size_;

   // shape correlative
-  int in_shape_[8];
-  int out_shape_[8];
-  int in_strides_[8];
-  int out_strides_[8];
+  int in_shape_[MAX_TILE_DIM_SIZE];
+  int out_shape_[MAX_TILE_DIM_SIZE];
+  int in_strides_[MAX_TILE_DIM_SIZE];
+  int out_strides_[MAX_TILE_DIM_SIZE];

   // other parameter
   int in_dim_;
@@ -184,7 +184,7 @@
   for (int i = dims - 1; i > 0; --i) { \
     *(size + i - 1) = *(size + i) * output_shape[i]; \
   } \
-  for (size_t idx = 0; idx < (*size) * output_shape[0]; ++idx) { \
+  for (int idx = 0; idx < (*size) * output_shape[0]; ++idx) { \
     int pos = idx; \
     int output_idx = 0; \
    int input_idx = 0; \
@@ -215,7 +215,7 @@
     return; \
   } \
   count = MSMIN(offset_size, count); \
-  for (size_t idx = task_offset; idx < task_offset + count; ++idx) { \
+  for (int idx = task_offset; idx < task_offset + count; ++idx) { \
     int pos = idx; \
     int output_idx = 0; \
     int input_idx = 0; \
@@ -16,15 +16,19 @@

 #include "nnacl/common_func.h"

-int offset(const int *shape, const int dim0, const int dim1, const int dim2, const int dim3) {
+int Offset(const int *shape, const int dim0, const int dim1, const int dim2, const int dim3) {
   return ((dim0 * shape[1] + dim1) * shape[2] + dim2) * shape[3] + dim3;
 }

-int offsetComm(const int *shape, const int dim0, const int dim1, const int dim2) {
+int OffsetComm(const int *shape, const int dim0, const int dim1, const int dim2) {
   return ((dim0 * shape[1] + dim1) * shape[2] + dim2) * shape[3];
 }

-int offset4d(const int *shape, const int *dims) { return offset(shape, dims[0], dims[1], dims[2], dims[3]); }
+int Offset4d(const int *shape, const int *dims) { return Offset(shape, dims[0], dims[1], dims[2], dims[3]); }
+
+int Offset6d(const int *shape, const int *dims) {
+  return ((OffsetComm(shape, dims[0], dims[1], dims[2]) + dims[3]) * shape[4] + dims[4]) * shape[5];
+}

 int8_t MinInt8(int8_t a, int8_t b) { return b ^ ((a ^ b) & -(a < b)); }

@@ -36,9 +36,10 @@ void ReluFp32C8(float *data, float *dst, int ele_num);
 void Relu6Fp32C8(float *data, float *dst, int ele_num);
 #endif
 #endif
-int offset(const int *shape, const int dim0, const int dim1, const int dim2, const int dim3);
-int offsetComm(const int *shape, const int dim0, const int dim1, const int dim2);
-int offset4d(const int *shape, const int *dims);
+int Offset(const int *shape, const int dim0, const int dim1, const int dim2, const int dim3);
+int OffsetComm(const int *shape, const int dim0, const int dim1, const int dim2);
+int Offset4d(const int *shape, const int *dims);
+int Offset6d(const int *shape, const int *dims);

 static inline bool isAddOverflow(int32_t x, int32_t y) {
   int32_t sum = x + y;
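These helpers linearize multi-dimensional indices row-major, which is why a 4D NHWC index (n,h,w,c) maps to ((n*H + h)*W + w)*C + c. A small runnable check of the same arithmetic:

    #include <stdio.h>

    /* Row-major linearization, mirroring Offset() above. */
    int main(void) {
      int shape[4] = {2, 3, 4, 5}; /* N=2 H=3 W=4 C=5 */
      int n = 1, h = 2, w = 3, c = 4;
      int off = ((n * shape[1] + h) * shape[2] + w) * shape[3] + c;
      printf("offset = %d\n", off); /* 1*60 + 2*20 + 3*5 + 4 = 119 */
      return 0;
    }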
@@ -19,16 +19,22 @@

 void PadFp16(const float16_t *input_data, float16_t *output_data, const int *input_shape, const int *output_shape,
              const int *paddings, const int tid, const int thread_num) {
-  int in[4], out[4];
+  int in[DEFAULT_PAD_NDIMS], out[DEFAULT_PAD_NDIMS];
   for (in[0] = 0; in[0] < input_shape[0]; in[0]++) {
     out[0] = in[0] + paddings[0];
     for (in[1] = tid; in[1] < input_shape[1]; in[1] += thread_num) {
       out[1] = in[1] + paddings[2];
       for (in[2] = 0; in[2] < input_shape[2]; in[2]++) {
         out[2] = in[2] + paddings[4];
-        float16_t *dst = output_data + offset(output_shape, out[0], out[1], out[2], paddings[6]);
-        const float16_t *src = input_data + offset(input_shape, in[0], in[1], in[2], 0);
-        memcpy(dst, src, input_shape[3] * sizeof(float16_t));
+        for (in[3] = 0; in[3] < input_shape[3]; in[3]++) {
+          out[3] = in[3] + paddings[6];
+          for (in[4] = 0; in[4] < input_shape[4]; in[4]++) {
+            out[4] = in[4] + paddings[8];
+            float16_t *dst = output_data + Offset6d(output_shape, out) + paddings[10];
+            const float16_t *src = input_data + Offset6d(input_shape, in);
+            memcpy(dst, src, input_shape[5] * sizeof(float16_t));
+          }
+        }
       }
     }
   }
 }
@@ -152,17 +152,15 @@ int AdamDeltaFp32(float *delta, float *m, float *v, float lr, float beta1, float
   return NNACL_OK;
 }

-int AdamWeightDecayFp32(float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
-                        const float *gradient, size_t start, size_t end) {
+size_t AdamWeightDecayFp32(float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon,
+                           float decay, const float *gradient, size_t start, size_t end) {
   size_t c1 = start;
 #ifdef ENABLE_AVX512
-  const float beta1_minus = 1 - beta1;
-  const float beta2_minus = 1 - beta2;
   struct AVX_Data beta1_r, beta2_r, beta1_minus_r, beta2_minus_r, lr_neg_r, epsilon_r, decay_r;
   beta1_r.data = _mm512_set1_ps(beta1);
   beta2_r.data = _mm512_set1_ps(beta2);
-  beta1_minus_r.data = _mm512_set1_ps(beta1_minus);
-  beta2_minus_r.data = _mm512_set1_ps(beta2_minus);
+  beta1_minus_r.data = _mm512_set1_ps(1.0f - beta1);
+  beta2_minus_r.data = _mm512_set1_ps(1.0f - beta2);
   lr_neg_r.data = _mm512_set1_ps(-lr);
   epsilon_r.data = _mm512_set1_ps(epsilon);
   decay_r.data = _mm512_set1_ps(decay);
@@ -260,17 +258,15 @@ int AdamWeightDecayFp32(float *var, float *m, float *v, float lr, float beta1, f
   return c1;
 }

-int FusedAdamFp32(float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
-                  const int16_t *gradient16, size_t start, size_t end) {
+size_t FusedAdamFp32(float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
+                     const int16_t *gradient16, size_t start, size_t end) {
   size_t c1 = start;
 #ifdef ENABLE_AVX512
-  const float beta1_minus = 1 - beta1;
-  const float beta2_minus = 1 - beta2;
   struct AVX_Data beta1_r, beta2_r, beta1_minus_r, beta2_minus_r, lr_neg_r, epsilon_r, decay_r;
   beta1_r.data = _mm512_set1_ps(beta1);
   beta2_r.data = _mm512_set1_ps(beta2);
-  beta1_minus_r.data = _mm512_set1_ps(beta1_minus);
-  beta2_minus_r.data = _mm512_set1_ps(beta2_minus);
+  beta1_minus_r.data = _mm512_set1_ps(1.0f - beta1);
+  beta2_minus_r.data = _mm512_set1_ps(1.0f - beta2);
   lr_neg_r.data = _mm512_set1_ps(-lr);
   epsilon_r.data = _mm512_set1_ps(epsilon);
   decay_r.data = _mm512_set1_ps(decay);
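The AVX512 path above only broadcasts constants into registers; the update those registers feed is the usual AdamW step, and the new size_t return value is the index where the vectorized loop stopped so a scalar tail can resume. A hedged scalar sketch of one element's update (my reading of the constants being broadcast, not the library code; this variant applies no bias correction):

    #include <math.h>

    /* One AdamW element update: moments with beta1/beta2, then a step of
     * -lr * (m / (sqrt(v) + epsilon) + decay * var). */
    static void AdamWStep(float *var, float *m, float *v, float g, float lr,
                          float beta1, float beta2, float epsilon, float decay) {
      *m = beta1 * (*m) + (1.0f - beta1) * g;     /* first moment  */
      *v = beta2 * (*v) + (1.0f - beta2) * g * g; /* second moment */
      *var -= lr * ((*m) / (sqrtf(*v) + epsilon) + decay * (*var));
    }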
@@ -71,10 +71,10 @@ int AdamFp32(float *var, float *m, float *v, float lr, float beta1, float beta2,
              size_t start, size_t end, bool use_nesterov);
 int AdamDeltaFp32(float *delta, float *m, float *v, float lr, float beta1, float beta2, float epsilon,
                   const float *gradient, size_t start, size_t end, bool use_nesterov);
-int AdamWeightDecayFp32(float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
-                        const float *gradient, size_t start, size_t end);
-int FusedAdamFp32(float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
-                  const int16_t *gradient16, size_t start, size_t end);
+size_t AdamWeightDecayFp32(float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon,
+                           float decay, const float *gradient, size_t start, size_t end);
+size_t FusedAdamFp32(float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
+                     const int16_t *gradient16, size_t start, size_t end);
 #ifdef __cplusplus
 }
 #endif
@@ -49,8 +49,8 @@ void ArgMaxTopK1(const float *input, void *output, float *output_value, const Ar
   float *outputfp32 = (float *)output;
   int *outputint = (int *)output;
   for (int i = 0; i < pre_axis_count; ++i) {
-    size_t output_offset = i * after_axis_count;
-    size_t input_offset = output_offset * axis_count;
+    int output_offset = i * after_axis_count;
+    int input_offset = output_offset * axis_count;
     for (int j = 0; j < after_axis_count; ++j) {
       float value = -FLT_MAX;
       int index = 0;
@@ -79,8 +79,8 @@ void ArgMinTopK1(const float *input, void *output, float *output_value, const Ar
   float *outputfp32 = (float *)output;
   int *outputint = (int *)output;
   for (int i = 0; i < pre_axis_count; ++i) {
-    size_t output_offset = i * after_axis_count;
-    size_t input_offset = output_offset * axis_count;
+    int output_offset = i * after_axis_count;
+    int input_offset = output_offset * axis_count;
     for (int j = 0; j < after_axis_count; ++j) {
       float value = FLT_MAX;
       int index = 0;
@@ -109,13 +109,13 @@ void ArgMinMaxDim0(const float *input, void *output, float *output_value, const
   int *outputint = (int *)output;
   for (int32_t i = 0; i < param->in_strides_[0]; ++i) {
     for (int j = 0; j < in_shape[0]; ++j) {
-      size_t offset = param->in_strides_[0] * j + i;
+      int offset = param->in_strides_[0] * j + i;
       param->arg_elements_[j].index_ = j;
       param->arg_elements_[j].data_.f_data_ = input[offset];
     }
     qsort(param->arg_elements_, in_shape[0], sizeof(ArgElement), *compare_func);
     for (int j = 0; j < param->topk_; ++j) {
-      size_t out_offset = j * param->out_strides_[0] + i;
+      int out_offset = j * param->out_strides_[0] + i;
       if (param->out_value_) {
         outputfp32[out_offset] = param->arg_elements_[j].data_.f_data_;
       } else {
@@ -135,17 +135,17 @@ void ArgMinMaxDim1(const float *input, void *output, float *output_value, const
   int *outputint = (int *)output;
   int in_shape1 = in_shape[1];
   for (int i = 0; i < in_shape[0]; ++i) {
-    size_t in_dim0_offset = i * param->in_strides_[0];
-    size_t out_dim0_offset = i * param->out_strides_[0];
+    int in_dim0_offset = i * param->in_strides_[0];
+    int out_dim0_offset = i * param->out_strides_[0];
     for (int j = 0; j < param->in_strides_[1]; ++j) {
       for (int k = 0; k < in_shape1; ++k) {
-        size_t offset = param->in_strides_[1] * k + in_dim0_offset + j;
+        int offset = param->in_strides_[1] * k + in_dim0_offset + j;
         param->arg_elements_[k].index_ = k;
         param->arg_elements_[k].data_.f_data_ = input[offset];
       }
       qsort(param->arg_elements_, in_shape1, sizeof(ArgElement), *compare_func);
       for (int k = 0; k < param->topk_; ++k) {
-        size_t out_offset = out_dim0_offset + j + k * param->out_strides_[1];
+        int out_offset = out_dim0_offset + j + k * param->out_strides_[1];
         if (param->out_value_) {
           outputfp32[out_offset] = param->arg_elements_[k].data_.f_data_;
         } else {
@@ -167,20 +167,20 @@ void ArgMinMaxDim2(const float *input, void *output, float *output_value, const
   float *outputfp32 = (float *)output;
   int *outputint = (int *)output;
   for (int i = 0; i < in_shape[0]; ++i) {
-    size_t in_dim0_offset = i * param->in_strides_[0];
-    size_t out_dim0_offset = i * param->out_strides_[0];
+    int in_dim0_offset = i * param->in_strides_[0];
+    int out_dim0_offset = i * param->out_strides_[0];
     for (int j = 0; j < in_shape1; ++j) {
-      size_t in_dim1_offset = j * param->in_strides_[1] + in_dim0_offset;
-      size_t out_dim1_offset = j * param->out_strides_[1] + out_dim0_offset;
+      int in_dim1_offset = j * param->in_strides_[1] + in_dim0_offset;
+      int out_dim1_offset = j * param->out_strides_[1] + out_dim0_offset;
       for (int k = 0; k < param->in_strides_[2]; ++k) {
         for (int l = 0; l < in_shape2; ++l) {
-          size_t offset = param->in_strides_[2] * l + k + in_dim1_offset;
+          int offset = param->in_strides_[2] * l + k + in_dim1_offset;
           param->arg_elements_[l].index_ = l;
           param->arg_elements_[l].data_.f_data_ = input[offset];
         }
         qsort(param->arg_elements_, in_shape2, sizeof(ArgElement), *compare_func);
         for (int l = 0; l < param->topk_; ++l) {
-          size_t out_offset = out_dim1_offset + k + l * param->out_strides_[2];
+          int out_offset = out_dim1_offset + k + l * param->out_strides_[2];
           if (param->out_value_) {
             outputfp32[out_offset] = param->arg_elements_[l].data_.f_data_;
           } else {
@@ -203,26 +203,26 @@ void ArgMinMaxDim3(const float *input, void *output, float *output_value, const
   float *outputfp32 = (float *)output;
   int *outputint = (int *)output;
   for (int i = 0; i < in_shape[0]; ++i) {
-    size_t in_dim0_offset = i * param->in_strides_[0];
-    size_t out_dim0_offset = i * param->out_strides_[0];
+    int in_dim0_offset = i * param->in_strides_[0];
+    int out_dim0_offset = i * param->out_strides_[0];
     for (int j = 0; j < in_shape1; ++j) {
-      size_t in_dim1_offset = j * param->in_strides_[1] + in_dim0_offset;
-      size_t out_dim1_offset = j * param->out_strides_[1] + out_dim0_offset;
+      int in_dim1_offset = j * param->in_strides_[1] + in_dim0_offset;
+      int out_dim1_offset = j * param->out_strides_[1] + out_dim0_offset;
       for (int k = 0; k < in_shape2; ++k) {
-        size_t in_dim2_offset = k * param->in_strides_[2] + in_dim1_offset;
-        size_t out_dim2_offset = k * param->out_strides_[2] + out_dim1_offset;
+        int in_dim2_offset = k * param->in_strides_[2] + in_dim1_offset;
+        int out_dim2_offset = k * param->out_strides_[2] + out_dim1_offset;
         for (int l = 0; l < in_shape3; ++l) {
-          size_t offset = l + in_dim2_offset;
+          int offset = l + in_dim2_offset;
           param->arg_elements_[l].index_ = l;
           param->arg_elements_[l].data_.f_data_ = input[offset];
        }
         qsort(param->arg_elements_, in_shape3, sizeof(ArgElement), *compare_func);
         for (int l = 0; l < param->topk_; ++l) {
-          size_t out_offset = out_dim2_offset + l;
+          int out_offset = out_dim2_offset + l;
           if (param->out_value_) {
             outputfp32[out_offset] = param->arg_elements_[l].data_.f_data_;
           } else {
-            outputint[out_offset] = param->arg_elements_[l].index_;
+            outputint[out_offset] = (int)(param->arg_elements_[l].index_);
           }
           if (output_value != NULL) {
             output_value[out_offset] = param->arg_elements_[l].data_.f_data_;
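All four ArgMinMaxDimN kernels delegate ordering to qsort over ArgElement with a caller-supplied compare_func. The real comparators live elsewhere in nnacl; the following standalone sketch only illustrates the contract qsort expects, with an illustrative element type whose field names mirror the usage above but whose layout is an assumption:

    #include <stdlib.h>

    typedef struct {
      unsigned int index_; /* original position along the sorted axis */
      float f_data_;       /* value at that position */
    } Elem;

    /* Descending comparator: larger values sort first, so the top-k
     * entries end up at the front after qsort. */
    static int CompareDescending(const void *a, const void *b) {
      float fa = ((const Elem *)a)->f_data_;
      float fb = ((const Elem *)b)->f_data_;
      return (fa < fb) - (fa > fb);
    }
    /* usage: qsort(elems, n, sizeof(Elem), CompareDescending); */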
@@ -21,10 +21,10 @@ void PostConvFuncComm(const float *src_ptr_, float *out_ptr, const float *bias_p
   if (size == 0) {
     return;
   }
-  for (int oc = 0; oc < output_channel; oc++) {
+  for (size_t oc = 0; oc < output_channel; oc++) {
     int oc_div = oc / size;
     int oc_mod = oc % size;
-    for (int hw = 0; hw < plane_size; hw++) {
+    for (int hw = 0; hw < (int)plane_size; hw++) {
       int src_index = oc_div * size * plane_stride + hw * size + oc_mod;
       int dst_index = hw * oc_stride + oc;
       float value = src_ptr_[src_index];
@@ -52,7 +52,8 @@ int ConvDw(float *output_data, const float *input_data, const float *weight_data
     int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih_origin, conv_param->dilation_h_));

     for (int ow = 0; ow < conv_param->output_w_; ow++) {
-      memcpy(dst_data + ow * conv_param->output_channel_, bias_data, conv_param->output_channel_ * sizeof(float));
+      memcpy(dst_data + ow * conv_param->output_channel_, bias_data,
+             conv_param->output_channel_ * (int)(sizeof(float)));
     }
     for (int kh = start_kh; kh < end_kh; kh++) {
       int ih = ih_origin + conv_param->dilation_w_ * kh;
@@ -764,10 +765,10 @@ void ConvDwFp32IndirectRow(float *output, float **input, const float *weights, c
                            int output_width, int input_stride, bool relu, bool relu6, int kernel) {
   do {
     float **in = input;
-    size_t c = channels;
+    size_t c = (size_t)channels;
     const float *w = weights;
     float *out = output;
-    memcpy(out, bias, channels * sizeof(float));
+    memcpy(out, bias, channels * (int)sizeof(float));
     for (; c >= C4NUM; c -= C4NUM) {
       for (int i = 0; i < C4NUM; i++) {
         for (int k = 0; k < kernel; k++) {
@@ -61,7 +61,7 @@ void DeConvPostFp32C8(const float *src, float *tmp, const float *bias, float *ds
   for (int c = 0; c < oc8; c += 8) {
     float *dst_ptr = tmp + c * output_plane;
     const float *src_ptr = src + c * in_plane_round * kernel_plane;
-    memset(dst_ptr, 0, output_plane * C8NUM * sizeof(float));
+    memset(dst_ptr, 0, output_plane * C8NUM * (int)sizeof(float));

     for (int ih = 0; ih < conv_param->input_h_; ih++) {
       for (int iw = 0; iw < conv_param->input_w_; iw++) {
@@ -43,7 +43,7 @@ int CopyData(float *input_data, const int *ids, float *output_data, int num,
     parameter->is_regulated_[ids[num]] = true;
   }

-  memcpy(out_data, in_data, sizeof(float) * parameter->layer_size_);
+  memcpy(out_data, in_data, sizeof(float) * (size_t)(parameter->layer_size_));
   return NNACL_OK;
 }

@@ -52,7 +52,7 @@ int EmbeddingLookup(float *input_data, const int *ids, float *output_data, const
   if (parameter->op_parameter_.thread_num_ == 0) {
     return NNACL_PARAM_INVALID;
   }
-  for (size_t i = task_id; i < parameter->ids_size_; i += parameter->op_parameter_.thread_num_) {
+  for (int i = task_id; i < parameter->ids_size_; i += parameter->op_parameter_.thread_num_) {
     int ret = CopyData(input_data, ids, output_data, i, parameter);
     if (ret != NNACL_OK) {
       return ret;
@@ -21,7 +21,7 @@
 int GatherNd(const float *input, float *output, const int *in_offset, int area, int count) {
   int i = 0;
   for (i = 0; i < count; i++) {
-    (void)memcpy(output + area * i, input + in_offset[i], area * sizeof(float));
+    (void)memcpy(output + area * i, input + in_offset[i], (size_t)(area) * sizeof(float));
   }
   return NNACL_OK;
 }
@@ -41,7 +41,7 @@ void PackLstmBias(float *dst, const float *src, int batch, int col, int col_alig
   for (int i = 0; i < unidirectional_batch; i++) {
     const float *src_batch = src + i * col;
     float *dst_batch = dst + i * col_align;
-    memcpy(dst_batch, src_batch, col * sizeof(float));
+    memcpy(dst_batch, src_batch, col * (int)sizeof(float));
   }
   if (is_bidirectional) {
     const float *backward_src = src + batch * col;
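GatherNd above just copies count rows at precomputed offsets; the interesting work, turning an indices tensor into in_offset, happens in the caller. A hedged sketch of that precomputation (names are illustrative, not the library's):

    /* indices is [count][index_rank]; strides are the row-major strides
     * of the gathered input's leading dimensions. */
    static void BuildGatherNdOffsets(const int *indices, int count, int index_rank,
                                     const int *strides, int *in_offset) {
      for (int i = 0; i < count; ++i) {
        in_offset[i] = 0;
        for (int d = 0; d < index_rank; ++d) {
          in_offset[i] += indices[i * index_rank + d] * strides[d];
        }
      }
    }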
@@ -263,9 +263,9 @@ void RowMajor2Col12Major_arm32(const float *src_c, float *dst_c, size_t col) {
 void RowMajor2Col12Major(const float *src_ptr, float *dst_ptr, int row, int col) {
   const float *src_r = src_ptr;
   float *dst_r = dst_ptr;
-  size_t ri = 0;
+  int ri = 0;
   for (; ri < (row / C12NUM * C12NUM); ri += C12NUM) {
-    size_t ci = 0;
+    int ci = 0;
     for (; ci < (col / C4NUM * C4NUM); ci += C4NUM) {
       const float *src_c = src_r + ci;
       float *dst_c = dst_r + ci * C12NUM;
@@ -340,7 +340,7 @@ void RowMajor2Col12Major(const float *src_ptr, float *dst_ptr, int row, int col)
     for (; ci < col; ci++) {
       const float *src_c = src_r + ci;
       float *dst_c = dst_r + ci * C12NUM;
-      for (size_t i = 0; i < C12NUM; i++) {
+      for (int i = 0; i < C12NUM; i++) {
         dst_c[i] = src_c[i * col];
       }
     }
@@ -348,16 +348,15 @@ void RowMajor2Col12Major(const float *src_ptr, float *dst_ptr, int row, int col)
     dst_r += C12NUM * col;
   }
   for (; ri < row; ri++, dst_r++, src_r += col) {
-    for (size_t i = 0; i < col; i++) {
+    for (int i = 0; i < col; i++) {
       dst_r[i * C12NUM] = src_r[i];
     }
   }
   for (; ri < UP_ROUND(row, C12NUM); ri++, dst_r++) {
-    for (size_t i = 0; i < col; i++) {
+    for (int i = 0; i < col; i++) {
       dst_r[i * C12NUM] = 0;
     }
   }
   return;
 }

 #ifdef ENABLE_ARM64
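The RowMajor2ColXXMajor family repacks a row-major matrix into row panels that are column-major inside each panel, the layout the GEMM microkernels consume, zero-padding the ragged tail up to UP_ROUND(row, block). A small generic sketch of that layout with BLOCK standing in for C8NUM/C12NUM:

    /* dst panel p holds rows [p*BLOCK, (p+1)*BLOCK); within a panel,
     * column c occupies BLOCK consecutive floats, one per row lane. */
    #define BLOCK 4

    static void PackColMajorPanels(const float *src, float *dst, int row, int col) {
      int total = ((row + BLOCK - 1) / BLOCK) * BLOCK; /* UP_ROUND(row, BLOCK) */
      for (int r = 0; r < total; ++r) {
        for (int c = 0; c < col; ++c) {
          dst[(r / BLOCK) * BLOCK * col + c * BLOCK + r % BLOCK] =
              (r < row) ? src[r * col + c] : 0.0f;
        }
      }
    }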
@@ -532,20 +531,20 @@ void RowMajor2Col8Major_arm32(const float *src_c, float *dst_c, size_t col) {
 #endif
 #endif
 void RowMajor2Col8Major(const float *src_ptr, float *dst_ptr, int row, int col) {
-  size_t row8 = row / C8NUM * C8NUM;
+  int row8 = row / C8NUM * C8NUM;
 #ifdef ENABLE_ARM64
-  size_t col_skip = col / C8NUM * C8NUM;
+  int col_skip = col / C8NUM * C8NUM;
   int skip_size = C8NUM;
 #else
-  size_t col_skip = col / C4NUM * C4NUM;
+  int col_skip = col / C4NUM * C4NUM;
   int skip_size = C4NUM;
 #endif
   const float *src_r = src_ptr;
   float *dst_r = dst_ptr;

-  size_t ri = 0;
+  int ri = 0;
   for (; ri < row8; ri += C8NUM) {
-    size_t ci = 0;
+    int ci = 0;
     for (; ci < col_skip; ci += skip_size) {
       const float *src_c = src_r + ci;
       float *dst_c = dst_r + ci * C8NUM;
@@ -593,7 +592,7 @@ void RowMajor2Col8Major(const float *src_ptr, float *dst_ptr, int row, int col)
     for (; ci < col; ci++) {
       const float *src_c = src_r + ci;
       float *dst_c = dst_r + ci * C8NUM;
-      for (size_t i = 0; i < C8NUM; i++) {
+      for (int i = 0; i < C8NUM; i++) {
         dst_c[i] = src_c[i * col];
       }
     }
@@ -601,29 +600,28 @@ void RowMajor2Col8Major(const float *src_ptr, float *dst_ptr, int row, int col)
     dst_r += C8NUM * col;
   }
   for (; ri < row; ri++, src_r += col, dst_r++) {
-    for (size_t i = 0; i < col; i++) {
+    for (int i = 0; i < col; i++) {
       dst_r[i * C8NUM] = src_r[i];
     }
   }

   for (; ri < UP_ROUND(row, C8NUM); ri++, dst_r++) {
-    for (size_t i = 0; i < col; i++) {
+    for (int i = 0; i < col; i++) {
       dst_r[i * C8NUM] = 0;
     }
   }
   return;
 }

 void RowMajor2Col16Major(const float *src_ptr, float *dst_ptr, int row, int col) {
-  size_t row16 = row / C16NUM * C16NUM;
-  size_t col_skip = col / C4NUM * C4NUM;
+  int row16 = row / C16NUM * C16NUM;
+  int col_skip = col / C4NUM * C4NUM;
   int skip_size = C4NUM;
   const float *src_r = src_ptr;
   float *dst_r = dst_ptr;

-  size_t ri = 0;
+  int ri = 0;
   for (; ri < row16; ri += C16NUM) {
-    size_t ci = 0;
+    int ci = 0;
     for (; ci < col_skip; ci += skip_size) {
       const float *src_c = src_r + ci;
       float *dst_c = dst_r + ci * C16NUM;
@@ -636,7 +634,7 @@ void RowMajor2Col16Major(const float *src_ptr, float *dst_ptr, int row, int col)
     for (; ci < col; ci++) {
       const float *src_c = src_r + ci;
       float *dst_c = dst_r + ci * C16NUM;
-      for (size_t i = 0; i < C16NUM; i++) {
+      for (int i = 0; i < C16NUM; i++) {
         dst_c[i] = src_c[i * col];
       }
     }
@@ -644,21 +642,20 @@ void RowMajor2Col16Major(const float *src_ptr, float *dst_ptr, int row, int col)
     dst_r += C16NUM * col;
   }
   for (; ri < row; ri++) {
-    for (size_t i = 0; i < col; i++) {
+    for (int i = 0; i < col; i++) {
       dst_r[i * C16NUM] = src_r[i];
     }
     src_r += col;
     dst_r += 1;
   }

-  size_t total_row = UP_ROUND(row, C16NUM);
+  int total_row = UP_ROUND(row, C16NUM);
   for (; ri < total_row; ri++) {
-    for (size_t i = 0; i < col; i++) {
+    for (int i = 0; i < col; i++) {
       dst_r[i * C16NUM] = 0;
     }
     dst_r += 1;
   }
   return;
 }

 void RowMajor2Col32Major(const float *src_ptr, float *dst_ptr, int row, int col) {
@@ -680,15 +677,15 @@ void RowMajor2Col32Major(const float *src_ptr, float *dst_ptr, int row, int col)
 }

 void RowMajor2Col6Major(const float *src_ptr, float *dst_ptr, int row, int col) {
-  size_t totalRow = UP_ROUND(row, C6NUM);
-  size_t row6 = row / C6NUM * C6NUM;
-  size_t col8 = col / C8NUM * C8NUM;
+  int totalRow = UP_ROUND(row, C6NUM);
+  int row6 = row / C6NUM * C6NUM;
+  int col8 = col / C8NUM * C8NUM;
   const float *src_r = src_ptr;
   float *dst_r = dst_ptr;

-  size_t ri = 0;
+  int ri = 0;
   for (; ri < row6; ri += C6NUM) {
-    size_t ci = 0;
+    int ci = 0;
     for (; ci < col8; ci += C8NUM) {
       const float *src_c = src_r + ci;
       float *dst_c = dst_r + ci * C6NUM;
@@ -753,7 +750,7 @@ void RowMajor2Col6Major(const float *src_ptr, float *dst_ptr, int row, int col)
     for (; ci < col; ci++) {
       const float *src_c = src_r + ci;
       float *dst_c = dst_r + ci * C6NUM;
-      for (size_t i = 0; i < C6NUM; i++) {
+      for (int i = 0; i < C6NUM; i++) {
        dst_c[i] = src_c[i * col];
       }
     }
@@ -762,7 +759,7 @@ void RowMajor2Col6Major(const float *src_ptr, float *dst_ptr, int row, int col)
   }

   for (; ri < row; ri++) {
-    for (size_t i = 0; i < col; i++) {
+    for (int i = 0; i < col; i++) {
       dst_r[i * C6NUM] = src_r[i];
     }
     src_r += col;
@@ -770,30 +767,29 @@ void RowMajor2Col6Major(const float *src_ptr, float *dst_ptr, int row, int col)
   }

   for (; ri < totalRow; ri++) {
-    for (size_t i = 0; i < col; i++) {
+    for (int i = 0; i < col; i++) {
       dst_r[i * C6NUM] = 0;
     }
     dst_r += 1;
   }
   return;
 }

 void RowMajor2Col4Major(const float *src_ptr, float *dst_ptr, int row, int col) {
-  size_t total_row = UP_ROUND(row, C4NUM);
-  size_t row4 = row / C4NUM * C4NUM;
-  size_t col4 = col / C4NUM * C4NUM;
+  int total_row = UP_ROUND(row, C4NUM);
+  int row4 = row / C4NUM * C4NUM;
+  int col4 = col / C4NUM * C4NUM;
   const float *src_r = src_ptr;
   float *dst_r = dst_ptr;

-  size_t ri = 0;
+  int ri = 0;
   for (; ri < row4; ri += C4NUM) {
-    size_t ci = 0;
+    int ci = 0;
     for (; ci < col4; ci += C4NUM) {
       const float *src_c = src_r + ci;
       float *dst_c = dst_r + ci * C4NUM;

 #ifdef ENABLE_ARM32
-      size_t stride = col * 4;
+      int stride = col * 4;
       asm volatile(
         "mov r10, %[src_c]\n"
         "mov r12, %[dst_c]\n"
@@ -840,8 +836,8 @@ void RowMajor2Col4Major(const float *src_ptr, float *dst_ptr, int row, int col)
       _mm_storeu_ps(dst_c + 8, dst2);
       _mm_storeu_ps(dst_c + 12, dst3);
 #else
-      for (int tr = 0; tr < C4NUM; tr++) {
-        for (int tc = 0; tc < C4NUM; tc++) {
+      for (size_t tr = 0; tr < C4NUM; tr++) {
+        for (size_t tc = 0; tc < C4NUM; tc++) {
           dst_c[tc * C4NUM + tr] = src_c[tr * col + tc];
         }
       }
@@ -850,7 +846,7 @@ void RowMajor2Col4Major(const float *src_ptr, float *dst_ptr, int row, int col)
     for (; ci < col; ci++) {
       const float *src_c = src_r + ci;
       float *dst_c = dst_r + ci * C4NUM;
-      for (size_t i = 0; i < C4NUM; i++) {
+      for (int i = 0; i < C4NUM; i++) {
         dst_c[i] = src_c[i * col];
       }
     }
@@ -858,7 +854,7 @@ void RowMajor2Col4Major(const float *src_ptr, float *dst_ptr, int row, int col)
     dst_r += C4NUM * col;
   }
   for (; ri < row; ri++) {
-    for (size_t i = 0; i < col; i++) {
+    for (int i = 0; i < col; i++) {
       dst_r[i * C4NUM] = src_r[i];
     }
     src_r += col;
@@ -866,12 +862,11 @@ void RowMajor2Col4Major(const float *src_ptr, float *dst_ptr, int row, int col)
   }

   for (; ri < total_row; ri++) {
-    for (size_t i = 0; i < col; i++) {
+    for (int i = 0; i < col; i++) {
       dst_r[i * C4NUM] = 0;
     }
     dst_r += 1;
   }
   return;
 }

 #ifndef ENABLE_ARM
@@ -23,16 +23,22 @@ void Pad(const float *input_data, float *output_data, const int *input_shape, co
   if (thread_num == 0) {
     return;
   }
-  int in[4], out[4];
+  int in[DEFAULT_PAD_NDIMS], out[DEFAULT_PAD_NDIMS];
   for (in[0] = 0; in[0] < input_shape[0]; in[0]++) {
     out[0] = in[0] + paddings[0];
     for (in[1] = tid; in[1] < input_shape[1]; in[1] += thread_num) {
       out[1] = in[1] + paddings[2];
       for (in[2] = 0; in[2] < input_shape[2]; in[2]++) {
         out[2] = in[2] + paddings[4];
-        float *dst = output_data + offset(output_shape, out[0], out[1], out[2], paddings[6]);
-        const float *src = input_data + offset(input_shape, in[0], in[1], in[2], 0);
-        memcpy(dst, src, input_shape[3] * sizeof(float));
+        for (in[3] = 0; in[3] < input_shape[3]; in[3]++) {
+          out[3] = in[3] + paddings[6];
+          for (in[4] = 0; in[4] < input_shape[4]; in[4]++) {
+            out[4] = in[4] + paddings[8];
+            float *dst = output_data + Offset6d(output_shape, out) + paddings[10];
+            const float *src = input_data + Offset6d(input_shape, in);
+            memcpy(dst, src, input_shape[5] * (int)(sizeof(float)));
+          }
+        }
       }
     }
   }
 }
@@ -57,8 +63,7 @@ int TransOut2InputDimIndex(int out_dim_index, int left_pad, int in_dim, int offs

 int GetInputFlattenIndex(int out_flatten_index, const int *input_shape, const PadParameter *pad_param) {
   int in_flatten_index = 0;
-  int i;
-  for (i = 0; i < COMM_SHAPE_SIZE; ++i) {
+  for (int i = 0; i < DEFAULT_PAD_NDIMS; ++i) {
     int left_pad = pad_param->paddings_[i * 2];
     NNACL_CHECK_ZERO_RETURN_ERR(pad_param->out_strides[i])
     int out_dim_index = out_flatten_index / pad_param->out_strides[i];
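Both pad kernels now walk six dimensions, and the paddings array holds a [front, back] pair per axis, so the leading pad of axis d is paddings[2*d]; that is why out[1] = in[1] + paddings[2] and the innermost copy adds paddings[10]. A runnable check of that coordinate mapping:

    #include <stdio.h>

    /* Map an input coordinate to its padded output coordinate using the
     * [front, back] pairs; values here are made up. */
    int main(void) {
      int paddings[12] = {1, 1, 0, 0, 2, 2, 0, 0, 0, 0, 3, 3};
      int in[6] = {0, 4, 7, 0, 0, 5};
      int out[6];
      for (int d = 0; d < 6; ++d) out[d] = in[d] + paddings[2 * d];
      printf("(%d,%d,%d,%d,%d,%d)\n", out[0], out[1], out[2], out[3], out[4], out[5]);
      return 0; /* prints (1,4,9,0,0,8) */
    }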
@@ -510,8 +510,8 @@ int ResizeNearestNeighbor(const float *input_data, float *output_data, const int
       } else {
         input_x = (int)(floorf(actual_x));
       }
-      int in_offset = offset(input_shape, batch, input_y, input_x, 0);
-      int out_offset = offset(output_shape, batch, y, x, 0);
+      int in_offset = Offset(input_shape, batch, input_y, input_x, 0);
+      int out_offset = Offset(output_shape, batch, y, x, 0);
       memcpy(output_data + out_offset, input_data + in_offset, c * sizeof(float));
     }
   }
@@ -20,10 +20,8 @@
 #include "nnacl/nnacl_utils.h"

 int Reverse(const float *input, float *output, size_t elem_size, int *index) {
-  for (int i = 0; i < elem_size; i++) {
+  for (size_t i = 0; i < elem_size; i++) {
     NNACL_ASSERT(index[i] >= 0);
-  }
-  for (int i = 0; i < elem_size; i++) {
     output[index[i]] = input[i];
   }
   return NNACL_OK;
@@ -23,7 +23,7 @@ int DoScatterND(float *output_ptr, const float *update, int *output_unit_offsets
     return NNACL_ERR;
   }
   for (int i = 0; i < num_units; i++) {
    (void)memcpy(output_ptr + output_unit_offsets[i], update + unit_size * i, (size_t)(unit_size) * sizeof(float));
   }
   return NNACL_OK;
 }
@@ -25,7 +25,7 @@ void SpliceFp32(const float *src_data, int src_row, int src_col, const SplicePar
       forward_index++;
       const float *tmp_src_data = src_data + r_off * src_col;
       float *tmp_dst_data = dst_row_data + off * src_col;
-      memcpy(tmp_dst_data, tmp_src_data, src_col * sizeof(float));
+      memcpy(tmp_dst_data, tmp_src_data, (size_t)(src_col) * sizeof(float));
     }
   }
 }
@@ -70,7 +70,7 @@ int DoStridedSliceIntFp64Bool(const void *in_data, void *out_data, StridedSliceP
   if (param->num_axes_ < DIMENSION_8D) {
     PadStridedSliceParameterTo8D(param);
   }
-  size_t dim_offset[DIMENSION_8D - 1];
+  int dim_offset[DIMENSION_8D - 1];
   dim_offset[6] = in_shape[7];
   dim_offset[5] = in_shape[6] * dim_offset[6];
   dim_offset[4] = in_shape[5] * dim_offset[5];
@@ -132,7 +132,7 @@ int DoStridedSlice(const void *in_data, void *out_data, StridedSliceParameter *p
   if (param->num_axes_ < DIMENSION_8D) {
     PadStridedSliceParameterTo8D(param);
   }
-  size_t dim_offset[DIMENSION_8D - 1];
+  int dim_offset[DIMENSION_8D - 1];
   dim_offset[6] = in_shape[7];
   dim_offset[5] = in_shape[6] * dim_offset[6];
   dim_offset[4] = in_shape[5] * dim_offset[5];
@@ -180,15 +180,15 @@ void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_
   int *strides = (int *)(transpose_param->strides_);
   int *out_strides = (int *)(transpose_param->out_strides_);
   int num_axes = transpose_param->num_axes_;
-  size_t data_size = (*out_strides) * output_shape[0];
-  size_t offset_size = UP_DIV(data_size, thread_num);
-  size_t task_offset = offset_size * task_id;
+  int data_size = (*out_strides) * output_shape[0];
+  int offset_size = UP_DIV(data_size, thread_num);
+  int task_offset = offset_size * task_id;
   int count = data_size - task_offset;
   if (count <= 0) {
     return;
   }
   count = MSMIN(offset_size, count);
-  for (size_t idx = task_offset; idx < task_offset + count; ++idx) {
+  for (int idx = task_offset; idx < task_offset + count; ++idx) {
     int pos = idx;
     int output_idx = 0;
     int input_idx = 0;
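The pos / output_idx / input_idx triple in TransposeDimsFp32 decomposes each flat output index into per-axis coordinates via the output strides, then rebuilds the input offset with the permuted strides. A hedged sketch of that decomposition under assumed perm/stride semantics (not the library routine):

    /* coord along output axis i = pos / out_strides[i]; the remainder
     * carries to the next axis; the input offset accumulates the same
     * coordinate scaled by the stride of the permuted source axis. */
    static int PermutedOffset(int pos, const int *out_strides, const int *in_strides,
                              const int *perm, int num_axes) {
      int input_idx = 0;
      for (int i = 0; i < num_axes; ++i) {
        int coord = pos / out_strides[i];
        pos -= coord * out_strides[i];
        input_idx += coord * in_strides[perm[i]];
      }
      return input_idx;
    }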
@@ -45,7 +45,7 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float *
     int dst_plane_offset = c * in_channel;
     for (int ic = 0; ic < ic4; ic++) {
       // clear tmp buffer
-      memset(tmp_data, 0, input_unit * input_unit * C4NUM * sizeof(float));
+      memset(tmp_data, 0, input_unit * input_unit * C4NUM * (int)(sizeof(float)));

       int real_c = in_channel - ic * C4NUM;
       real_c = real_c > C4NUM ? C4NUM : real_c;
@@ -87,7 +87,7 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float *
       // input transform
       const int tile_num = C12NUM;
       int dst_ic4_offset = dst_plane_offset + ic * C4NUM;
-      size_t dst_step = tile_num * in_channel;
+      int dst_step = tile_num * in_channel;
       float *trans_input_ptr = trans_input + dst_ic4_offset;
       func(tmp_data, trans_input_ptr, C4NUM, dst_step, real_c);
     }
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
 #include <math.h>
 #include "nnacl/op_base.h"
 #include "nnacl/fp32/arithmetic_fp32.h"
+#include "nnacl/fp32/exp_fp32.h"
 #include "nnacl/fp32_grad/activation_grad.h"
 #include "nnacl/errorcode.h"

@@ -110,3 +111,27 @@ int GeluGrad(const float *src0, const float *src1, size_t length, float *dst) {
   }
   return NNACL_OK;
 }
+
+int SoftplusGrad(const float *src0, const float *src1, int length, float *dst) {
+  int i = 0;
+#if defined(ENABLE_AVX)
+  for (; i <= length - C8NUM; i += C8NUM) {
+    simd_exp_avx(-(MS_LD256_F32(src1 + i)), dst + i);
+    MS_ST256_F32(dst + i,
+                 MS_DIV256_F32(MS_LD256_F32(src0 + i), MS_ADD256_F32(MS_MOV256_F32(1.0f), MS_LD256_F32(dst + i))));
+  }
+#endif
+
+#if defined(ENABLE_ARM) || defined(ENABLE_SSE)
+  for (; i <= length - C4NUM; i += C4NUM) {
+    simd_exp(MS_SUBQ_F32(MS_MOVQ_F32(0.0f), MS_LDQ_F32(src1 + i)), dst + i);
+    MS_STQ_F32(dst + i, MS_DIVQ_F32(MS_LDQ_F32(src0 + i), MS_ADDQ_F32(MS_MOVQ_F32(1.0f), MS_LDQ_F32(dst + i))));
+  }
+#endif
+
+  for (; i < length; ++i) {
+    single_exp(-src1[i], dst + i);
+    dst[i] = src0[i] / (1.0f + dst[i]);
+  }
+  return NNACL_OK;
+}
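SoftplusGrad exploits that softplus(x) = log(1 + exp(x)) has derivative sigmoid(x) = 1/(1 + exp(-x)): each path stores exp(-x) into dst, then divides the incoming gradient by 1 + dst. The scalar tail loop above in one line:

    #include <math.h>

    /* dx = dy * sigmoid(x), computed the same way as the tail loop. */
    static float SoftplusGradScalar(float dy, float x) {
      return dy / (1.0f + expf(-x));
    }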
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -39,6 +39,7 @@ int HSwishGrad(const float *src0, const float *src1, size_t length, float *dst);
 int HSigmoidGrad(const float *src0, const float *src1, size_t length, float *dst);
 int EluGrad(const float *src0, const float *src1, size_t length, float *dst, float alpha);
 int GeluGrad(const float *src0, const float *src1, size_t length, float *dst);
+int SoftplusGrad(const float *src, const float *src1, int length, float *dst);

 #ifdef __cplusplus
 }
@@ -231,7 +231,7 @@ static void RowMajor2Col12MajorStride(const float *src_ptr, float *dst_ptr, size
     for (; ci < col; ci++) {
       const float *src_c = src_r + ci;
       float *dst_c = dst_r + ci * C12NUM;
-      for (size_t i = 0; i < C12NUM; i++) {
+      for (int i = 0; i < C12NUM; i++) {
         dst_c[i] = src_c[i * lead];
       }
     }
@@ -240,7 +240,7 @@ static void RowMajor2Col12MajorStride(const float *src_ptr, float *dst_ptr, size
   }

   for (; ri < row; ri++) {
-    for (size_t i = 0; i < col; i++) {
+    for (int i = 0; i < col; i++) {
       dst_r[i * C12NUM] = src_r[i];
     }
     src_r += lead;
@@ -248,12 +248,11 @@ static void RowMajor2Col12MajorStride(const float *src_ptr, float *dst_ptr, size
   }

   for (; ri < row_up_12; ri++) {
-    for (size_t i = 0; i < col; i++) {
+    for (int i = 0; i < col; i++) {
       dst_r[i * C12NUM] = 0;
     }
     dst_r += 1;
   }
   return;
 }
 #endif

@@ -261,10 +260,10 @@ static void RowMajor2Col8MajorStride(const float *src_ptr, float *dst_ptr, size_
   size_t row8 = row / C8NUM * C8NUM;
 #ifdef ENABLE_ARM64
   size_t col_skip = col / C8NUM * C8NUM;
-  int skip_size = C8NUM;
+  size_t skip_size = C8NUM;
 #else
   size_t col_skip = col / C4NUM * C4NUM;
-  int skip_size = C4NUM;
+  size_t skip_size = C4NUM;
 #endif
   const float *src_r = src_ptr;
   float *dst_r = dst_ptr;
@@ -450,7 +449,7 @@ static void RowMajor2Col8MajorStride(const float *src_ptr, float *dst_ptr, size_
     for (; ci < col; ci++) {
       const float *src_c = src_r + ci;
       float *dst_c = dst_r + ci * C8NUM;
-      for (size_t i = 0; i < C8NUM; i++) {
+      for (int i = 0; i < C8NUM; i++) {
         dst_c[i] = src_c[i * lead];
       }
     }
@@ -458,7 +457,7 @@ static void RowMajor2Col8MajorStride(const float *src_ptr, float *dst_ptr, size_
     dst_r += C8NUM * col;
   }
   for (; ri < row; ri++) {
-    for (size_t i = 0; i < col; i++) {
+    for (int i = 0; i < col; i++) {
       dst_r[i * C8NUM] = src_r[i];
     }
     src_r += lead;
@@ -64,11 +64,11 @@ void ReduceSumByAxes(const float *input, const int *input_dims, float *output, c
     if (output_dims[idx] != input_dims[idx]) same_shape = 0;
   }
   if (same_shape) {
-    memcpy(output, input, num_outputs * sizeof(float));
+    memcpy(output, input, (size_t)(num_outputs) * sizeof(float));
     return;
   }

-  memset(output, 0, num_outputs * sizeof(float));  // zero output
+  memset(output, 0, (size_t)(num_outputs) * sizeof(float));  // zero output

   int input_iter[8] = {0};
   int axes[5] = {0};
@@ -37,13 +37,13 @@ void SoftmaxGrad(const float *input_ptr, const float *yt_ptr, float *output_ptr,
   for (int i = 0; i < inner_size * input_shape[axis]; i++) sum_mul[i] = 1.0;
   for (int i = 0; i < n_dim; i++) dim *= input_shape[i];
   dim /= outter_size;
-  memcpy(output_ptr, yt_ptr, ele_size * sizeof(float));
+  memcpy(output_ptr, yt_ptr, (size_t)(ele_size) * sizeof(float));

   const int M = input_shape[axis];
   const int N = inner_size;
   for (int i = 0; i < outter_size; i++) {
     int outter_offset = i * dim;
-    memset(sum_data, 0.0f, inner_size * sizeof(float));
+    memset(sum_data, 0, (size_t)(inner_size) * sizeof(float));
     for (int k = 0; k < inner_size; k++) {
       int inner_offset = outter_offset + k;
       for (int j = 0; j < input_shape[axis]; j++) {
@@ -20,7 +20,7 @@
 static size_t CalcIndex(const int *shape, size_t size, int i, size_t pos) {
   size_t res = 1;
   for (size_t j = 0; j < size; j++) {
-    res *= shape[(i + 1) + j];
+    res *= shape[((size_t)(i) + 1) + j];
   }
   return (pos / res % shape[i]);
 }
@@ -37,7 +37,7 @@ int DoStridedSliceGrad(const float *inputs, float *output, const int *dx_shape,
   const int *s = param->strides_;
   const int *b = param->begins_;
   for (int i = 0; i < DIMENSION_8D; i++) {
-    size *= param->in_shape_[i];
+    size *= (size_t)(param->in_shape_[i]);
   }

   for (size_t pos = 0; pos < size; pos++) {
@@ -56,13 +56,13 @@ int AddnInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **o
   for (size_t d = 0; d < inputs[max_dims_idx]->shape_size_; ++d) {
     size_t max_dim = 0;
     for (size_t i = 0; i < inputs_size; ++i) {
-      size_t shift = max_dims - inputs[i]->shape_size_;
-      size_t dim = (i < shift) ? 1 : inputs[i]->shape_[d];
+      size_t shift = max_dims - (size_t)(inputs[i]->shape_size_);
+      size_t dim = (i < shift) ? 1 : (size_t)(inputs[i]->shape_[d]);
       if (dim > max_dim) {
         max_dim = dim;
       }
     }
-    output->shape_[d] = max_dim;  // set the biggest dimension in the output tensor
+    output->shape_[d] = (int)(max_dim);  // set the biggest dimension in the output tensor
   }

   return NNACL_OK;
@@ -17,8 +17,8 @@
 #include "nnacl/infer/affine_infer.h"
 #include "nnacl/infer/infer_register.h"

-int MatmulInfer(AffineParameter *param, int a_shape[MAX_SHAPE_SIZE], size_t a_shape_size, int b_shape[MAX_SHAPE_SIZE],
-                size_t b_shape_size) {
+int MatmulInfer(const AffineParameter *param, int a_shape[MAX_SHAPE_SIZE], size_t a_shape_size,
+                int b_shape[MAX_SHAPE_SIZE], size_t b_shape_size) {
   MatMulParameter *matmul_param = param->matmul_parameter_;
   if (matmul_param->a_transpose_) {
     if (a_shape_size < 2) {
@@ -56,8 +56,8 @@ int ArgMinMaxInferShape(const TensorC *const *inputs, size_t inputs_size, Tensor
   int output_shape[MAX_SHAPE_SIZE] = {0};
   size_t output_shape_size = 0;
   ShapeSet(output_shape, &output_shape_size, input->shape_, input->shape_size_);
-  size_t input_shape_size = input->shape_size_;
-  int axis = param->axis_ < 0 ? param->axis_ + (int)input_shape_size : param->axis_;
+  int input_shape_size = (int)input->shape_size_;
+  int axis = param->axis_ < 0 ? param->axis_ + input_shape_size : param->axis_;
   if (axis >= input_shape_size || axis < 0) {
     return NNACL_PARAM_INVALID;
   }
@@ -55,10 +55,10 @@ int ArithmeticGradInferShape(const TensorC *const *inputs, size_t inputs_size, T

   if (GetElementNum(dx1) < GetElementNum(dx2)) {
     param->ndim_ = in_shape1_size;
-    param->in_elements_num0_ = param->ndim_;
-    param->in_elements_num1_ = param->ndim_;
-    param->out_elements_num_ = param->ndim_;
-    int fill_dim_num = in_shape1_size - in_shape0_size;  // This will not work for batch!
+    param->in_elements_num0_ = (int)param->ndim_;
+    param->in_elements_num1_ = (int)param->ndim_;
+    param->out_elements_num_ = (int)param->ndim_;
+    size_t fill_dim_num = in_shape1_size - in_shape0_size;  // This will not work for batch!
     int j = 0;
     for (unsigned int i = 0; i < in_shape1_size; i++) {
       if (i < fill_dim_num) {
@@ -76,7 +76,7 @@ int ArithmeticGradInferShape(const TensorC *const *inputs, size_t inputs_size, T
     param->out_elements_num_ = param->ndim_;
     param->broadcasting_ = true;
     int j = 0;
-    int fill_dim_num = in_shape0_size - in_shape1_size;
+    size_t fill_dim_num = in_shape0_size - in_shape1_size;
     for (unsigned int i = 0; i < in_shape0_size; i++) {
       if (i < fill_dim_num) {
         param->in_shape1_[i] = 1;
@@ -66,7 +66,7 @@ int AudioSpectrogramInferShape(const TensorC *const *inputs, size_t inputs_size,
   int sample_sub_window = input->shape_[0] - param->window_size_;
   output_shape[1] = sample_sub_window < 0 ? 0 : 1 + sample_sub_window / param->stride_;
   // compute fft length
-  int fft_length = GetFftLength(param->window_size_);
+  int fft_length = (int)GetFftLength(param->window_size_);
   output_shape[2] = fft_length / 2 + 1;
   SetShapeArray(output, output_shape, 3);
   return NNACL_OK;
@@ -33,8 +33,8 @@ int BiasGradInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC
   int inshape[MAX_SHAPE_SIZE];
   size_t inshape_size = 0;
   ShapeSet(inshape, &inshape_size, in0->shape_, in0->shape_size_);
-  int ndim = inshape_size;
-  for (int i = 0; i < ndim - 1; i++) {
+  size_t ndim = inshape_size;
+  for (size_t i = 0; i < ndim - 1; i++) {
     inshape[i] = 1;
   }
   SetDataTypeFormat(out, in0);
@@ -111,12 +111,12 @@ int BroadcastToInferShape(const TensorC *const *inputs, size_t inputs_size, Tens
   const int *input_shape = input->shape_;
   size_t input_shape_size = input->shape_size_;
   int shape[MAX_SHAPE_SIZE];
-  int input_shape_index = input_shape_size - 1;
+  int input_shape_index = (int)(input_shape_size)-1;
   if (input_shape_size > dst_shape_size) {
     return NNACL_ERR;
   }

-  for (int i = dst_shape_size - 1; i >= 0; --i) {
+  for (int i = (int)(dst_shape_size)-1; i >= 0; --i) {
     if (dst_shape[i] < 0) {
       return NNACL_ERR;
     }
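BroadcastToInferShape walks the input and destination shapes right-to-left: a dimension is compatible when it matches the target or equals 1. A hedged sketch of that rule as a standalone check (not the library routine):

    /* Returns 1 when in[] can broadcast to dst[]; the input must be
     * fully consumed by the right-aligned walk. */
    static int BroadcastCompatible(const int *in, int in_size, const int *dst, int dst_size) {
      int i = in_size - 1, j = dst_size - 1;
      for (; i >= 0 && j >= 0; --i, --j) {
        if (in[i] != dst[j] && in[i] != 1) {
          return 0;
        }
      }
      return i < 0;
    }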
@@ -18,6 +18,7 @@
 #include <string.h>
 #include "nnacl/infer/infer_register.h"

+#ifdef ENABLE_CONTROL_TENSORLIST
 int MallocTensorListData(TensorListC *tensor_list, TypeIdC dtype, const vvector *tensor_shape) {
   // This function will create a new tensors_
   // Your must to set shape(param2: tensor_shape) and data_type_(tensors_data_type_ = param1: dtype) of each tensor in
@@ -35,7 +36,7 @@ int MallocTensorListData(TensorListC *tensor_list, TypeIdC dtype, const vvector
     return NNACL_NULL_PTR;
   }
   memset(tensor_list->tensors_, 0, tensor_list->element_num_ * sizeof(TensorC));
-  for (int i = 0; i < tensor_list->element_num_; ++i) {
+  for (size_t i = 0; i < tensor_list->element_num_; ++i) {
     tensor_list->tensors_[i].format_ = Format_NHWC;
     tensor_list->tensors_[i].data_type_ = dtype;
     ShapeSet(tensor_list->tensors_[i].shape_, &(tensor_list->tensors_[i].shape_size_), tensor_shape->shape_[i],
@@ -69,6 +70,7 @@ bool TensorListIsFullyDefined(const int *shape, size_t shape_size) {
   }
   return true;
 }
+#endif

 int CheckAugmentNull(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
                      const OpParameter *parameter) {
@@ -157,7 +159,7 @@ void SetShapeTensor(TensorC *dst, const TensorC *src) {
 }

 void SetShapeArray(TensorC *dst, const int *src, size_t src_size) {
-  for (size_t i = 0; i < src_size; i++) {
+  for (size_t i = 0; i < src_size && i < MAX_SHAPE_SIZE; i++) {
     dst->shape_[i] = src[i];
   }
   dst->shape_size_ = src_size;
@@ -286,13 +288,17 @@ int GetDimensionSize(const TensorC *tensor, const size_t index) {
 }

 void ShapeSet(int *dst_shape, size_t *dst_shape_size, const int *src_shape, size_t src_shape_size) {
-  for (size_t i = 0; i < src_shape_size; i++) {
+  size_t i = 0;
+  for (; i < src_shape_size && i < MAX_SHAPE_SIZE; i++) {
     dst_shape[i] = src_shape[i];
   }
-  *dst_shape_size = src_shape_size;
+  *dst_shape_size = i;
 }

 void ShapePush(int *shape, size_t *shape_size, int value) {
+  if (*shape_size >= MAX_SHAPE_SIZE) {
+    return;
+  }
   shape[*shape_size] = value;
   *shape_size = *shape_size + 1;
 }
@@ -301,6 +307,9 @@ int ShapeInsert(int *shape, size_t *shape_size, int index, int value) {
   if (index < 0 || index > *shape_size) {
     return NNACL_ERR;
   }
+  if (*shape_size >= MAX_SHAPE_SIZE) {
+    return NNACL_ERR;
+  }
   for (int i = *shape_size; i > index; i--) {
     shape[i] = shape[i - 1];
   }
@@ -325,7 +334,7 @@ bool ShapeEqual(const int *shape0, size_t shape0_size, const int *shape1, size_t
   if (shape0_size != shape1_size) {
     return false;
   }
-  for (int i = 0; i < shape0_size; i++) {
+  for (size_t i = 0; i < shape0_size; i++) {
     if (shape0[i] != shape1[i]) {
       return false;
     }
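These guards clamp every shape write to MAX_SHAPE_SIZE so an oversized source can no longer overflow the fixed arrays, and ShapeSet now reports how many entries it actually copied. A runnable sketch of the clamped-copy semantics:

    #include <stdio.h>

    #define MAX_SHAPE_SIZE 8 /* matches the nnacl constant */

    /* Same behavior as the patched ShapeSet: the destination size
     * reflects the copied count, not the requested one. */
    static void ShapeSetClamped(int *dst, size_t *dst_size, const int *src, size_t src_size) {
      size_t i = 0;
      for (; i < src_size && i < MAX_SHAPE_SIZE; i++) dst[i] = src[i];
      *dst_size = i;
    }

    int main(void) {
      int src[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; /* 10 > MAX_SHAPE_SIZE */
      int dst[MAX_SHAPE_SIZE];
      size_t n;
      ShapeSetClamped(dst, &n, src, 10);
      printf("copied %zu entries\n", n); /* copied 8 entries */
      return 0;
    }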
@@ -401,96 +410,6 @@ int FftInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **ou
   return NNACL_OK;
 }

-int VectorCInit(VectorC *vc, size_t per_malloc_size) {
-  if (per_malloc_size == 0) {
-    return NNACL_ERR;
-  }
-  vc->data_ = (int *)malloc(per_malloc_size * sizeof(int));
-  if (vc->data_ == NULL) {
-    return NNACL_ERR;
-  }
-  vc->size_ = 0;
-  vc->max_size_ = per_malloc_size;
-  vc->per_malloc_size_ = per_malloc_size;
-  return NNACL_OK;
-}
-
-int VectorCSet(VectorC *vc, const int *src_shape, size_t src_shape_size) {
-  if (src_shape_size == 0) {
-    vc->size_ = 0;
-  } else {
-    free(vc->data_);
-    if (vc->per_malloc_size_ == 0) {
-      return NNACL_ERR;
-    }
-    vc->max_size_ = (src_shape_size / vc->per_malloc_size_ + 1) * vc->per_malloc_size_;
-    vc->data_ = (int *)malloc(sizeof(int) * vc->max_size_);
-    if (vc->data_ == NULL) {
-      return NNACL_ERR;
-    }
-    for (size_t i = 0; i < src_shape_size; i++) {
-      vc->data_[i] = src_shape[i];
-    }
-    vc->size_ = src_shape_size;
-  }
-  return NNACL_OK;
-}
-
-int VectorCPush(VectorC *vc, int value) {
-  if (vc->size_ + 1 > vc->max_size_) {
-    int *tmp = (int *)malloc(vc->per_malloc_size_ * sizeof(int) + vc->max_size_ * sizeof(int));
-    if (tmp == NULL) {
-      return NNACL_ERR;
-    }
-    memcpy(tmp, vc->data_, vc->size_ * sizeof(int));
-    free(vc->data_);
-    vc->data_ = tmp;
-    vc->max_size_ = vc->max_size_ + vc->per_malloc_size_;
-  }
-  vc->data_[vc->size_] = value;
-  vc->size_++;
-  return NNACL_OK;
-}
-
-int VectorCInsert(VectorC *vc, int index, int value) {
-  if (vc->size_ + 1 > vc->max_size_) {
-    int *tmp = (int *)malloc(vc->per_malloc_size_ * sizeof(int) + vc->max_size_ * sizeof(int));
-    if (tmp == NULL) {
-      return NNACL_ERR;
-    }
-    memcpy(tmp, vc->data_, vc->size_ * sizeof(int));
-    free(vc->data_);
-    vc->data_ = tmp;
-    vc->max_size_ = vc->max_size_ + vc->per_malloc_size_;
-  }
-  memmove(vc->data_ + index + 1, vc->data_ + index, (vc->size_ - index) * sizeof(int));
-  vc->data_[index] = value;
-  vc->size_++;
-  return NNACL_OK;
-}
-
-void VectorCErase(VectorC *vc, int index) {
-  memmove(vc->data_ + index, vc->data_ + index + 1, (vc->size_ - index - 1) * sizeof(int));
-  vc->size_--;
-}
-
-bool VectorCEqual(const VectorC *vc1, const VectorC *vc2) {
-  if (vc1->size_ != vc2->size_) {
-    return false;
-  }
-  for (size_t i = 0; i < vc1->size_; i++) {
-    if (vc1->data_[i] != vc2->data_[i]) {
-      return false;
-    }
-  }
-  return true;
-}
-
-void VectorCFree(VectorC *vc) {
-  free(vc->data_);
-  vc->data_ = NULL;
-}
-
 bool InferFlag(const TensorC *const *inputs, size_t inputs_size) {
   if (inputs == NULL) {
     return false;
@@ -499,18 +418,22 @@ bool InferFlag(const TensorC *const *inputs, size_t inputs_size) {
     if (inputs[i] == NULL) {
      return false;
     }
+#ifdef ENABLE_CONTROL_TENSORLIST
     if (inputs[i]->data_type_ == kObjectTypeTensorType) {
       TensorListC *input_tensor_list = (TensorListC *)inputs[i];
       if (input_tensor_list->shape_value_ == -1) {
         return false;
       }
     } else {
+#endif
       for (size_t j = 0; j < inputs[i]->shape_size_; ++j) {
         if (inputs[i]->shape_[j] == -1) {
           return false;
         }
       }
+#ifdef ENABLE_CONTROL_TENSORLIST
     }
+#endif
   }
   return true;
 }
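With the guards in place, the tensorlist branch compiles only when ENABLE_CONTROL_TENSORLIST is defined, while the unconditional contract stays the same: inference proceeds only once every input dimension is known (no -1 entries). A simplified stand-in, not the real TensorC layout:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    // Simplified stand-in for TensorC: only the fields InferFlag inspects.
    typedef struct {
      int shape_[8];
      size_t shape_size_;
    } ToyTensor;

    // Mirrors the non-tensorlist branch above: inference is possible only
    // when every dimension of every input is already known.
    static bool ToyInferFlag(const ToyTensor *inputs, size_t n) {
      for (size_t i = 0; i < n; ++i) {
        for (size_t j = 0; j < inputs[i].shape_size_; ++j) {
          if (inputs[i].shape_[j] == -1) {
            return false;
          }
        }
      }
      return true;
    }

    int main(void) {
      ToyTensor known = {{1, 224, 224, 3}, 4};
      ToyTensor dynamic = {{-1, 224, 224, 3}, 4};  // batch dim still unknown
      printf("%d %d\n", ToyInferFlag(&known, 1), ToyInferFlag(&dynamic, 1));  // 1 0
      return 0;
    }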
@@ -138,6 +138,7 @@ typedef struct vvector {
   size_t size_;  // number of shapes
 } vvector;

+#ifdef ENABLE_CONTROL_TENSORLIST
 typedef struct TensorListC {
   bool is_ready_;
   int data_type_;
@@ -150,6 +151,7 @@ typedef struct TensorListC {
   size_t element_shape_size_;
   TensorC *tensors_;
 } TensorListC;
+#endif

 typedef struct VectorC {
   int *data_;
@@ -158,9 +160,11 @@ typedef struct VectorC {
   size_t per_malloc_size_;
 } VectorC;

+#ifdef ENABLE_CONTROL_TENSORLIST
 int MallocTensorListData(TensorListC *tensor_list, TypeIdC dtype, const vvector *tensor_shape);
 int TensorListMergeShape(int *element_shape, size_t *element_shape_size, const int *tmp, size_t tmp_size);
 bool TensorListIsFullyDefined(const int *shape, size_t shape_size);
+#endif

 int GetBatch(const TensorC *tensor);
 int GetHeight(const TensorC *tensor);
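These header guards match the #ifdef/#endif pair added around the definitions in the .c file: with ENABLE_CONTROL_TENSORLIST off, TensorListC and its three helpers vanish from the API. A tiny illustrative sketch of the build-flag behavior (compile with or without -DENABLE_CONTROL_TENSORLIST):

    #include <stdio.h>

    #ifdef ENABLE_CONTROL_TENSORLIST
    static const char *kMode = "tensorlist support compiled in";
    #else
    static const char *kMode = "tensorlist support compiled out";
    #endif

    int main(void) {
      // The guarded declarations above appear or disappear per build,
      // exactly like this message flips with the flag.
      printf("%s\n", kMode);
      return 0;
    }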
@@ -202,13 +206,6 @@ int CommonInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC *
 int FftInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
                   const OpParameter *parameter);

-int VectorCInit(VectorC *vc, size_t per_malloc_size);
-int VectorCSet(VectorC *vc, const int *src_shape, size_t src_shape_size);
-int VectorCPush(VectorC *vc, int value);
-int VectorCInsert(VectorC *vc, int index, int value);
-void VectorCErase(VectorC *vc, int index);
-bool VectorCEqual(const VectorC *vc1, const VectorC *vc2);
-void VectorCFree(VectorC *vc);
 bool InferFlag(const TensorC *const *inputs, size_t inputs_size);

 #ifdef __cplusplus
@@ -54,8 +54,13 @@ int ConcatInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC *
   }
   int output_axis_dim = input0_shape[axis];
   for (size_t i = 1; i < inputs_size; ++i) {
-    if (inputs[i]->shape_size_ != input0_shape_size) {
-      return NNACL_PARAM_INVALID;
+    size_t input_i_shape_size = inputs[i]->shape_size_;
+    if (input_i_shape_size != input0_shape_size) {
+      if (input_i_shape_size != 0) {
+        return NNACL_PARAM_INVALID;
+      } else {
+        continue;
+      }
     }
     int shape_tmp[MAX_SHAPE_SIZE] = {0};
     size_t shape_tmp_size = 0;
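The rewritten check stops treating rank-0 inputs as a hard error: an input whose shape is still empty (shape_size_ == 0) is skipped instead of failing the whole ConcatInferShape call. A small sketch of the three outcomes, using a hypothetical helper distilled from the hunk:

    #include <stddef.h>
    #include <stdio.h>

    // Mirrors the patched check: 0 = use input, 1 = skip it, -1 = invalid.
    static int ClassifyConcatInput(size_t input_rank, size_t expected_rank) {
      if (input_rank == expected_rank) return 0;
      if (input_rank == 0) return 1;   // empty shape: skip instead of failing
      return -1;                       // genuine rank mismatch
    }

    int main(void) {
      printf("%d\n", ClassifyConcatInput(4, 4));  // 0: participates in concat
      printf("%d\n", ClassifyConcatInput(0, 4));  // 1: skipped (was an error before)
      printf("%d\n", ClassifyConcatInput(3, 4));  // -1: NNACL_PARAM_INVALID
      return 0;
    }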
@@ -37,7 +37,7 @@ int ConstantOfShapeInferShape(const TensorC *const *inputs, size_t inputs_size,
     return NNACL_ERR;
   }
   int out_shape[MAX_SHAPE_SIZE];
-  size_t out_shape_size = size;
+  int out_shape_size = size;
   switch (in_tensor->data_type_) {
     case kNumberTypeInt32: {
       int32_t *in_data = (int32_t *)(in_tensor->data_);
@@ -34,7 +34,10 @@ int Conv2dGradFilterInferShape(const TensorC *const *inputs, size_t inputs_size,
   if (inputs[2]->shape_size_ < 1 || inputs[2]->data_ == NULL) {
     return NNACL_ERR;
   }
-  size_t filter_shape_size = inputs[2]->shape_[0];
+  if (inputs[2]->shape_[0] < 0) {
+    return NNACL_ERR;
+  }
+  size_t filter_shape_size = (size_t)(inputs[2]->shape_[0]);
   if (filter_shape_size != 4) {
     return NNACL_ERR;
   }
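The added branch is the usual guard when a signed dimension feeds a size_t: reject negative values before casting, since the cast alone wraps silently. A standalone illustration with hypothetical values:

    #include <stddef.h>
    #include <stdio.h>

    int main(void) {
      int dim = -1;  // e.g. a dimension that was never inferred
      // An unchecked cast wraps to a huge unsigned value instead of
      // signaling an error:
      printf("%zu\n", (size_t)dim);
      // The patched pattern validates the sign first, then casts:
      if (dim < 0) {
        printf("reject negative dimension\n");
        return 1;
      }
      printf("%zu\n", (size_t)dim);
      return 0;
    }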
@@ -40,16 +40,16 @@ int Conv2dGradInputInferShape(const TensorC *const *inputs, size_t inputs_size,
   if (inputs[2]->shape_size_ < 1 || inputs[2]->data_ == NULL) {
     return NNACL_ERR;
   }
-  size_t shape_size = inputs[2]->shape_[0];
-  if (shape_size != 4) {
+  size_t data_size = (size_t)inputs[2]->shape_[0];
+  if (data_size != 4) {
     return NNACL_ERR;
   }
   int shape[MAX_SHAPE_SIZE];
   const int nchw2nhwc[4] = {0, 2, 3, 1};
-  for (int i = 0; i < shape_size; i++) {
+  for (size_t i = 0; i < data_size; i++) {
     shape[i] = *((int *)(inputs[2]->data_) + nchw2nhwc[i]);
   }
-  SetShapeArray(out, shape, shape_size);
+  SetShapeArray(out, shape, data_size);

   return NNACL_OK;
 }