Merge branch 'master' of https://gitee.com/mindspore/mindspore into export

2021-01-18 09:13:57 +02:00 · 2021-01-18 09:13:57 +02:00 · 53ace98343
parent 33d7741904 b988780fd7
commit 53ace98343
66 changed files with 1692 additions and 433 deletions
--- a/build.bat
+++ b/build.bat
@ -78,6 +78,7 @@ IF NOT EXIST "%BUILD_PATH%/mindspore" (

 cd %BUILD_PATH%/mindspore
 IF "%1%" == "lite" (
+    (git log -1 | findstr "^commit") > %BUILD_PATH%\.commit_id
    cmake -DPLATFORM_ARM64=off -DSUPPORT_TRAIN=off ^
    -DENABLE_TOOLS=on -DENABLE_CONVERTER=on -DBUILD_TESTCASES=off ^
    -DCMAKE_BUILD_TYPE=Release -DSUPPORT_GPU=off -DBUILD_MINDDATA=off -DOFFLINE_COMPILE=off ^
--- a/build.sh
+++ b/build.sh
@ -510,6 +510,11 @@ get_version() {
    VERSION_STR=${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_REVISION}
 }

+write_commit_file() {  # Record HEAD's "commit <sha>" header line in the lite build dir as .commit_id.
+    COMMIT_STR=$(git log -1 | grep "^commit")  # Anchored so indented message lines mentioning "commit" are skipped.
+    echo "${COMMIT_STR}" > "${BASEPATH}/mindspore/lite/build/.commit_id"
+}
+
 build_lite()
 {
    get_version
@ -542,6 +547,7 @@ build_lite()
    fi
    mkdir -pv build
    cd build
+    write_commit_file
    BUILD_TYPE="Release"
    if [[ "${DEBUG_MODE}" == "on" ]]; then
      BUILD_TYPE="Debug"
--- a/cmake/package_lite.cmake
+++ b/cmake/package_lite.cmake
@ -3,6 +3,8 @@ include(CMakePackageConfigHelpers)
 set(RUNTIME_PKG_NAME ${MAIN_DIR}-${RUNTIME_COMPONENT_NAME})
 set(CONVERTER_PKG_NAME ${MAIN_DIR}-${CONVERTER_COMPONENT_NAME})

+set(RUNTIME_ROOT_DIR ${RUNTIME_PKG_NAME}/)
+set(CONVERTER_ROOT_DIR ${CONVERTER_PKG_NAME}/)
 set(RUNTIME_LIB_DIR ${RUNTIME_PKG_NAME}/lib)
 set(RUNTIME_INC_DIR ${RUNTIME_PKG_NAME}/include)
 set(CONVERTER_LIB_DIR ${CONVERTER_PKG_NAME}/lib)
@ -17,154 +19,230 @@ set(MIND_DATA_LIB_DIR ${RUNTIME_PKG_NAME}/minddata/lib)

 set(LIB_DIR_RUN_X86 ${RUNTIME_PKG_NAME}/lib)

-if (BUILD_MINDDATA STREQUAL "full" OR BUILD_MINDDATA STREQUAL "wrapper")
-    install(DIRECTORY ${TOP_DIR}/mindspore/ccsrc/minddata/dataset/include/ DESTINATION ${MIND_DATA_INC_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "vision.h" EXCLUDE)
-    if (PLATFORM_ARM64)
+if(BUILD_MINDDATA STREQUAL "full" OR BUILD_MINDDATA STREQUAL "wrapper")
+    install(DIRECTORY ${TOP_DIR}/mindspore/ccsrc/minddata/dataset/include/ DESTINATION ${MIND_DATA_INC_DIR}
+            COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "vision.h" EXCLUDE)
+    if(PLATFORM_ARM64)
        file(GLOB JPEGTURBO_LIB_LIST ${jpeg_turbo_LIBPATH}/*.so)
-        install(FILES ${TOP_DIR}/mindspore/lite/build/minddata/libminddata-lite.so DESTINATION ${MIND_DATA_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
+        install(FILES ${TOP_DIR}/mindspore/lite/build/minddata/libminddata-lite.so DESTINATION ${MIND_DATA_LIB_DIR}
+                COMPONENT ${RUNTIME_COMPONENT_NAME})
        install(FILES ${JPEGTURBO_LIB_LIST} DESTINATION ${TURBO_DIR}/lib COMPONENT ${RUNTIME_COMPONENT_NAME})
-    elseif (PLATFORM_ARM32)
+    elseif(PLATFORM_ARM32)
        file(GLOB JPEGTURBO_LIB_LIST ${jpeg_turbo_LIBPATH}/*.so)
-        install(FILES ${TOP_DIR}/mindspore/lite/build/minddata/libminddata-lite.so DESTINATION ${MIND_DATA_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
+        install(FILES ${TOP_DIR}/mindspore/lite/build/minddata/libminddata-lite.so DESTINATION ${MIND_DATA_LIB_DIR}
+                COMPONENT ${RUNTIME_COMPONENT_NAME})
        install(FILES ${JPEGTURBO_LIB_LIST} DESTINATION ${TURBO_DIR}/lib COMPONENT ${RUNTIME_COMPONENT_NAME})
-    else ()
-        install(FILES ${TOP_DIR}/mindspore/lite/build/minddata/libminddata-lite.so DESTINATION ${MIND_DATA_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
-        install(FILES ${jpeg_turbo_LIBPATH}/libjpeg.so.62.3.0 DESTINATION ${TURBO_DIR}/lib  RENAME libjpeg.so.62 COMPONENT ${RUNTIME_COMPONENT_NAME})
-        install(FILES ${jpeg_turbo_LIBPATH}/libturbojpeg.so.0.2.0 DESTINATION ${TURBO_DIR}/lib RENAME libturbojpeg.so.0 COMPONENT ${RUNTIME_COMPONENT_NAME})
-    endif ()
-endif ()
-
-if (BUILD_MINDDATA STREQUAL "lite")
-    install(DIRECTORY ${TOP_DIR}/mindspore/ccsrc/minddata/dataset/include/ DESTINATION ${MIND_DATA_INC_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h")
-    if (PLATFORM_ARM64)
-        install(FILES ${TOP_DIR}/mindspore/lite/build/minddata/libminddata-lite.so DESTINATION ${MIND_DATA_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
-        install(FILES ${TOP_DIR}/third_party/libjpeg-turbo/lib/libjpeg.so DESTINATION ${TURBO_DIR}/lib COMPONENT ${RUNTIME_COMPONENT_NAME})
-        install(FILES ${TOP_DIR}/third_party/libjpeg-turbo/lib/libturbojpeg.so DESTINATION ${TURBO_DIR}/lib COMPONENT ${RUNTIME_COMPONENT_NAME})
-    elseif (PLATFORM_ARM32)
-        install(FILES ${TOP_DIR}/mindspore/lite/build/minddata/libminddata-lite.so DESTINATION ${MIND_DATA_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
-        install(FILES ${TOP_DIR}/third_party/libjpeg-turbo/lib/libjpeg.so DESTINATION ${TURBO_DIR}/lib COMPONENT ${RUNTIME_COMPONENT_NAME})
-        install(FILES ${TOP_DIR}/third_party/libjpeg-turbo/lib/libturbojpeg.so DESTINATION ${TURBO_DIR}/lib COMPONENT ${RUNTIME_COMPONENT_NAME})
-    else ()
-        install(FILES ${TOP_DIR}/mindspore/lite/build/minddata/libminddata-lite.so DESTINATION ${MIND_DATA_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
-        install(FILES ${TOP_DIR}/third_party/libjpeg-turbo/lib/libjpeg.so.62.3.0 DESTINATION ${TURBO_DIR}/lib RENAME libjpeg.so.62 COMPONENT ${RUNTIME_COMPONENT_NAME})
-        install(FILES ${TOP_DIR}/third_party/libjpeg-turbo/lib/libturbojpeg.so.0.2.0 DESTINATION ${TURBO_DIR}/lib RENAME libturbojpeg.so.0 COMPONENT ${RUNTIME_COMPONENT_NAME})
-    endif ()
-endif ()
-
-if (BUILD_MINDDATA STREQUAL "lite_cv")
-    if (PLATFORM_ARM64)
-        install(DIRECTORY ${TOP_DIR}/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv DESTINATION ${MIND_DATA_INC_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h")
-        install(FILES ${TOP_DIR}/mindspore/lite/build/minddata/libminddata-lite.so DESTINATION ${MIND_DATA_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
-    elseif (PLATFORM_ARM32)
-        install(DIRECTORY ${TOP_DIR}/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv DESTINATION ${MIND_DATA_INC_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h")
-        install(FILES ${TOP_DIR}/mindspore/lite/build/minddata/libminddata-lite.so DESTINATION ${MIND_DATA_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
-    else ()
-        install(DIRECTORY ${TOP_DIR}/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv DESTINATION ${MIND_DATA_INC_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h")
-        install(FILES ${TOP_DIR}/mindspore/lite/build/minddata/libminddata-lite.so DESTINATION ${MIND_DATA_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
-    endif ()
-endif ()
-
-if (PLATFORM_ARM64)
-    if (SUPPORT_NPU)
-        install(FILES ${DDK_LIB_PATH}/libhiai.so DESTINATION ${RUNTIME_PKG_NAME}/third_party/hiai_ddk/lib COMPONENT ${RUNTIME_COMPONENT_NAME})
-        install(FILES ${DDK_LIB_PATH}/libhiai_ir.so DESTINATION ${RUNTIME_PKG_NAME}/third_party/hiai_ddk/lib COMPONENT ${RUNTIME_COMPONENT_NAME})
-        install(FILES ${DDK_LIB_PATH}/libhiai_ir_build.so DESTINATION ${RUNTIME_PKG_NAME}/third_party/hiai_ddk/lib COMPONENT ${RUNTIME_COMPONENT_NAME})
+    else()
+        install(FILES ${TOP_DIR}/mindspore/lite/build/minddata/libminddata-lite.so DESTINATION ${MIND_DATA_LIB_DIR}
+                COMPONENT ${RUNTIME_COMPONENT_NAME})
+        install(FILES ${jpeg_turbo_LIBPATH}/libjpeg.so.62.3.0 DESTINATION ${TURBO_DIR}/lib  RENAME libjpeg.so.62
+                COMPONENT ${RUNTIME_COMPONENT_NAME})
+        install(FILES ${jpeg_turbo_LIBPATH}/libturbojpeg.so.0.2.0 DESTINATION ${TURBO_DIR}/lib RENAME libturbojpeg.so.0
+                COMPONENT ${RUNTIME_COMPONENT_NAME})
    endif()
-    if (SUPPORT_TRAIN)
-        install(DIRECTORY ${TOP_DIR}/mindspore/lite/include/ DESTINATION ${RUNTIME_INC_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h")
-    else ()
-        install(DIRECTORY ${TOP_DIR}/mindspore/lite/include/ DESTINATION ${RUNTIME_INC_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "train*" EXCLUDE)
-    endif ()
-    install(FILES ${TOP_DIR}/mindspore/lite/build/src/libmindspore-lite.so DESTINATION ${RUNTIME_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
-    install(FILES ${TOP_DIR}/mindspore/lite/build/src/libmindspore-lite.a DESTINATION ${RUNTIME_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
-    install(FILES ${TOP_DIR}/mindspore/core/ir/dtype/type_id.h DESTINATION ${RUNTIME_INC_DIR}/ir/dtype COMPONENT ${RUNTIME_COMPONENT_NAME})
-    install(FILES ${TOP_DIR}/mindspore/lite/build/schema/model_generated.h DESTINATION ${RUNTIME_INC_DIR}/schema COMPONENT ${RUNTIME_COMPONENT_NAME})
-    install(FILES ${TOP_DIR}/mindspore/lite/build/schema/ops_generated.h DESTINATION ${RUNTIME_INC_DIR}/schema COMPONENT ${RUNTIME_COMPONENT_NAME})
+endif()
+
+if(BUILD_MINDDATA STREQUAL "lite")
+    install(DIRECTORY ${TOP_DIR}/mindspore/ccsrc/minddata/dataset/include/ DESTINATION ${MIND_DATA_INC_DIR}
+            COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h")
+    if(PLATFORM_ARM64)
+        install(FILES ${TOP_DIR}/mindspore/lite/build/minddata/libminddata-lite.so DESTINATION ${MIND_DATA_LIB_DIR}
+                COMPONENT ${RUNTIME_COMPONENT_NAME})
+        install(FILES ${TOP_DIR}/third_party/libjpeg-turbo/lib/libjpeg.so DESTINATION ${TURBO_DIR}/lib
+                COMPONENT ${RUNTIME_COMPONENT_NAME})
+        install(FILES ${TOP_DIR}/third_party/libjpeg-turbo/lib/libturbojpeg.so DESTINATION ${TURBO_DIR}/lib
+                COMPONENT ${RUNTIME_COMPONENT_NAME})
+    elseif(PLATFORM_ARM32)
+        install(FILES ${TOP_DIR}/mindspore/lite/build/minddata/libminddata-lite.so DESTINATION ${MIND_DATA_LIB_DIR}
+                COMPONENT ${RUNTIME_COMPONENT_NAME})
+        install(FILES ${TOP_DIR}/third_party/libjpeg-turbo/lib/libjpeg.so DESTINATION ${TURBO_DIR}/lib
+                COMPONENT ${RUNTIME_COMPONENT_NAME})
+        install(FILES ${TOP_DIR}/third_party/libjpeg-turbo/lib/libturbojpeg.so DESTINATION ${TURBO_DIR}/lib
+                COMPONENT ${RUNTIME_COMPONENT_NAME})
+    else()
+        install(FILES ${TOP_DIR}/mindspore/lite/build/minddata/libminddata-lite.so DESTINATION ${MIND_DATA_LIB_DIR}
+                COMPONENT ${RUNTIME_COMPONENT_NAME})
+        install(FILES ${TOP_DIR}/third_party/libjpeg-turbo/lib/libjpeg.so.62.3.0
+                DESTINATION ${TURBO_DIR}/lib RENAME libjpeg.so.62 COMPONENT ${RUNTIME_COMPONENT_NAME})
+        install(FILES ${TOP_DIR}/third_party/libjpeg-turbo/lib/libturbojpeg.so.0.2.0
+                DESTINATION ${TURBO_DIR}/lib RENAME libturbojpeg.so.0 COMPONENT ${RUNTIME_COMPONENT_NAME})
+    endif()
+endif()
+
+if(BUILD_MINDDATA STREQUAL "lite_cv")
+    if(PLATFORM_ARM64)
+        install(DIRECTORY ${TOP_DIR}/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv
+                DESTINATION ${MIND_DATA_INC_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h")
+        install(FILES ${TOP_DIR}/mindspore/lite/build/minddata/libminddata-lite.so
+                DESTINATION ${MIND_DATA_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
+    elseif(PLATFORM_ARM32)
+        install(DIRECTORY ${TOP_DIR}/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv
+                DESTINATION ${MIND_DATA_INC_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h")
+        install(FILES ${TOP_DIR}/mindspore/lite/build/minddata/libminddata-lite.so DESTINATION ${MIND_DATA_LIB_DIR}
+                COMPONENT ${RUNTIME_COMPONENT_NAME})
+    else()
+        install(DIRECTORY ${TOP_DIR}/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv
+                DESTINATION ${MIND_DATA_INC_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h")
+        install(FILES ${TOP_DIR}/mindspore/lite/build/minddata/libminddata-lite.so DESTINATION ${MIND_DATA_LIB_DIR}
+                COMPONENT ${RUNTIME_COMPONENT_NAME})
+    endif()
+endif()
+
+if(PLATFORM_ARM64)
+    install(FILES ${TOP_DIR}/mindspore/lite/build/.commit_id DESTINATION ${RUNTIME_PKG_NAME}
+            COMPONENT ${RUNTIME_COMPONENT_NAME})
+    if(SUPPORT_NPU)
+        install(FILES ${DDK_LIB_PATH}/libhiai.so DESTINATION ${RUNTIME_PKG_NAME}/third_party/hiai_ddk/lib
+                COMPONENT ${RUNTIME_COMPONENT_NAME})
+        install(FILES ${DDK_LIB_PATH}/libhiai_ir.so DESTINATION ${RUNTIME_PKG_NAME}/third_party/hiai_ddk/lib
+                COMPONENT ${RUNTIME_COMPONENT_NAME})
+        install(FILES ${DDK_LIB_PATH}/libhiai_ir_build.so DESTINATION ${RUNTIME_PKG_NAME}/third_party/hiai_ddk/lib
+                COMPONENT ${RUNTIME_COMPONENT_NAME})
+    endif()
+    if(SUPPORT_TRAIN)
+        install(DIRECTORY ${TOP_DIR}/mindspore/lite/include/ DESTINATION ${RUNTIME_INC_DIR}
+                COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h")
+    else()
+        install(DIRECTORY ${TOP_DIR}/mindspore/lite/include/ DESTINATION ${RUNTIME_INC_DIR}
+                COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "train*" EXCLUDE)
+    endif()
+    install(FILES ${TOP_DIR}/mindspore/lite/build/src/libmindspore-lite.so DESTINATION ${RUNTIME_LIB_DIR}
+            COMPONENT ${RUNTIME_COMPONENT_NAME})
+    install(FILES ${TOP_DIR}/mindspore/lite/build/src/libmindspore-lite.a DESTINATION ${RUNTIME_LIB_DIR}
+            COMPONENT ${RUNTIME_COMPONENT_NAME})
+    install(FILES ${TOP_DIR}/mindspore/core/ir/dtype/type_id.h DESTINATION ${RUNTIME_INC_DIR}/ir/dtype
+            COMPONENT ${RUNTIME_COMPONENT_NAME})
+    install(FILES ${TOP_DIR}/mindspore/lite/build/schema/model_generated.h DESTINATION ${RUNTIME_INC_DIR}/schema
+            COMPONENT ${RUNTIME_COMPONENT_NAME})
+    install(FILES ${TOP_DIR}/mindspore/lite/build/schema/ops_generated.h DESTINATION ${RUNTIME_INC_DIR}/schema
+            COMPONENT ${RUNTIME_COMPONENT_NAME})
    install(DIRECTORY ${flatbuffers_INC} DESTINATION ${FLATBF_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
-    if (ENABLE_TOOLS)
+    if(ENABLE_TOOLS)
        install(TARGETS benchmark RUNTIME DESTINATION ${RUNTIME_PKG_NAME}/benchmark COMPONENT ${RUNTIME_COMPONENT_NAME})
-    endif ()
-elseif (PLATFORM_ARM32)
-    if (SUPPORT_TRAIN)
-        install(DIRECTORY ${TOP_DIR}/mindspore/lite/include/ DESTINATION ${RUNTIME_INC_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h")
-    else ()
-        install(DIRECTORY ${TOP_DIR}/mindspore/lite/include/ DESTINATION ${RUNTIME_INC_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "train*" EXCLUDE)
-    endif ()
-    install(FILES ${TOP_DIR}/mindspore/lite/build/src/libmindspore-lite.so DESTINATION ${RUNTIME_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
-    install(FILES ${TOP_DIR}/mindspore/lite/build/src/libmindspore-lite.a DESTINATION ${RUNTIME_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
-    install(FILES ${TOP_DIR}/mindspore/core/ir/dtype/type_id.h DESTINATION ${RUNTIME_INC_DIR}/ir/dtype COMPONENT ${RUNTIME_COMPONENT_NAME})
-    install(FILES ${TOP_DIR}/mindspore/lite/build/schema/model_generated.h DESTINATION ${RUNTIME_INC_DIR}/schema COMPONENT ${RUNTIME_COMPONENT_NAME})
-    install(FILES ${TOP_DIR}/mindspore/lite/build/schema/ops_generated.h DESTINATION ${RUNTIME_INC_DIR}/schema COMPONENT ${RUNTIME_COMPONENT_NAME})
+    endif()
+elseif(PLATFORM_ARM32)
+    install(FILES ${TOP_DIR}/mindspore/lite/build/.commit_id DESTINATION ${RUNTIME_PKG_NAME}
+            COMPONENT ${RUNTIME_COMPONENT_NAME})
+    if(SUPPORT_TRAIN)
+        install(DIRECTORY ${TOP_DIR}/mindspore/lite/include/ DESTINATION ${RUNTIME_INC_DIR}
+                COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h")
+    else()
+        install(DIRECTORY ${TOP_DIR}/mindspore/lite/include/ DESTINATION ${RUNTIME_INC_DIR}
+                COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "train*" EXCLUDE)
+    endif()
+    install(FILES ${TOP_DIR}/mindspore/lite/build/src/libmindspore-lite.so DESTINATION ${RUNTIME_LIB_DIR}
+            COMPONENT ${RUNTIME_COMPONENT_NAME})
+    install(FILES ${TOP_DIR}/mindspore/lite/build/src/libmindspore-lite.a DESTINATION ${RUNTIME_LIB_DIR}
+            COMPONENT ${RUNTIME_COMPONENT_NAME})
+    install(FILES ${TOP_DIR}/mindspore/core/ir/dtype/type_id.h DESTINATION ${RUNTIME_INC_DIR}/ir/dtype
+            COMPONENT ${RUNTIME_COMPONENT_NAME})
+    install(FILES ${TOP_DIR}/mindspore/lite/build/schema/model_generated.h DESTINATION ${RUNTIME_INC_DIR}/schema
+            COMPONENT ${RUNTIME_COMPONENT_NAME})
+    install(FILES ${TOP_DIR}/mindspore/lite/build/schema/ops_generated.h DESTINATION ${RUNTIME_INC_DIR}/schema
+            COMPONENT ${RUNTIME_COMPONENT_NAME})
    install(DIRECTORY ${flatbuffers_INC} DESTINATION ${FLATBF_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
-    if (ENABLE_TOOLS)
+    if(ENABLE_TOOLS)
        install(TARGETS benchmark RUNTIME DESTINATION ${RUNTIME_PKG_NAME}/benchmark COMPONENT ${RUNTIME_COMPONENT_NAME})
-    endif ()
-elseif (WIN32)
+    endif()
+elseif(WIN32)
+    install(FILES ${TOP_DIR}/build/.commit_id DESTINATION ${RUNTIME_PKG_NAME}
+            COMPONENT ${RUNTIME_COMPONENT_NAME})
    get_filename_component(CXX_DIR ${CMAKE_CXX_COMPILER} PATH)
-    file(GLOB LIB_LIST ${CXX_DIR}/libstdc++-6.dll ${CXX_DIR}/libwinpthread-1.dll ${CXX_DIR}/libssp-0.dll ${CXX_DIR}/libgcc_s_seh-1.dll)
-    if (ENABLE_CONVERTER)
-        install(TARGETS converter_lite RUNTIME DESTINATION ${CONVERTER_PKG_NAME}/converter COMPONENT ${CONVERTER_COMPONENT_NAME})
+    file(GLOB LIB_LIST ${CXX_DIR}/libstdc++-6.dll ${CXX_DIR}/libwinpthread-1.dll
+            ${CXX_DIR}/libssp-0.dll ${CXX_DIR}/libgcc_s_seh-1.dll)
+    if(ENABLE_CONVERTER)
+        install(FILES ${TOP_DIR}/build/.commit_id DESTINATION ${CONVERTER_PKG_NAME}
+                COMPONENT ${CONVERTER_COMPONENT_NAME})
+        install(TARGETS converter_lite RUNTIME DESTINATION ${CONVERTER_PKG_NAME}/converter
+                COMPONENT ${CONVERTER_COMPONENT_NAME})
        install(FILES ${LIB_LIST} DESTINATION ${CONVERTER_PKG_NAME}/converter COMPONENT ${CONVERTER_COMPONENT_NAME})
-        install(FILES ${TOP_DIR}/build/mindspore/tools/converter/mindspore_core/gvar/libmindspore_gvar.dll DESTINATION ${CONVERTER_PKG_NAME}/converter COMPONENT ${CONVERTER_COMPONENT_NAME})
-        install(FILES ${glog_LIBPATH}/../bin/libglog.dll DESTINATION ${CONVERTER_PKG_NAME}/converter COMPONENT ${CONVERTER_COMPONENT_NAME})
-    endif ()
-    if (ENABLE_TOOLS)
+        install(FILES ${TOP_DIR}/build/mindspore/tools/converter/mindspore_core/gvar/libmindspore_gvar.dll
+                DESTINATION ${CONVERTER_PKG_NAME}/converter COMPONENT ${CONVERTER_COMPONENT_NAME})
+        install(FILES ${glog_LIBPATH}/../bin/libglog.dll DESTINATION ${CONVERTER_PKG_NAME}/converter
+                COMPONENT ${CONVERTER_COMPONENT_NAME})
+    endif()
+    if(ENABLE_TOOLS)
        install(TARGETS benchmark RUNTIME DESTINATION ${RUNTIME_PKG_NAME}/benchmark COMPONENT ${RUNTIME_COMPONENT_NAME})
        install(FILES ${LIB_LIST} DESTINATION ${RUNTIME_PKG_NAME}/benchmark COMPONENT ${RUNTIME_COMPONENT_NAME})
-        install(DIRECTORY ${flatbuffers_INC} DESTINATION ${RUNTIME_PKG_NAME}/third_party/flatbuffers COMPONENT ${RUNTIME_COMPONENT_NAME})
-        if (SUPPORT_TRAIN)
-            install(DIRECTORY ${TOP_DIR}/mindspore/lite/include/ DESTINATION ${RUNTIME_INC_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h")
-        else ()
-            install(DIRECTORY ${TOP_DIR}/mindspore/lite/include/ DESTINATION ${RUNTIME_INC_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "train*" EXCLUDE)
-        endif ()
-        install(FILES ${TOP_DIR}/build/mindspore/schema/model_generated.h DESTINATION ${RUNTIME_INC_DIR}/schema COMPONENT ${RUNTIME_COMPONENT_NAME})
-        install(FILES ${TOP_DIR}/build/mindspore/schema/ops_generated.h DESTINATION ${RUNTIME_INC_DIR}/schema COMPONENT ${RUNTIME_COMPONENT_NAME})
-        install(FILES ${TOP_DIR}/mindspore/core/ir/dtype/type_id.h DESTINATION ${RUNTIME_INC_DIR}/ir/dtype COMPONENT ${RUNTIME_COMPONENT_NAME})
+        install(DIRECTORY ${flatbuffers_INC} DESTINATION ${RUNTIME_PKG_NAME}/third_party/flatbuffers
+                COMPONENT ${RUNTIME_COMPONENT_NAME})
+        if(SUPPORT_TRAIN)
+            install(DIRECTORY ${TOP_DIR}/mindspore/lite/include/ DESTINATION ${RUNTIME_INC_DIR}
+                    COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h")
+        else()
+            install(DIRECTORY ${TOP_DIR}/mindspore/lite/include/ DESTINATION ${RUNTIME_INC_DIR}
+                    COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "train*" EXCLUDE)
+        endif()
+        install(FILES ${TOP_DIR}/build/mindspore/schema/model_generated.h DESTINATION ${RUNTIME_INC_DIR}/schema
+                COMPONENT ${RUNTIME_COMPONENT_NAME})
+        install(FILES ${TOP_DIR}/build/mindspore/schema/ops_generated.h DESTINATION ${RUNTIME_INC_DIR}/schema
+                COMPONENT ${RUNTIME_COMPONENT_NAME})
+        install(FILES ${TOP_DIR}/mindspore/core/ir/dtype/type_id.h DESTINATION ${RUNTIME_INC_DIR}/ir/dtype
+                COMPONENT ${RUNTIME_COMPONENT_NAME})
        set(WIN_LIB_DIR_RUN_X86 ${RUNTIME_PKG_NAME}/benchmark)
-        install(FILES ${TOP_DIR}/build/mindspore/src/libmindspore-lite.a DESTINATION ${WIN_LIB_DIR_RUN_X86} COMPONENT ${RUNTIME_COMPONENT_NAME})
-        install(FILES ${TOP_DIR}/build/mindspore/src/libmindspore-lite.dll.a DESTINATION ${WIN_LIB_DIR_RUN_X86} COMPONENT ${RUNTIME_COMPONENT_NAME})
-        install(FILES ${TOP_DIR}/build/mindspore/src/libmindspore-lite.dll DESTINATION ${WIN_LIB_DIR_RUN_X86} COMPONENT ${RUNTIME_COMPONENT_NAME})
-    endif ()
-else ()
-    if (SUPPORT_TRAIN)
-        install(DIRECTORY ${TOP_DIR}/mindspore/lite/include/ DESTINATION ${RUNTIME_INC_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h")
-    else ()
-        install(DIRECTORY ${TOP_DIR}/mindspore/lite/include/ DESTINATION ${RUNTIME_INC_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "train*" EXCLUDE)
-    endif ()
-    install(FILES ${TOP_DIR}/mindspore/lite/build/schema/model_generated.h DESTINATION ${RUNTIME_INC_DIR}/schema COMPONENT ${RUNTIME_COMPONENT_NAME})
-    install(FILES ${TOP_DIR}/mindspore/lite/build/schema/ops_generated.h DESTINATION ${RUNTIME_INC_DIR}/schema COMPONENT ${RUNTIME_COMPONENT_NAME})
-    install(FILES ${TOP_DIR}/mindspore/core/ir/dtype/type_id.h DESTINATION ${RUNTIME_INC_DIR}/ir/dtype COMPONENT ${RUNTIME_COMPONENT_NAME})
+        install(FILES ${TOP_DIR}/build/mindspore/src/libmindspore-lite.a DESTINATION ${WIN_LIB_DIR_RUN_X86}
+                COMPONENT ${RUNTIME_COMPONENT_NAME})
+        install(FILES ${TOP_DIR}/build/mindspore/src/libmindspore-lite.dll.a DESTINATION ${WIN_LIB_DIR_RUN_X86}
+                COMPONENT ${RUNTIME_COMPONENT_NAME})
+        install(FILES ${TOP_DIR}/build/mindspore/src/libmindspore-lite.dll DESTINATION ${WIN_LIB_DIR_RUN_X86}
+                COMPONENT ${RUNTIME_COMPONENT_NAME})
+    endif()
+else()
+    install(FILES ${TOP_DIR}/mindspore/lite/build/.commit_id DESTINATION ${RUNTIME_PKG_NAME}
+            COMPONENT ${RUNTIME_COMPONENT_NAME})
+    if(SUPPORT_TRAIN)
+        install(DIRECTORY ${TOP_DIR}/mindspore/lite/include/ DESTINATION ${RUNTIME_INC_DIR}
+                COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h")
+    else()
+        install(DIRECTORY ${TOP_DIR}/mindspore/lite/include/ DESTINATION ${RUNTIME_INC_DIR}
+                COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "train*" EXCLUDE)
+    endif()
+    install(FILES ${TOP_DIR}/mindspore/lite/build/schema/model_generated.h DESTINATION ${RUNTIME_INC_DIR}/schema
+            COMPONENT ${RUNTIME_COMPONENT_NAME})
+    install(FILES ${TOP_DIR}/mindspore/lite/build/schema/ops_generated.h DESTINATION ${RUNTIME_INC_DIR}/schema
+            COMPONENT ${RUNTIME_COMPONENT_NAME})
+    install(FILES ${TOP_DIR}/mindspore/core/ir/dtype/type_id.h DESTINATION ${RUNTIME_INC_DIR}/ir/dtype
+            COMPONENT ${RUNTIME_COMPONENT_NAME})
    install(DIRECTORY ${flatbuffers_INC} DESTINATION ${FLATBF_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
-    install(FILES ${TOP_DIR}/mindspore/lite/build/src/libmindspore-lite.so DESTINATION ${RUNTIME_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
-    install(FILES ${TOP_DIR}/mindspore/lite/build/src/libmindspore-lite.a DESTINATION ${RUNTIME_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
-    if (ENABLE_CONVERTER)
-        install(TARGETS converter_lite RUNTIME DESTINATION ${CONVERTER_PKG_NAME}/converter COMPONENT ${CONVERTER_COMPONENT_NAME})
-        install(FILES ${TOP_DIR}/mindspore/lite/build/tools/converter/mindspore_core/gvar/libmindspore_gvar.so DESTINATION ${CONVERTER_PKG_NAME}/lib COMPONENT ${CONVERTER_COMPONENT_NAME})
-        install(FILES ${glog_LIBPATH}/libglog.so.0.4.0 DESTINATION ${CONVERTER_PKG_NAME}/third_party/glog/lib RENAME libglog.so.0 COMPONENT ${CONVERTER_COMPONENT_NAME})
-    endif ()
-    if (ENABLE_TOOLS)
+    install(FILES ${TOP_DIR}/mindspore/lite/build/src/libmindspore-lite.so DESTINATION ${RUNTIME_LIB_DIR}
+            COMPONENT ${RUNTIME_COMPONENT_NAME})
+    install(FILES ${TOP_DIR}/mindspore/lite/build/src/libmindspore-lite.a DESTINATION ${RUNTIME_LIB_DIR}
+            COMPONENT ${RUNTIME_COMPONENT_NAME})
+    if(ENABLE_CONVERTER)
+        install(FILES ${TOP_DIR}/mindspore/lite/build/.commit_id DESTINATION ${CONVERTER_PKG_NAME}
+                COMPONENT ${CONVERTER_COMPONENT_NAME})
+        install(TARGETS converter_lite RUNTIME DESTINATION ${CONVERTER_PKG_NAME}/converter
+                COMPONENT ${CONVERTER_COMPONENT_NAME})
+        install(FILES ${TOP_DIR}/mindspore/lite/build/tools/converter/mindspore_core/gvar/libmindspore_gvar.so
+                DESTINATION ${CONVERTER_PKG_NAME}/lib COMPONENT ${CONVERTER_COMPONENT_NAME})
+        install(FILES ${glog_LIBPATH}/libglog.so.0.4.0
+                DESTINATION ${CONVERTER_PKG_NAME}/third_party/glog/lib RENAME libglog.so.0
+                COMPONENT ${CONVERTER_COMPONENT_NAME})
+    endif()
+    if(ENABLE_TOOLS)
        install(TARGETS benchmark RUNTIME DESTINATION ${RUNTIME_PKG_NAME}/benchmark COMPONENT ${RUNTIME_COMPONENT_NAME})
        install(TARGETS cropper RUNTIME DESTINATION ${RUNTIME_PKG_NAME}/cropper COMPONENT ${RUNTIME_COMPONENT_NAME})
-        install(FILES ${TOP_DIR}/mindspore/lite/build/tools/cropper/cropper_mapping_cpu.cfg DESTINATION ${RUNTIME_PKG_NAME}/cropper COMPONENT ${RUNTIME_COMPONENT_NAME})
-    endif ()
-endif ()
+        install(FILES ${TOP_DIR}/mindspore/lite/build/tools/cropper/cropper_mapping_cpu.cfg
+                DESTINATION ${RUNTIME_PKG_NAME}/cropper COMPONENT ${RUNTIME_COMPONENT_NAME})
+    endif()
+endif()

-if (CMAKE_SYSTEM_NAME MATCHES "Windows")
+if(CMAKE_SYSTEM_NAME MATCHES "Windows")
    set(CPACK_GENERATOR ZIP)
-else ()
+else()
    set(CPACK_GENERATOR TGZ)
-endif ()
+endif()
 set(CPACK_ARCHIVE_COMPONENT_INSTALL ON)
-if (PLATFORM_ARM64 OR PLATFORM_ARM32)
+if(PLATFORM_ARM64 OR PLATFORM_ARM32)
    set(CPACK_COMPONENTS_ALL ${RUNTIME_COMPONENT_NAME})
-else ()
+else()
    set(CPACK_COMPONENTS_ALL ${RUNTIME_COMPONENT_NAME} ${CONVERTER_COMPONENT_NAME})
-endif ()
+endif()
 set(CPACK_PACKAGE_FILE_NAME ${MAIN_DIR})
-if (WIN32)
+if(WIN32)
    set(CPACK_PACKAGE_DIRECTORY ${TOP_DIR}/output)
-else ()
+else()
    set(CPACK_PACKAGE_DIRECTORY ${TOP_DIR}/output/tmp)
-endif ()
+endif()
 set(CPACK_PACKAGE_CHECKSUM SHA256)
 include(CPack)
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.cc
@ -76,6 +76,16 @@ void Reciprocal(const T *in, T *out, size_t start, size_t end) {
    out[i] = static_cast<T>(1.0 / in[i]);
  }
 }
+
+template <typename T>
+void Gelu(const T *in, T *out, size_t start, size_t end) {  // GELU (tanh approximation) on elements [start, end)
+  for (size_t i = start; i < end; i++) {
+    T x = in[i];
+    auto double_x = static_cast<double>(x);  // widen once; the double literals below keep the math in double
+    T tanh_res = (T)std::tanh(0.7978845608 * (double_x + 0.044715 * double_x * double_x * double_x));
+    out[i] = x * ((T)1.0 + tanh_res) / (T)2.0;  // x * (1 + tanh(...)) / 2
+  }
+}
 }  // namespace

 void ArithmeticSelfCPUKernel::InitKernel(const CNodePtr &kernel_node) {
@ -95,6 +105,8 @@ void ArithmeticSelfCPUKernel::InitKernel(const CNodePtr &kernel_node) {
    operate_type_ = FLOOR;
  } else if (kernel_name == prim::kPrimReciprocal->name()) {
    operate_type_ = RECIPROCAL;
+  } else if (kernel_name == prim::kPrimGelu->name()) {
+    operate_type_ = GELU;
  }
  dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0);
 }
@ -150,6 +162,8 @@ void ArithmeticSelfCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs
      threads.emplace_back(std::thread(Floor<T>, input, output, start, end));
    } else if (operate_type_ == RECIPROCAL) {
      threads.emplace_back(std::thread(Reciprocal<T>, input, output, start, end));
+    } else if (operate_type_ == GELU) {
+      threads.emplace_back(std::thread(Gelu<T>, input, output, start, end));
    }
    start += once_compute_size;
  }
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.h
@ -62,6 +62,8 @@ MS_REG_CPU_KERNEL(Floor, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutput
                  ArithmeticSelfCPUKernel);
 MS_REG_CPU_KERNEL(Reciprocal, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
                  ArithmeticSelfCPUKernel);
+MS_REG_CPU_KERNEL(Gelu, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
+                  ArithmeticSelfCPUKernel);
 }  // namespace kernel
 }  // namespace mindspore

--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.h
@ -89,6 +89,8 @@ enum OperateType {
  GREATER,
  GREATEREQUAL,
  RECIPROCAL,
+  GELU,
+  GELUGRAD,
 };

 class CPUKernel : public kernel::KernelMod {
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/eltwise_grad_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/eltwise_grad_cpu_kernel.cc
@ -78,6 +78,18 @@ void EltWiseGradCPUKernel::TanhGrad(const T *input1, const T *input2, T *out, si
  }
 }

+template <typename T>
+void EltWiseGradCPUKernel::GeluGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) {
+  for (size_t i = start; i < end; i++) {  // out[i] = input1[i] * d/dx[tanh-approximated GELU] at x = input2[i]
+    T x = input2[i];
+    auto double_x = static_cast<double>(x);  // widen once; the double literals below keep the math in double
+    T tanh_res = (T)std::tanh(0.7978845608 * (double_x + 0.044715 * double_x * double_x * double_x));
+    T mul_right = (T)(0.7978845608 + 0.1070322244 * double_x * double_x);  // derivative of the tanh argument
+    T y_res = (((T)1.0 + tanh_res) + x * ((T)1.0 - tanh_res * tanh_res) * mul_right) / (T)2.0;
+    out[i] = input1[i] * y_res;
+  }
+}
+
 void EltWiseGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
@ -93,6 +105,8 @@ void EltWiseGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
    operate_type_ = TANHGRAD;
  } else if (kernel_name == "SqrtGrad") {
    operate_type_ = SQRTGRAD;
+  } else if (kernel_name == "GeluGrad") {
+    operate_type_ = GELUGRAD;
  } else {
    MS_LOG(EXCEPTION) << "Not support " << kernel_name;
  }
@ -172,6 +186,8 @@ void EltWiseGradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, c
      threads.emplace_back(std::thread(&EltWiseGradCPUKernel::TanhGrad<T>, this, input1, input2, output, start, end));
    } else if (operate_type_ == SQRTGRAD) {
      threads.emplace_back(std::thread(&EltWiseGradCPUKernel::SqrtGrad<T>, this, input1, input2, output, start, end));
+    } else if (operate_type_ == GELUGRAD) {
+      threads.emplace_back(std::thread(&EltWiseGradCPUKernel::GeluGrad<T>, this, input1, input2, output, start, end));
    } else {
      MS_LOG(EXCEPTION) << "Not support " << operate_type_;
    }
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/eltwise_grad_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/eltwise_grad_cpu_kernel.h
@ -47,6 +47,8 @@ class EltWiseGradCPUKernel : public CPUKernel {
  void SqrtGrad(const T *input1, const T *input2, T *out, size_t start, size_t end);
  template <typename T>
  void TanhGrad(const T *input1, const T *input2, T *out, size_t start, size_t end);
+  template <typename T>
+  void GeluGrad(const T *input1, const T *input2, T *out, size_t start, size_t end);
  std::vector<size_t> input_shape0_;
  std::vector<size_t> input_shape1_;
  std::vector<size_t> input_element_num0_;
@ -81,6 +83,13 @@ MS_REG_CPU_KERNEL(
  TanhGrad,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  EltWiseGradCPUKernel);
+// Registers EltWiseGradCPUKernel for the "GeluGrad" op: three float32 inputs, one float32 output.
+// NOTE(review): GeluGrad() itself only receives two input pointers (input1, input2); how the third
+// registered input is used is not visible in this part of the file - confirm against LaunchKernel.
+MS_REG_CPU_KERNEL(GeluGrad,
+                  KernelAttr()
+                    .AddInputAttr(kNumberTypeFloat32)
+                    .AddInputAttr(kNumberTypeFloat32)
+                    .AddInputAttr(kNumberTypeFloat32)
+                    .AddOutputAttr(kNumberTypeFloat32),
+                  EltWiseGradCPUKernel);
 }  // namespace kernel
 }  // namespace mindspore

--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/layer_norm_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/layer_norm_cpu_kernel.cc
@ -0,0 +1,105 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "backend/kernel_compiler/cpu/layer_norm_cpu_kernel.h"
+#include "runtime/device/cpu/cpu_device_address.h"
+
+namespace mindspore {
+namespace kernel {
+// Caches the input dtype and derives the loop extents from the shape of input 0:
+//   block_num_  = product of the dims before begin_norm_axis,
+//   block_size_ = product of the dims from begin_norm_axis onwards,
+//   param_num_  = product of the dims from begin_params_axis onwards.
+// Negative axes are counted from the back. Raises through MS_LOG(EXCEPTION) when an axis lies outside
+// [0, rank] after normalisation or when the shape contains a zero dimension.
+void LayerNormCPUKernel::InitKernel(const CNodePtr &kernel_node) {
+  CheckParam(kernel_node);
+  dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0);
+  std::vector<size_t> x_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+  const auto rank = static_cast<int64_t>(x_shape.size());
+  auto begin_norm_axis = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "begin_norm_axis");
+  auto begin_params_axis = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "begin_params_axis");
+  if (begin_norm_axis < 0) {
+    begin_norm_axis += rank;
+  }
+  if (begin_params_axis < 0) {
+    begin_params_axis += rank;
+  }
+  // Reject axes that would index past x_shape (or wrap around when converted to size_t).
+  if (begin_norm_axis < 0 || begin_norm_axis > rank || begin_params_axis < 0 || begin_params_axis > rank) {
+    MS_LOG(EXCEPTION) << "LayerNormCPUKernel axis out of range, begin_norm_axis: " << begin_norm_axis
+                      << ", begin_params_axis: " << begin_params_axis << ", input shape: " << x_shape;
+  }
+  const auto norm_axis = static_cast<size_t>(begin_norm_axis);
+  const auto params_axis = static_cast<size_t>(begin_params_axis);
+  // Start from 1 so that a repeated InitKernel call does not multiply into stale extents.
+  block_num_ = 1;
+  block_size_ = 1;
+  param_num_ = 1;
+  for (size_t i = 0; i < norm_axis; i++) {
+    block_num_ *= x_shape[i];
+  }
+  for (size_t i = norm_axis; i < x_shape.size(); i++) {
+    block_size_ *= x_shape[i];
+  }
+  for (size_t i = params_axis; i < x_shape.size(); i++) {
+    param_num_ *= x_shape[i];
+  }
+  // The extents are unsigned, so the only invalid value is zero (some dimension of the input is empty).
+  if (block_num_ == 0 || block_size_ == 0) {
+    MS_LOG(EXCEPTION) << "LayerNormCPUKernel input shape error, input shape: " << x_shape;
+  }
+}
+
+// Dispatches to the typed implementation for the dtype cached by InitKernel; the workspace is unused.
+// Only float16 and float32 are accepted. Float64 is deliberately not routed to LaunchKernel<float>:
+// that would reinterpret 8-byte elements as 4-byte floats (and trips the size checks in LaunchKernel).
+// Returns true on success; unsupported dtypes raise through MS_LOG(EXCEPTION).
+bool LayerNormCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
+                                const std::vector<kernel::AddressPtr> &outputs) {
+  if (dtype_ == kNumberTypeFloat16) {
+    LaunchKernel<float16>(inputs, outputs);
+  } else if (dtype_ == kNumberTypeFloat32) {
+    LaunchKernel<float>(inputs, outputs);
+  } else {
+    MS_LOG(EXCEPTION) << "input dtype only support float16, float32";
+  }
+  return true;
+}
+
+// Forward layer normalisation for element type T.
+//   inputs:  [0] x, [1] gamma, [2] beta        outputs: [0] y, [1] mean, [2] var
+// x is processed as block_num_ consecutive blocks of block_size_ elements. For each block the mean and
+// the (biased) variance are computed and every element is written as
+//   y = (x - mean) / sqrt(var + eps_) * gamma[j % param_num_] + beta[j % param_num_].
+// Raises through MS_LOG(EXCEPTION) when the gamma/beta or mean/var buffer sizes do not match the extents.
+template <typename T>
+void LayerNormCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
+  size_t f_size = sizeof(T);
+  if (inputs[1]->size != f_size * param_num_ || inputs[2]->size != f_size * param_num_) {
+    MS_LOG(EXCEPTION) << "The product of gamma and beta's shape must be " << param_num_;
+  }
+  if (outputs[1]->size != f_size * block_num_ || outputs[2]->size != f_size * block_num_) {
+    MS_LOG(EXCEPTION) << "The product of mean and var's shape must be " << block_num_;
+  }
+  auto x = reinterpret_cast<T *>(inputs[0]->addr);
+  auto gamma = reinterpret_cast<T *>(inputs[1]->addr);
+  auto beta = reinterpret_cast<T *>(inputs[2]->addr);
+  auto y = reinterpret_cast<T *>(outputs[0]->addr);
+  auto mean = reinterpret_cast<T *>(outputs[1]->addr);
+  auto var = reinterpret_cast<T *>(outputs[2]->addr);
+  const auto block_len = static_cast<double>(block_size_);
+  for (size_t i = 0; i < block_num_; ++i) {
+    const size_t block_begin = i * block_size_;
+    const size_t block_end = block_begin + block_size_;
+    // Accumulate in double: summing (squared) values in T loses precision and can overflow for float16.
+    double sum = 0.0;
+    double square_sum = 0.0;
+    for (size_t j = block_begin; j < block_end; ++j) {
+      const auto value = static_cast<double>(x[j]);
+      sum += value;
+      square_sum += value * value;
+    }
+    const double block_mean = sum / block_len;
+    double block_var = square_sum / block_len - block_mean * block_mean;
+    // E[x^2] - E[x]^2 can come out slightly negative through rounding; clamp so the sqrt below stays finite.
+    if (block_var < 0.0) {
+      block_var = 0.0;
+    }
+    const double inv_std = 1.0 / std::sqrt(block_var + eps_);
+    for (size_t j = block_begin; j < block_end; ++j) {
+      auto param_shift = j % param_num_;
+      y[j] = (T)((static_cast<double>(x[j]) - block_mean) * inv_std * static_cast<double>(gamma[param_shift]) +
+                 static_cast<double>(beta[param_shift]));
+    }
+    mean[i] = (T)block_mean;
+    var[i] = (T)block_var;
+  }
+}
+
+// Verifies that the node has exactly 3 input tensors and 3 output tensors;
+// raises through MS_LOG(EXCEPTION) otherwise.
+void LayerNormCPUKernel::CheckParam(const CNodePtr &kernel_node) {
+  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
+  if (input_num != 3) {
+    MS_LOG(EXCEPTION) << "LayerNormCPUKernel needs 3 inputs, but gets " << input_num;
+  }
+  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
+  if (output_num != 3) {
+    // Trailing space added so the count is not glued to the text ("but gets3").
+    MS_LOG(EXCEPTION) << "LayerNormCPUKernel expects 3 outputs, but gets " << output_num;
+  }
+}
+}  // namespace kernel
+}  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/layer_norm_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/layer_norm_cpu_kernel.h
@ -0,0 +1,70 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LAYER_NORM_CPU_KERNEL_H_
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LAYER_NORM_CPU_KERNEL_H_
+#include <memory>
+#include <unordered_map>
+#include <vector>
+#include "backend/kernel_compiler/cpu/cpu_kernel.h"
+#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
+
+namespace mindspore {
+namespace kernel {
+// CPU kernel computing layer normalisation: produces the normalised output together with the per-block
+// mean and variance. InitKernel() derives the loop extents from the input shape; Launch() dispatches to
+// LaunchKernel<T>() according to the input dtype.
+class LayerNormCPUKernel : public CPUKernel {
+ public:
+  LayerNormCPUKernel() = default;
+  ~LayerNormCPUKernel() override = default;
+
+  // Validates the node and caches dtype and extents (block_num_, block_size_, param_num_).
+  void InitKernel(const CNodePtr &kernel_node) override;
+
+  // Runs the kernel on the given buffers; the workspace argument is not used by the implementation.
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+              const std::vector<AddressPtr> &outputs) override;
+
+  // Typed implementation. inputs: x, gamma, beta; outputs: y, mean, var.
+  template <typename T>
+  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
+
+ private:
+  // Raises unless the node has exactly 3 inputs and 3 outputs.
+  void CheckParam(const CNodePtr &kernel_node);
+  // Data type of input 0, set by InitKernel.
+  TypeId dtype_{kTypeUnknown};
+  // Added to the variance before the square root; fixed here, InitKernel does not read it from the node.
+  float eps_{1e-12};
+  // Product of the input dims before begin_norm_axis.
+  size_t block_num_{1};
+  // Product of the input dims from begin_norm_axis onwards.
+  size_t block_size_{1};
+  // Product of the input dims from begin_params_axis onwards (element count of gamma and beta).
+  size_t param_num_{1};
+};
+
+// Registers LayerNormCPUKernel for "LayerNorm" with all three inputs and all three outputs in float16.
+MS_REG_CPU_KERNEL(LayerNorm,
+                  KernelAttr()
+                    .AddInputAttr(kNumberTypeFloat16)
+                    .AddInputAttr(kNumberTypeFloat16)
+                    .AddInputAttr(kNumberTypeFloat16)
+                    .AddOutputAttr(kNumberTypeFloat16)
+                    .AddOutputAttr(kNumberTypeFloat16)
+                    .AddOutputAttr(kNumberTypeFloat16),
+                  LayerNormCPUKernel);
+
+// Same registration for float32. No float64 variant is registered.
+MS_REG_CPU_KERNEL(LayerNorm,
+                  KernelAttr()
+                    .AddInputAttr(kNumberTypeFloat32)
+                    .AddInputAttr(kNumberTypeFloat32)
+                    .AddInputAttr(kNumberTypeFloat32)
+                    .AddOutputAttr(kNumberTypeFloat32)
+                    .AddOutputAttr(kNumberTypeFloat32)
+                    .AddOutputAttr(kNumberTypeFloat32),
+                  LayerNormCPUKernel);
+}  // namespace kernel
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LAYER_NORM_CPU_KERNEL_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/layer_norm_grad_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/layer_norm_grad_cpu_kernel.cc
@ -0,0 +1,124 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "backend/kernel_compiler/cpu/layer_norm_grad_cpu_kernel.h"
+#include "runtime/device/cpu/cpu_device_address.h"
+
+namespace mindspore {
+namespace kernel {
+// Caches the input dtype and derives the loop extents from the shape of input 0:
+//   block_num_  = product of the dims before begin_norm_axis,
+//   block_size_ = product of the dims from begin_norm_axis onwards,
+//   param_size_ = product of the dims before begin_params_axis,
+//   param_num_  = product of the dims from begin_params_axis onwards.
+// Negative axes are counted from the back. Raises through MS_LOG(EXCEPTION) when an axis lies outside
+// [0, rank] after normalisation or when the shape contains a zero dimension.
+void LayerNormGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
+  CheckParam(kernel_node);
+  dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0);
+  std::vector<size_t> x_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+  const auto rank = static_cast<int64_t>(x_shape.size());
+  auto begin_norm_axis = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "begin_norm_axis");
+  auto begin_params_axis = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "begin_params_axis");
+  if (begin_norm_axis < 0) {
+    begin_norm_axis += rank;
+  }
+  if (begin_params_axis < 0) {
+    begin_params_axis += rank;
+  }
+  // Reject axes that would index past x_shape (or wrap around when converted to size_t).
+  if (begin_norm_axis < 0 || begin_norm_axis > rank || begin_params_axis < 0 || begin_params_axis > rank) {
+    MS_LOG(EXCEPTION) << "LayerNormGradCPUKernel axis out of range, begin_norm_axis: " << begin_norm_axis
+                      << ", begin_params_axis: " << begin_params_axis << ", input shape: " << x_shape;
+  }
+  const auto norm_axis = static_cast<size_t>(begin_norm_axis);
+  const auto params_axis = static_cast<size_t>(begin_params_axis);
+  // Start from 1 so that a repeated InitKernel call does not multiply into stale extents.
+  block_num_ = 1;
+  block_size_ = 1;
+  param_size_ = 1;
+  param_num_ = 1;
+  for (size_t i = 0; i < norm_axis; i++) {
+    block_num_ *= x_shape[i];
+  }
+  for (size_t i = norm_axis; i < x_shape.size(); i++) {
+    block_size_ *= x_shape[i];
+  }
+  for (size_t i = 0; i < params_axis; i++) {
+    param_size_ *= x_shape[i];
+  }
+  for (size_t i = params_axis; i < x_shape.size(); i++) {
+    param_num_ *= x_shape[i];
+  }
+  // The extents are unsigned, so the only invalid value is zero (some dimension of the input is empty).
+  if (block_num_ == 0 || block_size_ == 0) {
+    MS_LOG(EXCEPTION) << "LayerNormGradCPUKernel input shape error, input shape: " << x_shape;
+  }
+}
+
+// Dispatches to the typed implementation for the dtype cached by InitKernel.
+// Only float16 and float32 are accepted. Float64 is deliberately not routed to LaunchKernel<float>:
+// that would reinterpret 8-byte elements as 4-byte floats and silently produce wrong gradients.
+// Returns true on success; unsupported dtypes raise through MS_LOG(EXCEPTION).
+bool LayerNormGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
+                                    const std::vector<kernel::AddressPtr> &workspace,
+                                    const std::vector<kernel::AddressPtr> &outputs) {
+  if (dtype_ == kNumberTypeFloat16) {
+    LaunchKernel<float16>(inputs, workspace, outputs);
+  } else if (dtype_ == kNumberTypeFloat32) {
+    LaunchKernel<float>(inputs, workspace, outputs);
+  } else {
+    MS_LOG(EXCEPTION) << "input dtype only support float16, float32";
+  }
+  return true;
+}
+
+// Backward pass of layer normalisation for element type T.
+//   inputs:  [0] x, [1] dy, [2] var, [3] mean, [4] gamma
+//   outputs: [0] dx, [1] dg (gradient for gamma), [2] db (gradient for beta)
+// The workspace argument is not used. No buffer sizes are checked here.
+template <typename T>
+void LayerNormGradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
+                                          const std::vector<AddressPtr> &workspace,
+                                          const std::vector<AddressPtr> &outputs) {
+  auto x = reinterpret_cast<T *>(inputs[0]->addr);
+  auto dy = reinterpret_cast<T *>(inputs[1]->addr);
+  auto var = reinterpret_cast<T *>(inputs[2]->addr);
+  auto mean = reinterpret_cast<T *>(inputs[3]->addr);
+  auto gamma = reinterpret_cast<T *>(inputs[4]->addr);
+  auto dx = reinterpret_cast<T *>(outputs[0]->addr);
+  auto dg = reinterpret_cast<T *>(outputs[1]->addr);
+  auto db = reinterpret_cast<T *>(outputs[2]->addr);
+
+  // Parameter gradients: for parameter slot i, walk every element j with j % param_num_ == i and reduce
+  //   dg[i] = sum(dy[j] * (var + eps_)^-0.5 * (x[j] - mean)),   db[i] = sum(dy[j]),
+  // where var/mean are those of the normalisation block containing j (index j / block_size_).
+  for (size_t i = 0; i < param_num_; ++i) {
+    T dgamma = (T)0.0;
+    T dbeta = (T)0.0;
+    for (size_t j = i; j < param_size_ * param_num_; j += param_num_) {
+      auto norm_shift = static_cast<int>(j / block_size_);
+      dgamma += dy[j] * (T)std::pow(static_cast<double>(var[norm_shift]) + eps_, -0.5) * (x[j] - mean[norm_shift]);
+      dbeta += dy[j];
+    }
+    dg[i] = dgamma;
+    db[i] = dbeta;
+  }
+  // Input gradient, one normalisation block at a time.
+  for (size_t i = 0; i < block_num_; ++i) {
+    // First pass over the block: three reductions reused by every element of the block.
+    //   sum1 = sum(-0.5 * dy * gamma * (x - mean) * (var + eps_)^-1.5)
+    //   sum2 = sum(dy * gamma)
+    //   sum3 = sum(-2 * (x - mean))
+    T sum1 = (T)0.0;
+    T sum2 = (T)0.0;
+    T sum3 = (T)0.0;
+    for (size_t j = i * block_size_; j < (i + 1) * block_size_; ++j) {
+      auto param_shift = j % param_num_;
+      // j stays inside block i, so this quotient equals i.
+      auto norm_shift = static_cast<int>(j / block_size_);
+      auto dxm = x[j] - mean[norm_shift];
+      auto dyg = dy[j] * gamma[param_shift];
+      sum1 += (T)(-0.5) * dyg * dxm * (T)std::pow(static_cast<double>(var[norm_shift]) + eps_, -1.5);
+      sum2 += dyg;
+      sum3 += (T)(-2.0) * dxm;
+    }
+    // Second pass: dx[j] is the sum of three terms.
+    for (size_t j = i * block_size_; j < (i + 1) * block_size_; ++j) {
+      auto param_shift = j % param_num_;
+      auto norm_shift = static_cast<int>(j / block_size_);
+      // Despite its name this holds (var + eps_)^-0.5, i.e. the reciprocal of the standard deviation.
+      auto var_sqrt = (T)std::pow(static_cast<double>(var[norm_shift]) + eps_, -0.5);
+      // dx1: direct term dy * gamma / std.
+      auto dx1 = dy[j] * gamma[param_shift] * var_sqrt;
+      // dx2: sum1 * 2 / block_size_ * (x - mean).
+      auto dx2 = sum1 * (T)2.0 / block_size_ * (x[j] - mean[norm_shift]);
+      // dx3: (-sum2 / std + sum1 * sum3 / block_size_) / block_size_.
+      auto dx3 = ((T)(-1.0) * var_sqrt * sum2 + ((T)1.0 / block_size_) * sum1 * sum3) * ((T)1.0 / block_size_);
+      dx[j] = dx1 + dx2 + dx3;
+    }
+  }
+}
+
+// Verifies that the node has exactly 5 input tensors and 3 output tensors;
+// raises through MS_LOG(EXCEPTION) otherwise.
+void LayerNormGradCPUKernel::CheckParam(const CNodePtr &kernel_node) {
+  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
+  if (input_num != 5) {
+    MS_LOG(EXCEPTION) << "LayerNormGradCPUKernel needs 5 inputs, but gets " << input_num;
+  }
+  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
+  if (output_num != 3) {
+    // Trailing space added so the count is not glued to the text ("but gets3").
+    MS_LOG(EXCEPTION) << "LayerNormGradCPUKernel expects 3 outputs, but gets " << output_num;
+  }
+}
+}  // namespace kernel
+}  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/layer_norm_grad_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/layer_norm_grad_cpu_kernel.h
@ -0,0 +1,76 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LAYER_NORM_GRAD_CPU_KERNEL_H_
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LAYER_NORM_GRAD_CPU_KERNEL_H_
+#include <memory>
+#include <unordered_map>
+#include <vector>
+#include "backend/kernel_compiler/cpu/cpu_kernel.h"
+#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
+
+namespace mindspore {
+namespace kernel {
+// CPU kernel computing the gradients of layer normalisation with respect to the input, gamma and beta.
+// InitKernel() derives the loop extents from the input shape; Launch() dispatches to LaunchKernel<T>()
+// according to the input dtype.
+class LayerNormGradCPUKernel : public CPUKernel {
+ public:
+  LayerNormGradCPUKernel() = default;
+  ~LayerNormGradCPUKernel() override = default;
+
+  // Validates the node and caches dtype and extents (block_num_, block_size_, param_num_, param_size_).
+  void InitKernel(const CNodePtr &kernel_node) override;
+
+  // Runs the kernel on the given buffers and returns true; unsupported dtypes raise.
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+              const std::vector<AddressPtr> &outputs) override;
+
+  // Typed implementation. inputs: x, dy, var, mean, gamma; outputs: dx, dg, db. workspace is unused.
+  template <typename T>
+  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+                    const std::vector<AddressPtr> &outputs);
+
+ private:
+  // Raises unless the node has exactly 5 inputs and 3 outputs.
+  void CheckParam(const CNodePtr &kernel_node);
+  // Data type of input 0, set by InitKernel.
+  TypeId dtype_{kTypeUnknown};
+  // Added to the variance before taking powers of it; fixed here, InitKernel does not read it from the node.
+  float eps_{1e-12};
+  // Product of the input dims before begin_norm_axis.
+  size_t block_num_{1};
+  // Product of the input dims from begin_norm_axis onwards.
+  size_t block_size_{1};
+  // Product of the input dims from begin_params_axis onwards (element count of dg and db).
+  size_t param_num_{1};
+  // Product of the input dims before begin_params_axis.
+  size_t param_size_{1};
+};
+
+// Registers LayerNormGradCPUKernel for "LayerNormGrad" with five float16 inputs and three float16 outputs.
+MS_REG_CPU_KERNEL(LayerNormGrad,
+                  KernelAttr()
+                    .AddInputAttr(kNumberTypeFloat16)
+                    .AddInputAttr(kNumberTypeFloat16)
+                    .AddInputAttr(kNumberTypeFloat16)
+                    .AddInputAttr(kNumberTypeFloat16)
+                    .AddInputAttr(kNumberTypeFloat16)
+                    .AddOutputAttr(kNumberTypeFloat16)
+                    .AddOutputAttr(kNumberTypeFloat16)
+                    .AddOutputAttr(kNumberTypeFloat16),
+                  LayerNormGradCPUKernel);
+
+// Same registration for float32. No float64 variant is registered.
+MS_REG_CPU_KERNEL(LayerNormGrad,
+                  KernelAttr()
+                    .AddInputAttr(kNumberTypeFloat32)
+                    .AddInputAttr(kNumberTypeFloat32)
+                    .AddInputAttr(kNumberTypeFloat32)
+                    .AddInputAttr(kNumberTypeFloat32)
+                    .AddInputAttr(kNumberTypeFloat32)
+                    .AddOutputAttr(kNumberTypeFloat32)
+                    .AddOutputAttr(kNumberTypeFloat32)
+                    .AddOutputAttr(kNumberTypeFloat32),
+                  LayerNormGradCPUKernel);
+}  // namespace kernel
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LAYER_NORM_GRAD_CPU_KERNEL_H_
--- a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/bnupdate_eltwise_eltwise_fusion_pass.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/bnupdate_eltwise_eltwise_fusion_pass.cc
@ -53,6 +53,9 @@ void BnupdateEltwiseEltwiseFusionPass::MatchBnupdateAddRelu(const CNodePtr &cnod
  auto add = relu_input->cast<CNodePtr>();
  MS_EXCEPTION_IF_NULL(add);
  auto tuple_getitem = add->input(1);
+  std::vector<int64_t> add_output_used_num;
+  add_output_used_num.emplace_back(SizeToLong(manager->node_users()[add].size()));
+  AnfAlgo::SetNodeAttr(kAttrOutputUsedNum, MakeValue(add_output_used_num), add);
  MS_EXCEPTION_IF_NULL(tuple_getitem);
  if (tuple_getitem->isa<CNode>() && AnfAlgo::GetCNodeName(tuple_getitem) == prim::kPrimTupleGetItem->name()) {
    auto getitem = tuple_getitem->cast<CNodePtr>();
--- a/mindspore/ccsrc/frontend/optimizer/irpass/inline.h
+++ b/mindspore/ccsrc/frontend/optimizer/irpass/inline.h
@ -141,15 +141,6 @@ class InlinerBase : public AnfVisitor {
    }

    if (IsUniqueUse(nullptr, fg, nullptr)) {
-      // The other branch calling the last after block.
-      if (fg->has_flag(FUNC_GRAPH_FLAG_AFTER_BLOCK)) {
-        // Check if parameters' changed.
-        auto param_simplified_caller = SimplifyAfterParameter(fg, node, args);
-        if (param_simplified_caller != nullptr) {
-          return param_simplified_caller;
-        }
-      }
-
      // For the single used fg, including non-after and after not matched above,
      // we move the whole fg nodes.
      if (use_move_) {
@ -160,6 +151,15 @@ class InlinerBase : public AnfVisitor {
        mng->MoveAllCNodeDropGraph(fg, node->func_graph(), inputs[0]->scope());
        return out_node;
      }
+
+      // The other branch calling the last after block.
+      if (fg->has_flag(FUNC_GRAPH_FLAG_AFTER_BLOCK)) {
+        // Check if parameters' changed.
+        auto param_simplified_caller = SimplifyAfterParameter(fg, node, args);
+        if (param_simplified_caller != nullptr) {
+          return param_simplified_caller;
+        }
+      }
    } else {
      // We don't expand the middle multiple used after block, except the last one.
      if (GraphHasBranch(fg)) {
--- a/mindspore/common/api.py
+++ b/mindspore/common/api.py
@ -298,6 +298,49 @@ def _generate_pip_args(obj, *args, method="construct"):
    return args_names, args_list


+def _get_auto_split_param_names(parameter_layout_dict):
+    """
+    Collect the names of parameters whose layout marks at least one dimension as split.
+
+    Args:
+        parameter_layout_dict (dict): Maps a parameter name to its layout; element 1 of the layout is
+            iterated as a sequence of per-dimension values, where -1 means "not split".
+
+    Returns:
+        list, the names (in dict order) whose layout has any value other than -1 in element 1.
+        A real list is returned rather than a generator, so callers can test it for emptiness and
+        run repeated `in` checks without exhausting it.
+    """
+    return [key for key, value in parameter_layout_dict.items()
+            if any(dim != -1 for dim in value[1])]
+
+
+def _build_broadcast_graph(broadcast_params_dict, broadcast_phase):
+    """
+    Run a `_BroadCastCell` over the given parameters and write the results back into them.
+
+    Args:
+        broadcast_params_dict (dict): Maps parameter names to parameters; a falsy value is treated as empty.
+        broadcast_phase (str): Phase name assigned to the broadcast cell before it is executed.
+    """
+    from mindspore.nn.wrap.cell_wrapper import _BroadCastCell
+
+    params_by_name = broadcast_params_dict if broadcast_params_dict else {}
+    # Snapshot every parameter as a fresh Tensor built from its numpy value.
+    snapshot = [Tensor(target.asnumpy()) for target in params_by_name.values()]
+    broadcast_cell = _BroadCastCell(snapshot)
+    broadcast_cell.phase = broadcast_phase
+    results = broadcast_cell()
+    # Outputs are paired with the parameters in dict order.
+    for target, new_value in zip(params_by_name.values(), results):
+        target.set_data(new_value)
+
+
+def _parameter_broadcast(obj, auto_parallel_mode):
+    """
+    Broadcast the parameters returned by `obj.parameters_broadcast_dict()`.
+
+    In auto-parallel mode, parameters reported as auto-split by `_get_auto_split_param_names`
+    are left out of the broadcast.
+
+    Args:
+        obj: Object providing `parameters_broadcast_dict()` and, in auto-parallel mode, `parameter_layout_dict`.
+        auto_parallel_mode (bool): Whether auto-split parameters have to be filtered out.
+    """
+    # Materialise the names into a set: a one-shot generator would always be truthy and would be
+    # consumed by the first `in` test, letting later split parameters slip through the filter.
+    auto_split_param_names = set()
+    if auto_parallel_mode:
+        auto_split_param_names = set(_get_auto_split_param_names(obj.parameter_layout_dict))
+
+    broadcast_params_dict = obj.parameters_broadcast_dict()
+    if auto_split_param_names and broadcast_params_dict:
+        filtered_params = OrderedDict()
+        for param_name, param in broadcast_params_dict.items():
+            if param_name not in auto_split_param_names:
+                filtered_params[param_name] = param
+        broadcast_params_dict = filtered_params
+    broadcast_phase = "_broadcast_subgraph"
+    _build_broadcast_graph(broadcast_params_dict, broadcast_phase)
+
+
 class _PynativeExecutor:
    """
    An pynative executor used to compile/manage/run graph.
@ -339,6 +382,10 @@ class _PynativeExecutor:
    def leave_construct(self, cell):
        self._executor.leave_construct(cell)

+    def parameter_broadcast(self, obj, phase, auto_parallel_mode):
+        """
+        Broadcast the parameters of `obj` via `_parameter_broadcast`.
+
+        Nothing happens when `phase` contains BROADCAST_PHASE or when `_get_parameter_broadcast()` is falsy.
+        """
+        if BROADCAST_PHASE not in phase and _get_parameter_broadcast():
+            _parameter_broadcast(obj, auto_parallel_mode)
+
    def __call__(self, obj, *args, **kwargs):
        args = args + tuple(kwargs.values())
        return self._executor(obj, args, "")
@ -391,31 +438,6 @@ class _Executor:
    def _build_data_graph(self, obj, phase):
        self._executor.build_data_graph(obj.parameters_dict(), phase, obj.parameters_broadcast_dict())

-    def _get_auto_split_param_names(self, parameter_layout_dict):
-        auto_split_params = {}
-        for key, value in parameter_layout_dict.items():
-            for dim in value[1]:
-                if dim != -1:
-                    auto_split_params[key] = value
-                    break
-        auto_split_param_names = (param_name for param_name in auto_split_params)
-        return auto_split_param_names
-
-    def _build_broadcast_graph(self, broadcast_params_dict, broadcast_phase):
-        """Build broadcast graph."""
-        from mindspore.nn.wrap.cell_wrapper import _BroadCastCell
-
-        if not broadcast_params_dict:
-            broadcast_params_dict = {}
-        broadcast_params = []
-        for param in broadcast_params_dict.values():
-            broadcast_params.append(Tensor(param.asnumpy()))
-        _broadcast_net = _BroadCastCell(broadcast_params)
-        _broadcast_net.phase = broadcast_phase
-        broadcasted_params = _broadcast_net()
-        for param_name, param in zip(broadcast_params_dict.keys(), broadcasted_params):
-            broadcast_params_dict[param_name].set_data(param)
-
    def _set_dataset_mode(self, args_list):
        """set dataset mode."""
        # decide whether to sink based on whether the inputs is virtual or args_list is ()
@ -500,18 +522,7 @@ class _Executor:
        elif not enable_ge and "export" in phase:
            self._build_data_graph(obj, phase)
        elif BROADCAST_PHASE not in phase and _get_parameter_broadcast():
-            auto_split_param_names = []
-            if auto_parallel_mode:
-                auto_split_param_names = self._get_auto_split_param_names(obj.parameter_layout_dict)
-
-            broadcast_params_dict = obj.parameters_broadcast_dict()
-            if auto_split_param_names and broadcast_params_dict:
-                broadcast_params_dict = OrderedDict()
-                for param_name, param in obj.parameters_broadcast_dict().items():
-                    if param_name not in auto_split_param_names:
-                        broadcast_params_dict[param_name] = param
-            broadcast_phase = "_broadcast_subgraph"
-            self._build_broadcast_graph(broadcast_params_dict, broadcast_phase)
+            _parameter_broadcast(obj, auto_parallel_mode)

        return phase, True

--- a/mindspore/lite/examples/transfer_learning/src/net_runner.cc
+++ b/mindspore/lite/examples/transfer_learning/src/net_runner.cc
@ -114,7 +114,7 @@ std::vector<int> NetRunner::FillInputData(const std::vector<DataLabelTuple> &dat
    int label = 0;
    char *data = nullptr;
    std::tie(data, label) = dataset[idx];
-    std::copy(data, data + data_size, input_data + i * data_size);
+    std::copy(data, data + data_size_, input_data + i * data_size_);
    labels[i * num_of_classes_ + label] = 1.0;  // Model expects labels in onehot representation
    labels_vec.push_back(label);
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.h
@ -36,13 +36,9 @@ class ConcatFp16CPUKernel : public LiteKernel {
      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
    concat_param_ = reinterpret_cast<ConcatParameter *>(op_parameter_);
  }
-
  ~ConcatFp16CPUKernel() = default;
-
  int Init() override;
-
  int ReSize() override;
-
  int Run() override;

 private:
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
@ -207,18 +207,12 @@ static int Convolution1x1Fp16RunHw(void *cdata, int task_id) {
 }

 int Convolution1x1FP16CPUKernel::Run() {
-  auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Get executor tensor failed.";
-    ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
-    return ret;
-  }
+  ConvolutionBaseFP16CPUKernel::GetExecuteTensor();

  pack_input_ = reinterpret_cast<float16_t *>(
    ctx_->allocator->Malloc(matmul_param_->row_16_ * matmul_param_->deep_ * sizeof(float16_t)));
  if (pack_input_ == nullptr) {
    MS_LOG(ERROR) << "Conv1x1 Malloc pack_input_ error!";
-    ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
    return RET_MEMORY_FAILED;
  }

@ -232,6 +226,7 @@ int Convolution1x1FP16CPUKernel::Run() {
      input_ptr_ = batch_in;
    }

+    int ret = RET_ERROR;
    if (multi_thread_by_hw_) {
      ret = ParallelLaunch(this->context_->thread_pool_, Convolution1x1Fp16RunHw, this, thread_count_);
    } else {
@ -240,16 +235,12 @@ int Convolution1x1FP16CPUKernel::Run() {
    }
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "ParallelLaunch failed.";
-      ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
      ctx_->allocator->Free(pack_input_);
      pack_input_ = nullptr;
      return ret;
    }
  }

-  ConvolutionBaseFP16CPUKernel::IfCastOutput();
-  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
-
  ctx_->allocator->Free(pack_input_);
  pack_input_ = nullptr;
  return RET_OK;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc
@ -33,19 +33,10 @@ ConvolutionBaseFP16CPUKernel::~ConvolutionBaseFP16CPUKernel() {
 }

 int ConvolutionBaseFP16CPUKernel::GetExecuteTensor() {
-  // ===================input====================//
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  in_data_type_ = input_tensor->data_type();
-  MS_ASSERT(in_data_type_ == kNumberTypeFloat32 || in_data_type_ == kNumberTypeFloat16);
-
-  execute_input_ = ConvertInputFp32toFp16(input_tensor, context_);
-
-  // ==================output====================//
-  auto out_tensor = out_tensors_.at(kOutputIndex);
-  out_data_type_ = out_tensor->data_type();
-  MS_ASSERT(out_data_type_ == kNumberTypeFloat32 || out_data_type_ == kNumberTypeFloat16);
-
-  execute_output_ = MallocOutputFp16(out_tensor, context_);
+  auto input_tensor = in_tensors_.at(0);
+  auto output_tensor = out_tensors_.at(0);
+  execute_input_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
+  execute_output_ = reinterpret_cast<float16_t *>(output_tensor->data_c());
  return RET_OK;
 }

@ -78,25 +69,4 @@ int ConvolutionBaseFP16CPUKernel::GetExecuteFilter() {
  }
  return RET_OK;
 }
-
-void ConvolutionBaseFP16CPUKernel::IfCastOutput() {
-  if (out_data_type_ == kNumberTypeFloat32) {
-    auto out_tensor = out_tensors_.at(kOutputIndex);
-    auto out_ele_num = out_tensor->ElementsNum();
-    auto output_addr = reinterpret_cast<float *>(out_tensor->MutableData());
-    Float16ToFloat32(execute_output_, output_addr, out_ele_num);
-  }
-}
-
-void ConvolutionBaseFP16CPUKernel::FreeTmpBuffer() {
-  if (in_data_type_ == kNumberTypeFloat32) {
-    context_->allocator->Free(execute_input_);
-    execute_input_ = nullptr;
-  }
-  if (out_data_type_ == kNumberTypeFloat32) {
-    context_->allocator->Free(execute_output_);
-    execute_output_ = nullptr;
-  }
-}
-
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h
@ -38,16 +38,12 @@ class ConvolutionBaseFP16CPUKernel : public ConvolutionBaseCPUKernel {
  int RunImpl(int task_id) { return mindspore::lite::RET_OK; }
  virtual int GetExecuteTensor();
  virtual int GetExecuteFilter();
-  virtual void IfCastOutput();
-  void FreeTmpBuffer();

 protected:
  float16_t *fp16_weight_ = nullptr;
  float16_t *execute_input_ = nullptr;
  float16_t *execute_weight_ = nullptr;
  float16_t *execute_output_ = nullptr;
-  TypeId in_data_type_;
-  TypeId out_data_type_;
 };
 }  // namespace mindspore::kernel

--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
@ -114,19 +114,13 @@ static int ConvDwFp16Run(void *cdata, int task_id) {
 }

 int ConvolutionDepthwiseFp16CPUKernel::Run() {
-  auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Get Execute tensor failed.";
-    return ret;
-  }
+  ConvolutionBaseFP16CPUKernel::GetExecuteTensor();

-  ret = ParallelLaunch(this->context_->thread_pool_, ConvDwFp16Run, this, conv_param_->thread_num_);
+  auto ret = ParallelLaunch(this->context_->thread_pool_, ConvDwFp16Run, this, conv_param_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvDwFp16Run error: error_code[" << ret << "]";
  }

-  ConvolutionBaseFP16CPUKernel::IfCastOutput();
-  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
  return ret;
 }

--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc
@ -149,13 +149,8 @@ int ConvolutionDepthwiseSWFp16CPUKernel::Run() {
    return ret;
  }

-  ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Get Execute tensor failed.";
-    FreePackedInputOutput();
-    ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
-    return ret;
-  }
+  ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
+
  if (need_align_) {
    PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
                        conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
@ -172,8 +167,7 @@ int ConvolutionDepthwiseSWFp16CPUKernel::Run() {
    PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
                        conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
  }
-  ConvolutionBaseFP16CPUKernel::IfCastOutput();
-  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
+
  FreePackedInputOutput();
  return ret;
 }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
@ -128,17 +128,11 @@ static int ConvolutionFp16Impl(void *cdata, int task_id) {
 }

 int ConvolutionFP16CPUKernel::Run() {
-  auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Get Execute tensor failed.";
-    ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
-    return ret;
-  }
+  ConvolutionBaseFP16CPUKernel::GetExecuteTensor();

-  ret = InitTmpBuffer();
+  auto ret = InitTmpBuffer();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Init tmp buffer failed.";
-    ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
    FreeTmpBuffer();
    return RET_ERROR;
  }
@ -147,8 +141,7 @@ int ConvolutionFP16CPUKernel::Run() {
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "conv fp16 error ret[" << ret << "]";
  }
-  ConvolutionBaseFP16CPUKernel::IfCastOutput();
-  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
+
  FreeTmpBuffer();
  return ret;
 }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
@ -195,17 +195,11 @@ static int ConvolutionWinogradFp16Impl(void *cdata, int task_id) {
 }

 int ConvolutionWinogradFP16CPUKernel::Run() {
-  auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Get Execute tensor failed.";
-    ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
-    return ret;
-  }
+  ConvolutionBaseFP16CPUKernel::GetExecuteTensor();

-  ret = InitTmpBuffer();
+  auto ret = InitTmpBuffer();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Init tmp buffer failed.";
-    ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
    FreeTmpBuffer();
    return RET_ERROR;
  }
@ -215,8 +209,6 @@ int ConvolutionWinogradFP16CPUKernel::Run() {
    MS_LOG(ERROR) << "conv winograd error error_code[" << ret << "]";
  }

-  ConvolutionBaseFP16CPUKernel::IfCastOutput();
-  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
  FreeTmpBuffer();
  return ret;
 }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
@ -162,13 +162,8 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
    return RET_ERROR;
  }

-  ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Get Execute tensor failed.";
-    FreePackedInputOutput();
-    ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
-    return ret;
-  }
+  ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
+
  if (need_align_) {
    PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
                        conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
@ -189,8 +184,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
    PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
                        conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
  }
-  ConvolutionBaseFP16CPUKernel::IfCastOutput();
-  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
+
  FreePackedInputOutput();
  return ret;
 }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc
@ -189,7 +189,6 @@ int DeConvolutionFp16CPUKernel::Run() {
  int error_code = InitRunBuf();
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "deconv fp16 InitRunBuf error! error_code[" << error_code << "]";
-    ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
    FreeRunBuf();
    return RET_ERROR;
  }
@ -206,8 +205,6 @@ int DeConvolutionFp16CPUKernel::Run() {
    }
  }

-  ConvolutionBaseFP16CPUKernel::IfCastOutput();
-  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
  FreeRunBuf();
  return error_code;
 }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc
@ -405,9 +405,6 @@ int DeConvWinogradFp16CPUKernel::Run() {
    ParallelLaunch(this->context_->thread_pool_, DeConvWgPostFp16Run, this, thread_num_hw_);
  }

-  ConvolutionBaseFP16CPUKernel::IfCastOutput();
-  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
-
  return RET_OK;
 }
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.cc
@ -33,9 +33,6 @@ using mindspore::schema::PrimitiveType_Scale;
 namespace mindspore::kernel {

 int ScaleFp16CPUKernel::InitScaleOffset() {
-  auto input_tensor = in_tensors_.at(0);
-  malloc_input_ = input_tensor->data_type() == kNumberTypeFloat32;
-
  auto scale_tensor = in_tensors_.at(1);
  malloc_scale_ = scale_tensor->data_type() == kNumberTypeFloat32;

@ -45,9 +42,6 @@ int ScaleFp16CPUKernel::InitScaleOffset() {
    auto offset_tensor = in_tensors_.at(2);
    malloc_offset_ = offset_tensor->data_type() == kNumberTypeFloat32;
  }
-
-  auto output_tensor = out_tensors_.at(0);
-  malloc_output_ = output_tensor->data_type() == kNumberTypeFloat32;
  return RET_OK;
 }

@ -103,6 +97,11 @@ int ScaleFp16Run(void *cdata, int task_id) {
 }

 int ScaleFp16CPUKernel::Run() {
+  auto input_tensor = in_tensors_.at(0);
+  auto output_tensor = out_tensors_.at(0);
+  input_ = reinterpret_cast<float16_t *>(input_tensor->MutableData());
+  output_ = reinterpret_cast<float16_t *>(output_tensor->MutableData());
+
  auto ret = InitScaleOffset();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Scale fp16 InitScaleOffset failed.";
@ -123,20 +122,11 @@ int ScaleFp16CPUKernel::Run() {
    return RET_ERROR;
  }

-  // if output tensor is fp32, we need to transform
-  if (malloc_output_) {
-    auto out_tensor = out_tensors_.at(0);
-    Float16ToFloat32(output_, reinterpret_cast<float *>(out_tensor->MutableData()), out_tensor->ElementsNum());
-  }
  FreeTmpBuffer();
  return RET_OK;
 }

 int ScaleFp16CPUKernel::MallocAssignTmpBuffer() {
-  input_ = ConvertInputFp32toFp16(in_tensors_.at(0), context_);
-  if (input_ == nullptr) {
-    return RET_ERROR;
-  }
  scale_ = ConvertInputFp32toFp16(in_tensors_.at(1), context_);
  if (scale_ == nullptr) {
    return RET_ERROR;
@ -155,18 +145,10 @@ int ScaleFp16CPUKernel::MallocAssignTmpBuffer() {
    }
    memset(offset_, 0, in_tensors_.at(1)->ElementsNum() * sizeof(float16_t));
  }
-  output_ = MallocOutputFp16(out_tensors_.at(0), context_);
-  if (output_ == nullptr) {
-    return RET_ERROR;
-  }
  return RET_OK;
 }

 void ScaleFp16CPUKernel::FreeTmpBuffer() {
-  if (malloc_input_ && input_ != nullptr) {
-    context_->allocator->Free(input_);
-    input_ = nullptr;
-  }
  if (malloc_scale_ && scale_ != nullptr) {
    context_->allocator->Free(scale_);
    scale_ = nullptr;
@ -175,10 +157,6 @@ void ScaleFp16CPUKernel::FreeTmpBuffer() {
    context_->allocator->Free(offset_);
    offset_ = nullptr;
  }
-  if (malloc_output_ && output_ != nullptr) {
-    context_->allocator->Free(output_);
-    output_ = nullptr;
-  }
 }

 REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Scale, LiteKernelCreator<ScaleFp16CPUKernel>)
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.h
@ -43,10 +43,8 @@ class ScaleFp16CPUKernel : public ScaleCPUKernel {
  void FreeTmpBuffer();

 private:
-  bool malloc_input_ = false;
  bool malloc_scale_ = false;
  bool malloc_offset_ = false;
-  bool malloc_output_ = false;

  float16_t *input_ = nullptr;
  float16_t *scale_ = nullptr;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.cc
@ -29,7 +29,6 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Stack;

 namespace mindspore::kernel {
-
 int StackFp16CPUKernel::Init() {
  if (!InferShapeDone()) {
    return RET_OK;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.h
@ -27,9 +27,7 @@ class StackFp16CPUKernel : public StackCPUKernel {
                     const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
                     const mindspore::lite::PrimitiveC *primitive)
      : StackCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
-
  ~StackFp16CPUKernel() = default;
-
  int Init() override;
  int Run() override;

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc
@ -48,13 +48,12 @@ int ArgMinMaxOpenCLKernel::CheckSpecs() {
    return RET_ERROR;
  }
  auto *param = reinterpret_cast<ArgMinMaxParameter *>(this->op_parameter_);
-  param->dims_size_ = in_tensors_[0]->shape().size();
-  param->axis_ = (param->axis_ + param->dims_size_) % param->dims_size_;
-  if (param->axis_ < 0 || param->axis_ >= param->dims_size_) {
-    MS_LOG(ERROR) << "Invalid axis " << param->axis_;
+  auto dims_size = in_tensors_[0]->shape().size();
+  auto axis = (param->axis_ + dims_size) % dims_size;
+  if (axis < 0 || axis >= dims_size) {
+    MS_LOG(ERROR) << "Invalid axis " << axis;
    return RET_ERROR;
  }
-  param->get_max_ = (Type() == PrimitiveType_ArgMax);
  return RET_OK;
 }

@ -77,10 +76,10 @@ void ArgMinMaxOpenCLKernel::SetConstArgs() {

 void ArgMinMaxOpenCLKernel::SetGlobalLocal() {
  auto param = reinterpret_cast<ArgMinMaxParameter *>(op_parameter_);
-  auto in_shape = in_tensors_[0]->shape();
+  im_in_ = GpuTensorInfo(in_tensors_[0]);
+  std::vector<size_t> in_shape = {im_in_.N, im_in_.H, im_in_.W, im_in_.C};
  auto in_shape_align = in_shape;
  in_shape_align[3] = UP_ROUND(in_shape[3], C4NUM);
-  im_in_ = GpuTensorInfo(in_tensors_[0]);
  auto out_shape_align = in_shape_align;
  out_shape_align.at(param->axis_) = param->axis_ == 3 ? UP_ROUND(param->topk_, C4NUM) : param->topk_;
  int reduce_len = GetUpPow2(in_shape.at(param->axis_));
@ -92,7 +91,7 @@ void ArgMinMaxOpenCLKernel::SetGlobalLocal() {
  src_size_ = {std::accumulate(in_shape.begin() + param->axis_ + 1, in_shape.end(), 1, std::multiplies<int>()),
               std::accumulate(in_shape.begin(), in_shape.begin() + param->axis_, 1, std::multiplies<int>()),
               std::accumulate(in_shape.begin() + param->axis_, in_shape.end(), 1, std::multiplies<int>()),
-               in_shape.at(param->axis_)};
+               static_cast<int>(in_shape.at(param->axis_))};
  strides_ = {
    std::accumulate(in_shape_align.begin() + param->axis_ + 1, in_shape_align.end(), 1, std::multiplies<int>()),
    std::accumulate(in_shape_align.begin() + param->axis_, in_shape_align.end(), 1, std::multiplies<int>()),
@ -145,6 +144,12 @@ int ArgMinMaxOpenCLKernel::Prepare() {
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif

+  auto *param = reinterpret_cast<ArgMinMaxParameter *>(this->op_parameter_);
+  param->dims_size_ = in_tensors_[0]->shape().size();
+  param->axis_ = (param->axis_ + param->dims_size_) % param->dims_size_;
+  param->axis_ = (4 - param->dims_size_) + param->axis_;
+  param->get_max_ = (Type() == PrimitiveType_ArgMax);
+
  InitWeights();
  SetGlobalLocal();
  SetConstArgs();
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
@ -118,67 +118,77 @@ int DepthwiseConv2dOpenCLKernel::InitWeights() {
    int alignment = ocl_runtime_->GetImagePitchAlignment();
    plane_out = UP_ROUND(plane_out, alignment) * C4NUM;
    pack_weight_size = plane_out * CO4;
-    auto shape = in_tensors_[1]->shape();
    size_t img_dtype = ocl_runtime_->GetFp16Enable() ? CL_HALF_FLOAT : CL_FLOAT;
-    img_size = {(size_t)plane_out / C4NUM, (size_t)shape[0] * CO4, img_dtype};
+    img_size = {(size_t)plane_out / C4NUM, (size_t)out_info.N * CO4, img_dtype};
  }
-  if (is_fp16) {
-    packed_weight_ = allocator->Malloc(pack_weight_size * sizeof(int16_t), img_size);
-    packed_weight_ = allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true);
-    if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16) {
-      std::function<int16_t(int16_t)> to_dtype = [](int16_t x) -> int16_t { return x; };
-      PackNCHWToNC4HW4<int16_t, int16_t>(origin_weight, packed_weight_, 1, plane_in, plane_out, out_info.C, to_dtype);
-    } else if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat32) {
-      std::function<float16_t(float)> to_dtype = [](float x) -> float16_t { return static_cast<float16_t>(x); };
-      PackNCHWToNC4HW4<float, float16_t>(origin_weight, packed_weight_, 1, plane_in, plane_out, out_info.C, to_dtype);
-    } else {  // int8 or int16
-      std::function<int16_t(int16_t)> to_dtype = [](int16_t x) -> int16_t { return x; };
-      PackNCHWToNC4HW4<int16_t, int16_t>(origin_weight, packed_weight_, 1, plane_in, plane_out, out_info.C, to_dtype);
+  pack_weight_size = is_fp16 ? pack_weight_size * sizeof(int16_t) : pack_weight_size * sizeof(float);
+  auto ConvertFilter = [](void *src, void *dst, TypeId src_type, TypeId dst_type, size_t plane_in, size_t plane_out,
+                          size_t channel) {
+    if (dst_type == kNumberTypeFloat16) {
+      if (src_type == kNumberTypeFloat16) {
+        std::function<int16_t(int16_t)> to_dtype = [](int16_t x) -> int16_t { return x; };
+        PackNCHWToNC4HW4<int16_t, int16_t>(src, dst, 1, plane_in, plane_out, channel, to_dtype);
+      } else if (src_type == kNumberTypeFloat32) {
+        std::function<float16_t(float)> to_dtype = [](float x) -> float16_t { return static_cast<float16_t>(x); };
+        PackNCHWToNC4HW4<float, float16_t>(src, dst, 1, plane_in, plane_out, channel, to_dtype);
+      } else {  // int8 or int16
+        std::function<int16_t(int16_t)> to_dtype = [](int16_t x) -> int16_t { return x; };
+        PackNCHWToNC4HW4<int16_t, int16_t>(src, dst, 1, plane_in, plane_out, channel, to_dtype);
+      }
+    } else {
+      if (src_type == kNumberTypeFloat32) {
+        std::function<float(float)> to_dtype = [](float x) -> float { return x; };
+        PackNCHWToNC4HW4<float, float>(src, dst, 1, plane_in, plane_out, channel, to_dtype);
+      } else if (src_type == kNumberTypeFloat16) {
+        std::function<float(float16_t)> to_dtype = [](float16_t x) -> float { return static_cast<float>(x); };
+        PackNCHWToNC4HW4<float16_t, float>(src, dst, 1, plane_in, plane_out, channel, to_dtype);
+      } else {  // int8 or int16
+        std::function<float(float)> to_dtype = [](float x) -> float { return x; };
+        PackNCHWToNC4HW4<float, float>(src, dst, 1, plane_in, plane_out, channel, to_dtype);
+      }
    }
-  } else {
-    packed_weight_ = allocator->Malloc(pack_weight_size * sizeof(float), img_size);
-    packed_weight_ = allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true);
-    if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat32) {
-      std::function<float(float)> to_dtype = [](float x) -> float { return x; };
-      PackNCHWToNC4HW4<float, float>(origin_weight, packed_weight_, 1, plane_in, plane_out, out_info.C, to_dtype);
-    } else if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16) {
-      std::function<float(float16_t)> to_dtype = [](float16_t x) -> float { return static_cast<float>(x); };
-      PackNCHWToNC4HW4<float16_t, float>(origin_weight, packed_weight_, 1, plane_in, plane_out, out_info.C, to_dtype);
-    } else {  // int8 or int16
-      std::function<float(float)> to_dtype = [](float x) -> float { return x; };
-      PackNCHWToNC4HW4<float, float>(origin_weight, packed_weight_, 1, plane_in, plane_out, out_info.C, to_dtype);
-    }
-  }
-  allocator->UnmapBuffer(packed_weight_);
+  };
+  std::vector<char> temp_filter(pack_weight_size);
+  auto src_type = in_tensors_.at(kWeightIndex)->data_type();
+  auto dst_type = is_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32;
+  ConvertFilter(origin_weight, temp_filter.data(), src_type, dst_type, plane_in, plane_out, out_info.C);
+  packed_weight_ = allocator->Malloc(pack_weight_size, img_size, temp_filter.data());
  FreeDequantedWeight();
+  if (packed_weight_ == nullptr) {
+    return RET_ERROR;
+  }

+  auto ConvertBias = [](void *src, void *dst, size_t size, size_t dtype_size, TypeId src_type, TypeId dst_type) {
+    if (dst_type == kNumberTypeFloat16 && src_type == kNumberTypeFloat32) {
+      float16_t *bias_ptr = static_cast<float16_t *>(dst);
+      for (size_t i = 0; i < size; ++i) {
+        bias_ptr[i] = static_cast<float16_t>(static_cast<float *>(src)[i]);
+      }
+    } else if (dst_type == kNumberTypeFloat32 && src_type == kNumberTypeFloat16) {
+      float32_t *bias_ptr = static_cast<float32_t *>(dst);
+      for (size_t i = 0; i < size; ++i) {
+        bias_ptr[i] = static_cast<float32_t>(static_cast<float16_t *>(src)[i]);
+      }
+    } else {
+      memcpy(dst, src, size * dtype_size);
+    }
+  };
  size_t dtype_size = sizeof(float);
  if (is_fp16 && in_tensors_.at(kBiasIndex)->data_type() == kNumberTypeFloat16) {
    dtype_size = sizeof(int16_t);
  }
-  bias_data_ = allocator->Malloc(C4NUM * CO4 * dtype_size);
-  bias_data_ = allocator->MapBuffer(bias_data_, CL_MAP_WRITE, nullptr, true);
-  size_t up_co_size = C4NUM * CO4 * dtype_size;
-  memset(bias_data_, 0, up_co_size);
-  if (in_tensors_.size() == kInputSize2) {
-    auto ori_bias = in_tensors_.at(kBiasIndex)->data_c();
-    if (is_fp16 && in_tensors_.at(kBiasIndex)->data_type() == kNumberTypeFloat32) {
-      float16_t *bias_ptr = static_cast<float16_t *>(bias_data_);
-      for (size_t i = 0; i < in_tensors_.at(kBiasIndex)->ElementsNum(); ++i) {
-        bias_ptr[i] = static_cast<float16_t>(static_cast<float *>(ori_bias)[i]);
-      }
-    } else if (!is_fp16 && in_tensors_.at(kBiasIndex)->data_type() == kNumberTypeFloat16) {
-      float32_t *bias_ptr = static_cast<float32_t *>(bias_data_);
-      for (size_t i = 0; i < in_tensors_.at(kBiasIndex)->ElementsNum(); ++i) {
-        bias_ptr[i] = static_cast<float32_t>(static_cast<float16_t *>(ori_bias)[i]);
-      }
-    } else {
-      memcpy(bias_data_, ori_bias, out_info.C * dtype_size);
-    }
-  } else {
-    MS_ASSERT(in_tensors_.size() == kInputSize1);
+  std::vector<char> temp_bias(pack_weight_size, 0);
+  if (in_tensors_.size() == 3) {
+    src_type = in_tensors_.at(kBiasIndex)->data_type();
+    dst_type = is_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32;
+    auto element_size = in_tensors_.at(kBiasIndex)->ElementsNum();
+    ConvertBias(in_tensors_.at(kBiasIndex)->data_c(), temp_bias.data(), element_size, dtype_size, src_type, dst_type);
+  }
+  size_t bias_size = C4NUM * CO4 * dtype_size;
+  bias_data_ = allocator->Malloc(bias_size, {}, temp_bias.data());
+  if (bias_data_ == nullptr) {
+    return RET_ERROR;
  }
-  allocator->UnmapBuffer(bias_data_);
  return mindspore::lite::RET_OK;
 }
 void DepthwiseConv2dOpenCLKernel::SetConstArgs() {
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.cc
@ -47,7 +47,6 @@ std::pair<bool, FusionEltwiseParameter *> CheckSupportOrCreateParam(
  LiteKernel *node, bool create_param = false,
  const std::map<lite::Tensor *, FusionEltwiseParameter *> &replace_map = {}) {
  MS_ASSERT(node);
-  MS_ASSERT(param);
  PrimitiveType node_type = node->Type();
  auto operator_ = static_cast<const EltwiseOperator>(node_type);
  auto *op_parameter = reinterpret_cast<OpenCLKernel *>(node)->GetParameter();
--- a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc
+++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc
@ -107,14 +107,20 @@ void *OpenCLAllocator::CreateImage2D(size_t size, const std::vector<size_t> &img
  }
  if (*image == nullptr) {
    delete *buffer;
-    MS_LOG(ERROR) << "Create OpenCL Image2D failed! (ERROR CODE: " << ret << ")";
+    MS_LOG(ERROR) << "Create OpenCL Image2D failed! (ERROR CODE: " << mindspore::kernel::CLErrorCode(ret) << ")";
+    return nullptr;
+  }
+  if (ret != CL_SUCCESS) {
+    delete *buffer;
+    delete *image;
+    MS_LOG(ERROR) << "Create OpenCL Image2D  (ERROR CODE: " << mindspore::kernel::CLErrorCode(ret) << ")";
    return nullptr;
  }
  MS_LOG(DEBUG) << "Malloc a new Image2D, width=" << img_size[0] << ", height=" << img_size[1];
  void *host_ptr = nullptr;
  if (is_map) {
    std::vector<size_t> region{img_size[0], img_size[1], 1};
-    host_ptr = ocl_runtime_->MapBuffer(**image, 0, CL_MAP_READ | CL_MAP_WRITE, region);
+    host_ptr = ocl_runtime_->MapBuffer(**image, true, CL_MAP_READ | CL_MAP_WRITE, region);
    if (host_ptr == nullptr) {
      delete *buffer;
      delete *image;
@ -340,7 +346,7 @@ void *OpenCLAllocator::MapBuffer(void *host_ptr, int flags, void *command_queue,
    std::vector<size_t> region{mem_buf->img_size[0], mem_buf->img_size[1], 1};
    cl::Image2D *image = static_cast<cl::Image2D *>(mem_buf->image_ptr_);
    MS_ASSERT(image);
-    new_host_ptr = ocl_runtime_->MapBuffer(*image, 0, CL_MAP_READ | CL_MAP_WRITE, region);
+    new_host_ptr = ocl_runtime_->MapBuffer(*image, sync, CL_MAP_READ | CL_MAP_WRITE, region);
  }
  if (new_host_ptr == nullptr) {
    UnLock();
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/argminmax_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/argminmax_tests.cc
@ -185,4 +185,19 @@ TEST_F(TestOpenCL_ArgMinMax, axis3topk2value) {
    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
  }
 }
+TEST_F(TestOpenCL_ArgMinMax, axis1topk1index) {
+  schema::PrimitiveType type = schema::PrimitiveType_ArgMax;
+  int axis = 1;
+  int topk = 1;
+  bool out_value = false;
+  std::vector<int> input_shape = {1, 2, 14};
+  std::vector<int> output_shape = {1, 14};
+  float input_data[] = {10, 20, 30, 40, 90, 20, 11, 15, 1,  50, 30, 45, 25, 50,
+                        30, 10, 20, 30, 40, 90, 20, 11, 15, 1,  50, 30, 45, 25};
+  float output_data[] = {1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(type, axis, topk, out_value);
+    TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable, 1e-1, 1e-1, true);
+  }
+}
 }  // namespace mindspore::lite::opencl::test
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc
@ -58,22 +58,24 @@ TEST_F(TestOpenCL_DepthwiseConv2d, NoPad) {
  std::vector<int> output_shape = {1, 2, 2, 4};
  std::vector<int> weight_shape = {1, kernel_h, kernel_w, output_shape.back()};
  std::vector<int> bias_shape = {output_shape.back()};
-  float input_data[] = {0.5488135,  0.0202184,  0.45615032, 0.31542835, 0.71518934, 0.83261985, 0.56843394, 0.36371076,
-                        0.60276335, 0.77815676, 0.0187898,  0.57019675, 0.5448832,  0.87001216, 0.6176355,  0.43860152,
-                        0.4236548,  0.9786183,  0.6120957,  0.9883738,  0.6458941,  0.7991586,  0.616934,   0.10204481,
-                        0.4375872,  0.46147937, 0.94374806, 0.20887676, 0.891773,   0.7805292,  0.6818203,  0.16130951,
-                        0.96366274, 0.11827443, 0.3595079,  0.6531083,  0.3834415,  0.639921,   0.43703195, 0.2532916,
-                        0.79172504, 0.14335328, 0.6976312,  0.46631077, 0.5288949,  0.9446689,  0.06022547, 0.2444256,
-                        0.56804454, 0.5218483,  0.6667667,  0.15896958, 0.92559665, 0.41466194, 0.67063785, 0.11037514,
-                        0.07103606, 0.2645556,  0.21038257, 0.6563296,  0.0871293,  0.7742337,  0.12892629, 0.13818295};
+  float input_data[] = {
+    0.5488135,  0.71518934, 0.60276335, 0.5448832,  0.4236548,  0.6458941,  0.4375872,  0.891773,
+    0.96366274, 0.3834415,  0.79172504, 0.5288949,  0.56804454, 0.92559665, 0.07103606, 0.0871293,
+    0.0202184,  0.83261985, 0.77815676, 0.87001216, 0.9786183,  0.7991586,  0.46147937, 0.7805292,
+    0.11827443, 0.639921,   0.14335328, 0.9446689,  0.5218483,  0.41466194, 0.2645556,  0.7742337,
+    0.45615032, 0.56843394, 0.0187898,  0.6176355,  0.6120957,  0.616934,   0.94374806, 0.6818203,
+    0.3595079,  0.43703195, 0.6976312,  0.06022547, 0.6667667,  0.67063785, 0.21038257, 0.12892629,
+    0.31542835, 0.36371076, 0.57019675, 0.43860152, 0.9883738,  0.10204481, 0.20887676, 0.16130951,
+    0.6531083,  0.2532916,  0.46631077, 0.2444256,  0.15896958, 0.11037514, 0.6563296,  0.13818295,
+  };
  float bias_data[] = {0, 0, 0, 0};
  float weight_data[] = {0.19658236, 0.36872518, 0.82099324, 0.09710128, 0.8379449,  0.09609841, 0.97645944, 0.4686512,
                         0.9767611,  0.6048455,  0.7392636,  0.03918779, 0.28280696, 0.12019656, 0.2961402,  0.11872772,
                         0.31798318, 0.41426298, 0.06414749, 0.6924721,  0.56660146, 0.2653895,  0.5232481,  0.09394051,
                         0.5759465,  0.9292962,  0.31856894, 0.6674104,  0.13179787, 0.7163272,  0.2894061,  0.18319136,
                         0.5865129,  0.02010755, 0.82894003, 0.00469548};
-  float output_data[] = {3.3848767, 1.4446403, 1.8428744, 1.3194335, 2.5873442, 2.1384869, 2.04022,  1.1872686,
-                         2.2294958, 1.6570128, 2.465089,  1.4294086, 2.7941442, 1.7871612, 2.188921, 1.0601988};
+  float output_data[] = {2.9720426, 1.890834,  2.3618119, 2.3867798, 2.5666943, 1.6261611, 2.0977764, 1.6445805,
+                         2.462798,  1.6643658, 1.6861027, 1.8428761, 2.5156446, 1.5366757, 1.6767557, 1.6905226};

  for (auto fp16_enable : {false, true}) {
    auto *param = CreateParameter(kernel_h, kernel_w, stride_h, stride_w, pad_u, pad_d, pad_l, pad_r, dilation_h,
@ -132,4 +134,117 @@ TEST_F(TestOpenCL_DepthwiseConv2d, Pad) {
  }
 }

+TEST_F(TestOpenCL_DepthwiseConv2d, NoPad1) {
+  int kernel_h = 2;
+  int kernel_w = 2;
+  int stride_h = 1;
+  int stride_w = 1;
+  int pad_u = 0;
+  int pad_d = 0;
+  int pad_l = 0;
+  int pad_r = 0;
+  int dilation_h = 1;
+  int dilation_w = 1;
+  ActType act_type = ActType_No;
+
+  std::vector<int> input_shape = {1, 4, 4, 4};
+  std::vector<int> output_shape = {1, 3, 3, 4};
+  std::vector<int> weight_shape = {1, kernel_h, kernel_w, output_shape.back()};
+  std::vector<int> bias_shape = {output_shape.back()};
+  float input_data[] = {0.5488135,  0.71518934, 0.60276335, 0.5448832,  0.4236548,  0.6458941,  0.4375872,  0.891773,
+                        0.96366274, 0.3834415,  0.79172504, 0.5288949,  0.56804454, 0.92559665, 0.07103606, 0.0871293,
+                        0.0202184,  0.83261985, 0.77815676, 0.87001216, 0.9786183,  0.7991586,  0.46147937, 0.7805292,
+                        0.11827443, 0.639921,   0.14335328, 0.9446689,  0.5218483,  0.41466194, 0.2645556,  0.7742337,
+                        0.45615032, 0.56843394, 0.0187898,  0.6176355,  0.6120957,  0.616934,   0.94374806, 0.6818203,
+                        0.3595079,  0.43703195, 0.6976312,  0.06022547, 0.6667667,  0.67063785, 0.21038257, 0.12892629,
+                        0.31542835, 0.36371076, 0.57019675, 0.43860152, 0.9883738,  0.10204481, 0.20887676, 0.16130951,
+                        0.6531083,  0.2532916,  0.46631077, 0.2444256,  0.15896958, 0.11037514, 0.6563296,  0.13818295};
+  float bias_data[] = {0, 0, 0, 0};
+  float weight_data[] = {0.19658236, 0.36872517, 0.82099323, 0.09710128, 0.83794491, 0.09609841,
+                         0.97645947, 0.4686512,  0.97676109, 0.60484552, 0.73926358, 0.03918779,
+                         0.28280696, 0.12019656, 0.2961402,  0.11872772};
+  float output_data[] = {0.3757235,  1.8489048,  1.4467758,  0.6116009,  1.2535334, 1.6583176, 1.2530621,  0.6590755,
+                         0.5466661,  1.22944,    0.93263525, 0.5317252,  0.7987474, 1.618667,  1.090071,   0.60372007,
+                         0.773425,   1.5383728,  1.262479,   0.54334986, 0.5755667, 1.3171062, 0.82401496, 0.39336145,
+                         0.6703031,  0.9385749,  1.018886,   0.40566355, 1.1277528, 0.7773028, 1.5164642,  0.27685273,
+                         0.86816025, 0.72971237, 1.1791146,  0.12131907};
+
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(kernel_h, kernel_w, stride_h, stride_w, pad_u, pad_d, pad_l, pad_r, dilation_h,
+                                  dilation_w, act_type, input_shape.back());
+    TestMain({{input_shape, input_data, VAR},
+              {weight_shape, weight_data, CONST_TENSOR},
+              {bias_shape, bias_data, CONST_TENSOR}},
+             {output_shape, output_data}, param, fp16_enable, fp16_enable ? 1e-2 : 1e-5, 1e-1, true);
+  }
+}
+TEST_F(TestOpenCL_DepthwiseConv2d, Pad1) {
+  int kernel_h = 3;
+  int kernel_w = 3;
+  int stride_h = 1;
+  int stride_w = 1;
+  int pad_u = 1;
+  int pad_d = 1;
+  int pad_l = 1;
+  int pad_r = 1;
+  int dilation_h = 1;
+  int dilation_w = 1;
+  ActType act_type = ActType_No;
+
+  std::vector<int> input_shape = {1, 5, 5, 6};
+  std::vector<int> output_shape = {1, 5, 5, 6};
+  std::vector<int> weight_shape = {1, kernel_h, kernel_w, output_shape.back()};
+  std::vector<int> bias_shape = {output_shape.back()};
+  float input_data[] = {
+    0.5488135,  0.71518934, 0.60276335, 0.5448832,  0.4236548,  0.6458941,  0.4375872,  0.891773,   0.96366274,
+    0.3834415,  0.79172504, 0.5288949,  0.56804454, 0.92559665, 0.07103606, 0.0871293,  0.0202184,  0.83261985,
+    0.77815676, 0.87001216, 0.9786183,  0.7991586,  0.46147937, 0.7805292,  0.11827443, 0.639921,   0.14335328,
+    0.9446689,  0.5218483,  0.41466194, 0.2645556,  0.7742337,  0.45615032, 0.56843394, 0.0187898,  0.6176355,
+    0.6120957,  0.616934,   0.94374806, 0.6818203,  0.3595079,  0.43703195, 0.6976312,  0.06022547, 0.6667667,
+    0.67063785, 0.21038257, 0.12892629, 0.31542835, 0.36371076, 0.57019675, 0.43860152, 0.9883738,  0.10204481,
+    0.20887676, 0.16130951, 0.6531083,  0.2532916,  0.46631077, 0.2444256,  0.15896958, 0.11037514, 0.6563296,
+    0.13818295, 0.19658236, 0.36872518, 0.82099324, 0.09710128, 0.8379449,  0.09609841, 0.97645944, 0.4686512,
+    0.9767611,  0.6048455,  0.7392636,  0.03918779, 0.28280696, 0.12019656, 0.2961402,  0.11872772, 0.31798318,
+    0.41426298, 0.06414749, 0.6924721,  0.56660146, 0.2653895,  0.5232481,  0.09394051, 0.5759465,  0.9292962,
+    0.31856894, 0.6674104,  0.13179787, 0.7163272,  0.2894061,  0.18319136, 0.5865129,  0.02010755, 0.82894003,
+    0.00469548, 0.6778165,  0.27000797, 0.735194,   0.96218854, 0.24875315, 0.57615733, 0.5920419,  0.5722519,
+    0.22308163, 0.952749,   0.44712538, 0.84640867, 0.6994793,  0.29743695, 0.81379783, 0.39650574, 0.8811032,
+    0.5812729,  0.8817354,  0.6925316,  0.7252543,  0.50132436, 0.95608366, 0.6439902,  0.42385504, 0.6063932,
+    0.0191932,  0.30157483, 0.66017354, 0.2900776,  0.6180154,  0.4287687,  0.13547407, 0.29828233, 0.5699649,
+    0.59087276, 0.57432526, 0.6532008,  0.65210325, 0.43141845, 0.8965466,  0.36756188, 0.43586493, 0.89192337,
+    0.806194,   0.7038886,  0.10022689, 0.9194826,  0.7142413,  0.998847};
+  float weight_data[] = {0.1494483,  0.86812606, 0.16249293, 0.61555956, 0.12381998, 0.84800823, 0.80731896, 0.56910074,
+                         0.4071833,  0.069167,   0.69742877, 0.45354268, 0.7220556,  0.86638233, 0.97552151, 0.85580334,
+                         0.01171408, 0.35997806, 0.72999056, 0.17162968, 0.52103661, 0.05433799, 0.19999652, 0.01852179,
+                         0.7936977,  0.22392469, 0.34535168, 0.92808129, 0.7044144,  0.03183893, 0.16469416, 0.6214784,
+                         0.57722859, 0.23789282, 0.934214,   0.61396596, 0.5356328,  0.58990998, 0.73012203, 0.311945,
+                         0.39822106, 0.20984375, 0.18619301, 0.94437239, 0.7395508,  0.49045881, 0.22741463, 0.25435648,
+                         0.05802916, 0.43441663, 0.31179588, 0.69634349, 0.37775184, 0.17960368};
+  float bias_data[] = {0, 0, 0, 0, 0, 0};
+  float output_data[] = {
+    0.8388255,  1.7207233,  0.56646764, 1.50962,   0.6184657,  0.7572999, 1.7197044,  2.8834608, 1.0304408,  1.5622743,
+    0.95027775, 1.1451806,  2.0191956,  2.9541533, 1.1799709,  1.6366025, 1.3484346,  1.0071151, 1.3740869,  2.1602216,
+    1.0846798,  1.7810996,  1.6170096,  0.6889053, 0.8671698,  1.4957678, 0.68065727, 1.0596768, 0.9761665,  0.38881996,
+    1.524128,   2.2121127,  1.1506181,  1.330961,  1.8186853,  0.9094476, 2.3777275,  2.5568333, 1.8321692,  1.8297466,
+    2.069798,   1.3701197,  2.7548862,  2.0871775, 2.3611763,  1.5387508, 1.6725919,  1.2565864, 2.6130712,  2.0915375,
+    1.2955335,  1.6571269,  1.7603228,  1.3315495, 1.0005323,  1.0135669, 1.2701392,  1.8230836, 1.6048919,  1.4224635,
+    1.4651375,  1.0251865,  1.0325887,  1.2355556, 1.3313429,  0.6756204, 2.602416,   2.1827717, 1.4354478,  1.6628273,
+    2.0171032,  1.0299077,  2.6085434,  1.3310422, 2.1677747,  2.457499,  2.6715999,  1.0225507, 2.5822947,  2.1068158,
+    1.6401942,  2.5422354,  2.6937182,  1.3813802, 1.1241511,  1.273326,  1.2024405,  1.4564767, 2.016776,   1.0182433,
+    1.228782,   0.83329916, 1.033041,   1.3280122, 1.9437144,  0.6729013, 2.438968,   2.3275855, 2.289177,   1.4376242,
+    2.4595368,  1.325891,   2.018128,   2.676854,  1.9685578,  1.8240746, 2.3104675,  1.4958379, 2.474168,   2.6657124,
+    1.6738743,  2.336092,   2.3048637,  1.802324,  1.7594845,  1.6022205, 1.2564734,  1.8977238, 1.6991055,  1.8674731,
+    0.47793916, 1.2031221,  0.6579696,  1.0724078, 0.96408695, 0.5074543, 1.2399375,  1.410824,  0.56263226, 1.3138686,
+    1.4859737,  0.7219256,  1.3437214,  2.0015993, 1.0472497,  1.064316,  1.7359762,  0.9249617, 1.2835678,  2.1866667,
+    0.92954785, 2.005947,   1.8761289,  1.2612648, 1.2410495,  1.263778,  0.54638237, 1.8269669, 1.3152003,  0.7890457};
+
+  for (auto fp16_enable : {false, true}) {
+    auto *param = CreateParameter(kernel_h, kernel_w, stride_h, stride_w, pad_u, pad_d, pad_l, pad_r, dilation_h,
+                                  dilation_w, act_type, input_shape.back());
+    TestMain({{input_shape, input_data, VAR},
+              {weight_shape, weight_data, CONST_TENSOR},
+              {bias_shape, bias_data, CONST_TENSOR}},
+             {output_shape, output_data}, param, fp16_enable, fp16_enable ? 1e-2 : 1e-5, 1e-1, true);
+  }
+}
 }  // namespace mindspore::lite::opencl::test
--- a/mindspore/nn/cell.py
+++ b/mindspore/nn/cell.py
@ -23,6 +23,7 @@ import numpy

 from mindspore import log as logger
 from mindspore.common.parameter import PARAMETER_NAME_DEFAULT
+from mindspore.context import ParallelMode
 from .. import context
 from .._c_expression import init_pipeline, Cell_
 from .._checkparam import Validator
@ -90,6 +91,7 @@ class Cell(Cell_):
        self._parameter_layout_dict = {}
        self._create_time = int(time.time() * 1e9)
        self.phase_prefix = ""
+        self.parameter_broadcast_done = False
        init_pipeline()

        # call gc to release GE session resources used by non-used cell objects
@ -300,6 +302,11 @@ class Cell(Cell_):
            out = self.compile_and_run(*inputs)
            return out

+        if context.get_auto_parallel_context("parallel_mode") == ParallelMode.DATA_PARALLEL:
+            if not self.parameter_broadcast_done:
+                _pynative_exec.parameter_broadcast(self, self.phase, self._auto_parallel_mode)
+                self.parameter_broadcast_done = True
+
        for item in inputs:
            if isinstance(item, numpy.ndarray):
                raise TypeError("cell inputs should not be numpy array.")
--- a/mindspore/nn/loss/init.py
+++ b/mindspore/nn/loss/init.py
@ -21,8 +21,8 @@ It shows how well the model works on a dataset and the optimization target which

 from .loss import L1Loss, MSELoss, SmoothL1Loss, \
    SoftmaxCrossEntropyWithLogits, BCELoss, CosineEmbeddingLoss, \
-    SampledSoftmaxLoss
+    SampledSoftmaxLoss, DiceLoss

 __all__ = ['L1Loss', 'MSELoss', 'SmoothL1Loss',
           'SoftmaxCrossEntropyWithLogits', 'BCELoss',
-           'CosineEmbeddingLoss', 'SampledSoftmaxLoss']
+           'CosineEmbeddingLoss', 'SampledSoftmaxLoss', 'DiceLoss']
--- a/mindspore/nn/loss/loss.py
+++ b/mindspore/nn/loss/loss.py
@ -297,6 +297,67 @@ def _check_label_dtype(labels_dtype, cls_name):
    validator.check_type_name("labels", labels_dtype, [mstype.int32, mstype.int64], cls_name)


+class DiceLoss(_Loss):
+    r"""
+    The Dice coefficient is a set similarity measure. It is used to calculate the similarity between two samples. The
+    value of the Dice coefficient is 1 when the segmentation result is the best and 0 when the segmentation result
+    is the worst. The Dice coefficient indicates the ratio of the area between two objects to the total area.
+    The loss is one minus the Dice coefficient averaged over the first dimension:
+
+    .. math::
+        dice = 1 - \frac{2 * |pred \bigcap true|}{|pred| + |true|}
+
+    Args:
+        smooth (float): A term added to both the numerator and the denominator to improve numerical stability.
+                        Should be greater than 0. Default: 1e-5.
+        threshold (float): Elements of the prediction greater than this value are set to 1, others to 0. Default: 0.5.
+
+    Inputs:
+        - **y_pred** (Tensor) - Tensor of shape (N, C).
+        - **y** (Tensor) - Tensor of shape (N, C).
+
+    Outputs:
+        Tensor, the Dice loss: one minus the per-sample Dice coefficient averaged over the first dimension (N).
+
+    Supported Platforms:
+        ``Ascend``
+
+    Examples:
+        >>> loss = nn.DiceLoss(smooth=1e-5, threshold=0.5)
+        >>> y_pred = Tensor(np.array([[0.2, 0.5], [0.3, 0.1], [0.9, 0.6]]), mstype.float32)
+        >>> y = Tensor(np.array([[0, 1], [1, 0], [0, 1]]), mstype.float32)
+        >>> output = loss(y_pred, y)
+        >>> print(output)
+        [0.77777076]
+    """
+    def __init__(self, smooth=1e-5, threshold=0.5):
+        super(DiceLoss, self).__init__()
+        self.smooth = validator.check_positive_float(smooth, "smooth")
+        self.threshold = validator.check_value_type("threshold", threshold, [float])
+        self.reshape = P.Reshape()  # NOTE(review): cast/reduce_sum presumably come from _Loss - confirm
+
+    def construct(self, logits, label):
+        _check_shape(logits.shape, label.shape)
+        logits = self.cast((logits > self.threshold), mstype.float32)  # binarize predictions: 1.0 above threshold
+        label = self.cast(label, mstype.float32)
+        dim = label.shape
+        pred_flat = self.reshape(logits, (dim[0], -1))  # flatten everything after the first dimension
+        true_flat = self.reshape(label, (dim[0], -1))
+
+        intersection = self.reduce_sum((pred_flat * true_flat), 1)
+        unionset = self.reduce_sum(pred_flat, 1) + self.reduce_sum(true_flat, 1)  # sum of both sizes, not a union
+
+        dice = (2 * intersection + self.smooth) / (unionset + self.smooth)  # per-sample Dice coefficient
+        dice_loss = 1 - self.reduce_sum(dice) / dim[0]  # one minus the mean over the first dimension
+
+        return dice_loss
+
+
+@constexpr
+def _check_shape(logits_shape, label_shape):  # compares the two shapes through validator.check
+    validator.check('logits_shape', logits_shape, 'label_shape', label_shape)  # presumably an equality check - confirm
+
+
 class SampledSoftmaxLoss(_Loss):
    r"""
    Computes the sampled softmax training loss.
--- a/mindspore/nn/metrics/dice.py
+++ b/mindspore/nn/metrics/dice.py
@ -26,7 +26,7 @@ class Dice(Metric):
        The function is shown as follows:

        .. math::
-            \text{dice} = \frac{2 * (\text{pred} \bigcap \text{true})}{\text{pred} \bigcup \text{true}}
+            dice = \frac{2 * (pred \bigcap true)}{pred \bigcup true}

        Args:
            smooth (float): A term added to the denominator to improve numerical stability. Should be greater than 0.
@ -58,7 +58,7 @@ class Dice(Metric):

    def update(self, *inputs):
        """
-        Updates the internal evaluation result :math:`y_{pred}` and :math:`y`.
+        Updates the internal evaluation result :math:`y_pred` and :math:`y`.

        Args:
            inputs: Input `y_pred` and `y`. `y_pred` and `y` are Tensor, list or numpy.ndarray. `y_pred` is the
--- a/mindspore/nn/metrics/hausdorff_distance.py
+++ b/mindspore/nn/metrics/hausdorff_distance.py
@ -70,9 +70,9 @@ class HausdorffDistance(Metric):
    Given two feature sets A and B, the Hausdorff distance between two point sets A and B is defined as follows:

    .. math::
-        \text{H}(A, B) = \text{max}[\text{h}(A, B), \text{h}(B, A)]
-        \text{h}(A, B) = \underset{a \in A}{\text{max}}\{\underset{b \in B}{\text{min}} \rVert a - b \rVert \}
-        \text{h}(A, B) = \underset{b \in B}{\text{max}}\{\underset{a \in A}{\text{min}} \rVert b - a \rVert \}
+        H(A, B) = \text{max}[h(A, B), h(B, A)]
+        h(A, B) = \underset{a \in A}{\text{max}}\{\underset{b \in B}{\text{min}} \lVert a - b \rVert \}
+        h(B, A) = \underset{b \in B}{\text{max}}\{\underset{a \in A}{\text{min}} \lVert b - a \rVert \}

    Args:
        distance_metric (string): The parameter of calculating Hausdorff distance supports three measurement methods,
--- a/model_zoo/official/cv/centerface/README.md
+++ b/model_zoo/official/cv/centerface/README.md
@ -84,7 +84,7 @@ other datasets need to use the same format as WiderFace.
 - Hardware（Ascend）
    - Prepare hardware environment with Ascend processor. If you want to try Ascend, please send the [application form](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/file/other/Ascend%20Model%20Zoo%E4%BD%93%E9%AA%8C%E8%B5%84%E6%BA%90%E7%94%B3%E8%AF%B7%E8%A1%A8.docx) to ascend@huawei.com. Once approved, you can get the resources.
 - Framework
-    - [MindSpore](https://cmc-szv.clouddragon.huawei.com/cmcversion/index/search?searchKey=Do-MindSpore%20V100R001C00B622)
+    - [MindSpore](https://www.mindspore.cn/install/en)
 - For more information, please check the resources below：
    - [MindSpore tutorials](https://www.mindspore.cn/tutorial/training/en/master/index.html)
    - [MindSpore Python API](https://www.mindspore.cn/doc/api_python/en/master/index.html)
--- a/model_zoo/official/cv/deeptext/README.md
+++ b/model_zoo/official/cv/deeptext/README.md
@ -187,7 +187,7 @@ class 1 precision is 88.01%, recall is 82.77%
 | Loss Function              | SoftmaxCrossEntropyWithLogits for classification, SmoothL2Loss for bbox regression|
 | Loss                       | ~0.008                                                       |
 | Total time (8p)            | 4h                                                           |
-| Scripts                    | [deeptext script](https://gitee.com/mindspore/mindspore/tree/r1.1/mindspore/official/cv/deeptext) |
+| Scripts                    | [deeptext script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/deeptext) |

 #### Inference Performance

--- a/model_zoo/official/cv/psenet/README.md
+++ b/model_zoo/official/cv/psenet/README.md
@ -197,7 +197,7 @@ Calculated!{"precision": 0.814796668299853, "recall": 0.8006740491092923, "hmean
 | Total time                 | 1pc: 75.48 h;  8pcs: 10.01 h                                |
 | Parameters (M)             | 27.36                                                       |
 | Checkpoint for Fine tuning | 109.44M (.ckpt file)                                        |
-| Scripts                    | <https://gitee.com/mindspore/mindspore/tree/master/model_zoo/psenet> |
+| Scripts                    | <https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/psenet> |

 ### Inference Performance

--- a/model_zoo/official/cv/psenet/README_CN.md
+++ b/model_zoo/official/cv/psenet/README_CN.md
@ -195,7 +195,7 @@ Calculated!{"precision": 0.8147966668299853，"recall"：0.8006740491092923，"h
 | 总时间 | 1卡：75.48小时；4卡：18.87小时|
 | 参数(M) | 27.36 |
 | 微调检查点 | 109.44M （.ckpt file） |
-| 脚本 | <https://gitee.com/mindspore/mindspore/tree/master/model_zoo/psenet> |
+| 脚本 | <https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/psenet> |

 ### 推理性能

--- a/model_zoo/official/cv/ssd/README.md
+++ b/model_zoo/official/cv/ssd/README.md
@ -371,34 +371,34 @@ The ckpt_file parameter is required.

 #### Evaluation Performance

-| Parameters                 | Ascend                                                       | GPU                                                          |
-| -------------------------- | -------------------------------------------------------------| -------------------------------------------------------------|
-| Model Version              | SSD V1                                                       | SSD V1                                                       |
-| Resource                   | Ascend 910 ；CPU 2.60GHz，192cores；Memory，755G             | NV SMX2 V100-16G                                             |
-| uploaded Date              | 09/15/2020 (month/day/year)                                  | 09/24/2020 (month/day/year)                                  |
-| MindSpore Version          | 1.0.0                                                        | 1.0.0                                                        |
-| Dataset                    | COCO2017                                                     | COCO2017                                                     |
-| Training Parameters        | epoch = 500,  batch_size = 32                                | epoch = 800,  batch_size = 32                                |
-| Optimizer                  | Momentum                                                     | Momentum                                                     |
-| Loss Function              | Sigmoid Cross Entropy,SmoothL1Loss                           | Sigmoid Cross Entropy,SmoothL1Loss                           |
-| Speed                      | 8pcs: 90ms/step                                              | 8pcs: 121ms/step                                             |
-| Total time                 | 8pcs: 4.81hours                                              | 8pcs: 12.31hours                                             |
-| Parameters (M)             | 34                                                           | 34                                                           |
-| Scripts                    | <https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/ssd> | <https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/ssd> |
+| Parameters          | Ascend                                                                        | GPU                                                                           | Ascend                                                                        |
+| ------------------- | ----------------------------------------------------------------------------- | ----------------------------------------------------------------------------- | ----------------------------------------------------------------------------- |
+| Model Version       | SSD V1                                                                        | SSD V1                                                                        | SSD-Mobilenet-V1-Fpn                                                          |
+| Resource            | Ascend 910 ；CPU 2.60GHz，192cores；Memory，755G                              | NV SMX2 V100-16G                                                              | Ascend 910 ；CPU 2.60GHz，192cores；Memory，755G                              |
+| uploaded Date       | 09/15/2020 (month/day/year)                                                   | 09/24/2020 (month/day/year)                                                   | 01/13/2021 (month/day/year)                                                   |
+| MindSpore Version   | 1.0.0                                                                         | 1.0.0                                                                         | 1.1.0                                                                         |
+| Dataset             | COCO2017                                                                      | COCO2017                                                                      | COCO2017                                                                      |
+| Training Parameters | epoch = 500,  batch_size = 32                                                 | epoch = 800,  batch_size = 32                                                 | epoch = 60,  batch_size = 32                                                  |
+| Optimizer           | Momentum                                                                      | Momentum                                                                      | Momentum                                                                      |
+| Loss Function       | Sigmoid Cross Entropy,SmoothL1Loss                                            | Sigmoid Cross Entropy,SmoothL1Loss                                            | Sigmoid Cross Entropy,SmoothL1Loss                                            |
+| Speed               | 8pcs: 90ms/step                                                               | 8pcs: 121ms/step                                                              | 8pcs: 547ms/step                                                              |
+| Total time          | 8pcs: 4.81hours                                                               | 8pcs: 12.31hours                                                              | 8pcs: 4.22hours                                                               |
+| Parameters (M)      | 34                                                                            | 34                                                                            | 48M                                                                           |
+| Scripts             | <https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/ssd> | <https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/ssd> | <https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/ssd> |

 #### Inference Performance

-| Parameters          | Ascend                      | GPU                         |
-| ------------------- | ----------------------------| ----------------------------|
-| Model Version       | SSD V1                      | SSD V1                      |
-| Resource            | Ascend 910                  | GPU                         |
-| Uploaded Date       | 09/15/2020 (month/day/year) | 09/24/2020 (month/day/year) |
-| MindSpore Version   | 1.0.0                       | 1.0.0                       |
-| Dataset             | COCO2017                    | COCO2017                    |
-| batch_size          | 1                           | 1                           |
-| outputs             | mAP                         | mAP                         |
-| Accuracy            | IoU=0.50: 23.8%             | IoU=0.50: 22.4%             |
-| Model for inference | 34M(.ckpt file)             | 34M(.ckpt file)             |
+| Parameters          | Ascend                      | GPU                         | Ascend                      |
+| ------------------- | --------------------------- | --------------------------- | --------------------------- |
+| Model Version       | SSD V1                      | SSD V1                      | SSD-Mobilenet-V1-Fpn        |
+| Resource            | Ascend 910                  | GPU                         | Ascend 910                  |
+| Uploaded Date       | 09/15/2020 (month/day/year) | 09/24/2020 (month/day/year) | 01/13/2021 (month/day/year) |
+| MindSpore Version   | 1.0.0                       | 1.0.0                       | 1.1.0                       |
+| Dataset             | COCO2017                    | COCO2017                    | COCO2017                    |
+| batch_size          | 1                           | 1                           | 1                           |
+| outputs             | mAP                         | mAP                         | mAP                         |
+| Accuracy            | IoU=0.50: 23.8%             | IoU=0.50: 22.4%             | IoU=0.50: 30%               |
+| Model for inference | 34M(.ckpt file)             | 34M(.ckpt file)             | 48M(.ckpt file)             |

 ## [Description of Random Situation](#contents)

--- a/model_zoo/official/cv/ssd/eval.py
+++ b/model_zoo/official/cv/ssd/eval.py
@ -21,10 +21,11 @@ import time
 import numpy as np
 from mindspore import context, Tensor
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
-from src.ssd import SSD300, ssd_mobilenet_v2, ssd_mobilenet_v1_fpn
+from src.ssd import SSD300, SsdInferWithDecoder, ssd_mobilenet_v2, ssd_mobilenet_v1_fpn
 from src.dataset import create_ssd_dataset, create_mindrecord
 from src.config import config
 from src.eval_utils import metrics
+from src.box_utils import default_boxes

 def ssd_eval(dataset_path, ckpt_path, anno_json):
    """SSD evaluation."""
@ -35,6 +36,8 @@ def ssd_eval(dataset_path, ckpt_path, anno_json):
        net = SSD300(ssd_mobilenet_v2(), config, is_training=False)
    else:
        net = ssd_mobilenet_v1_fpn(config=config)
+    net = SsdInferWithDecoder(net, Tensor(default_boxes), config)
+
    print("Load Checkpoint!")
    param_dict = load_checkpoint(ckpt_path)
    net.init_parameters_data()
--- a/model_zoo/official/cv/ssd/export.py
+++ b/model_zoo/official/cv/ssd/export.py
@ -19,8 +19,9 @@ import numpy as np
 import mindspore
 from mindspore import context, Tensor
 from mindspore.train.serialization import load_checkpoint, load_param_into_net, export
-from src.ssd import SSD300, ssd_mobilenet_v2, ssd_mobilenet_v1_fpn
+from src.ssd import SSD300, SsdInferWithDecoder, ssd_mobilenet_v2, ssd_mobilenet_v1_fpn
 from src.config import config
+from src.box_utils import default_boxes

 parser = argparse.ArgumentParser(description='SSD export')
 parser.add_argument("--device_id", type=int, default=0, help="Device id")
@ -41,6 +42,7 @@ if __name__ == '__main__':
        net = SSD300(ssd_mobilenet_v2(), config, is_training=False)
    else:
        net = ssd_mobilenet_v1_fpn(config=config)
+    net = SsdInferWithDecoder(net, Tensor(default_boxes), config)

    param_dict = load_checkpoint(args.ckpt_file)
    net.init_parameters_data()
--- a/model_zoo/official/cv/ssd/scripts/run_distribute_train.sh
+++ b/model_zoo/official/cv/ssd/scripts/run_distribute_train.sh
@ -31,7 +31,7 @@ fi
 # Before start distribute train, first create mindrecord files.
 BASE_PATH=$(cd "`dirname $0`" || exit; pwd)
 cd $BASE_PATH/../ || exit
-python train.py --only_create_dataset=True
+python train.py --only_create_dataset=True --dataset=$4

 echo "After running the scipt, the network runs in the background. The log will be generated in LOGx/log.txt"

--- a/model_zoo/official/cv/ssd/scripts/run_distribute_train_gpu.sh
+++ b/model_zoo/official/cv/ssd/scripts/run_distribute_train_gpu.sh
@ -31,7 +31,7 @@ fi
 # Before start distribute train, first create mindrecord files.
 BASE_PATH=$(cd "`dirname $0`" || exit; pwd)
 cd $BASE_PATH/../ || exit
-python train.py --only_create_dataset=True --run_platform="GPU"
+python train.py --only_create_dataset=True --run_platform="GPU" --dataset=$4

 echo "After running the scipt, the network runs in the background. The log will be generated in LOG/log.txt"

--- a/model_zoo/official/cv/ssd/src/dataset.py
+++ b/model_zoo/official/cv/ssd/src/dataset.py
@ -207,10 +207,10 @@ def create_voc_label(is_training):
                print(f'Label "{cls_name}" not in "{config.classes}"')
                continue
            bnd_box = obj.find('bndbox')
-            x_min = int(bnd_box.find('xmin').text) - 1
-            y_min = int(bnd_box.find('ymin').text) - 1
-            x_max = int(bnd_box.find('xmax').text) - 1
-            y_max = int(bnd_box.find('ymax').text) - 1
+            x_min = int(float(bnd_box.find('xmin').text)) - 1
+            y_min = int(float(bnd_box.find('ymin').text)) - 1
+            x_max = int(float(bnd_box.find('xmax').text)) - 1
+            y_max = int(float(bnd_box.find('ymax').text)) - 1
            labels.append([y_min, x_min, y_max, x_max, cls_map[cls_name]])

            if not is_training:
--- a/model_zoo/official/cv/ssd/src/eval_utils.py
+++ b/model_zoo/official/cv/ssd/src/eval_utils.py
@ -17,7 +17,6 @@
 import json
 import numpy as np
 from .config import config
-from .box_utils import ssd_bboxes_decode


 def apply_nms(all_boxes, all_scores, thres, max_boxes):
@ -81,7 +80,6 @@ def metrics(pred_data, anno_json):
        img_id = sample['img_id']
        h, w = sample['image_shape']

-        pred_boxes = ssd_bboxes_decode(pred_boxes)
        final_boxes = []
        final_label = []
        final_score = []
--- a/model_zoo/official/cv/ssd/src/ssd.py
+++ b/model_zoo/official/cv/ssd/src/ssd.py
@ -569,6 +569,42 @@ class SSDWithMobileNetV2(nn.Cell):
        return self.last_channel


+class SsdInferWithDecoder(nn.Cell):
+    """
+    SSD inference wrapper that decodes the predicted box offsets into clipped corner coordinates.
+
+    Args:
+        network (Cell): the original SSD inference network, without a bbox decoder.
+        default_boxes (Tensor): the default boxes; the last axis is read as two center values then two size values.
+        config: SSD config object; only `config.prior_scaling[0]` and `config.prior_scaling[1]` are read here.
+    Returns:
+        Tensor, the decoded box corners clipped to [0, 1], documented as (y0,x0,y1,x1).
+        Tensor, the prediction labels, passed through unchanged from `network`.
+
+    """
+    def __init__(self, network, default_boxes, config):
+        super(SsdInferWithDecoder, self).__init__()
+        self.network = network
+        self.default_boxes = default_boxes
+        self.prior_scaling_xy = config.prior_scaling[0]  # scale applied to the center offsets
+        self.prior_scaling_wh = config.prior_scaling[1]  # scale applied to the log-size offsets
+
+    def construct(self, x):
+        pred_loc, pred_label = self.network(x)
+
+        default_bbox_xy = self.default_boxes[..., :2]  # first two components: default box center
+        default_bbox_wh = self.default_boxes[..., 2:]  # remaining components: default box size
+        pred_xy = pred_loc[..., :2] * self.prior_scaling_xy * default_bbox_wh + default_bbox_xy
+        pred_wh = P.Exp()(pred_loc[..., 2:] * self.prior_scaling_wh) * default_bbox_wh
+
+        pred_xy_0 = pred_xy - pred_wh / 2.0  # center minus half size: first corner
+        pred_xy_1 = pred_xy + pred_wh / 2.0  # center plus half size: opposite corner
+        pred_xy = P.Concat(-1)((pred_xy_0, pred_xy_1))
+        pred_xy = P.Maximum()(pred_xy, 0)  # clip below at 0
+        pred_xy = P.Minimum()(pred_xy, 1)  # clip above at 1
+        return pred_xy, pred_label
+
+
 def ssd_mobilenet_v1_fpn(**kwargs):
    return SsdMobilenetV1Fpn(**kwargs)

--- a/model_zoo/official/cv/unet/eval.py
+++ b/model_zoo/official/cv/unet/eval.py
@ -85,6 +85,7 @@ class dice_coeff(nn.Metric):
            raise RuntimeError('Total samples num must not be 0.')
        return self._dice_coeff_sum / float(self._samples_num)

+
 def test_net(data_dir,
             ckpt_path,
             cross_valid_ind=1,
@ -102,6 +103,7 @@ def test_net(data_dir,
    dice_score = model.eval(valid_dataset, dataset_sink_mode=False)
    print("============== Cross valid dice coeff is:", dice_score)

+
 def get_args():
    parser = argparse.ArgumentParser(description='Test the UNet on images and target masks',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
--- a/model_zoo/official/nlp/fasttext/README.md
+++ b/model_zoo/official/nlp/fasttext/README.md
@ -1,4 +1,4 @@
-![](https://www.mindspore.cn/static/img/logo.a3e472c9.png)
+![](https://www.mindspore.cn/static/img/logo_black.6a5c850d.png)

 <!-- TOC -->

--- a/model_zoo/official/nlp/gnmt_v2/README.md
+++ b/model_zoo/official/nlp/gnmt_v2/README.md
@ -1,4 +1,4 @@
-![](https://www.mindspore.cn/static/img/logo.a3e472c9.png)
+![](https://www.mindspore.cn/static/img/logo_black.6a5c850d.png)

 <!-- TOC -->

--- a/model_zoo/official/nlp/mass/README_CN.md
+++ b/model_zoo/official/nlp/mass/README_CN.md
@ -47,7 +47,7 @@ BERT（Devlin等人，2018年）采用有屏蔽的语料丰富文本预训练Tra

 受BERT、GPT及其他语言模型的启发，微软致力于在此基础上研究[掩式序列到序列（MASS）预训练语言生成](https://www.microsoft.com/en-us/research/uploads/prod/2019/06/MASS-paper-updated-002.pdf)。MASS的参数k很重要，用来控制屏蔽后的分片长度。BERT和GPT属于特例，k等于1或者句长。

-[MASS介绍 — 序列对序列语言生成任务中性能优于BERT和GPT的预训练方法](https://www.microsoft.com/en-us/research/blog/introduction-mass-a-pre-training-method-thing-forts-bert-and-gpt-in-sequence-to-sequence-language-generate-tasks/)
+[MASS介绍 — 序列对序列语言生成任务中性能优于BERT和GPT的预训练方法](https://www.microsoft.com/en-us/research/blog/introducing-mass-a-pre-training-method-that-outperforms-bert-and-gpt-in-sequence-to-sequence-language-generation-tasks/)

 [论文](https://www.microsoft.com/en-us/research/uploads/prod/2019/06/MASS-paper-updated-002.pdf): Song, Kaitao, Xu Tan, Tao Qin, Jianfeng Lu and Tie-Yan Liu.“MASS: Masked Sequence to Sequence Pre-training for Language Generation.”ICML (2019).

--- a/model_zoo/official/nlp/prophetnet/README.md
+++ b/model_zoo/official/nlp/prophetnet/README.md
@ -655,4 +655,4 @@ The model has been validated on Ascend environment, not validated on CPU and GPU

 # ModelZoo Homepage  

- [Link](https://gitee.com/mindspore/mindspore/tree/master/mindspore/model_zoo)
+ [Link](https://gitee.com/mindspore/mindspore/tree/master/model_zoo)
--- a/model_zoo/research/audio/fcn-4/README.md
+++ b/model_zoo/research/audio/fcn-4/README.md
@ -192,7 +192,7 @@ Parameters for both training and evaluation can be set in config.py
 | Speed                      | 1pc: 160 samples/sec;                                       |
 | Total time                 | 1pc: 20 mins;                                               |
 | Checkpoint for Fine tuning | 198.73M(.ckpt file)                                         |
-| Scripts                    | [music_auto_tagging script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/audio/fcn-4) |
+| Scripts                    | [music_auto_tagging script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/research/audio/fcn-4) |

 ## [ModelZoo Homepage](#contents)  

--- a/model_zoo/research/cv/centernet/README.md
+++ b/model_zoo/research/cv/centernet/README.md
@ -79,7 +79,7 @@ Dataset used: [COCO2017](https://cocodataset.org/)
 - Hardware（Ascend）
    - Prepare hardware environment with Ascend processor. If you want to try Ascend, please send the [application form](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/file/other/Ascend%20Model%20Zoo%E4%BD%93%E9%AA%8C%E8%B5%84%E6%BA%90%E7%94%B3%E8%AF%B7%E8%A1%A8.docx) to ascend@huawei.com. Once approved, you can get the resources.
 - Framework
-    - [MindSpore](https://cmc-szv.clouddragon.huawei.com/cmcversion/index/search?searchKey=Do-MindSpore%20V100R001C00B622)
+    - [MindSpore](https://www.mindspore.cn/install/en)
 - For more information, please check the resources below：
    - [MindSpore tutorials](https://www.mindspore.cn/tutorial/training/en/master/index.html)
    - [MindSpore Python API](https://www.mindspore.cn/doc/api_python/en/master/index.html)
--- a/tests/st/ops/cpu/test_gelu_grad_op.py
+++ b/tests/st/ops/cpu/test_gelu_grad_op.py
@ -0,0 +1,63 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+import numpy as np
+import pytest
+
+import mindspore.context as context
+import mindspore.nn as nn
+from mindspore import Tensor
+from mindspore.ops import composite as C
+from mindspore.ops import operations as P
+
+context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
+
+
+class GeluNet(nn.Cell):  # minimal Cell that applies P.Gelu to its input
+    def __init__(self):
+        super(GeluNet, self).__init__()
+        self.gelu = P.Gelu()
+
+    def construct(self, x):
+        return self.gelu(x)
+
+
+class Grad(nn.Cell):  # wraps a network and evaluates it through C.GradOperation
+    def __init__(self, network):
+        super(Grad, self).__init__()
+        self.grad = C.GradOperation(get_all=True, sens_param=True)
+        self.network = network
+
+    def construct(self, input_data, sens):
+        gout = self.grad(self.network)(input_data, sens)  # sens is passed as the trailing argument
+        return gout
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+@pytest.mark.env_onecard
+def test_gelugrad():  # compares the Gelu gradient on CPU with hard-coded expected values
+    x_ms = Tensor(np.array([0.58401114, 0.68800163, 0.9760397, 0.14702141, 0.46563736, 0.9607501,
+                            0.14567593, 0.12261796, 0.37054458, 0.46421242]).astype(np.float32))
+    dy_ms = Tensor(np.array([0.5559598, 0.96994054, 0.24770357, 0.34646875, 0.2984393, 0.03287048,
+                             0.55681044, 0.966908, 0.06015943, 0.6099489]).astype(np.float32))
+
+    net = GeluNet()
+    grad = Grad(net)
+
+    output = grad(x_ms, dy_ms)  # dy_ms is supplied as the sens argument of Grad.construct
+    expect = [0.50963277, 0.9414753, 0.2667653, 0.21358444, 0.25243032, 0.0352667,
+              0.34266686, 0.57757664, 0.04707306, 0.51536125]
+    assert np.allclose(output[0].asnumpy(), expect)  # first returned gradient, default allclose tolerances
--- a/tests/st/ops/cpu/test_gelu_op.py
+++ b/tests/st/ops/cpu/test_gelu_op.py
@ -0,0 +1,93 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+import numpy as np
+import pytest
+
+import mindspore.context as context
+import mindspore.nn as nn
+from mindspore import Tensor
+from mindspore.ops import operations as P
+
+context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
+
+
+class GeluNet(nn.Cell):  # minimal Cell that applies P.Gelu to its input
+    def __init__(self):
+        super(GeluNet, self).__init__()
+        self.gelu = P.Gelu()
+
+    def construct(self, x):
+        return self.gelu(x)
+
+
+def GeluCompute(x):  # NumPy reference: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
+    return 0.5 * x * (1.0 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x * x * x)))
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+@pytest.mark.env_onecard
+def test_gelu_1d():  # P.Gelu vs. the NumPy reference on a random float32 input of shape (50,)
+    x_np = np.random.random((50,)).astype(np.float32)
+    y_np = GeluCompute(x_np)
+
+    x_ms = Tensor(x_np)
+    net = GeluNet()
+    y_ms = net(x_ms)
+
+    assert np.allclose(y_np, y_ms.asnumpy())  # default allclose tolerances
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+@pytest.mark.env_onecard
+def test_gelu_2d():  # P.Gelu vs. the NumPy reference on a random float32 input of shape (50, 40)
+    x_np = np.random.random((50, 40)).astype(np.float32)
+    y_np = GeluCompute(x_np)
+
+    x_ms = Tensor(x_np)
+    net = GeluNet()
+    y_ms = net(x_ms)
+
+    assert np.allclose(y_np, y_ms.asnumpy())  # default allclose tolerances
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+@pytest.mark.env_onecard
+def test_gelu_4d():  # P.Gelu vs. the NumPy reference on a random float32 input of shape (32, 3, 224, 224)
+    x_np = np.random.random((32, 3, 224, 224)).astype(np.float32)
+    y_np = GeluCompute(x_np)
+
+    x_ms = Tensor(x_np)
+    net = GeluNet()
+    y_ms = net(x_ms)
+
+    assert np.allclose(y_np, y_ms.asnumpy())  # default allclose tolerances
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+@pytest.mark.env_onecard
+def test_gelu_neg():  # P.Gelu vs. the NumPy reference on non-positive inputs
+    x_np = np.random.random((32, 3, 224, 224)).astype(np.float32) * -1  # random samples negated
+    y_np = GeluCompute(x_np)
+
+    x_ms = Tensor(x_np)
+    net = GeluNet()
+    y_ms = net(x_ms)
+
+    assert np.allclose(y_np, y_ms.asnumpy())  # default allclose tolerances
--- a/tests/st/ops/cpu/test_layer_norm_grad_op.py
+++ b/tests/st/ops/cpu/test_layer_norm_grad_op.py
@ -0,0 +1,221 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+import numpy as np
+import pytest
+
+import mindspore.context as context
+import mindspore.nn as nn
+from mindspore import Tensor
+from mindspore.ops.operations import _grad_ops as G
+
+# Run every test in this module in graph mode on the CPU backend.
+context.set_context(
+    mode=context.GRAPH_MODE,
+    device_target="CPU",
+)
+
+
+class LayerNormGradNet(nn.Cell):
+    """Minimal cell that wraps the ``G.LayerNormGrad`` primitive."""
+
+    def __init__(self, begin_norm_axis, begin_params_axis):
+        super().__init__()
+        self.norm = G.LayerNormGrad(begin_norm_axis, begin_params_axis)
+
+    def construct(self, dy, x, var, mean, gamma):
+        # Inputs are forwarded to the primitive in the order they are received.
+        # NOTE(review): the tests in this file call the net as net(x, dy, var, mean, gamma),
+        # so the first two parameter names appear swapped relative to what they receive.
+        grads = self.norm(dy, x, var, mean, gamma)
+        return grads
+
+
+def LayerNormGradReference(x, dy, gamma, epsilon, begin_norm_axis, begin_params_axis):
+    """NumPy reference for the gradients of layer normalization.
+
+    Args:
+        x: input array that was normalized.
+        dy: gradient with respect to the layer-norm output; combined element-wise with ``x``.
+        gamma: scale parameter; reshaped so that it broadcasts against ``x`` from
+            ``begin_params_axis`` onwards.
+        epsilon: value added to the variance before the inverse roots are taken.
+        begin_norm_axis: first axis of the normalized block (may be negative).
+        begin_params_axis: first axis covered by ``gamma`` (may be negative).
+
+    Returns:
+        Tuple ``(dx, dg, db, mean, var)``. ``mean`` and ``var`` are reduced over the normalized
+        axes, ``dg`` and ``db`` over the axes before ``begin_params_axis``; all reductions use
+        ``keepdims=True``.
+    """
+    ndim = x.ndim
+    # Map negative axes onto their non-negative equivalents.
+    if begin_norm_axis < 0:
+        begin_norm_axis += ndim
+    if begin_params_axis < 0:
+        begin_params_axis += ndim
+
+    norm_axis = tuple(range(begin_norm_axis, ndim))
+    param_axis = tuple(range(begin_params_axis))
+    # Number of elements in one normalized slice.
+    num = int(np.prod(x.shape[begin_norm_axis:]))
+
+    mean = np.mean(x, axis=norm_axis, keepdims=True)
+    var = np.var(x, axis=norm_axis, keepdims=True)
+    centered = x - mean
+    # 1 / sqrt(var + epsilon), shared by several terms below.
+    inv_std = np.power(var + epsilon, -0.5)
+
+    gamma = gamma.reshape((1,) * begin_params_axis + tuple(x.shape[begin_params_axis:]))
+    dg = np.sum(dy * inv_std * centered, axis=param_axis, keepdims=True)
+    db = np.sum(dy, axis=param_axis, keepdims=True)
+
+    sum1 = np.sum((-0.5) * dy * gamma * centered * np.power(var + epsilon, -1.5), axis=norm_axis,
+                  keepdims=True)
+    sum2 = np.sum(dy * gamma, axis=norm_axis, keepdims=True)
+    sum3 = np.sum(-2.0 * centered, axis=norm_axis, keepdims=True)
+
+    # dx is the sum of the direct term and the contributions through the variance and the mean.
+    dx1 = dy * gamma * inv_std
+    dx2 = sum1 * 2.0 / num * centered
+    dx3 = ((-1.0) * inv_std * sum2 + (1.0 / num) * sum1 * sum3) * (1.0 / num)
+    dx = dx1 + dx2 + dx3
+    return dx, dg, db, mean, var
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+@pytest.mark.env_onecard
+def test_layernormgrad0():
+    """Check G.LayerNormGrad against the NumPy reference on a (4096, 3072) input, both axes set to 1."""
+    begin_norm_axis = 1
+    begin_params_axis = 1
+    # Fixed seed so that a failing run can be reproduced exactly.
+    np.random.seed(42)
+    x_np = np.random.randn(4096, 3072).astype(np.float32)
+    dy_np = np.random.randn(4096, 3072).astype(np.float32)
+    gamma_np = np.random.randn(*x_np.shape[begin_params_axis:]).astype(np.float32)
+    # NOTE(review): 10e-12 equals 1e-11 -- confirm this is the epsilon the kernel uses.
+    epsilon = 10e-12
+    dx_np, dg_np, db_np, mean_np, var_np = LayerNormGradReference(x_np, dy_np, gamma_np, epsilon, begin_norm_axis,
+                                                                  begin_params_axis)
+
+    dy_ms = Tensor(dy_np)
+    x_ms = Tensor(x_np)
+    var_ms = Tensor(var_np)
+    mean_ms = Tensor(mean_np)
+    gamma_ms = Tensor(gamma_np)
+
+    net = LayerNormGradNet(begin_norm_axis, begin_params_axis)
+    # Arguments are passed positionally in the order (x, dy, var, mean, gamma).
+    dx_ms, dg_ms, db_ms = net(x_ms, dy_ms, var_ms, mean_ms, gamma_ms)
+
+    assert np.allclose(dx_ms.asnumpy(), dx_np, rtol=1e-4, atol=1e-4)
+    assert np.allclose(dg_ms.asnumpy(), dg_np, rtol=1e-4, atol=1e-3)
+    assert np.allclose(db_ms.asnumpy(), db_np, rtol=1e-4, atol=1e-3)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+@pytest.mark.env_onecard
+def test_layernormgrad1():
+    """Check G.LayerNormGrad against the NumPy reference on a (640, 768) input, both axes set to 1."""
+    begin_norm_axis = 1
+    begin_params_axis = 1
+    # Fixed seed so that a failing run can be reproduced exactly.
+    np.random.seed(42)
+    x_np = np.random.randn(640, 768).astype(np.float32)
+    dy_np = np.random.randn(640, 768).astype(np.float32)
+    gamma_np = np.random.randn(*x_np.shape[begin_params_axis:]).astype(np.float32)
+    # NOTE(review): 10e-12 equals 1e-11 -- confirm this is the epsilon the kernel uses.
+    epsilon = 10e-12
+    dx_np, dg_np, db_np, mean_np, var_np = LayerNormGradReference(x_np, dy_np, gamma_np, epsilon, begin_norm_axis,
+                                                                  begin_params_axis)
+
+    dy_ms = Tensor(dy_np)
+    x_ms = Tensor(x_np)
+    var_ms = Tensor(var_np)
+    mean_ms = Tensor(mean_np)
+    gamma_ms = Tensor(gamma_np)
+
+    net = LayerNormGradNet(begin_norm_axis, begin_params_axis)
+    # Arguments are passed positionally in the order (x, dy, var, mean, gamma).
+    dx_ms, dg_ms, db_ms = net(x_ms, dy_ms, var_ms, mean_ms, gamma_ms)
+
+    assert np.allclose(dx_ms.asnumpy(), dx_np, rtol=1e-4, atol=1e-4)
+    assert np.allclose(dg_ms.asnumpy(), dg_np, rtol=1e-4, atol=1e-3)
+    assert np.allclose(db_ms.asnumpy(), db_np, rtol=1e-4, atol=1e-3)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+@pytest.mark.env_onecard
+def test_layernormgrad2():
+    """Check G.LayerNormGrad against the NumPy reference on a (32, 128, 768) input, both axes set to -1."""
+    begin_norm_axis = -1
+    begin_params_axis = -1
+    # Fixed seed so that a failing run can be reproduced exactly.
+    np.random.seed(42)
+    x_np = np.random.randn(32, 128, 768).astype(np.float32)
+    dy_np = np.random.randn(32, 128, 768).astype(np.float32)
+    gamma_np = np.random.randn(*x_np.shape[begin_params_axis:]).astype(np.float32)
+    # NOTE(review): 10e-12 equals 1e-11 -- confirm this is the epsilon the kernel uses.
+    epsilon = 10e-12
+    dx_np, dg_np, db_np, mean_np, var_np = LayerNormGradReference(x_np, dy_np, gamma_np, epsilon, begin_norm_axis,
+                                                                  begin_params_axis)
+
+    dy_ms = Tensor(dy_np)
+    x_ms = Tensor(x_np)
+    var_ms = Tensor(var_np)
+    mean_ms = Tensor(mean_np)
+    gamma_ms = Tensor(gamma_np)
+
+    net = LayerNormGradNet(begin_norm_axis, begin_params_axis)
+    # Arguments are passed positionally in the order (x, dy, var, mean, gamma).
+    dx_ms, dg_ms, db_ms = net(x_ms, dy_ms, var_ms, mean_ms, gamma_ms)
+
+    assert np.allclose(dx_ms.asnumpy(), dx_np, rtol=1e-4, atol=1e-4)
+    assert np.allclose(dg_ms.asnumpy(), dg_np, rtol=1e-4, atol=1e-3)
+    assert np.allclose(db_ms.asnumpy(), db_np, rtol=1e-4, atol=1e-3)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+@pytest.mark.env_onecard
+def test_layernormgrad3():
+    """Check G.LayerNormGrad against the NumPy reference on a (32, 64) input, both axes set to -1."""
+    begin_norm_axis = -1
+    begin_params_axis = -1
+    # Fixed seed so that a failing run can be reproduced exactly.
+    np.random.seed(42)
+    x_np = np.random.randn(32, 64).astype(np.float32)
+    dy_np = np.random.randn(32, 64).astype(np.float32)
+    gamma_np = np.random.randn(*x_np.shape[begin_params_axis:]).astype(np.float32)
+    # NOTE(review): 10e-12 equals 1e-11 -- confirm this is the epsilon the kernel uses.
+    epsilon = 10e-12
+    dx_np, dg_np, db_np, mean_np, var_np = LayerNormGradReference(x_np, dy_np, gamma_np, epsilon, begin_norm_axis,
+                                                                  begin_params_axis)
+
+    dy_ms = Tensor(dy_np)
+    x_ms = Tensor(x_np)
+    var_ms = Tensor(var_np)
+    mean_ms = Tensor(mean_np)
+    gamma_ms = Tensor(gamma_np)
+
+    net = LayerNormGradNet(begin_norm_axis, begin_params_axis)
+    # Arguments are passed positionally in the order (x, dy, var, mean, gamma).
+    dx_ms, dg_ms, db_ms = net(x_ms, dy_ms, var_ms, mean_ms, gamma_ms)
+    assert np.allclose(dx_ms.asnumpy(), dx_np, rtol=1e-4, atol=1e-4)
+    assert np.allclose(dg_ms.asnumpy(), dg_np, rtol=1e-4, atol=1e-3)
+    assert np.allclose(db_ms.asnumpy(), db_np, rtol=1e-4, atol=1e-3)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+@pytest.mark.env_onecard
+def test_layernormgrad4():
+    """Check G.LayerNormGrad against the NumPy reference on a (32, 64) input, both axes set to -1."""
+    # NOTE(review): shapes and axes here are identical to test_layernormgrad3 -- confirm whether a
+    # different configuration was intended.
+    begin_norm_axis = -1
+    begin_params_axis = -1
+    # Fixed seed so that a failing run can be reproduced exactly.
+    np.random.seed(42)
+    x_np = np.random.randn(32, 64).astype(np.float32)
+    dy_np = np.random.randn(32, 64).astype(np.float32)
+    gamma_np = np.random.randn(*x_np.shape[begin_params_axis:]).astype(np.float32)
+    # NOTE(review): 10e-12 equals 1e-11 -- confirm this is the epsilon the kernel uses.
+    epsilon = 10e-12
+    dx_np, dg_np, db_np, mean_np, var_np = LayerNormGradReference(x_np, dy_np, gamma_np, epsilon, begin_norm_axis,
+                                                                  begin_params_axis)
+
+    dy_ms = Tensor(dy_np)
+    x_ms = Tensor(x_np)
+    var_ms = Tensor(var_np)
+    mean_ms = Tensor(mean_np)
+    gamma_ms = Tensor(gamma_np)
+
+    net = LayerNormGradNet(begin_norm_axis, begin_params_axis)
+    # Arguments are passed positionally in the order (x, dy, var, mean, gamma).
+    dx_ms, dg_ms, db_ms = net(x_ms, dy_ms, var_ms, mean_ms, gamma_ms)
+    assert np.allclose(dx_ms.asnumpy(), dx_np, rtol=1e-4, atol=1e-4)
+    assert np.allclose(dg_ms.asnumpy(), dg_np, rtol=1e-4, atol=1e-3)
+    assert np.allclose(db_ms.asnumpy(), db_np, rtol=1e-4, atol=1e-3)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+@pytest.mark.env_onecard
+def test_layernormgrad5():
+    """Check G.LayerNormGrad on a (128, 2, 16, 32) input with begin_norm_axis=2, begin_params_axis=1."""
+    begin_norm_axis = 2
+    begin_params_axis = 1
+    # Fixed seed so that a failing run can be reproduced exactly.
+    np.random.seed(42)
+    x_np = np.random.randn(128, 2, 16, 32).astype(np.float32)
+    dy_np = np.random.randn(128, 2, 16, 32).astype(np.float32)
+    gamma_np = np.random.randn(*x_np.shape[begin_params_axis:]).astype(np.float32)
+    # NOTE(review): 10e-12 equals 1e-11 -- confirm this is the epsilon the kernel uses.
+    epsilon = 10e-12
+    dx_np, dg_np, db_np, mean_np, var_np = LayerNormGradReference(x_np, dy_np, gamma_np, epsilon, begin_norm_axis,
+                                                                  begin_params_axis)
+
+    dy_ms = Tensor(dy_np)
+    x_ms = Tensor(x_np)
+    var_ms = Tensor(var_np)
+    mean_ms = Tensor(mean_np)
+    gamma_ms = Tensor(gamma_np)
+
+    net = LayerNormGradNet(begin_norm_axis, begin_params_axis)
+    # Arguments are passed positionally in the order (x, dy, var, mean, gamma).
+    dx_ms, dg_ms, db_ms = net(x_ms, dy_ms, var_ms, mean_ms, gamma_ms)
+    assert np.allclose(dx_ms.asnumpy(), dx_np, rtol=1e-4, atol=1e-4)
+    assert np.allclose(db_ms.asnumpy(), db_np, rtol=1e-4, atol=1e-3)
+    assert np.allclose(dg_ms.asnumpy(), dg_np, rtol=1e-4, atol=1e-3)
--- a/tests/st/ops/cpu/test_layer_norm_op.py
+++ b/tests/st/ops/cpu/test_layer_norm_op.py
@ -0,0 +1,199 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+import numpy as np
+import pytest
+
+import mindspore.context as context
+import mindspore.nn as nn
+from mindspore import Tensor
+from mindspore.ops import operations as P
+
+# Run every test in this module in graph mode on the CPU backend.
+context.set_context(
+    mode=context.GRAPH_MODE,
+    device_target="CPU",
+)
+
+
+class LayerNormNet(nn.Cell):
+    """Minimal cell that wraps the ``P.LayerNorm`` primitive."""
+
+    def __init__(self, begin_norm_axis, begin_params_axis):
+        super().__init__()
+        self.norm = P.LayerNorm(begin_norm_axis, begin_params_axis)
+
+    def construct(self, x, gamma, beta):
+        # Forward the three inputs unchanged and return whatever the primitive produces.
+        outputs = self.norm(x, gamma, beta)
+        return outputs
+
+
+def LayerNormReference(begin_norm_axis, begin_params_axis, x, gamma, beta, epsilon=1e-12):
+    """NumPy reference for layer normalization.
+
+    Args:
+        begin_norm_axis: first axis of the normalized block (may be negative).
+        begin_params_axis: first axis covered by ``gamma`` and ``beta`` (may be negative).
+        x: input array.
+        gamma: scale parameter; reshaped so that it broadcasts against ``x`` from
+            ``begin_params_axis`` onwards.
+        beta: shift parameter; reshaped the same way as ``gamma``.
+        epsilon: value added to the variance before the square root. Defaults to ``1e-12``.
+
+    Returns:
+        Tuple ``(y, mean, var)`` where ``mean`` and ``var`` are reduced over the normalized
+        axes with ``keepdims=True``.
+    """
+    ndim = x.ndim
+    # Map negative axes onto their non-negative equivalents.
+    if begin_norm_axis < 0:
+        begin_norm_axis += ndim
+    if begin_params_axis < 0:
+        begin_params_axis += ndim
+
+    norm_axes = tuple(range(begin_norm_axis, ndim))
+    mean = np.mean(x, axis=norm_axes, keepdims=True)
+    var = np.var(x, axis=norm_axes, keepdims=True)
+
+    # Leading singleton axes let gamma/beta broadcast over the non-parameter dimensions.
+    param_shape = (1,) * begin_params_axis + tuple(x.shape[begin_params_axis:])
+    gamma = gamma.reshape(param_shape)
+    beta = beta.reshape(param_shape)
+    y = np.subtract(x, mean) / np.sqrt(var + epsilon) * gamma + beta
+    return y, mean, var
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+@pytest.mark.env_onecard
+def test_layernorm0():
+    """Check P.LayerNorm against the NumPy reference on a (4096, 3072) input, both axes set to 1."""
+    begin_norm_axis = 1
+    begin_params_axis = 1
+    # Fixed seed so that a failing run can be reproduced exactly.
+    np.random.seed(42)
+    x_np = np.random.randn(4096, 3072).astype(np.float32)
+    gamma_np = np.random.randn(*x_np.shape[begin_params_axis:]).astype(np.float32)
+    beta_np = np.random.randn(*x_np.shape[begin_params_axis:]).astype(np.float32)
+    y_np, mean_np, var_np = LayerNormReference(begin_norm_axis, begin_params_axis, x_np, gamma_np, beta_np)
+
+    x_ms = Tensor(x_np)
+    gamma_ms = Tensor(gamma_np)
+    beta_ms = Tensor(beta_np)
+    net = LayerNormNet(begin_norm_axis, begin_params_axis)
+    y_ms, mean_ms, var_ms = net(x_ms, gamma_ms, beta_ms)
+
+    assert np.allclose(y_ms.asnumpy(), y_np, atol=1e-4)
+    assert np.allclose(mean_ms.asnumpy(), mean_np, atol=1e-4)
+    assert np.allclose(var_ms.asnumpy(), var_np, atol=1e-4)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+@pytest.mark.env_onecard
+def test_layernorm1():
+    """Check P.LayerNorm against the NumPy reference on a (640, 768) input, both axes set to 1."""
+    begin_norm_axis = 1
+    begin_params_axis = 1
+    # Fixed seed so that a failing run can be reproduced exactly.
+    np.random.seed(42)
+    x_np = np.random.randn(640, 768).astype(np.float32)
+    gamma_np = np.random.randn(*x_np.shape[begin_params_axis:]).astype(np.float32)
+    beta_np = np.random.randn(*x_np.shape[begin_params_axis:]).astype(np.float32)
+    y_np, mean_np, var_np = LayerNormReference(begin_norm_axis, begin_params_axis, x_np, gamma_np, beta_np)
+
+    x_ms = Tensor(x_np)
+    gamma_ms = Tensor(gamma_np)
+    beta_ms = Tensor(beta_np)
+    net = LayerNormNet(begin_norm_axis, begin_params_axis)
+    y_ms, mean_ms, var_ms = net(x_ms, gamma_ms, beta_ms)
+
+    assert np.allclose(y_ms.asnumpy(), y_np, rtol=1e-6, atol=1e-4)
+    assert np.allclose(mean_ms.asnumpy(), mean_np, rtol=1e-6, atol=1e-4)
+    assert np.allclose(var_ms.asnumpy(), var_np, rtol=1e-6, atol=1e-4)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+@pytest.mark.env_onecard
+def test_layernorm3d_1():
+    """Check P.LayerNorm against the NumPy reference on a (32, 128, 768) input, both axes set to -1."""
+    begin_norm_axis = -1
+    begin_params_axis = -1
+    # Fixed seed so that a failing run can be reproduced exactly.
+    np.random.seed(42)
+    x_np = np.random.randn(32, 128, 768).astype(np.float32)
+    gamma_np = np.random.randn(*x_np.shape[begin_params_axis:]).astype(np.float32)
+    beta_np = np.random.randn(*x_np.shape[begin_params_axis:]).astype(np.float32)
+    y_np, mean_np, var_np = LayerNormReference(begin_norm_axis, begin_params_axis, x_np, gamma_np, beta_np)
+
+    x_ms = Tensor(x_np)
+    gamma_ms = Tensor(gamma_np)
+    beta_ms = Tensor(beta_np)
+    net = LayerNormNet(begin_norm_axis, begin_params_axis)
+    y_ms, mean_ms, var_ms = net(x_ms, gamma_ms, beta_ms)
+
+    assert np.allclose(y_ms.asnumpy(), y_np, rtol=1e-6, atol=1e-4)
+    assert np.allclose(mean_ms.asnumpy(), mean_np, rtol=1e-6, atol=1e-4)
+    assert np.allclose(var_ms.asnumpy(), var_np, rtol=1e-6, atol=1e-4)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+@pytest.mark.env_onecard
+def test_layernorm3d_2():
+    """Check P.LayerNorm on a (32, 128, 768) input with begin_norm_axis=-1, begin_params_axis=1."""
+    begin_norm_axis = -1
+    begin_params_axis = 1
+    # Fixed seed so that a failing run can be reproduced exactly.
+    np.random.seed(42)
+    x_np = np.random.randn(32, 128, 768).astype(np.float32)
+    gamma_np = np.random.randn(*x_np.shape[begin_params_axis:]).astype(np.float32)
+    beta_np = np.random.randn(*x_np.shape[begin_params_axis:]).astype(np.float32)
+    y_np, mean_np, var_np = LayerNormReference(begin_norm_axis, begin_params_axis, x_np, gamma_np, beta_np)
+
+    x_ms = Tensor(x_np)
+    gamma_ms = Tensor(gamma_np)
+    beta_ms = Tensor(beta_np)
+    net = LayerNormNet(begin_norm_axis, begin_params_axis)
+    y_ms, mean_ms, var_ms = net(x_ms, gamma_ms, beta_ms)
+
+    assert np.allclose(y_ms.asnumpy(), y_np, rtol=1e-6, atol=1e-4)
+    assert np.allclose(mean_ms.asnumpy(), mean_np, rtol=1e-6, atol=1e-4)
+    assert np.allclose(var_ms.asnumpy(), var_np, rtol=1e-6, atol=1e-4)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+@pytest.mark.env_onecard
+def test_layernorm2d_2():
+    """Check P.LayerNorm on a (64, 32) input with begin_norm_axis=-1, begin_params_axis=1."""
+    begin_norm_axis = -1
+    begin_params_axis = 1
+    # Fixed seed so that a failing run can be reproduced exactly.
+    np.random.seed(42)
+    x_np = np.random.randn(64, 32).astype(np.float32)
+    gamma_np = np.random.randn(*x_np.shape[begin_params_axis:]).astype(np.float32)
+    beta_np = np.random.randn(*x_np.shape[begin_params_axis:]).astype(np.float32)
+    y_np, mean_np, var_np = LayerNormReference(begin_norm_axis, begin_params_axis, x_np, gamma_np, beta_np)
+
+    x_ms = Tensor(x_np)
+    gamma_ms = Tensor(gamma_np)
+    beta_ms = Tensor(beta_np)
+    net = LayerNormNet(begin_norm_axis, begin_params_axis)
+    y_ms, mean_ms, var_ms = net(x_ms, gamma_ms, beta_ms)
+    assert np.allclose(y_ms.asnumpy(), y_np, rtol=1e-6, atol=1e-4)
+    assert np.allclose(mean_ms.asnumpy(), mean_np, rtol=1e-6, atol=1e-4)
+    assert np.allclose(var_ms.asnumpy(), var_np, rtol=1e-6, atol=1e-4)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+@pytest.mark.env_onecard
+def test_layernorm2d_3():
+    """Check P.LayerNorm on a (128, 128) input with begin_norm_axis=-1, begin_params_axis=1."""
+    begin_norm_axis = -1
+    begin_params_axis = 1
+    # Fixed seed so that a failing run can be reproduced exactly.
+    np.random.seed(42)
+    x_np = np.random.randn(128, 128).astype(np.float32)
+    gamma_np = np.random.randn(*x_np.shape[begin_params_axis:]).astype(np.float32)
+    beta_np = np.random.randn(*x_np.shape[begin_params_axis:]).astype(np.float32)
+    y_np, mean_np, var_np = LayerNormReference(begin_norm_axis, begin_params_axis, x_np, gamma_np, beta_np)
+
+    x_ms = Tensor(x_np)
+    gamma_ms = Tensor(gamma_np)
+    beta_ms = Tensor(beta_np)
+    net = LayerNormNet(begin_norm_axis, begin_params_axis)
+    y_ms, mean_ms, var_ms = net(x_ms, gamma_ms, beta_ms)
+    assert np.allclose(y_ms.asnumpy(), y_np, rtol=1e-6, atol=1e-4)
+    assert np.allclose(mean_ms.asnumpy(), mean_np, rtol=1e-6, atol=1e-4)
+    assert np.allclose(var_ms.asnumpy(), var_np, rtol=1e-6, atol=1e-4)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+@pytest.mark.env_onecard
+def test_layernorm2d_4():
+    """Check P.LayerNorm on a (128, 2, 16, 32) input with begin_norm_axis=2, begin_params_axis=1."""
+    # NOTE(review): despite the "2d" in the test name, the input used here is 4-D.
+    begin_norm_axis = 2
+    begin_params_axis = 1
+    np.random.seed(42)
+    inputs = np.random.randn(128, 2, 16, 32).astype(np.float32)
+    param_shape = inputs.shape[begin_params_axis:]
+    scale = np.random.randn(*param_shape).astype(np.float32)
+    shift = np.random.randn(*param_shape).astype(np.float32)
+    expected_y, expected_mean, expected_var = LayerNormReference(begin_norm_axis, begin_params_axis, inputs,
+                                                                 scale, shift)
+
+    net = LayerNormNet(begin_norm_axis, begin_params_axis)
+    y_ms, mean_ms, var_ms = net(Tensor(inputs), Tensor(scale), Tensor(shift))
+
+    # Every output is compared with the same tolerances.
+    tolerances = dict(rtol=1e-6, atol=1e-4)
+    assert np.allclose(y_ms.asnumpy(), expected_y, **tolerances)
+    assert np.allclose(mean_ms.asnumpy(), expected_mean, **tolerances)
+    assert np.allclose(var_ms.asnumpy(), expected_var, **tolerances)
--- a/tests/ut/python/nn/test_loss.py
+++ b/tests/ut/python/nn/test_loss.py
@ -14,7 +14,8 @@
 # ============================================================================
 """ test loss """
 import numpy as np
-
+import pytest
+import mindspore.common.dtype as mstype
 import mindspore.nn as nn
 from mindspore import Tensor
 from ..ut_filter import non_graph_engine
@ -88,3 +89,22 @@ def test_cosine_embedding_loss():
    x2 = Tensor(np.array([[0.4, 1.2], [-0.4, -0.9]]).astype(np.float32))
    label = Tensor(np.array([1, -1]).astype(np.int32))
    loss(x1, x2, label)
+
+
+def test_dice_loss():
+    """ test_dice_loss """
+    dice = nn.DiceLoss()
+    prediction = Tensor(np.array([[0.2, 0.5], [0.3, 0.1], [0.9, 0.6]]), mstype.float32)
+    target = Tensor(np.array([[0, 1], [1, 0], [0, 1]]), mstype.float32)
+    # Only checks that evaluating the loss raises no error; the value itself is not asserted.
+    result = dice(prediction, target)
+    result.asnumpy()
+
+
+
+def test_dice_loss_check_shape():
+    """ test_dice_loss_check_shape """
+    loss = nn.DiceLoss()
+    # The prediction is 3x2 while the label is 2x2; the mismatch is expected to raise ValueError.
+    y_pred = Tensor(np.array([[0.2, 0.5], [0.3, 0.1], [0.9, 0.6]]), mstype.float32)
+    y = Tensor(np.array([[1, 0], [0, 1]]), mstype.float32)
+    with pytest.raises(ValueError):
+        loss(y_pred, y)