add 35xx build

This commit is contained in:
gongdaguo 2021-12-11 14:36:21 +08:00
parent 74c8a66ab9
commit bbfd0dbdce
45 changed files with 3759 additions and 310 deletions

View File

@ -6,6 +6,7 @@
mindspore/mindspore/lite/src/ops/primitive_c.cc:mindspore::lite::PrimitiveC::Create
mindspore/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/csv_op.cc:mindspore::dataset::CsvOp::CsvParser::InitCsvParser
mindspore/mindspore/lite/tools/converter/graphdef_transform.cc:mindspore::lite::GraphDefTransform::Transform
mindspore/mindspore/lite/tools/benchmark/nnie_proposal/src/proposal.cc:mindspore::proposal::Rpn
mindspore/mindspore/core/abstract/primitive_infer_map.cc:mindspore::abstract::GetPrimitiveToEvalImplMap
mindspore/mindspore/ccsrc/frontend/optimizer/irpass.cc:mindspore::opt::irpass::OptimizeIRPassLib::OptimizeIRPassLib
mindspore/mindspore/ccsrc/frontend/parallel/ops_info/gather_v2_p_info.cc:mindspore::parallel::GatherV2PInfo::CheckStrategy

View File

@ -11,6 +11,7 @@ set(TEST_CASE_DIR ${TOP_DIR}/mindspore/lite/test/build)
set(RUNTIME_DIR ${RUNTIME_PKG_NAME}/runtime)
set(RUNTIME_INC_DIR ${RUNTIME_PKG_NAME}/runtime/include)
set(RUNTIME_LIB_DIR ${RUNTIME_PKG_NAME}/runtime/lib)
set(PROVIDERS_LIB_DIR ${RUNTIME_PKG_NAME}/providers)
set(MIND_DATA_INC_DIR ${RUNTIME_PKG_NAME}/runtime/include/dataset)
set(TURBO_DIR ${RUNTIME_PKG_NAME}/runtime/third_party/libjpeg-turbo)
set(GLOG_DIR ${RUNTIME_PKG_NAME}/runtime/third_party/glog)
@ -18,6 +19,10 @@ set(SECUREC_DIR ${RUNTIME_PKG_NAME}/runtime/third_party/securec)
set(MINDSPORE_LITE_LIB_NAME libmindspore-lite)
set(MINDSPORE_CORE_LIB_NAME libmindspore_core)
set(BENCHMARK_NAME benchmark)
set(MSLITE_NNIE_LIB_NAME libmslite_nnie)
set(MSLITE_PROPOSAL_LIB_NAME libmslite_proposal)
set(MICRO_NNIE_LIB_NAME libmicro_nnie)
set(DPICO_ACL_ADAPTER_LIB_NAME libdpico_acl_adapter)
set(BENCHMARK_ROOT_DIR ${RUNTIME_PKG_NAME}/tools/benchmark)
set(MINDSPORE_LITE_TRAIN_LIB_NAME libmindspore-lite-train)
@ -227,11 +232,31 @@ if(PLATFORM_ARM64)
COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "ops*" EXCLUDE)
install(DIRECTORY ${TOP_DIR}/include/c_api/ DESTINATION ${RUNTIME_INC_DIR}/c_api
COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h")
if(NOT MSLITE_ENABLE_DPICO_ACL_ADAPTER)
if(NOT TARGET_MIX210)
__install_micro_wrapper()
endif()
if(MSLITE_ENABLE_TOOLS)
install(TARGETS ${BENCHMARK_NAME} RUNTIME DESTINATION ${BENCHMARK_ROOT_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
if(NOT BUILD_FIRST)
install(TARGETS ${BENCHMARK_NAME} RUNTIME DESTINATION ${BENCHMARK_ROOT_DIR}
COMPONENT ${RUNTIME_COMPONENT_NAME})
if(TARGET_HIMIX)
if(${MSLITE_REGISTRY_DEVICE} STREQUAL "Hi3559A")
install(FILES ${TOP_DIR}/mindspore/lite/build/tools/benchmark/nnie/${MSLITE_NNIE_LIB_NAME}.so
DESTINATION ${PROVIDERS_LIB_DIR}/${MSLITE_REGISTRY_DEVICE}
COMPONENT ${RUNTIME_COMPONENT_NAME})
install(FILES
${TOP_DIR}/mindspore/lite/build/tools/benchmark/nnie_proposal/${MSLITE_PROPOSAL_LIB_NAME}.so
DESTINATION ${PROVIDERS_LIB_DIR}/${MSLITE_REGISTRY_DEVICE}
COMPONENT ${RUNTIME_COMPONENT_NAME})
endif()
elseif(TARGET_MIX210)
if(${MSLITE_REGISTRY_DEVICE} STREQUAL "SD3403")
install(FILES ${TOP_DIR}/mindspore/lite/build/tools/benchmark/dpico/${DPICO_ACL_ADAPTER_LIB_NAME}.so
DESTINATION ${PROVIDERS_LIB_DIR}/${MSLITE_REGISTRY_DEVICE}
COMPONENT ${RUNTIME_COMPONENT_NAME})
endif()
endif()
endif()
if(SUPPORT_TRAIN)
install(TARGETS ${BENCHMARK_TRAIN_NAME} RUNTIME DESTINATION ${BENCHMARK_TRAIN_ROOT_DIR} COMPONENT
${RUNTIME_COMPONENT_NAME})
@ -310,7 +335,27 @@ elseif(PLATFORM_ARM32)
COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h")
__install_micro_wrapper()
if(MSLITE_ENABLE_TOOLS AND NOT TARGET_OHOS_LITE)
install(TARGETS ${BENCHMARK_NAME} RUNTIME DESTINATION ${BENCHMARK_ROOT_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
if(NOT BUILD_FIRST)
install(TARGETS ${BENCHMARK_NAME} RUNTIME
DESTINATION ${BENCHMARK_ROOT_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
if(TARGET_HIMIX)
if(${MSLITE_REGISTRY_DEVICE} STREQUAL "Hi3516D" OR ${MSLITE_REGISTRY_DEVICE} STREQUAL "Hi3519A")
install(FILES ${TOP_DIR}/mindspore/lite/build/tools/benchmark/nnie/${MSLITE_NNIE_LIB_NAME}.so
DESTINATION ${PROVIDERS_LIB_DIR}/${MSLITE_REGISTRY_DEVICE}
COMPONENT ${RUNTIME_COMPONENT_NAME})
install(FILES
${TOP_DIR}/mindspore/lite/build/tools/benchmark/nnie_proposal/${MSLITE_PROPOSAL_LIB_NAME}.so
DESTINATION ${PROVIDERS_LIB_DIR}/${MSLITE_REGISTRY_DEVICE}
COMPONENT ${RUNTIME_COMPONENT_NAME})
if(${MSLITE_REGISTRY_DEVICE} STREQUAL "Hi3516D")
install(FILES
${TOP_DIR}/mindspore/lite/tools/benchmark/nnie/third_patry/${MICRO_NNIE_LIB_NAME}.so
DESTINATION ${PROVIDERS_LIB_DIR}/${MSLITE_REGISTRY_DEVICE}
COMPONENT ${RUNTIME_COMPONENT_NAME})
endif()
endif()
endif()
endif()
if(SUPPORT_TRAIN)
install(TARGETS ${BENCHMARK_TRAIN_NAME} RUNTIME DESTINATION ${BENCHMARK_TRAIN_ROOT_DIR} COMPONENT
${RUNTIME_COMPONENT_NAME})
@ -516,7 +561,10 @@ else()
__install_micro_codegen()
endif()
if(MSLITE_ENABLE_TOOLS)
install(TARGETS ${BENCHMARK_NAME} RUNTIME DESTINATION ${BENCHMARK_ROOT_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
if(NOT BUILD_FIRST)
install(TARGETS ${BENCHMARK_NAME} RUNTIME DESTINATION ${BENCHMARK_ROOT_DIR}
COMPONENT ${RUNTIME_COMPONENT_NAME})
endif()
if(SUPPORT_TRAIN)
install(TARGETS ${BENCHMARK_TRAIN_NAME} RUNTIME DESTINATION ${BENCHMARK_TRAIN_ROOT_DIR} COMPONENT
${RUNTIME_COMPONENT_NAME})

View File

@ -35,13 +35,10 @@ if(NOT PLATFORM_ARM32 AND NOT TARGET_HIMIX AND NOT MACHINE_LINUX_ARM64)
list(APPEND SDOT_FILES ${SDOT_SRC})
add_library(nnacl_optimize_mid OBJECT ${SDOT_FILES})
add_dependencies(nnacl_optimize_mid fbs_src)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8.2-a+dotprod+fp16")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8.2-a+dotprod+fp16")
endif()
if(TARGET_MIX210)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8.2-a+fp16")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8.2-a+fp16")
if(NOT TARGET_MIX210)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8.2-a+dotprod+fp16")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8.2-a+dotprod+fp16")
endif()
endif()
if(MSLITE_ENABLE_FP16)

View File

@ -10,7 +10,7 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_link_option.cmake)
set(MSLITE_GPU_BACKEND "" CACHE STRING "enable gpu backend, \
opencl only support arm64 and x86_64 , tensorrt only support x86_64, opencl/cuda/tensorrt/off")
set(MSLITE_REGISTRY_DEVICE "off" CACHE STRING "Compile Mindspore Lite that supports specific devices, \
currently supported devices: Hi3516D/Hi3519A/Hi3559A/sd3403")
currently supported devices: Hi3516D/Hi3519A/Hi3559A/SD3403")
option(MSLITE_ENABLE_NPU "enable npu, only arm64 or arm32 support" off)
option(MSLITE_ENABLE_TRAIN "enable train" on)
option(MSLITE_ENABLE_SSE "enable SSE instruction set, only x86_64 support" off)
@ -53,13 +53,6 @@ if(DEFINED ENV{MSLITE_GPU_BACKEND})
endif()
if(DEFINED ENV{MSLITE_REGISTRY_DEVICE})
set(MSLITE_REGISTRY_DEVICE $ENV{MSLITE_REGISTRY_DEVICE})
if(MSLITE_REGISTRY_DEVICE STREQUAL sd3403)
if(NOT PLATFORM_ARM64)
set(MSLITE_ENABLE_DPICO_ATC_ADAPTER on)
else()
set(MSLITE_ENABLE_DPICO_ACL_ADAPTER on)
endif()
endif()
endif()
if(DEFINED ENV{MSLITE_ENABLE_NPU})
set(MSLITE_ENABLE_NPU $ENV{MSLITE_ENABLE_NPU})
@ -190,6 +183,9 @@ elseif(PLATFORM_ARM32)
elseif(WIN32)
set(MSLITE_GPU_BACKEND "off")
else()
if(${MSLITE_REGISTRY_DEVICE} STREQUAL "SD3403")
set(MSLITE_ENABLE_DPICO_ATC_ADAPTER on)
endif()
if(MSLITE_GPU_BACKEND STREQUAL "")
set(MSLITE_GPU_BACKEND "off")
endif()
@ -379,10 +375,6 @@ else()
set(RUNTIME_COMPONENT_NAME "linux-x64")
endif()
if(MSLITE_ENABLE_DPICO_ACL_ADAPTER)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/tools/benchmark/dpico)
endif()
string(REPLACE "/mindspore/lite" "" TOP_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(CORE_DIR ${TOP_DIR}/mindspore/core)
set(CCSRC_DIR ${TOP_DIR}/mindspore/ccsrc)
@ -567,16 +559,16 @@ if(BUILD_MINDDATA STREQUAL "lite_cv")
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/minddata)
endif()
if(NOT MSLITE_ENABLE_DPICO_ACL_ADAPTER)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/src/ops)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/micro/coder)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/src)
add_subdirectory(${CCSRC_DIR}/backend/kernel_compiler/cpu/nnacl build)
endif()
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/src/ops)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/micro/coder)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/src)
add_subdirectory(${CCSRC_DIR}/backend/kernel_compiler/cpu/nnacl build)
if(MSLITE_ENABLE_TOOLS)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/tools/benchmark)
if(NOT BUILD_FIRST)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/tools/benchmark)
endif()
if(SUPPORT_TRAIN)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/tools/benchmark_train)
endif()

View File

@ -25,6 +25,15 @@ checkndk() {
fi
}
# Verify that the Hi35xx SDK location is configured before a Hi35xx build.
# Requires HI35XX_SDK_PATH to be set; aborts the build otherwise. On success,
# copies the SDK's third_patry directory (repo's spelling) into the nnie
# benchmark tool so the provider libraries can be compiled against it.
check_Hi35xx() {
    if [[ -z "${HI35XX_SDK_PATH}" ]]; then
        echo "error: to compile the runtime package of Hi35XX, you need to set HI35XX_SDK_PATH to declare the path of Hi35XX sdk."
        exit 1
    fi
    cp -r ${HI35XX_SDK_PATH}/third_patry ${BASEPATH}/mindspore/lite/tools/benchmark/nnie/
}
get_version() {
VERSION_MAJOR=$(grep "const int ms_version_major =" ${BASEPATH}/mindspore/lite/include/version.h | tr -dc "[0-9]")
VERSION_MINOR=$(grep "const int ms_version_minor =" ${BASEPATH}/mindspore/lite/include/version.h | tr -dc "[0-9]")
@ -142,16 +151,19 @@ build_lite() {
CMAKE_TOOLCHAIN_FILE=${BASEPATH}/cmake/lite_ios.cmake
fi
BRANCH_NAME=nnie_3516_master_dev
BRANCH_NAME=nnie_3516_master
if [[ ("${MSLITE_REGISTRY_DEVICE}" == "Hi3516D" || "${TOOLCHAIN_NAME}" == "himix200") && "${local_lite_platform}" == "arm32" ]]; then
TOOLCHAIN_NAME="himix200"
MSLITE_REGISTRY_DEVICE=Hi3516D
check_Hi35xx
elif [[ "${MSLITE_REGISTRY_DEVICE}" == "Hi3559A" && "${local_lite_platform}" == "arm64" ]]; then
TOOLCHAIN_NAME="himix100"
elif [[ "${MSLITE_REGISTRY_DEVICE}" == "sd3403" && "${local_lite_platform}" == "arm64" ]]; then
check_Hi35xx
elif [[ "${MSLITE_REGISTRY_DEVICE}" == "SD3403" && "${local_lite_platform}" == "arm64" ]]; then
TOOLCHAIN_NAME="mix210"
elif [[ "${MSLITE_REGISTRY_DEVICE}" == "Hi3519A" && "${local_lite_platform}" == "arm32" ]]; then
TOOLCHAIN_NAME="himix200"
check_Hi35xx
elif [[ ("${MSLITE_ENABLE_NNIE}" == "on" || "${MSLITE_REGISTRY_DEVICE}" == "Hi3516D") && "${local_lite_platform}" == "x86_64" ]]; then
MSLITE_REGISTRY_DEVICE=Hi3516D
fi
@ -190,13 +202,11 @@ build_lite() {
LITE_CMAKE_ARGS="${LITE_CMAKE_ARGS} -DTOOLCHAIN_NAME=himix100"
LITE_CMAKE_ARGS="${LITE_CMAKE_ARGS} -DBUILD_MINDDATA=off"
LITE_CMAKE_ARGS="${LITE_CMAKE_ARGS} -DMSLITE_ENABLE_FP16=off -DMSLITE_ENABLE_TRAIN=off -DMSLITE_GPU_BACKEND=off"
LITE_CMAKE_ARGS="${LITE_CMAKE_ARGS} -DMSLITE_ENABLE_TOOLS=off"
elif [[ "${TOOLCHAIN_NAME}" == "mix210" ]]; then
CMAKE_TOOLCHAIN_FILE=${BASEPATH}/mindspore/lite/cmake/mix210.toolchain.cmake
LITE_CMAKE_ARGS="${LITE_CMAKE_ARGS} -DTOOLCHAIN_NAME=mix210"
LITE_CMAKE_ARGS="${LITE_CMAKE_ARGS} -DBUILD_MINDDATA=off"
LITE_CMAKE_ARGS="${LITE_CMAKE_ARGS} -DMSLITE_ENABLE_FP16=off -DMSLITE_ENABLE_TRAIN=off -DMSLITE_GPU_BACKEND=off"
LITE_CMAKE_ARGS="${LITE_CMAKE_ARGS} -DMSLITE_ENABLE_TOOLS=off"
LITE_CMAKE_ARGS="${LITE_CMAKE_ARGS} -DMSLITE_ENABLE_FP16=on -DMSLITE_ENABLE_TRAIN=off -DMSLITE_GPU_BACKEND=off"
else
if [[ "${machine}" == "aarch64" ]]; then
LITE_CMAKE_ARGS="${LITE_CMAKE_ARGS} -DMACHINE_LINUX_ARM64=on"
@ -228,26 +238,25 @@ build_lite() {
if [[ "X$CMAKE_TOOLCHAIN_FILE" != "X" ]]; then
LITE_CMAKE_ARGS="${LITE_CMAKE_ARGS} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}"
fi
if [[ "X$MSLITE_REGISTRY_DEVICE" != "X" ]] && [[ "${MSLITE_REGISTRY_DEVICE}" != "sd3403" ]]; then
if [[ "X$MSLITE_REGISTRY_DEVICE" != "X" ]]; then
LITE_CMAKE_ARGS="${LITE_CMAKE_ARGS} -DMSLITE_REGISTRY_DEVICE=${MSLITE_REGISTRY_DEVICE}"
fi
if [[ "${local_lite_platform}" == "arm64" || "${local_lite_platform}" == "arm32" ]]; then
echo "default link libc++_static.a, export MSLITE_ANDROID_STL=c++_shared to link libc++_shared.so"
fi
echo "cmake ${LITE_CMAKE_ARGS} ${BASEPATH}/mindspore/lite"
if [[ "${MSLITE_REGISTRY_DEVICE}" == "sd3403" ]] && [[ "${local_lite_platform}" == "arm64" ]]; then
export MSLITE_REGISTRY_DEVICE=""
cmake ${LITE_CMAKE_ARGS} "${BASEPATH}/mindspore/lite"
export MSLITE_REGISTRY_DEVICE=sd3403
else
cmake ${LITE_CMAKE_ARGS} "${BASEPATH}/mindspore/lite"
fi
echo "cmake ${LITE_CMAKE_ARGS} -DBUILD_FIRST=ON ${BASEPATH}/mindspore/lite"
cmake ${LITE_CMAKE_ARGS} -DBUILD_FIRST=ON "${BASEPATH}/mindspore/lite"
if [[ "$(uname)" == "Darwin" && "${local_lite_platform}" != "x86_64" ]]; then
xcodebuild ONLY_ACTIVE_ARCH=NO -configuration Release -scheme mindspore-lite_static -target mindspore-lite_static -sdk iphoneos -quiet
elif [[ "$(uname)" == "Darwin" && "${local_lite_platform}" == "x86_64" ]]; then
xcodebuild ONLY_ACTIVE_ARCH=NO -configuration Release -scheme mindspore-lite_static -target mindspore-lite_static -sdk iphonesimulator -quiet
else
make -j$THREAD_NUM && make install
cp -r ${BASEPATH}/output/tmp/mindspore*/runtime ${BASEPATH}/mindspore/lite/tools/benchmark
cmake ${LITE_CMAKE_ARGS} -DBUILD_FIRST=off --target benchmark "${BASEPATH}/mindspore/lite"
make -j$THREAD_NUM && make install && make package
if [[ "${local_lite_platform}" == "x86_64" ]]; then
if [ "${JAVA_HOME}" ]; then
@ -288,37 +297,16 @@ build_lite() {
fi
[ -n "${BASEPATH}" ] && rm -rf ${BASEPATH}/output/tmp/
if [[ "X$MSLITE_REGISTRY_DEVICE" != "X" ]] && [[ "${MSLITE_REGISTRY_DEVICE}" != "sd3403" ]]; then
if [[ "X$MSLITE_REGISTRY_DEVICE" != "X" ]] && [[ "${MSLITE_REGISTRY_DEVICE}" != "SD3403" ]]; then
local compile_nnie_script=${BASEPATH}/mindspore/lite/tools/providers/NNIE/Hi3516D/compile_nnie.sh
cd ${BASEPATH}/../
if [[ "${local_lite_platform}" == "x86_64" ]]; then
bash ${compile_nnie_script} -I ${local_lite_platform} -b ${BRANCH_NAME} -j $THREAD_NUM
else
bash ${compile_nnie_script} -I ${local_lite_platform} -b ${BRANCH_NAME} -t ${TOOLCHAIN_NAME} -d ${MSLITE_REGISTRY_DEVICE} -j $THREAD_NUM
fi
if [[ $? -ne 0 ]]; then
echo "compile ${local_lite_platform} for nnie failed."
exit 1
fi
elif [[ "${MSLITE_REGISTRY_DEVICE}" == "sd3403" ]] && [[ "${local_lite_platform}" == "arm64" ]]; then
LITE_CMAKE_ARGS=$(echo ${LITE_CMAKE_ARGS} | sed -e "s/MSLITE_ENABLE_TOOLS=off/MSLITE_ENABLE_TOOLS=on/g")
LITE_CMAKE_ARGS="${LITE_CMAKE_ARGS} -DMSLITE_REGISTRY_DEVICE=${MSLITE_REGISTRY_DEVICE}"
cmake ${LITE_CMAKE_ARGS} "${BASEPATH}/mindspore/lite"
cd ${BASEPATH}
compile_dpico_script=${BASEPATH}/mindspore/lite/tools/providers/dpico/sd3403/compile_3403.sh
bash ${compile_dpico_script} -t prepare_third_party
if [[ $? -ne 0 ]]; then
echo "prepare for dpico failed."
exit 1
fi
cd ${BASEPATH}/mindspore/lite/build
make -j$THREAD_NUM
cd ${BASEPATH}
sh ${compile_dpico_script}
if [[ $? -ne 0 ]]; then
echo "second compile arm64 for dpico failed."
exit 1
fi
fi
echo "---------------- mindspore lite: build success ----------------"
fi

View File

@ -21,6 +21,8 @@ else()
if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
string(REPLACE "-O2" "-O0" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
string(REPLACE "-O2" "-O0" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "-D_FORTIFY_SOURCE=2" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
string(REPLACE "-D_FORTIFY_SOURCE=2" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
endif()
set(CMAKE_SHARED_LINKER_FLAGS "${SECURE_SHARED_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${SECURE_EXE_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}")

View File

@ -3,6 +3,9 @@ function(merge_parser CL_SRC_DIR OUT_FILE_NAME)
if(NOT EXISTS ${CL_SRC_DIR})
return()
endif()
if(DEFINED BUILD_FIRST AND NOT BUILD_FIRST)
return()
endif()
file(GLOB_RECURSE CL_LIST ${CL_SRC_DIR}/*.cc)
list(SORT CL_LIST)
set(out_file ${OUT_FILE_NAME})

View File

@ -19,6 +19,9 @@ set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
#set(CMAKE_CXX_FLAGS "-march= -mfloat-abi=softfp -mfpu=neon-vfpv4 ${CMAKE_CXX_FLAGS}")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8.2-a+fp16")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8.2-a+fp16")
# cache flags
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags")

View File

@ -1,20 +0,0 @@
set(CMSIS_DIR ${CMAKE_BINARY_DIR}/cmsis)
message("build cmsis kernels")
include_directories(${CMSIS_DIR}/CMSIS/Core/Include)
include_directories(${CMSIS_DIR}/CMSIS/DSP/Include)
include_directories(${CMSIS_DIR}/CMSIS/NN/Include)
file(REMOVE ${CMSIS_DIR}/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_no_shift.c)
file(GLOB CMSIS_OPS
${CMSIS_DIR}/CMSIS/NN/Source/BasicMathFunctions/*.c
${CMSIS_DIR}/CMSIS/NN/Source/ActivationFunctions/*.c
${CMSIS_DIR}/CMSIS/NN/Source/ConcatenationFunctions/*.c
${CMSIS_DIR}/CMSIS/NN/Source/ConvolutionFunctions/*.c
${CMSIS_DIR}/CMSIS/NN/Source/FullyConnectedFunctions/*.c
${CMSIS_DIR}/CMSIS/NN/Source/NNSupportFunctions/*.c
${CMSIS_DIR}/CMSIS/NN/Source/PoolingFunctions/*.c
${CMSIS_DIR}/CMSIS/NN/Source/ReshapeFunctions/*.c
${CMSIS_DIR}/CMSIS/NN/Source/SoftmaxFunctions/*.c
)

View File

@ -16,7 +16,24 @@ if(PLATFORM_ARM64)
elseif(PLATFORM_ARM32)
add_compile_definitions(ENABLE_ARM32)
else()
include(${MICRO_DIR}/cmake/package_cmsis.cmake)
set(CMSIS_DIR ${CMAKE_BINARY_DIR}/cmsis)
message("build cmsis kernels")
include_directories(${CMSIS_DIR}/CMSIS/Core/Include)
include_directories(${CMSIS_DIR}/CMSIS/DSP/Include)
include_directories(${CMSIS_DIR}/CMSIS/NN/Include)
file(REMOVE ${CMSIS_DIR}/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_no_shift.c)
file(GLOB CMSIS_OPS
${CMSIS_DIR}/CMSIS/NN/Source/BasicMathFunctions/*.c
${CMSIS_DIR}/CMSIS/NN/Source/ActivationFunctions/*.c
${CMSIS_DIR}/CMSIS/NN/Source/ConcatenationFunctions/*.c
${CMSIS_DIR}/CMSIS/NN/Source/ConvolutionFunctions/*.c
${CMSIS_DIR}/CMSIS/NN/Source/FullyConnectedFunctions/*.c
${CMSIS_DIR}/CMSIS/NN/Source/NNSupportFunctions/*.c
${CMSIS_DIR}/CMSIS/NN/Source/PoolingFunctions/*.c
${CMSIS_DIR}/CMSIS/NN/Source/ReshapeFunctions/*.c
${CMSIS_DIR}/CMSIS/NN/Source/SoftmaxFunctions/*.c
)
add_library(cmsis_nn STATIC ${CMSIS_OPS})
endif()

View File

@ -2,7 +2,7 @@
# Build x86 tar.gz file for dpico
function Run_Build_x86() {
export MSLITE_REGISTRY_DEVICE=sd3403
export MSLITE_REGISTRY_DEVICE=SD3403
unset JAVA_HOME
bash ${mindspore_top_dir}/build.sh -I x86_64 -j 80
if [ $? = 0 ]; then
@ -19,7 +19,7 @@ function Run_Build_x86() {
# Build arm32 tar.gz file for dpico
function Run_Build_arm64() {
export MSLITE_REGISTRY_DEVICE=sd3403
export MSLITE_REGISTRY_DEVICE=SD3403
unset JAVA_HOME
bash ${mindspore_top_dir}/build.sh -I arm64 -j 80
if [ $? = 0 ]; then

View File

@ -1,4 +1,48 @@
# add shared link library
cmake_minimum_required(VERSION 3.14)
project(Lite_benchmark)
set(BENCHMARK_LINK_LIB mindspore-lite)
if(TARGET_HIMIX)
add_subdirectory(nnie)
add_subdirectory(nnie_proposal)
set(CMAKE_SKIP_BUILD_RPATH on)
set(BENCHMARK_LINK_LIB ${BENCHMARK_LINK_LIB} pthread
mslite_proposal mslite_nnie dl nnie mpi VoiceEngine upvqe dnvqe securec)
if(${MSLITE_REGISTRY_DEVICE} STREQUAL "Hi3516D")
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/nnie/third_patry/hi3516_sdk)
link_directories(${CMAKE_CURRENT_SOURCE_DIR}/nnie/third_patry/hi3516_sdk/lib)
set(BENCHMARK_LINK_LIB ${BENCHMARK_LINK_LIB} mindspore::json)
elseif(${MSLITE_REGISTRY_DEVICE} STREQUAL "Hi3519A")
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/nnie/third_patry/hi3519_sdk)
link_directories(${CMAKE_CURRENT_SOURCE_DIR}/nnie/third_patry/hi3519_sdk/lib)
set(BENCHMARK_LINK_LIB ${BENCHMARK_LINK_LIB} mindspore::json)
elseif(${MSLITE_REGISTRY_DEVICE} STREQUAL "Hi3559A")
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/nnie/third_patry/hi3559_sdk)
link_directories(${CMAKE_CURRENT_SOURCE_DIR}/nnie/third_patry/hi3559_sdk/lib)
add_compile_definitions(BENCHMARK_CLIP_JSON)
endif()
elseif(TARGET_MIX210)
set(CMAKE_SKIP_BUILD_RPATH on)
set(BENCHMARK_LINK_LIB ${BENCHMARK_LINK_LIB} mindspore::json pthread
dpico_acl_adapter svp_acl dl securec protobuf-c stdc++)
if(${MSLITE_REGISTRY_DEVICE} STREQUAL "SD3403")
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/dpico)
message("34xx_sdk_SOURCE_DIR:${34xx_sdk_SOURCE_DIR}.")
include_directories(${34xx_sdk_SOURCE_DIR}/include)
include_directories(${34xx_sdk_SOURCE_DIR})
link_directories(${34xx_sdk_SOURCE_DIR}/lib)
endif()
else()
set(BENCHMARK_LINK_LIB ${BENCHMARK_LINK_LIB} mindspore::json)
if(PLATFORM_ARM32 OR PLATFORM_ARM64 AND NOT TARGET_OHOS_LITE AND NOT MACHINE_LINUX_ARM64)
if(SUPPORT_NPU AND ANDROID_STL STREQUAL "c++_static")
set(BENCHMARK_LINK_LIB ${BENCHMARK_LINK_LIB} c++_shared)
endif()
elseif(NOT MSVC)
set(BENCHMARK_LINK_LIB ${BENCHMARK_LINK_LIB} pthread)
endif()
endif()
include_directories(${CCSRC_DIR}/backend/kernel_compiler/cpu)
set(COMMON_SRC
${CMAKE_CURRENT_SOURCE_DIR}/../common/flag_parser.cc
@ -7,67 +51,32 @@ set(COMMON_SRC
${CMAKE_CURRENT_SOURCE_DIR}/../../src/common/utils.cc
${CMAKE_CURRENT_SOURCE_DIR}/../../../ccsrc/backend/kernel_compiler/cpu/nnacl/nnacl_common.c
)
if(NOT MSLITE_ENABLE_DPICO_ACL_ADAPTER)
if(MSLITE_ENABLE_SHARING_MEM_WITH_OPENGL)
set(COMMON_SRC ${COMMON_SRC} ../common/opengl_util.cc)
endif()
add_executable(benchmark
${CMAKE_CURRENT_SOURCE_DIR}/main.cc
${CMAKE_CURRENT_SOURCE_DIR}/run_benchmark.cc
${CMAKE_CURRENT_SOURCE_DIR}/benchmark_base.cc
${CMAKE_CURRENT_SOURCE_DIR}/benchmark.cc
${CMAKE_CURRENT_SOURCE_DIR}/benchmark_unified_api.cc
${CMAKE_CURRENT_SOURCE_DIR}/benchmark_c_api.cc
${COMMON_SRC})
if(MSLITE_ENABLE_SHARING_MEM_WITH_OPENGL)
set(COMMON_SRC ${COMMON_SRC} ../common/opengl_util.cc)
endif()
add_dependencies(benchmark fbs_src)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../../lite)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../../core)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/runtime)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/runtime/include)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/runtime/include/third_party)
link_directories(${CMAKE_CURRENT_SOURCE_DIR}/runtime/lib)
if(MSLITE_ENABLE_SHARING_MEM_WITH_OPENGL)
list(APPEND opengl_lib EGL GLESv3)
target_link_libraries(benchmark ${opengl_lib})
endif()
if(MSLITE_ENABLE_SHARING_MEM_WITH_OPENGL)
list(APPEND opengl_lib EGL GLESv3)
set(BENCHMARK_LINK_LIB ${BENCHMARK_LINK_LIB} ${opengl_lib})
endif()
if((PLATFORM_ARM32 OR PLATFORM_ARM64) AND NOT TARGET_HIMIX
AND NOT TARGET_OHOS_LITE AND NOT MACHINE_LINUX_ARM64 AND NOT TARGET_MIX210)
if(SUPPORT_NPU AND ANDROID_STL STREQUAL "c++_static")
target_link_libraries(benchmark mindspore-lite mindspore::json c++_shared)
else()
target_link_libraries(benchmark mindspore-lite mindspore::json)
endif()
elseif(MSVC)
target_link_libraries(benchmark mindspore-lite mindspore::json)
else()
target_link_libraries(benchmark mindspore-lite mindspore::json pthread)
endif()
else()
__download_pkg(34xx_sdk
http://mindspore-repo.csi.rnd.huawei.com/mindspore/enterprise/dpico/34xx_sdk.tar.gz
f64a9129615b3b41b63debe17c6785af)
add_executable(benchmark
${CMAKE_CURRENT_SOURCE_DIR}/main.cc
${CMAKE_CURRENT_SOURCE_DIR}/run_benchmark.cc
${CMAKE_CURRENT_SOURCE_DIR}/benchmark_base.cc
${CMAKE_CURRENT_SOURCE_DIR}/benchmark.cc
${CMAKE_CURRENT_SOURCE_DIR}/benchmark_unified_api.cc
${CMAKE_CURRENT_SOURCE_DIR}/benchmark_c_api.cc
${COMMON_SRC})
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../../lite)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../../core)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/dpico/third_party/runtime)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/dpico/third_party/runtime/include)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/dpico/third_party/runtime/include/third_party)
add_dependencies(benchmark fbs_src)
include_directories(${34xx_sdk_SOURCE_DIR}/include)
include_directories(${34xx_sdk_SOURCE_DIR})
link_directories(${34xx_sdk_SOURCE_DIR}/lib)
link_directories(${CMAKE_CURRENT_SOURCE_DIR}/dpico/third_party/runtime/lib)
set(CMAKE_SKIP_BUILD_RPATH on)
add_executable(benchmark
${CMAKE_CURRENT_SOURCE_DIR}/main.cc
${CMAKE_CURRENT_SOURCE_DIR}/run_benchmark.cc
${CMAKE_CURRENT_SOURCE_DIR}/benchmark_base.cc
${CMAKE_CURRENT_SOURCE_DIR}/benchmark.cc
${CMAKE_CURRENT_SOURCE_DIR}/benchmark_unified_api.cc
${CMAKE_CURRENT_SOURCE_DIR}/benchmark_c_api.cc
${COMMON_SRC})
add_dependencies(benchmark fbs_src)
target_link_libraries(benchmark mindspore-lite mindspore::json pthread
dpico_acl_adapter dl svp_acl securec protobuf-c stdc++)
endif()
target_link_libraries(benchmark ${BENCHMARK_LINK_LIB})

View File

@ -35,6 +35,12 @@
#include <asm/unistd.h>
#include <unistd.h>
#endif
#ifdef SUPPORT_NNIE
#include "include/hi_common.h"
#include "include/hi_comm_vb.h"
#include "include/mpi_sys.h"
#include "include/mpi_vb.h"
#endif
namespace mindspore {
namespace lite {
@ -344,18 +350,48 @@ int Benchmark::InitContext(const std::shared_ptr<Context> &context) {
return RET_OK;
}
// Find the single session output tensor whose shape matches node_shape.
// Used as a fallback when an output cannot be resolved by name.
// Returns nullptr when zero or more than one output matches (ambiguous).
tensor::MSTensor *Benchmark::GetTensorByNodeShape(const std::vector<size_t> &node_shape) {
  std::vector<tensor::MSTensor *> match_tensors;
  std::vector<int> shape_vector;
  // Tensor shapes are stored as int; calib shapes arrive as size_t — convert
  // so the element-wise comparison below is well-typed.
  (void)std::transform(node_shape.begin(), node_shape.end(), std::back_inserter(shape_vector),
                       [](const size_t &value) { return static_cast<int>(value); });
  auto tensors = session_->GetOutputs();
  for (auto &out_tensor_pair : tensors) {
    if (out_tensor_pair.second->shape() == shape_vector) {
      match_tensors.emplace_back(out_tensor_pair.second);
    }
  }
  // Exactly one match is required; size() != 1 already covers the empty case,
  // so the former `empty() || size() != 1` test was redundant.
  if (match_tensors.size() != 1) {
    MS_LOG(ERROR) << "get tensor by node shape failed";
    return nullptr;
  }
  return match_tensors.front();
}
// Resolve an output tensor: first by tensor name, then by node name, and
// finally — when the node lookup yields zero or multiple tensors — by
// matching `dims` against the session outputs (GetTensorByNodeShape).
// Returns nullptr only if every strategy fails.
tensor::MSTensor *Benchmark::GetTensorByNameOrShape(const std::string &node_or_tensor_name,
                                                    const std::vector<size_t> &dims) {
  tensor::MSTensor *tensor = session_->GetOutputByTensorName(node_or_tensor_name);
  if (tensor == nullptr) {
    // The tensor-name lookup failed, so the name may be a node name instead.
    // (The previous message claimed to "switch to GetOutputByTensorName",
    // i.e. the call that had just failed — corrected to name the actual fallback.)
    MS_LOG(INFO) << "Cannot find output tensor: " << node_or_tensor_name
                 << ", switch to GetOutputsByNodeName";
    auto tensors = session_->GetOutputsByNodeName(node_or_tensor_name);
    if (tensors.size() == 1) {
      tensor = tensors.front();
    } else {
      // Zero or ambiguous node outputs: fall back to shape matching.
      return GetTensorByNodeShape(dims);
    }
  }
  return tensor;
}
int Benchmark::CompareOutput() {
std::cout << "================ Comparing Output data ================" << std::endl;
float total_bias = 0;
int total_size = 0;
// check the output tensor name.
if (this->benchmark_tensor_names_ != session_->GetOutputTensorNames()) {
MS_LOG(ERROR) << "The output tensor name is wrong.";
return RET_ERROR;
}
for (const auto &calib_tensor : benchmark_data_) {
std::string tensor_name = calib_tensor.first;
tensor::MSTensor *tensor = session_->GetOutputByTensorName(tensor_name);
tensor::MSTensor *tensor = GetTensorByNameOrShape(tensor_name, calib_tensor.second->shape);
if (tensor == nullptr) {
MS_LOG(ERROR) << "Get tensor failed, tensor name: " << tensor_name;
return RET_ERROR;
@ -940,7 +976,7 @@ std::string DumpMSTensor(tensor::MSTensor *tensor) {
}
return oss.str();
}
#ifndef BENCHMARK_CLIP_JSON
std::string GenerateOutputFileName(tensor::MSTensor *tensor, const std::string &op_name, const std::string &file_type,
const size_t &idx) {
std::string file_name = op_name;
@ -962,6 +998,7 @@ std::string GenerateOutputFileName(tensor::MSTensor *tensor, const std::string &
}
return file_name;
}
#endif
} // namespace
int Benchmark::InitPrintTensorDataCallbackParameter() {
@ -990,6 +1027,7 @@ int Benchmark::InitPrintTensorDataCallbackParameter() {
return RET_OK;
}
int Benchmark::InitDumpTensorDataCallbackParameter() {
#ifndef BENCHMARK_CLIP_JSON
// before callback
before_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
@ -1035,6 +1073,7 @@ int Benchmark::InitDumpTensorDataCallbackParameter() {
}
return true;
};
#endif
return RET_OK;
}

View File

@ -29,7 +29,9 @@
#include <memory>
#include <cfloat>
#include <utility>
#ifndef BENCHMARK_CLIP_JSON
#include <nlohmann/json.hpp>
#endif
#include "tools/benchmark/benchmark_base.h"
#include "include/model.h"
#include "tools/common/flag_parser.h"
@ -96,6 +98,8 @@ class MS_API Benchmark : public BenchmarkBase {
int CompareDataGetTotalCosineDistanceAndSize(const std::string &name, tensor::MSTensor *tensor,
float *total_cosine_distance, int *total_size);
tensor::MSTensor *GetTensorByNodeShape(const std::vector<size_t> &node_shape);
tensor::MSTensor *GetTensorByNameOrShape(const std::string &node_or_tensor_name, const std::vector<size_t> &dims);
private:
#ifdef ENABLE_OPENGL_TEXTURE

View File

@ -34,6 +34,12 @@
#include <asm/unistd.h>
#include <unistd.h>
#endif
#ifdef SUPPORT_NNIE
#include "include/hi_common.h"
#include "include/hi_comm_vb.h"
#include "include/mpi_sys.h"
#include "include/mpi_vb.h"
#endif
namespace mindspore {
namespace lite {
@ -57,6 +63,10 @@ constexpr int16_t kInputDataInt8Min = -127;
constexpr int16_t kInputDataInt8Max = 127;
constexpr int16_t kInputDataUint8Min = 0;
constexpr int16_t kInputDataUint8Max = 254;
#ifdef SUPPORT_NNIE
constexpr int kNNIEMaxPoolCnt = 2;
constexpr int kNNIEBlkSize = 768 * 576 * 2;
#endif
const std::unordered_map<int, std::string> kTypeIdMap{
{kNumberTypeFloat16, "Float16"}, {kNumberTypeFloat, "Float32"}, {kNumberTypeFloat32, "Float32"},
@ -294,6 +304,7 @@ int BenchmarkBase::CheckDeviceTypeValid() {
}
int BenchmarkBase::InitDumpConfigFromJson(char *path) {
#ifndef BENCHMARK_CLIP_JSON
auto real_path = RealPath(path);
std::ifstream ifs(real_path);
if (!ifs.good()) {
@ -354,7 +365,7 @@ int BenchmarkBase::InitDumpConfigFromJson(char *path) {
MS_LOG(ERROR) << "create data output directory failed.";
return RET_ERROR;
}
#endif
return RET_OK;
}
@ -623,6 +634,72 @@ int BenchmarkBase::PrintPerfResult(const std::vector<std::string> &title,
}
#endif
#ifdef SUPPORT_NNIE
// Initialize the HiSilicon SVP/NNIE runtime for the benchmark process.
// Tears down any prior MPI system / video-buffer (VB) state, then configures
// a VB pool and brings the MPI system back up. Call order matters: VB must be
// configured and initialized before HI_MPI_SYS_Init.
// Returns RET_OK on success, RET_ERROR on any unrecoverable SDK failure.
int SvpSysInit() {
HI_S32 ret = HI_SUCCESS;
VB_CONFIG_S struVbConf;
// Reset any MPI system state left over from a previous run.
ret = HI_MPI_SYS_Exit();
if (HI_SUCCESS != ret) {
MS_LOG(ERROR) << "HI_MPI_SYS_Exit failed!";
return RET_ERROR;
}
ret = HI_MPI_VB_Exit();
if (HI_SUCCESS != ret) {
// VB was not initialized (or cannot be torn down): skip VB configuration
// and just restart the MPI system. This path is treated as best-effort.
MS_LOG(WARNING) << "HI_MPI_VB_Exit failed!";
ret = HI_MPI_SYS_Init();
if (HI_SUCCESS != ret) {
MS_LOG(ERROR) << "Error:HI_MPI_SYS_Init failed!";
return RET_ERROR;
}
return RET_OK;
}
// Configure the common VB pool: kNNIEMaxPoolCnt pools, one block of
// kNNIEBlkSize bytes in pool index 1.
// NOTE(review): only astCommPool[1] is populated while u32MaxPoolCnt is 2;
// pool index 0 is left zeroed by the memset — confirm this is intended.
memset(&struVbConf, 0, sizeof(VB_CONFIG_S));
struVbConf.u32MaxPoolCnt = kNNIEMaxPoolCnt;
struVbConf.astCommPool[1].u64BlkSize = kNNIEBlkSize;
struVbConf.astCommPool[1].u32BlkCnt = 1;
ret = HI_MPI_VB_SetConfig((const VB_CONFIG_S *)&struVbConf);
if (HI_SUCCESS != ret) {
MS_LOG(ERROR) << "Error:HI_MPI_VB_SetConf failed!";
return RET_ERROR;
}
ret = HI_MPI_VB_Init();
if (HI_SUCCESS != ret) {
MS_LOG(ERROR) << "Error:HI_MPI_VB_Init failed!";
return RET_ERROR;
}
// Bring the MPI system up last, after the VB pool is in place.
ret = HI_MPI_SYS_Init();
if (HI_SUCCESS != ret) {
MS_LOG(ERROR) << "Error:HI_MPI_SYS_Init failed!";
return RET_ERROR;
}
return RET_OK;
}
// Tear down the HiSilicon SVP/NNIE runtime: release the MPI system first,
// then the video-buffer pool. A VB exit failure is tolerated (logged as a
// WARNING, still returns RET_OK); a system exit failure returns RET_ERROR.
int SvpSysExit() {
  if (HI_MPI_SYS_Exit() != HI_SUCCESS) {
    MS_LOG(ERROR) << "HI_MPI_SYS_Exit failed!";
    return RET_ERROR;
  }
  if (HI_MPI_VB_Exit() != HI_SUCCESS) {
    MS_LOG(WARNING) << "HI_MPI_VB_Exit failed!";
  }
  return RET_OK;
}
#endif
BenchmarkBase::~BenchmarkBase() {
for (auto &iter : this->benchmark_data_) {
iter.second->shape.clear();
@ -631,6 +708,9 @@ BenchmarkBase::~BenchmarkBase() {
iter.second = nullptr;
}
this->benchmark_data_.clear();
#ifdef SUPPORT_NNIE
SvpSysExit();
#endif
}
} // namespace lite
} // namespace mindspore

View File

@ -29,7 +29,9 @@
#include <memory>
#include <cfloat>
#include <utility>
#ifndef BENCHMARK_CLIP_JSON
#include <nlohmann/json.hpp>
#endif
#include "include/model.h"
#include "include/api/types.h"
#include "include/api/format.h"
@ -419,9 +421,10 @@ class MS_API BenchmarkBase {
float op_cost_total_ = 0.0f;
std::map<std::string, std::pair<int, float>> op_times_by_type_;
std::map<std::string, std::pair<int, float>> op_times_by_name_;
#ifndef BENCHMARK_CLIP_JSON
// dump data
nlohmann::json dump_cfg_json_;
#endif
std::string dump_file_output_dir_;
#ifdef ENABLE_ARM64
int perf_fd = 0;
@ -432,6 +435,10 @@ class MS_API BenchmarkBase {
#endif
std::mt19937 random_engine_;
};
#ifdef SUPPORT_NNIE
int SvpSysInit();
int SvpSysExit();
#endif
} // namespace mindspore::lite
#endif // MINNIE_BENCHMARK_BENCHMARK_BASE_H_

View File

@ -36,6 +36,12 @@
#include <asm/unistd.h>
#include <unistd.h>
#endif
#ifdef SUPPORT_NNIE
#include "include/hi_common.h"
#include "include/hi_comm_vb.h"
#include "include/mpi_sys.h"
#include "include/mpi_vb.h"
#endif
namespace mindspore {
constexpr size_t kDataToStringMaxNum = 40;
@ -1081,7 +1087,7 @@ std::string DumpMSTensor(mindspore::MSTensor *tensor) {
}
return oss.str();
}
#ifndef BENCHMARK_CLIP_JSON
std::string GenerateOutputFileName(mindspore::MSTensor *tensor, const std::string &op_name,
const std::string &file_type, const size_t &idx) {
std::string file_name = op_name;
@ -1105,6 +1111,7 @@ std::string GenerateOutputFileName(mindspore::MSTensor *tensor, const std::strin
file_name += +".bin";
return file_name;
}
#endif
} // namespace
int BenchmarkUnifiedApi::InitPrintTensorDataCallbackParameter() {
@ -1132,6 +1139,7 @@ int BenchmarkUnifiedApi::InitPrintTensorDataCallbackParameter() {
return RET_OK;
}
int BenchmarkUnifiedApi::InitDumpTensorDataCallbackParameter() {
#ifndef BENCHMARK_CLIP_JSON
// before callback
ms_before_call_back_ = [&](const std::vector<mindspore::MSTensor> &before_inputs,
const std::vector<mindspore::MSTensor> &before_outputs,
@ -1177,6 +1185,7 @@ int BenchmarkUnifiedApi::InitDumpTensorDataCallbackParameter() {
}
return true;
};
#endif
return RET_OK;
}

View File

@ -29,7 +29,9 @@
#include <memory>
#include <cfloat>
#include <utility>
#ifndef BENCHMARK_CLIP_JSON
#include <nlohmann/json.hpp>
#endif
#include "tools/benchmark/benchmark_base.h"
#include "include/model.h"
#include "tools/common/flag_parser.h"

View File

@ -7,9 +7,9 @@ __download_pkg(34xx_sdk
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
include_directories(${34xx_sdk_SOURCE_DIR})
include_directories(${34xx_sdk_SOURCE_DIR}/include)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/third_party/runtime)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/third_party/runtime/include)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/third_party/runtime/include/third_party)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../runtime)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../runtime/include)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../runtime/include/third_party)
link_directories(${34xx_sdk_SOURCE_DIR}/lib)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/src COMMON_SRC3)

View File

@ -27,7 +27,7 @@ using mindspore::schema::PrimitiveType_Custom;
namespace mindspore {
namespace dpico {
namespace {
constexpr int kBaseValue = 10;
constexpr int kDecimal = 10;
constexpr auto kInputShape = "inputs_shape";
constexpr auto kOutputShape = "outputs_shape";
constexpr auto kOutputsFormat = "outputs_format";
@ -66,13 +66,13 @@ Status GetCustomShape(const std::map<std::string, std::string> &attrs, const std
char *save_ptr = nullptr;
res = strtok_r(attr.data(), delims, &save_ptr);
while (res != nullptr) {
int64_t ndims = strtol(res, &res, kBaseValue);
int64_t ndims = strtol(res, &res, kDecimal);
int j = 0;
std::vector<int64_t> shape;
shape.resize(ndims);
for (; j < ndims; j++) {
res = strtok_r(NULL, delims, &save_ptr);
shape[j] = static_cast<int64_t>(strtol(res, &res, kBaseValue));
shape[j] = static_cast<int64_t>(strtol(res, &res, kDecimal));
}
shapes->push_back(shape);

View File

@ -0,0 +1,37 @@
# Builds libmslite_nnie.so, the NNIE custom-kernel provider for MindSpore Lite.
cmake_minimum_required(VERSION 3.14)
project(NNIE_Custom)

include_directories(${CMAKE_CURRENT_SOURCE_DIR})

# Select the vendor SDK matching the target SoC. The comparisons are quoted
# so an unset or empty MSLITE_REGISTRY_DEVICE does not make if() fail with
# "Unknown arguments" or double-dereference a variable (CMP0054).
# NOTE(review): "third_patry" matches the on-disk directory name used by the
# download step - do not "fix" the spelling here alone.
if("${MSLITE_REGISTRY_DEVICE}" STREQUAL "Hi3516D")
    include_directories(${CMAKE_CURRENT_SOURCE_DIR}/third_patry/hi3516_sdk/)
    link_directories(${CMAKE_CURRENT_SOURCE_DIR}/third_patry/hi3516_sdk/lib)
elseif("${MSLITE_REGISTRY_DEVICE}" STREQUAL "Hi3519A")
    include_directories(${CMAKE_CURRENT_SOURCE_DIR}/third_patry/hi3519_sdk/)
    link_directories(${CMAKE_CURRENT_SOURCE_DIR}/third_patry/hi3519_sdk/lib)
elseif("${MSLITE_REGISTRY_DEVICE}" STREQUAL "Hi3559A")
    include_directories(${CMAKE_CURRENT_SOURCE_DIR}/third_patry/hi3559_sdk/)
    link_directories(${CMAKE_CURRENT_SOURCE_DIR}/third_patry/hi3559_sdk/lib)
endif()

# MindSpore Lite runtime headers shipped next to this provider in the package.
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../runtime)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../runtime/include)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../runtime/include/third_party)

aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/src COMMON_SRC3)

set(MSLITE_NNIE_LINK_LIB nnie mpi VoiceEngine upvqe dnvqe)
add_library(mslite_nnie SHARED
        ${COMMON_SRC3})
target_link_libraries(mslite_nnie ${MSLITE_NNIE_LINK_LIB} securec)

# The cross-toolchain strip binary can be overridden via -DHIMIX_STRIP=...
if(DEFINED HIMIX_STRIP)
    set(NDK_STRIP ${HIMIX_STRIP})
else()
    set(NDK_STRIP "arm-himix200-linux-strip")
endif()

# Strip release binaries after linking to shrink the deployed .so.
if("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
    add_custom_command(TARGET mslite_nnie POST_BUILD COMMAND ${NDK_STRIP}
            ${CMAKE_CURRENT_BINARY_DIR}/libmslite_nnie.so)
endif()

View File

@ -0,0 +1,178 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/custom_fp32.h"
#include <map>
#include <memory>
#include "schema/model_generated.h"
#include "include/registry/register_kernel.h"
#include "include/errorcode.h"
#include "src/nnie_manager.h"
#include "src/nnie_print.h"
#include "src/nnie_cfg_parser.h"
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Custom;
namespace mindspore {
namespace nnie {
bool CustomCPUKernel::load_model_ = false;
int CustomCPUKernel::run_seg_ = 0;
bool CustomCPUKernel::roi_used_ = false;
// Loads the NNIE WK model on first use and caches the output tensor shapes.
// The serialized model blob is carried in the LAST input tensor; the earlier
// inputs are the network's real inputs.
// NOTE(review): when NNIEManager::Init() fails this returns RET_OK without
// setting load_model_, so Execute() later fails with "WK Model is not load."
// Confirm this lenient return is intended.
int CustomCPUKernel::Prepare() {
  if (!load_model_) {
    // Reads TIME_STEP / MAX_ROI_NUM / CORE_IDS from environment variables.
    Flags flags;
    flags.Init();
    if (nnie::NNIEManager::GetInstance()->CfgInit(flags.max_roi_num_, flags.time_step_, flags.core_ids_) != RET_OK) {
      LOGE("Nnie init cfg fail");
      return RET_ERROR;
    }
    // The last input tensor holds the compiled WK model buffer.
    if (nnie::NNIEManager::GetInstance()->Init(reinterpret_cast<char *>(inputs_[inputs_.size() - 1].MutableData()),
                                               static_cast<int>(inputs_[inputs_.size() - 1].ElementNum()), inputs_)) {
      // LOGW("Load WK Model Fail");
      return RET_OK;
    }
    load_model_ = true;
  }
  // Cache the current output shapes; Run() consumes them later.
  outputs_shapes_.resize(outputs_.size());
  for (size_t i = 0; i < outputs_.size(); i++) {
    outputs_shapes_[i] = outputs_[i].Shape();
  }
  return RET_OK;
}
// Drops any previously loaded WK model so that Prepare() reloads it with
// the new tensor shapes, then runs the normal preparation path.
int CustomCPUKernel::ReSize() {
  if (!load_model_) {
    return Prepare();
  }
  nnie::NNIEManager::GetInstance()->Release();
  load_model_ = false;
  return Prepare();
}
// Runs this kernel's network segment on the NNIE engine: fills the segment's
// input blobs from inputs_, launches the segment, then advances the shared
// segment cursor.
int CustomCPUKernel::Execute() {
  if (!load_model_) {
    LOGE("WK Model is not load.");
    return RET_ERROR;
  }
  // run_seg_ is static: consecutive kernels step through the model's
  // segments in order, each starting from its own seg_id_.
  run_seg_ = seg_id_;
  if (nnie::NNIEManager::GetInstance()->FillData(&inputs_, run_seg_)) {
    LOGE("Fail Fill Data");
    return RET_ERROR;
  }
  if (nnie::NNIEManager::GetInstance()->Run(&outputs_, run_seg_, outputs_shapes_)) {
    LOGE("Fail WK Run");
    return RET_ERROR;
  }
  run_seg_++;
  return RET_OK;
}
// Releases the shared WK model when the last kernel holding it is destroyed.
CustomCPUKernel::~CustomCPUKernel() {
  if (!load_model_) {
    return;
  }
  nnie::NNIEManager::GetInstance()->Release();
  load_model_ = false;
}
// Copies the value of the named Custom-op attribute into `buf` as a
// NUL-terminated string.
// Returns false when the attribute table is missing, the attribute is
// absent, or its value plus terminator does not fit into buf_size bytes.
bool GetCustomAttr(char *buf, int buf_size, const mindspore::schema::Custom *op, const std::string &attr) {
  // fix: op->attr() is an optional flatbuffers table and may be null.
  if (buf == nullptr || op == nullptr || op->attr() == nullptr) {
    return false;
  }
  for (size_t i = 0; i < op->attr()->size(); i++) {
    if (op->attr()->Get(i)->name()->str() != attr) {
      continue;
    }
    auto output_info = op->attr()->Get(i)->data();
    int attr_size = static_cast<int>(output_info->size());
    if (attr_size >= buf_size) {  // need one spare byte for the terminator
      LOGE("attr size too big");
      return false;
    }
    for (int j = 0; j < attr_size; j++) {
      buf[j] = static_cast<char>(output_info->Get(j));
    }
    buf[attr_size] = 0;
    return true;
  }
  return false;
}
// Factory registered with the custom-kernel registry. Parses the mandatory
// "id" attribute (network segment index) and the optional "ForwardWithBbox"
// flag from the Custom primitive, then builds a CustomCPUKernel.
// Returns nullptr on any malformed input.
std::shared_ptr<mindspore::kernel::Kernel> CustomCreateKernel(const std::vector<MSTensor> &inputs,
                                                              const std::vector<MSTensor> &outputs,
                                                              const mindspore::schema::Primitive *primitive,
                                                              const mindspore::Context *ctx) {
  // fix: guard against a null primitive before dereferencing it.
  if (primitive == nullptr || primitive->value_type() != mindspore::schema::PrimitiveType_Custom) {
    LOGE("Primitive type is not PrimitiveType_Custom");
    return nullptr;
  }
  auto op = primitive->value_as_Custom();
  // fix: op->attr() is optional in the schema and may be null.
  if (op == nullptr || op->attr() == nullptr || op->attr()->size() < 1) {
    LOGE("There are at least 1 attribute of Custom");
    return nullptr;
  }
  int64_t ndims;
  bool forward_bbox = false;
  char *res = nullptr;
  char buf[kMaxSize];
  // "id" is mandatory: the segment of the WK model this kernel executes.
  if (GetCustomAttr(buf, kMaxSize, op, "id")) {
    res = nullptr;
    ndims = strtol(buf, &res, kDecimal);
    if ((*res) != 0) {
      LOGE("Get attr id data fail");
      return nullptr;
    }
  } else {
    LOGE("Custom op should have id");
    return nullptr;
  }
  // "ForwardWithBbox" is optional: a positive value enables ROI forwarding.
  if (GetCustomAttr(buf, kMaxSize, op, "ForwardWithBbox")) {
    res = nullptr;
    int64_t temp_val = strtol(buf, &res, kDecimal);
    if ((*res) != 0) {
      LOGE("Get attr ForwardWithBbox data fail");
      return nullptr;
    }
    if (temp_val > 0) {
      forward_bbox = true;
    }
  }
  auto kernel = std::make_shared<CustomCPUKernel>(ndims, forward_bbox, inputs, outputs, primitive, ctx);
  if (kernel == nullptr) {
    LOGE("new custom kernel is nullptr");
    return nullptr;
  }
  return kernel;
}
} // namespace nnie
} // namespace mindspore
namespace mindspore {
namespace registry {
namespace {
// Input data types accepted by the NNIE custom kernel.
const auto kFloat32 = DataType::kNumberTypeFloat32;
const auto kInt8 = DataType::kNumberTypeInt8;
const auto kUint8 = DataType::kNumberTypeUInt8;
}  // namespace
// Register CustomCreateKernel for CPU-dispatched NNIE custom ops, once per
// supported data type. NOTE(review): argument order follows
// REGISTER_CUSTOM_KERNEL in include/registry/register_kernel.h - verify
// against that header if the macro signature changes.
REGISTER_CUSTOM_KERNEL(CPU, NNIE, kFloat32, NNIE, nnie::CustomCreateKernel)
REGISTER_CUSTOM_KERNEL(CPU, NNIE, kInt8, NNIE, nnie::CustomCreateKernel)
REGISTER_CUSTOM_KERNEL(CPU, NNIE, kUint8, NNIE, nnie::CustomCreateKernel)
}  // namespace registry
}  // namespace mindspore

View File

@ -0,0 +1,66 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CUSTOM_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CUSTOM_H_
// NOTE(review): this guard macro looks copied from another file; consider
// renaming it to match this header's path under tools/benchmark/nnie.
#include <vector>
#include <string>
#include "include/schema/model_generated.h"
#include "include/context.h"
#include "include/api/kernel.h"
#include "src/custom_infer.h"
using mindspore::kernel::Kernel;
using mindspore::tensor::MSTensor;
namespace mindspore {
namespace nnie {
// Lite custom kernel that runs one segment of a HiSilicon NNIE WK model.
// The WK model is loaded lazily in Prepare() and shared between kernel
// instances through the static load_model_/run_seg_/roi_used_ state.
class CustomCPUKernel : public Kernel {
 public:
  // seg_id: index of the network segment this kernel executes.
  // forward_bbox: true when the segment consumes ROI/bbox input; setting it
  // marks the shared roi_used_ flag for the whole model.
  CustomCPUKernel(int seg_id, bool forward_bbox, const std::vector<MSTensor> &inputs,
                  const std::vector<MSTensor> &outputs, const mindspore::schema::Primitive *primitive,
                  const mindspore::Context *ctx)
      : Kernel(inputs, outputs, primitive, ctx), seg_id_(seg_id), forward_bbox_(forward_bbox) {
    if (forward_bbox) {
      roi_used_ = true;
    }
  }
  ~CustomCPUKernel() override;
  int Prepare() override;
  int ReSize() override;
  int Execute() override;
  // Accessors for the segment id and the ROI-forwarding flag.
  int seg_id(void) const { return seg_id_; }
  void set_seg_id(int id) { seg_id_ = id; }
  int forward_bbox(void) const { return forward_bbox_; }
  void set_forward_bbox(bool flag) { forward_bbox_ = flag; }

 private:
  static bool load_model_;  // true once the WK model has been loaded
  static int run_seg_;      // segment currently being executed
  static bool roi_used_;    // set when any instance forwards bbox/ROI data
  int seg_id_ = 0;
  bool forward_bbox_ = false;
  std::vector<std::vector<int64_t>> outputs_shapes_;  // cached output shapes
};
}  // namespace nnie
}  // namespace mindspore
#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CUSTOM_H_

View File

@ -0,0 +1,160 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/custom_infer.h"
#include <string>
#include <iostream>
#include "include/errorcode.h"
#include "src/nnie_print.h"
#include "include/api/format.h"
#include "include/registry/register_kernel_interface.h"
using mindspore::kernel::KernelInterface;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Custom;
namespace mindspore {
namespace nnie {
// Creates the shape-inference hook for the NNIE custom op.  Allocation uses
// std::nothrow so failure surfaces as a null pointer, not an exception.
std::shared_ptr<KernelInterface> CustomInferCreater() {
  auto *raw_infer = new (std::nothrow) CustomInterface();
  if (raw_infer == nullptr) {
    LOGE("new custom infer is nullptr");
    return nullptr;
  }
  return std::shared_ptr<KernelInterface>(raw_infer);
}
// Parses a serialized shape attribute of a Custom op into `shapes`.
// The attribute text is a comma-separated integer list laid out as repeated
// groups of: ndims, dim_0, ..., dim_{ndims-1}.
// Returns RET_ERROR when the attribute is missing, too large for the local
// buffer, or truncated/malformed.
int GetCustomShape(const mindspore::schema::Custom *op, const std::string &attr,
                   std::vector<std::vector<int64_t>> *shapes) {
  char buf[kMaxSize];
  bool has_outputs_shape = false;
  // Copy the attribute bytes into a NUL-terminated local buffer.
  for (size_t i = 0; i < op->attr()->size(); i++) {
    if (op->attr()->Get(i)->name()->str() == attr) {
      auto output_info = op->attr()->Get(i)->data();
      int attr_size = static_cast<int>(output_info->size());
      if (attr_size >= kMaxSize) {
        LOGE("attr size too big");
        return RET_ERROR;
      }
      for (int j = 0; j < attr_size; j++) {
        buf[j] = static_cast<char>(output_info->Get(j));
      }
      buf[attr_size] = 0;
      has_outputs_shape = true;
      break;
    }
  }
  if (!has_outputs_shape) {
    LOGE("Custom op don't have %s attr.", attr.c_str());
    return RET_ERROR;
  }
  char delims[] = ",";
  char *res = nullptr;
  char *save_ptr = nullptr;
  res = strtok_r(buf, delims, &save_ptr);
  while (res != nullptr) {
    int64_t ndims = strtol(res, &res, kDecimal);
    if (ndims < 0) {  // fix: a negative ndims would crash shape.resize()
      LOGE("attr %s has invalid ndims.", attr.c_str());
      return RET_ERROR;
    }
    std::vector<int64_t> shape;
    shape.resize(ndims);
    for (int64_t j = 0; j < ndims; j++) {
      res = strtok_r(NULL, delims, &save_ptr);
      if (res == nullptr) {  // fix: truncated attribute used to feed nullptr into strtol
        LOGE("attr %s is truncated.", attr.c_str());
        return RET_ERROR;
      }
      shape[j] = static_cast<int64_t>(strtol(res, &res, kDecimal));
    }
    shapes->push_back(shape);
    res = strtok_r(NULL, delims, &save_ptr);
  }
  return RET_OK;
}
// Shape inference for the NNIE Custom op. Reads the "inputs_shape" /
// "outputs_shape" attributes recorded at conversion time, validates them
// against the actual input tensors, supports resizing ONLY along the batch
// dimension (axis 0), and writes shape/dtype/format onto every output.
// NOTE(review): outputs_shape[i] is indexed for every output tensor without
// checking outputs_shape.size() == outputs->size() - confirm the converter
// guarantees a one-to-one match.
Status CustomInterface::Infer(std::vector<mindspore::MSTensor> *inputs, std::vector<mindspore::MSTensor> *outputs,
                              const mindspore::schema::Primitive *primitive) {
  if (inputs->empty()) {
    LOGE("Inputs size 0");
    return kLiteError;
  }
  if (outputs->empty()) {
    LOGE("Outputs size 0");
    return kLiteError;
  }
  if (primitive->value_type() != mindspore::schema::PrimitiveType_Custom) {
    LOGE("Primitive type is not PrimitiveType_Custom");
    return kLiteError;
  }
  auto op = primitive->value_as_Custom();
  if (op->attr()->size() < 1) {
    LOGE("There are at least 1 attribute of Custom");
    return kLiteError;
  }
  std::vector<std::vector<int64_t>> inputs_shape;
  if (GetCustomShape(op, "inputs_shape", &inputs_shape) != RET_OK) {
    LOGE("parser inputs_shape attribute err.");
    return kLiteError;
  }
  std::vector<std::vector<int64_t>> outputs_shape;
  if (GetCustomShape(op, "outputs_shape", &outputs_shape) != RET_OK) {
    LOGE("parser outputs_shape attribute err.");
    return kLiteError;
  }
  // The last input is the serialized WK model, hence size() - 1 real inputs.
  if (inputs_shape.size() != (inputs->size() - 1)) {
    LOGE("inputs num diff inputs_shape num.");
    return kLiteError;
  }
  if (inputs_shape[0].size() != (*inputs)[0].Shape().size()) {
    LOGE("shape size err.");
    return kLiteError;
  }
  // Detect a batch-size change on input 0; any other dimension mismatch is
  // rejected because NNIE only supports batch resizing.
  bool resize_flag = false;
  int resize_num = 1;
  for (size_t i = 0; i < inputs_shape[0].size(); i++) {
    if (inputs_shape[0][i] != (*inputs)[0].Shape()[i]) {
      if (i == 0) {
        resize_flag = true;
        resize_num = (*inputs)[0].Shape()[i];
      } else {
        LOGE("Custom of NNIE only support batch_num resize.");
        return kLiteError;
      }
    }
  }
  // Propagate the new batch size to every recorded output shape.
  if (resize_flag) {
    for (auto &output_shape : outputs_shape) {
      output_shape[0] = resize_num;
    }
  }
  for (size_t i = 0; i < outputs->size(); i++) {
    (*outputs)[i].SetShape(outputs_shape[i]);
    (*outputs)[i].SetDataType(DataType::kNumberTypeFloat32);
    (*outputs)[i].SetFormat(Format::NCHW);
  }
  return kSuccess;
}
} // namespace nnie
} // namespace mindspore
namespace mindspore {
namespace kernel {
REGISTER_CUSTOM_KERNEL_INTERFACE(NNIE, NNIE, nnie::CustomInferCreater);
} // namespace kernel
} // namespace mindspore

View File

@ -0,0 +1,35 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// fix: the previous guard macro (MINDSPORE_LITE_NNACL_CUSTOM_PARAMETER_H_)
// was copied from nnacl/custom_parameter.h; if both headers were included in
// one translation unit, the second would be silently skipped.
#ifndef MINDSPORE_LITE_TOOLS_BENCHMARK_NNIE_SRC_CUSTOM_INFER_H_
#define MINDSPORE_LITE_TOOLS_BENCHMARK_NNIE_SRC_CUSTOM_INFER_H_
#include <vector>
#include <memory>
#include "include/kernel_interface.h"
namespace mindspore {
namespace nnie {
// Shape-inference hook for the NNIE Custom op: parses the "inputs_shape" /
// "outputs_shape" attributes and propagates shapes to the output tensors.
class CustomInterface : public mindspore::kernel::KernelInterface {
 public:
  CustomInterface() {}
  ~CustomInterface() = default;
  Status Infer(std::vector<mindspore::MSTensor> *inputs, std::vector<mindspore::MSTensor> *outputs,
               const mindspore::schema::Primitive *primitive) override;
};
}  // namespace nnie
}  // namespace mindspore
#endif  // MINDSPORE_LITE_TOOLS_BENCHMARK_NNIE_SRC_CUSTOM_INFER_H_

View File

@ -0,0 +1,101 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/nnie_cfg_parser.h"
#include <climits>
#include <cstdlib>
#include <cstring>
#include <algorithm>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "include/errorcode.h"
#include "src/nnie_manager.h"
#include "src/nnie_print.h"
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
namespace mindspore {
namespace nnie {
namespace {
constexpr auto ENV_TIME_STEP = "TIME_STEP";
constexpr auto ENV_MAX_ROI_NUM = "MAX_ROI_NUM";
constexpr auto ENV_CORE_IDS = "CORE_IDS";
constexpr auto DELIM = ",";
constexpr int MAX_CORE_ID = 7;
} // namespace
// Populates the flags from environment variables, keeping the compiled-in
// defaults when a variable is unset or malformed:
//   TIME_STEP   - int, step num for rnn/lstm models.
//   MAX_ROI_NUM - int, per-picture ROI capacity.
//   CORE_IDS    - comma-separated NNIE core ids, each in [0, MAX_CORE_ID].
// A numeric value is accepted when it begins with a digit; scanning stops at
// the first non-digit, which is overwritten with '\0' in the env buffer.
void Flags::Init() {
  auto *time_step = std::getenv(ENV_TIME_STEP);
  if (time_step != nullptr) {
    auto iter = std::find_if(time_step, time_step + strlen(time_step), [](char val) { return val < '0' || val > '9'; });
    if (iter != time_step) {
      *iter = '\0';
      this->time_step_ = atoi(time_step);
    } else {
      // fix: was LOGE - an invalid value falls back to the default exactly
      // like MAX_ROI_NUM below, so it is a warning, not an error.
      LOGW("TIME_STEP ENV is invalid, now set to default value %d", this->time_step_);
    }
  } else {
    LOGW("TIME_STEP ENV is not set, now set to default value %d", this->time_step_);
  }
  auto *max_roi_num = std::getenv(ENV_MAX_ROI_NUM);
  if (max_roi_num != nullptr) {
    auto iter =
      std::find_if(max_roi_num, max_roi_num + strlen(max_roi_num), [](char val) { return val < '0' || val > '9'; });
    if (iter != max_roi_num) {
      *iter = '\0';
      this->max_roi_num_ = atoi(max_roi_num);
    } else {
      LOGW("MAX_ROI_NUM ENV is invalid, now set to default value %d", this->max_roi_num_);
    }
  } else {
    LOGW("MAX_ROI_NUM ENV is not set, now set to default value %d", this->max_roi_num_);
  }
  auto ids = std::getenv(ENV_CORE_IDS);
  if (ids != nullptr) {
    auto iter = std::find_if(ids, ids + strlen(ids), [](char val) { return (val < '0' || val > '9') && val != ','; });
    std::vector<int> core_ids;
    if (iter != ids) {
      *iter = '\0';
      char *saveptr;
      char *p = strtok_r(ids, DELIM, &saveptr);
      while (p != nullptr) {
        int id = atoi(p);
        p = strtok_r(NULL, DELIM, &saveptr);
        // Out-of-range ids are reported and skipped, not fatal.
        if (id > MAX_CORE_ID || id < 0) {
          LOGE("id is out of range");
          continue;
        }
        // Deduplicate so every core id appears at most once.
        if (std::find(core_ids.begin(), core_ids.end(), id) != core_ids.end()) {
          continue;
        }
        core_ids.push_back(id);
      }
    }
    if (!core_ids.empty()) {
      this->core_ids_ = core_ids;
    } else {
      std::string message =
        "CORE_IDS ENV is invalid, now set to default value {" + std::to_string(this->core_ids_.front()) + "}";
      LOGW(message.c_str());
    }
  } else {
    std::string message =
      "CORE_IDS ENV is not set, now set to default value {" + std::to_string(this->core_ids_.front()) + "}";
    LOGW(message.c_str());
  }
}
} // namespace nnie
} // namespace mindspore

View File

@ -0,0 +1,44 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_TOOLS_BENCHMARK_NNIE_NNIE_CFG_PARSER_H_
#define MINDSPORE_LITE_TOOLS_BENCHMARK_NNIE_NNIE_CFG_PARSER_H_
#include <vector>
namespace mindspore {
namespace nnie {
/**
 * Flags is a config container populated from environment variables.
 * Member objects:
 * 1.time_step_: step num only for rnn or lstm model. Default is 1.
 * 2.max_roi_num_: maximum number of ROI areas that a single picture
 *   supports; must be greater than 0. Default is 300.
 * 3.core_ids_: running kernels' ids; supports multi-core, separated by
 *   commas when setting, such as {0, 1, 2}. Each element must be an
 *   integer which meets the inequality 0 <= val < 8. Default is {0}.
 */
class Flags {
 public:
  Flags() = default;
  ~Flags() = default;
  // Reads TIME_STEP, MAX_ROI_NUM and CORE_IDS from the environment,
  // keeping the defaults above when a variable is unset or malformed.
  void Init();

 public:
  int time_step_{1};
  int max_roi_num_{300};
  std::vector<int> core_ids_{0};
};
}  // namespace nnie
}  // namespace mindspore
#endif  // MINDSPORE_LITE_TOOLS_BENCHMARK_NNIE_NNIE_CFG_PARSER_H_

View File

@ -0,0 +1,943 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/nnie_common.h"
#include "include/mpi_nnie.h"
#include "include/hi_type.h"
#include "include/errorcode.h"
#include "src/nnie_print.h"
#include "src/nnie_memory.h"
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
constexpr int kSleepUs = 100;
namespace mindspore {
namespace nnie {
// Frees the NNIE task and step auxiliary buffers owned by nnie_param, then
// zeroes the recorded addresses so a repeated release is a no-op.
static void NnieParamRelease(NnieParam *nnie_param) {
  if (nnie_param == nullptr) {
    return;
  }
  bool task_allocated = nnie_param->task_buf_.u64PhyAddr != 0 && nnie_param->task_buf_.u64VirAddr != 0;
  if (task_allocated) {
    NNIE_MEM_FREE(nnie_param->task_buf_.u64PhyAddr, nnie_param->task_buf_.u64VirAddr);
    nnie_param->task_buf_.u64PhyAddr = 0;
    nnie_param->task_buf_.u64VirAddr = 0;
  }
  bool step_allocated = nnie_param->step_buf_.u64PhyAddr != 0 && nnie_param->step_buf_.u64VirAddr != 0;
  if (step_allocated) {
    NNIE_MEM_FREE(nnie_param->step_buf_.u64PhyAddr, nnie_param->step_buf_.u64VirAddr);
    nnie_param->step_buf_.u64PhyAddr = 0;
    nnie_param->step_buf_.u64VirAddr = 0;
  }
}
// Returns true when `name` matches a destination (output) node of any net
// segment. Side effect: marks the matched node in mem_cfg_.dst_node_, which
// GetBlobMemSize later uses to skip re-allocating shared inner-connection
// memory.
bool CheckNnieInnerNode(const HI_CHAR *name, NnieParam *nnie_param) {
  for (HI_U32 i = 0; i < nnie_param->model_->u32NetSegNum; i++) {
    for (HI_U32 j = 0; j < nnie_param->model_->astSeg[i].u16DstNum; j++)
      if (strncmp(name, nnie_param->model_->astSeg[i].astDstNode[j].szName, SVP_NNIE_NODE_NAME_LEN) == 0) {
        nnie_param->mem_cfg_.seg_[i].dst_node_[j] = true;
        return true;
      }
  }
  return false;
}
bool ConnectNnieInnerNode(const HI_CHAR *name, NnieParam *nnie_param, SVP_SRC_BLOB_S *blob) {
for (HI_U32 i = 0; i < nnie_param->model_->u32NetSegNum; i++) {
for (HI_U32 j = 0; j < nnie_param->model_->astSeg[i].u16DstNum; j++)
if (strncmp(name, nnie_param->model_->astSeg[i].astDstNode[j].szName, SVP_NNIE_NODE_NAME_LEN) == 0) {
blob->u64PhyAddr = nnie_param->seg_data_[i].dst_[j].u64PhyAddr;
blob->u64VirAddr = nnie_param->seg_data_[i].dst_[j].u64VirAddr;
return true;
}
}
return false;
}
// Fills the per-segment forward control structures and blob descriptors from
// the loaded WK model: core assignment, src/dst node counts, and the shape /
// count of every input and output blob. Also records which src nodes are
// inner connections (outputs of an earlier segment) in mem_cfg_.
static void FillForwardInfo(NnieCfg *nnie_cfg, NnieParam *nnie_param) {
  HI_U32 i, j;
  HI_U32 num;
  // Clear all inner-connection flags before re-deriving them below.
  memset(&nnie_param->mem_cfg_, false, sizeof(NNIEMemCfg));
  for (i = 0; i < nnie_param->model_->u32NetSegNum; i++) {
    // ROI segments use the bbox-forward control block; CNN/recurrent
    // segments use the plain forward control block.
    if (SVP_NNIE_NET_TYPE_ROI == nnie_param->model_->astSeg[i].enNetType) {
      nnie_param->forward_with_bbox_ctrl_[i].enNnieId = nnie_cfg->nnie_core_id_[i];
      nnie_param->forward_with_bbox_ctrl_[i].u32SrcNum = nnie_param->model_->astSeg[i].u16SrcNum;
      nnie_param->forward_with_bbox_ctrl_[i].u32DstNum = nnie_param->model_->astSeg[i].u16DstNum;
      nnie_param->forward_with_bbox_ctrl_[i].u32ProposalNum = 1;
      nnie_param->forward_with_bbox_ctrl_[i].u32NetSegId = i;
    } else if (SVP_NNIE_NET_TYPE_CNN == nnie_param->model_->astSeg[i].enNetType ||
               SVP_NNIE_NET_TYPE_RECURRENT == nnie_param->model_->astSeg[i].enNetType) {
      nnie_param->forward_ctrl_[i].enNnieId = nnie_cfg->nnie_core_id_[i];
      nnie_param->forward_ctrl_[i].u32SrcNum = nnie_param->model_->astSeg[i].u16SrcNum;
      nnie_param->forward_ctrl_[i].u32DstNum = nnie_param->model_->astSeg[i].u16DstNum;
      nnie_param->forward_ctrl_[i].u32NetSegId = i;
    }
    // Describe each input blob of segment i.
    for (j = 0; j < nnie_param->model_->astSeg[i].u16SrcNum; j++) {
      if (i > 0) {
        // A src node fed by an earlier segment's output shares that memory.
        if (CheckNnieInnerNode(nnie_param->model_->astSeg[i].astSrcNode[j].szName, nnie_param)) {
          nnie_param->mem_cfg_.seg_[i].src_node_[j] = true;
        }
      }
      if (SVP_BLOB_TYPE_SEQ_S32 == nnie_param->model_->astSeg[i].astSrcNode[j].enType) {
        // Sequence (recurrent) blobs carry a dim + per-step address table.
        nnie_param->seg_data_[i].src_[j].enType = nnie_param->model_->astSeg[i].astSrcNode[j].enType;
        nnie_param->seg_data_[i].src_[j].unShape.stSeq.u32Dim =
          nnie_param->model_->astSeg[i].astSrcNode[j].unShape.u32Dim;
        nnie_param->seg_data_[i].src_[j].u32Num = nnie_cfg->max_input_num_;
        nnie_param->seg_data_[i].src_[j].unShape.stSeq.u64VirAddrStep =
          nnie_cfg->step_vir_addr_[i * NNIE_EACH_SEG_STEP_ADDR_NUM];
      } else {
        // Regular blobs carry a width/height/channel shape.
        nnie_param->seg_data_[i].src_[j].enType = nnie_param->model_->astSeg[i].astSrcNode[j].enType;
        nnie_param->seg_data_[i].src_[j].unShape.stWhc.u32Chn =
          nnie_param->model_->astSeg[i].astSrcNode[j].unShape.stWhc.u32Chn;
        nnie_param->seg_data_[i].src_[j].unShape.stWhc.u32Height =
          nnie_param->model_->astSeg[i].astSrcNode[j].unShape.stWhc.u32Height;
        nnie_param->seg_data_[i].src_[j].unShape.stWhc.u32Width =
          nnie_param->model_->astSeg[i].astSrcNode[j].unShape.stWhc.u32Width;
        nnie_param->seg_data_[i].src_[j].u32Num = nnie_cfg->max_input_num_;
      }
    }
    // ROI segments produce one result set per ROI per input picture.
    if (SVP_NNIE_NET_TYPE_ROI == nnie_param->model_->astSeg[i].enNetType) {
      num = nnie_cfg->max_roi_num_ * nnie_cfg->max_input_num_;
    } else {
      num = nnie_cfg->max_input_num_;
    }
    // Describe each output blob of segment i.
    for (j = 0; j < nnie_param->model_->astSeg[i].u16DstNum; j++) {
      if (SVP_BLOB_TYPE_SEQ_S32 == nnie_param->model_->astSeg[i].astDstNode[j].enType) {
        nnie_param->seg_data_[i].dst_[j].enType = nnie_param->model_->astSeg[i].astDstNode[j].enType;
        nnie_param->seg_data_[i].dst_[j].unShape.stSeq.u32Dim =
          nnie_param->model_->astSeg[i].astDstNode[j].unShape.u32Dim;
        nnie_param->seg_data_[i].dst_[j].u32Num = num;
        nnie_param->seg_data_[i].dst_[j].unShape.stSeq.u64VirAddrStep =
          nnie_cfg->step_vir_addr_[i * NNIE_EACH_SEG_STEP_ADDR_NUM + 1];
      } else {
        nnie_param->seg_data_[i].dst_[j].enType = nnie_param->model_->astSeg[i].astDstNode[j].enType;
        nnie_param->seg_data_[i].dst_[j].unShape.stWhc.u32Chn =
          nnie_param->model_->astSeg[i].astDstNode[j].unShape.stWhc.u32Chn;
        nnie_param->seg_data_[i].dst_[j].unShape.stWhc.u32Height =
          nnie_param->model_->astSeg[i].astDstNode[j].unShape.stWhc.u32Height;
        nnie_param->seg_data_[i].dst_[j].unShape.stWhc.u32Width =
          nnie_param->model_->astSeg[i].astDstNode[j].unShape.stWhc.u32Width;
        nnie_param->seg_data_[i].dst_[j].u32Num = num;
      }
    }
  }
}
// Computes the buffer size of each blob and accumulates it into *total_size.
// nnie_node/blob/blob_size are parallel arrays of node_num entries.
// total_step is the summed time-step count, used only for SEQ_S32
// (recurrent) blobs. `align32` selects 16- vs 32-byte row alignment
// (NOTE(review): despite its name it is compared against NNIE_ALIGN_16).
// When mem_alloc[i] is true the node is an inner connection whose memory is
// reused from the producing segment, so its size contribution is forced to 0.
// Side effect: writes the computed row stride into blob[i].u32Stride.
static void GetBlobMemSize(SVP_NNIE_NODE_S nnie_node[], HI_U32 node_num, HI_U32 total_step, SVP_BLOB_S blob[],
                           HI_U32 align32, HI_U32 *total_size, HI_U32 blob_size[], bool *mem_alloc = nullptr) {
  HI_U32 i = 0;
  HI_U32 size;
  HI_U32 stride;
  for (i = 0; i < node_num; i++) {
    // Element width: 4 bytes for the S32 blob families, 1 byte otherwise.
    if (SVP_BLOB_TYPE_S32 == nnie_node[i].enType || SVP_BLOB_TYPE_VEC_S32 == nnie_node[i].enType ||
        SVP_BLOB_TYPE_SEQ_S32 == nnie_node[i].enType) {
      size = sizeof(HI_U32);
    } else {
      size = sizeof(HI_U8);
    }
    if (SVP_BLOB_TYPE_SEQ_S32 == nnie_node[i].enType) {
      // Sequence blob: one aligned row of u32Dim elements per time step.
      if (NNIE_ALIGN_16 == align32) {
        stride = NNIE_ALIGN16(nnie_node[i].unShape.u32Dim * size);
      } else {
        stride = NNIE_ALIGN32(nnie_node[i].unShape.u32Dim * size);
      }
      blob_size[i] = total_step * stride;
    } else {
      // WHC blob: aligned row width x height x channels x batch count.
      if (NNIE_ALIGN_16 == align32) {
        stride = NNIE_ALIGN16(nnie_node[i].unShape.stWhc.u32Width * size);
      } else {
        stride = NNIE_ALIGN32(nnie_node[i].unShape.stWhc.u32Width * size);
      }
      blob_size[i] = blob[i].u32Num * stride * nnie_node[i].unShape.stWhc.u32Height * nnie_node[i].unShape.stWhc.u32Chn;
    }
    if (mem_alloc != nullptr) {
      if (mem_alloc[i]) {
        blob_size[i] = 0;  // shared inner-connection memory, nothing to allocate
      }
    }
    *total_size += blob_size[i];
    blob[i].u32Stride = stride;
  }
}
// Queries the per-segment NNIE task buffer sizes from the SDK, then adds the
// temp buffer and every segment's src/dst blob sizes to *total_size, which
// the caller uses for one consolidated allocation. Per-blob sizes come back
// in blob_size[]. Returns RET_ERROR if the SDK size query fails.
static int GetTaskAndBlobBufSize(NnieCfg *nnie_cfg, NnieParam *nnie_param, HI_U32 *total_task_buf_size,
                                 HI_U32 *tmp_buf_size, NnieBlobSize blob_size[], HI_U32 *total_size) {
  HI_S32 ret = HI_SUCCESS;
  HI_U32 i, j;
  HI_U32 total_step = 0;
  ret = HI_MPI_SVP_NNIE_GetTskBufSize(nnie_cfg->max_input_num_, nnie_cfg->max_roi_num_, nnie_param->model_,
                                      nnie_param->task_buf_size_, nnie_param->model_->u32NetSegNum);
  if (HI_SUCCESS != ret) {
    LOGE("HI_MPI_SVP_NNIE_GetTskBufSize");
    return RET_ERROR;
  }
  *total_task_buf_size = 0;
  for (i = 0; i < nnie_param->model_->u32NetSegNum; i++) {
    *total_task_buf_size += nnie_param->task_buf_size_[i];
  }
  *tmp_buf_size = nnie_param->model_->u32TmpBufSize;
  *total_size += *total_task_buf_size + *tmp_buf_size;
  for (i = 0; i < nnie_param->model_->u32NetSegNum; i++) {
    // For recurrent segments, sum the per-sequence step counts stored behind
    // the blob's step address table; SEQ_S32 blob sizes scale with it.
    if (SVP_NNIE_NET_TYPE_RECURRENT == nnie_param->model_->astSeg[i].enNetType) {
      for (j = 0; j < nnie_param->seg_data_[i].src_[0].u32Num; j++) {
        total_step += *(reinterpret_cast<HI_S32 *>(
                          static_cast<HI_UL>(nnie_param->seg_data_[i].src_[0].unShape.stSeq.u64VirAddrStep)) +
                        j);
      }
    }
    // Src blobs may be shared inner connections (mem_cfg_ flags); dst blobs
    // are always counted.
    GetBlobMemSize(&(nnie_param->model_->astSeg[i].astSrcNode[0]), nnie_param->model_->astSeg[i].u16SrcNum, total_step,
                   &(nnie_param->seg_data_[i].src_[0]), NNIE_ALIGN_16, total_size, &(blob_size[i].src_size_[0]),
                   &(nnie_param->mem_cfg_.seg_[i].src_node_[0]));
    GetBlobMemSize(&(nnie_param->model_->astSeg[i].astDstNode[0]), nnie_param->model_->astSeg[i].u16DstNum, total_step,
                   &(nnie_param->seg_data_[i].dst_[0]), NNIE_ALIGN_16, total_size, &(blob_size[i].dst_size_[0]));
  }
  return RET_OK;
}
// Allocates and wires all runtime memory for a loaded model:
//   1) queries task/tmp/blob sizes (GetTaskAndBlobBufSize),
//   2) adds space for a shared RPN bbox blob when any segment is ROI-typed,
//   3) makes ONE cached MMZ allocation covering everything, and
//   4) carves it into [per-segment task buffers][tmp buffer][per-node blobs][rpn bbox],
//      filling forward-control structs and blob addresses accordingly.
// Returns RET_ERROR when sizing, allocation, or inner-node wiring fails.
static int NnieParamInit(NnieCfg *nnie_cfg, NnieParam *nnie_param) {
  HI_U32 i, j;
  HI_U32 total_size = 0;
  HI_U32 total_task_buf_size = 0;
  HI_U32 tmp_buf_size_ = 0;
  HI_S32 ret = HI_SUCCESS;
  HI_U32 off_set = 0;
  HI_U64 phy_addr = 0;
  HI_U8 *vir_addr = nullptr;
  NnieBlobSize blob_size[SVP_NNIE_MAX_NET_SEG_NUM] = {0};
  FillForwardInfo(nnie_cfg, nnie_param);
  ret = GetTaskAndBlobBufSize(nnie_cfg, nnie_param, &total_task_buf_size, &tmp_buf_size_, blob_size, &total_size);
  if (HI_SUCCESS != ret) {
    LOGE("Error,Malloc memory failed! ");
    return RET_ERROR;
  }
  // Any ROI-type segment needs an extra shared blob for the proposal (RPN) boxes.
  bool has_roi = false;
  for (i = 0; i < nnie_param->model_->u32NetSegNum; i++) {
    if (SVP_NNIE_NET_TYPE_ROI == nnie_param->model_->astSeg[i].enNetType) {
      has_roi = true;
    }
  }
  if (has_roi) {
    nnie_param->rpn_bbox_.enType = SVP_BLOB_TYPE_S32;
    nnie_param->rpn_bbox_.unShape.stWhc.u32Chn = 1;
    nnie_param->rpn_bbox_.unShape.stWhc.u32Height = nnie_cfg->max_roi_num_;
    nnie_param->rpn_bbox_.unShape.stWhc.u32Width = NNIE_COORDI_NUM;
    nnie_param->rpn_bbox_.u32Stride = NNIE_ALIGN16(NNIE_COORDI_NUM * sizeof(HI_U32));
    nnie_param->rpn_bbox_.u32Num = nnie_cfg->max_input_num_;
    total_size +=
      nnie_param->rpn_bbox_.u32Num * nnie_param->rpn_bbox_.unShape.stWhc.u32Height * nnie_param->rpn_bbox_.u32Stride;
  }
  // One cached allocation for task buffers + tmp buffer + all blobs (+ rpn bbox tail).
  ret = NnieMemMallocCached(std::string("NNIE_NNIE_TASK").data(), nullptr, reinterpret_cast<HI_U64 *>(&phy_addr),
                            reinterpret_cast<void **>(&vir_addr), total_size);
  if (HI_SUCCESS != ret) {
    LOGE("Error,Malloc memory failed! ");
    return RET_ERROR;
  }
  memset(vir_addr, 0, total_size);
  NnieMemFlushCache(phy_addr, reinterpret_cast<void *>(vir_addr), total_size);
  nnie_param->task_buf_.u32Size = total_task_buf_size;
  nnie_param->task_buf_.u64PhyAddr = phy_addr;
  nnie_param->task_buf_.u64VirAddr = (HI_U64)(HI_UL)vir_addr;
  nnie_param->tmp_buf_.u32Size = tmp_buf_size_;
  nnie_param->tmp_buf_.u64PhyAddr = phy_addr + total_task_buf_size;
  nnie_param->tmp_buf_.u64VirAddr = (HI_U64)(HI_UL)vir_addr + total_task_buf_size;
  // Point each segment's forward-control struct at its slice of the task buffer.
  for (i = 0; i < nnie_param->model_->u32NetSegNum; i++) {
    if (SVP_NNIE_NET_TYPE_ROI == nnie_param->model_->astSeg[i].enNetType) {
      nnie_param->forward_with_bbox_ctrl_[i].stTmpBuf = nnie_param->tmp_buf_;
      nnie_param->forward_with_bbox_ctrl_[i].stTskBuf.u64PhyAddr = nnie_param->task_buf_.u64PhyAddr + off_set;
      nnie_param->forward_with_bbox_ctrl_[i].stTskBuf.u64VirAddr = nnie_param->task_buf_.u64VirAddr + off_set;
      nnie_param->forward_with_bbox_ctrl_[i].stTskBuf.u32Size = nnie_param->task_buf_size_[i];
    } else if (SVP_NNIE_NET_TYPE_CNN == nnie_param->model_->astSeg[i].enNetType ||
               SVP_NNIE_NET_TYPE_RECURRENT == nnie_param->model_->astSeg[i].enNetType) {
      nnie_param->forward_ctrl_[i].stTmpBuf = nnie_param->tmp_buf_;
      nnie_param->forward_ctrl_[i].stTskBuf.u64PhyAddr = nnie_param->task_buf_.u64PhyAddr + off_set;
      nnie_param->forward_ctrl_[i].stTskBuf.u64VirAddr = nnie_param->task_buf_.u64VirAddr + off_set;
      nnie_param->forward_ctrl_[i].stTskBuf.u32Size = nnie_param->task_buf_size_[i];
    }
    off_set += nnie_param->task_buf_size_[i];
  }
  // Assign per-node blob addresses in the region behind the task/tmp buffers.
  phy_addr = phy_addr + total_task_buf_size + tmp_buf_size_;
  vir_addr = vir_addr + total_task_buf_size + tmp_buf_size_;
  for (i = 0; i < nnie_param->model_->u32NetSegNum; i++) {
    for (j = 0; j < nnie_param->model_->astSeg[i].u16SrcNum; j++) {
      if (j != 0) {
        phy_addr += blob_size[i].src_size_[j - 1];
        vir_addr += blob_size[i].src_size_[j - 1];
      }
      if (nnie_param->mem_cfg_.seg_[i].src_node_[j]) {
        // This input is fed by another segment's output: share that blob instead
        // of assigning fresh memory.
        if (!ConnectNnieInnerNode(nnie_param->model_->astSeg[i].astSrcNode[j].szName, nnie_param,
                                  &(nnie_param->seg_data_[i].src_[j]))) {
          LOGE("ConnectNnieInnerNode failed! ");
          return RET_ERROR;
        }
      } else {
        nnie_param->seg_data_[i].src_[j].u64PhyAddr = phy_addr;
        nnie_param->seg_data_[i].src_[j].u64VirAddr = (HI_U64)(HI_UL)vir_addr;
      }
    }
    // NOTE(review): uses j - 1 after the loop, so this relies on u16SrcNum >= 1
    // (and u16DstNum >= 1 below); j - 1 would underflow for an empty segment.
    phy_addr += blob_size[i].src_size_[j - 1];
    vir_addr += blob_size[i].src_size_[j - 1];
    for (j = 0; j < nnie_param->model_->astSeg[i].u16DstNum; j++) {
      if (j != 0) {
        phy_addr += blob_size[i].dst_size_[j - 1];
        vir_addr += blob_size[i].dst_size_[j - 1];
      }
      nnie_param->seg_data_[i].dst_[j].u64PhyAddr = phy_addr;
      nnie_param->seg_data_[i].dst_[j].u64VirAddr = (HI_U64)(HI_UL)vir_addr;
    }
    phy_addr += blob_size[i].dst_size_[j - 1];
    vir_addr += blob_size[i].dst_size_[j - 1];
  }
  // The RPN bbox blob occupies the tail of the allocation.
  if (has_roi) {
    nnie_param->rpn_bbox_.u64PhyAddr = phy_addr;
    nnie_param->rpn_bbox_.u64VirAddr = (HI_U64)((HI_UL)vir_addr);
  }
  return RET_OK;
}
static int NnieLoadModel(char *model_buf, int size, NnieModel *nnie_model) {
HI_S32 ret = HI_INVALID_VALUE;
HI_U64 phy_addr = 0;
HI_U8 *vir_addr = nullptr;
ret = NnieMemMalloc(std::string("NNIE_NNIE_MODEL").data(), nullptr, reinterpret_cast<HI_U64 *>(&phy_addr),
reinterpret_cast<void **>(&vir_addr), size);
if (HI_SUCCESS != ret) {
LOGE("Error,Malloc memory failed! ");
return RET_ERROR;
}
nnie_model->model_buf_.u32Size = (HI_U32)size;
nnie_model->model_buf_.u64PhyAddr = phy_addr;
nnie_model->model_buf_.u64VirAddr = (HI_U64)(HI_UL)vir_addr;
memcpy(vir_addr, model_buf, size);
ret = HI_MPI_SVP_NNIE_LoadModel(&nnie_model->model_buf_, &nnie_model->model_);
if (HI_SUCCESS != ret) {
NNIE_MEM_FREE(nnie_model->model_buf_.u64PhyAddr, nnie_model->model_buf_.u64VirAddr);
nnie_model->model_buf_.u32Size = 0;
LOGE("HI_MPI_SVP_NNIE_LoadModel failed!");
return RET_ERROR;
}
return RET_OK;
}
// Releases the MMZ buffer that backs a loaded WK model. Safe to call with nullptr and
// idempotent: both addresses are zeroed after the free, so a second call is a no-op.
static void NnieUnloadModel(NnieModel *nnie_model) {
  if (nnie_model == nullptr) {
    return;
  }
  const bool owns_buffer = (nnie_model->model_buf_.u64PhyAddr != 0) && (nnie_model->model_buf_.u64VirAddr != 0);
  if (owns_buffer) {
    NNIE_MEM_FREE(nnie_model->model_buf_.u64PhyAddr, nnie_model->model_buf_.u64VirAddr);
    nnie_model->model_buf_.u64PhyAddr = 0;
    nnie_model->model_buf_.u64VirAddr = 0;
  }
}
// Runs one CNN/RECURRENT segment. Flushes the task buffer and all destination blobs so
// the hardware sees coherent memory, issues HI_MPI_SVP_NNIE_Forward, polls until the job
// finishes (when `instant`), then flushes the destinations again before the CPU reads
// the results.
static int NnieForward(NnieParam *nnie_param, NnieDataIndex *input_data_idx, HI_BOOL instant) {
  HI_S32 ret = HI_SUCCESS;
  HI_U32 i, j;
  HI_BOOL finish = HI_FALSE;
  SVP_NNIE_HANDLE svp_nnie_handle = 0;
  HI_U32 total_step_num = 0;
  SVP_NNIE_FORWARD_CTRL_S *forward_handle = &nnie_param->forward_ctrl_[input_data_idx->seg_idx_];
  NnieSegData *seg_data = &nnie_param->seg_data_[input_data_idx->seg_idx_];
  NnieMemFlushCache(forward_handle->stTskBuf.u64PhyAddr,
                    NNIE_CONVERT_64BIT_ADDR(HI_VOID, forward_handle->stTskBuf.u64VirAddr),
                    forward_handle->stTskBuf.u32Size);
  for (i = 0; i < forward_handle->u32DstNum; i++) {
    if (SVP_BLOB_TYPE_SEQ_S32 == seg_data->dst_[i].enType) {
      // Sequence blobs: flushed size = sum of per-sample step counts * stride.
      // NOTE(review): total_step_num is not reset between blobs in this loop --
      // confirm intent for models with more than one SEQ output.
      for (j = 0; j < seg_data->dst_[i].u32Num; j++) {
        total_step_num += *(NNIE_CONVERT_64BIT_ADDR(HI_U32, seg_data->dst_[i].unShape.stSeq.u64VirAddrStep) + j);
      }
      NnieMemFlushCache(seg_data->dst_[i].u64PhyAddr, NNIE_CONVERT_64BIT_ADDR(HI_VOID, seg_data->dst_[i].u64VirAddr),
                        total_step_num * seg_data->dst_[i].u32Stride);
    } else {
      NnieMemFlushCache(seg_data->dst_[i].u64PhyAddr, NNIE_CONVERT_64BIT_ADDR(HI_VOID, seg_data->dst_[i].u64VirAddr),
                        seg_data->dst_[i].u32Num * seg_data->dst_[i].unShape.stWhc.u32Chn *
                          seg_data->dst_[i].unShape.stWhc.u32Height * seg_data->dst_[i].u32Stride);
    }
  }
  ret = HI_MPI_SVP_NNIE_Forward(&svp_nnie_handle, seg_data->src_, nnie_param->model_, seg_data->dst_, forward_handle,
                                instant);
  if (HI_SUCCESS != ret) {
    LOGE("HI_MPI_SVP_NNIE_Forward failed!");
    return RET_ERROR;
  }
  if (instant) {
    // Poll (with a short sleep) until the hardware reports completion.
    while (HI_ERR_SVP_NNIE_QUERY_TIMEOUT ==
           (ret = HI_MPI_SVP_NNIE_Query(forward_handle->enNnieId, svp_nnie_handle, &finish, HI_TRUE))) {
      usleep(kSleepUs);
    }
  }
  total_step_num = 0;
  // Flush destination blobs once more so the CPU reads the hardware-written results.
  for (i = 0; i < forward_handle->u32DstNum; i++) {
    if (SVP_BLOB_TYPE_SEQ_S32 == seg_data->dst_[i].enType) {
      for (j = 0; j < seg_data->dst_[i].u32Num; j++) {
        total_step_num += *(NNIE_CONVERT_64BIT_ADDR(HI_U32, seg_data->dst_[i].unShape.stSeq.u64VirAddrStep) + j);
      }
      NnieMemFlushCache(seg_data->dst_[i].u64PhyAddr, NNIE_CONVERT_64BIT_ADDR(HI_VOID, seg_data->dst_[i].u64VirAddr),
                        total_step_num * seg_data->dst_[i].u32Stride);
    } else {
      NnieMemFlushCache(seg_data->dst_[i].u64PhyAddr, NNIE_CONVERT_64BIT_ADDR(HI_VOID, seg_data->dst_[i].u64VirAddr),
                        seg_data->dst_[i].u32Num * seg_data->dst_[i].unShape.stWhc.u32Chn *
                          seg_data->dst_[i].unShape.stWhc.u32Height * seg_data->dst_[i].u32Stride);
    }
  }
  return RET_OK;
}
// Runs one ROI-type segment, feeding the RPN proposal boxes in astBbox. Mirrors
// NnieForward: flush the task buffer and destination blobs, launch the job, poll for
// completion (when bInstant), then flush the destinations again for CPU visibility.
// Naming note: keeps the vendor-sample Hungarian style (pstNnieParam etc.).
static HI_S32 NNIE_ForwardWithBbox(NnieParam *pstNnieParam, NnieDataIndex *pstInputDataIdx, SVP_SRC_BLOB_S astBbox[],
                                   HI_BOOL bInstant) {
  HI_S32 ret = HI_SUCCESS;
  HI_BOOL finish = HI_FALSE;
  SVP_NNIE_HANDLE svp_nnie_handle = 0;
  HI_U32 total_step_num = 0;
  HI_U32 i, j;
  NnieMemFlushCache(pstNnieParam->forward_with_bbox_ctrl_[pstInputDataIdx->seg_idx_].stTskBuf.u64PhyAddr,
                    NNIE_CONVERT_64BIT_ADDR(
                      HI_VOID, pstNnieParam->forward_with_bbox_ctrl_[pstInputDataIdx->seg_idx_].stTskBuf.u64VirAddr),
                    pstNnieParam->forward_with_bbox_ctrl_[pstInputDataIdx->seg_idx_].stTskBuf.u32Size);
  for (i = 0; i < pstNnieParam->forward_with_bbox_ctrl_[pstInputDataIdx->seg_idx_].u32DstNum; i++) {
    if (SVP_BLOB_TYPE_SEQ_S32 == pstNnieParam->seg_data_[pstInputDataIdx->seg_idx_].dst_[i].enType) {
      // Sequence blobs: flushed size = sum of per-sample step counts * stride.
      // NOTE(review): total_step_num accumulates across blobs here (see NnieForward).
      for (j = 0; j < pstNnieParam->seg_data_[pstInputDataIdx->seg_idx_].dst_[i].u32Num; j++) {
        total_step_num +=
          *(NNIE_CONVERT_64BIT_ADDR(
              HI_U32, pstNnieParam->seg_data_[pstInputDataIdx->seg_idx_].dst_[i].unShape.stSeq.u64VirAddrStep) +
            j);
      }
      NnieMemFlushCache(
        pstNnieParam->seg_data_[pstInputDataIdx->seg_idx_].dst_[i].u64PhyAddr,
        NNIE_CONVERT_64BIT_ADDR(HI_VOID, pstNnieParam->seg_data_[pstInputDataIdx->seg_idx_].dst_[i].u64VirAddr),
        total_step_num * pstNnieParam->seg_data_[pstInputDataIdx->seg_idx_].dst_[i].u32Stride);
    } else {
      NnieMemFlushCache(
        pstNnieParam->seg_data_[pstInputDataIdx->seg_idx_].dst_[i].u64PhyAddr,
        NNIE_CONVERT_64BIT_ADDR(HI_VOID, pstNnieParam->seg_data_[pstInputDataIdx->seg_idx_].dst_[i].u64VirAddr),
        pstNnieParam->seg_data_[pstInputDataIdx->seg_idx_].dst_[i].u32Num *
          pstNnieParam->seg_data_[pstInputDataIdx->seg_idx_].dst_[i].unShape.stWhc.u32Chn *
          pstNnieParam->seg_data_[pstInputDataIdx->seg_idx_].dst_[i].unShape.stWhc.u32Height *
          pstNnieParam->seg_data_[pstInputDataIdx->seg_idx_].dst_[i].u32Stride);
    }
  }
  ret =
    HI_MPI_SVP_NNIE_ForwardWithBbox(&svp_nnie_handle, pstNnieParam->seg_data_[pstInputDataIdx->seg_idx_].src_, astBbox,
                                    pstNnieParam->model_, pstNnieParam->seg_data_[pstInputDataIdx->seg_idx_].dst_,
                                    &pstNnieParam->forward_with_bbox_ctrl_[pstInputDataIdx->seg_idx_], bInstant);
  if (HI_SUCCESS != ret) {
    LOGE("HI_MPI_SVP_NNIE_ForwardWithBbox failed!");
    return RET_ERROR;
  }
  if (bInstant) {
    // Poll until the hardware reports completion.
    while (HI_ERR_SVP_NNIE_QUERY_TIMEOUT ==
           (ret = HI_MPI_SVP_NNIE_Query(pstNnieParam->forward_with_bbox_ctrl_[pstInputDataIdx->seg_idx_].enNnieId,
                                        svp_nnie_handle, &finish, HI_TRUE))) {
      usleep(kSleepUs);
      LOGE("HI_MPI_SVP_NNIE_Query Query timeout!");
    }
  }
  total_step_num = 0;
  // Flush destination blobs once more so the CPU reads the hardware-written results.
  for (i = 0; i < pstNnieParam->forward_with_bbox_ctrl_[pstInputDataIdx->seg_idx_].u32DstNum; i++) {
    if (SVP_BLOB_TYPE_SEQ_S32 == pstNnieParam->seg_data_[pstInputDataIdx->seg_idx_].dst_[i].enType) {
      for (j = 0; j < pstNnieParam->seg_data_[pstInputDataIdx->seg_idx_].dst_[i].u32Num; j++) {
        total_step_num +=
          *(NNIE_CONVERT_64BIT_ADDR(
              HI_U32, pstNnieParam->seg_data_[pstInputDataIdx->seg_idx_].dst_[i].unShape.stSeq.u64VirAddrStep) +
            j);
      }
      NnieMemFlushCache(
        pstNnieParam->seg_data_[pstInputDataIdx->seg_idx_].dst_[i].u64PhyAddr,
        NNIE_CONVERT_64BIT_ADDR(HI_VOID, pstNnieParam->seg_data_[pstInputDataIdx->seg_idx_].dst_[i].u64VirAddr),
        total_step_num * pstNnieParam->seg_data_[pstInputDataIdx->seg_idx_].dst_[i].u32Stride);
    } else {
      NnieMemFlushCache(
        pstNnieParam->seg_data_[pstInputDataIdx->seg_idx_].dst_[i].u64PhyAddr,
        NNIE_CONVERT_64BIT_ADDR(HI_VOID, pstNnieParam->seg_data_[pstInputDataIdx->seg_idx_].dst_[i].u64VirAddr),
        pstNnieParam->seg_data_[pstInputDataIdx->seg_idx_].dst_[i].u32Num *
          pstNnieParam->seg_data_[pstInputDataIdx->seg_idx_].dst_[i].unShape.stWhc.u32Chn *
          pstNnieParam->seg_data_[pstInputDataIdx->seg_idx_].dst_[i].unShape.stWhc.u32Height *
          pstNnieParam->seg_data_[pstInputDataIdx->seg_idx_].dst_[i].u32Stride);
    }
  }
  return ret;
}
// Copies `num` rows of `width` bytes from the tightly-packed source buffer into the
// stride-aligned destination blob (stride >= width; the padding bytes are untouched).
// input_size must equal num * width (total payload bytes), otherwise RET_ERROR.
// src advances by width per row, dst by the blob stride.
int FillByUnsignedChar(HI_U32 input_size, HI_U32 num, HI_U32 width, HI_U32 stride, HI_U8 *src, HI_U8 *dst) {
  HI_U32 i;
  if (input_size != num * width) {
    LOGE("input size error:%d <-> %d.", input_size, num * width);
    return RET_ERROR;
  }
  for (i = 0; i < num; i++) {
    // Rows are contiguous in src, so each row is a single block copy instead of a
    // byte-by-byte loop.
    memcpy(dst, src, width);
    dst += stride;
    src += width;
  }
  return RET_OK;
}
// Quantizes a packed float buffer into the S32 destination blob (value * NNIE_QUANT_BASE),
// honoring the blob's byte stride. dst and dst_u8 must alias the same destination
// address; dst_u8 exists for byte-granular stride stepping.
// Returns RET_ERROR when input_size differs from num * width.
int FillByFloat(HI_U32 input_size, HI_U32 num, HI_U32 width, HI_U32 stride, HI_FLOAT *src, HI_S32 *dst, HI_U8 *dst_u8) {
  if (input_size != num * width) {
    LOGE("input size error:%d <-> %d.", input_size, num * width);
    return RET_ERROR;
  }
  for (HI_U32 row = 0; row < num; ++row) {
    for (HI_U32 col = 0; col < width; ++col) {
      dst[col] = (src[col] * NNIE_QUANT_BASE);
    }
    // Step the destination by its byte stride, then rebind the S32 view onto it.
    dst_u8 += stride;
    dst = reinterpret_cast<HI_S32 *>(dst_u8);
    src += width;
  }
  return RET_OK;
}
// Copies/quantizes one framework input tensor (nnie_cfg->data_ptr_) into the source blob
// selected by input_data_idx. `shape`/`size` describe the incoming tensor; its element
// count must match the blob geometry. SEQ blobs take float data quantized per time step;
// image blobs (U8/YVU420SP/YVU422SP) take raw bytes; every other blob takes float data
// quantized to S32. Flushes the blob cache afterwards so the hardware sees the data.
static int NnieFillSrcData(NnieCfg *nnie_cfg, NnieParam *nnie_param, NnieDataIndex *input_data_idx, int64_t *shape,
                           int size) {
  HI_U32 i, j, n, ret;
  HI_U32 height, width, channel, stride, dim;
  HI_U8 *input_addr_u8 = nullptr;
  HI_S32 *input_addr_s32 = nullptr;
  HI_U32 *step_addr_u32 = nullptr;
  HI_FLOAT *float_src_data = nullptr;
  HI_U8 *u8_src_data = nullptr;
  HI_U32 total_step_num = 0;
  HI_U32 input_size = 1;
  SVP_SRC_BLOB_S *blob = &nnie_param->seg_data_[input_data_idx->seg_idx_].src_[input_data_idx->node_idx_];
  // Total element count of the incoming tensor.
  for (n = 0; n < (HI_U32)size; n++) {
    input_size *= shape[n];
  }
  input_addr_u8 = NNIE_CONVERT_64BIT_ADDR(HI_U8, blob->u64VirAddr);
  input_addr_s32 = NNIE_CONVERT_64BIT_ADDR(HI_S32, blob->u64VirAddr);
  float_src_data = reinterpret_cast<float *>(nnie_cfg->data_ptr_);
  u8_src_data = reinterpret_cast<unsigned char *>(nnie_cfg->data_ptr_);
  if (SVP_BLOB_TYPE_SEQ_S32 == blob->enType) {
    // Recurrent input: quantize dim floats per time step, stepping by the blob stride.
    step_addr_u32 = NNIE_CONVERT_64BIT_ADDR(HI_U32, blob->unShape.stSeq.u64VirAddrStep);
    dim = blob->unShape.stSeq.u32Dim;
    stride = blob->u32Stride;
    for (n = 0; n < blob->u32Num; n++) {
      total_step_num += *(step_addr_u32 + n);
    }
    if (input_size != total_step_num * dim) {
      LOGE("input size error:%d <-> %d.", input_size, total_step_num * dim);
      return RET_ERROR;
    }
    for (n = 0; n < blob->u32Num; n++) {
      for (i = 0; i < *(step_addr_u32 + n); i++) {
        for (j = 0; j < dim; j++) {
          input_addr_s32[j] = (float_src_data[j] * NNIE_QUANT_BASE);
        }
        input_addr_u8 += stride;
        input_addr_s32 = reinterpret_cast<HI_S32 *>(input_addr_u8);
        float_src_data += dim;
      }
    }
    NnieMemFlushCache(blob->u64PhyAddr, NNIE_CONVERT_64BIT_ADDR(HI_VOID, blob->u64VirAddr), total_step_num * stride);
  } else {
    height = blob->unShape.stWhc.u32Height;
    width = blob->unShape.stWhc.u32Width;
    channel = blob->unShape.stWhc.u32Chn;
    stride = blob->u32Stride;
    if (SVP_BLOB_TYPE_YVU420SP == blob->enType) {
      // YVU420SP: chroma plane has half the vertical resolution.
      ret = FillByUnsignedChar(input_size, blob->u32Num * static_cast<HI_U32>(channel * height / 2), width, stride,
                               u8_src_data, input_addr_u8);
    } else if (SVP_BLOB_TYPE_YVU422SP == blob->enType) {
      ret = FillByUnsignedChar(input_size, blob->u32Num * height * 2, width, stride, u8_src_data, input_addr_u8);
    } else {
      if (SVP_BLOB_TYPE_U8 == blob->enType) {
        ret =
          FillByUnsignedChar(input_size, blob->u32Num * channel * height, width, stride, u8_src_data, input_addr_u8);
      } else {
        // Default: float input quantized to Q12 fixed point.
        ret = FillByFloat(input_size, blob->u32Num * channel * height, width, stride, float_src_data, input_addr_s32,
                          input_addr_u8);
      }
    }
    if (ret != RET_OK) {
      return ret;
    }
    NnieMemFlushCache(blob->u64PhyAddr, NNIE_CONVERT_64BIT_ADDR(HI_VOID, blob->u64VirAddr),
                      blob->u32Num * channel * height * stride);
  }
  return RET_OK;
}
// Dequantizes one destination blob of the segment that just ran (seg_idx_ - 1) into the
// caller's float buffer (nnie_cfg->data_ptr_), dividing each S32 value by
// NNIE_QUANT_BASE. Rejects image-typed outputs and validates that the requested tensor
// shape (`shape`/`size`) matches the blob geometry.
static int NnieGetDstData(NnieCfg *nnie_cfg, NnieParam *nnie_param, NnieDataIndex *input_data_idx, int64_t *shape,
                          int size) {
  HI_U32 i, j, n;
  HI_U32 height, width, channel, stride, dim;
  HI_U8 *output_addr_u8 = nullptr;
  HI_S32 *output_addr_s32 = nullptr;
  HI_U32 *step_addr_u32 = nullptr;
  HI_FLOAT *float_dst_data = nullptr;
  HI_U32 total_step_num = 0;
  HI_U32 input_num = 1;
  // seg_idx_ points one past the executed segment, hence the - 1.
  SVP_SRC_BLOB_S *blob = &nnie_param->seg_data_[input_data_idx->seg_idx_ - 1].dst_[input_data_idx->node_idx_];
  for (n = 0; n < (HI_U32)size; n++) {
    input_num *= shape[n];
  }
  // Image-typed blobs (U8 .. YVU422SP) are not valid network outputs here.
  if (SVP_BLOB_TYPE_U8 <= blob->enType && SVP_BLOB_TYPE_YVU422SP >= blob->enType) {
    LOGE("Nnie output type error");
    return RET_ERROR;
  }
  output_addr_u8 = NNIE_CONVERT_64BIT_ADDR(HI_U8, blob->u64VirAddr);
  output_addr_s32 = NNIE_CONVERT_64BIT_ADDR(HI_S32, blob->u64VirAddr);
  float_dst_data = reinterpret_cast<float *>(nnie_cfg->data_ptr_);
  if (SVP_BLOB_TYPE_SEQ_S32 == blob->enType) {
    // Recurrent output: dim values per time step, stepping by the blob stride.
    dim = blob->unShape.stSeq.u32Dim;
    stride = blob->u32Stride;
    step_addr_u32 = NNIE_CONVERT_64BIT_ADDR(HI_U32, blob->unShape.stSeq.u64VirAddrStep);
    for (n = 0; n < blob->u32Num; n++) {
      total_step_num += *(step_addr_u32 + n);
    }
    if (input_num != total_step_num * dim) {
      LOGE("input shape");
      return RET_ERROR;
    }
    for (n = 0; n < blob->u32Num; n++) {
      for (i = 0; i < *(step_addr_u32 + n); i++) {
        for (j = 0; j < dim; j++) {
          float_dst_data[j] = (HI_FLOAT)output_addr_s32[j] / NNIE_QUANT_BASE;
        }
        output_addr_u8 += stride;
        output_addr_s32 = reinterpret_cast<HI_S32 *>(output_addr_u8);
        float_dst_data += dim;
      }
    }
  } else {
    height = blob->unShape.stWhc.u32Height;
    width = blob->unShape.stWhc.u32Width;
    channel = blob->unShape.stWhc.u32Chn;
    stride = blob->u32Stride;
    if (input_num != height * channel * width * blob->u32Num) {
      LOGE("output shape diff:%d<->%d.", input_num, height * channel * width * blob->u32Num);
      return RET_ERROR;
    }
    // Dequantize row by row; only width values per row are payload, the rest is stride
    // padding.
    for (n = 0; n < blob->u32Num; n++) {
      for (i = 0; i < channel * height; i++) {
        for (j = 0; j < width; j++) {
          float_dst_data[j] = (HI_FLOAT)output_addr_s32[j] / NNIE_QUANT_BASE;
        }
        output_addr_u8 += stride;
        output_addr_s32 = reinterpret_cast<HI_S32 *>(output_addr_u8);
        float_dst_data += width;
      }
    }
  }
  return RET_OK;
}
// Validates that the framework input tensor shape matches the WK model's input node and
// derives cfg_.max_input_num_ (batch) from the tensor's outer dimension.
// input_shape: shape from the framework; dim 0 is treated as N (batch / total steps).
// Returns RET_ERROR when the per-sample element count does not match the node, or when
// the time-step configuration is missing/inconsistent for recurrent (SEQ) inputs.
int CheckMsShapeN(NnieRunCfg *nnie_run_cfg, const std::vector<int64_t> &input_shape, const SVP_NNIE_NODE_S &nnie_node) {
  size_t ms_input_size = 1, i;
  // Element count of a single sample (all dims except N).
  for (i = 1; i < input_shape.size(); i++) {
    ms_input_size *= input_shape[i];
  }
  size_t nnie_input_size;
  if (SVP_BLOB_TYPE_SEQ_S32 == nnie_node.enType) {
    if (nnie_run_cfg->cfg_.step_ == 0) {
      LOGE("request time_step set! Please export NNIE_RUNTIME_CONFIG_PATH");
      return RET_ERROR;
    }
    if (ms_input_size != nnie_node.unShape.u32Dim) {
      LOGE("The input data does not meet the required size %d <-> %d.", static_cast<int>(ms_input_size),
           nnie_node.unShape.u32Dim);
      return RET_ERROR;
    }
    if ((input_shape[0] < static_cast<int>(nnie_run_cfg->cfg_.step_)) ||
        (input_shape[0] % nnie_run_cfg->cfg_.step_ != 0)) {
      // Hard failure (RET_ERROR follows), so log as an error like the other paths.
      LOGE("The num value(%d) of input must be an integer multiple of time_step(%d)", static_cast<int>(input_shape[0]),
           nnie_run_cfg->cfg_.step_);
      return RET_ERROR;
    }
    nnie_input_size = nnie_node.unShape.u32Dim * nnie_run_cfg->cfg_.step_;
  } else {
    auto height = nnie_node.unShape.stWhc.u32Height;
    auto width = nnie_node.unShape.stWhc.u32Width;
    auto channel = nnie_node.unShape.stWhc.u32Chn;
    if (SVP_BLOB_TYPE_YVU420SP == nnie_node.enType) {
      // YVU420SP: chroma plane has half the vertical resolution.
      nnie_input_size = static_cast<HI_U32>(channel * height / 2) * width;
    } else if (SVP_BLOB_TYPE_YVU422SP == nnie_node.enType) {
      nnie_input_size = height * 2 * width;
    } else {
      nnie_input_size = channel * height * width;
    }
    if (ms_input_size != nnie_input_size) {
      LOGE("The input data does not meet the required size %d <-> %d.", static_cast<int>(ms_input_size),
           static_cast<int>(nnie_input_size));
      return RET_ERROR;
    }
  }
  nnie_run_cfg->cfg_.max_input_num_ = (ms_input_size * input_shape[0]) / nnie_input_size;
  fprintf(stdout, "The input num is %d.\n", nnie_run_cfg->cfg_.max_input_num_);
  return RET_OK;
}
// Finds the index of the tensor in inputs[0, input_size) whose name matches the WK node
// name `name`. Converter-added "_pre"/"_post" suffixes on tensor names are stripped
// before comparison. Pass 1 requires an exact match; if none is found, pass 2 accepts a
// prefix match (the stripped tensor name is a prefix of the node name).
// Returns input_size when no tensor matches.
size_t GetFillIndex(const std::vector<mindspore::MSTensor> &inputs, size_t input_size, const HI_CHAR *name) {
  // Strips a trailing "_pre" or "_post" suffix, if present (shared by both passes).
  auto strip_suffix = [](std::string input_str) {
    if (input_str.length() > 4) {
      if (input_str.substr(input_str.length() - 4) == "_pre") {
        input_str = input_str.substr(0, input_str.length() - 4);
      } else if (input_str.length() > 5) {
        if (input_str.substr(input_str.length() - 5) == "_post") {
          input_str = input_str.substr(0, input_str.length() - 5);
        }
      }
    }
    return input_str;
  };
  size_t j;
  // Pass 1: exact name match.
  for (j = 0; j < input_size; j++) {
    auto input_str = strip_suffix(inputs[j].Name());
    if (strcmp(input_str.c_str(), name) == 0) {
      break;
    }
  }
  if (j == input_size) {
    // Pass 2: fall back to prefix match (tensor names may be truncated).
    for (j = 0; j < input_size; j++) {
      auto input_str = strip_suffix(inputs[j].Name());
      if (strncmp(input_str.c_str(), name, input_str.length()) == 0) {
        break;
      }
    }
  }
  return j;
}
// Builds a complete NNIE session from a serialized WK model:
//   - loads the model into MMZ memory,
//   - matches the framework input tensor against the model's first input node and
//     derives the batch size,
//   - validates the max_roi_num configuration against ROI-typed segments,
//   - for recurrent models, allocates and fills the per-segment time-step buffers,
//   - allocates all runtime buffers (NnieParamInit).
// On failure the caller is expected to run NnieCommDelete for cleanup.
int NnieCommCreate(NnieRunCfg *nnie_run_cfg, char *model_buf, int size,
                   const std::vector<mindspore::MSTensor> &inputs) {
  HI_U8 *vir_addr = nullptr;
  HI_U32 seg_num;
  HI_U32 off_set;
  HI_U32 total_size;
  HI_U32 i, j;
  HI_S32 ret = HI_SUCCESS;
  NnieModel *model = &nnie_run_cfg->model_;
  NnieParam *param = &nnie_run_cfg->param_;
  NnieCfg *cfg = &nnie_run_cfg->cfg_;
  HI_U32 step = cfg->step_;  // time step
  ret = NnieLoadModel(model_buf, size, model);
  if (ret != RET_OK) {
    LOGE("NnieLoadModel failed!");
    return RET_ERROR;
  }
  // The last "input" is expected to be non-data (hence size - 1 below).
  if (inputs.size() <= 1) {
    LOGE("inputs size need greater than 1!");
    return RET_ERROR;
  }
  if (inputs[0].Shape().size() <= 1) {
    LOGE("input shape size need greater than 1!");
    return RET_ERROR;
  }
  // Locate the tensor that feeds the model's first input node; fall back to index 0
  // when no name matches.
  j = GetFillIndex(inputs, inputs.size() - 1, model->model_.astSeg[0].astSrcNode[0].szName);
  if (j == (inputs.size() - 1)) {
    j = 0;
    // LOGW("input tensor name(%s) can't match wk node name(%s).", inputs[0].Name().c_str(),
    //     model->model_.astSeg[0].astSrcNode[0].szName);
  }
  if (CheckMsShapeN(nnie_run_cfg, inputs[j].Shape(), model->model_.astSeg[0].astSrcNode[0]) != RET_OK) {
    return RET_ERROR;
  }
  // max_roi_num must be set iff the model contains a ROI segment.
  bool has_roi = false;
  for (i = 0; i < model->model_.u32NetSegNum; i++) {
    if (SVP_NNIE_NET_TYPE_ROI == model->model_.astSeg[i].enNetType) {
      has_roi = true;
    }
  }
  if (has_roi) {
    if (cfg->max_roi_num_ == 0) {
      LOGE("NNIE_RUNTIME_CONFIG_PATH: max_roi_num(0) should greater than 0!");
      return RET_ERROR;
    }
  } else {
    if (cfg->max_roi_num_ != 0) {
      LOGW("NNIE_RUNTIME_CONFIG_PATH: max_roi_num should euqal to 0!");
      cfg->max_roi_num_ = 0;
    }
  }
  if (model->model_.astSeg[0].enNetType == SVP_NNIE_NET_TYPE_RECURRENT) {
    if (step == 0) {
      LOGE("request time_step set! No NNIE_RUNTIME_CONFIG_PATH, please export NNIE_RUNTIME_CONFIG_PATH");
      return RET_ERROR;
    }
    // Allocate two step arrays (input x_t, output h_t) per segment and fill every
    // entry with the configured time step.
    seg_num = model->model_.u32NetSegNum;
    total_size = cfg->max_input_num_ * sizeof(HI_S32) * seg_num * 2;
    ret = NnieMemMalloc(std::string("SVP_NNIE_STEP").data(), nullptr,
                        reinterpret_cast<HI_U64 *>(&param->step_buf_.u64PhyAddr), reinterpret_cast<void **>(&vir_addr),
                        total_size);
    if (HI_SUCCESS != ret) {
      LOGE("Malloc memory failed:");
      return RET_ERROR;
    }
    param->step_buf_.u64VirAddr = (HI_U64)((HI_UL)vir_addr);
    for (i = 0; i < seg_num * NNIE_EACH_SEG_STEP_ADDR_NUM; i++) {
      cfg->step_vir_addr_[i] = param->step_buf_.u64VirAddr + i * cfg->max_input_num_ * sizeof(HI_S32);
    }
    for (i = 0; i < seg_num; i++) {
      off_set = i * NNIE_EACH_SEG_STEP_ADDR_NUM;
      for (j = 0; j < cfg->max_input_num_; j++) {
        *(reinterpret_cast<HI_U32 *>(static_cast<HI_UL>(cfg->step_vir_addr_[off_set])) + j) =
          step;  // step of input x_t
        *(reinterpret_cast<HI_U32 *>(static_cast<HI_UL>(cfg->step_vir_addr_[off_set + 1])) + j) =
          step;  // step of output h_t
      }
    }
  }
  param->model_ = &(model->model_);
  ret = NnieParamInit(cfg, param);
  if (ret != RET_OK) {
    LOGE("NnieParamInit failed!");
    return RET_ERROR;
  }
  // Start execution at the first segment.
  nnie_run_cfg->run_idx_.seg_idx_ = 0;
  return RET_OK;
}
// Tears down a NNIE session: releases the runtime buffers first, then unloads the model
// those buffers referenced (order matters).
void NnieCommDelete(NnieParam *pstNnieParamm, NnieModel *nnie_model) {
  NnieParamRelease(pstNnieParamm);
  NnieUnloadModel(nnie_model);
}
// Copies one output blob of the most recently executed segment into `data`,
// dequantized to float. `shape`/`size` describe the expected tensor geometry and
// `tensor_index` selects the destination node. run_idx_.seg_idx_ must already point one
// past the executed segment.
int NnieCommGetOutputData(NnieRunCfg *nnie_run_cfg, float *data, int64_t *shape, int size, int tensor_index) {
  if (nnie_run_cfg->run_idx_.seg_idx_ <= 0) {
    LOGE("output seg index error.");
    return RET_ERROR;
  }
  nnie_run_cfg->run_idx_.node_idx_ = tensor_index;
  nnie_run_cfg->cfg_.data_ptr_ = data;
  if (NnieGetDstData(&nnie_run_cfg->cfg_, &nnie_run_cfg->param_, &nnie_run_cfg->run_idx_, shape, size) != RET_OK) {
    LOGE("NnieGetDstData failed!");
    return RET_ERROR;
  }
  return RET_OK;
}
// Validates the framework dtype against the NNIE blob type of the selected source node
// of the current segment, then quantizes/copies the data into the node's blob.
int NnieCommFillData(NnieRunCfg *nnie_run_cfg, void *data, mindspore::DataType dtype, int64_t *shape, int size,
                     int tensor_index) {
  HI_U32 seg_idx = nnie_run_cfg->run_idx_.seg_idx_;
  if (tensor_index >= nnie_run_cfg->param_.model_->astSeg[seg_idx].u16SrcNum) {
    LOGE("Nnie input node index error!");
    return RET_ERROR;
  }
  SVP_BLOB_TYPE_E src_type = nnie_run_cfg->param_.seg_data_[seg_idx].src_[tensor_index].enType;
  // Image-style blobs (U8 .. YVU422SP) take 8-bit data; everything else expects float32.
  const bool image_blob = (SVP_BLOB_TYPE_U8 <= src_type && src_type <= SVP_BLOB_TYPE_YVU422SP);
  const bool dtype_ok = image_blob
                          ? (dtype == DataType::kNumberTypeUInt8 || dtype == DataType::kNumberTypeInt8)
                          : (dtype == DataType::kNumberTypeFloat32);
  if (!dtype_ok) {
    LOGE("Nnie input node type error!");
    return RET_ERROR;
  }
  nnie_run_cfg->run_idx_.node_idx_ = tensor_index;
  nnie_run_cfg->cfg_.data_ptr_ = data;
  if (NnieFillSrcData(&nnie_run_cfg->cfg_, &nnie_run_cfg->param_, &nnie_run_cfg->run_idx_, shape, size) != RET_OK) {
    LOGE("NnieFillSrcData failed!");
    return RET_ERROR;
  }
  return RET_OK;
}
// Runs the current segment (run_idx_.seg_idx_). run_box selects the ROI
// (forward-with-bbox) path, which feeds the rpn_bbox_ proposal blob. On success the
// segment cursor advances to the next segment.
int NnieCommRun(NnieRunCfg *nnie_run_cfg, bool run_box) {
  HI_U32 seg = nnie_run_cfg->run_idx_.seg_idx_;
  if (seg >= nnie_run_cfg->param_.model_->u32NetSegNum) {
    LOGE("seg num err!\n");
    return RET_ERROR;
  }
  nnie_run_cfg->run_idx_.node_idx_ = 0;
  HI_U32 status;
  if (run_box) {
    status =
      NNIE_ForwardWithBbox(&nnie_run_cfg->param_, &nnie_run_cfg->run_idx_, &nnie_run_cfg->param_.rpn_bbox_, HI_TRUE);
  } else {
    status = NnieForward(&nnie_run_cfg->param_, &nnie_run_cfg->run_idx_, HI_TRUE);
  }
  if (status != HI_SUCCESS) {
    LOGE("NnieForward failed!");
    return RET_ERROR;
  }
  nnie_run_cfg->run_idx_.seg_idx_ = seg + 1;
  return RET_OK;
}
} // namespace nnie
} // namespace mindspore

View File

@ -0,0 +1,115 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NNIE_NNIE_COMMON_H_
#define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NNIE_NNIE_COMMON_H_
#include <iostream>
#include <string>
#include <vector>
#include "include/api/types.h"
#include "include/mpi_vb.h"
#include "include/hi_comm_svp.h"
#include "include/hi_nnie.h"
#include "include/mpi_nnie.h"
#include "include/ir/dtype/type_id.h"
namespace mindspore {
namespace nnie {
// Alignment helpers: round u32Num up to the next multiple of 16/32 bytes.
// The macro argument is parenthesized so expressions such as NNIE_ALIGN16(a + b) or
// NNIE_ALIGN16(x << 2) expand with the intended precedence.
#define NNIE_ALIGN_16 16
#define NNIE_ALIGN16(u32Num) (((u32Num) + NNIE_ALIGN_16 - 1) / NNIE_ALIGN_16 * NNIE_ALIGN_16)
#define NNIE_ALIGN_32 32
#define NNIE_ALIGN32(u32Num) (((u32Num) + NNIE_ALIGN_32 - 1) / NNIE_ALIGN_32 * NNIE_ALIGN_32)
// Reinterprets a 64-bit integer address (as stored in SVP blobs) as a typed pointer.
#define NNIE_CONVERT_64BIT_ADDR(Type, Addr) reinterpret_cast<Type *>((HI_UL)(Addr))
// Fixed-point scale used by NNIE S32 blobs (Q12: fixed = float * 4096).
#define NNIE_QUANT_BASE 4096
// x_min, y_min, x_max, y_max per ROI box.
#define NNIE_COORDI_NUM 4
// Each recurrent segment keeps two step arrays: input x_t and output h_t.
#define NNIE_EACH_SEG_STEP_ADDR_NUM 2
#define NNIE_REPORT_NAME_LENGTH 64
// A loaded WK model plus the MMZ buffer holding its serialized form.
typedef struct {
  SVP_NNIE_MODEL_S model_;
  SVP_MEM_INFO_S model_buf_;  // store Model file
} NnieModel;
// Input/output blobs of one network segment.
typedef struct {
  SVP_SRC_BLOB_S src_[SVP_NNIE_MAX_INPUT_NUM];
  SVP_DST_BLOB_S dst_[SVP_NNIE_MAX_OUTPUT_NUM];
} NnieSegData;
// Per-segment flags marking nodes whose blobs are shared with another segment
// (inner connections) instead of owning fresh memory.
typedef struct {
  bool src_node_[SVP_NNIE_MAX_INPUT_NUM];
  bool dst_node_[SVP_NNIE_MAX_OUTPUT_NUM];
} NNIEMemSegInfo;
typedef struct {
  NNIEMemSegInfo seg_[SVP_NNIE_MAX_NET_SEG_NUM];
} NNIEMemCfg;
// All runtime state for one loaded model: task/tmp buffers, per-segment blobs and
// forward-control structs, plus the shared RPN bbox blob for ROI segments.
typedef struct {
  SVP_NNIE_MODEL_S *model_;
  HI_U32 task_buf_size_[SVP_NNIE_MAX_NET_SEG_NUM];
  SVP_MEM_INFO_S task_buf_;
  SVP_MEM_INFO_S tmp_buf_;
  SVP_MEM_INFO_S step_buf_;  // store Lstm step info
  SVP_SRC_BLOB_S rpn_bbox_;
  NnieSegData seg_data_[SVP_NNIE_MAX_NET_SEG_NUM];  // each seg's input and output blob
  SVP_NNIE_FORWARD_CTRL_S forward_ctrl_[SVP_NNIE_MAX_NET_SEG_NUM];
  SVP_NNIE_FORWARD_WITHBBOX_CTRL_S forward_with_bbox_ctrl_[SVP_NNIE_MAX_NET_SEG_NUM];
  NNIEMemCfg mem_cfg_;
} NnieParam;
// User-supplied runtime limits plus the pointer to the tensor data currently being
// filled or read back.
typedef struct {
  HI_VOID *data_ptr_;
  HI_U32 max_input_num_;
  HI_U32 max_roi_num_;
  HI_U32 step_;
  HI_U64 step_vir_addr_[NNIE_EACH_SEG_STEP_ADDR_NUM *
                        SVP_NNIE_MAX_NET_SEG_NUM];  // virtual addr of LSTM's or RNN's step buffer
  SVP_NNIE_ID_E nnie_core_id_[SVP_NNIE_MAX_NET_SEG_NUM];
} NnieCfg;
// Cursor identifying which segment/node an operation applies to.
typedef struct {
  HI_U32 seg_idx_;
  HI_U32 node_idx_;
} NnieDataIndex;
// Per-segment byte sizes of every input/output blob (filled during buffer sizing).
typedef struct {
  HI_U32 src_size_[SVP_NNIE_MAX_INPUT_NUM];
  HI_U32 dst_size_[SVP_NNIE_MAX_OUTPUT_NUM];
} NnieBlobSize;
// Aggregated session state passed between the manager and the common helpers.
typedef struct {
  NnieModel model_;
  NnieParam param_;
  NnieCfg cfg_;
  NnieDataIndex run_idx_;
} NnieRunCfg;
// Loads a WK model, validates the framework inputs and allocates all runtime buffers.
int NnieCommCreate(NnieRunCfg *nnie_run_cfg, char *model_buf, int size, const std::vector<mindspore::MSTensor> &inputs);
// Returns the index of the tensor matching the given WK node name (input_size if absent).
size_t GetFillIndex(const std::vector<mindspore::MSTensor> &inputs, size_t input_size, const HI_CHAR *name);
// Releases runtime buffers, then unloads the model.
void NnieCommDelete(NnieParam *pstNnieParamm, NnieModel *nnie_model);
// Executes the current segment; run_box selects the ROI (forward-with-bbox) path.
int NnieCommRun(NnieRunCfg *nnie_run_cfg, bool run_box);
// Fills one input blob from framework data after dtype validation.
int NnieCommFillData(NnieRunCfg *nnie_run_cfg, void *data, mindspore::DataType dtype, int64_t *shape, int size, int id);
// Reads one output blob of the previously run segment back as float data.
int NnieCommGetOutputData(NnieRunCfg *nnie_run_cfg, float *data, int64_t *shape, int size, int tensor_index);
} // namespace nnie
} // namespace mindspore
#endif // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NNIE_NNIE_COMMON_H_

View File

@ -0,0 +1,222 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cstring>
#include "src/nnie_manager.h"
#include "src/nnie_common.h"
#include "src/nnie_print.h"
#include "src/nnie_memory.h"
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
constexpr int kNumInput2 = 2;
namespace mindspore {
namespace nnie {
// Initializes the runtime configuration: max ROI count, LSTM time step and the NNIE
// core each segment runs on. core_id entries must be valid SVP_NNIE_ID_E values.
// Returns RET_ERROR when a core id is out of range.
int NNIEManager::CfgInit(int max_roi_num, int step, const std::vector<int> &core_id) {
  memset(&nnie_cfg_, 0, sizeof(NnieRunCfg));
  nnie_cfg_.cfg_.max_roi_num_ = max_roi_num;
  nnie_cfg_.cfg_.step_ = step;
  for (size_t i = 0; i < SVP_NNIE_MAX_NET_SEG_NUM && i < core_id.size(); i++) {
    if (core_id[i] < SVP_NNIE_ID_BUTT) {
      nnie_cfg_.cfg_.nnie_core_id_[i] = (SVP_NNIE_ID_E)core_id[i];
    } else {
      LOGE("nnie core id too big.\n");
      return RET_ERROR;
    }
  }
  return RET_OK;
}
// Overrides the maximum batch size the runtime buffers will be sized for.
void NNIEManager::SetInputNum(int max_input_num) {
  nnie_cfg_.cfg_.max_input_num_ = max_input_num;
}
// Creates the NNIE session from a serialized model buffer. On failure every
// partially-created resource is torn down before returning RET_ERROR.
int NNIEManager::Init(char *model_buf, int size, const std::vector<mindspore::MSTensor> &inputs) {
  if (NnieCommCreate(&nnie_cfg_, model_buf, size, inputs) == RET_OK) {
    return RET_OK;
  }
  NnieCommDelete(&nnie_cfg_.param_, &nnie_cfg_.model_);
  return RET_ERROR;
}
// Executes network segment seg_id and copies its outputs into `outputs`.
// ROI-typed segments take the forward-with-bbox path.
int NNIEManager::Run(std::vector<mindspore::MSTensor> *outputs, unsigned int seg_id,
                     const std::vector<std::vector<int64_t>> &outputs_shape) {
  nnie_cfg_.run_idx_.seg_idx_ = seg_id;
  const bool run_box = (nnie_cfg_.param_.model_->astSeg[seg_id].enNetType == SVP_NNIE_NET_TYPE_ROI);
  if (NnieCommRun(&nnie_cfg_, run_box)) {
    LOGE("Nnie Run Fail!");
    return RET_ERROR;
  }
  if (GetOutputData(outputs, outputs_shape, run_box)) {
    LOGE("Get Output Data Fail!");
    return RET_ERROR;
  }
  return RET_OK;
}
// Frees all NNIE runtime resources owned by this manager (buffers + loaded model).
void NNIEManager::Release() {
  // NniePrintReportResult(&nnie_cfg_.param_);
  NnieCommDelete(&nnie_cfg_.param_, &nnie_cfg_.model_);
}
// Copies every output blob of the segment that just ran into the framework tensors.
// For ROI runs the output shapes are first prepended with the actual ROI count.
// Tensors are matched to WK destination nodes by name (GetFillIndex); when no name
// matches, the node's own index is used as a positional fallback.
int NNIEManager::GetOutputData(std::vector<mindspore::MSTensor> *outputs,
                               const std::vector<std::vector<int64_t>> &outputs_shape, bool run_box) {
  // NOTE(review): int is used for indices/sizes while the tensor count and
  // GetFillIndex are size_t -- fine for realistic output counts, but worth tidying.
  int i, j, output_size = outputs->size();
  if (output_size != nnie_cfg_.param_.model_->astSeg[nnie_cfg_.run_idx_.seg_idx_ - 1].u16DstNum) {
    LOGE("seg%d: %d output tensors are required, but there are %d outputs.", nnie_cfg_.run_idx_.seg_idx_ - 1,
         nnie_cfg_.param_.model_->astSeg[nnie_cfg_.run_idx_.seg_idx_ - 1].u16DstNum, output_size);
    return RET_ERROR;
  }
  if (run_box) {
    // ROI run: the leading dimension of every output is the ROI count of the run.
    for (i = 0; i < output_size; i++) {
      auto input_data_type = (*outputs)[i].DataType();
      if (input_data_type == DataType::kNumberTypeFloat32) {
        auto ptr_shape = outputs_shape[i];
        int max_roi_num = nnie_cfg_.param_.seg_data_[nnie_cfg_.run_idx_.seg_idx_ - 1].dst_[0].u32Num;
        ptr_shape.insert(ptr_shape.begin(), max_roi_num);
        (*outputs)[i].SetShape(ptr_shape);
      } else {
        LOGE("Unsupported DataType!");
        return RET_ERROR;
      }
    }
  }
  HI_U32 seg_idx = nnie_cfg_.run_idx_.seg_idx_ - 1;
  for (i = 0; i < nnie_cfg_.param_.model_->astSeg[seg_idx].u16DstNum; i++) {
    // Nodes wired to another segment are consumed internally; nothing to copy out.
    if (nnie_cfg_.param_.mem_cfg_.seg_[seg_idx].dst_node_[i]) {
      continue;
    }
    j = GetFillIndex(*outputs, output_size, nnie_cfg_.param_.model_->astSeg[seg_idx].astDstNode[i].szName);
    if (j == output_size) {
      // Positional fallback when the tensor name does not match the WK node name.
      j = i;
      // LOGW("output tensor name(%s) can't match wk node name(%s).", (*outputs)[j].Name().c_str(),
      //     nnie_cfg_.param_.model_->astSeg[seg_idx].astDstNode[i].szName);
    }
    auto input_data_type = (*outputs)[j].DataType();
    if (input_data_type == DataType::kNumberTypeFloat32) {
      auto ptr_shape = (*outputs)[j].Shape();
      auto ptr = reinterpret_cast<float *>((*outputs)[j].MutableData());
      if (NnieCommGetOutputData(&nnie_cfg_, ptr, ptr_shape.data(), ptr_shape.size(), i) != RET_OK) {
        return RET_ERROR;
      }
    } else {
      LOGE("Unsupported DataType!");
      return RET_ERROR;
    }
  }
  return RET_OK;
}
// Quantizes the float "proposal" tensor (`input`, expected shape
// [roi_num, NNIE_COORDI_NUM]) into the fixed-point rpn_bbox blob and flushes
// the cache so the NNIE hardware sees the data.
// Returns RET_OK on success, RET_ERROR on shape/ROI-count/data errors.
int NNIEManager::FillRoiPooling(mindspore::MSTensor *input) {
  auto roi_shape = input->Shape();
  // Fix: the original indexed roi_shape[1] without checking the rank first,
  // reading out of bounds for a 0-D/1-D tensor.
  if (roi_shape.size() < 2 || roi_shape[1] != NNIE_COORDI_NUM) {
    LOGE("Roi shape err!");
    return RET_ERROR;
  }
  if (roi_shape[0] > static_cast<int64_t>(nnie_cfg_.cfg_.max_roi_num_)) {
    LOGE("NNIE_RUNTIME_CONFIG_PATH: The maximum [max_roi_num] value set is less than the actual value: %d < %d.",
         nnie_cfg_.cfg_.max_roi_num_, static_cast<int>(roi_shape[0]));
    return RET_ERROR;
  }
  auto float_src_data = reinterpret_cast<float *>(input->MutableData());
  // Fix: guard against a tensor with no backing buffer before writing.
  if (float_src_data == nullptr) {
    LOGE("Roi data is nullptr!");
    return RET_ERROR;
  }
  nnie_cfg_.param_.rpn_bbox_.unShape.stWhc.u32Height = roi_shape[0];
  HI_U32 dst_stride = nnie_cfg_.param_.rpn_bbox_.u32Stride;
  auto proposal_result = NNIE_CONVERT_64BIT_ADDR(HI_S32, nnie_cfg_.param_.rpn_bbox_.u64VirAddr);
  // Quantize each (x0, y0, x1, y1) ROI row into the strided destination blob.
  for (size_t j = 0; j < nnie_cfg_.param_.rpn_bbox_.unShape.stWhc.u32Height; j++) {
    proposal_result[dst_stride / sizeof(HI_U32) * j] = *(float_src_data++) * NNIE_QUANT_BASE;
    proposal_result[dst_stride / sizeof(HI_U32) * j + 1] = *(float_src_data++) * NNIE_QUANT_BASE;
    proposal_result[dst_stride / sizeof(HI_U32) * j + 2] = *(float_src_data++) * NNIE_QUANT_BASE;
    proposal_result[dst_stride / sizeof(HI_U32) * j + 3] = *(float_src_data++) * NNIE_QUANT_BASE;
  }
  NnieMemFlushCache(nnie_cfg_.param_.rpn_bbox_.u64PhyAddr,
                    NNIE_CONVERT_64BIT_ADDR(HI_VOID, nnie_cfg_.param_.rpn_bbox_.u64VirAddr),
                    dst_stride * nnie_cfg_.param_.rpn_bbox_.unShape.stWhc.u32Height);
  return RET_OK;
}
// Copies the caller's input tensors into the WK src blobs of segment
// `seg_id`. For ROI segments the special "proposal" tensor is quantized into
// the rpn_bbox blob instead of a src node. The last entry of `inputs` is not
// treated as a WK src node (matching the "input_size - 1" bounds below).
// Returns RET_OK on success, RET_ERROR on any validation/copy failure.
int NNIEManager::FillData(std::vector<mindspore::MSTensor> *inputs, unsigned int seg_id) {
  bool run_box = false;
  size_t i, j;
  size_t input_size = inputs->size();
  // Fix: with an empty input vector, "input_size - 1" below would wrap to
  // SIZE_MAX and over-run the vector; reject it up front.
  if (input_size == 0) {
    LOGE("Input Size Err!");
    return RET_ERROR;
  }
  if (seg_id >= nnie_cfg_.param_.model_->u32NetSegNum) {
    LOGE("seg num err!");
    return RET_ERROR;
  }
  nnie_cfg_.run_idx_.seg_idx_ = seg_id;
  if (nnie_cfg_.param_.model_->astSeg[nnie_cfg_.run_idx_.seg_idx_].enNetType == SVP_NNIE_NET_TYPE_ROI) {
    run_box = true;
    for (i = 0; i < (input_size - 1); i++) {
      if ((*inputs)[i].Name() == "proposal") {
        // Fix: the original discarded FillRoiPooling's status and would run
        // the ROI segment on stale/garbage proposal data after a failure.
        if (FillRoiPooling(&(*inputs)[i]) != RET_OK) {
          return RET_ERROR;
        }
        break;
      }
    }
    // Loop fell through without `break`: no "proposal" tensor was supplied.
    if (i == (input_size - 1)) {
      LOGE("Can't find proposal out!");
      return RET_ERROR;
    }
  } else if ((input_size < kNumInput2) ||
             (input_size - 1) != nnie_cfg_.param_.model_->astSeg[nnie_cfg_.run_idx_.seg_idx_].u16SrcNum) {
    LOGE("Input Size Err!");
    return RET_ERROR;
  }
  for (i = 0; i < nnie_cfg_.param_.model_->astSeg[seg_id].u16SrcNum; i++) {
    // Nodes already wired through the shared-memory config need no copy.
    if (nnie_cfg_.param_.mem_cfg_.seg_[seg_id].src_node_[i]) {
      continue;
    }
    j = GetFillIndex(*inputs, input_size - 1, nnie_cfg_.param_.model_->astSeg[seg_id].astSrcNode[i].szName);
    if (j == (input_size - 1)) {
      if (run_box && (*inputs)[i].Name() == "proposal") {
        continue;
      } else {
        // No tensor name matched this WK node: fall back to position i.
        j = i;
        // LOGW("input tensor name(%s) can't match wk node name(%s).", (*inputs)[i].Name().c_str(),
        //      nnie_cfg_.param_.model_->astSeg[seg_id].astSrcNode[i].szName);
      }
    }
    auto input_data_type = (*inputs)[j].DataType();
    if ((input_data_type == DataType::kNumberTypeFloat32) || (input_data_type == DataType::kNumberTypeUInt8) ||
        (input_data_type == DataType::kNumberTypeInt8)) {
      auto ptr_shape = (*inputs)[j].Shape();
      if (NnieCommFillData(&nnie_cfg_, (*inputs)[j].MutableData(), input_data_type, ptr_shape.data(), ptr_shape.size(),
                           i) != RET_OK) {
        LOGE("FillData failed!");
        return RET_ERROR;
      }
    } else {
      LOGE("Unsupported DataType!");
      return RET_ERROR;
    }
  }
  return RET_OK;
}
} // namespace nnie
} // namespace mindspore

View File

@ -0,0 +1,62 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NNIE_NNIE_MANAGER_H_
#define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NNIE_NNIE_MANAGER_H_
#include <vector>
#include "include/errorcode.h"
#include "include/api/types.h"
#include "src/nnie_common.h"
namespace mindspore {
namespace nnie {
// Singleton facade over the HiSilicon NNIE runtime: loads a WK model,
// feeds input tensors, runs network segments and reads back their outputs.
class NNIEManager {
 public:
  // Process-wide instance (function-local static; thread-safe init in C++11).
  static NNIEManager *GetInstance() {
    static NNIEManager manager;
    return &manager;
  }
  NNIEManager() {}
  ~NNIEManager() {}
  // Loads the WK model from `model_buf` (`size` bytes) and binds `inputs`.
  int Init(char *model_buf, int size, const std::vector<mindspore::MSTensor> &inputs);
  // Applies runtime limits: max ROI count, time step and NNIE core binding.
  int CfgInit(int max_roi_num, int step, const std::vector<int> &core_id);
  void SetInputNum(int max_input_num);
  // Copies `inputs` into the WK src blobs of segment `seg_id`.
  int FillData(std::vector<mindspore::MSTensor> *inputs, unsigned int seg_id);
  // Runs segment `seg_id` and copies its dst blobs into `outputs`.
  int Run(std::vector<mindspore::MSTensor> *outputs, unsigned int seg_id,
          const std::vector<std::vector<int64_t>> &outputs_shape);
  // Releases all NNIE resources owned by the manager.
  void Release();

 private:
  int GetOutputData(std::vector<mindspore::MSTensor> *outputs, const std::vector<std::vector<int64_t>> &outputs_shape,
                    bool run_box = false);
  // Quantizes the float "proposal" tensor into the rpn_bbox blob.
  int FillRoiPooling(mindspore::MSTensor *input);
  char *wk_model_ = nullptr;  // raw WK model buffer (ownership unclear from here — confirm)
  int model_size_ = 0;        // size of wk_model_ in bytes
  NnieRunCfg nnie_cfg_;       // aggregated NNIE runtime state
};
} // namespace nnie
} // namespace mindspore
#endif // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NNIE_NNIE_MANAGER_H_

View File

@ -0,0 +1,35 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/nnie_memory.h"
#include "include/hi_common.h"
#include "include/mpi_sys.h"
namespace mindspore {
namespace nnie {
// Allocates `size` bytes of uncached MMZ memory; returns both the physical
// address (*pu_phy_addr) and the CPU-visible mapping (*ppv_vir_addr).
// Thin wrapper over HI_MPI_SYS_MmzAlloc; returns its status code.
HI_S32 NnieMemMalloc(const HI_CHAR *mmb, HI_CHAR *zone, HI_U64 *pu_phy_addr, HI_VOID **ppv_vir_addr, HI_U32 size) {
  return HI_MPI_SYS_MmzAlloc(pu_phy_addr, ppv_vir_addr, mmb, zone, size);
}
// Same as NnieMemMalloc but returns CPU-cached memory; callers must flush
// with NnieMemFlushCache before handing the buffer to the NNIE hardware.
HI_S32 NnieMemMallocCached(const HI_CHAR *mmb, HI_CHAR *zone, HI_U64 *pu_phy_addr, HI_VOID **ppv_vir_addr,
                           HI_U32 size) {
  return HI_MPI_SYS_MmzAlloc_Cached(pu_phy_addr, ppv_vir_addr, mmb, zone, size);
}
// Flushes the CPU cache for a cached MMZ buffer so device reads see the
// latest CPU writes. Wrapper over HI_MPI_SYS_MmzFlushCache.
HI_S32 NnieMemFlushCache(HI_U64 phy_addr, HI_VOID *pv_vir_addr, HI_U32 size) {
  return HI_MPI_SYS_MmzFlushCache(phy_addr, pv_vir_addr, size);
}
} // namespace nnie
} // namespace mindspore

View File

@ -0,0 +1,48 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NNIE_NNIE_MEMORY_H_
#define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NNIE_NNIE_MEMORY_H_
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include "include/hi_common.h"
#include "include/hi_debug.h"
#include "include/hi_comm_svp.h"
#include "include/hi_nnie.h"
#include "include/mpi_nnie.h"
#include "include/mpi_sys.h"
namespace mindspore {
namespace nnie {
// Frees an MMZ buffer allocated with NnieMemMalloc/NnieMemMallocCached, then
// zeroes both handles so a repeated invocation is a no-op. The free only
// happens when both the physical and the virtual handle are non-zero.
#define NNIE_MEM_FREE(phy, vir)                                                     \
  do {                                                                              \
    if ((0 != (phy)) && (0 != (vir))) {                                             \
      HI_MPI_SYS_MmzFree((phy), reinterpret_cast<void *>(static_cast<HI_UL>(vir))); \
      (phy) = 0;                                                                    \
      (vir) = 0;                                                                    \
    }                                                                               \
  } while (0)
// Thin wrappers over the HiSilicon MMZ allocation API (see nnie_memory.cc).
HI_S32 NnieMemMalloc(const HI_CHAR *mmb, HI_CHAR *zone, HI_U64 *pu_phy_addr, HI_VOID **ppv_vir_addr, HI_U32 size);
HI_S32 NnieMemMallocCached(const HI_CHAR *mmb, HI_CHAR *zone, HI_U64 *pu_phy_addr, HI_VOID **ppv_vir_addr, HI_U32 size);
HI_S32 NnieMemFlushCache(HI_U64 phy_addr, HI_VOID *pv_vir_addr, HI_U32 size);
} // namespace nnie
} // namespace mindspore
#endif // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NNIE_NNIE_MEMORY_H_

View File

@ -0,0 +1,176 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/nnie_print.h"
namespace mindspore {
namespace nnie {
// Debug helper: dumps every dst node of every segment to a text file
// ("./ms/fseg<seg>(<node>,<id>)_<name>.txt"; the ./ms directory must already
// exist — TODO confirm). S32 blob values are de-quantized by NNIE_QUANT_BASE.
// Returns HI_SUCCESS, or HI_FAILURE on filename/open/write errors.
HI_S32 NniePrintReportResult(NnieParam *pst_nnie_param) {
  HI_U32 u32seg_num = pst_nnie_param->model_->u32NetSegNum;
  HI_U32 i, j, k, n;
  HI_U32 seg_idx_, node_idx_;
  HI_S32 ret;
  HI_CHAR acReportFileName[NNIE_REPORT_NAME_LENGTH] = {'\0'};
  FILE *fp = nullptr;
  HI_U32 *pu32StepAddr = nullptr;
  HI_S32 *ps32ResultAddr = nullptr;
  HI_U32 u32Height, u32Width, u32Chn, u32Stride, u32Dim;
  for (seg_idx_ = 0; seg_idx_ < u32seg_num; seg_idx_++) {
    for (node_idx_ = 0; node_idx_ < pst_nnie_param->model_->astSeg[seg_idx_].u16DstNum; node_idx_++) {
      ret = snprintf(acReportFileName, NNIE_REPORT_NAME_LENGTH, "./ms/fseg%d(%d,%d)_%s.txt", seg_idx_, node_idx_,
                     pst_nnie_param->model_->astSeg[seg_idx_].astDstNode[node_idx_].u32NodeId,
                     pst_nnie_param->model_->astSeg[seg_idx_].astDstNode[node_idx_].szName);
      if (ret < 0) {
        LOGE("Error,create file name failed!");
        return HI_FAILURE;
      }
      fp = fopen(acReportFileName, "w");
      if (fp == nullptr) {
        LOGE("Error,open file failed!");
        return HI_FAILURE;
      }
      if (SVP_BLOB_TYPE_SEQ_S32 == pst_nnie_param->seg_data_[seg_idx_].dst_[node_idx_].enType) {
        // Sequence blob: variable step count per sample, u32Dim values each.
        // NOTE(review): fprintf results are unchecked here, unlike the WHC
        // branch below — inconsistent, but left as-is in this doc pass.
        u32Dim = pst_nnie_param->seg_data_[seg_idx_].dst_[node_idx_].unShape.stSeq.u32Dim;
        u32Stride = pst_nnie_param->seg_data_[seg_idx_].dst_[node_idx_].u32Stride;
        pu32StepAddr = NNIE_CONVERT_64BIT_ADDR(
          HI_U32, pst_nnie_param->seg_data_[seg_idx_].dst_[node_idx_].unShape.stSeq.u64VirAddrStep);
        ps32ResultAddr =
          NNIE_CONVERT_64BIT_ADDR(HI_S32, pst_nnie_param->seg_data_[seg_idx_].dst_[node_idx_].u64VirAddr);
        for (n = 0; n < pst_nnie_param->seg_data_[seg_idx_].dst_[node_idx_].u32Num; n++) {
          for (i = 0; i < *(pu32StepAddr + n); i++) {
            for (j = 0; j < u32Dim; j++) {
              fprintf(fp, "%f ", static_cast<float>(*(ps32ResultAddr + j)) / NNIE_QUANT_BASE);
            }
            ps32ResultAddr += u32Stride / sizeof(HI_U32);
          }
        }
      } else {
        // WHC blob: num x chn x height x width values, row stride u32Stride.
        u32Height = pst_nnie_param->seg_data_[seg_idx_].dst_[node_idx_].unShape.stWhc.u32Height;
        u32Width = pst_nnie_param->seg_data_[seg_idx_].dst_[node_idx_].unShape.stWhc.u32Width;
        u32Chn = pst_nnie_param->seg_data_[seg_idx_].dst_[node_idx_].unShape.stWhc.u32Chn;
        u32Stride = pst_nnie_param->seg_data_[seg_idx_].dst_[node_idx_].u32Stride;
        ps32ResultAddr =
          NNIE_CONVERT_64BIT_ADDR(HI_S32, pst_nnie_param->seg_data_[seg_idx_].dst_[node_idx_].u64VirAddr);
        fprintf(fp, "%s 4 1 %d %d %d\n", pst_nnie_param->model_->astSeg[seg_idx_].astDstNode[node_idx_].szName,
                u32Height, u32Width, u32Chn);
        for (n = 0; n < pst_nnie_param->seg_data_[seg_idx_].dst_[node_idx_].u32Num; n++) {
          for (i = 0; i < u32Chn; i++) {
            for (j = 0; j < u32Height; j++) {
              for (k = 0; k < u32Width; k++) {
                ret = fprintf(fp, "%f ", static_cast<float>(*(ps32ResultAddr + k)) / NNIE_QUANT_BASE);
                if (ret < 0) {
                  fclose(fp);  // avoid leaking the FILE on a write error
                  return HI_FAILURE;
                }
              }
              ps32ResultAddr += u32Stride / sizeof(HI_U32);
            }
          }
        }
      }
      fclose(fp);
    }
  }
  return HI_SUCCESS;
}
// Debug helper: dumps every src (input) node of segment `segnum` to
// "seg<seg>_layer<id>_input(<name>)_inst.linear.hex". SEQ_S32 blobs are
// written as raw ints, U8 blobs as bytes, anything else as de-quantized
// floats. Returns HI_SUCCESS, or HI_FAILURE on filename/open errors.
HI_S32 NniePrintReportResultInputSeg(NnieParam *pst_nnie_param, int segnum) {
  HI_U32 i, j, k, n;
  HI_U32 seg_idx_ = segnum, node_idx_;
  HI_S32 ret;
  HI_CHAR acReportFileName[NNIE_REPORT_NAME_LENGTH] = {'\0'};
  FILE *fp = nullptr;
  HI_U32 *pu32StepAddr = nullptr;
  HI_S32 *ps32ResultAddr = nullptr;
  HI_U8 *pu8ResultAddr = nullptr;
  HI_U32 u32Height, u32Width, u32Chn, u32Stride, u32Dim;
  for (node_idx_ = 0; node_idx_ < pst_nnie_param->model_->astSeg[seg_idx_].u16SrcNum; node_idx_++) {
    ret = snprintf(acReportFileName, NNIE_REPORT_NAME_LENGTH, "seg%d_layer%d_input(%s)_inst.linear.hex", seg_idx_,
                   pst_nnie_param->model_->astSeg[seg_idx_].astSrcNode[node_idx_].u32NodeId,
                   pst_nnie_param->model_->astSeg[seg_idx_].astSrcNode[node_idx_].szName);
    if (ret < 0) {
      LOGE("Error,create file name failed!\n");
      return HI_FAILURE;
    }
    fp = fopen(acReportFileName, "w");
    if (fp == nullptr) {
      LOGE("Error,open file failed!");
      return HI_FAILURE;
    }
    if (SVP_BLOB_TYPE_SEQ_S32 == pst_nnie_param->seg_data_[seg_idx_].src_[node_idx_].enType) {
      // Sequence blob: variable step count per sample, u32Dim ints each.
      u32Dim = pst_nnie_param->seg_data_[seg_idx_].src_[node_idx_].unShape.stSeq.u32Dim;
      u32Stride = pst_nnie_param->seg_data_[seg_idx_].src_[node_idx_].u32Stride;
      pu32StepAddr = NNIE_CONVERT_64BIT_ADDR(
        HI_U32, pst_nnie_param->seg_data_[seg_idx_].src_[node_idx_].unShape.stSeq.u64VirAddrStep);
      ps32ResultAddr = NNIE_CONVERT_64BIT_ADDR(HI_S32, pst_nnie_param->seg_data_[seg_idx_].src_[node_idx_].u64VirAddr);
      for (n = 0; n < pst_nnie_param->seg_data_[seg_idx_].src_[node_idx_].u32Num; n++) {
        for (i = 0; i < *(pu32StepAddr + n); i++) {
          for (j = 0; j < u32Dim; j++) {
            fprintf(fp, "%d ", *(ps32ResultAddr + j));
          }
          ps32ResultAddr += u32Stride / sizeof(HI_U32);
        }
      }
    } else if (pst_nnie_param->seg_data_[seg_idx_].src_[node_idx_].enType == SVP_BLOB_TYPE_U8) {
      // U8 image blob: bytes dumped as decimal, one stride-row at a time.
      u32Height = pst_nnie_param->seg_data_[seg_idx_].src_[node_idx_].unShape.stWhc.u32Height;
      u32Width = pst_nnie_param->seg_data_[seg_idx_].src_[node_idx_].unShape.stWhc.u32Width;
      u32Chn = pst_nnie_param->seg_data_[seg_idx_].src_[node_idx_].unShape.stWhc.u32Chn;
      u32Stride = pst_nnie_param->seg_data_[seg_idx_].src_[node_idx_].u32Stride;
      pu8ResultAddr = NNIE_CONVERT_64BIT_ADDR(HI_U8, pst_nnie_param->seg_data_[seg_idx_].src_[node_idx_].u64VirAddr);
      for (n = 0; n < pst_nnie_param->seg_data_[seg_idx_].src_[node_idx_].u32Num; n++) {
        for (i = 0; i < u32Chn; i++) {
          for (j = 0; j < u32Height; j++) {
            for (k = 0; k < u32Width; k++) {
              fprintf(fp, "%d ", *(pu8ResultAddr + k));
            }
            pu8ResultAddr += u32Stride / sizeof(HI_U8);
          }
        }
      }
    } else {
      // Any other blob type: treat values as quantized S32 and de-quantize.
      // NOTE(review): here the division happens before the float cast, unlike
      // NniePrintReportResult — integer division; left as-is in this doc pass.
      u32Height = pst_nnie_param->seg_data_[seg_idx_].src_[node_idx_].unShape.stWhc.u32Height;
      u32Width = pst_nnie_param->seg_data_[seg_idx_].src_[node_idx_].unShape.stWhc.u32Width;
      u32Chn = pst_nnie_param->seg_data_[seg_idx_].src_[node_idx_].unShape.stWhc.u32Chn;
      u32Stride = pst_nnie_param->seg_data_[seg_idx_].src_[node_idx_].u32Stride;
      ps32ResultAddr = NNIE_CONVERT_64BIT_ADDR(HI_S32, pst_nnie_param->seg_data_[seg_idx_].src_[node_idx_].u64VirAddr);
      fprintf(fp, "%s 4 1 %d %d %d\n", pst_nnie_param->model_->astSeg[seg_idx_].astSrcNode[node_idx_].szName, u32Height,
              u32Width, u32Chn);
      for (n = 0; n < pst_nnie_param->seg_data_[seg_idx_].src_[node_idx_].u32Num; n++) {
        for (i = 0; i < u32Chn; i++) {
          for (j = 0; j < u32Height; j++) {
            for (k = 0; k < u32Width; k++) {
              fprintf(fp, "%f ", static_cast<float>(*(ps32ResultAddr + k) / NNIE_QUANT_BASE));
            }
            ps32ResultAddr += u32Stride / sizeof(HI_U32);
          }
        }
      }
    }
    fclose(fp);
  }
  return HI_SUCCESS;
}
} // namespace nnie
} // namespace mindspore

View File

@ -0,0 +1,50 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NNIE_NNIE_PRINT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NNIE_NNIE_PRINT_H_
#include "include/mpi_nnie.h"
#include "include/hi_type.h"
#include "src/nnie_common.h"
#include "src/nnie_memory.h"
// Tag prepended to every NNIE log line.
#define LOG_TAG1 "NNIE"
// Error log: prints "[ERROR] NNIE [file:line] function]" context plus the
// formatted message to stderr. The `if (1)` keeps the do/while statement
// shape while leaving room for a runtime verbosity switch.
#define LOGE(format, ...)                                                                       \
  do {                                                                                          \
    if (1) {                                                                                    \
      fprintf(stderr, "\n[ERROR] " LOG_TAG1 " [" __FILE__ ":%d] %s] ", __LINE__, __FUNCTION__); \
      fprintf(stderr, format, ##__VA_ARGS__);                                                   \
    }                                                                                           \
  } while (0)
// Warning log: same shape as LOGE with a "[Warning]" prefix.
#define LOGW(format, ...)                                                                         \
  do {                                                                                            \
    if (1) {                                                                                      \
      fprintf(stderr, "\n[Warning] " LOG_TAG1 " [" __FILE__ ":%d] %s] ", __LINE__, __FUNCTION__); \
      fprintf(stderr, format, ##__VA_ARGS__);                                                     \
    }                                                                                             \
  } while (0)
constexpr int kMaxSize = 1024;  // generic buffer/name length cap
constexpr int kDecimal = 10;    // base for strtol-style conversions
namespace mindspore {
namespace nnie {
HI_S32 NniePrintReportResult(NnieParam *pst_nnie_param);
HI_S32 NniePrintReportResultInputSeg(NnieParam *pst_nnie_param, int segnum);
} // namespace nnie
} // namespace mindspore
#endif // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NNIE_NNIE_PRINT_H_

View File

@ -0,0 +1,22 @@
cmake_minimum_required(VERSION 3.14)
project(NNIE_proposal)

# NOTE: aux_source_directory behaves like a glob — newly added sources are
# only picked up on re-configure.
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/src COMMON_SRC3)
add_library(mslite_proposal SHARED ${COMMON_SRC3})

# Target-scoped include paths instead of directory-scoped
# include_directories(), which would leak into every target defined below.
target_include_directories(mslite_proposal PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}
    ${CMAKE_CURRENT_SOURCE_DIR}/../runtime
    ${CMAKE_CURRENT_SOURCE_DIR}/../runtime/include
    ${CMAKE_CURRENT_SOURCE_DIR}/../runtime/include/third_party
)
# Explicit visibility keyword; the keyword-less form has legacy semantics.
target_link_libraries(mslite_proposal PRIVATE ${LINK_LOCAT_LIB})

# Strip tool for Release builds; HIMIX_STRIP overrides the himix200 default.
if(DEFINED HIMIX_STRIP)
    set(NDK_STRIP ${HIMIX_STRIP})
else()
    set(NDK_STRIP "arm-himix200-linux-strip")
endif()

if("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
    # $<TARGET_FILE:...> resolves the real output path (robust against
    # OUTPUT_NAME/prefix changes); VERBATIM gives portable argument escaping.
    add_custom_command(TARGET mslite_proposal POST_BUILD
                       COMMAND ${NDK_STRIP} $<TARGET_FILE:mslite_proposal>
                       VERBATIM)
endif()

View File

@ -0,0 +1,650 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/proposal.h"
#include <cmath>
#include <cstring>
#include <memory>
#include "include/errorcode.h"
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
constexpr int kNumInput2 = 2;
constexpr int kNCHWDims = 4;
namespace mindspore {
namespace proposal {
// Returns the size in bytes of the scratch pool Rpn() carves up: per-anchor
// coordinates, bbox deltas, proposal rows, scores and quicksort stack, plus
// the ratio/scale anchor templates. Layout must match the pointer arithmetic
// at the top of Rpn().
uint32_t RpnTmpBufSize(uint32_t num_ratio_anchors, uint32_t num_scale_anchors, uint32_t input_height,
                       uint32_t input_width) {
  uint32_t anchors_num = num_ratio_anchors * num_scale_anchors * input_height * input_width;
  uint32_t anchors_size = sizeof(uint32_t) * COORDI_NUM * anchors_num;
  uint32_t bbox_delta_size = anchors_size;
  uint32_t proposal_size = sizeof(uint32_t) * PROPOSAL_WIDTH * anchors_num;
  uint32_t ratio_anchors_size = sizeof(float) * num_ratio_anchors * COORDI_NUM;
  uint32_t scale_anchors_size = sizeof(float) * num_ratio_anchors * num_scale_anchors * COORDI_NUM;
  // 2 = fg/bg score pair per anchor — presumably equals SCORE_NUM used in
  // Rpn(); TODO confirm and unify the constant.
  uint32_t score_size = sizeof(float) * anchors_num * 2;
  uint32_t stack_size = sizeof(Stack) * anchors_num;
  uint32_t total_size =
    anchors_size + bbox_delta_size + proposal_size + ratio_anchors_size + scale_anchors_size + score_size + stack_size;
  return total_size;
}
// Precomputed factor tables for QuickExp(): the fixed-point exponent is split
// into 4-bit nibbles and exp() is reconstructed as a product of one factor
// per nibble. Rows 0-4 serve positive inputs (nibble positions 0..4, least
// significant first); rows 5-9 serve negated (negative-input) magnitudes.
static float exp_coef[10][16] = {
  {1.0f, 1.00024f, 1.00049f, 1.00073f, 1.00098f, 1.00122f, 1.00147f, 1.00171f, 1.00196f, 1.0022f, 1.00244f, 1.00269f,
   1.00293f, 1.00318f, 1.00342f, 1.00367f},
  {1.0f, 1.00391f, 1.00784f, 1.01179f, 1.01575f, 1.01972f, 1.02371f, 1.02772f, 1.03174f, 1.03578f, 1.03984f, 1.04391f,
   1.04799f, 1.05209f, 1.05621f, 1.06034f},
  {1.0f, 1.06449f, 1.13315f, 1.20623f, 1.28403f, 1.36684f, 1.45499f, 1.54883f, 1.64872f, 1.75505f, 1.86825f, 1.98874f,
   2.117f, 2.25353f, 2.39888f, 2.55359f},
  {1.0f, 2.71828f, 7.38906f, 20.0855f, 54.5981f, 148.413f, 403.429f, 1096.63f, 2980.96f, 8103.08f, 22026.5f, 59874.1f,
   162755.0f, 442413.0f, 1.2026e+006f, 3.26902e+006f},
  {1.0f, 8.88611e+006f, 7.8963e+013f, 7.01674e+020f, 6.23515e+027f, 5.54062e+034f, 5.54062e+034f, 5.54062e+034f,
   5.54062e+034f, 5.54062e+034f, 5.54062e+034f, 5.54062e+034f, 5.54062e+034f, 5.54062e+034f, 5.54062e+034f,
   5.54062e+034f},
  {1.0f, 0.999756f, 0.999512f, 0.999268f, 0.999024f, 0.99878f, 0.998536f, 0.998292f, 0.998049f, 0.997805f, 0.997562f,
   0.997318f, 0.997075f, 0.996831f, 0.996588f, 0.996345f},
  {1.0f, 0.996101f, 0.992218f, 0.98835f, 0.984496f, 0.980658f, 0.976835f, 0.973027f, 0.969233f, 0.965455f, 0.961691f,
   0.957941f, 0.954207f, 0.950487f, 0.946781f, 0.94309f},
  {1.0f, 0.939413f, 0.882497f, 0.829029f, 0.778801f, 0.731616f, 0.687289f, 0.645649f, 0.606531f, 0.569783f, 0.535261f,
   0.502832f, 0.472367f, 0.443747f, 0.416862f, 0.391606f},
  {1.0f, 0.367879f, 0.135335f, 0.0497871f, 0.0183156f, 0.00673795f, 0.00247875f, 0.000911882f, 0.000335463f,
   0.00012341f, 4.53999e-005f, 1.67017e-005f, 6.14421e-006f, 2.26033e-006f, 8.31529e-007f, 3.05902e-007f},
  {1.0f, 1.12535e-007f, 1.26642e-014f, 1.42516e-021f, 1.60381e-028f, 1.80485e-035f, 2.03048e-042f, 0.0f, 0.0f, 0.0f,
   0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}};
// Table-driven approximation of exp(value / QUANT_BASE): the input's
// magnitude is split into five 4-bit nibbles and the matching per-nibble
// factors from exp_coef are multiplied together. Rows 0-4 serve positive
// inputs, rows 5-9 negative ones.
static float QuickExp(int32_t value) {
  const bool is_negative = (value & 0x80000000) != 0;
  if (is_negative) {
    value = ~value + 0x00000001;  // two's-complement negate -> magnitude
  }
  const int row_base = is_negative ? 5 : 0;
  float product = 1.0f;
  for (int nibble = 0; nibble < 5; ++nibble) {
    product *= exp_coef[row_base + nibble][(value >> (4 * nibble)) & 0x0000000F];
  }
  return product;
}
// In-place softmax over src[0..num) using the fixed-point QuickExp table.
// Always returns RET_OK.
// NOTE(review): `max` starts at 0, so when every input is negative the
// stabilizing shift degenerates to subtracting 0; mathematically softmax is
// shift-invariant, but precision then rests on QuickExp's clamped table.
static int32_t SoftMax(float *src, uint32_t num) {
  float max = 0;
  float sum = 0;
  uint32_t i = 0;
  // Find the maximum for numerical stabilization.
  for (i = 0; i < num; ++i) {
    if (max < src[i]) {
      max = src[i];
    }
  }
  // exp((x - max) * QUANT_BASE) via the quantized table, accumulating the sum.
  for (i = 0; i < num; ++i) {
    src[i] = QuickExp(static_cast<int32_t>((src[i] - max) * QUANT_BASE));
    sum += src[i];
  }
  // Normalize so the outputs sum to 1.
  for (i = 0; i < num; ++i) {
    src[i] /= sum;
  }
  return RET_OK;
}
// Exchanges two PROPOSAL_WIDTH-wide proposal rows element by element.
static void Argswap(int32_t *src1, int32_t *src2) {
  for (uint32_t idx = 0; idx < PROPOSAL_WIDTH; ++idx) {
    const int32_t held = src1[idx];
    src1[idx] = src2[idx];
    src2[idx] = held;
  }
}
// Descending quicksort of proposal rows (PROPOSAL_WIDTH ints each) keyed on
// the confidence field (index 4), using an explicit `stack` instead of
// recursion. Partitions whose lower bound exceeds `max_num` are not refined
// further, so only the leading `max_num + 1` rows are guaranteed ordered.
// Always returns RET_OK.
// NOTE(review): `stack` must be sized for the worst-case partition depth —
// the caller allocates one Stack per anchor, which suffices; confirm.
static int32_t NonRecursiveArgQuickSort(int32_t *array, int32_t low, int32_t high, Stack *stack, int32_t max_num) {
  int32_t top = 0;
  stack[top].min_ = low;
  stack[top].max_ = high;
  while (top > -1) {
    // Pop the next sub-range to partition.
    low = stack[top].min_;
    high = stack[top].max_;
    int32_t i = low;
    int32_t j = high;
    // Pivot is the confidence of the first row in the range.
    int32_t key_confidence = array[PROPOSAL_WIDTH * low + 4];
    top--;
    while (i < j) {
      // Shrink from the right past rows with lower confidence than the pivot.
      while ((i < j) && (key_confidence > array[j * PROPOSAL_WIDTH + 4])) {
        j--;
      }
      if (i < j) {
        Argswap(&array[i * PROPOSAL_WIDTH], &array[j * PROPOSAL_WIDTH]);
        i++;
      }
      // Shrink from the left past rows with higher confidence than the pivot.
      while ((i < j) && (key_confidence < array[i * PROPOSAL_WIDTH + 4])) {
        i++;
      }
      if (i < j) {
        Argswap(&array[i * PROPOSAL_WIDTH], &array[j * PROPOSAL_WIDTH]);
        j--;
      }
    }
    // Only refine partitions that can still affect the first max_num rows.
    if (low <= max_num) {
      if (low < i - 1) {
        top++;
        stack[top].min_ = low;
        stack[top].max_ = i - 1;
      }
      if (high > i + 1) {
        top++;
        stack[top].min_ = i + 1;
        stack[top].max_ = high;
      }
    }
  }
  return RET_OK;
}
// Marks proposals whose score (field 4) falls below `filter_thresh` as
// suppressed (field 5 = 1) and compacts the survivors to the front of the
// array. The surviving count is written to `num_after_filter`; with a zero
// threshold everything is kept untouched. Always returns RET_OK.
static int32_t FilterLowScoreBbox(int32_t *proposals, uint32_t anchors_num, uint32_t filter_thresh,
                                  uint32_t *num_after_filter) {
  uint32_t kept = anchors_num;
  if (filter_thresh > 0) {
    // Pass 1: flag every row below the threshold.
    for (uint32_t idx = 0; idx < anchors_num; idx++) {
      if (proposals[PROPOSAL_WIDTH * idx + 4] < static_cast<int32_t>(filter_thresh)) {
        proposals[PROPOSAL_WIDTH * idx + 5] = 1;
      }
    }
    // Pass 2: compact unsuppressed rows (6 fields each) to the front.
    kept = 0;
    for (uint32_t idx = 0; idx < anchors_num; idx++) {
      if (proposals[PROPOSAL_WIDTH * idx + 5] != 0) {
        continue;
      }
      for (uint32_t field = 0; field < 6; field++) {
        proposals[PROPOSAL_WIDTH * kept + field] = proposals[PROPOSAL_WIDTH * idx + field];
      }
      kept++;
    }
  }
  *num_after_filter = kept;
  return RET_OK;
}
// Computes the intersection area (*area_inter) and union area (*area_sum) of
// two axis-aligned boxes given in inclusive integer coordinates.
// Always returns RET_OK.
static int32_t SVP_NNIE_Overlap(int32_t x_min1, int32_t y_min1, int32_t x_max1, int32_t y_max1, int32_t x_min2,
                                int32_t y_min2, int32_t x_max2, int32_t y_max2, int32_t *area_sum,
                                int32_t *area_inter) {
  // Intersection rectangle bounds.
  const int32_t ix_min = MAX(x_min1, x_min2);
  const int32_t iy_min = MAX(y_min1, y_min2);
  const int32_t ix_max = MIN(x_max1, x_max2);
  const int32_t iy_max = MIN(y_max1, y_max2);
  // Inclusive-coordinate extents; clamp negatives (disjoint boxes) to zero.
  int32_t inter_width = ix_max - ix_min + 1;
  int32_t inter_height = iy_max - iy_min + 1;
  if (inter_width < 0) {
    inter_width = 0;
  }
  if (inter_height < 0) {
    inter_height = 0;
  }
  const int32_t inter = inter_width * inter_height;
  const int32_t area1 = (x_max1 - x_min1 + 1) * (y_max1 - y_min1 + 1);
  const int32_t area2 = (x_max2 - x_min2 + 1) * (y_max2 - y_min2 + 1);
  *area_sum = area1 + area2 - inter;
  *area_inter = inter;
  return RET_OK;
}
// Greedy non-maximum suppression over proposal rows sorted by confidence:
// for each unsuppressed box (field 5 == 0), suppresses any later overlapping
// box whose IoU (scaled by QUANT_BASE) exceeds `nms_thresh`; the lower-scored
// of the pair is the one marked. Stops once `max_roi_num` boxes survive.
// Always returns RET_OK.
static int32_t SVP_NNIE_NonMaxSuppression(int32_t *proposals, uint32_t anchors_num, uint32_t nms_thresh,
                                          uint32_t max_roi_num) {
  /****** define variables *******/
  int32_t x_min1;
  int32_t y_min1;
  int32_t x_max1;
  int32_t y_max1;
  int32_t x_min2;
  int32_t y_min2;
  int32_t x_max2;
  int32_t y_max2;
  int32_t s32AreaTotal = 0;
  int32_t area_inter = 0;
  uint32_t i;
  uint32_t j;
  uint32_t num = 0;
  bool bNoOverlap;
  for (i = 0; i < anchors_num && num < max_roi_num; i++) {
    if (proposals[PROPOSAL_WIDTH * i + 5] == 0) {
      num++;
      x_min1 = proposals[PROPOSAL_WIDTH * i];
      y_min1 = proposals[PROPOSAL_WIDTH * i + 1];
      x_max1 = proposals[PROPOSAL_WIDTH * i + 2];
      y_max1 = proposals[PROPOSAL_WIDTH * i + 3];
      for (j = i + 1; j < anchors_num; j++) {
        if (proposals[PROPOSAL_WIDTH * j + 5] == 0) {
          x_min2 = proposals[PROPOSAL_WIDTH * j];
          y_min2 = proposals[PROPOSAL_WIDTH * j + 1];
          x_max2 = proposals[PROPOSAL_WIDTH * j + 2];
          y_max2 = proposals[PROPOSAL_WIDTH * j + 3];
          // Cheap rejection: disjoint bounding intervals cannot overlap.
          bNoOverlap = (x_min2 > x_max1) || (x_max2 < x_min1) || (y_min2 > y_max1) || (y_max2 < y_min1);
          if (bNoOverlap) {
            continue;
          }
          (void)SVP_NNIE_Overlap(x_min1, y_min1, x_max1, y_max1, x_min2, y_min2, x_max2, y_max2, &s32AreaTotal,
                                 &area_inter);
          // IoU test in fixed point: inter/union > nms_thresh/QUANT_BASE.
          if (area_inter * QUANT_BASE > static_cast<int32_t>(nms_thresh * s32AreaTotal)) {
            if (proposals[PROPOSAL_WIDTH * i + 4] >= proposals[PROPOSAL_WIDTH * j + 4]) {
              proposals[PROPOSAL_WIDTH * j + 5] = 1;
            } else {
              proposals[PROPOSAL_WIDTH * i + 5] = 1;
            }
          }
        }
      }
    }
  }
  return RET_OK;
}
static void Rpn(float **inputs, uint32_t num_ratio_anchors, uint32_t num_scale_anchors, uint32_t *scales,
uint32_t *ratios, uint32_t ori_image_height, uint32_t ori_image_width, uint32_t *inputs_height,
uint32_t *inputs_width, uint32_t *inputs_channel, uint32_t inputs_stride, uint32_t max_rois,
uint32_t min_size, uint32_t spatial_scale, uint32_t nms_thresh, uint32_t filter_thresh,
uint32_t num_before_nms, char *pu32MemPool, float *proposal_result, uint32_t dst_stride,
uint32_t *num_rois) {
#if 1
/******************** define parameters ****************/
uint32_t size;
int32_t *anchors = nullptr;
int32_t *bbox_delta = nullptr;
int32_t *proposals = nullptr;
int32_t *ptr1 = nullptr;
int32_t *ptr2 = nullptr;
int32_t *ptr3 = nullptr;
uint32_t num_after_filter = 0;
uint32_t num_anchors;
float base_w;
float base_h;
float base_x_ctr;
float base_y_ctr;
float *ratio_anchors = nullptr;
float *f32_ptr = nullptr;
float *f32_ptr2 = nullptr;
float *scale_anchors = nullptr;
float *scores = nullptr;
float f32_size;
uint32_t pixel_interval;
uint32_t src_bbox_index;
uint32_t src_fg_prob_index;
uint32_t src_bg_prob_index;
uint32_t src_bbox_bias;
uint32_t src_prob_bias;
uint32_t des_box;
uint32_t bg_blob_size;
uint32_t anchors_per_pixel;
uint32_t map_size;
uint32_t line_size;
int32_t proposal_width;
int32_t proposal_height;
uint32_t roi_count;
Stack *stack = nullptr;
uint32_t c;
uint32_t h;
uint32_t w;
uint32_t i;
uint32_t j;
uint32_t p;
uint32_t q;
uint32_t z;
uint32_t base_anchor[4] = {0, 0, (min_size - 1), (min_size - 1)};
/*********************************** Faster RCNN *********************************************/
/********* calculate the start pointer of each part in MemPool *********/
anchors = reinterpret_cast<int32_t *>(pu32MemPool);
num_anchors = num_ratio_anchors * num_scale_anchors * (inputs_height[0] * inputs_width[0]);
size = COORDI_NUM * num_anchors;
pu32MemPool += size * sizeof(int32_t);
bbox_delta = reinterpret_cast<int32_t *>(pu32MemPool);
pu32MemPool += size * sizeof(int32_t);
proposals = reinterpret_cast<int32_t *>(pu32MemPool);
size = PROPOSAL_WIDTH * num_anchors;
pu32MemPool += size * sizeof(int32_t);
ratio_anchors = reinterpret_cast<float *>(static_cast<void *>(pu32MemPool));
f32_ptr = reinterpret_cast<float *>(static_cast<void *>(pu32MemPool));
size = num_ratio_anchors * COORDI_NUM;
f32_ptr = f32_ptr + size;
scale_anchors = f32_ptr;
size = num_scale_anchors * num_ratio_anchors * COORDI_NUM;
f32_ptr = f32_ptr + size;
scores = f32_ptr;
size = num_anchors * SCORE_NUM;
f32_ptr = f32_ptr + size;
stack = reinterpret_cast<Stack *>(f32_ptr);
/********************* Generate the base anchor ***********************/
base_w = static_cast<float>(base_anchor[2] - base_anchor[0] + 1);
base_h = static_cast<float>(base_anchor[3] - base_anchor[1] + 1);
base_x_ctr = static_cast<float>(base_anchor[0] + ((base_w - 1) * 0.5));
base_y_ctr = static_cast<float>(base_anchor[1] + ((base_h - 1) * 0.5));
/*************** Generate Ratio Anchors for the base anchor ***********/
f32_ptr = ratio_anchors;
f32_size = base_w * base_h;
for (i = 0; i < num_ratio_anchors; i++) {
float f32_ratios = static_cast<float>(ratios[i]) / QUANT_BASE;
base_w = sqrt(f32_size / f32_ratios);
base_w = static_cast<float>(
1.0 * ((base_w) >= 0 ? static_cast<int32_t>(base_w + HALF_VAL) : static_cast<int32_t>(base_w - HALF_VAL)));
base_h = base_w * f32_ratios;
base_h = static_cast<float>(
1.0 * ((base_h) >= 0 ? static_cast<int32_t>(base_h + HALF_VAL) : static_cast<int32_t>(base_h - HALF_VAL)));
*f32_ptr++ = static_cast<float>(base_x_ctr - ((base_w - 1) * HALF_VAL));
*(f32_ptr++) = static_cast<float>(base_y_ctr - ((base_h - 1) * HALF_VAL));
*(f32_ptr++) = static_cast<float>(base_x_ctr + ((base_w - 1) * HALF_VAL));
*(f32_ptr++) = static_cast<float>(base_y_ctr + ((base_h - 1) * HALF_VAL));
}
/********* Generate Scale Anchors for each Ratio Anchor **********/
f32_ptr = ratio_anchors;
f32_ptr2 = scale_anchors;
/* Generate Scale Anchors for one pixel */
for (i = 0; i < num_ratio_anchors; i++) {
for (j = 0; j < num_scale_anchors; j++) {
base_w = *(f32_ptr + 2) - *(f32_ptr) + 1;
base_h = *(f32_ptr + 3) - *(f32_ptr + 1) + 1;
base_x_ctr = static_cast<float>(*(f32_ptr) + ((base_w - 1) * HALF_VAL));
base_y_ctr = static_cast<float>(*(f32_ptr + 1) + ((base_h - 1) * HALF_VAL));
*(f32_ptr2++) =
static_cast<float>(base_x_ctr - ((base_w * (static_cast<float>(scales[j]) / QUANT_BASE) - 1) * HALF_VAL));
*(f32_ptr2++) =
static_cast<float>(base_y_ctr - ((base_h * (static_cast<float>(scales[j]) / QUANT_BASE) - 1) * HALF_VAL));
*(f32_ptr2++) =
static_cast<float>(base_x_ctr + ((base_w * (static_cast<float>(scales[j]) / QUANT_BASE) - 1) * HALF_VAL));
*(f32_ptr2++) =
static_cast<float>(base_y_ctr + ((base_h * (static_cast<float>(scales[j]) / QUANT_BASE) - 1) * HALF_VAL));
}
f32_ptr += COORDI_NUM;
}
/******************* Copy the anchors to every pixel in the feature map ******************/
ptr1 = anchors;
pixel_interval = QUANT_BASE / spatial_scale;
for (p = 0; p < inputs_height[0]; p++) {
for (q = 0; q < inputs_width[0]; q++) {
f32_ptr2 = scale_anchors;
for (z = 0; z < num_scale_anchors * num_ratio_anchors; z++) {
*(ptr1++) = static_cast<int32_t>(q * pixel_interval + *(f32_ptr2++));
*(ptr1++) = static_cast<int32_t>(p * pixel_interval + *(f32_ptr2++));
*(ptr1++) = static_cast<int32_t>(q * pixel_interval + *(f32_ptr2++));
*(ptr1++) = static_cast<int32_t>(p * pixel_interval + *(f32_ptr2++));
}
}
}
/********** do transpose, convert the blob from (M,C,H,W) to (M,H,W,C) **********/
map_size = inputs_height[1] * inputs_stride / sizeof(uint32_t);
anchors_per_pixel = num_ratio_anchors * num_scale_anchors;
bg_blob_size = anchors_per_pixel * map_size;
line_size = inputs_stride / sizeof(uint32_t);
src_prob_bias = 0;
src_bbox_bias = 0;
for (c = 0; c < inputs_channel[1]; c++) {
for (h = 0; h < inputs_height[1]; h++) {
for (w = 0; w < inputs_width[1]; w++) {
src_bbox_index = src_bbox_bias + c * map_size + h * line_size + w;
src_bg_prob_index = src_prob_bias + (c / COORDI_NUM) * map_size + h * line_size + w;
src_fg_prob_index = bg_blob_size + src_bg_prob_index;
des_box = (anchors_per_pixel) * (h * inputs_width[1] + w) + c / COORDI_NUM;
uint32_t des_bbox_delta_index = COORDI_NUM * des_box + c % COORDI_NUM;
bbox_delta[des_bbox_delta_index] = static_cast<int32_t>(inputs[1][src_bbox_index] * QUANT_BASE);
uint32_t des_score_index = (SCORE_NUM)*des_box;
scores[des_score_index] = inputs[0][src_bg_prob_index];
scores[des_score_index + 1] = inputs[0][src_fg_prob_index];
}
}
}
/************************* do softmax ****************************/
f32_ptr = scores;
for (i = 0; i < num_anchors; i++) {
SoftMax(f32_ptr, SCORE_NUM);
f32_ptr += SCORE_NUM;
}
/************************* BBox Transform *****************************/
for (i = 0; i < num_anchors; i++) {
ptr1 = anchors;
ptr1 = ptr1 + COORDI_NUM * i;
ptr2 = proposals;
ptr2 = ptr2 + PROPOSAL_WIDTH * i;
ptr3 = bbox_delta;
ptr3 = ptr3 + COORDI_NUM * i;
f32_ptr = scores;
f32_ptr = f32_ptr + i * (SCORE_NUM);
proposal_width = *(ptr1 + 2) - *(ptr1) + 1;
proposal_height = *(ptr1 + 3) - *(ptr1 + 1) + 1;
int32_t proposal_center_x = *(ptr1) + static_cast<int32_t>(proposal_width * HALF_VAL);
int32_t proposal_center_y = *(ptr1 + 1) + static_cast<int32_t>(proposal_height * HALF_VAL);
int32_t pred_center_x =
static_cast<int32_t>((static_cast<float>(*(ptr3)) / QUANT_BASE) * proposal_width + proposal_center_x);
int32_t pred_center_y =
static_cast<int32_t>((static_cast<float>(*(ptr3 + 1)) / QUANT_BASE) * proposal_height + proposal_center_y);
int32_t pred_w = static_cast<int32_t>(proposal_width * QuickExp(static_cast<int32_t>(*(ptr3 + 2))));
int32_t pred_h = static_cast<int32_t>(proposal_height * QuickExp(static_cast<int32_t>(*(ptr3 + 3))));
*(ptr2) = static_cast<int32_t>(pred_center_x - HALF_VAL * pred_w);
*(ptr2 + 1) = static_cast<int32_t>(pred_center_y - HALF_VAL * pred_h);
*(ptr2 + 2) = static_cast<int32_t>(pred_center_x + HALF_VAL * pred_w);
*(ptr2 + 3) = static_cast<int32_t>(pred_center_y + HALF_VAL * pred_h);
*(ptr2 + 4) = static_cast<int32_t>(*(f32_ptr + 1) * QUANT_BASE);
*(ptr2 + 5) = 0;
}
/************************ clip bbox *****************************/
for (i = 0; i < num_anchors; i++) {
ptr1 = proposals;
ptr1 = ptr1 + PROPOSAL_WIDTH * i;
*ptr1 = MAX(MIN(*ptr1, static_cast<int32_t>(ori_image_width) - 1), 0);
*(ptr1 + 1) = MAX(MIN(*(ptr1 + 1), static_cast<int32_t>(ori_image_height) - 1), 0);
*(ptr1 + 2) = MAX(MIN(*(ptr1 + 2), static_cast<int32_t>(ori_image_width) - 1), 0);
*(ptr1 + 3) = MAX(MIN(*(ptr1 + 3), static_cast<int32_t>(ori_image_height) - 1), 0);
}
/************ remove the bboxes which are too small *************/
for (i = 0; i < num_anchors; i++) {
ptr1 = proposals;
ptr1 = ptr1 + PROPOSAL_WIDTH * i;
proposal_width = *(ptr1 + 2) - *(ptr1) + 1;
proposal_height = *(ptr1 + 3) - *(ptr1 + 1) + 1;
if (proposal_width < static_cast<int32_t>(min_size) || proposal_height < static_cast<int32_t>(min_size)) {
*(ptr1 + 5) = 1;
}
}
/********** remove low score bboxes ************/
(void)FilterLowScoreBbox(proposals, num_anchors, filter_thresh, &num_after_filter);
/********** sort ***********/
(void)NonRecursiveArgQuickSort(proposals, 0, num_after_filter - 1, stack, static_cast<int32_t>(num_before_nms));
num_after_filter = (num_after_filter < num_before_nms) ? num_after_filter : num_before_nms;
/* do nms to remove highly overlapped bbox */
(void)SVP_NNIE_NonMaxSuppression(proposals, num_after_filter, nms_thresh, max_rois); /* function NMS */
/************** write the final result to output ***************/
roi_count = 0;
for (i = 0; i < num_after_filter; i++) {
ptr1 = proposals;
ptr1 = ptr1 + PROPOSAL_WIDTH * i;
if (*(ptr1 + 5) == 0) {
proposal_result[dst_stride / sizeof(uint32_t) * roi_count] = *ptr1;
proposal_result[dst_stride / sizeof(uint32_t) * roi_count + 1] = *(ptr1 + 1);
proposal_result[dst_stride / sizeof(uint32_t) * roi_count + 2] = *(ptr1 + 2);
proposal_result[dst_stride / sizeof(uint32_t) * roi_count + 3] = *(ptr1 + 3);
roi_count++;
}
if (roi_count >= max_rois) {
break;
}
}
*num_rois = roi_count;
#endif
}
// Initializes the proposal parameters and scratch buffers used by Rpn().
// Fills fixed anchor scales/ratios (quantized by QUANT_BASE), image geometry
// and NMS thresholds, reads the (N, C, H, W) geometry of the two input
// tensors, then allocates a single buffer holding both the RPN temporary
// workspace and the output bounding boxes.
// Returns RET_OK on success, RET_ERROR on bad inputs or allocation failure.
int32_t ProposalInit(ProposalParam *param, const std::vector<mindspore::MSTensor> &inputs, uint32_t max_roi_num,
                     uint32_t ori_image_height, uint32_t ori_image_width) {
  uint32_t tmp_buf_size = 0;
  uint32_t bbox_buf_size = 0;
  uint32_t total_size = 0;
  param->max_roi_num_ = max_roi_num;
  // One aspect ratio, NUM_SCALE_ANCHORS scales -> 9 anchors per feature-map
  // pixel. All scale/ratio values are fixed-point with QUANT_BASE as 1.0.
  param->num_ratio_anchors_ = 1;
  param->num_scale_anchors_ = NUM_SCALE_ANCHORS;
  param->scales_[0] = 1.5 * QUANT_BASE;
  param->scales_[1] = 2.1 * QUANT_BASE;
  param->scales_[2] = 2.9 * QUANT_BASE;
  param->scales_[3] = 4.1 * QUANT_BASE;
  param->scales_[4] = 5.8 * QUANT_BASE;
  param->scales_[5] = 8.0 * QUANT_BASE;
  param->scales_[6] = 11.3 * QUANT_BASE;
  param->scales_[7] = 15.8 * QUANT_BASE;
  param->scales_[8] = 22.1 * QUANT_BASE;
  param->ratios_[0] = 2.44 * QUANT_BASE;
  param->ori_image_height_ = ori_image_height;
  param->ori_image_width_ = ori_image_width;
  param->min_size_ = MIN_SIZE;
  // 0.0625 = 1/16: feature-map-to-image scale, quantized by QUANT_BASE.
  param->spatial_scale_ = (uint32_t)(0.0625 * QUANT_BASE);
  param->nms_thresh_ = (uint32_t)(0.7 * QUANT_BASE);
  param->filter_thresh_ = 0;
  param->num_before_nms_ = NUM_NMS;
  // Output box container: max_roi_num rows of COORDI_NUM floats.
  param->rpn_bounding_box_.chn_ = 1;
  param->rpn_bounding_box_.height_ = max_roi_num;
  param->rpn_bounding_box_.width_ = COORDI_NUM;
  param->rpn_bounding_box_.stride_ = COORDI_NUM * sizeof(float);
  param->rpn_bounding_box_.num_ = 1;
  if (inputs.size() < kNumInput2) {
    LOGE("inputs tensor size error.");
    return RET_ERROR;
  }
  for (int i = 0; i < kNumInput2; i++) {
    auto input_data_type = inputs[i].DataType();
    if (input_data_type == DataType::kNumberTypeFloat32) {
      auto ptr_shape = inputs[i].Shape();
      if ((ptr_shape.size() == kNCHWDims)) {
        // NCHW layout: [0]=N, [1]=C, [2]=H, [3]=W.
        param->inputs_height_[i] = ptr_shape[2];
        param->inputs_width_[i] = ptr_shape[3];
        param->inputs_channel_[i] = ptr_shape[1];
        if (i == 0) {
          param->inputs_stride_ = ptr_shape[3] * sizeof(float);
        }
      }
      // NOTE(review): a non-4D float input leaves inputs_height_/width_/
      // channel_ for that index untouched with no error — confirm whether
      // this silent fallthrough is intentional.
    }
  }
  tmp_buf_size = RpnTmpBufSize(param->num_ratio_anchors_, param->num_scale_anchors_, param->inputs_height_[0],
                               param->inputs_width_[0]);
  bbox_buf_size = param->rpn_bounding_box_.num_ * param->rpn_bounding_box_.height_ * param->rpn_bounding_box_.stride_;
  total_size = tmp_buf_size + bbox_buf_size;
  // Re-initialization support: drop any previously allocated workspace.
  if (param->rpn_tmp_buf_ != nullptr) {
    free(param->rpn_tmp_buf_);
    param->rpn_tmp_buf_ = nullptr;
  }
  param->rpn_tmp_buf_ = malloc(total_size);
  if (param->rpn_tmp_buf_ == nullptr) {
    LOGE("malloc buf fail.");
    return RET_ERROR;
  }
  // The output boxes live in the tail of the same allocation.
  param->rpn_bounding_box_.data_ = reinterpret_cast<char *>(param->rpn_tmp_buf_) + tmp_buf_size;
  return RET_OK;
}
// Runs the RPN proposal computation: validates the tensors, gathers the two
// float32 input buffers, invokes Rpn() and copies the resulting ROIs
// (rpn_bounding_box_.height_ rows of COORDI_NUM floats) into the single
// output tensor.
// Returns RET_OK on success, RET_ERROR on malformed inputs/outputs.
int32_t ProposalRun(std::vector<mindspore::MSTensor> *inputs, std::vector<mindspore::MSTensor> *outputs,
                    ProposalParam *param) {
  if (inputs->size() < kNumInput2) {
    LOGE("inputs tensor size error.");
    return RET_ERROR;
  }
  if (outputs->size() != 1) {
    LOGE("outputs tensor size error.");
    return RET_ERROR;
  }
  for (int i = 0; i < kNumInput2; i++) {
    auto input_data_type = inputs->at(i).DataType();
    if (input_data_type == DataType::kNumberTypeFloat32) {
      param->inputs_[i] = reinterpret_cast<float *>((*inputs)[i].MutableData());
    }
  }
  auto output_data_type = (*outputs)[0].DataType();
  if (output_data_type != DataType::kNumberTypeFloat32) {
    LOGE("outputs tensor data type error.");
    return RET_ERROR;
  }
  Rpn(param->inputs_, param->num_ratio_anchors_, param->num_scale_anchors_, param->scales_, param->ratios_,
      param->ori_image_height_, param->ori_image_width_, param->inputs_height_, param->inputs_width_,
      param->inputs_channel_, param->inputs_stride_, param->max_roi_num_, param->min_size_, param->spatial_scale_,
      param->nms_thresh_, param->filter_thresh_, param->num_before_nms_, reinterpret_cast<char *>(param->rpn_tmp_buf_),
      reinterpret_cast<float *>(param->rpn_bounding_box_.data_), param->rpn_bounding_box_.stride_,
      &param->rpn_bounding_box_.height_);
  // Publish the actual ROI count produced by Rpn() as the output shape.
  std::vector<int64_t> shape{static_cast<int64_t>(param->rpn_bounding_box_.height_), COORDI_NUM};
  (*outputs)[0].SetShape(shape);
  auto output_data = (*outputs)[0].MutableData();
  // Fix: guard the memcpy — MutableData() may fail to allocate and return
  // nullptr; the previous code dereferenced it unconditionally.
  if (output_data == nullptr) {
    LOGE("output tensor data is nullptr.");
    return RET_ERROR;
  }
  memcpy(output_data, param->rpn_bounding_box_.data_, param->rpn_bounding_box_.height_ * COORDI_NUM * sizeof(float));
  return RET_OK;
}
// Releases the RPN scratch buffer allocated by ProposalInit().
// Safe to call repeatedly: the pointer is reset after freeing.
void ProposalDeInit(ProposalParam *param) {
  // Fix: compare/assign the pointer with nullptr instead of the integer 0.
  if (param->rpn_tmp_buf_ != nullptr) {
    free(param->rpn_tmp_buf_);
    param->rpn_tmp_buf_ = nullptr;
  }
}
} // namespace proposal
} // namespace mindspore

View File

@ -0,0 +1,95 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_TOOLS_BENCHMARK_NNIE_PROPOSAL_PROPOSAL_H_
#define MINDSPORE_LITE_TOOLS_BENCHMARK_NNIE_PROPOSAL_PROPOSAL_H_
#include <vector>
#include "include/api/types.h"
// Tag prepended to every log line emitted by the proposal module.
#define LOG_TAG1 "Proposal"
// Prints an error message with file/line/function context to stderr.
#define LOGE(format, ...) \
  do { \
    if (1) { \
      fprintf(stderr, "\n[ERROR] " LOG_TAG1 " [" __FILE__ ":%d] %s] ", __LINE__, __FUNCTION__); \
      fprintf(stderr, format, ##__VA_ARGS__); \
    } \
  } while (0)
// Prints a warning message with file/line/function context to stderr.
#define LOGW(format, ...) \
  do { \
    if (1) { \
      fprintf(stderr, "\n[Warning] " LOG_TAG1 " [" __FILE__ ":%d] %s] ", __LINE__, __FUNCTION__); \
      fprintf(stderr, format, ##__VA_ARGS__); \
    } \
  } while (0)
namespace mindspore {
namespace proposal {
// Output ROI blob produced by Rpn(). ProposalInit() configures it as
// num_=1, chn_=1, height_=max_roi_num, width_=COORDI_NUM with
// stride_ = COORDI_NUM * sizeof(float); Rpn() rewrites height_ to the
// actual ROI count it produced.
typedef struct {
  uint32_t stride_;  // bytes per box row
  void *data_;       // box storage; points into the rpn_tmp_buf_ allocation
  uint32_t num_;
  uint32_t width_;
  uint32_t height_;  // ROI capacity before Rpn(), actual ROI count after
  uint32_t chn_;
} RpnBoundingBox;
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define HALF_VAL 0.5f         // the half value
#define COORDI_NUM 4          // coordinate numbers (x1, y1, x2, y2)
#define PROPOSAL_WIDTH 6      // the number of proposal values per row
#define QUANT_BASE 4096       // fixed-point base: QUANT_BASE represents 1.0
#define SCORE_NUM 2           // the num of RPN scores (background, foreground)
#define NUM_SCALE_ANCHORS 9   // anchor scales generated per ratio anchor
#define NUM_NMS 6000          // max candidates kept before NMS
#define MIN_SIZE 16           // proposals narrower/shorter than this are dropped
// Configuration and state for the RPN proposal computation.
typedef struct {
  uint32_t scales_[9];          // anchor scales, quantized by QUANT_BASE
  uint32_t ratios_[9];          // anchor aspect ratios, quantized by QUANT_BASE
  uint32_t inputs_height_[2];   // H of the score / bbox input tensors
  uint32_t inputs_width_[2];    // W of the score / bbox input tensors
  uint32_t inputs_channel_[2];  // C of the score / bbox input tensors
  uint32_t inputs_stride_;      // row stride in bytes (W * sizeof(float) of input 0)
  uint32_t num_ratio_anchors_;
  uint32_t num_scale_anchors_;
  uint32_t ori_image_height_;   // original image height, used to clip boxes
  uint32_t ori_image_width_;    // original image width, used to clip boxes
  uint32_t min_size_;           // minimum proposal side length
  uint32_t spatial_scale_;      // feature-map-to-image scale, quantized by QUANT_BASE
  uint32_t nms_thresh_;         // NMS overlap threshold, quantized by QUANT_BASE
  uint32_t filter_thresh_;      // score threshold for FilterLowScoreBbox
  uint32_t max_roi_num_;        // upper bound on output ROIs
  uint32_t num_before_nms_;     // candidates kept before NMS
  float *inputs_[2];            // raw float32 data of the two input tensors
  void *rpn_tmp_buf_;           // scratch buffer owned by ProposalInit/DeInit
  RpnBoundingBox rpn_bounding_box_;  // output boxes (stored inside rpn_tmp_buf_)
} ProposalParam;
// Sub-range bounds used by NonRecursiveArgQuickSort when ordering proposals.
// NOTE(review): presumably [min_, max_] index bounds of the pending
// partition — confirm against the sort implementation.
typedef struct {
  int32_t min_;
  int32_t max_;
} Stack;
int32_t ProposalInit(ProposalParam *param, const std::vector<mindspore::MSTensor> &inputs, uint32_t max_roi_num,
uint32_t ori_image_height, uint32_t ori_image_width);
int32_t ProposalRun(std::vector<mindspore::MSTensor> *inputs, std::vector<mindspore::MSTensor> *outputs,
ProposalParam *param);
void ProposalDeInit(ProposalParam *param);
} // namespace proposal
} // namespace mindspore
#endif // MINDSPORE_LITE_TOOLS_BENCHMARK_NNIE_PROPOSAL_PROPOSAL_H_

View File

@ -0,0 +1,200 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/proposal_fp32.h"
#include <memory>
#include <string>
#include "schema/model_generated.h"
#include "include/registry/register_kernel.h"
#include "include/errorcode.h"
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Custom;
constexpr int kMaxSize = 1024;
constexpr int kNumInput2 = 2;
constexpr int kDecimal = 10;
namespace mindspore {
namespace proposal {
// Validates the tensors, reorders the inputs to the expected
// (rpn_cls_score, rpn_bbox_pred) order, reads the optional MAX_ROI_NUM
// environment override and initializes the proposal parameters.
// Returns RET_ERROR on malformed inputs.
int ProposalCPUKernel::Prepare() {
  if (inputs_.size() < kNumInput2) {
    LOGE("inputs tensor num error.");
    return RET_ERROR;
  }
  if (outputs_.size() != 1) {
    LOGE("outputs tensor num error.");
    return RET_ERROR;
  }
  // Match inputs by their expected tensor names; fall back to positional
  // order (with a warning) when a name is absent in the model.
  std::vector<std::string> inputs_name = {"rpn_cls_score", "rpn_bbox_pred"};
  std::vector<mindspore::MSTensor> inputs;
  for (size_t i = 0; i < inputs_name.size(); i++) {
    bool find_flag = false;
    for (auto &input : inputs_) {
      if (input.Name() == inputs_name[i]) {
        inputs.push_back(input);
        find_flag = true;
        break;
      }
    }
    if (!find_flag) {
      for (auto &input : inputs_) {
        if (std::find(inputs.begin(), inputs.end(), input) != inputs.end()) {
          continue;
        }
        inputs.push_back(input);
        LOGW("input tensor name diff '%s' vs '%s'.", inputs_name[i].c_str(), input.Name().c_str());
        break;
      }
    }
  }
  if (inputs.size() != inputs_name.size()) {
    LOGE("inputs size error.");
    return RET_ERROR;
  }
  this->set_inputs(inputs);
  // Fix: check the shape is non-empty before indexing its batch dimension.
  if (inputs[0].Shape().empty() || inputs[0].Shape()[0] != 1) {
    LOGE("proposal only support input num == 1.");
    return RET_ERROR;
  }
  outputs_[0].SetTensorName("proposal");
  int max_roi_num_int = 300;
  auto *max_roi_num = std::getenv("MAX_ROI_NUM");
  if (max_roi_num != nullptr) {
    // Fix: parse a copy of the environment value. The previous code wrote a
    // '\0' into the buffer returned by getenv(), which the C standard
    // forbids (undefined behavior). Semantics are preserved: the leading
    // run of digits, if any, is parsed; otherwise the default is kept.
    std::string roi_env(max_roi_num);
    size_t digit_num = 0;
    while (digit_num < roi_env.size() && roi_env[digit_num] >= '0' && roi_env[digit_num] <= '9') {
      ++digit_num;
    }
    if (digit_num > 0) {
      max_roi_num_int = atoi(roi_env.substr(0, digit_num).c_str());
    } else {
      LOGW("MAX_ROI_NUM ENV is invalid, now set to default value %d", max_roi_num_int);
    }
  } else {
    LOGW("MAX_ROI_NUM ENV is not set, now set to default value %d", max_roi_num_int);
  }
  return ProposalInit(&proposal_param_, inputs_, max_roi_num_int, image_height_, image_weight_);
}
// Re-validates the batch dimension on resize; only batch size 1 is handled.
int ProposalCPUKernel::ReSize() {
  auto batch_dim = inputs_[0].Shape()[0];
  if (batch_dim == 1) {
    return RET_OK;
  }
  LOGE("proposal only support input num == 1.");
  return RET_ERROR;
}
// Runs the proposal computation over the bound input/output tensors.
int ProposalCPUKernel::Execute() { return ProposalRun(&inputs_, &outputs_, &proposal_param_); }
// Frees the RPN scratch buffer owned by proposal_param_.
ProposalCPUKernel::~ProposalCPUKernel() { ProposalDeInit(&proposal_param_); }
// Copies the payload of the named Custom-op attribute into buf as a
// NUL-terminated C string.
// Returns false when the attribute is absent or its payload (plus the
// terminator) does not fit into buf_size bytes.
bool GetCustomAttr(char *buf, int buf_size, const mindspore::schema::Custom *op, const std::string &attr) {
  for (size_t idx = 0; idx < op->attr()->size(); idx++) {
    if (op->attr()->Get(idx)->name()->str() != attr) {
      continue;
    }
    auto payload = op->attr()->Get(idx)->data();
    int payload_len = static_cast<int>(payload->size());
    if (payload_len >= buf_size) {
      LOGE("attr size too big");
      return false;
    }
    for (int k = 0; k < payload_len; k++) {
      buf[k] = static_cast<char>(payload->Get(k));
    }
    buf[payload_len] = 0;
    return true;
  }
  return false;
}
// Creates a Proposal kernel from a Custom primitive carrying the
// "proposal_id", "image_height" and "image_width" string attributes
// (decimal integers).
// Returns nullptr when the primitive or a required attribute is invalid.
std::shared_ptr<mindspore::kernel::Kernel> ProposalCreateKernel(const std::vector<mindspore::MSTensor> &inputs,
                                                                const std::vector<mindspore::MSTensor> &outputs,
                                                                const mindspore::schema::Primitive *primitive,
                                                                const mindspore::Context *ctx) {
  // Fix: guard against a null primitive before dereferencing it.
  if (primitive == nullptr || primitive->value_type() != mindspore::schema::PrimitiveType_Custom) {
    LOGE("Primitive type is not PrimitiveType_Custom");
    return nullptr;
  }
  auto op = primitive->value_as_Custom();
  // Fix: op and op->attr() may be null in a malformed flatbuffer; the
  // previous code dereferenced them unconditionally.
  if (op == nullptr || op->attr() == nullptr || op->attr()->size() < 1) {
    LOGE("There are at least 1 attribute of Custom");
    return nullptr;
  }
  int64_t ndims;
  int64_t image_height;
  int64_t image_width;
  char *res = nullptr;
  char buf[kMaxSize];
  if (GetCustomAttr(buf, kMaxSize, op, "proposal_id")) {
    res = nullptr;
    ndims = strtol(buf, &res, kDecimal);
    if ((*res) != 0) {
      LOGE("Get attr id data fail");
      return nullptr;
    }
  } else {
    LOGE("Proposal Custom op should have id");
    return nullptr;
  }
  if (GetCustomAttr(buf, kMaxSize, op, "image_height")) {
    res = nullptr;
    image_height = strtol(buf, &res, kDecimal);
    if ((*res) != 0) {
      LOGE("Get attr id data fail");
      return nullptr;
    }
  } else {
    LOGE("Proposal Custom op should have image_height");
    return nullptr;
  }
  if (GetCustomAttr(buf, kMaxSize, op, "image_width")) {
    res = nullptr;
    image_width = strtol(buf, &res, kDecimal);
    if ((*res) != 0) {
      LOGE("Get attr id data fail");
      return nullptr;
    }
  } else {
    LOGE("Proposal Custom op should have image_width");
    return nullptr;
  }
  auto kernel = std::make_shared<ProposalCPUKernel>(inputs, outputs, primitive, ctx, ndims, image_height, image_width);
  if (kernel == nullptr) {
    LOGE("new custom kernel is nullptr");
    return nullptr;
  }
  return kernel;
}
} // namespace proposal
} // namespace mindspore
namespace mindspore {
namespace kernel {
namespace {
// Data type the Proposal kernel is registered for.
const auto kFloat32 = DataType::kNumberTypeFloat32;
}
// Registers the Proposal custom kernel for the NNIE provider on CPU.
REGISTER_CUSTOM_KERNEL(CPU, NNIE, kFloat32, Proposal, proposal::ProposalCreateKernel)
}  // namespace kernel
}  // namespace mindspore

View File

@ -0,0 +1,51 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_TOOLS_BENCHMARK_NNIE_PROPOSAL_PROPOSAL_FP32_H_
#define MINDSPORE_LITE_TOOLS_BENCHMARK_NNIE_PROPOSAL_PROPOSAL_FP32_H_
#include <vector>
#include "schema/model_generated.h"
#include "include/context.h"
#include "include/api/kernel.h"
#include "src/proposal.h"
using mindspore::kernel::Kernel;
namespace mindspore {
namespace proposal {
// CPU kernel wrapping the NNIE RPN proposal operator. Owns the proposal
// parameters and the image geometry passed through Custom attributes.
class ProposalCPUKernel : public Kernel {
 public:
  // id: proposal layer id from the "proposal_id" attribute;
  // image_height / image_width: original image size used when clipping the
  // generated boxes.
  ProposalCPUKernel(const std::vector<mindspore::MSTensor> &inputs, const std::vector<mindspore::MSTensor> &outputs,
                    const mindspore::schema::Primitive *primitive, const mindspore::Context *ctx, int id,
                    int image_height, int image_width)
      : Kernel(inputs, outputs, primitive, ctx), id_(id), image_height_(image_height), image_weight_(image_width) {}
  // Releases the RPN scratch buffer via ProposalDeInit().
  ~ProposalCPUKernel() override;
  int Prepare() override;
  int ReSize() override;
  int Execute() override;
 private:
  proposal::ProposalParam proposal_param_ = {0};
  int64_t id_;            // proposal layer id; not read in the visible code
  int64_t image_height_;
  // NOTE(review): "weight" looks like a typo for "width" — confirm before renaming.
  int64_t image_weight_;
};
} // namespace proposal
} // namespace mindspore
#endif // MINDSPORE_LITE_TOOLS_BENCHMARK_NNIE_PROPOSAL_PROPOSAL_FP32_H_

View File

@ -0,0 +1,77 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/proposal_infer.h"
#include <memory>
#include <vector>
#include "include/errorcode.h"
#include "src/proposal.h"
#include "include/api/format.h"
#include "include/registry/register_kernel_interface.h"
using mindspore::kernel::KernelInterface;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Custom;
namespace mindspore {
namespace proposal {
std::shared_ptr<KernelInterface> ProposalInferCreater() {
auto infer = std::make_shared<ProposalInterface>();
if (infer == nullptr) {
LOGE("new custom infer is nullptr");
return nullptr;
}
return infer;
}
// Shape inference for the Proposal custom op: every output becomes a
// float32 NCHW tensor of shape [-1, COORDI_NUM]; the first dimension (the
// ROI count) is only known after execution.
Status ProposalInterface::Infer(std::vector<mindspore::MSTensor> *inputs, std::vector<mindspore::MSTensor> *outputs,
                                const mindspore::schema::Primitive *primitive) {
  if (inputs->size() != 2) {
    LOGE("Inputs size less 2");
    return kLiteError;
  }
  if (outputs->size() == 0) {
    LOGE("Outputs size 0");
    return kLiteError;
  }
  if (primitive->value_type() != mindspore::schema::PrimitiveType_Custom) {
    LOGE("Primitive type is not PrimitiveType_Custom");
    return kLiteError;
  }
  size_t id = 0;
  while (id < outputs->size()) {
    // TODO(review): original author marked this "to be completed" — the
    // intent was to propagate the input tensor's format and data type
    // rather than hard-coding them below:
    // outputs[id]->format_ = input->format_;
    // outputs[id]->data_type_ = kNumberTypeFloat32;
    std::vector<int64_t> shape{-1, COORDI_NUM};
    (*outputs)[id].SetShape(shape);
    (*outputs)[id].SetDataType(DataType::kNumberTypeFloat32);
    (*outputs)[id].SetFormat(Format::NCHW);
    id++;
  }
  return kSuccess;
}
} // namespace proposal
} // namespace mindspore
namespace mindspore {
namespace kernel {
// Registers the shape-inference callback for the NNIE Proposal custom op.
REGISTER_CUSTOM_KERNEL_INTERFACE(NNIE, Proposal, proposal::ProposalInferCreater);
}  // namespace kernel
}  // namespace mindspore

View File

@ -0,0 +1,35 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_TOOLS_BENCHMARK_NNIE_PROPOSAL_PROPOSAL_INFER_H_
#define MINDSPORE_LITE_TOOLS_BENCHMARK_NNIE_PROPOSAL_PROPOSAL_INFER_H_
#include <vector>
#include "include/kernel_interface.h"
namespace mindspore {
namespace proposal {
// Shape-inference interface registered for the Proposal custom op.
class ProposalInterface : public mindspore::kernel::KernelInterface {
 public:
  ProposalInterface() {}
  ~ProposalInterface() = default;
  // Sets each output to a float32 NCHW tensor of shape [-1, COORDI_NUM].
  Status Infer(std::vector<mindspore::MSTensor> *inputs, std::vector<mindspore::MSTensor> *outputs,
               const mindspore::schema::Primitive *primitive) override;
};
} // namespace proposal
} // namespace mindspore
#endif // MINDSPORE_LITE_TOOLS_BENCHMARK_NNIE_PROPOSAL_PROPOSAL_INFER_H_

View File

@ -26,6 +26,12 @@ namespace lite {
int RunBenchmark(int argc, const char **argv) {
BenchmarkFlags flags;
Option<std::string> err = flags.ParseFlags(argc, argv);
#ifdef SUPPORT_NNIE
if (SvpSysInit() != RET_OK) {
std::cerr << "SVP Init failed" << std::endl;
return RET_ERROR;
}
#endif
if (err.IsSome()) {
std::cerr << err.Get() << std::endl;
std::cerr << flags.Usage() << std::endl;
@ -36,7 +42,9 @@ int RunBenchmark(int argc, const char **argv) {
std::cerr << flags.Usage() << std::endl;
return RET_OK;
}
#ifdef SUPPORT_NNIE
BenchmarkBase *benchmark = new (std::nothrow) Benchmark(&flags);
#else
auto api_type = std::getenv("MSLITE_API_TYPE");
if (api_type != nullptr) {
MS_LOG(INFO) << "MSLITE_API_TYPE = " << api_type;
@ -53,6 +61,7 @@ int RunBenchmark(int argc, const char **argv) {
BENCHMARK_LOG_ERROR("Invalid MSLITE_API_TYPE, (OLD/NEW/C, default:OLD)");
return RET_ERROR;
}
#endif
if (benchmark == nullptr) {
BENCHMARK_LOG_ERROR("new benchmark failed ");
return RET_ERROR;
@ -61,6 +70,7 @@ int RunBenchmark(int argc, const char **argv) {
auto status = benchmark->Init();
if (status != 0) {
BENCHMARK_LOG_ERROR("Benchmark init Error : " << status);
delete benchmark;
return RET_ERROR;
}
auto model_name = flags.model_file_.substr(flags.model_file_.find_last_of(DELIM_SLASH) + 1);
@ -68,6 +78,7 @@ int RunBenchmark(int argc, const char **argv) {
status = benchmark->RunBenchmark();
if (status != 0) {
BENCHMARK_LOG_ERROR("Run Benchmark " << model_name << " Failed : " << status);
delete benchmark;
return RET_ERROR;
}

View File

@ -53,58 +53,6 @@ function Run_Build_x86() {
fi
}
# Build arm32 for nnie
# Build the arm (aarch32/aarch64) MindSpore Lite package with NNIE support:
# unpack the release tarball, copy its runtime/ into the nnie and
# nnie_proposal third-party dirs, cross-compile with the selected toolchain,
# then inject the freshly built nnie/proposal libraries back into the tarball.
# Relies on globals: open_source_ms_path, nnie_code_path, package_name,
# toolchain_name, device_name, task, thread_num.
function Run_Build_arm() {
  # decompress release_pkg
  cd ${open_source_ms_path}/output/ || exit 1
  file_name=$(ls ./*linux-${package_name}.tar.gz)
  # package name layout: mindspore-lite-<version>-linux-<package_name>.tar.gz
  IFS="-" read -r -a file_name_array <<< "$file_name"
  version=${file_name_array[2]}
  tar -xf mindspore-lite-${version}-linux-${package_name}.tar.gz
  # cp runtime folder
  # NOTE(review): "third_patry" looks like a typo for "third_party", but the
  # build tree uses this spelling consistently — do not change one-sided.
  cd ${open_source_ms_path}/output/mindspore-lite-${version}-linux-${package_name} || exit 1
  rm -rf ${nnie_code_path}/mindspore/mindspore/lite/tools/benchmark/nnie/third_patry/runtime/
  mkdir -p ${nnie_code_path}/mindspore/mindspore/lite/tools/benchmark/nnie/third_patry/runtime/ || exit 1
  rm -rf ${nnie_code_path}/mindspore/mindspore/lite/tools/benchmark/nnie_proposal/third_patry/runtime/
  mkdir -p ${nnie_code_path}/mindspore/mindspore/lite/tools/benchmark/nnie_proposal/third_patry/runtime/ || exit 1
  cp -r ./runtime/ ${nnie_code_path}/mindspore/mindspore/lite/tools/benchmark/nnie/third_patry/
  cp -r ./runtime/ ${nnie_code_path}/mindspore/mindspore/lite/tools/benchmark/nnie_proposal/third_patry/
  # compile nnie runtime so
  export TOOLCHAIN_NAME=${toolchain_name}
  export TOOLCHAIN_FILE=${open_source_ms_path}/mindspore/lite/cmake/${toolchain_name}.toolchain.cmake
  export MSLITE_REGISTRY_DEVICE=${device_name}
  # disable gpu & npu & train
  export MSLITE_GPU_BACKEND=off
  export MSLITE_ENABLE_NPU=off
  export MSLITE_ENABLE_TRAIN=off
  export MSLITE_ENABLE_NNIE=on
  bash ${nnie_code_path}/mindspore/build.sh -I ${task} -e cpu -j ${thread_num}
  if [ $? = 0 ]; then
    echo "build arm for nnie success"
    # repack: copy the new benchmark binary and nnie/proposal libraries into
    # the release layout, then regenerate the tarball and its sha256.
    release_path=${open_source_ms_path}/output/mindspore-lite-${version}-linux-${package_name}/providers/${device_name}/
    rm -rf ${release_path}
    mkdir -p ${release_path}
    mkdir -p ${open_source_ms_path}/output/mindspore-lite-${version}-linux-${package_name}/tools/benchmark/
    cp ${nnie_code_path}/mindspore/mindspore/lite/build/tools/benchmark/benchmark ${open_source_ms_path}/output/mindspore-lite-${version}-linux-${package_name}/tools/benchmark/ || exit 1
    cp ${nnie_code_path}/mindspore/mindspore/lite/build/tools/benchmark/nnie/libmslite_nnie.so ${release_path}/ || exit 1
    cp ${nnie_code_path}/mindspore/mindspore/lite/build/tools/benchmark/nnie_proposal/libmslite_proposal.so ${release_path}/ || exit 1
    # the micro nnie library only exists for the Hi3516D device
    if [ ${device_name} == "Hi3516D" ]; then
      cp ${nnie_code_path}/mindspore/mindspore/lite/micro/example/hi3516d/libmicro_nnie.so ${release_path}/ || exit 1
    fi
    echo "cp new nnie so to release pkg success"
    cd ${open_source_ms_path}/output/ || exit 1
    rm ${open_source_ms_path}/output/mindspore-lite-${version}-linux-${package_name}.tar.gz
    tar -zcf ./mindspore-lite-${version}-linux-${package_name}.tar.gz ./mindspore-lite-${version}-linux-${package_name}/ || exit 1
    sha256sum ./mindspore-lite-${version}-linux-${package_name}.tar.gz > ./mindspore-lite-${version}-linux-${package_name}.tar.gz.sha256 || exit 1
  else
    echo "build arm for nnie failed"; return 1
  fi
}
# bashpath should be /home/jenkins/agent-working-dir/workspace/Compile_Lite_ARM32_3516D/
basepath=$(pwd)
echo "basepath is ${basepath}"
@ -123,12 +71,8 @@ while getopts "I:b:j:t:d:" opt; do
echo "branch name is ${OPTARG}"
;;
t)
toolchain_name=${OPTARG}
echo "toolchain_name is ${OPTARG}"
;;
d)
device_name=${OPTARG}
echo "device_name is ${OPTARG}"
;;
j)
thread_num=${OPTARG}
@ -163,14 +107,6 @@ fi
if [ ${task} == "x86_64" ]; then
echo "start building x86 for nnie..."
Run_Build_x86
elif [ ${task} == "arm32" ]; then
echo "start building arm32 for nnie..."
package_name=aarch32
Run_Build_arm
elif [ ${task} == "arm64" ]; then
echo "start building arm64 for nnie..."
package_name=aarch64
Run_Build_arm
fi
Run_build_PID=$!

View File

@ -1,81 +0,0 @@
#!/bin/bash
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# Unpacks the freshly built release tarball from output/ and copies its
# runtime/ directory into the dpico benchmark third_party directory.
# Exits the whole script on any failure.
# Relies on globals: mindspore_lite_top_dir, mindspore_top_dir.
prepare_third_party() {
  dpico_third_party=${mindspore_lite_top_dir}/tools/benchmark/dpico/third_party
  rm -rf ${dpico_third_party} || exit 1
  mkdir -p ${dpico_third_party} || exit 1
  cd ${mindspore_top_dir}/output || exit 1
  file_name=$(ls *tar.gz)
  # strip the .tar.gz suffix to get the package directory name
  tar_name=${file_name%%.tar.gz}
  tar xzvf ${tar_name}.tar.gz || exit 1
  cd ..
  cp -rf ${mindspore_top_dir}/output/${tar_name}/runtime/ ${dpico_third_party} || exit 1
}
# Build arm64 for dpico
# Injects the dpico adapter library into the unpacked release package under
# providers/SD3403/, then regenerates the tarball and its sha256 checksum.
# Exits the whole script on any failure.
# Relies on globals: mindspore_top_dir, basepath, dpico_third_party.
make_dpico_benchmark_package() {
  cd ${mindspore_top_dir}/output || exit 1
  file_name=$(ls *tar.gz)
  tar_name=${file_name%%.tar.gz}
  dpico_sd3403_release_path=${mindspore_top_dir}/output/${tar_name}/providers/SD3403/
  mkdir -p ${dpico_sd3403_release_path}
  dpico_benchmark_path=${mindspore_top_dir}/mindspore/lite/build/tools/benchmark
  cp ${dpico_benchmark_path}/dpico/libdpico_acl_adapter.so ${dpico_sd3403_release_path} || exit 1
  echo "install dpico adapter so success."
  # repack: replace the original tarball with one containing the adapter
  rm ${tar_name}.tar.gz || exit 1
  tar -zcf ${tar_name}.tar.gz ${tar_name} || exit 1
  rm -rf ${tar_name} || exit 1
  sha256sum ${tar_name}.tar.gz > ${tar_name}.tar.gz.sha256 || exit 1
  echo "generate dpico package success!"
  cd ${basepath}
  rm -rf ${dpico_third_party} || exit 1
}
# Entry point: parse -t <task>; "prepare_third_party" stages the runtime for
# the dpico build, anything else repacks the release with the dpico adapter.
basepath=$(pwd)
echo "basepath is ${basepath}"
#set -e
mindspore_top_dir=${basepath}
mindspore_lite_top_dir=${mindspore_top_dir}/mindspore/lite
while getopts "t:" opt; do
  case ${opt} in
    t)
      task=${OPTARG}
      echo "compile task is ${OPTARG}"
      ;;
    ?)
      echo "unknown para"
      exit 1;;
  esac
done
if [[ ${task} == "prepare_third_party" ]]; then
  prepare_third_party
  if [ $? -eq 1 ]; then
    echo "prepare third party failed"
    # fix: 'return' is only valid inside a function or a sourced script;
    # in an executed script it just prints an error and continues.
    exit 1
  fi
else
  echo "start make package for dpico..."
  # run in the background so the exit status can be collected via wait
  make_dpico_benchmark_package &
  make_dpico_benchmark_package_pid=$!
  sleep 1
  wait ${make_dpico_benchmark_package_pid}
  make_dpico_benchmark_package_status=$?
  exit ${make_dpico_benchmark_package_status}
fi