From a9c90f71e45035c49b2a3e3ffebdb3cdf21d8121 Mon Sep 17 00:00:00 2001
From: yeyunpeng <yeyunpeng2020@huawei.com>
Date: Fri, 13 Nov 2020 09:34:48 +0800
Subject: [PATCH] target link fp16 and optimize to lite

---
 build.sh                                      |  2 -
 cmake/package_lite.cmake                      |  2 -
 mindspore/lite/nnacl/optimize/CMakeLists.txt  |  6 +-
 mindspore/lite/nnacl/optimized_kernel.h       | 99 -------------------
 mindspore/lite/src/CMakeLists.txt             | 21 +---
 mindspore/lite/src/common/utils.cc            | 34 +++++++
 mindspore/lite/src/common/utils.h             |  3 +
 mindspore/lite/src/kernel_registry.cc         | 16 ++-
 .../kernel/arm/fp16/convolution_1x1_fp16.h    |  2 +-
 .../kernel/arm/fp16/convolution_base_fp16.h   |  2 +-
 .../arm/fp16/convolution_winograd_fp16.h      |  2 +-
 .../{fp16_op_handler.cc => fp16_op_handler.h} |  2 +
 .../kernel/arm/fp16/quant_dtype_cast_fp16.cc  |  6 +-
 .../kernel/arm/int8/convolution_1x1_int8.cc   | 18 ++--
 .../kernel/arm/int8/convolution_1x1_int8.h    |  2 +-
 .../kernel/arm/int8/convolution_int8.cc       | 21 ++--
 .../kernel/arm/int8/convolution_int8.h        |  2 +-
 .../kernel/arm/int8/deconvolution_int8.cc     | 18 +---
 .../runtime/kernel/arm/int8/opt_op_handler.cc |  2 +-
 .../runtime/kernel/arm/int8/opt_op_handler.h  | 40 ++++++++
 mindspore/lite/src/scheduler.cc               |  3 +-
 mindspore/lite/src/sub_graph_kernel.cc        | 25 +++--
 mindspore/lite/src/sub_graph_kernel.h         | 33 +------
 mindspore/lite/test/CMakeLists.txt            | 13 +--
 mindspore/lite/test/run_benchmark_nets.sh     |  2 -
 mindspore/lite/tools/converter/CMakeLists.txt |  1 +
 26 files changed, 141 insertions(+), 236 deletions(-)
 delete mode 100644 mindspore/lite/nnacl/optimized_kernel.h
 rename mindspore/lite/src/runtime/kernel/arm/fp16/{fp16_op_handler.cc => fp16_op_handler.h} (97%)
 create mode 100644 mindspore/lite/src/runtime/kernel/arm/int8/opt_op_handler.h

diff --git a/build.sh b/build.sh
index f8b6298bbe3..42199dbefd2 100755
--- a/build.sh
+++ b/build.sh
@@ -752,8 +752,6 @@ build_lite_java_arm64() {
     [ -n "${JAVA_PATH}" ] && rm -rf ${JAVA_PATH}/java/app/libs/arm64-v8a/
     mkdir -p ${JAVA_PATH}/java/app/libs/arm64-v8a/
     cp ${BASEPATH}/output/mindspore-lite-${VERSION_STR}-runtime-arm64-cpu/lib/libmindspore-lite.so ${JAVA_PATH}/java/app/libs/arm64-v8a/
-    cp ${BASEPATH}/output/mindspore-lite-${VERSION_STR}-runtime-arm64-cpu/lib/libmindspore-lite-fp16.so ${JAVA_PATH}/java/app/libs/arm64-v8a/
-    cp ${BASEPATH}/output/mindspore-lite-${VERSION_STR}-runtime-arm64-cpu/lib/libmindspore-lite-optimize.so ${JAVA_PATH}/java/app/libs/arm64-v8a/
     echo mindspore-lite-${VERSION_STR}-runtime-arm64-cpu
     [ -n "${VERSION_STR}" ] && rm -rf mindspore-lite-${VERSION_STR}-runtime-arm64-cpu
 }
diff --git a/cmake/package_lite.cmake b/cmake/package_lite.cmake
index e8021e96576..055606e95b4 100644
--- a/cmake/package_lite.cmake
+++ b/cmake/package_lite.cmake
@@ -77,8 +77,6 @@ if (PLATFORM_ARM64)
     install(FILES ${TOP_DIR}/mindspore/lite/build/src/libmindspore-lite.a DESTINATION ${LIB_DIR} COMPONENT ${COMPONENT_NAME})
     install(FILES ${TOP_DIR}/mindspore/core/ir/dtype/type_id.h DESTINATION ${INC_DIR}/ir/dtype COMPONENT ${COMPONENT_NAME})
     install(DIRECTORY ${TOP_DIR}/mindspore/lite/build/schema/ DESTINATION ${INC_DIR}/schema COMPONENT ${COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "inner" EXCLUDE)
-    install(FILES ${TOP_DIR}/mindspore/lite/build/src/libmindspore-lite-optimize.so DESTINATION ${LIB_DIR} COMPONENT ${COMPONENT_NAME})
-    install(FILES ${TOP_DIR}/mindspore/lite/build/src/libmindspore-lite-fp16.so DESTINATION ${LIB_DIR} COMPONENT ${COMPONENT_NAME})
     install(DIRECTORY ${flatbuffers_INC} DESTINATION ${FLATBF_DIR} COMPONENT ${COMPONENT_NAME})
     if (ENABLE_TOOLS)
         install(TARGETS benchmark RUNTIME DESTINATION ${MAIN_DIR}-${COMPONENT_NAME}/benchmark COMPONENT ${COMPONENT_NAME})
diff --git a/mindspore/lite/nnacl/optimize/CMakeLists.txt b/mindspore/lite/nnacl/optimize/CMakeLists.txt
index e6acd66df74..c3bf1d8c9e7 100644
--- a/mindspore/lite/nnacl/optimize/CMakeLists.txt
+++ b/mindspore/lite/nnacl/optimize/CMakeLists.txt
@@ -21,8 +21,6 @@ string(REPLACE "-fvisibility=hidden" "-fvisibility=default" CMAKE_C_FLAGS "${CMA
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8.2-a+dotprod+fp16")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8.2-a+dotprod+fp16")
 
-add_library(nnacl_optimize STATIC ${SDOT_FILES})
-target_link_libraries(nnacl_optimize mindspore-lite)
+add_library(nnacl_optimize_mid OBJECT ${SDOT_FILES})
 
-add_library(nnacl_fp16 STATIC ${FP16_FILES})
-target_link_libraries(nnacl_fp16 mindspore-lite)
+add_library(nnacl_fp16_mid OBJECT ${FP16_FILES})
diff --git a/mindspore/lite/nnacl/optimized_kernel.h b/mindspore/lite/nnacl/optimized_kernel.h
deleted file mode 100644
index d070ec26d57..00000000000
--- a/mindspore/lite/nnacl/optimized_kernel.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_LITE_NNACL_OPTIMIZED_KERNEL_H_
-#define MINDSPORE_LITE_NNACL_OPTIMIZED_KERNEL_H_
-
-#ifndef _WIN32
-#include <dlfcn.h>
-#endif
-#ifdef __ANDROID__
-#include <asm/hwcap.h>
-#include "nnacl/nnacl_utils.h"
-#endif
-#include "src/common/log_adapter.h"
-
-#define OPTIMIZE_SHARED_LIBRARY_PATH "libmindspore-lite-optimize.so"
-#define FLOAT16_SHARED_LIBRARY_PATH "libmindspore-lite-fp16.so"
-
-class OptimizeModule {
- public:
-  OptimizeModule() {
-    bool support_optimize_ops = false;
-
-#ifdef ENABLE_ARM64
-    int hwcap_type = 16;
-    uint32_t hwcap = getHwCap(hwcap_type);
-    if (hwcap & HWCAP_ASIMDDP) {
-      MS_LOG(INFO) << "Hw cap support SMID Dot Product, hwcap: 0x" << hwcap;
-      support_optimize_ops = true;
-    } else {
-      MS_LOG(INFO) << "Hw cap NOT support SIMD Dot Product, hwcap: 0x" << hwcap;
-    }
-#endif
-    if (support_optimize_ops == false) {
-      return;
-    }
-#ifdef ENABLE_ARM64
-    optimized_op_handler_ = dlopen(OPTIMIZE_SHARED_LIBRARY_PATH, RTLD_LAZY);
-    if (optimized_op_handler_ == nullptr) {
-      MS_LOG(INFO) << "Open optimize shared library failed: " << dlerror();
-    }
-#endif
-  }
-
-  ~OptimizeModule() = default;
-
-  static OptimizeModule *GetInstance() {
-    static OptimizeModule opt_module;
-    return &opt_module;
-  }
-  void *optimized_op_handler_ = nullptr;
-};
-
-class Float16Module {
- public:
-  Float16Module() {
-    bool support_fp16 = false;
-#ifdef ENABLE_ARM64
-    int hwcap_type = 16;
-    uint32_t hwcap = getHwCap(hwcap_type);
-    if (hwcap & HWCAP_FPHP) {
-      MS_LOG(INFO) << "Hw cap support FP16, hwcap: 0x" << hwcap;
-      support_fp16 = true;
-    }
-#endif
-    if (support_fp16 == false) {
-      return;
-    }
-#ifdef ENABLE_ARM64
-    float16_op_handler_ = dlopen(FLOAT16_SHARED_LIBRARY_PATH, RTLD_LAZY);
-    if (float16_op_handler_ == nullptr) {
-      MS_LOG(INFO) << "Open optimize shared library failed: " << dlerror();
-    }
-#endif
-  }
-
-  ~Float16Module() = default;
-
-  static Float16Module *GetInstance() {
-    static Float16Module fp16_module;
-    return &fp16_module;
-  }
-  void *float16_op_handler_ = nullptr;
-};
-
-#endif  // MINDSPORE_LITE_NNACL_OPTIMIZED_KERNEL_H_
diff --git a/mindspore/lite/src/CMakeLists.txt b/mindspore/lite/src/CMakeLists.txt
index b73e287955c..311723204f1 100644
--- a/mindspore/lite/src/CMakeLists.txt
+++ b/mindspore/lite/src/CMakeLists.txt
@@ -17,6 +17,7 @@ if (PLATFORM_ARM32 OR PLATFORM_ARM64)
 endif ()
 
 set(LITE_SRC
+        ${CMAKE_CURRENT_SOURCE_DIR}/common/utils.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/common/graph_util.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/common/log_adapter.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/common/string_util.cc
@@ -114,22 +115,10 @@ endif ()
 
 ########################## build optimize and float16 library #################################3
 if (PLATFORM_ARM64)
-    add_library(mindspore-lite-optimize SHARED)
-    target_link_libraries(mindspore-lite-optimize cpu_opt_kernel_mid)
-    target_link_libraries(mindspore-lite-optimize nnacl_optimize)
+    target_link_libraries(mindspore-lite cpu_opt_kernel_mid nnacl_optimize_mid)
+    target_link_libraries(mindspore-lite_static cpu_opt_kernel_mid nnacl_optimize_mid)
 
-    add_library(mindspore-lite-fp16 SHARED)
-    target_link_libraries(mindspore-lite-fp16 cpu_fp16_kernel_mid)
-    target_link_libraries(mindspore-lite-fp16 nnacl_fp16)
-endif ()
-
-if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release" AND (PLATFORM_ARM64))
-    add_custom_command(TARGET mindspore-lite-optimize POST_BUILD COMMAND
-            ${ANDROID_NDK}/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/aarch64-linux-android/bin/strip
-            ${CMAKE_BINARY_DIR}/src/libmindspore-lite-optimize.so)
-
-    add_custom_command(TARGET mindspore-lite-fp16 POST_BUILD COMMAND
-            ${ANDROID_NDK}/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/aarch64-linux-android/bin/strip
-            ${CMAKE_BINARY_DIR}/src/libmindspore-lite-fp16.so)
+    target_link_libraries(mindspore-lite cpu_fp16_kernel_mid nnacl_fp16_mid)
+    target_link_libraries(mindspore-lite_static cpu_fp16_kernel_mid nnacl_fp16_mid)
 endif ()
 
diff --git a/mindspore/lite/src/common/utils.cc b/mindspore/lite/src/common/utils.cc
index da8534422b4..ed54fe2c678 100644
--- a/mindspore/lite/src/common/utils.cc
+++ b/mindspore/lite/src/common/utils.cc
@@ -16,6 +16,7 @@
 
 #ifdef __ANDROID__
 #include <sys/auxv.h>
+#include <asm/hwcap.h>
 #endif
 #include "src/common/utils.h"
 
@@ -257,5 +258,38 @@ uint32_t getHwCap(int hwcap_type) {
   return ret;
 }
 #endif
+
+bool IsSupportSDot() {
+  bool status = false;
+#ifdef ENABLE_ARM64
+  int hwcap_type = 16;
+  uint32_t hwcap = getHwCap(hwcap_type);
+  if (hwcap & HWCAP_ASIMDDP) {
+    MS_LOG(DEBUG) << "Hw cap support SMID Dot Product, hwcap: 0x" << hwcap;
+    status = true;
+  } else {
+    MS_LOG(DEBUG) << "Hw cap NOT support SIMD Dot Product, hwcap: 0x" << hwcap;
+    status = false;
+  }
+#endif
+  return status;
+}
+
+bool IsSupportFloat16() {
+  bool status = false;
+#ifdef ENABLE_ARM64
+  int hwcap_type = 16;
+  uint32_t hwcap = getHwCap(hwcap_type);
+  if (hwcap & HWCAP_FPHP) {
+    MS_LOG(DEBUG) << "Hw cap support FP16, hwcap: 0x" << hwcap;
+    status = true;
+  } else {
+    MS_LOG(DEBUG) << "Hw cap NOT support FP16, hwcap: 0x" << hwcap;
+    status = false;
+  }
+#endif
+  return status;
+}
+
 }  // namespace lite
 }  // namespace mindspore
diff --git a/mindspore/lite/src/common/utils.h b/mindspore/lite/src/common/utils.h
index 245b32ece32..43a37bd3c20 100644
--- a/mindspore/lite/src/common/utils.h
+++ b/mindspore/lite/src/common/utils.h
@@ -44,6 +44,9 @@ void ShortToFloat32(const int16_t *srcdata, float *dstdata, size_t elementSize);
 
 void Float32ToShort(const float *srcdata, int16_t *dstdata, size_t elementSize);
 
+bool IsSupportSDot();
+
+bool IsSupportFloat16();
 #if defined(__arm__) || defined(__aarch64__)
 uint32_t getHwCap(int hwcap_type);
 #endif
diff --git a/mindspore/lite/src/kernel_registry.cc b/mindspore/lite/src/kernel_registry.cc
index 4ed813a482c..30c5dc1ec6b 100644
--- a/mindspore/lite/src/kernel_registry.cc
+++ b/mindspore/lite/src/kernel_registry.cc
@@ -20,7 +20,7 @@
 #include <asm/hwcap.h>
 #include "common/utils.h"
 #include "src/common/log_adapter.h"
-#include "nnacl/optimized_kernel.h"
+#include "src/common/utils.h"
 #endif
 
 using mindspore::kernel::kCPU;
@@ -36,17 +36,15 @@ KernelRegistry *KernelRegistry::GetInstance() {
 
 int KernelRegistry::Init() {
 #ifdef ENABLE_ARM64
-  void *optimized_lib_handler = OptimizeModule::GetInstance()->optimized_op_handler_;
-  if (optimized_lib_handler != nullptr) {
-    MS_LOG(INFO) << "load optimize lib success.";
+  if (mindspore::lite::IsSupportSDot()) {
+    MS_LOG(INFO) << "The current device supports Sdot.";
   } else {
-    MS_LOG(INFO) << "load optimize lib failed.";
+    MS_LOG(INFO) << "The current device NOT supports Sdot.";
   }
-  void *float16_op_handler = Float16Module::GetInstance()->float16_op_handler_;
-  if (float16_op_handler != nullptr) {
-    MS_LOG(INFO) << "load float16 lib success.";
+  if (mindspore::lite::IsSupportFloat16()) {
+    MS_LOG(INFO) << "The current device supports float16.";
   } else {
-    MS_LOG(INFO) << "load float16 lib failed.";
+    MS_LOG(INFO) << "The current device NOT supports float16.";
   }
 #endif
   return RET_OK;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
index 318fa37e59a..78b3c95a417 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
@@ -21,7 +21,7 @@
 #include <vector>
 #include "src/lite_kernel.h"
 #include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
-#include "nnacl/optimized_kernel.h"
+#include "src/common/utils.h"
 #include "nnacl/matmul_parameter.h"
 #include "nnacl/fp16/matmul_fp16.h"
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h
index ab2d10c155b..972795cd121 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h
@@ -21,7 +21,7 @@
 #include <vector>
 #include "src/lite_kernel.h"
 #include "src/runtime/kernel/arm/base/convolution_base.h"
-#include "nnacl/optimized_kernel.h"
+#include "src/common/utils.h"
 
 namespace mindspore::kernel {
 class ConvolutionBaseFP16CPUKernel : public ConvolutionBaseCPUKernel {
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
index 813cafbc10c..567e5a7a9fe 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
@@ -23,7 +23,7 @@
 #include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
 #include "nnacl/fp16/conv_fp16.h"
 #include "nnacl/fp16/winograd_utils_fp16.h"
-#include "nnacl/optimized_kernel.h"
+#include "src/common/utils.h"
 #include "nnacl/minimal_filtering_generator.h"
 
 namespace mindspore::kernel {
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/fp16_op_handler.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/fp16_op_handler.h
similarity index 97%
rename from mindspore/lite/src/runtime/kernel/arm/fp16/fp16_op_handler.cc
rename to mindspore/lite/src/runtime/kernel/arm/fp16/fp16_op_handler.h
index 715513b8ba9..636969c4f59 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/fp16_op_handler.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/fp16_op_handler.h
@@ -14,7 +14,9 @@
  * limitations under the License.
  */
 
+#ifdef ENABLE_ARM64
 #include <arm_neon.h>
+#endif
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/quant_dtype_cast_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/quant_dtype_cast_fp16.cc
index f6f9d435fa3..0879b04edf2 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/quant_dtype_cast_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/quant_dtype_cast_fp16.cc
@@ -102,11 +102,11 @@ int QuantDTypeCastFp16CPUKernel::QuantDTypeCast(int task_id) {
   return RET_OK;
 }
 
-int QuantDTypeCastRun(void *cdata, int task_id) {
+int QuantDTypeCastFP16Run(void *cdata, int task_id) {
   auto g_kernel = reinterpret_cast<QuantDTypeCastFp16CPUKernel *>(cdata);
   auto ret = g_kernel->QuantDTypeCast(task_id);
   if (ret != RET_OK) {
-    MS_LOG(ERROR) << "QuantDTypeCastRun error task_id[" << task_id << "] error_code[" << ret << "]";
+    MS_LOG(ERROR) << "QuantDTypeCastFP16Run error task_id[" << task_id << "] error_code[" << ret << "]";
     return RET_ERROR;
   }
   return RET_OK;
@@ -126,7 +126,7 @@ int QuantDTypeCastFp16CPUKernel::Run() {
     return RET_ERROR;
   }
 
-  auto ret = ParallelLaunch(this->context_->thread_pool_, QuantDTypeCastRun, this, thread_n_num_);
+  auto ret = ParallelLaunch(this->context_->thread_pool_, QuantDTypeCastFP16Run, this, thread_n_num_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Scale error error_code[" << ret << "]";
     return RET_ERROR;
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
index 7b1f38d510c..3b3aa30cb91 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
@@ -17,6 +17,9 @@
 #include "src/runtime/kernel/arm/int8/convolution_1x1_int8.h"
 #include "src/runtime/runtime_api.h"
 #include "src/common/file_utils.h"
+#ifdef ENABLE_ARM64
+#include "src/runtime/kernel/arm/int8/opt_op_handler.h"
+#endif
 
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_MEMORY_FAILED;
@@ -74,18 +77,9 @@ void Convolution1x1Int8CPUKernel::CheckSupportOptimize() {
   support_optimize_ = false;
   matmul_func_ = MatMulInt8_8x8_r;
 #ifdef ENABLE_ARM64
-  void *optimize_op_handler = OptimizeModule::GetInstance()->optimized_op_handler_;
-  if (optimize_op_handler != nullptr) {
-    dlerror();
-    *(reinterpret_cast<void **>(&matmul_func_)) = dlsym(optimize_op_handler, "MatMulRInt8_optimize_handler");
-    auto dlopen_error = dlerror();
-    if (dlopen_error != nullptr) {
-      MS_LOG(ERROR) << "load matmul func failed! " << dlopen_error << ".";
-      support_optimize_ = false;
-      matmul_func_ = nullptr;
-    } else {
-      support_optimize_ = true;
-    }
+  if (mindspore::lite::IsSupportSDot()) {
+    support_optimize_ = true;
+    matmul_func_ = MatMulRInt8_optimize_handler;
   } else {
     support_optimize_ = false;
     matmul_func_ = nullptr;
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h
index 7543ce55cb5..e224cb98030 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h
@@ -25,7 +25,7 @@
 #include "nnacl/int8/conv_int8.h"
 #include "nnacl/int8/matmul_int8.h"
 #include "nnacl/matmul_parameter.h"
-#include "nnacl/optimized_kernel.h"
+#include "src/common/utils.h"
 
 namespace mindspore::kernel {
 class Convolution1x1Int8CPUKernel : public ConvolutionBaseCPUKernel {
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc
index e6f78016481..39591176bbb 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc
@@ -23,6 +23,9 @@
 #include "src/runtime/kernel/arm/int8/convolution_1x1_int8.h"
 #include "src/runtime/kernel/arm/int8/convolution_3x3_int8.h"
 #include "src/runtime/runtime_api.h"
+#ifdef ENABLE_ARM64
+#include "src/runtime/kernel/arm/int8/opt_op_handler.h"
+#endif
 
 using mindspore::kernel::KERNEL_ARCH::kCPU;
 using mindspore::lite::KernelRegistrar;
@@ -39,18 +42,9 @@ void ConvolutionInt8CPUKernel::CheckSupportOptimize() {
 #endif
 
 #ifdef ENABLE_ARM64
-  void *optimize_op_handler = OptimizeModule::GetInstance()->optimized_op_handler_;
-  if (optimize_op_handler != nullptr) {
-    dlerror();
-    *(reinterpret_cast<void **>(&matmul_func_)) = dlsym(optimize_op_handler, "MatMulRInt8_optimize_handler");
-    auto dlopen_error = dlerror();
-    if (dlopen_error != nullptr) {
-      MS_LOG(ERROR) << "load matmul func failed! " << dlopen_error << ".";
-      support_optimize_ = false;
-      tile_num_ = 4;
-    } else {
-      support_optimize_ = true;
-    }
+  if (mindspore::lite::IsSupportSDot()) {
+    matmul_func_ = MatMulRInt8_optimize_handler;
+    support_optimize_ = true;
   } else {
     tile_num_ = 4;
     support_optimize_ = false;
@@ -260,8 +254,7 @@ kernel::LiteKernel *CpuConvInt8KernelCreator(const std::vector<lite::Tensor *> &
   kernel::LiteKernel *kernel;
   if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) {
 #ifdef ENABLE_ARM64
-    void *optimize_op_handler = OptimizeModule::GetInstance()->optimized_op_handler_;
-    if (optimize_op_handler != nullptr) {
+    if (mindspore::lite::IsSupportSDot()) {
       kernel = new (std::nothrow) kernel::ConvolutionInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
     } else {
       kernel = new (std::nothrow) kernel::Convolution3x3Int8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h
index e838164c16a..0f8a4e2cf76 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h
@@ -20,7 +20,7 @@
 #include <vector>
 #include "src/lite_kernel.h"
 #include "src/runtime/kernel/arm/base/convolution_base.h"
-#include "nnacl/optimized_kernel.h"
+#include "src/common/utils.h"
 #include "nnacl/int8/conv_int8.h"
 
 namespace mindspore::kernel {
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.cc
index 69f111f30df..d9504cb72f8 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.cc
@@ -16,7 +16,8 @@
 
 #include "src/runtime/kernel/arm/int8/deconvolution_int8.h"
 #include "src/runtime/runtime_api.h"
-#include "nnacl/optimized_kernel.h"
+#include "src/common/utils.h"
+#include "src/runtime/kernel/arm/int8/opt_op_handler.h"
 
 using mindspore::kernel::KERNEL_ARCH::kCPU;
 using mindspore::lite::KernelRegistrar;
@@ -95,18 +96,9 @@ void DeConvInt8CPUKernel::CheckSupportOptimize() {
   support_optimize_ = true;
   matmul_func_ = MatMulInt8_16x4;
 #ifdef ENABLE_ARM64
-  void *optimize_op_handler = OptimizeModule::GetInstance()->optimized_op_handler_;
-  if (optimize_op_handler != nullptr) {
-    dlerror();
-    *(reinterpret_cast<void **>(&matmul_func_)) = dlsym(optimize_op_handler, "MatMulR4Int8_optimize_handler");
-    auto dlopen_error = dlerror();
-    if (dlopen_error != nullptr) {
-      MS_LOG(ERROR) << "load matmul func failed! " << dlopen_error << ".";
-      support_optimize_ = false;
-      matmul_func_ = MatMulR4Int8Neon64;
-    } else {
-      support_optimize_ = true;
-    }
+  if (mindspore::lite::IsSupportSDot()) {
+    support_optimize_ = true;
+    matmul_func_ = MatMulR4Int8_optimize_handler;
   } else {
     support_optimize_ = false;
     matmul_func_ = MatMulR4Int8Neon64;
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/opt_op_handler.cc b/mindspore/lite/src/runtime/kernel/arm/int8/opt_op_handler.cc
index f8527744ba1..46e242bdecd 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/opt_op_handler.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/opt_op_handler.cc
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
+#include "src/runtime/kernel/arm/int8/opt_op_handler.h"
 #include <stdlib.h>
-#include <stdbool.h>
 #include "nnacl/op_base.h"
 
 #ifdef __cplusplus
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/opt_op_handler.h b/mindspore/lite/src/runtime/kernel/arm/int8/opt_op_handler.h
new file mode 100644
index 00000000000..1e273706e1e
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/opt_op_handler.h
@@ -0,0 +1,40 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdlib.h>
+#include <stdbool.h>
+#include "nnacl/op_base.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+#ifdef ENABLE_ARM64
+void IndirectGemmInt8_optimize_handler(int8_t *dst, const int8_t *src, const int8_t *weight, const int32_t *bias,
+                                       size_t ksize, size_t ic4, size_t output_channel, size_t offset,
+                                       const int32_t *input_sum, size_t act_min, size_t act_max, size_t out_zp,
+                                       int32_t *out_multiplier, int32_t *shift_before, int32_t *shift_after,
+                                       size_t asymmetric, size_t per_channel, size_t per_channel_offset);
+void MatMulR4Int8_optimize_handler(const int8_t *a, const int8_t *b, int *dst, int row4, int col4, int deep16,
+                                   const int *input_sum, const int *bias);
+
+void MatMulRInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
+                                  size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
+                                  int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini,
+                                  int32_t maxi, size_t per_channel);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/mindspore/lite/src/scheduler.cc b/mindspore/lite/src/scheduler.cc
index 9b5eed568dc..8cc4fe74af6 100644
--- a/mindspore/lite/src/scheduler.cc
+++ b/mindspore/lite/src/scheduler.cc
@@ -275,7 +275,8 @@ kernel::LiteKernel *Scheduler::ScheduleNode(const std::vector<Tensor *> &in_tens
     }
   }
 #endif
-  if ((context_->IsCpuFloat16Enabled() && data_type == kNumberTypeFloat32) || data_type == kNumberTypeFloat16) {
+  if (mindspore::lite::IsSupportFloat16() &&
+      ((context_->IsCpuFloat16Enabled() && data_type == kNumberTypeFloat32) || data_type == kNumberTypeFloat16)) {
     kernel::KernelKey fp16_cpu_desc{desc.arch, kNumberTypeFloat16, desc.type};
     auto *kernel =
       KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, primitive, context_, fp16_cpu_desc);
diff --git a/mindspore/lite/src/sub_graph_kernel.cc b/mindspore/lite/src/sub_graph_kernel.cc
index 1795957946c..871d18fd114 100644
--- a/mindspore/lite/src/sub_graph_kernel.cc
+++ b/mindspore/lite/src/sub_graph_kernel.cc
@@ -17,7 +17,8 @@
 #include "src/sub_graph_kernel.h"
 #include "src/tensor.h"
 #ifdef ENABLE_ARM64
-#include "nnacl/optimized_kernel.h"
+#include "src/common/utils.h"
+#include "src/runtime/kernel/arm/fp16/fp16_op_handler.h"
 #endif
 
 namespace mindspore::kernel {
@@ -183,9 +184,9 @@ void CpuFp16SubGraph::FreeOriginInputData() {
 }
 
 int CpuFp16SubGraph::PreProcess() {
-  auto fp32_to_fp16_cast_func = Float16CastUtil::GetInstance()->float32_to_float16_func_;
-  if (fp32_to_fp16_cast_func == nullptr) {
-    MS_LOG(ERROR) << "Can not find cast fp32 to fp16 func";
+#ifdef ENABLE_ARM64
+  if (!mindspore::lite::IsSupportFloat16()) {
+    MS_LOG(ERROR) << "Unsupport fp16 in this devices";
     return RET_ERROR;
   }
   MS_ASSERT(origin_input_data_.empty());
@@ -203,7 +204,7 @@ int CpuFp16SubGraph::PreProcess() {
         return RET_ERROR;
       }
       MS_ASSERT(tensor->data_c() != nullptr);
-      fp32_to_fp16_cast_func(float32_data, tensor->data_c(), tensor->ElementsNum());
+      Float32ToFloat16_fp16_handler(float32_data, tensor->data_c(), tensor->ElementsNum());
       auto *data_store = DataStore::CreateDataStore(float32_data, tensor->allocator(), this->context_->allocator.get());
       if (data_store == nullptr) {
         MS_LOG(ERROR) << "Create DataStore failed";
@@ -223,12 +224,15 @@ int CpuFp16SubGraph::PreProcess() {
     }
   }
   return RET_OK;
+#else
+  return RET_OK;
+#endif
 }
 
 int CpuFp16SubGraph::PostProcess() {
-  auto fp16_to_fp32_cast_func = Float16CastUtil::GetInstance()->float16_to_float32_func_;
-  if (fp16_to_fp32_cast_func == nullptr) {
-    MS_LOG(ERROR) << "Can not find cast fp16 to fp32 func";
+#ifdef ENABLE_ARM64
+  if (!mindspore::lite::IsSupportFloat16()) {
+    MS_LOG(ERROR) << "Unsupport fp16 in this devices";
     return RET_ERROR;
   }
   for (auto tensor : this->out_tensors_) {
@@ -249,7 +253,7 @@ int CpuFp16SubGraph::PostProcess() {
         return RET_ERROR;
       }
       MS_ASSERT(tensor->data_c() != nullptr);
-      fp16_to_fp32_cast_func(float16_data, tensor->data_c(), tensor->ElementsNum());
+      Float16ToFloat32_fp16_handler(float16_data, tensor->data_c(), tensor->ElementsNum());
       if (tensor->allocator() != nullptr) {
         tensor->allocator()->Free(float16_data);
       } else {
@@ -273,5 +277,8 @@ int CpuFp16SubGraph::PostProcess() {
   }
   this->FreeOriginInputData();
   return RET_OK;
+#else
+  return RET_OK;
+#endif
 }
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/sub_graph_kernel.h b/mindspore/lite/src/sub_graph_kernel.h
index e8de8e9b23e..b537f18e0ce 100644
--- a/mindspore/lite/src/sub_graph_kernel.h
+++ b/mindspore/lite/src/sub_graph_kernel.h
@@ -24,41 +24,10 @@
 #include "src/executor.h"
 #include "src/common/log_adapter.h"
 #ifdef ENABLE_ARM64
-#include "nnacl/optimized_kernel.h"
+#include "src/common/utils.h"
 #endif
 
 namespace mindspore::kernel {
-using Float16CastFunc = void (*)(const void *, void *, int);
-
-class Float16CastUtil {
- public:
-  static Float16CastUtil *GetInstance() {
-    static Float16CastUtil float16_cast_util;
-    return &float16_cast_util;
-  }
-
- private:
-  Float16CastUtil() {
-#ifdef ENABLE_ARM64
-    void *fp16_op_handler = Float16Module::GetInstance()->float16_op_handler_;
-    if (fp16_op_handler != nullptr) {
-      dlerror();
-      *(reinterpret_cast<void **>(&float16_to_float32_func_)) = dlsym(fp16_op_handler, "Float16ToFloat32_fp16_handler");
-      *(reinterpret_cast<void **>(&float32_to_float16_func_)) = dlsym(fp16_op_handler, "Float32ToFloat16_fp16_handler");
-      auto dlopen_error = dlerror();
-      if (dlopen_error != nullptr) {
-        MS_LOG(ERROR) << "load float16 cast func failed! " << dlopen_error << ".";
-      }
-    }
-#endif
-  }
-  ~Float16CastUtil() = default;
-
- public:
-  Float16CastFunc float16_to_float32_func_ = nullptr;
-  Float16CastFunc float32_to_float16_func_ = nullptr;
-};
-
 // store origin data and allocator of input tensor of subgraph for PreProcess and PostProcess
 struct DataStore {
   void *data_ = nullptr;
diff --git a/mindspore/lite/test/CMakeLists.txt b/mindspore/lite/test/CMakeLists.txt
index b3ce4be0d7e..535136a36fd 100644
--- a/mindspore/lite/test/CMakeLists.txt
+++ b/mindspore/lite/test/CMakeLists.txt
@@ -66,17 +66,6 @@ if (PLATFORM_ARM32)
             )
 endif()
 
-if (ENABLE_FP16)
-    file(GLOB KERNEL_OP_FP16_SRC
-            ${LITE_DIR}/src/runtime/kernel/arm/fp16/*.cc
-            ${LITE_DIR}/nnacl/fp16/*.c
-            )
-    set(KERNEL_OP_SRC
-            ${KERNEL_OP_SRC}
-            ${KERNEL_OP_FP16_SRC}
-            )
-endif ()
-
 if ("${X86_64_SIMD}" STREQUAL "sse")
     file(GLOB TEST_ASSEMBLY_SRC ${LITE_DIR}/nnacl/x86_64_sse/*.c)
     set_property(SOURCE ${TEST_ASSEMBLY_SRC} PROPERTY LANGUAGE C)
@@ -295,7 +284,7 @@ add_executable(lite-test ${TEST_SRC})
 
 target_link_libraries(lite-test dl ${GTEST_LIBRARY})
 if (PLATFORM_ARM64)
-    target_link_libraries(lite-test mslite_internal)
+    target_link_libraries(lite-test mslite_internal nnacl_fp16_mid nnacl_optimize_mid)
 endif()
 
 if (PLATFORM_ARM)
diff --git a/mindspore/lite/test/run_benchmark_nets.sh b/mindspore/lite/test/run_benchmark_nets.sh
index d4c7a3375aa..581b3a3935b 100644
--- a/mindspore/lite/test/run_benchmark_nets.sh
+++ b/mindspore/lite/test/run_benchmark_nets.sh
@@ -727,8 +727,6 @@ function Run_arm64() {
     fi
 
     cp -a ${arm64_path}/mindspore-lite-${version}-runtime-arm64-${process_unit_arm64}/lib/libmindspore-lite.so ${benchmark_test_path}/libmindspore-lite.so || exit 1
-    cp -a ${arm64_path}/mindspore-lite-${version}-runtime-arm64-${process_unit_arm64}/lib/libmindspore-lite-fp16.so ${benchmark_test_path}/libmindspore-lite-fp16.so || exit 1
-    cp -a ${arm64_path}/mindspore-lite-${version}-runtime-arm64-${process_unit_arm64}/lib/libmindspore-lite-optimize.so ${benchmark_test_path}/libmindspore-lite-optimize.so || exit 1
     cp -a ${arm64_path}/mindspore-lite-${version}-runtime-arm64-${process_unit_arm64}/benchmark/benchmark ${benchmark_test_path}/benchmark || exit 1
 
     # adb push all needed files to the phone
diff --git a/mindspore/lite/tools/converter/CMakeLists.txt b/mindspore/lite/tools/converter/CMakeLists.txt
index 4ef94eb111d..51413a75069 100644
--- a/mindspore/lite/tools/converter/CMakeLists.txt
+++ b/mindspore/lite/tools/converter/CMakeLists.txt
@@ -28,6 +28,7 @@ file(GLOB_RECURSE CONVERTER_SRC RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
         ${CMAKE_CURRENT_SOURCE_DIR}/../common/flag_parser.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/../common/storage.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/../../src/ir/primitive_t_value.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/../../src/common/utils.cc
 
         ../optimizer/common/node_pass_extends.cc
         ../optimizer/common/pass_manager_extends.cc