From a9c90f71e45035c49b2a3e3ffebdb3cdf21d8121 Mon Sep 17 00:00:00 2001 From: yeyunpeng Date: Fri, 13 Nov 2020 09:34:48 +0800 Subject: [PATCH] target link fp16 and optimize to lite --- build.sh | 2 - cmake/package_lite.cmake | 2 - mindspore/lite/nnacl/optimize/CMakeLists.txt | 6 +- mindspore/lite/nnacl/optimized_kernel.h | 99 ------------------- mindspore/lite/src/CMakeLists.txt | 21 +--- mindspore/lite/src/common/utils.cc | 34 +++++++ mindspore/lite/src/common/utils.h | 3 + mindspore/lite/src/kernel_registry.cc | 16 ++- .../kernel/arm/fp16/convolution_1x1_fp16.h | 2 +- .../kernel/arm/fp16/convolution_base_fp16.h | 2 +- .../arm/fp16/convolution_winograd_fp16.h | 2 +- .../{fp16_op_handler.cc => fp16_op_handler.h} | 2 + .../kernel/arm/fp16/quant_dtype_cast_fp16.cc | 6 +- .../kernel/arm/int8/convolution_1x1_int8.cc | 18 ++-- .../kernel/arm/int8/convolution_1x1_int8.h | 2 +- .../kernel/arm/int8/convolution_int8.cc | 21 ++-- .../kernel/arm/int8/convolution_int8.h | 2 +- .../kernel/arm/int8/deconvolution_int8.cc | 18 +--- .../runtime/kernel/arm/int8/opt_op_handler.cc | 2 +- .../runtime/kernel/arm/int8/opt_op_handler.h | 40 ++++++++ mindspore/lite/src/scheduler.cc | 3 +- mindspore/lite/src/sub_graph_kernel.cc | 25 +++-- mindspore/lite/src/sub_graph_kernel.h | 33 +------ mindspore/lite/test/CMakeLists.txt | 13 +-- mindspore/lite/test/run_benchmark_nets.sh | 2 - mindspore/lite/tools/converter/CMakeLists.txt | 1 + 26 files changed, 141 insertions(+), 236 deletions(-) delete mode 100644 mindspore/lite/nnacl/optimized_kernel.h rename mindspore/lite/src/runtime/kernel/arm/fp16/{fp16_op_handler.cc => fp16_op_handler.h} (97%) create mode 100644 mindspore/lite/src/runtime/kernel/arm/int8/opt_op_handler.h diff --git a/build.sh b/build.sh index f8b6298bbe3..42199dbefd2 100755 --- a/build.sh +++ b/build.sh @@ -752,8 +752,6 @@ build_lite_java_arm64() { [ -n "${JAVA_PATH}" ] && rm -rf ${JAVA_PATH}/java/app/libs/arm64-v8a/ mkdir -p ${JAVA_PATH}/java/app/libs/arm64-v8a/ cp ${BASEPATH}/output/mindspore-lite-${VERSION_STR}-runtime-arm64-cpu/lib/libmindspore-lite.so ${JAVA_PATH}/java/app/libs/arm64-v8a/ - cp ${BASEPATH}/output/mindspore-lite-${VERSION_STR}-runtime-arm64-cpu/lib/libmindspore-lite-fp16.so ${JAVA_PATH}/java/app/libs/arm64-v8a/ - cp ${BASEPATH}/output/mindspore-lite-${VERSION_STR}-runtime-arm64-cpu/lib/libmindspore-lite-optimize.so ${JAVA_PATH}/java/app/libs/arm64-v8a/ echo mindspore-lite-${VERSION_STR}-runtime-arm64-cpu [ -n "${VERSION_STR}" ] && rm -rf mindspore-lite-${VERSION_STR}-runtime-arm64-cpu } diff --git a/cmake/package_lite.cmake b/cmake/package_lite.cmake index e8021e96576..055606e95b4 100644 --- a/cmake/package_lite.cmake +++ b/cmake/package_lite.cmake @@ -77,8 +77,6 @@ if (PLATFORM_ARM64) install(FILES ${TOP_DIR}/mindspore/lite/build/src/libmindspore-lite.a DESTINATION ${LIB_DIR} COMPONENT ${COMPONENT_NAME}) install(FILES ${TOP_DIR}/mindspore/core/ir/dtype/type_id.h DESTINATION ${INC_DIR}/ir/dtype COMPONENT ${COMPONENT_NAME}) install(DIRECTORY ${TOP_DIR}/mindspore/lite/build/schema/ DESTINATION ${INC_DIR}/schema COMPONENT ${COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "inner" EXCLUDE) - install(FILES ${TOP_DIR}/mindspore/lite/build/src/libmindspore-lite-optimize.so DESTINATION ${LIB_DIR} COMPONENT ${COMPONENT_NAME}) - install(FILES ${TOP_DIR}/mindspore/lite/build/src/libmindspore-lite-fp16.so DESTINATION ${LIB_DIR} COMPONENT ${COMPONENT_NAME}) install(DIRECTORY ${flatbuffers_INC} DESTINATION ${FLATBF_DIR} COMPONENT ${COMPONENT_NAME}) if (ENABLE_TOOLS) install(TARGETS benchmark RUNTIME DESTINATION ${MAIN_DIR}-${COMPONENT_NAME}/benchmark COMPONENT ${COMPONENT_NAME}) diff --git a/mindspore/lite/nnacl/optimize/CMakeLists.txt b/mindspore/lite/nnacl/optimize/CMakeLists.txt index e6acd66df74..c3bf1d8c9e7 100644 --- a/mindspore/lite/nnacl/optimize/CMakeLists.txt +++ b/mindspore/lite/nnacl/optimize/CMakeLists.txt @@ -21,8 +21,6 @@ string(REPLACE "-fvisibility=hidden" "-fvisibility=default" CMAKE_C_FLAGS "${CMA set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8.2-a+dotprod+fp16") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8.2-a+dotprod+fp16") -add_library(nnacl_optimize STATIC ${SDOT_FILES}) -target_link_libraries(nnacl_optimize mindspore-lite) +add_library(nnacl_optimize_mid OBJECT ${SDOT_FILES}) -add_library(nnacl_fp16 STATIC ${FP16_FILES}) -target_link_libraries(nnacl_fp16 mindspore-lite) +add_library(nnacl_fp16_mid OBJECT ${FP16_FILES}) diff --git a/mindspore/lite/nnacl/optimized_kernel.h b/mindspore/lite/nnacl/optimized_kernel.h deleted file mode 100644 index d070ec26d57..00000000000 --- a/mindspore/lite/nnacl/optimized_kernel.h +++ /dev/null @@ -1,99 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_LITE_NNACL_OPTIMIZED_KERNEL_H_ -#define MINDSPORE_LITE_NNACL_OPTIMIZED_KERNEL_H_ - -#ifndef _WIN32 -#include -#endif -#ifdef __ANDROID__ -#include -#include "nnacl/nnacl_utils.h" -#endif -#include "src/common/log_adapter.h" - -#define OPTIMIZE_SHARED_LIBRARY_PATH "libmindspore-lite-optimize.so" -#define FLOAT16_SHARED_LIBRARY_PATH "libmindspore-lite-fp16.so" - -class OptimizeModule { - public: - OptimizeModule() { - bool support_optimize_ops = false; - -#ifdef ENABLE_ARM64 - int hwcap_type = 16; - uint32_t hwcap = getHwCap(hwcap_type); - if (hwcap & HWCAP_ASIMDDP) { - MS_LOG(INFO) << "Hw cap support SMID Dot Product, hwcap: 0x" << hwcap; - support_optimize_ops = true; - } else { - MS_LOG(INFO) << "Hw cap NOT support SIMD Dot Product, hwcap: 0x" << hwcap; - } -#endif - if (support_optimize_ops == false) { - return; - } -#ifdef ENABLE_ARM64 - optimized_op_handler_ = dlopen(OPTIMIZE_SHARED_LIBRARY_PATH, RTLD_LAZY); - if (optimized_op_handler_ == nullptr) { - MS_LOG(INFO) << "Open optimize shared library failed: " << dlerror(); - } -#endif - } - - ~OptimizeModule() = default; - - static OptimizeModule *GetInstance() { - static OptimizeModule opt_module; - return &opt_module; - } - void *optimized_op_handler_ = nullptr; -}; - -class Float16Module { - public: - Float16Module() { - bool support_fp16 = false; -#ifdef ENABLE_ARM64 - int hwcap_type = 16; - uint32_t hwcap = getHwCap(hwcap_type); - if (hwcap & HWCAP_FPHP) { - MS_LOG(INFO) << "Hw cap support FP16, hwcap: 0x" << hwcap; - support_fp16 = true; - } -#endif - if (support_fp16 == false) { - return; - } -#ifdef ENABLE_ARM64 - float16_op_handler_ = dlopen(FLOAT16_SHARED_LIBRARY_PATH, RTLD_LAZY); - if (float16_op_handler_ == nullptr) { - MS_LOG(INFO) << "Open optimize shared library failed: " << dlerror(); - } -#endif - } - - ~Float16Module() = default; - - static Float16Module *GetInstance() { - static Float16Module fp16_module; - return &fp16_module; - } - void *float16_op_handler_ = nullptr; -}; - -#endif // MINDSPORE_LITE_NNACL_OPTIMIZED_KERNEL_H_ diff --git a/mindspore/lite/src/CMakeLists.txt b/mindspore/lite/src/CMakeLists.txt index b73e287955c..311723204f1 100644 --- a/mindspore/lite/src/CMakeLists.txt +++ b/mindspore/lite/src/CMakeLists.txt @@ -17,6 +17,7 @@ if (PLATFORM_ARM32 OR PLATFORM_ARM64) endif () set(LITE_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/common/utils.cc ${CMAKE_CURRENT_SOURCE_DIR}/common/graph_util.cc ${CMAKE_CURRENT_SOURCE_DIR}/common/log_adapter.cc ${CMAKE_CURRENT_SOURCE_DIR}/common/string_util.cc @@ -114,22 +115,10 @@ endif () ########################## build optimize and float16 library #################################3 if (PLATFORM_ARM64) - add_library(mindspore-lite-optimize SHARED) - target_link_libraries(mindspore-lite-optimize cpu_opt_kernel_mid) - target_link_libraries(mindspore-lite-optimize nnacl_optimize) + target_link_libraries(mindspore-lite cpu_opt_kernel_mid nnacl_optimize_mid) + target_link_libraries(mindspore-lite_static cpu_opt_kernel_mid nnacl_optimize_mid) - add_library(mindspore-lite-fp16 SHARED) - target_link_libraries(mindspore-lite-fp16 cpu_fp16_kernel_mid) - target_link_libraries(mindspore-lite-fp16 nnacl_fp16) -endif () - -if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release" AND (PLATFORM_ARM64)) - add_custom_command(TARGET mindspore-lite-optimize POST_BUILD COMMAND - ${ANDROID_NDK}/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/aarch64-linux-android/bin/strip - ${CMAKE_BINARY_DIR}/src/libmindspore-lite-optimize.so) - - add_custom_command(TARGET mindspore-lite-fp16 POST_BUILD COMMAND - ${ANDROID_NDK}/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/aarch64-linux-android/bin/strip - ${CMAKE_BINARY_DIR}/src/libmindspore-lite-fp16.so) + target_link_libraries(mindspore-lite cpu_fp16_kernel_mid nnacl_fp16_mid) + target_link_libraries(mindspore-lite_static cpu_fp16_kernel_mid nnacl_fp16_mid) endif () diff --git a/mindspore/lite/src/common/utils.cc b/mindspore/lite/src/common/utils.cc index da8534422b4..ed54fe2c678 100644 --- a/mindspore/lite/src/common/utils.cc +++ b/mindspore/lite/src/common/utils.cc @@ -16,6 +16,7 @@ #ifdef __ANDROID__ #include +#include #endif #include "src/common/utils.h" @@ -257,5 +258,38 @@ uint32_t getHwCap(int hwcap_type) { return ret; } #endif + +bool IsSupportSDot() { + bool status = false; +#ifdef ENABLE_ARM64 + int hwcap_type = 16; + uint32_t hwcap = getHwCap(hwcap_type); + if (hwcap & HWCAP_ASIMDDP) { + MS_LOG(DEBUG) << "Hw cap support SMID Dot Product, hwcap: 0x" << hwcap; + status = true; + } else { + MS_LOG(DEBUG) << "Hw cap NOT support SIMD Dot Product, hwcap: 0x" << hwcap; + status = false; + } +#endif + return status; +} + +bool IsSupportFloat16() { + bool status = false; +#ifdef ENABLE_ARM64 + int hwcap_type = 16; + uint32_t hwcap = getHwCap(hwcap_type); + if (hwcap & HWCAP_FPHP) { + MS_LOG(DEBUG) << "Hw cap support FP16, hwcap: 0x" << hwcap; + status = true; + } else { + MS_LOG(DEBUG) << "Hw cap NOT support FP16, hwcap: 0x" << hwcap; + status = false; + } +#endif + return status; +} + } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/common/utils.h b/mindspore/lite/src/common/utils.h index 245b32ece32..43a37bd3c20 100644 --- a/mindspore/lite/src/common/utils.h +++ b/mindspore/lite/src/common/utils.h @@ -44,6 +44,9 @@ void ShortToFloat32(const int16_t *srcdata, float *dstdata, size_t elementSize); void Float32ToShort(const float *srcdata, int16_t *dstdata, size_t elementSize); +bool IsSupportSDot(); + +bool IsSupportFloat16(); #if defined(__arm__) || defined(__aarch64__) uint32_t getHwCap(int hwcap_type); #endif diff --git a/mindspore/lite/src/kernel_registry.cc b/mindspore/lite/src/kernel_registry.cc index 4ed813a482c..30c5dc1ec6b 100644 --- a/mindspore/lite/src/kernel_registry.cc +++ b/mindspore/lite/src/kernel_registry.cc @@ -20,7 +20,7 @@ #include #include "common/utils.h" #include "src/common/log_adapter.h" -#include "nnacl/optimized_kernel.h" +#include "src/common/utils.h" #endif using mindspore::kernel::kCPU; @@ -36,17 +36,15 @@ KernelRegistry *KernelRegistry::GetInstance() { int KernelRegistry::Init() { #ifdef ENABLE_ARM64 - void *optimized_lib_handler = OptimizeModule::GetInstance()->optimized_op_handler_; - if (optimized_lib_handler != nullptr) { - MS_LOG(INFO) << "load optimize lib success."; + if (mindspore::lite::IsSupportSDot()) { + MS_LOG(INFO) << "The current device supports Sdot."; } else { - MS_LOG(INFO) << "load optimize lib failed."; + MS_LOG(INFO) << "The current device NOT supports Sdot."; } - void *float16_op_handler = Float16Module::GetInstance()->float16_op_handler_; - if (float16_op_handler != nullptr) { - MS_LOG(INFO) << "load float16 lib success."; + if (mindspore::lite::IsSupportFloat16()) { + MS_LOG(INFO) << "The current device supports float16."; } else { - MS_LOG(INFO) << "load float16 lib failed."; + MS_LOG(INFO) << "The current device NOT supports float16."; } #endif return RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h index 318fa37e59a..78b3c95a417 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h @@ -21,7 +21,7 @@ #include #include "src/lite_kernel.h" #include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h" -#include "nnacl/optimized_kernel.h" +#include "src/common/utils.h" #include "nnacl/matmul_parameter.h" #include "nnacl/fp16/matmul_fp16.h" diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h index ab2d10c155b..972795cd121 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h @@ -21,7 +21,7 @@ #include #include "src/lite_kernel.h" #include "src/runtime/kernel/arm/base/convolution_base.h" -#include "nnacl/optimized_kernel.h" +#include "src/common/utils.h" namespace mindspore::kernel { class ConvolutionBaseFP16CPUKernel : public ConvolutionBaseCPUKernel { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h index 813cafbc10c..567e5a7a9fe 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h @@ -23,7 +23,7 @@ #include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h" #include "nnacl/fp16/conv_fp16.h" #include "nnacl/fp16/winograd_utils_fp16.h" -#include "nnacl/optimized_kernel.h" +#include "src/common/utils.h" #include "nnacl/minimal_filtering_generator.h" namespace mindspore::kernel { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/fp16_op_handler.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/fp16_op_handler.h similarity index 97% rename from mindspore/lite/src/runtime/kernel/arm/fp16/fp16_op_handler.cc rename to mindspore/lite/src/runtime/kernel/arm/fp16/fp16_op_handler.h index 715513b8ba9..636969c4f59 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/fp16_op_handler.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/fp16_op_handler.h @@ -14,7 +14,9 @@ * limitations under the License. */ +#ifdef ENABLE_ARM64 #include +#endif #ifdef __cplusplus extern "C" { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/quant_dtype_cast_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/quant_dtype_cast_fp16.cc index f6f9d435fa3..0879b04edf2 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/quant_dtype_cast_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/quant_dtype_cast_fp16.cc @@ -102,11 +102,11 @@ int QuantDTypeCastFp16CPUKernel::QuantDTypeCast(int task_id) { return RET_OK; } -int QuantDTypeCastRun(void *cdata, int task_id) { +int QuantDTypeCastFP16Run(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->QuantDTypeCast(task_id); if (ret != RET_OK) { - MS_LOG(ERROR) << "QuantDTypeCastRun error task_id[" << task_id << "] error_code[" << ret << "]"; + MS_LOG(ERROR) << "QuantDTypeCastFP16Run error task_id[" << task_id << "] error_code[" << ret << "]"; return RET_ERROR; } return RET_OK; @@ -126,7 +126,7 @@ int QuantDTypeCastFp16CPUKernel::Run() { return RET_ERROR; } - auto ret = ParallelLaunch(this->context_->thread_pool_, QuantDTypeCastRun, this, thread_n_num_); + auto ret = ParallelLaunch(this->context_->thread_pool_, QuantDTypeCastFP16Run, this, thread_n_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "Scale error error_code[" << ret << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc index 7b1f38d510c..3b3aa30cb91 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc @@ -17,6 +17,9 @@ #include "src/runtime/kernel/arm/int8/convolution_1x1_int8.h" #include "src/runtime/runtime_api.h" #include "src/common/file_utils.h" +#ifdef ENABLE_ARM64 +#include "src/runtime/kernel/arm/int8/opt_op_handler.h" +#endif using mindspore::lite::RET_ERROR; using mindspore::lite::RET_MEMORY_FAILED; @@ -74,18 +77,9 @@ void Convolution1x1Int8CPUKernel::CheckSupportOptimize() { support_optimize_ = false; matmul_func_ = MatMulInt8_8x8_r; #ifdef ENABLE_ARM64 - void *optimize_op_handler = OptimizeModule::GetInstance()->optimized_op_handler_; - if (optimize_op_handler != nullptr) { - dlerror(); - *(reinterpret_cast(&matmul_func_)) = dlsym(optimize_op_handler, "MatMulRInt8_optimize_handler"); - auto dlopen_error = dlerror(); - if (dlopen_error != nullptr) { - MS_LOG(ERROR) << "load matmul func failed! " << dlopen_error << "."; - support_optimize_ = false; - matmul_func_ = nullptr; - } else { - support_optimize_ = true; - } + if (mindspore::lite::IsSupportSDot()) { + support_optimize_ = true; + matmul_func_ = MatMulRInt8_optimize_handler; } else { support_optimize_ = false; matmul_func_ = nullptr; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h index 7543ce55cb5..e224cb98030 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h @@ -25,7 +25,7 @@ #include "nnacl/int8/conv_int8.h" #include "nnacl/int8/matmul_int8.h" #include "nnacl/matmul_parameter.h" -#include "nnacl/optimized_kernel.h" +#include "src/common/utils.h" namespace mindspore::kernel { class Convolution1x1Int8CPUKernel : public ConvolutionBaseCPUKernel { diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc index e6f78016481..39591176bbb 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc @@ -23,6 +23,9 @@ #include "src/runtime/kernel/arm/int8/convolution_1x1_int8.h" #include "src/runtime/kernel/arm/int8/convolution_3x3_int8.h" #include "src/runtime/runtime_api.h" +#ifdef ENABLE_ARM64 +#include "src/runtime/kernel/arm/int8/opt_op_handler.h" +#endif using mindspore::kernel::KERNEL_ARCH::kCPU; using mindspore::lite::KernelRegistrar; @@ -39,18 +42,9 @@ void ConvolutionInt8CPUKernel::CheckSupportOptimize() { #endif #ifdef ENABLE_ARM64 - void *optimize_op_handler = OptimizeModule::GetInstance()->optimized_op_handler_; - if (optimize_op_handler != nullptr) { - dlerror(); - *(reinterpret_cast(&matmul_func_)) = dlsym(optimize_op_handler, "MatMulRInt8_optimize_handler"); - auto dlopen_error = dlerror(); - if (dlopen_error != nullptr) { - MS_LOG(ERROR) << "load matmul func failed! " << dlopen_error << "."; - support_optimize_ = false; - tile_num_ = 4; - } else { - support_optimize_ = true; - } + if (mindspore::lite::IsSupportSDot()) { + matmul_func_ = MatMulRInt8_optimize_handler; + support_optimize_ = true; } else { tile_num_ = 4; support_optimize_ = false; @@ -260,8 +254,7 @@ kernel::LiteKernel *CpuConvInt8KernelCreator(const std::vector & kernel::LiteKernel *kernel; if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { #ifdef ENABLE_ARM64 - void *optimize_op_handler = OptimizeModule::GetInstance()->optimized_op_handler_; - if (optimize_op_handler != nullptr) { + if (mindspore::lite::IsSupportSDot()) { kernel = new (std::nothrow) kernel::ConvolutionInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive); } else { kernel = new (std::nothrow) kernel::Convolution3x3Int8CPUKernel(opParameter, inputs, outputs, ctx, primitive); diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h index e838164c16a..0f8a4e2cf76 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h @@ -20,7 +20,7 @@ #include #include "src/lite_kernel.h" #include "src/runtime/kernel/arm/base/convolution_base.h" -#include "nnacl/optimized_kernel.h" +#include "src/common/utils.h" #include "nnacl/int8/conv_int8.h" namespace mindspore::kernel { diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.cc index 69f111f30df..d9504cb72f8 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.cc @@ -16,7 +16,8 @@ #include "src/runtime/kernel/arm/int8/deconvolution_int8.h" #include "src/runtime/runtime_api.h" -#include "nnacl/optimized_kernel.h" +#include "src/common/utils.h" +#include "src/runtime/kernel/arm/int8/opt_op_handler.h" using mindspore::kernel::KERNEL_ARCH::kCPU; using mindspore::lite::KernelRegistrar; @@ -95,18 +96,9 @@ void DeConvInt8CPUKernel::CheckSupportOptimize() { support_optimize_ = true; matmul_func_ = MatMulInt8_16x4; #ifdef ENABLE_ARM64 - void *optimize_op_handler = OptimizeModule::GetInstance()->optimized_op_handler_; - if (optimize_op_handler != nullptr) { - dlerror(); - *(reinterpret_cast(&matmul_func_)) = dlsym(optimize_op_handler, "MatMulR4Int8_optimize_handler"); - auto dlopen_error = dlerror(); - if (dlopen_error != nullptr) { - MS_LOG(ERROR) << "load matmul func failed! " << dlopen_error << "."; - support_optimize_ = false; - matmul_func_ = MatMulR4Int8Neon64; - } else { - support_optimize_ = true; - } + if (mindspore::lite::IsSupportSDot()) { + support_optimize_ = true; + matmul_func_ = MatMulR4Int8_optimize_handler; } else { support_optimize_ = false; matmul_func_ = MatMulR4Int8Neon64; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/opt_op_handler.cc b/mindspore/lite/src/runtime/kernel/arm/int8/opt_op_handler.cc index f8527744ba1..46e242bdecd 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/opt_op_handler.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/opt_op_handler.cc @@ -14,8 +14,8 @@ * limitations under the License. */ +#include "src/runtime/kernel/arm/int8/opt_op_handler.h" #include -#include #include "nnacl/op_base.h" #ifdef __cplusplus diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/opt_op_handler.h b/mindspore/lite/src/runtime/kernel/arm/int8/opt_op_handler.h new file mode 100644 index 00000000000..1e273706e1e --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/int8/opt_op_handler.h @@ -0,0 +1,40 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include "nnacl/op_base.h" +#ifdef __cplusplus +extern "C" { +#endif +#ifdef ENABLE_ARM64 +void IndirectGemmInt8_optimize_handler(int8_t *dst, const int8_t *src, const int8_t *weight, const int32_t *bias, + size_t ksize, size_t ic4, size_t output_channel, size_t offset, + const int32_t *input_sum, size_t act_min, size_t act_max, size_t out_zp, + int32_t *out_multiplier, int32_t *shift_before, int32_t *shift_after, + size_t asymmetric, size_t per_channel, size_t per_channel_offset); +void MatMulR4Int8_optimize_handler(const int8_t *a, const int8_t *b, int *dst, int row4, int col4, int deep16, + const int *input_sum, const int *bias); + +void MatMulRInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4, + size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, + int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, + int32_t maxi, size_t per_channel); +#endif + +#ifdef __cplusplus +} +#endif diff --git a/mindspore/lite/src/scheduler.cc b/mindspore/lite/src/scheduler.cc index 9b5eed568dc..8cc4fe74af6 100644 --- a/mindspore/lite/src/scheduler.cc +++ b/mindspore/lite/src/scheduler.cc @@ -275,7 +275,8 @@ kernel::LiteKernel *Scheduler::ScheduleNode(const std::vector &in_tens } } #endif - if ((context_->IsCpuFloat16Enabled() && data_type == kNumberTypeFloat32) || data_type == kNumberTypeFloat16) { + if (mindspore::lite::IsSupportFloat16() && + ((context_->IsCpuFloat16Enabled() && data_type == kNumberTypeFloat32) || data_type == kNumberTypeFloat16)) { kernel::KernelKey fp16_cpu_desc{desc.arch, kNumberTypeFloat16, desc.type}; auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, primitive, context_, fp16_cpu_desc); diff --git a/mindspore/lite/src/sub_graph_kernel.cc b/mindspore/lite/src/sub_graph_kernel.cc index 1795957946c..871d18fd114 100644 --- a/mindspore/lite/src/sub_graph_kernel.cc +++ b/mindspore/lite/src/sub_graph_kernel.cc @@ -17,7 +17,8 @@ #include "src/sub_graph_kernel.h" #include "src/tensor.h" #ifdef ENABLE_ARM64 -#include "nnacl/optimized_kernel.h" +#include "src/common/utils.h" +#include "src/runtime/kernel/arm/fp16/fp16_op_handler.h" #endif namespace mindspore::kernel { @@ -183,9 +184,9 @@ void CpuFp16SubGraph::FreeOriginInputData() { } int CpuFp16SubGraph::PreProcess() { - auto fp32_to_fp16_cast_func = Float16CastUtil::GetInstance()->float32_to_float16_func_; - if (fp32_to_fp16_cast_func == nullptr) { - MS_LOG(ERROR) << "Can not find cast fp32 to fp16 func"; +#ifdef ENABLE_ARM64 + if (!mindspore::lite::IsSupportFloat16()) { + MS_LOG(ERROR) << "Unsupport fp16 in this devices"; return RET_ERROR; } MS_ASSERT(origin_input_data_.empty()); @@ -203,7 +204,7 @@ int CpuFp16SubGraph::PreProcess() { return RET_ERROR; } MS_ASSERT(tensor->data_c() != nullptr); - fp32_to_fp16_cast_func(float32_data, tensor->data_c(), tensor->ElementsNum()); + Float32ToFloat16_fp16_handler(float32_data, tensor->data_c(), tensor->ElementsNum()); auto *data_store = DataStore::CreateDataStore(float32_data, tensor->allocator(), this->context_->allocator.get()); if (data_store == nullptr) { MS_LOG(ERROR) << "Create DataStore failed"; @@ -223,12 +224,15 @@ int CpuFp16SubGraph::PreProcess() { } } return RET_OK; +#else + return RET_OK; +#endif } int CpuFp16SubGraph::PostProcess() { - auto fp16_to_fp32_cast_func = Float16CastUtil::GetInstance()->float16_to_float32_func_; - if (fp16_to_fp32_cast_func == nullptr) { - MS_LOG(ERROR) << "Can not find cast fp16 to fp32 func"; +#ifdef ENABLE_ARM64 + if (!mindspore::lite::IsSupportFloat16()) { + MS_LOG(ERROR) << "Unsupport fp16 in this devices"; return RET_ERROR; } for (auto tensor : this->out_tensors_) { @@ -249,7 +253,7 @@ int CpuFp16SubGraph::PostProcess() { return RET_ERROR; } MS_ASSERT(tensor->data_c() != nullptr); - fp16_to_fp32_cast_func(float16_data, tensor->data_c(), tensor->ElementsNum()); + Float16ToFloat32_fp16_handler(float16_data, tensor->data_c(), tensor->ElementsNum()); if (tensor->allocator() != nullptr) { tensor->allocator()->Free(float16_data); } else { @@ -273,5 +277,8 @@ int CpuFp16SubGraph::PostProcess() { } this->FreeOriginInputData(); return RET_OK; +#else + return RET_OK; +#endif } } // namespace mindspore::kernel diff --git a/mindspore/lite/src/sub_graph_kernel.h b/mindspore/lite/src/sub_graph_kernel.h index e8de8e9b23e..b537f18e0ce 100644 --- a/mindspore/lite/src/sub_graph_kernel.h +++ b/mindspore/lite/src/sub_graph_kernel.h @@ -24,41 +24,10 @@ #include "src/executor.h" #include "src/common/log_adapter.h" #ifdef ENABLE_ARM64 -#include "nnacl/optimized_kernel.h" +#include "src/common/utils.h" #endif namespace mindspore::kernel { -using Float16CastFunc = void (*)(const void *, void *, int); - -class Float16CastUtil { - public: - static Float16CastUtil *GetInstance() { - static Float16CastUtil float16_cast_util; - return &float16_cast_util; - } - - private: - Float16CastUtil() { -#ifdef ENABLE_ARM64 - void *fp16_op_handler = Float16Module::GetInstance()->float16_op_handler_; - if (fp16_op_handler != nullptr) { - dlerror(); - *(reinterpret_cast(&float16_to_float32_func_)) = dlsym(fp16_op_handler, "Float16ToFloat32_fp16_handler"); - *(reinterpret_cast(&float32_to_float16_func_)) = dlsym(fp16_op_handler, "Float32ToFloat16_fp16_handler"); - auto dlopen_error = dlerror(); - if (dlopen_error != nullptr) { - MS_LOG(ERROR) << "load float16 cast func failed! " << dlopen_error << "."; - } - } -#endif - } - ~Float16CastUtil() = default; - - public: - Float16CastFunc float16_to_float32_func_ = nullptr; - Float16CastFunc float32_to_float16_func_ = nullptr; -}; - // store origin data and allocator of input tensor of subgraph for PreProcess and PostProcess struct DataStore { void *data_ = nullptr; diff --git a/mindspore/lite/test/CMakeLists.txt b/mindspore/lite/test/CMakeLists.txt index b3ce4be0d7e..535136a36fd 100644 --- a/mindspore/lite/test/CMakeLists.txt +++ b/mindspore/lite/test/CMakeLists.txt @@ -66,17 +66,6 @@ if (PLATFORM_ARM32) ) endif() -if (ENABLE_FP16) - file(GLOB KERNEL_OP_FP16_SRC - ${LITE_DIR}/src/runtime/kernel/arm/fp16/*.cc - ${LITE_DIR}/nnacl/fp16/*.c - ) - set(KERNEL_OP_SRC - ${KERNEL_OP_SRC} - ${KERNEL_OP_FP16_SRC} - ) -endif () - if ("${X86_64_SIMD}" STREQUAL "sse") file(GLOB TEST_ASSEMBLY_SRC ${LITE_DIR}/nnacl/x86_64_sse/*.c) set_property(SOURCE ${TEST_ASSEMBLY_SRC} PROPERTY LANGUAGE C) @@ -295,7 +284,7 @@ add_executable(lite-test ${TEST_SRC}) target_link_libraries(lite-test dl ${GTEST_LIBRARY}) if (PLATFORM_ARM64) - target_link_libraries(lite-test mslite_internal) + target_link_libraries(lite-test mslite_internal nnacl_fp16_mid nnacl_optimize_mid) endif() if (PLATFORM_ARM) diff --git a/mindspore/lite/test/run_benchmark_nets.sh b/mindspore/lite/test/run_benchmark_nets.sh index d4c7a3375aa..581b3a3935b 100644 --- a/mindspore/lite/test/run_benchmark_nets.sh +++ b/mindspore/lite/test/run_benchmark_nets.sh @@ -727,8 +727,6 @@ function Run_arm64() { fi cp -a ${arm64_path}/mindspore-lite-${version}-runtime-arm64-${process_unit_arm64}/lib/libmindspore-lite.so ${benchmark_test_path}/libmindspore-lite.so || exit 1 - cp -a ${arm64_path}/mindspore-lite-${version}-runtime-arm64-${process_unit_arm64}/lib/libmindspore-lite-fp16.so ${benchmark_test_path}/libmindspore-lite-fp16.so || exit 1 - cp -a ${arm64_path}/mindspore-lite-${version}-runtime-arm64-${process_unit_arm64}/lib/libmindspore-lite-optimize.so ${benchmark_test_path}/libmindspore-lite-optimize.so || exit 1 cp -a ${arm64_path}/mindspore-lite-${version}-runtime-arm64-${process_unit_arm64}/benchmark/benchmark ${benchmark_test_path}/benchmark || exit 1 # adb push all needed files to the phone diff --git a/mindspore/lite/tools/converter/CMakeLists.txt b/mindspore/lite/tools/converter/CMakeLists.txt index 4ef94eb111d..51413a75069 100644 --- a/mindspore/lite/tools/converter/CMakeLists.txt +++ b/mindspore/lite/tools/converter/CMakeLists.txt @@ -28,6 +28,7 @@ file(GLOB_RECURSE CONVERTER_SRC RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/../common/flag_parser.cc ${CMAKE_CURRENT_SOURCE_DIR}/../common/storage.cc ${CMAKE_CURRENT_SOURCE_DIR}/../../src/ir/primitive_t_value.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/common/utils.cc ../optimizer/common/node_pass_extends.cc ../optimizer/common/pass_manager_extends.cc