From 57da3c73feaf11a8da353314cc146c0f6aabfc27 Mon Sep 17 00:00:00 2001 From: zhanyuan Date: Sat, 29 May 2021 15:07:34 +0800 Subject: [PATCH] [CPU] Support avx isa by default --- build.sh | 4 +-- mindspore/ccsrc/CMakeLists.txt | 17 +++++------- .../kernel_compiler/cpu/nnacl/CMakeLists.txt | 27 ++++++++++--------- .../cpu/nnacl/fp32/activation_fp32.c | 4 +++ .../nnacl/intrinsics/ms_simd_instructions.h | 1 + 5 files changed, 28 insertions(+), 25 deletions(-) diff --git a/build.sh b/build.sh index 2fb941d870c..6b6eddf05da 100755 --- a/build.sh +++ b/build.sh @@ -60,7 +60,7 @@ usage() echo " -l Compile with python dependency, default on" echo " -S Enable enable download cmake compile dependency from gitee , default off" echo " -k Enable make clean, clean up compilation generated cache " - echo " -W Enable x86_64 SSE or AVX instruction set, use [sse|avx|neon|off], default off" + echo " -W Enable x86_64 SSE or AVX instruction set, use [sse|avx|neon|off], default off for lite and avx for CPU" echo " -H Enable hidden" echo " -L Link and specify Tensor-RT library path, default disable Tensor-RT lib linking" } @@ -387,7 +387,7 @@ build_mindspore() echo "start build mindspore project." mkdir -pv "${BUILD_PATH}/mindspore" cd "${BUILD_PATH}/mindspore" - CMAKE_ARGS="-DDEBUG_MODE=$DEBUG_MODE -DBUILD_PATH=$BUILD_PATH -DX86_64_SIMD=${X86_64_SIMD}" + CMAKE_ARGS="-DDEBUG_MODE=$DEBUG_MODE -DBUILD_PATH=$BUILD_PATH" if [[ "X$ENABLE_COVERAGE" = "Xon" ]]; then CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_COVERAGE=ON" fi diff --git a/mindspore/ccsrc/CMakeLists.txt b/mindspore/ccsrc/CMakeLists.txt index 15a46247231..e3e314f255f 100644 --- a/mindspore/ccsrc/CMakeLists.txt +++ b/mindspore/ccsrc/CMakeLists.txt @@ -6,18 +6,13 @@ include_directories(${CMAKE_SOURCE_DIR}/mindspore/core/mindrt/include) include_directories(${CMAKE_SOURCE_DIR}/mindspore/core/mindrt/src) if(ENABLE_CPU) + if(${CMAKE_HOST_SYSTEM_PROCESSOR} MATCHES "aarch64") + set(PLATFORM_ARM64 "on") + set(X86_64_SIMD "off") + elseif("${X86_64_SIMD}" STREQUAL "off") + set(X86_64_SIMD "avx") + endif() include_directories(${CMAKE_CURRENT_SOURCE_DIR}/backend/kernel_compiler/cpu) - if("${X86_64_SIMD}" STREQUAL "sse") - add_compile_definitions(ENABLE_SSE) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.1 -msse4.2") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.1 -msse4.2") - endif() - if("${X86_64_SIMD}" STREQUAL "avx") - add_compile_definitions(ENABLE_SSE) - add_compile_definitions(ENABLE_AVX) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.1 -msse4.2 -mfma -mavx -mavx2") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.1 -msse4.2 -mfma -mavx -mavx2") - endif() add_subdirectory(backend/kernel_compiler/cpu/nnacl) endif() diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/CMakeLists.txt b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/CMakeLists.txt index 0eefb7d7a7f..ce4c4ae2dc1 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/CMakeLists.txt +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/CMakeLists.txt @@ -13,9 +13,6 @@ if(PLATFORM_ARM32 OR PLATFORM_ARM64) -ffunction-sections -fdata-sections -ffast-math") endif() endif() -if(ENABLE_CPU) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC") -endif() if("${X86_64_SIMD}" STREQUAL "avx") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.1 -mavx -mavx2 -mfma") endif() @@ -59,21 +56,27 @@ if(APPLE) set_source_files_properties(${ASSEMBLY_SRC} PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp") endif() -########################### build nnacl static library ######################## +########################### build nnacl library ######################## string(REPLACE "-fvisibility=hidden" "-fvisibility=default" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") - -add_library(nnacl_mid OBJECT ${KERNEL_SRC} ${TRAIN_SRC} ${ASSEMBLY_SRC}) - if(ENABLE_CPU) - add_library(nnacl SHARED $) + add_library(nnacl SHARED ${KERNEL_SRC} ${TRAIN_SRC} ${ASSEMBLY_SRC}) + if(${CMAKE_HOST_SYSTEM_PROCESSOR} MATCHES "aarch64") + target_compile_definitions(nnacl PRIVATE ENABLE_ARM ENABLE_ARM64 ENABLE_NEON) + target_compile_options(nnacl PRIVATE -ffast-math -flax-vector-conversions) + elseif("${X86_64_SIMD}" STREQUAL "sse") + target_compile_definitions(nnacl PRIVATE ENABLE_SSE) + elseif("${X86_64_SIMD}" STREQUAL "avx") + target_compile_definitions(nnacl PRIVATE ENABLE_SSE ENABLE_AVX) + endif() + target_compile_options(nnacl PRIVATE -fPIC) + if(NOT CMAKE_SYSTEM_NAME MATCHES "Windows") + target_link_options(nnacl PRIVATE -Wl,-z,relro,-z,now) + endif() else() + add_library(nnacl_mid OBJECT ${KERNEL_SRC} ${TRAIN_SRC} ${ASSEMBLY_SRC}) add_library(nnacl STATIC $) -endif() - -if(NOT ENABLE_CPU) add_dependencies(nnacl_mid fbs_src) endif() - ########################### arm fp16 build optimize library ######################## if(ENABLE_FP16) add_subdirectory(${NNACL_DIR}/optimize) diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/activation_fp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/activation_fp32.c index 4ae4abdf8bc..a90baa80c00 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/activation_fp32.c +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/activation_fp32.c @@ -86,7 +86,11 @@ int LRelu(const float *src, int length, float *dst, float alpha) { for (; i < length - 4; i += 4) { MS_FLOAT32X4 src_tmp = MS_LDQ_F32(src + i); MS_FLOAT32X4 mul_tmp = MS_MULQ_N_F32(src_tmp, alpha); +#ifdef ENABLE_ARM + MS_UINT32X4 mask = MS_CMPGTQ_F32(src_tmp, MS_MOVQ_F32(0.0f)); +#else MS_FLOAT32X4 mask = MS_CMPGTQ_F32(src_tmp, MS_MOVQ_F32(0.0f)); +#endif MS_STQ_F32(dst + i, MS_BLENDQ_F32(mul_tmp, src_tmp, mask)); } #endif diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions.h index 070c498f4a3..73c1cbf1662 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions.h @@ -29,6 +29,7 @@ #ifdef ENABLE_ARM #define MS_FLOAT32X4 float32x4_t #define MS_INT32X4 int32x4_t +#define MS_UINT32X4 uint32x4_t #define MS_LDQ_F32 vld1q_f32 #define MS_LDQ_EPI32 vld1q_s32 #define MS_ADDQ_F32 vaddq_f32