diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/avx/ConvDwFp32Avx3x3.S b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/avx/ConvDwFp32Avx3x3.S index a5201893eb5..198be0634e6 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/avx/ConvDwFp32Avx3x3.S +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/avx/ConvDwFp32Avx3x3.S @@ -41,7 +41,7 @@ asm_function ConvDwFp32Avx3x3 pushq %rdi // -96 addq $96, %rsp -#ifdef WIN32 +#ifdef _WIN32 movq %rcx, %rdi movq %rdx, %rsi movq %r8, %rdx diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/avx/ConvDwFp32BorderAvx.S b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/avx/ConvDwFp32BorderAvx.S index de240b4bfe2..605b5c84ea1 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/avx/ConvDwFp32BorderAvx.S +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/avx/ConvDwFp32BorderAvx.S @@ -24,7 +24,7 @@ asm_function ConvDwFp32Border addq $96, %rsp movq %rdi, %rdx -#ifdef WIN32 +#ifdef _WIN32 movq %rcx, %rdx #endif movq 8(%rdx), %r12 // src diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/avx/ConvDwFp32RowAvx.S b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/avx/ConvDwFp32RowAvx.S index c77b05c570c..290dbd01687 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/avx/ConvDwFp32RowAvx.S +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/avx/ConvDwFp32RowAvx.S @@ -31,7 +31,7 @@ asm_function ConvDwFp32Row pushq %rdi addq $48, %rsp -#ifdef WIN32 +#ifdef _WIN32 movq %rcx, %rdi // output_ptr movq %rdx, %rsi // input_ptr movq %r8, %rdx // weight_ptr diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/avx/MatmulAvx.S b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/avx/MatmulAvx.S index 904e903132f..56426ac486d 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/avx/MatmulAvx.S +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/avx/MatmulAvx.S @@ -47,7 +47,7 @@ asm_function MatmulFloatAvxOpt pushq %rsi // -104 rsi pushq %rdi // -112 rdi addq $112, %rsp -#ifdef WIN32 +#ifdef _WIN32 movq %rcx, %rdi movq %rdx, %rsi movq %r8, %rdx diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/activation_fp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/activation_fp32.c index dec4915a69d..558eb5dc3d4 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/activation_fp32.c +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/activation_fp32.c @@ -104,7 +104,7 @@ int Sigmoid(const float *src, int length, float *dst) { int i = 0; #if defined(ENABLE_AVX) for (; i <= length - 8; i += 8) { - simd_exp_avx(-(MS_LD256_F32(src + i)), dst + i); + simd_exp_avx(MS_SUB256_F32(MS_MOV256_F32(0.0f), (MS_LD256_F32(src + i))), dst + i); MS_ST256_F32(dst + i, MS_DIV256_F32(MS_MOV256_F32(1.0f), MS_ADD256_F32(MS_MOV256_F32(1.0f), MS_LD256_F32(dst + i)))); } @@ -232,25 +232,32 @@ int Gelu(const float *src, int length, float *dst, bool approximate) { if (approximate) { // dst = 0.5 * x * (1 + tanh((2 / pi) ^ 0.5 * (x + 0.044715x^3))) #if defined(ENABLE_AVX) + MS_FLOAT32X8 para1 = MS_MOV256_F32(0.79788456080287f); + MS_FLOAT32X8 para2 = MS_MOV256_F32(0.035677408136f); + MS_FLOAT32X8 para3 = MS_MOV256_F32(1.0f); + MS_FLOAT32X8 para4 = MS_MOV256_F32(0.5f); int C8 = DOWN_ROUND(length, C8NUM); for (; i < C8; i += C8NUM) { MS_FLOAT32X8 in = MS_LD256_F32(src + i); - const MS_FLOAT32X8 res = 0.5 * in * (1.0 + MS_TANHX8_F32((0.79788456080287f + 0.035677408136f * in * in) * in)); + const MS_FLOAT32X8 res = MS_MUL256_F32( + MS_MUL256_F32(para4, in), + MS_ADD256_F32( + para3, MS_TANHX8_F32(MS_MUL256_F32(MS_ADD256_F32(para1, MS_MUL256_F32(MS_MUL256_F32(para2, in), in)), in)))); MS_ST256_F32(dst + i, res); } #endif #if defined(ENABLE_SSE) || defined(ENABLE_ARM) - MS_FLOAT32X4 para1 = MS_MOVQ_F32(0.79788456080287f); - MS_FLOAT32X4 para2 = MS_MOVQ_F32(0.035677408136f); - MS_FLOAT32X4 para3 = MS_MOVQ_F32(1.0f); - MS_FLOAT32X4 para4 = MS_MOVQ_F32(0.5f); + MS_FLOAT32X4 para5 = MS_MOVQ_F32(0.79788456080287f); + MS_FLOAT32X4 para6 = MS_MOVQ_F32(0.035677408136f); + MS_FLOAT32X4 para7 = MS_MOVQ_F32(1.0f); + MS_FLOAT32X4 para8 = MS_MOVQ_F32(0.5f); int C4 = DOWN_ROUND(length, C4NUM); for (; i < C4; i += C4NUM) { MS_FLOAT32X4 in = MS_LDQ_F32(src + i); MS_FLOAT32X4 res = MS_MULQ_F32( - MS_MULQ_F32(para4, in), - MS_ADDQ_F32(para3, - MS_TANHX4_F32(MS_MULQ_F32(MS_ADDQ_F32(para1, MS_MULQ_F32(MS_MULQ_F32(para2, in), in)), in)))); + MS_MULQ_F32(para8, in), + MS_ADDQ_F32(para7, + MS_TANHX4_F32(MS_MULQ_F32(MS_ADDQ_F32(para5, MS_MULQ_F32(MS_MULQ_F32(para6, in), in)), in)))); MS_STQ_F32(dst + i, res); } #endif diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_1x1_x86_fp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_1x1_x86_fp32.c index 691272b54f7..655eadc86e5 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_1x1_x86_fp32.c +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_1x1_x86_fp32.c @@ -15,7 +15,11 @@ */ #ifdef ENABLE_AVX #include "nnacl/fp32/conv_1x1_x86_fp32.h" +#ifdef _MSC_VER +#include +#else #include +#endif // sliding window to compate 1x1 conv in x86 void Conv1x1SWFp32(const float *input_data, const float *packed_weight, const float *bias_data, float *output_data, diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_common_fp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_common_fp32.c index d5e0d916bf1..e2ec2c755de 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_common_fp32.c +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_common_fp32.c @@ -17,8 +17,12 @@ #include "nnacl/fp32/conv_common_fp32.h" #include #ifdef ENABLE_AVX +#ifdef _MSC_VER +#include +#else #include #endif +#endif #include "nnacl/fp32/matmul_fp32.h" // fp32 conv common diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_depthwise_fp32.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_depthwise_fp32.h index ad991393978..d82f59257b7 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_depthwise_fp32.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_depthwise_fp32.h @@ -105,7 +105,7 @@ void DepthwiseSWAvxFp32(float *output_data, const float *input_data, const float void DepthwiseBorderAvxFp32(float *dst, const float *src, const float *weight, const float *bias, int top, int left, int right, const ConvParameter *conv_param, const SlidingWindowParam *sw_param, - DepthwiseSWKernel kernel, int act_type, int ow_bock, int oc_block); + const DepthwiseSWKernel kernel, int act_type, int ow_bock, int oc_block); void ConvDwFp32Avx3x3(float *output, float **input, const float *weights, const float *bias, size_t channels, size_t output_width, size_t input_stride, size_t relu, size_t relu6); diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/exp_fp32.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/exp_fp32.h index ae3ecfb19ad..a4780704af1 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/exp_fp32.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/exp_fp32.h @@ -67,13 +67,13 @@ static inline void simd_exp_avx(MS_FLOAT32X8 input, float *dst) { {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}}; input = MS_MAX256_F32(minv, MS_MIN256_F32(input, maxv)); - MS_INT32X8 integer = MS_CVT256PS_EPI32(input / param[0]); - MS_FLOAT32X8 decimal = input - MS_CVT256EPI32_PS(integer) * param[0]; + MS_INT32X8 integer = MS_CVT256PS_EPI32(MS_DIV256_F32(input, param[0])); + MS_FLOAT32X8 decimal = MS_SUB256_F32(input, MS_MUL256_F32(MS_CVT256EPI32_PS(integer), param[0])); MS_INT32X8 int_exp = MS_SLLI256_EPI32(MS_ADD256_EPI32(integer, MS_MOV256_EPI32(127)), 23); - MS_FLOAT32X8 decimal_exp = - param[5] + - decimal * (param[5] + decimal * (param[4] + decimal * (param[3] + decimal * (param[2] + decimal * param[1])))); - MS_ST256_F32(dst, decimal_exp * MS_CAST256_F32_S32(int_exp)); + MS_FLOAT32X8 tmp = MS_MUL256_F32(decimal, (MS_ADD256_F32(param[2], MS_MUL256_F32(decimal, param[1])))); + tmp = MS_MUL256_F32(decimal, MS_ADD256_F32(param[4], MS_MUL256_F32(decimal, MS_ADD256_F32(param[3], tmp)))); + MS_FLOAT32X8 decimal_exp = MS_ADD256_F32(param[5], MS_MUL256_F32(decimal, MS_ADD256_F32(param[5], tmp))); + MS_ST256_F32(dst, MS_MUL256_F32(decimal_exp, MS_CAST256_F32_S32(int_exp))); } #endif diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/activation_grad.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/activation_grad.c index 366d1a9cf6a..fcf25c041be 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/activation_grad.c +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/activation_grad.c @@ -116,7 +116,7 @@ int SoftplusGrad(const float *src0, const float *src1, int length, float *dst) { int i = 0; #if defined(ENABLE_AVX) for (; i <= length - C8NUM; i += C8NUM) { - simd_exp_avx(-(MS_LD256_F32(src1 + i)), dst + i); + simd_exp_avx(MS_SUB256_F32(MS_MOV256_F32(0.0f), (MS_LD256_F32(src1 + i))), dst + i); MS_ST256_F32(dst + i, MS_DIV256_F32(MS_LD256_F32(src0 + i), MS_ADD256_F32(MS_MOV256_F32(1.0f), MS_LD256_F32(dst + i)))); } diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/infer_register.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/infer_register.c index a38b1b554fc..1444dea477a 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/infer_register.c +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/infer_register.c @@ -47,7 +47,6 @@ #include "nnacl/infer/string/custom_normalize_infer.h" #include "nnacl/infer/string/custom_predict_infer.h" #include "nnacl/infer/deconv2d_infer.h" -#include "nnacl/infer/dedepthwise_conv2d_infer.h" #include "nnacl/infer/depth_to_space_infer.h" #include "nnacl/infer/depthwise_conv2d_infer.h" #include "nnacl/infer/detection_post_process_infer.h" diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/add_int8.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/add_int8.c index 35357fcf237..0140a0ec612 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/add_int8.c +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/add_int8.c @@ -15,11 +15,8 @@ */ #include "nnacl/int8/add_int8.h" -#ifdef ENABLE_NEON -#include -#endif +#include "nnacl/intrinsics/ms_simd_instructions.h" #ifdef ENABLE_AVX -#include #include "nnacl/intrinsics/avx/common_utils.h" #endif #include "nnacl/int8/fixed_point.h" @@ -319,8 +316,8 @@ void AddInt8_AVX2(const int8_t *input0, const int8_t *input1, int8_t *output, in const __m128i out_multiplier = _mm_set1_epi32(params->out_multiplier_); int index = 0; for (; index <= size - 16; index += 16) { - const __m128i in0_src = _mm_loadu_si128((__m128i_u *)(input0 + index)); - const __m128i in1_src = _mm_loadu_si128((__m128i_u *)(input1 + index)); + const __m128i in0_src = _mm_loadu_si128((__m128i *)(input0 + index)); + const __m128i in1_src = _mm_loadu_si128((__m128i *)(input1 + index)); const __m256i in0_s16 = _mm256_cvtepi8_epi16(in0_src); const __m128i in0_s16_low = _mm256_extractf128_si256(in0_s16, 0); @@ -398,7 +395,7 @@ void AddInt8_AVX2(const int8_t *input0, const int8_t *input1, int8_t *output, in __m128i out = _mm_packs_epi16(out_s16_1, out_s16_2); __m128i int8_out = _mm_max_epi8(min_vec, _mm_min_epi8(max_vec, out)); - _mm_storeu_si128((__m128i_u *)(output + index), int8_out); + _mm_storeu_si128((__m128i *)(output + index), int8_out); } for (; index < size; index++) { const int32_t in0_left = (input0[index] + params->in0_args_.zp_) * in0_left_shift; @@ -452,7 +449,7 @@ void AddOptInt8_AVX2(const int8_t *ptr_in, const int8_t element_in, int8_t *outp int index = 0; for (; index <= size - 16; index += 16) { - const __m128i in0_src = _mm_loadu_si128((__m128i_u *)(ptr_in + index)); + const __m128i in0_src = _mm_loadu_si128((__m128i *)(ptr_in + index)); const __m256i in0_s16 = _mm256_cvtepi8_epi16(in0_src); const __m128i in0_s16_low = _mm256_extractf128_si256(in0_s16, 0); const __m128i in0_s16_high = _mm256_extractf128_si256(in0_s16, 1); @@ -516,7 +513,7 @@ void AddOptInt8_AVX2(const int8_t *ptr_in, const int8_t element_in, int8_t *outp __m128i out = _mm_packs_epi16(out_s16_1, out_s16_2); __m128i int8_out = _mm_max_epi8(min_vec, _mm_min_epi8(max_vec, out)); - _mm_storeu_si128((__m128i_u *)(output + index), int8_out); + _mm_storeu_si128((__m128i *)(output + index), int8_out); } for (; index < size; index++) { const int32_t in0_left = (ptr_in[index] + ptr_args->zp_) * in0_left_shift; diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/add_int8.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/add_int8.h index cdd9e2c753e..f435a2b7f63 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/add_int8.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/add_int8.h @@ -13,13 +13,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #ifndef MINDSPORE_NNACL_ADD_INT8_H_ #define MINDSPORE_NNACL_ADD_INT8_H_ -#ifdef ENABLE_AVX -#include -#endif #include "nnacl/op_base.h" #include "nnacl/errorcode.h" #include "nnacl/arithmetic.h" diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/avx/TiledC4MatMulFp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/avx/TiledC4MatMulFp32.c index 101f3f6561d..e0578567006 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/avx/TiledC4MatMulFp32.c +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/avx/TiledC4MatMulFp32.c @@ -14,7 +14,11 @@ * limitations under the License. */ #ifdef ENABLE_AVX +#ifdef _MSC_VER +#include +#else #include +#endif #include "nnacl/fp32/common_func_fp32.h" static inline __m256 padd(__m256 v0, __m256 v1, __m256 v2, __m256 v3) { diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/avx/common_utils.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/avx/common_utils.c index 3152b30f97e..7b16f1239bb 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/avx/common_utils.c +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/avx/common_utils.c @@ -14,11 +14,7 @@ * limitations under the License. */ #include "nnacl/intrinsics/avx/common_utils.h" -#ifdef WIN32 -#ifdef ENABLE_AVX #include -#endif -#endif __m128i _mm_adds_epi32(__m128i a, __m128i b) { __m128i int_min = _mm_set1_epi32(0x80000000); diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/avx/common_utils.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/avx/common_utils.h index 1ef3d43182c..f976cc20143 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/avx/common_utils.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/avx/common_utils.h @@ -16,7 +16,7 @@ #ifndef MINDSPORE_NNACL_X86_64_AVX_COMMON_UTILS_H_ #define MINDSPORE_NNACL_X86_64_AVX_COMMON_UTILS_H_ -#ifdef SUPPORT_MSVC +#ifdef _MSC_VER #include #else #include diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions.h index cb6885fe8d2..887dd64411d 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions.h @@ -24,7 +24,7 @@ #endif #if defined(ENABLE_SSE) || defined(ENABLE_AVX) -#ifdef SUPPORT_MSVC +#ifdef _MSC_VER #include #define MS_F32X4_GETI(src, i) src.m128_f32[i] #else @@ -224,13 +224,25 @@ static inline MS_FLOAT32X4 MS_ERFX4_F32(MS_FLOAT32X4 src) { MS_ST256_F32(output_ptr + 7 * num, dst##8); static inline MS_FLOAT32X8 MS_TANHX8_F32(MS_FLOAT32X8 src) { - static const float data[] = {378.0f, 17325.0f, 135135.0f, 28.0f, 3150.0f, 62370.0f}; + static const MS_FLOAT32X8 data0 = {378.0f, 378.0f, 378.0f, 378.0f, 378.0f, 378.0f, 378.0f, 378.0f}; + static const MS_FLOAT32X8 data1 = {17325.0f, 17325.0f, 17325.0f, 17325.0f, 17325.0f, 17325.0f, 17325.0f, 17325.0f}; + static const MS_FLOAT32X8 data2 = {135135.0f, 135135.0f, 135135.0f, 135135.0f, + 135135.0f, 135135.0f, 135135.0f, 135135.0f}; + static const MS_FLOAT32X8 data3 = {28.0f, 28.0f, 28.0f, 28.0f, 28.0f, 28.0f, 28.0f, 28.0f}; + static const MS_FLOAT32X8 data4 = {3150.0f, 3150.0f, 3150.0f, 3150.0f, 3150.0f, 3150.0f, 3150.0f, 3150.0f}; + static const MS_FLOAT32X8 data5 = {62370.0f, 62370.0f, 62370.0f, 62370.0f, 62370.0f, 62370.0f, 62370.0f, 62370.0f}; static const MS_FLOAT32X8 neg = {-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f}; static const MS_FLOAT32X8 pos = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; - MS_FLOAT32X8 square = src * src; - MS_FLOAT32X8 a = (((square + data[0]) * square + data[1]) * square + data[2]) * src; - MS_FLOAT32X8 b = ((data[3] * square + data[4]) * square + data[5]) * square + data[2]; - return MS_MIN256_F32(MS_MAX256_F32(a / b, neg), pos); + MS_FLOAT32X8 square = MS_MUL256_F32(src, src); + MS_FLOAT32X8 a = MS_MUL256_F32( + MS_ADD256_F32(MS_MUL256_F32(MS_ADD256_F32(MS_MUL256_F32(MS_ADD256_F32(square, data0), square), data1), square), + data2), + src); + MS_FLOAT32X8 b = MS_ADD256_F32( + MS_MUL256_F32(MS_ADD256_F32(MS_MUL256_F32(MS_ADD256_F32(MS_MUL256_F32(data3, square), data4), square), data5), + square), + data2); + return MS_MIN256_F32(MS_MAX256_F32(MS_DIV256_F32(a, b), neg), pos); } #endif diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/ConvDwFp32IndirectRow.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/ConvDwFp32IndirectRow.c index 1c23b6a2a21..d0a25ab1b7d 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/ConvDwFp32IndirectRow.c +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/ConvDwFp32IndirectRow.c @@ -15,19 +15,23 @@ */ #ifdef ENABLE_AVX - +#ifdef _MSC_VER +#include +#else #include +#endif #include "nnacl/fp32/conv_depthwise_fp32.h" +#define INPUT_SIZE 25 + void ConvDwFp32Avx5x5(float *output, float **input, const float *weights, const float *bias, size_t channels, size_t output_width, size_t input_stride, size_t relu, size_t relu6) { input_stride /= sizeof(float *); size_t c8 = UP_DIV(channels, C8NUM) * C8NUM; size_t c8_mod = channels % C8NUM; - const int kernel = 25; + float *in[INPUT_SIZE]; for (int i = 0; i < output_width; ++i) { - float *in[kernel]; - for (int k = 0; k < kernel; k++) { + for (int k = 0; k < INPUT_SIZE; k++) { in[k] = input[k]; } input += input_stride; @@ -37,7 +41,7 @@ void ConvDwFp32Avx5x5(float *output, float **input, const float *weights, const for (; c >= C8NUM; c -= C8NUM) { __m256 out1 = _mm256_loadu_ps(bias1); bias1 += 8; - for (int k = 0; k < kernel; k += 5) { + for (int k = 0; k < INPUT_SIZE; k += 5) { __m256 in1 = _mm256_loadu_ps(in[k]); __m256 w1 = _mm256_loadu_ps(w); __m256 in2 = _mm256_loadu_ps(in[k + 1]); diff --git a/mindspore/lite/CMakeLists.txt b/mindspore/lite/CMakeLists.txt index 110a047bacd..72e2fdb0229 100644 --- a/mindspore/lite/CMakeLists.txt +++ b/mindspore/lite/CMakeLists.txt @@ -444,8 +444,10 @@ if(NOT PLATFORM_ARM) set(X86_64_SIMD "avx") add_compile_definitions(ENABLE_SSE) add_compile_definitions(ENABLE_AVX) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx -mfma") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx -mfma") + if(NOT MSVC) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx -mfma") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx -mfma") + endif() elseif(MSLITE_ENABLE_SSE) set(X86_64_SIMD "sse") add_compile_definitions(ENABLE_SSE)