diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/activation_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/activation_fp16.c
index 4f63c98b547..0e77ed85e18 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/activation_fp16.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/activation_fp16.c
@@ -56,7 +56,7 @@ int Relu6Fp16(const float16_t *data, float16_t *dst, int ele_num) {
 int LReluFp16(const float16_t *src, float16_t *dst, int ele_num, float16_t alpha) {
   int i = 0;
 #ifdef ENABLE_NEON
-  int ele_c8 = UP_ROUND(ele_num, C8NUM);
+  int ele_c8 = DOWN_ROUND(ele_num, C8NUM);
   for (; i < ele_c8; i += C8NUM) {
     float16x8_t src_tmp = vld1q_f16(src + i);
     float16x8_t mul_tmp = vmulq_n_f16(src_tmp, alpha);
@@ -187,7 +187,7 @@ int GeluFp16(const float16_t *src, int length, float16_t *dst, bool approximate)
   if (approximate) {
     // dst = 0.5 * x * (1 + tanh((2 / pi) ^ 0.5 * (x + 0.044715x^3)))
 #ifdef ENABLE_NEON
-    int C8 = UP_ROUND(length, C8NUM);
+    int C8 = DOWN_ROUND(length, C8NUM);
     for (; i < C8; i += C8NUM) {
       float16x8_t in = vld1q_f16(src + i);
       float16x8_t res =
@@ -202,7 +202,7 @@ int GeluFp16(const float16_t *src, int length, float16_t *dst, bool approximate)
     }
   } else {
 #ifdef ENABLE_NEON
-    int C8 = UP_ROUND(length, C8NUM);
+    int C8 = DOWN_ROUND(length, C8NUM);
     for (; i < C8; i += C8NUM) {
       float16x8_t in = vld1q_f16(src + i);
       const float16x8_t res = 0.5f * in * (1.0f + MS_ERFX8_F16(in / (float16_t)1.4142135623730951f));
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/power_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/power_fp16.c
index 5d23c750319..53638fd3926 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/power_fp16.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/power_fp16.c
@@ -70,7 +70,7 @@ void PowerBroadCastFp16(const float16_t *input, const float16_t *exponent, float
   }
   int i = 0;
 #ifdef ENABLE_NEON
-  int len_c8 = UP_ROUND(len, C8NUM);
+  int len_c8 = DOWN_ROUND(len, C8NUM);
   float16x8_t scale_8 = vmovq_n_f16(scale);
   float16x8_t shift_8 = vmovq_n_f16(shift);
   for (; i < len_c8; i += C8NUM) {
@@ -88,7 +88,7 @@ void PowerSingleFp16(const float16_t *input, const float16_t *exponent, float16_
   int i = 0;
   PowerScalarFunFp16 PowerScalarFunFp16_ = NULL;
 #ifdef ENABLE_NEON
-  int len_c8 = UP_ROUND(len, C8NUM);
+  int len_c8 = DOWN_ROUND(len, C8NUM);
   float16x8_t scale_8 = vmovq_n_f16(scale);
   float16x8_t shift_8 = vmovq_n_f16(shift);
   for (; i < len_c8; i += C8NUM) {
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/activation_fp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/activation_fp32.c
index cb1be6d2209..4ae4abdf8bc 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/activation_fp32.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/activation_fp32.c
@@ -228,7 +228,7 @@ int Gelu(const float *src, int length, float *dst, bool approximate) {
   if (approximate) {
     // dst = 0.5 * x * (1 + tanh((2 / pi) ^ 0.5 * (x + 0.044715x^3)))
 #if defined(ENABLE_AVX)
-    int C8 = UP_ROUND(length, C8NUM);
+    int C8 = DOWN_ROUND(length, C8NUM);
     for (; i < C8; i += C8NUM) {
       MS_FLOAT32X8 in = MS_LD256_F32(src + i);
       MS_FLOAT32X8 res = 0.5 * in * (1.0 + MS_TANHX8_F32((0.79788456080287f + 0.035677408136f * in * in) * in));
@@ -236,7 +236,7 @@ int Gelu(const float *src, int length, float *dst, bool approximate) {
     }
 #endif
 #if defined(ENABLE_SSE) || defined(ENABLE_ARM)
-    int C4 = UP_ROUND(length, C4NUM);
+    int C4 = DOWN_ROUND(length, C4NUM);
     for (; i < C4; i += C4NUM) {
       MS_FLOAT32X4 in = MS_LDQ_F32(src + i);
       MS_FLOAT32X4 res = 0.5 * in * (1.0 + MS_TANHX4_F32((0.79788456080287f + 0.035677408136f * in * in) * in));
@@ -248,7 +248,7 @@ int Gelu(const float *src, int length, float *dst, bool approximate) {
     }
   } else {
 #if defined(ENABLE_AVX) || defined(ENABLE_SSE) || defined(ENABLE_ARM)
-    int C4 = UP_ROUND(length, C4NUM);
+    int C4 = DOWN_ROUND(length, C4NUM);
     for (; i < C4; i += C4NUM) {
       MS_FLOAT32X4 in = MS_LDQ_F32(src + i);
       MS_FLOAT32X4 res = 0.5 * in * (1.0 + MS_ERFX4_F32(in / 1.4142135623730951f));
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_common_fp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_common_fp32.c
index e2b20006630..21ce22633bf 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_common_fp32.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_common_fp32.c
@@ -158,9 +158,9 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float
         if (ow_block < ow_block_num[oc_block - 1]) {  // ow is not enough and process one ow
           ow_block = 1;
         }
-        kernel[oc_block - 1][ow_block > 1](dst_w + ow * sw_param->block_channel_, src_w, weight, bias, kernel_h,
-                                           kernel_w, act_type, ow_block, oc_block, oc_algin, ic_algin, in_kw_step,
-                                           in_kh_step, in_sw_step, 0);
+        kernel[oc_block - 1][ow_block / ow_block_num[oc_block - 1]](
+          dst_w + ow * sw_param->block_channel_, src_w, weight, bias, kernel_h, kernel_w, act_type, ow_block,
+          oc_block, oc_algin, ic_algin, in_kw_step, in_kh_step, in_sw_step, 0);
         src_w += ow_block * in_sw_step;
       }
       // ow in left
@@ -177,11 +177,11 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float
 void SWConv3x32Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
                       size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
                       size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
-  in_kh_step *= 4;
-  in_sw_step *= 4;
-  in_kw_step *= 4;
-  oc_algin *= 4;
-  kw_remainder *= 4;
+  in_kh_step *= sizeof(float);
+  in_sw_step *= sizeof(float);
+  in_kw_step *= sizeof(float);
+  oc_algin *= sizeof(float);
+  kw_remainder *= sizeof(float);
   asm volatile(
     "cmpq $0, %2\n"
     "je 0f\n"
@@ -315,10 +315,10 @@ void SWConv3x32Kernel(float *dst, const float *src, const float *weight, const f
 void SWConv1x32Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
                       size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
                       size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
-  in_kh_step *= 4;
-  in_kw_step *= 4;
-  oc_algin *= 4;
-  kw_remainder *= 4;
+  in_kh_step *= sizeof(float);
+  in_kw_step *= sizeof(float);
+  oc_algin *= sizeof(float);
+  kw_remainder *= sizeof(float);
   asm volatile(
     "cmpq $0, %2\n"
     "je 0f\n"
@@ -397,13 +397,13 @@ void SWConv1x32Kernel(float *dst, const float *src, const float *weight, const f
 void SWConv4x24Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
                       size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
                       size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
-  in_kh_step *= 4;
-  in_kw_step *= 4;
-  in_sw_step *= 4;
-  kw_remainder *= 4;
+  in_kh_step *= sizeof(float);
+  in_kw_step *= sizeof(float);
+  in_sw_step *= sizeof(float);
+  kw_remainder *= sizeof(float);
   size_t src_3_step = 3 * in_sw_step;
   float *dst_3 = dst + 3 * oc_algin;
-  oc_algin *= 4;
+  oc_algin *= sizeof(float);
   asm volatile(
     "cmpq $0, %2\n"
     "je 0f\n"
@@ -545,10 +545,10 @@ void SWConv4x24Kernel(float *dst, const float *src, const float *weight, const f
 void SWConv1x24Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
                       size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
                       size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
-  in_kh_step *= 4;
-  in_kw_step *= 4;
-  kw_remainder *= 4;
-  oc_algin *= 4;
+  in_kh_step *= sizeof(float);
+  in_kw_step *= sizeof(float);
+  kw_remainder *= sizeof(float);
+  oc_algin *= sizeof(float);
   asm volatile(
     "cmpq $0, %2\n"
     "je 0f\n"
@@ -621,13 +621,13 @@ void SWConv1x24Kernel(float *dst, const float *src, const float *weight, const f
 void SWConv6x16Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
                       size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
                       size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
-  in_kh_step *= 4;
-  in_kw_step *= 4;
-  in_sw_step *= 4;
-  kw_remainder *= 4;
+  in_kh_step *= sizeof(float);
+  in_kw_step *= sizeof(float);
+  in_sw_step *= sizeof(float);
+  kw_remainder *= sizeof(float);
   size_t src_3_step = 3 * in_sw_step;
   float *dst_3 = dst + 3 * oc_algin;
-  oc_algin *= 4;
+  oc_algin *= sizeof(float);
   asm volatile(
     "cmpq $0, %2\n"
     "je 0f\n"
@@ -772,10 +772,10 @@ void SWConv6x16Kernel(float *dst, const float *src, const float *weight, const f
 void SWConv1x16Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
                       size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
                       size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
-  in_kh_step *= 4;
-  in_kw_step *= 4;
-  kw_remainder *= 4;
-  oc_algin *= 4;
+  in_kh_step *= sizeof(float);
+  in_kw_step *= sizeof(float);
+  kw_remainder *= sizeof(float);
+  oc_algin *= sizeof(float);
   asm volatile(
     "cmpq $0, %2\n"
     "je 0f\n"
@@ -842,15 +842,15 @@ void SWConv1x16Kernel(float *dst, const float *src, const float *weight, const f
 void SWConv12x8Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
                       size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
                       size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
-  in_kh_step *= 4;
-  in_sw_step *= 4;
-  in_kw_step *= 4;
-  kw_remainder *= 4;
+  in_kh_step *= sizeof(float);
+  in_sw_step *= sizeof(float);
+  in_kw_step *= sizeof(float);
+  kw_remainder *= sizeof(float);
   size_t src_3_step = 3 * in_sw_step;
   float *dst_3 = dst + 3 * oc_algin;
   float *dst_5 = dst + 5 * oc_algin;
   float *dst_9 = dst + 9 * oc_algin;
-  oc_algin *= 4;
+  oc_algin *= sizeof(float);
   asm volatile(
     "cmpq $0, %0\n"
     "je 0f\n"
@@ -1001,12 +1001,12 @@ void SWConv12x8Kernel(float *dst, const float *src, const float *weight, const f
 void SWConv4x8Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
                      size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
                      size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
-  in_kh_step *= 4;
-  in_sw_step *= 4;
-  in_kw_step *= 4;
+  in_kh_step *= sizeof(float);
+  in_sw_step *= sizeof(float);
+  in_kw_step *= sizeof(float);
   size_t src_step = 3 * in_sw_step;
   float *dst_3 = dst + 3 * oc_algin;
-  oc_algin *= 4;
+  oc_algin *= sizeof(float);
   asm volatile(
     "cmpq $0, %2\n"
     "je 0f\n"
@@ -1091,10 +1091,10 @@ void SWConv4x8Kernel(float *dst, const float *src, const float *weight, const fl
 void SWConv1x8Kernel(float *dst, const float *src, const float *weight, const float *bias, size_t kernel_h,
                      size_t kernel_w, size_t act_flag, size_t ow_block, size_t oc_block, size_t oc_algin,
                      size_t ic_algin, size_t in_kw_step, size_t in_kh_step, size_t in_sw_step, size_t kw_remainder) {
-  in_kh_step *= 4;
-  in_kw_step *= 4;
-  kw_remainder *= 4;
-  oc_algin *= 4;
+  in_kh_step *= sizeof(float);
+  in_kw_step *= sizeof(float);
+  kw_remainder *= sizeof(float);
+  oc_algin *= sizeof(float);
   asm volatile(
     "cmpq $0, %2\n"
     "je 0f\n"
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/power_fp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/power_fp32.c
index 57a93a707d9..0b3fb7061b4 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/power_fp32.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/power_fp32.c
@@ -67,7 +67,7 @@ void PowerBroadCast(const float *input, const float *exponent, float *output, in
   }
   int i = 0;
 #if defined(ENABLE_AVX) || defined(ENABLE_SSE) || defined(ENABLE_ARM)
-  int len_c4 = UP_ROUND(len, C4NUM);
+  int len_c4 = DOWN_ROUND(len, C4NUM);
   MS_FLOAT32X4 scale_4 = MS_MOVQ_F32(scale);
   MS_FLOAT32X4 shift_4 = MS_MOVQ_F32(shift);
   for (; i < len_c4; i += C4NUM) {
@@ -84,7 +84,7 @@ void PowerSingle(const float *input, const float *exponent, float *output, int l
   int i = 0;
   PowerScalarFun PowerScalarFun_ = NULL;
 #if defined(ENABLE_AVX) || defined(ENABLE_SSE) || defined(ENABLE_ARM)
-  int len_c4 = UP_ROUND(len, C4NUM);
+  int len_c4 = DOWN_ROUND(len, C4NUM);
   MS_FLOAT32X4 scale_4 = MS_MOVQ_F32(scale);
   MS_FLOAT32X4 shift_4 = MS_MOVQ_F32(shift);
   for (; i < len_c4; i += C4NUM) {
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/op_base.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/op_base.h
index 51da043bb56..ab1b9bab84e 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/op_base.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/op_base.h
@@ -39,7 +39,8 @@
 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
 #define UP_ROUND(x, y) (((x) + (y) - (1)) / (y) * (y))
 #define UP_ROUND_DIV(x, y) (x % y == 0 ? (x / y) : (x / y) + 1)
-#define DOWN_DIV(x, y) (((x) - (y) + (1)) / (y))
+#define DOWN_DIV(x, y) ((x) / (y))
+#define DOWN_ROUND(x, y) ((x) / (y) * (y))
 
 #define MSVALID(left, x, right) (MSMIN((MSMAX(left, x)), right))
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow_fp32.cc
index 17af2cacec6..68bbd780f07 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow_fp32.cc
@@ -186,8 +186,8 @@ int ConvolutionSWCPUKernel::Run() {
     FreeTmpBuffer();
     return error_code;
   }
-  auto out_data = reinterpret_cast<float *>(out_tensors_.front()->MutableData());
   if (oc_res_ != 0) {
+    auto out_data = reinterpret_cast<float *>(out_tensors_.front()->MutableData());
    PackNHWCXToNHWCFp32(output_data_, out_data, conv_param_->output_batch_,
                        conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_, oc_tile_);
   }
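
Reviewer note (not part of the patch): UP_ROUND rounds a length up to the next multiple of the vector width, so using it as the bound of a SIMD loop makes the loop read and write past the end of the buffer whenever the length is not already a multiple; DOWN_ROUND keeps the vector loop inside the buffer and leaves the remainder to the scalar tail loop that follows in each of these kernels. A minimal sketch of the pattern, with a hypothetical square_f32 function and a plain C inner loop standing in for the NEON/SSE/AVX intrinsics:

  #define C4NUM 4
  #define UP_ROUND(x, y) (((x) + (y) - (1)) / (y) * (y)) /* 10 -> 12: would overrun a 10-element buffer */
  #define DOWN_ROUND(x, y) ((x) / (y) * (y))             /* 10 -> 8: safe vector-loop bound */

  /* Hypothetical example: process `len` floats four at a time, then a scalar tail. */
  void square_f32(const float *src, float *dst, int len) {
    int i = 0;
    int len_c4 = DOWN_ROUND(len, C4NUM);  /* with UP_ROUND this loop would touch src[len] and beyond */
    for (; i < len_c4; i += C4NUM) {
      for (int j = 0; j < C4NUM; ++j) {   /* stands in for a 4-lane SIMD load/multiply/store */
        dst[i + j] = src[i + j] * src[i + j];
      }
    }
    for (; i < len; ++i) {                /* scalar remainder, as in the patched kernels */
      dst[i] = src[i] * src[i];
    }
  }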
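
Reviewer note on the op_base.h change (not part of the patch): the old DOWN_DIV is one too small whenever x % y != y - 1, e.g. DOWN_DIV(4, 2) expanded to (4 - 2 + 1) / 2, i.e. 1 instead of 2; the new form is plain truncating division, which is floor division for the non-negative sizes it is applied to, and DOWN_ROUND is the matching floor-to-multiple helper used for the new loop bounds. A few sanity checks, as a sketch:

  #include <assert.h>

  #define DOWN_DIV(x, y) ((x) / (y))
  #define DOWN_ROUND(x, y) ((x) / (y) * (y))

  int main(void) {
    assert(DOWN_DIV(4, 2) == 2);     /* old macro: (4 - 2 + 1) / 2 == 1 */
    assert(DOWN_DIV(5, 3) == 1);     /* old macro agreed here: (5 - 3 + 1) / 3 == 1 */
    assert(DOWN_ROUND(10, 4) == 8);  /* vector-loop bound for len = 10, C4NUM = 4 */
    assert(DOWN_ROUND(8, 4) == 8);   /* already a multiple: unchanged */
    return 0;
  }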
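
Reviewer note on the `* sizeof(float)` change (not part of the patch): the SWConv*Kernel strides arrive as element counts but are used as byte displacements inside the x86 inline assembly, so they are scaled by the element size up front; writing sizeof(float) instead of the literal 4 keeps the value identical while documenting that intent. The same idea in ordinary C, with a hypothetical next_row helper:

  #include <stddef.h>
  #include <stdint.h>

  /* Hypothetical helper: advance a float pointer by a stride given in elements,
   * going through a byte pointer the way the asm kernels address memory. */
  static inline const float *next_row(const float *base, size_t row_stride_elems) {
    size_t row_stride_bytes = row_stride_elems * sizeof(float);  /* what the kernels now compute */
    return (const float *)((const uint8_t *)base + row_stride_bytes);
  }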