diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_depthwise_fp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_depthwise_fp32.c
index ace1e320cea..2cbdca0d000 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_depthwise_fp32.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/conv_depthwise_fp32.c
@@ -13,10 +13,10 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #include "nnacl/fp32/conv_depthwise_fp32.h"
 #include "nnacl/common_func.h"
 #include "nnacl/fp32/common_func_fp32.h"
+#include "nnacl/intrinsics/ms_simd_instructions.h"
 
 #if !defined(ENABLE_ARM) && !defined(ENABLE_SSE)
 void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *weight_ptr, int num_pixels,
@@ -622,8 +622,8 @@ void ConvDw3x3Line(float *dst, float **lines, const float *weight, const float *
       MS_STQ_F32(cur_dst + ori_channel, res1);
     } else {
       for (int i = 0; i < channel; i++) {
-        cur_dst[i] = res0[i];
-        cur_dst[ori_channel + i] = res1[i];
+        cur_dst[i] = MS_F32X4_GETI(res0, i);
+        cur_dst[ori_channel + i] = MS_F32X4_GETI(res1, i);
       }
     }
     cur_dst += 2 * ori_channel;
@@ -653,7 +653,7 @@ void ConvDw3x3Line(float *dst, float **lines, const float *weight, const float *
       MS_STQ_F32(cur_dst, res0);
     } else {
       for (int i = 0; i < channel; i++) {
-        cur_dst[i] = res0[i];
+        cur_dst[i] = MS_F32X4_GETI(res0, i);
       }
     }
   }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/power_fp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/power_fp32.c
index 5c5c6e4ca5a..213cc345dcf 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/power_fp32.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/power_fp32.c
@@ -91,7 +91,7 @@ void PowerSingle(const float *input, const float *exponent, float *output, int l
     MS_FLOAT32X4 tmp_4 = MS_ADDQ_F32(MS_MULQ_F32(scale_4, MS_LDQ_F32(input + i)), shift_4);
     for (int j = 0; j < 4; ++j) {
       PowerScalarFun_ = CheckInteger(exponent[i + j]) ? OptimizedPowerScalar : StdPowerScalar;
-      output[i + j] = PowerScalarFun_(tmp_4[j], exponent + i + j);
+      output[i + j] = PowerScalarFun_(MS_F32X4_GETI(tmp_4, j), exponent + i + j);
     }
   }
 #endif
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/power_fp32.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/power_fp32.h
index 928c3cd9de2..633228d8591 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/power_fp32.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/power_fp32.h
@@ -20,6 +20,7 @@
 #include <math.h>
 #include "nnacl/op_base.h"
 #include "nnacl/power_parameter.h"
+#include "nnacl/intrinsics/ms_simd_instructions.h"
 
 #if defined(ENABLE_ARM) || defined(ENABLE_AVX) || defined(ENABLE_SSE)
 typedef MS_FLOAT32X4 (*PowerSimdFun)(MS_FLOAT32X4 x, const void *exponent);
@@ -38,7 +39,7 @@ static inline float StdPowerScalar(float x, const void *exponent) { return powf(
 static inline MS_FLOAT32X4 StdPowerSimd(MS_FLOAT32X4 x, const void *exponent) {
   MS_FLOAT32X4 result;
   for (int i = 0; i < 4; ++i) {
-    result[i] = powf(x[i], *(float *)exponent);
+    MS_F32X4_GETI(result, i) = powf(MS_F32X4_GETI(x, i), *(float *)exponent);
   }
   return result;
 }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/rmsprop_fp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/rmsprop_fp32.c
index dbf1250e391..7dba3d58202 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/rmsprop_fp32.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/rmsprop_fp32.c
@@ -14,8 +14,12 @@
  * limitations under the License.
  */
 #ifdef ENABLE_SSE
+#ifdef SUPPORT_MSVC
+#include <immintrin.h>
+#else
 #include <x86intrin.h>
 #endif
+#endif
 
 #ifdef ENABLE_AVX
 #include <immintrin.h>
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/winograd_utils.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/winograd_utils.c
index f1a0f25e273..a94e3672fa7 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/winograd_utils.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/winograd_utils.c
@@ -13,8 +13,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #include "nnacl/fp32/winograd_utils.h"
+#include "nnacl/intrinsics/ms_simd_instructions.h"
 #include "nnacl/base/minimal_filtering_generator.h"
 #include "nnacl/errorcode.h"
 
@@ -486,7 +486,7 @@ void OutputTransform4x2Unit(const float *src_data, float *dst_data, const float
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 2;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -551,7 +551,7 @@ void OutputTransform4x2ReluUnit(const float *src_data, float *dst_data, const fl
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 2;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -621,7 +621,7 @@ void OutputTransform4x2Relu6Unit(const float *src_data, float *dst_data, const f
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 2;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -690,7 +690,7 @@ void OutputTransform4x3Unit(const float *src_data, float *dst_data, const float
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 3;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -762,7 +762,7 @@ void OutputTransform4x3ReluUnit(const float *src_data, float *dst_data, const fl
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 3;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -840,7 +840,7 @@ void OutputTransform4x3Relu6Unit(const float *src_data, float *dst_data, const f
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 3;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -917,7 +917,7 @@ void OutputTransform6x2Unit(const float *src_data, float *dst_data, const float
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 2;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -992,7 +992,7 @@ void OutputTransform6x2ReluUnit(const float *src_data, float *dst_data, const fl
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 2;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -1072,7 +1072,7 @@ void OutputTransform6x2Relu6Unit(const float *src_data, float *dst_data, const f
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 2;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -1146,7 +1146,7 @@ void OutputTransform6x3Unit(const float *src_data, float *dst_data, const float
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 3;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -1223,7 +1223,7 @@ void OutputTransform6x3ReluUnit(const float *src_data, float *dst_data, const fl
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 3;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -1306,7 +1306,7 @@ void OutputTransform6x3Relu6Unit(const float *src_data, float *dst_data, const f
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 3;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -1385,7 +1385,7 @@ void OutputTransform6x4Unit(const float *src_data, float *dst_data, const float
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 4;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -1468,7 +1468,7 @@ void OutputTransform6x4ReluUnit(const float *src_data, float *dst_data, const fl
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 4;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -1558,7 +1558,7 @@ void OutputTransform6x4Relu6Unit(const float *src_data, float *dst_data, const f
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 4;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -1641,7 +1641,7 @@ void OutputTransform6x5Unit(const float *src_data, float *dst_data, const float
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 5;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -1729,7 +1729,7 @@ void OutputTransform6x5ReluUnit(const float *src_data, float *dst_data, const fl
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 5;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -1825,7 +1825,7 @@ void OutputTransform6x5Relu6Unit(const float *src_data, float *dst_data, const f
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 5;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -1911,7 +1911,7 @@ void OutputTransform8x2Unit(const float *src_data, float *dst_data, const float
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 2;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -1994,7 +1994,7 @@ void OutputTransform8x2ReluUnit(const float *src_data, float *dst_data, const fl
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 2;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -2082,7 +2082,7 @@ void OutputTransform8x2Relu6Unit(const float *src_data, float *dst_data, const f
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 2;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -2168,7 +2168,7 @@ void OutputTransform8x3Unit(const float *src_data, float *dst_data, const float
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 3;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -2259,7 +2259,7 @@ void OutputTransform8x3ReluUnit(const float *src_data, float *dst_data, const fl
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 3;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -2356,7 +2356,7 @@ void OutputTransform8x3Relu6Unit(const float *src_data, float *dst_data, const f
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 3;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -2450,7 +2450,7 @@ void OutputTransform8x4Unit(const float *src_data, float *dst_data, const float
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 4;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -2550,7 +2550,7 @@ void OutputTransform8x4ReluUnit(const float *src_data, float *dst_data, const fl
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 4;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -2657,7 +2657,7 @@ void OutputTransform8x4Relu6Unit(const float *src_data, float *dst_data, const f
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 4;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -2761,7 +2761,7 @@ void OutputTransform8x5Unit(const float *src_data, float *dst_data, const float
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 5;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -2869,7 +2869,7 @@ void OutputTransform8x5ReluUnit(const float *src_data, float *dst_data, const fl
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 5;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -2988,7 +2988,7 @@ void OutputTransform8x5Relu6Unit(const float *src_data, float *dst_data, const f
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 5;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -3108,7 +3108,7 @@ void OutputTransform8x6Unit(const float *src_data, float *dst_data, const float
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 6;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -3236,7 +3236,7 @@ void OutputTransform8x6ReluUnit(const float *src_data, float *dst_data, const fl
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 6;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -3373,7 +3373,7 @@ void OutputTransform8x6Relu6Unit(const float *src_data, float *dst_data, const f
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 6;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -3501,7 +3501,7 @@ void OutputTransform8x7Unit(const float *src_data, float *dst_data, const float
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 7;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -3638,7 +3638,7 @@ void OutputTransform8x7ReluUnit(const float *src_data, float *dst_data, const fl
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 7;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
@@ -3785,7 +3785,7 @@ void OutputTransform8x7Relu6Unit(const float *src_data, float *dst_data, const f
       int dst_k_offset = j * dst_step * out_c;
       int m_k_offset = j * 7;
       for (int k = 0; k < r_w; k++) {
-        dst_data[i + dst_k_offset + k * out_c] = m[k + m_k_offset][i];
+        dst_data[i + dst_k_offset + k * out_c] = MS_F32X4_GETI(m[k + m_k_offset], i);
       }
     }
   }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/batch_norm.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/batch_norm.c
index ec381acc8d9..057de3e61df 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/batch_norm.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/batch_norm.c
@@ -22,10 +22,14 @@ void var2Invar(float *save_var, int size, float eps) {
     save_var[i] = 1.0f / sqrt(save_var[i] + eps);
   }
 }
-
+#ifdef SUPPORT_MSVC
+void backwardAll(const float *in, const float *yt, const float *mean, const float *invar, const float *scale, int size,
+                 int ch, float *dxhat_sum, float *dxhathat_sum, float *dbias, float *dscale, float *dx) {
+#else
 void backwardAll(const float *restrict in, const float *restrict yt, const float *restrict mean,
                  const float *restrict invar, const float *restrict scale, int size, int ch, float *restrict dxhat_sum,
                  float *restrict dxhathat_sum, float *restrict dbias, float *restrict dscale, float *restrict dx) {
+#endif
   float N = (float)size;
   for (int i = 0; i < size; i++) {
     for (int c = 0; c < ch; c++) {
@@ -50,9 +54,15 @@ void backwardAll(const float *restrict in, const float *restrict yt, const float
     }
   }
 }
+
+#ifdef SUPPORT_MSVC
+void backwardP1(const float *in, const float *yt, const float *mean, const float *invar, const float *scale, int size,
+                int ch, float *dxhat_sum, float *dxhathat_sum, float *dbias, float *dscale) {
+#else
 void backwardP1(const float *restrict in, const float *restrict yt, const float *restrict mean,
                 const float *restrict invar, const float *restrict scale, int size, int ch, float *restrict dxhat_sum,
                 float *restrict dxhathat_sum, float *restrict dbias, float *restrict dscale) {
+#endif
   for (int i = 0; i < size; i++) {
     for (int c = 0; c < ch; c++) {
       int ix = i * ch + c;
@@ -68,9 +78,14 @@ void backwardP1(const float *restrict in, const float *restrict yt, const float
   }
 }
 
+#ifdef SUPPORT_MSVC
+void backwardP2(const float *in, const float *yt, const float *mean, const float *invar, const float *scale, int size,
+                int total_size, int ch, const float *dxhat_sum, const float *dxhathat_sum, float *dx) {
+#else
 void backwardP2(const float *restrict in, const float *restrict yt, const float *restrict mean,
                 const float *restrict invar, const float *restrict scale, int size, int total_size, int ch,
                 const float *dxhat_sum, const float *dxhathat_sum, float *restrict dx) {
+#endif
   const float N = (float)total_size;
   for (int i = 0; i < size; i++) {
     for (int c = 0; c < ch; c++) {
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/gemm.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/gemm.c
index bd9dd67004d..03d918de5d6 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/gemm.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/gemm.c
@@ -21,7 +21,11 @@
 #endif
 #include "nnacl/fp32/matmul_fp32.h"
 
+#ifdef SUPPORT_MSVC
+void AddMatrix(const float *__restrict v1, float *__restrict v2, float beta, int row, int col, int stride) {
+#else
 void AddMatrix(const float *restrict v1, float *restrict v2, float beta, int row, int col, int stride) {
+#endif
   const float *src_ptr = v1;
   float *dst_ptr = v2;
   for (int r = 0; r < row; r++) {
@@ -86,8 +90,7 @@ static void RowMajor2Row12MajorStride(const float *src_ptr, float *dst_ptr, int
   return;
 }
 
-static void RowMajor2Col12MajorStride(const float *restrict src_ptr, float *restrict dst_ptr, size_t row, size_t col,
-                                      int lead) {
+static void RowMajor2Col12MajorStride(const float *src_ptr, float *dst_ptr, size_t row, size_t col, int lead) {
   size_t row_up_12 = UP_ROUND(row, C12NUM);
   size_t row12 = row / C12NUM * C12NUM;
   size_t col4 = col / C4NUM * C4NUM;
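Note on the batch_norm.c and gemm.c hunks above: they both work around the same gap, namely that MSVC does not accept the C99 `restrict` keyword (it only understands its own `__restrict` spelling), so the patch duplicates each signature under SUPPORT_MSVC. A common alternative is a single qualifier macro; the sketch below is illustrative only and not part of nnacl (the RESTRICT name is hypothetical):

#if defined(_MSC_VER)
#define RESTRICT __restrict /* MSVC spelling */
#else
#define RESTRICT restrict /* C99 keyword, accepted by GCC/Clang */
#endif

/* With either spelling the compiler may assume src and dst never alias,
 * which is what lets it vectorize loops like this one. */
void ScaleRows(const float *RESTRICT src, float *RESTRICT dst, int n) {
  for (int i = 0; i < n; i++) {
    dst[i] = 2.0f * src[i];
  }
}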
+#include "nnacl/infer/broadcast_to_infer.h" +#include "nnacl/infer/cast_infer.h" +#include "nnacl/infer/common_infer.h" +#include "nnacl/infer/concat_infer.h" +#include "nnacl/infer/constant_of_shape_infer.h" +#include "nnacl/infer/conv2d_grad_filter_infer.h" +#include "nnacl/infer/conv2d_grad_input_infer.h" +#include "nnacl/infer/conv2d_infer.h" +#include "nnacl/infer/crop_and_resize_infer.h" +#include "nnacl/infer/crop_infer.h" +#include "nnacl/infer/cumsum_infer.h" +#include "nnacl/infer/custom_extract_features_infer.h" +#include "nnacl/infer/custom_normalize_infer.h" +#include "nnacl/infer/custom_predict_infer.h" +#include "nnacl/infer/deconv2d_infer.h" +#include "nnacl/infer/dedepthwise_conv2d_infer.h" +#include "nnacl/infer/depth_to_space_infer.h" +#include "nnacl/infer/depthwise_conv2d_infer.h" +#include "nnacl/infer/detection_post_process_infer.h" +#include "nnacl/infer/dropout_grad_infer.h" +#include "nnacl/infer/dropout_infer.h" +#include "nnacl/infer/embedding_lookup_infer.h" +#include "nnacl/infer/expand_dims_infer.h" +#include "nnacl/infer/fft_imag_infer.h" +#include "nnacl/infer/fft_real_infer.h" +#include "nnacl/infer/fill_infer.h" +#include "nnacl/infer/flatten_grad_infer.h" +#include "nnacl/infer/flatten_infer.h" +#include "nnacl/infer/full_connection_infer.h" +#include "nnacl/infer/fused_batchnorm_infer.h" +#include "nnacl/infer/gather_infer.h" +#include "nnacl/infer/gather_nd_infer.h" +#include "nnacl/infer/group_conv2d_grad_input_infer.h" +#include "nnacl/infer/gru_infer.h" +#include "nnacl/infer/hashtable_lookup_infer.h" +#include "nnacl/infer/invert_permutation_infer.h" +#include "nnacl/infer/layer_norm_grad_infer.h" +#include "nnacl/infer/layer_norm_infer.h" +#include "nnacl/infer/lin_space_infer.h" +#include "nnacl/infer/log_softmax_infer.h" +#include "nnacl/infer/lsh_projection_infer.h" +#include "nnacl/infer/lstm_infer.h" +#include "nnacl/infer/matmul_infer.h" +#include "nnacl/infer/max_min_grad_infer.h" +#include "nnacl/infer/mean_infer.h" +#include "nnacl/infer/merge_infer.h" +#include "nnacl/infer/mfcc_infer.h" +#include "nnacl/infer/non_max_suppression_infer.h" +#include "nnacl/infer/one_hot_infer.h" +#include "nnacl/infer/pad_infer.h" +#include "nnacl/infer/partial_infer.h" +#include "nnacl/infer/pooling_grad_infer.h" +#include "nnacl/infer/pooling_infer.h" +#include "nnacl/infer/power_infer.h" +#include "nnacl/infer/prior_box_infer.h" +#include "nnacl/infer/quant_dtype_cast_infer.h" +#include "nnacl/infer/random_standard_normal_infer.h" +#include "nnacl/infer/range_infer.h" +#include "nnacl/infer/rank_infer.h" +#include "nnacl/infer/reduce_infer.h" +#include "nnacl/infer/reshape_infer.h" +#include "nnacl/infer/resize_grad_infer.h" +#include "nnacl/infer/resize_infer.h" +#include "nnacl/infer/rfft_infer.h" +#include "nnacl/infer/roi_pooling_infer.h" +#include "nnacl/infer/scatter_nd_infer.h" +#include "nnacl/infer/select_infer.h" +#include "nnacl/infer/sgd_infer.h" +#include "nnacl/infer/shape_infer.h" +#include "nnacl/infer/size_infer.h" +#include "nnacl/infer/skip_gram_infer.h" +#include "nnacl/infer/slice_infer.h" +#include "nnacl/infer/softmax_cross_entropy_infer.h" +#include "nnacl/infer/softmax_infer.h" +#include "nnacl/infer/space_to_batch_infer.h" +#include "nnacl/infer/space_to_batch_nd_infer.h" +#include "nnacl/infer/space_to_depth_infer.h" +#include "nnacl/infer/sparse_softmax_cross_entropy_with_logits_infer.h" +#include "nnacl/infer/sparse_to_dense_infer.h" +#include "nnacl/infer/splice_infer.h" +#include "nnacl/infer/split_infer.h" +#include 
"nnacl/infer/squeeze_infer.h" +#include "nnacl/infer/stack_infer.h" +#include "nnacl/infer/strided_slice_grad_infer.h" +#include "nnacl/infer/strided_slice_infer.h" +#include "nnacl/infer/switch_infer.h" +#include "nnacl/infer/tensorlist_fromtensor_infer.h" +#include "nnacl/infer/tensorlist_getitem_infer.h" +#include "nnacl/infer/tensorlist_reserve_infer.h" +#include "nnacl/infer/tensorlist_setitem_infer.h" +#include "nnacl/infer/tensorlist_stack_infer.h" +#include "nnacl/infer/tile_infer.h" +#include "nnacl/infer/topk_infer.h" +#include "nnacl/infer/transpose_infer.h" +#include "nnacl/infer/uniform_real_infer.h" +#include "nnacl/infer/unique_infer.h" +#include "nnacl/infer/unsorted_segment_sum_infer.h" +#include "nnacl/infer/unsqueeze_infer.h" +#include "nnacl/infer/unstack_infer.h" +#include "nnacl/infer/where_infer.h" +#include "nnacl/infer/while_infer.h" + +InferShape g_infer_func[PrimType_MAX * sizeof(InferShape)] = {0}; +void RegAllInferFunc1() { + g_infer_func[PrimType_NONE] = NULL; + g_infer_func[PrimType_Abs] = CommonInferShape; + g_infer_func[PrimType_Activation] = CommonInferShape; + g_infer_func[PrimType_ActivationGrad] = CommonInferShape; + g_infer_func[PrimType_Adam] = AdamInferShape; + g_infer_func[PrimType_AddFusion] = ArithmeticInferShape; + g_infer_func[PrimType_AdderFusion] = Conv2dInferShape; + g_infer_func[PrimType_AddGrad] = AddSubGradInferShape; + g_infer_func[PrimType_AddN] = AddnInferShape; + g_infer_func[PrimType_All] = NULL; + g_infer_func[PrimType_ApplyMomentum] = ApplyMomentumInferShape; + g_infer_func[PrimType_ArgMaxFusion] = ArgMinMaxInferShape; + g_infer_func[PrimType_ArgMinFusion] = ArgMinMaxInferShape; + g_infer_func[PrimType_Assert] = AssertOpInferShape; + g_infer_func[PrimType_Assign] = AssignInferShape; + g_infer_func[PrimType_AssignAdd] = AssignAddInferShape; + g_infer_func[PrimType_AudioSpectrogram] = AudioSpectrogramInferShape; + g_infer_func[PrimType_AvgPoolFusion] = PoolingInferShape; + g_infer_func[PrimType_AvgPoolGrad] = PoolingGradInferShape; + g_infer_func[PrimType_BatchNorm] = CommonInferShape; + g_infer_func[PrimType_BatchNormGrad] = BnGradInferShape; + g_infer_func[PrimType_BatchToSpace] = BatchToSpaceInferShape; + g_infer_func[PrimType_BatchToSpaceND] = NULL; + g_infer_func[PrimType_BiasAdd] = CommonInferShape; + g_infer_func[PrimType_BinaryCrossEntropy] = BinaryCrossEntropyInferShape; + g_infer_func[PrimType_BinaryCrossEntropyGrad] = CommonInferShape; + g_infer_func[PrimType_BiasAddGrad] = BiasGradInferShape; + g_infer_func[PrimType_BroadcastTo] = BroadcastToInferShape; + g_infer_func[PrimType_Cast] = CastInferShape; + g_infer_func[PrimType_Ceil] = CommonInferShape; + g_infer_func[PrimType_Clip] = CommonInferShape; + g_infer_func[PrimType_Concat] = ConcatInferShape; + g_infer_func[PrimType_ControlDepend] = CommonInferShape; + g_infer_func[PrimType_Conv2DBackpropFilterFusion] = Conv2dGradFilterInferShape; + g_infer_func[PrimType_Conv2DBackpropInputFusion] = Conv2dGradInputInferShape; + g_infer_func[PrimType_Conv2DFusion] = Conv2dInferShape; + g_infer_func[PrimType_Conv2dTransposeFusion] = Deconv2dInferShape; + g_infer_func[PrimType_Cos] = CommonInferShape; + g_infer_func[PrimType_ConstantOfShape] = ConstantOfShapeInferShape; + g_infer_func[PrimType_Crop] = CropInferShape; + g_infer_func[PrimType_CustomExtractFeatures] = CustomExtractFeaturesInferShape; + g_infer_func[PrimType_CustomNormalize] = CustomNormalizeInferShape; + g_infer_func[PrimType_CustomPredict] = CustomPredictInferShape; + g_infer_func[PrimType_DeConv2DGradFilter] = NULL; + 
g_infer_func[PrimType_Depend] = CommonInferShape; + g_infer_func[PrimType_DepthToSpace] = DepthToSpaceInferShape; + g_infer_func[PrimType_DetectionPostProcess] = DetectionPostProcessInferShape; + g_infer_func[PrimType_DivFusion] = ArithmeticInferShape; + g_infer_func[PrimType_DivGrad] = ArithmeticGradInferShape; + g_infer_func[PrimType_Dropout] = DropoutInferShape; + g_infer_func[PrimType_DropoutGrad] = DropoutGradInferShape; + g_infer_func[PrimType_Elu] = CommonInferShape; + g_infer_func[PrimType_Eltwise] = ArithmeticInferShape; + g_infer_func[PrimType_Equal] = ArithmeticCompareInferShape; + g_infer_func[PrimType_EmbeddingLookupFusion] = EmbeddingLookupInferShape; + g_infer_func[PrimType_ExpFusion] = CommonInferShape; + g_infer_func[PrimType_ExpandDims] = ExpandDimsInferShape; + g_infer_func[PrimType_FakeQuantWithMinMaxVars] = CommonInferShape; + g_infer_func[PrimType_FakeQuantWithMinMaxVarsPerChannel] = NULL; + g_infer_func[PrimType_FftReal] = FftRealInferShape; + g_infer_func[PrimType_FftImag] = FftImagInferShape; + g_infer_func[PrimType_Flatten] = FlattenInferShape; + g_infer_func[PrimType_FlattenGrad] = FlattenGradInferShape; + g_infer_func[PrimType_Floor] = CommonInferShape; + g_infer_func[PrimType_FloorDiv] = ArithmeticInferShape; + g_infer_func[PrimType_FloorMod] = ArithmeticInferShape; + g_infer_func[PrimType_Fill] = FillInferShape; + g_infer_func[PrimType_FullConnection] = FullConnectionInferShape; + g_infer_func[PrimType_FusedBatchNorm] = FusedBatchNormInferShape; + g_infer_func[PrimType_Gather] = GatherInferShape; + g_infer_func[PrimType_GatherNd] = GatherNdInferShape; + g_infer_func[PrimType_Greater] = ArithmeticCompareInferShape; + g_infer_func[PrimType_GreaterEqual] = ArithmeticCompareInferShape; + g_infer_func[PrimType_HashtableLookup] = HashtableLoopupInferShape; + g_infer_func[PrimType_InstanceNorm] = CommonInferShape; + g_infer_func[PrimType_LayerNormFusion] = LayerNormGradInferShape; + g_infer_func[PrimType_LeakyRelu] = CommonInferShape; + g_infer_func[PrimType_Less] = ArithmeticCompareInferShape; + g_infer_func[PrimType_LessEqual] = ArithmeticCompareInferShape; + g_infer_func[PrimType_Log] = CommonInferShape; + g_infer_func[PrimType_LogGrad] = CommonInferShape; + g_infer_func[PrimType_LogicalAnd] = ArithmeticInferShape; + g_infer_func[PrimType_LogicalNot] = CommonInferShape; + g_infer_func[PrimType_LogicalOr] = ArithmeticInferShape; + g_infer_func[PrimType_LpNormalization] = NULL; + g_infer_func[PrimType_LRN] = CommonInferShape; + g_infer_func[PrimType_LshProjection] = LshProjectionInferShape; + g_infer_func[PrimType_LSTM] = LstmInferShape; + g_infer_func[PrimType_L2NormalizeFusion] = CommonInferShape; + g_infer_func[PrimType_MatMul] = MatmulInferShape; + g_infer_func[PrimType_Maximum] = ArithmeticInferShape; + g_infer_func[PrimType_MaximumGrad] = MaxMinGradInferShape; + g_infer_func[PrimType_MaxPoolFusion] = PoolingInferShape; + g_infer_func[PrimType_MaxPoolGrad] = PoolingGradInferShape; + g_infer_func[PrimType_Merge] = MergeInferShape; + g_infer_func[PrimType_Mfcc] = MfccInferShape; + g_infer_func[PrimType_Minimum] = ArithmeticInferShape; + g_infer_func[PrimType_MinimumGrad] = MaxMinGradInferShape; +} + +void RegAllInferFunc2() { + g_infer_func[PrimType_Mod] = ArithmeticInferShape; + g_infer_func[PrimType_MulFusion] = ArithmeticInferShape; + g_infer_func[PrimType_MulGrad] = ArithmeticGradInferShape; + g_infer_func[PrimType_Neg] = CommonInferShape; + g_infer_func[PrimType_NegGrad] = CommonInferShape; + g_infer_func[PrimType_NotEqual] = ArithmeticCompareInferShape; + 
g_infer_func[PrimType_NonMaxSuppression] = NonMaxSuppressionInferShape; + g_infer_func[PrimType_OneHot] = OneHotInferShape; + g_infer_func[PrimType_OnesLike] = NULL; + g_infer_func[PrimType_PadFusion] = PadInferShape; + g_infer_func[PrimType_PartialFusion] = PartialInferShape; + g_infer_func[PrimType_PowerGrad] = CommonInferShape; + g_infer_func[PrimType_PowFusion] = PowerInferShape; + g_infer_func[PrimType_PriorBox] = PriorBoxInferShape; + g_infer_func[PrimType_PReLUFusion] = CommonInferShape; + g_infer_func[PrimType_QuantDTypeCast] = QuantDtypeCastInferShape; + g_infer_func[PrimType_Rank] = RankInferShape; + g_infer_func[PrimType_Range] = RangeInferShape; + g_infer_func[PrimType_Reciprocal] = CommonInferShape; + g_infer_func[PrimType_RealDiv] = ArithmeticInferShape; + g_infer_func[PrimType_ReduceFusion] = ReduceInferShape; + g_infer_func[PrimType_Reshape] = ReshapeInferShape; + g_infer_func[PrimType_Resize] = ResizeInferShape; + g_infer_func[PrimType_ReverseSequence] = CommonInferShape; + g_infer_func[PrimType_ReverseV2] = CommonInferShape; + g_infer_func[PrimType_Rfft] = RfftInferShape; + g_infer_func[PrimType_ROIPooling] = ROIPoolingInferShape; + g_infer_func[PrimType_Round] = CommonInferShape; + g_infer_func[PrimType_Rsqrt] = CommonInferShape; + g_infer_func[PrimType_ScaleFusion] = CommonInferShape; + g_infer_func[PrimType_ScatterNd] = ScatterNdInferShape; + g_infer_func[PrimType_SGD] = SgdInferShape; + g_infer_func[PrimType_Shape] = ShapeInferShape; + g_infer_func[PrimType_SigmoidCrossEntropyWithLogits] = CommonInferShape; + g_infer_func[PrimType_SigmoidCrossEntropyWithLogitsGrad] = CommonInferShape; + g_infer_func[PrimType_Sin] = CommonInferShape; + g_infer_func[PrimType_SkipGram] = SkipGramInferShape; + g_infer_func[PrimType_SliceFusion] = SliceInferShape; + g_infer_func[PrimType_SmoothL1Loss] = CommonInferShape; + g_infer_func[PrimType_SmoothL1LossGrad] = CommonInferShape; + g_infer_func[PrimType_Softmax] = SoftMaxInferShape; + g_infer_func[PrimType_SoftmaxCrossEntropyWithLogits] = SoftmaxCrossEntropyInferShape; + g_infer_func[PrimType_SpaceToBatch] = SpaceToBatchInferShape; + g_infer_func[PrimType_SpaceToBatchND] = SpaceToBatchNdInferShape; + g_infer_func[PrimType_SpaceToDepth] = SpaceToDepthInferShape; + g_infer_func[PrimType_SparseSoftmaxCrossEntropyWithLogits] = SparseSoftmaxCrossEntropyWithLogitsInferShape; + g_infer_func[PrimType_SparseToDense] = SparseToDenseInferShape; + g_infer_func[PrimType_Split] = SplitInferShape; + g_infer_func[PrimType_Sqrt] = CommonInferShape; + g_infer_func[PrimType_Squeeze] = SqueezeInferShape; + g_infer_func[PrimType_Square] = CommonInferShape; + g_infer_func[PrimType_SquaredDifference] = ArithmeticInferShape; + g_infer_func[PrimType_Stack] = StackInferShape; + g_infer_func[PrimType_StridedSlice] = StridedSliceInferShape; + g_infer_func[PrimType_SubFusion] = ArithmeticInferShape; + g_infer_func[PrimType_SubGrad] = AddSubGradInferShape; + g_infer_func[PrimType_Switch] = SwitchInferShape; + g_infer_func[PrimType_TensorListFromTensor] = TensorListFromTensorInferShape; + g_infer_func[PrimType_TensorListGetItem] = TensorListGetItemInferShape; + g_infer_func[PrimType_TensorListReserve] = TensorListReserveInferShape; + g_infer_func[PrimType_TensorListSetItem] = TensorListSetItemInferShape; + g_infer_func[PrimType_TensorListStack] = TensorListStackInferShape; + g_infer_func[PrimType_TileFusion] = TileInferShape; + g_infer_func[PrimType_TopKFusion] = TopKInferShape; + g_infer_func[PrimType_Transpose] = TransposeInferShape; + g_infer_func[PrimType_Unique] 
= UniqueInferShape; + g_infer_func[PrimType_UnsortedSegmentSum] = UnsortedSegmentSumInferShape; + g_infer_func[PrimType_Unsqueeze] = UnsqueezeInferShape; + g_infer_func[PrimType_Unstack] = UnstackInferShape; + g_infer_func[PrimType_While] = WhileInferShape; + g_infer_func[PrimType_Where] = WhereInferShape; + g_infer_func[PrimType_ZerosLike] = CommonInferShape; + g_infer_func[PrimType_Select] = SelectInferShape; + g_infer_func[PrimType_If] = CommonInferShape; + g_infer_func[PrimType_GRU] = GruInferShape; + g_infer_func[PrimType_NonZero] = NULL; + g_infer_func[PrimType_InvertPermutation] = InvertPermutationInferShape; + g_infer_func[PrimType_Size] = SizeInferShape; + g_infer_func[PrimType_RandomStandardNormal] = RandomStandardNormalInferShape; + g_infer_func[PrimType_CropAndResize] = CropAndResizeInferShape; + g_infer_func[PrimType_Erf] = CommonInferShape; + g_infer_func[PrimType_StridedSliceGrad] = StridedSliceGradInferShape; + g_infer_func[PrimType_IsFinite] = CommonInferShape; + g_infer_func[PrimType_LinSpace] = LinSpaceInferShape; + g_infer_func[PrimType_UniformReal] = UniformRealInferShape; + g_infer_func[PrimType_AbsGrad] = CommonInferShape; + g_infer_func[PrimType_RsqrtGrad] = NULL; + g_infer_func[PrimType_SqrtGrad] = NULL; + g_infer_func[PrimType_LayerNormGrad] = LayerNormGradInferShape; + g_infer_func[PrimType_ResizeGrad] = ResizeGradInferShape; + g_infer_func[PrimType_Splice] = SpliceInferShape; + g_infer_func[PrimType_LogSoftmax] = LogSoftmaxInferShape; + g_infer_func[PrimType_Call] = NULL; + g_infer_func[PrimType_Custom] = NULL; + g_infer_func[PrimType_CumSum] = CumsumInferShape; +} + +typedef void RegFunc(); +#pragma data_seg(".CRT$XIU") +static RegFunc *before[] = {RegAllInferFunc1, RegAllInferFunc2}; +#pragma data_seg() + +#else +__attribute__((init_priority(101))) InferShape g_infer_func[PrimType_MAX * sizeof(InferShape)] = {0}; +#endif // SUPPORT_MSVC InferShape GetInferFunc(int prim_type) { if (prim_type < PrimType_MAX) { diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/infer_register.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/infer_register.h index 0499b7f367e..b05c9017913 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/infer_register.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/infer_register.h @@ -232,8 +232,13 @@ enum PrimType { void RegInfer(int prim_type, InferShape func); +#ifdef SUPPORT_MSVC +#define REG_INFER(op, type, func) +#else #define REG_INFER(op, type, func) \ __attribute__((constructor(102))) void Reg##op##Infer() { RegInfer(type, func); } +#endif + #ifdef __cplusplus } #endif diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/avx/common_utils.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/avx/common_utils.h index f53de26dacc..1ef3d43182c 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/avx/common_utils.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/avx/common_utils.h @@ -16,7 +16,11 @@ #ifndef MINDSPORE_NNACL_X86_64_AVX_COMMON_UTILS_H_ #define MINDSPORE_NNACL_X86_64_AVX_COMMON_UTILS_H_ +#ifdef SUPPORT_MSVC +#include +#else #include +#endif #ifdef __cplusplus extern "C" { diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions.h index 9ea0a2e24f8..929fdb8fde0 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions.h +++ 
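Background on the infer_register.c hunk: it replaces GCC's `__attribute__((constructor))` registration with a pointer table that the MSVC C runtime walks before `main`. The CRT gathers everything placed in sections `.CRT$XIA` through `.CRT$XIZ` and calls it during startup, so `.CRT$XIU` ("U" for user) behaves like a constructor. Strictly, entries in that C-initializer table have type `int (__cdecl *)(void)` and should return 0 on success; the patch's `void` functions work in practice, but a standalone sketch is cleaner with the documented type. The example below is illustrative only, not part of the patch, and uses the modern `#pragma section`/`__declspec(allocate)` form instead of the patch's older `#pragma data_seg`:

#include <stdio.h>

typedef int(__cdecl *PIFV)(void);

static int __cdecl RunBeforeMain(void) {
  puts("registered during CRT startup");
  return 0; /* non-zero would abort startup in newer CRTs */
}

/* Place a function pointer into the CRT's C-initializer table. */
#pragma section(".CRT$XIU", read)
__declspec(allocate(".CRT$XIU")) static PIFV register_hook = RunBeforeMain;

int main(void) { return 0; } /* RunBeforeMain has already executed here */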
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/avx/common_utils.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/avx/common_utils.h
index f53de26dacc..1ef3d43182c 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/avx/common_utils.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/avx/common_utils.h
@@ -16,7 +16,11 @@
 #ifndef MINDSPORE_NNACL_X86_64_AVX_COMMON_UTILS_H_
 #define MINDSPORE_NNACL_X86_64_AVX_COMMON_UTILS_H_
 
+#ifdef SUPPORT_MSVC
+#include <immintrin.h>
+#else
 #include <x86intrin.h>
+#endif
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions.h
index 9ea0a2e24f8..929fdb8fde0 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions.h
@@ -20,10 +20,17 @@
 
 #ifdef ENABLE_ARM
 #include <arm_neon.h>
+#define MS_F32X4_GETI(src, i) src[i]
 #endif
 
 #if defined(ENABLE_SSE) || defined(ENABLE_AVX)
+#ifdef SUPPORT_MSVC
+#include <immintrin.h>
+#define MS_F32X4_GETI(src, i) src.m128_f32[i]
+#else
 #include <x86intrin.h>
+#define MS_F32X4_GETI(src, i) src[i]
+#endif
 #endif
 
 #ifdef ENABLE_ARM
@@ -137,26 +144,7 @@ static inline float32x4_t vrecp(float32x4_t v) {
 #define MS_CAST_F32_S32(src) _mm_castsi128_ps(src)
 #endif
 
-#define LOAD256X8_F32(src, input_ptr, num)                 \
-  MS_FLOAT32X8 src##1 = MS_LD256_F32(input_ptr + 0 * num); \
-  MS_FLOAT32X8 src##2 = MS_LD256_F32(input_ptr + 1 * num); \
-  MS_FLOAT32X8 src##3 = MS_LD256_F32(input_ptr + 2 * num); \
-  MS_FLOAT32X8 src##4 = MS_LD256_F32(input_ptr + 3 * num); \
-  MS_FLOAT32X8 src##5 = MS_LD256_F32(input_ptr + 4 * num); \
-  MS_FLOAT32X8 src##6 = MS_LD256_F32(input_ptr + 5 * num); \
-  MS_FLOAT32X8 src##7 = MS_LD256_F32(input_ptr + 6 * num); \
-  MS_FLOAT32X8 src##8 = MS_LD256_F32(input_ptr + 7 * num);
-
-#define STORE256X8_F32(output_ptr, num, dst) \
-  MS_ST256_F32(output_ptr + 0 * num, dst##1); \
-  MS_ST256_F32(output_ptr + 1 * num, dst##2); \
-  MS_ST256_F32(output_ptr + 2 * num, dst##3); \
-  MS_ST256_F32(output_ptr + 3 * num, dst##4); \
-  MS_ST256_F32(output_ptr + 4 * num, dst##5); \
-  MS_ST256_F32(output_ptr + 5 * num, dst##6); \
-  MS_ST256_F32(output_ptr + 6 * num, dst##7); \
-  MS_ST256_F32(output_ptr + 7 * num, dst##8);
-
+#if defined(ENABLE_ARM) || defined(ENABLE_SSE)
 #define LOAD128X8_F32(src, input_ptr, num)               \
   MS_FLOAT32X4 src##1 = MS_LDQ_F32(input_ptr + 0 * num); \
   MS_FLOAT32X4 src##2 = MS_LDQ_F32(input_ptr + 1 * num); \
@@ -195,7 +183,37 @@ static inline MS_FLOAT32X4 MS_TANHX4_F32(MS_FLOAT32X4 src) {
   return MS_MINQ_F32(MS_MAXQ_F32(MS_DIVQ_F32(a, b), neg), pos);
 }
 
+static inline MS_FLOAT32X4 MS_ERFX4_F32(MS_FLOAT32X4 src) {
+  MS_FLOAT32X4 dst;
+  MS_F32X4_GETI(dst, 0) = erff(MS_F32X4_GETI(src, 0));
+  MS_F32X4_GETI(dst, 1) = erff(MS_F32X4_GETI(src, 1));
+  MS_F32X4_GETI(dst, 2) = erff(MS_F32X4_GETI(src, 2));
+  MS_F32X4_GETI(dst, 3) = erff(MS_F32X4_GETI(src, 3));
+  return dst;
+}
+#endif
+
 #ifdef ENABLE_AVX
+#define LOAD256X8_F32(src, input_ptr, num)                 \
+  MS_FLOAT32X8 src##1 = MS_LD256_F32(input_ptr + 0 * num); \
+  MS_FLOAT32X8 src##2 = MS_LD256_F32(input_ptr + 1 * num); \
+  MS_FLOAT32X8 src##3 = MS_LD256_F32(input_ptr + 2 * num); \
+  MS_FLOAT32X8 src##4 = MS_LD256_F32(input_ptr + 3 * num); \
+  MS_FLOAT32X8 src##5 = MS_LD256_F32(input_ptr + 4 * num); \
+  MS_FLOAT32X8 src##6 = MS_LD256_F32(input_ptr + 5 * num); \
+  MS_FLOAT32X8 src##7 = MS_LD256_F32(input_ptr + 6 * num); \
+  MS_FLOAT32X8 src##8 = MS_LD256_F32(input_ptr + 7 * num);
+
+#define STORE256X8_F32(output_ptr, num, dst)  \
+  MS_ST256_F32(output_ptr + 0 * num, dst##1); \
+  MS_ST256_F32(output_ptr + 1 * num, dst##2); \
+  MS_ST256_F32(output_ptr + 2 * num, dst##3); \
+  MS_ST256_F32(output_ptr + 3 * num, dst##4); \
+  MS_ST256_F32(output_ptr + 4 * num, dst##5); \
+  MS_ST256_F32(output_ptr + 5 * num, dst##6); \
+  MS_ST256_F32(output_ptr + 6 * num, dst##7); \
+  MS_ST256_F32(output_ptr + 7 * num, dst##8);
+
 static inline MS_FLOAT32X8 MS_TANHX8_F32(MS_FLOAT32X8 src) {
   static const float data[] = {378.0f, 17325.0f, 135135.0f, 28.0f, 3150.0f, 62370.0f};
   static const MS_FLOAT32X8 neg = {-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f};
@@ -207,13 +225,4 @@ static inline MS_FLOAT32X8 MS_TANHX8_F32(MS_FLOAT32X8 src) {
 }
 #endif
 
-static inline MS_FLOAT32X4 MS_ERFX4_F32(MS_FLOAT32X4 src) {
-  MS_FLOAT32X4 dst;
-  dst[0] = erff(src[0]);
-  dst[1] = erff(src[1]);
-  dst[2] = erff(src[2]);
-  dst[3] = erff(src[3]);
-  return dst;
-}
-
 #endif  // MINDSPORE_NNACL_INTRINSICS_MS_SIMD_INSTRUCTIONS_H_
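MS_F32X4_GETI is the heart of this patch. GCC and Clang define `__m128` (and NEON's `float32x4_t`) as true vector types, so `v[i]` is a valid lvalue; MSVC instead defines `__m128` as a union whose float lanes are reached through the `m128_f32` array. The macro hides that difference, which is why every bare `v[i]` in the kernels above had to be rewritten. A self-contained illustration, assuming an SSE-capable x86 target (the F32X4_GETI name below is mine, mirroring the patch's macro):

#include <immintrin.h>
#include <stdio.h>

#if defined(_MSC_VER)
#define F32X4_GETI(v, i) ((v).m128_f32[(i)]) /* union member on MSVC */
#else
#define F32X4_GETI(v, i) ((v)[(i)]) /* vector subscript on GCC/Clang */
#endif

int main(void) {
  __m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
  F32X4_GETI(v, 0) = 9.0f; /* usable as an lvalue, like StdPowerSimd above */
  printf("%.1f %.1f\n", F32X4_GETI(v, 0), F32X4_GETI(v, 3)); /* 9.0 4.0 */
  return 0;
}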
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/ConvDwFp32Row_sse.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/ConvDwFp32Row_sse.c
index e254e95a096..d5194d381f6 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/ConvDwFp32Row_sse.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/ConvDwFp32Row_sse.c
@@ -15,7 +15,7 @@
  */
 
 #if defined(ENABLE_SSE) && !defined(ENABLE_AVX)
-#include <x86intrin.h>
+#include "nnacl/intrinsics/ms_simd_instructions.h"
 #include "nnacl/fp32/common_func_fp32.h"
 
 void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *weight_ptr, size_t num_pixels,
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/DepthwiseFp32_Sse.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/DepthwiseFp32_Sse.c
index 2c6893bf9d6..777d626a6af 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/DepthwiseFp32_Sse.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/DepthwiseFp32_Sse.c
@@ -15,7 +15,7 @@
  */
 
 #ifdef ENABLE_SSE
-#include <x86intrin.h>
+#include "nnacl/intrinsics/ms_simd_instructions.h"
 #include "nnacl/fp32/conv_depthwise_fp32.h"
 #include "nnacl/intrinsics/sse/sse_common.h"
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/MatMul_Sse.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/MatMul_Sse.c
index d58ff403dd6..226a1eeaee6 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/MatMul_Sse.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/MatMul_Sse.c
@@ -15,7 +15,7 @@
  */
 
 #ifdef ENABLE_SSE
-#include <x86intrin.h>
+#include "nnacl/intrinsics/ms_simd_instructions.h"
 #include "nnacl/fp32/matmul_fp32.h"
 #include "nnacl/op_base.h"
 #include "nnacl/matmul_parameter.h"
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/PostFuncBiasReluC4.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/PostFuncBiasReluC4.c
index c049240a030..9589b14a166 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/PostFuncBiasReluC4.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/PostFuncBiasReluC4.c
@@ -15,7 +15,7 @@
  */
 
 #ifdef ENABLE_SSE
-#include <x86intrin.h>
+#include "nnacl/intrinsics/ms_simd_instructions.h"
 #include "nnacl/fp32/common_func_fp32.h"
 #include "nnacl/intrinsics/sse/sse_common.h"
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/PostFuncBiasReluC8.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/PostFuncBiasReluC8.c
index fb72a814674..554a6348a79 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/PostFuncBiasReluC8.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/PostFuncBiasReluC8.c
@@ -15,7 +15,7 @@
  */
 
 #ifdef ENABLE_SSE
-#include <x86intrin.h>
+#include "nnacl/intrinsics/ms_simd_instructions.h"
 #include "nnacl/fp32/common_func_fp32.h"
 #include "nnacl/intrinsics/sse/sse_common.h"
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/TiledC4MatMulFp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/TiledC4MatMulFp32.c
index e74814f443b..eb1c8efb277 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/TiledC4MatMulFp32.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/TiledC4MatMulFp32.c
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 #if defined(ENABLE_SSE) && !defined(ENABLE_AVX)
-#include <x86intrin.h>
+#include "nnacl/intrinsics/ms_simd_instructions.h"
 #include "nnacl/fp32/common_func_fp32.h"
 
 static inline void TiledC4MatmulFp32_Transfer(__m128 *dst1, __m128 *dst2, __m128 *dst3, __m128 *dst4,
@@ -51,21 +51,23 @@ void TiledC4MatmulFp32(float *dst, const float *src, const float *weight, size_t
       weight_data[2] = _mm_loadu_ps(weight + 8);
       weight_data[3] = _mm_loadu_ps(weight + 12);
       weight += 16;
-      __m128 dst1 = _mm_mul_ps(weight_data[0], _mm_set_ps1(src1[0]));
-      __m128 dst2 = _mm_mul_ps(weight_data[0], _mm_set_ps1(src2[0]));
-      __m128 dst3 = _mm_mul_ps(weight_data[0], _mm_set_ps1(src3[0]));
-      __m128 dst4 = _mm_mul_ps(weight_data[0], _mm_set_ps1(src4[0]));
+      __m128 dst1 = _mm_mul_ps(weight_data[0], _mm_set_ps1(MS_F32X4_GETI(src1, 0)));
+      __m128 dst2 = _mm_mul_ps(weight_data[0], _mm_set_ps1(MS_F32X4_GETI(src2, 0)));
+      __m128 dst3 = _mm_mul_ps(weight_data[0], _mm_set_ps1(MS_F32X4_GETI(src3, 0)));
+      __m128 dst4 = _mm_mul_ps(weight_data[0], _mm_set_ps1(MS_F32X4_GETI(src4, 0)));
       for (int j = 1; j < 4; ++j) {
-        TiledC4MatmulFp32_Transfer(&dst1, &dst2, &dst3, &dst4, weight_data[j], src1[j], src2[j], src3[j], src4[j]);
+        TiledC4MatmulFp32_Transfer(&dst1, &dst2, &dst3, &dst4, weight_data[j], MS_F32X4_GETI(src1, j),
+                                   MS_F32X4_GETI(src2, j), MS_F32X4_GETI(src3, j), MS_F32X4_GETI(src4, j));
       }
       TiledC4MatmulFp32_LoadData(&src1, &src2, &src3, &src4, src);
       src += 16;
-      __m128 dst5 = _mm_mul_ps(weight_data[0], _mm_set_ps1(src1[0]));
-      __m128 dst6 = _mm_mul_ps(weight_data[0], _mm_set_ps1(src2[0]));
-      __m128 dst7 = _mm_mul_ps(weight_data[0], _mm_set_ps1(src3[0]));
-      __m128 dst8 = _mm_mul_ps(weight_data[0], _mm_set_ps1(src4[0]));
+      __m128 dst5 = _mm_mul_ps(weight_data[0], _mm_set_ps1(MS_F32X4_GETI(src1, 0)));
+      __m128 dst6 = _mm_mul_ps(weight_data[0], _mm_set_ps1(MS_F32X4_GETI(src2, 0)));
+      __m128 dst7 = _mm_mul_ps(weight_data[0], _mm_set_ps1(MS_F32X4_GETI(src3, 0)));
+      __m128 dst8 = _mm_mul_ps(weight_data[0], _mm_set_ps1(MS_F32X4_GETI(src4, 0)));
       for (int j = 1; j < 4; ++j) {
-        TiledC4MatmulFp32_Transfer(&dst5, &dst6, &dst7, &dst8, weight_data[j], src1[j], src2[j], src3[j], src4[j]);
+        TiledC4MatmulFp32_Transfer(&dst5, &dst6, &dst7, &dst8, weight_data[j], MS_F32X4_GETI(src1, j),
+                                   MS_F32X4_GETI(src2, j), MS_F32X4_GETI(src3, j), MS_F32X4_GETI(src4, j));
       }
       if (ic4_tmp != 0) {
         ic4_tmp -= 1;
@@ -75,64 +77,74 @@ void TiledC4MatmulFp32(float *dst, const float *src, const float *weight, size_t
         weight_data[1] = _mm_loadu_ps(weight + 4);
         weight += 8;
 
-        dst1 = _mm_add_ps(dst1, _mm_mul_ps(weight_data[0], _mm_set_ps1(src1[0])));
-        dst2 = _mm_add_ps(dst2, _mm_mul_ps(weight_data[0], _mm_set_ps1(src2[0])));
+        dst1 = _mm_add_ps(dst1, _mm_mul_ps(weight_data[0], _mm_set_ps1(MS_F32X4_GETI(src1, 0))));
+        dst2 = _mm_add_ps(dst2, _mm_mul_ps(weight_data[0], _mm_set_ps1(MS_F32X4_GETI(src2, 0))));
         for (; ic4_tmp != 0; ic4_tmp -= 1) {
-          dst3 = _mm_add_ps(dst3, _mm_mul_ps(weight_data[0], _mm_set_ps1(src3[0])));
-          dst4 = _mm_add_ps(dst4, _mm_mul_ps(weight_data[0], _mm_set_ps1(src4[0])));
+          dst3 = _mm_add_ps(dst3, _mm_mul_ps(weight_data[0], _mm_set_ps1(MS_F32X4_GETI(src3, 0))));
+          dst4 = _mm_add_ps(dst4, _mm_mul_ps(weight_data[0], _mm_set_ps1(MS_F32X4_GETI(src4, 0))));
 
-          TiledC4MatmulFp32_Transfer(&dst1, &dst2, &dst3, &dst4, weight_data[1], src1[1], src2[1], src3[1], src4[1]);
+          TiledC4MatmulFp32_Transfer(&dst1, &dst2, &dst3, &dst4, weight_data[1], MS_F32X4_GETI(src1, 1),
+                                     MS_F32X4_GETI(src2, 1), MS_F32X4_GETI(src3, 1), MS_F32X4_GETI(src4, 1));
 
           weight_data[2] = _mm_loadu_ps(weight);
           weight_data[3] = _mm_loadu_ps(weight + 4);
           weight += 8;
 
-          TiledC4MatmulFp32_Transfer(&dst1, &dst2, &dst3, &dst4, weight_data[2], src1[2], src2[2], src3[2], src4[2]);
+          TiledC4MatmulFp32_Transfer(&dst1, &dst2, &dst3, &dst4, weight_data[2], MS_F32X4_GETI(src1, 2),
+                                     MS_F32X4_GETI(src2, 2), MS_F32X4_GETI(src3, 2), MS_F32X4_GETI(src4, 2));
 
-          dst1 = _mm_add_ps(dst1, _mm_mul_ps(weight_data[3], _mm_set_ps1(src1[3])));
-          dst2 = _mm_add_ps(dst2, _mm_mul_ps(weight_data[3], _mm_set_ps1(src2[3])));
+          dst1 = _mm_add_ps(dst1, _mm_mul_ps(weight_data[3], _mm_set_ps1(MS_F32X4_GETI(src1, 3))));
+          dst2 = _mm_add_ps(dst2, _mm_mul_ps(weight_data[3], _mm_set_ps1(MS_F32X4_GETI(src2, 3))));
           src1 = _mm_loadu_ps(src);
           src2 = _mm_loadu_ps(src + 4);
-          dst3 = _mm_add_ps(dst3, _mm_mul_ps(weight_data[3], _mm_set_ps1(src3[3])));
-          dst4 = _mm_add_ps(dst4, _mm_mul_ps(weight_data[3], _mm_set_ps1(src4[3])));
+          dst3 = _mm_add_ps(dst3, _mm_mul_ps(weight_data[3], _mm_set_ps1(MS_F32X4_GETI(src3, 3))));
+          dst4 = _mm_add_ps(dst4, _mm_mul_ps(weight_data[3], _mm_set_ps1(MS_F32X4_GETI(src4, 3))));
           src3 = _mm_loadu_ps(src + 8);
           src4 = _mm_loadu_ps(src + 12);
          src += 16;
 
-          TiledC4MatmulFp32_Transfer(&dst5, &dst6, &dst7, &dst8, weight_data[0], src1[0], src2[0], src3[0], src4[0]);
+          TiledC4MatmulFp32_Transfer(&dst5, &dst6, &dst7, &dst8, weight_data[0], MS_F32X4_GETI(src1, 0),
+                                     MS_F32X4_GETI(src2, 0), MS_F32X4_GETI(src3, 0), MS_F32X4_GETI(src4, 0));
 
-          TiledC4MatmulFp32_Transfer(&dst5, &dst6, &dst7, &dst8, weight_data[1], src1[1], src2[1], src3[1], src4[1]);
+          TiledC4MatmulFp32_Transfer(&dst5, &dst6, &dst7, &dst8, weight_data[1], MS_F32X4_GETI(src1, 1),
+                                     MS_F32X4_GETI(src2, 1), MS_F32X4_GETI(src3, 1), MS_F32X4_GETI(src4, 1));
 
-          TiledC4MatmulFp32_Transfer(&dst5, &dst6, &dst7, &dst8, weight_data[2], src1[2], src2[2], src3[2], src4[2]);
+          TiledC4MatmulFp32_Transfer(&dst5, &dst6, &dst7, &dst8, weight_data[2], MS_F32X4_GETI(src1, 2),
+                                     MS_F32X4_GETI(src2, 2), MS_F32X4_GETI(src3, 2), MS_F32X4_GETI(src4, 2));
 
           weight_data[0] = _mm_loadu_ps(weight);
           weight_data[1] = _mm_loadu_ps(weight + 4);
           weight += 8;
 
-          TiledC4MatmulFp32_Transfer(&dst5, &dst6, &dst7, &dst8, weight_data[3], src1[3], src2[3], src3[3], src4[3]);
+          TiledC4MatmulFp32_Transfer(&dst5, &dst6, &dst7, &dst8, weight_data[3], MS_F32X4_GETI(src1, 3),
+                                     MS_F32X4_GETI(src2, 3), MS_F32X4_GETI(src3, 3), MS_F32X4_GETI(src4, 3));
 
           TiledC4MatmulFp32_LoadData(&src1, &src2, &src3, &src4, src);
           src += 16;
-          dst1 = _mm_add_ps(dst1, _mm_mul_ps(weight_data[0], _mm_set_ps1(src1[0])));
-          dst2 = _mm_add_ps(dst2, _mm_mul_ps(weight_data[0], _mm_set_ps1(src2[0])));
+          dst1 = _mm_add_ps(dst1, _mm_mul_ps(weight_data[0], _mm_set_ps1(MS_F32X4_GETI(src1, 0))));
+          dst2 = _mm_add_ps(dst2, _mm_mul_ps(weight_data[0], _mm_set_ps1(MS_F32X4_GETI(src2, 0))));
         }
-        dst3 = _mm_add_ps(dst3, _mm_mul_ps(weight_data[0], _mm_set_ps1(src3[0])));
-        dst4 = _mm_add_ps(dst4, _mm_mul_ps(weight_data[0], _mm_set_ps1(src4[0])));
+        dst3 = _mm_add_ps(dst3, _mm_mul_ps(weight_data[0], _mm_set_ps1(MS_F32X4_GETI(src3, 0))));
+        dst4 = _mm_add_ps(dst4, _mm_mul_ps(weight_data[0], _mm_set_ps1(MS_F32X4_GETI(src4, 0))));
 
-        TiledC4MatmulFp32_Transfer(&dst1, &dst2, &dst3, &dst4, weight_data[1], src1[1], src2[1], src3[1], src4[1]);
+        TiledC4MatmulFp32_Transfer(&dst1, &dst2, &dst3, &dst4, weight_data[1], MS_F32X4_GETI(src1, 1),
+                                   MS_F32X4_GETI(src2, 1), MS_F32X4_GETI(src3, 1), MS_F32X4_GETI(src4, 1));
 
         weight_data[2] = _mm_loadu_ps(weight);
        weight_data[3] = _mm_loadu_ps(weight + 4);
         weight += 8;
 
-        TiledC4MatmulFp32_Transfer(&dst1, &dst2, &dst3, &dst4, weight_data[2], src1[2], src2[2], src3[2], src4[2]);
+        TiledC4MatmulFp32_Transfer(&dst1, &dst2, &dst3, &dst4, weight_data[2], MS_F32X4_GETI(src1, 2),
+                                   MS_F32X4_GETI(src2, 2), MS_F32X4_GETI(src3, 2), MS_F32X4_GETI(src4, 2));
 
-        TiledC4MatmulFp32_Transfer(&dst1, &dst2, &dst3, &dst4, weight_data[3], src1[3], src2[3], src3[3], src4[3]);
+        TiledC4MatmulFp32_Transfer(&dst1, &dst2, &dst3, &dst4, weight_data[3], MS_F32X4_GETI(src1, 3),
+                                   MS_F32X4_GETI(src2, 3), MS_F32X4_GETI(src3, 3), MS_F32X4_GETI(src4, 3));
 
         TiledC4MatmulFp32_LoadData(&src1, &src2, &src3, &src4, src);
         src += 16;
         for (int j = 0; j < 4; ++j) {
-          TiledC4MatmulFp32_Transfer(&dst5, &dst6, &dst7, &dst8, weight_data[j], src1[j], src2[j], src3[j], src4[j]);
+          TiledC4MatmulFp32_Transfer(&dst5, &dst6, &dst7, &dst8, weight_data[j], MS_F32X4_GETI(src1, j),
+                                     MS_F32X4_GETI(src2, j), MS_F32X4_GETI(src3, j), MS_F32X4_GETI(src4, j));
         }
       }
       _mm_storeu_ps(dst, dst1);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/WinogradTrans.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/WinogradTrans.c
index a1821af201b..78b10a52d89 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/WinogradTrans.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/sse/WinogradTrans.c
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 #ifdef ENABLE_SSE
-#include <x86intrin.h>
+#include "nnacl/intrinsics/ms_simd_instructions.h"
 #include "nnacl/fp32/common_func_fp32.h"
 
 void WinogradTransLeft(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length) {
diff --git a/mindspore/lite/src/lite_model.cc b/mindspore/lite/src/lite_model.cc
index 83c19e9b154..aaded161e85 100644
--- a/mindspore/lite/src/lite_model.cc
+++ b/mindspore/lite/src/lite_model.cc
@@ -477,7 +477,11 @@ int Model::Export(Model *model, const char *filename) {
   ofs.seekp(0, std::ios::beg);
   ofs.write(liteModel->buf, liteModel->buf_size_);
   ofs.close();
+#ifdef SUPPORT_MSVC
+  return RET_OK;
+#else
   return chmod(filename, S_IRUSR);
+#endif
 }
 }  // namespace mindspore::lite
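The lite_model.cc hunk sidesteps POSIX `chmod`, which MSVC does not provide under that name, by skipping the permission change entirely and returning RET_OK. If the owner-read-only bit mattered on Windows too, the usual route is the CRT's `_chmod` with `_S_IREAD`; the wrapper below is a hypothetical sketch, not what the patch does:

#include <sys/stat.h>
#ifdef _MSC_VER
#include <io.h> /* _chmod */
#endif

/* Set a file owner-read-only on either toolchain. */
static int MakeOwnerReadOnly(const char *filename) {
#ifdef _MSC_VER
  return _chmod(filename, _S_IREAD); /* MSVC spelling of chmod/S_IRUSR */
#else
  return chmod(filename, S_IRUSR);
#endif
}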