diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/activation_grad.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/activation_grad.c index da48f5253bd..0cde9a80ba2 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/activation_grad.c +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/activation_grad.c @@ -14,21 +14,26 @@ * limitations under the License. */ -#include -#include "nnacl/op_base.h" #include "nnacl/fp16_grad/activation_grad.h" +#include +#include +#ifdef ENABLE_NEON +#include +#include "nnacl/fp32/exp_fp32.h" +#endif +#include "nnacl/op_base.h" #include "nnacl/errorcode.h" -int Fp16ReluGrad(const float16_t *src0, const float16_t *src1, size_t length, float16_t *dst) { +int ReluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) { int i = 0; #ifdef ENABLE_NEON - float16x8_t zero_4 = vdupq_n_f16(0); - for (; i < length - 4; i += 4) { - float16x8_t src0_4 = vld1q_f16(src0 + i); - float16x8_t src1_4 = vld1q_f16(src1 + i); - uint16x8_t mask_4 = vcgtq_f16(src1_4, zero_4); - float16x8_t dst_4 = vbslq_f16(mask_4, src0_4, zero_4); - vst1q_f16(dst + i, dst_4); + float16x8_t zero_v = vdupq_n_f16(0); + for (; i <= length - C8NUM; i += C8NUM) { + float16x8_t src0_v = vld1q_f16(src0 + i); + float16x8_t src1_v = vld1q_f16(src1 + i); + uint16x8_t mask_v = vcgtq_f16(src1_v, zero_v); + float16x8_t dst_v = vbslq_f16(mask_v, src0_v, zero_v); + vst1q_f16(dst + i, dst_v); } #endif for (; i < length; i++) { @@ -37,15 +42,43 @@ int Fp16ReluGrad(const float16_t *src0, const float16_t *src1, size_t length, fl return NNACL_OK; } -int Fp16SigmoidGrad(const float16_t *src0, const float16_t *src1, size_t length, float16_t *dst) { +int Relu6Fp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) { int i = 0; #ifdef ENABLE_NEON - float16x8_t one_4 = vdupq_n_f16(1); - for (; i < length - 4; i += 4) { - float16x8_t src0_4 = vld1q_f16(src0 + i); - float16x8_t src1_4 = vld1q_f16(src1 + i); - float16x8_t dst_4 = vmulq_f16(src0_4, vmulq_f16(src1_4, vsubq_f16(one_4, src1_4))); - vst1q_f16(dst + i, dst_4); + float16x8_t zero_8 = vdupq_n_f16(0); + float16x8_t six_8 = vdupq_n_f16(6); + for (; i < length - 8; i += 8) { + float16x8_t src1_8 = vld1q_f16(src1 + i); + float16x8_t src0_8 = vld1q_f16(src0 + i); + float16x8_t max_8 = vmaxq_f16(src1_8, zero_8); + float16x8_t min_max_8 = vminq_f16(max_8, six_8); + uint16x8_t mask_8 = vceqq_f16(min_max_8, src1_8); + float16x8_t dst_8 = vbslq_f16(mask_8, src0_8, zero_8); + vst1q_f16(dst + i, dst_8); + } +#endif + for (; i < length; ++i) { + dst[i] = (src1[i] > 0.0f && src1[i] <= 6.0f) ? 
src0[i] : 0.0f;
+  }
+  return NNACL_OK;
+}
+
+int LReluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst, float16_t alpha) {
+  for (int i = 0; i < length; ++i) {
+    dst[i] = src1[i] > 0.0f ? src0[i] : alpha * src0[i];
+  }
+  return NNACL_OK;
+}
+
+int SigmoidFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) {
+  int i = 0;
+#ifdef ENABLE_NEON
+  float16x8_t one_8 = vdupq_n_f16(1);
+  for (; i < length - 8; i += 8) {
+    float16x8_t src0_8 = vld1q_f16(src0 + i);
+    float16x8_t src1_8 = vld1q_f16(src1 + i);
+    float16x8_t dst_8 = vmulq_f16(src0_8, vmulq_f16(src1_8, vsubq_f16(one_8, src1_8)));
+    vst1q_f16(dst + i, dst_8);
   }
 #endif
   for (; i < length; i++) {
@@ -53,3 +86,56 @@ int Fp16SigmoidGrad(const float16_t *src0, const float16_t *src1, size_t length,
   }
   return NNACL_OK;
 }
+
+int TanhFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) {
+  for (int i = 0; i < length; ++i) {
+    dst[i] = (float16_t)((1.0f - ((float)src1[i] * (float)src1[i])) * (float)src0[i]);
+  }
+  return NNACL_OK;
+}
+
+int HSwishFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) {
+  for (int i = 0; i < length; ++i) {
+    float16_t tmp = (src1[i] > 3.0f ? 1.0f : (src1[i] < -3.0f ? 0.0f : (2.0f * src1[i] + 3.0f) / 6.0f));
+    dst[i] = tmp * src0[i];
+  }
+  return NNACL_OK;
+}
+
+int HSigmoidFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) {
+  for (int i = 0; i < length; ++i) {
+    float16_t tmp = (src1[i] > 3.0f ? 0.0f : (src1[i] < -3.0f ? 0.0f : 1.0f / 6.0f));
+    dst[i] = tmp * src0[i];
+  }
+  return NNACL_OK;
+}
+int EluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst, float16_t alpha) {
+  int i = 0;
+#ifdef ENABLE_NEON
+  float16x4_t zero_4 = vdup_n_f16(0);
+  float16x4_t one_4 = vdup_n_f16(1);
+  float16x4_t alpha_4 = vdup_n_f16(alpha);
+  for (; i < length - 4; i += 4) {
+    float16x4_t src0_4 = vld1_f16(src0 + i);
+    float16x4_t src1_4 = vld1_f16(src1 + i);
+    uint16x4_t mask_4 = vcgt_f16(src1_4, zero_4);
+    float32x4_t tmp;
+    simd_exp(vcvt_f32_f16(src1_4), (float *)&tmp);
+    float16x4_t expm1_4 = vsub_f16(vcvt_f16_f32(tmp), one_4);
+    float16x4_t dst_4 = vbsl_f16(mask_4, src0_4, vmul_f16(alpha_4, vmul_f16(expm1_4, src0_4)));
+    vst1_f16(dst + i, dst_4);
+  }
+#endif
+  for (; i < length; ++i) {
+    dst[i] = (src1[i] > 0.0f ? src0[i] : alpha * expm1(src1[i]) * src0[i]);
+  }
+  return NNACL_OK;
+}
+
+int GeluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) {
+  for (int i = 0; i < length; ++i) {
+    dst[i] = src0[i] * ((0.5 * (1.0 + erf(src1[i] / 1.4142135623730951))) +
+                        (src1[i] * exp(-0.5 * src1[i] * src1[i]) / 2.5066282746));
+  }
+  return NNACL_OK;
+}
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/activation_grad.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/activation_grad.h
index 36d3e8d333c..d787b5fc20e 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/activation_grad.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/activation_grad.h
@@ -13,8 +13,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
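For reference, these scalar fallbacks follow the usual element-wise derivative identities, with src0 holding the incoming gradient dy and src1 the saved forward value. A minimal float sketch of the sigmoid, tanh and GELU cases (helper name and float types are assumptions, for illustration only):

#include <math.h>

/* Illustrative scalar reference: dy = incoming gradient, y = saved forward output (the forward input x for GELU). */
static void ActivationGradRef(const float *dy, const float *y, float *dx, int n, int type) {
  for (int i = 0; i < n; i++) {
    if (type == 0) {
      dx[i] = dy[i] * y[i] * (1.0f - y[i]); /* sigmoid: y * (1 - y) */
    } else if (type == 1) {
      dx[i] = dy[i] * (1.0f - y[i] * y[i]); /* tanh: 1 - y^2 */
    } else {
      float x = y[i]; /* GELU gradient: Phi(x) + x * phi(x), phi = standard normal pdf */
      dx[i] = dy[i] * (0.5f * (1.0f + erff(x / 1.4142135f)) + x * expf(-0.5f * x * x) / 2.5066283f);
    }
  }
}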
*/ -#ifndef MINDSPORE_NNACL_FP16_GRAD_ACTIVATION_GRAD_H_ -#define MINDSPORE_NNACL_FP16_GRAD_ACTIVATION_GRAD_H_ +#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_ACTIVATION_GRAD_H_ +#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_ACTIVATION_GRAD_H_ #ifdef ENABLE_NEON #include @@ -23,20 +23,22 @@ #include "nnacl/op_base.h" #include "nnacl/int8/fixed_point.h" -typedef struct ActivationGradParameterFp16 { - OpParameter op_parameter; - int type_; - float alpha_; -} ActivationGradParameterFp16; #ifdef __cplusplus extern "C" { #endif -int Fp16ReluGrad(const float16_t *src0, const float16_t *src1, size_t length, float16_t *dst); -int Fp16SigmoidGrad(const float16_t *src0, const float16_t *src1, size_t length, float16_t *dst); +int ReluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst); +int Relu6Fp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst); +int LReluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst, float16_t alpha); +int SigmoidFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst); +int TanhFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst); +int HSwishFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst); +int HSigmoidFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst); +int EluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst, float16_t alpha); +int GeluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst); #ifdef __cplusplus } #endif -#endif // MINDSPORE_NNACL_FP16_GRAD_ACTIVATION_GRAD_H_ +#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_ACTIVATION_GRAD_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/arithmetic_grad.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/arithmetic_grad.c new file mode 100644 index 00000000000..1111058c32c --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/arithmetic_grad.c @@ -0,0 +1,157 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nnacl/fp16_grad/arithmetic_grad.h" +#include +#include +#include "nnacl/fp32_grad/utils.h" +#include "nnacl/errorcode.h" + +void ElementDivNegSquareFp16(const float16_t *nom, const float16_t *denom, float16_t *output, int element_size) { + for (int i = 0; i < element_size; i++) { + output[i] = -nom[i] / (denom[i] * denom[i]); + } +} + +void ElementMulAndDivNegSquareFp16(const float16_t *a, const float16_t *b, const float16_t *denom, float16_t *output, + int element_size) { + for (int i = 0; i < element_size; i++) { + output[i] = -a[i] * b[i] / (denom[i] * denom[i]); + } +} + +int ElementAbsGradFp16(const float16_t *in1, const float16_t *in2, float16_t *out, int element_size) { + for (int i = 0; i < element_size; i++) { + out[i] = (in1[i] < 0.f) ? 
-in2[i] : ((in1[i] > 0.f) ? in2[i] : 0); + } + return NNACL_OK; +} + +void MaximumByAxesFp16(const float16_t *input0, const float16_t *input1, const float16_t *dy, const int *input0_dims, + const int *input1_dims, const int *dy_dims, float16_t *output0, float16_t *output1, + int num_dims) { + int num_output0 = 1; + int num_output1 = 1; + bool same_shape = true; + for (int idx = 0; idx < num_dims; ++idx) { + num_output0 *= input0_dims[idx]; + num_output1 *= input1_dims[idx]; + if (input0_dims[idx] != input1_dims[idx]) { + same_shape = false; + } + } + + if (same_shape) { + int input_iter[8] = {0}; + + // Iterate through input_data. + do { + size_t offset = GetInputOffset(num_dims, input0_dims, input_iter); + output0[offset] = input0[offset] > input1[offset] ? dy[offset] : 0.; + output1[offset] = input1[offset] >= input0[offset] ? dy[offset] : 0.; + } while (NextIndex(num_dims, input0_dims, input_iter)); + } else { + memset(output0, 0, num_output0 * sizeof(float16_t)); // zero output + memset(output1, 0, num_output1 * sizeof(float16_t)); // zero output + + int input_iter[8] = {0}; + int axes0[5] = {0}; + int axes1[5] = {0}; + int num_axes0 = 0; + int num_axes1 = 0; + for (int i = 0; i < num_dims; i++) { + if (input0_dims[i] == 1) { + axes0[num_axes0++] = i; + } + if (input1_dims[i] == 1) { + axes1[num_axes1++] = i; + } + } + + do { + size_t offset0 = GetOutputOffset(num_dims, input0_dims, input_iter, num_axes0, axes0); + size_t offset1 = GetOutputOffset(num_dims, input1_dims, input_iter, num_axes1, axes1); + size_t yt_offset = GetInputOffset(num_dims, input0_dims, input_iter); + output0[offset0] += input0[offset0] > input1[offset1] ? dy[yt_offset] : 0.; + output1[offset1] += input1[offset1] >= input0[offset0] ? dy[yt_offset] : 0.; + } while (NextIndex(num_dims, dy_dims, input_iter)); + } +} + +void MinimumByAxesFp16(const float16_t *input0, const float16_t *input1, const float16_t *dy, const int *input0_dims, + const int *input1_dims, const int *dy_dims, float16_t *output0, float16_t *output1, + int num_dims) { + int num_output0 = 1; + int num_output1 = 1; + bool same_shape = true; + for (int idx = 0; idx < num_dims; ++idx) { + num_output0 *= input0_dims[idx]; + num_output1 *= input1_dims[idx]; + if (input0_dims[idx] != input1_dims[idx]) { + same_shape = false; + } + } + + if (same_shape) { + int input_iter[8] = {0}; + + // Iterate through input_data. + do { + size_t offset = GetInputOffset(num_dims, input0_dims, input_iter); + output0[offset] = input0[offset] < input1[offset] ? dy[offset] : 0.; + output1[offset] = input1[offset] <= input0[offset] ? dy[offset] : 0.; + } while (NextIndex(num_dims, input0_dims, input_iter)); + } else { + memset(output0, 0, num_output0 * sizeof(float16_t)); // zero output + memset(output1, 0, num_output1 * sizeof(float16_t)); // zero output + + int input_iter[8] = {0}; + int axes0[5] = {0}; + int axes1[5] = {0}; + int num_axes0 = 0; + int num_axes1 = 0; + for (int i = 0; i < num_dims; i++) { + if (input0_dims[i] == 1) { + axes0[num_axes0++] = i; + } + if (input1_dims[i] == 1) { + axes1[num_axes1++] = i; + } + } + + do { + size_t offset0 = GetOutputOffset(num_dims, input0_dims, input_iter, num_axes0, axes0); + size_t offset1 = GetOutputOffset(num_dims, input1_dims, input_iter, num_axes1, axes1); + size_t yt_offset = GetInputOffset(num_dims, input0_dims, input_iter); + output0[offset0] += input0[offset0] < input1[offset1] ? dy[yt_offset] : 0.; + output1[offset1] += input1[offset1] <= input0[offset0] ? 
dy[yt_offset] : 0.;
+    } while (NextIndex(num_dims, dy_dims, input_iter));
+  }
+}
+
+int ElementSqrtGradFp16(const float16_t *in1, const float16_t *in2, float16_t *out, const int element_size) {
+  for (int i = 0; i < element_size; i++) {
+    out[i] = 0.5f * in2[i] / in1[i];
+  }
+  return NNACL_OK;
+}
+
+int ElementRsqrtGradFp16(const float16_t *in1, const float16_t *in2, float16_t *out, const int element_size) {
+  for (int i = 0; i < element_size; i++) {
+    out[i] = -0.5f * in2[i] * in1[i] * in1[i] * in1[i];
+  }
+  return NNACL_OK;
+}
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/arithmetic_grad.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/arithmetic_grad.h
new file mode 100644
index 00000000000..1e1f8242235
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/arithmetic_grad.h
@@ -0,0 +1,41 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_ARITHMETIC_GRAD_H_
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_ARITHMETIC_GRAD_H_
+
+#include "nnacl/op_base.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+void ElementDivNegSquareFp16(const float16_t *nom, const float16_t *denom, float16_t *output, int element_size);
+void ElementMulAndDivNegSquareFp16(const float16_t *a, const float16_t *b, const float16_t *denom, float16_t *output,
+                                   int element_size);
+int ElementAbsGradFp16(const float16_t *in1, const float16_t *in2, float16_t *out, int element_size);
+void MaximumByAxesFp16(const float16_t *input0, const float16_t *input1, const float16_t *dy, const int *input0_dims,
+                       const int *input1_dims, const int *dy_dims, float16_t *output0, float16_t *output1,
+                       int num_dims);
+void MinimumByAxesFp16(const float16_t *input0, const float16_t *input1, const float16_t *dy, const int *input0_dims,
+                       const int *input1_dims, const int *dy_dims, float16_t *output0, float16_t *output1,
+                       int num_dims);
+int ElementSqrtGradFp16(const float16_t *in1, const float16_t *in2, float16_t *out, const int element_size);
+int ElementRsqrtGradFp16(const float16_t *in1, const float16_t *in2, float16_t *out, const int element_size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_ARITHMETIC_GRAD_H_
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/batch_norm.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/batch_norm.c
new file mode 100644
index 00000000000..8867ed73fb5
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/batch_norm.c
@@ -0,0 +1,86 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
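These element-wise helpers are the building blocks of the division backward pass: for z = a / b, da = dy / b and db = -dy * a / b^2, which is what ElementMulAndDivNegSquareFp16 evaluates when called with (dy, a, b). A small float sketch of that wiring (the wrapper name is an assumption):

/* Hypothetical float wiring of the div backward pass mirrored by the fp16 helpers above. */
static void DivGradRef(const float *dy, const float *a, const float *b, float *da, float *db, int n) {
  for (int i = 0; i < n; i++) {
    da[i] = dy[i] / b[i];                  /* dz/da = 1 / b    */
    db[i] = -dy[i] * a[i] / (b[i] * b[i]); /* dz/db = -a / b^2 */
  }
}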
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include "nnacl/fp16_grad/batch_norm.h" + +void var2InvarFp16(float16_t *save_var, int size, float eps) { + for (int i = 0; i < size; i++) { + save_var[i] = (float16_t)(1.0f / sqrtf((float)save_var[i] + eps)); + } +} + +void backwardAllFp16(const float16_t *restrict in, const float16_t *restrict yt, const float16_t *restrict mean, + const float16_t *restrict invar, const float16_t *restrict scale, int size, int ch, + float *restrict dxhat_sum, float *restrict dxhathat_sum, float16_t *restrict dbias, + float16_t *restrict dscale, float16_t *restrict dx) { + float16_t N = (float16_t)size; + for (int i = 0; i < size; i++) { + for (int c = 0; c < ch; c++) { + int ix = i * ch + c; + dbias[c] += yt[ix]; + // dscale + float16_t x_hat = (in[ix] - mean[c]) * invar[c]; + dscale[c] += (yt[ix] * x_hat); + // dx_1 + float dx_hat = (float)(yt[ix] * scale[c]); + dxhat_sum[c] += dx_hat; + dxhathat_sum[c] += (float)(dx_hat * x_hat); + } + } + for (int i = 0; i < size; i++) { + for (int c = 0; c < ch; c++) { + // dx_2 + int ix = i * ch + c; + float16_t x_hat = (in[ix] - mean[c]) * invar[c]; + float16_t dx_hat = yt[ix] * scale[c]; + dx[ix] = 1.0f / N * (float)((invar[c]) * (N * dx_hat - dxhat_sum[c] - x_hat * dxhathat_sum[c])); + } + } +} +void backwardP1Fp16(const float16_t *restrict in, const float16_t *restrict yt, const float16_t *restrict mean, + const float16_t *restrict invar, const float16_t *restrict scale, int size, int ch, + float *restrict dxhat_sum, float *restrict dxhathat_sum, float16_t *restrict dbias, + float16_t *restrict dscale) { + for (int i = 0; i < size; i++) { + for (int c = 0; c < ch; c++) { + int ix = i * ch + c; + dbias[c] += yt[ix]; + // dscale + float x_hat = (float)((in[ix] - mean[c]) * invar[c]); + dscale[c] += (yt[ix] * x_hat); + // dx_1 + float dx_hat = (float)(yt[ix] * scale[c]); + dxhat_sum[c] += dx_hat; + dxhathat_sum[c] += dx_hat * x_hat; + } + } +} + +void backwardP2Fp16(const float16_t *restrict in, const float16_t *restrict yt, const float16_t *restrict mean, + const float16_t *restrict invar, const float16_t *restrict scale, int size, int total_size, int ch, + const float *dxhat_sum, const float *dxhathat_sum, float16_t *restrict dx) { + const float N = (float)total_size; + for (int i = 0; i < size; i++) { + for (int c = 0; c < ch; c++) { + // dx_2 + int ix = i * ch + c; + float x_hat = (float)((in[ix] - mean[c]) * invar[c]); + float dx_hat = (float)(yt[ix] * scale[c]); + dx[ix] = (float16_t)(1.0f / N * (float)(invar[c]) * (N * dx_hat - dxhat_sum[c] - x_hat * dxhathat_sum[c])); + } + } +} diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/batch_norm.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/batch_norm.h new file mode 100644 index 00000000000..e8e15f34099 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/batch_norm.h @@ -0,0 +1,40 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
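backwardAllFp16, backwardP1Fp16 and backwardP2Fp16 implement the standard batch-norm backward pass split into a reduction stage and an apply stage. A single-channel float sketch of the same math (illustrative only; assumes invstd = 1/sqrt(var + eps) as produced by var2InvarFp16, and the helper name is an assumption):

/* Reference batch-norm backward for one channel with n samples (hypothetical helper). */
static void BatchNormGradRefChannel(const float *x, const float *dy, float mean, float invstd, float gamma, int n,
                                    float *dx, float *dgamma, float *dbeta) {
  float sum_dxhat = 0.0f;
  float sum_dxhat_xhat = 0.0f;
  *dgamma = 0.0f;
  *dbeta = 0.0f;
  for (int i = 0; i < n; i++) {
    float x_hat = (x[i] - mean) * invstd;
    float dxhat = dy[i] * gamma;
    *dbeta += dy[i];          /* dbeta  = sum(dy)         */
    *dgamma += dy[i] * x_hat; /* dgamma = sum(dy * x_hat) */
    sum_dxhat += dxhat;
    sum_dxhat_xhat += dxhat * x_hat;
  }
  for (int i = 0; i < n; i++) {
    float x_hat = (x[i] - mean) * invstd;
    float dxhat = dy[i] * gamma;
    /* dx = invstd / N * (N * dxhat - sum(dxhat) - x_hat * sum(dxhat * x_hat)) */
    dx[i] = invstd / (float)n * ((float)n * dxhat - sum_dxhat - x_hat * sum_dxhat_xhat);
  }
}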
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_BATCH_NORM_H_ +#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_BATCH_NORM_H_ + +#include "nnacl/op_base.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void var2InvarFp16(float16_t *save_var, int size, float eps); +void backwardAllFp16(const float16_t *in, const float16_t *yt, const float16_t *mean, const float16_t *invar, + const float16_t *scale, int size, int ch, float *dxhat_sum, float *dxhathat_sum, float16_t *dbias, + float16_t *dscale, float16_t *dx); +void backwardP1Fp16(const float16_t *in, const float16_t *yt, const float16_t *mean, const float16_t *invar, + const float16_t *scale, int size, int ch, float *dxhat_sum, float *dxhathat_sum, float16_t *dbias, + float16_t *dscale); +void backwardP2Fp16(const float16_t *in, const float16_t *yt, const float16_t *mean, const float16_t *invar, + const float16_t *scale, int size, int total_size, int ch, const float *dxhat_sum, + const float *dxhathat_sum, float16_t *dx); +#ifdef __cplusplus +} +#endif + +#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_BATCH_NORM_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/convolution_grad_filter.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/convolution_grad_filter.c new file mode 100644 index 00000000000..6162a88bb90 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/convolution_grad_filter.c @@ -0,0 +1,360 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "nnacl/fp16_grad/convolution_grad_filter.h" +#include "nnacl/intrinsics/ms_simd_instructions_fp16.h" +#ifdef ENABLE_NEON +#include +#endif + +#ifdef ENABLE_NEON + +static int FilterGrad32Arm(const float16_t *x, const float16_t *dy, int i_c, int k_idx, float16_t *dw, + const ConvParameter *conv_param) { + int in_h = conv_param->input_h_; + int in_w = conv_param->input_w_; + int k_h = conv_param->kernel_h_; + int k_w = conv_param->kernel_w_; + int batch = conv_param->output_batch_; + int out_ch = conv_param->output_channel_; + int in_ch = conv_param->input_channel_; + int out_h = conv_param->output_h_; + int out_w = conv_param->output_w_; + + int m = out_h * out_w; + int x_size = in_h * in_w * in_ch; + int y_size = out_ch * out_h * out_w; + int k_spatial = k_w * k_h; + int i_kh = k_idx / k_w; + int i_kw = k_idx % k_w; + for (; i_c < (out_ch & ~31); i_c += 32) { + float32x4_t sum_0 = vdupq_n_f32(0.0f); + float32x4_t sum_1 = vdupq_n_f32(0.0f); + float32x4_t sum_2 = vdupq_n_f32(0.0f); + float32x4_t sum_3 = vdupq_n_f32(0.0f); + float32x4_t sum_4 = vdupq_n_f32(0.0f); + float32x4_t sum_5 = vdupq_n_f32(0.0f); + float32x4_t sum_6 = vdupq_n_f32(0.0f); + float32x4_t sum_7 = vdupq_n_f32(0.0f); + + for (int b = 0; b < batch; ++b) { + const float16_t *x_addr = &x[b * x_size]; + const float16_t *dy_addr = &dy[b * y_size]; + for (int i = 0; i < m; i++) { + int idx = i; + int input_h = idx / out_w * conv_param->stride_h_; + int input_w = idx % out_w * conv_param->stride_w_; + int input_row = -conv_param->pad_u_ + i_kh + input_h; + int input_col = -conv_param->pad_l_ + i_kw + input_w; + if (((unsigned)(input_row) < (unsigned)(in_h)) && ((unsigned)(input_col) < (unsigned)(in_w))) { + int offset_x = (input_row * in_w + input_col) * out_ch + i_c; + int offset_dy = idx * out_ch + i_c; + + float16x8_t x_0 = vld1q_f16(x_addr + offset_x); + float16x8_t dy_0 = vld1q_f16(dy_addr + offset_dy); + sum_0 = MS_VMLAL_F16(vget_low_f16(x_0), vget_low_f16(dy_0), sum_0); + sum_1 = MS_VMLAL_F16(vget_high_f16(x_0), vget_high_f16(dy_0), sum_1); + + float16x8_t x_1 = vld1q_f16(x_addr + offset_x + 8); + float16x8_t dy_1 = vld1q_f16(dy_addr + offset_dy + 8); + sum_2 = MS_VMLAL_F16(vget_low_f16(x_1), vget_low_f16(dy_1), sum_2); + sum_3 = MS_VMLAL_F16(vget_high_f16(x_1), vget_high_f16(dy_1), sum_3); + + float16x8_t x_2 = vld1q_f16(x_addr + offset_x + 16); + float16x8_t dy_2 = vld1q_f16(dy_addr + offset_dy + 16); + sum_4 = MS_VMLAL_F16(vget_low_f16(x_2), vget_low_f16(dy_2), sum_4); + sum_5 = MS_VMLAL_F16(vget_high_f16(x_2), vget_high_f16(dy_2), sum_5); + + float16x8_t x_3 = vld1q_f16(x_addr + offset_x + 24); + float16x8_t dy_3 = vld1q_f16(dy_addr + offset_dy + 24); + sum_6 = MS_VMLAL_F16(vget_low_f16(x_3), vget_low_f16(dy_3), sum_6); + sum_7 = MS_VMLAL_F16(vget_high_f16(x_3), vget_high_f16(dy_3), sum_7); + } + } + } + // store into memory + for (int l = 0; l < 4; l++) { + dw[(i_c + l) * k_spatial + k_idx] = sum_0[l]; + dw[(i_c + 4 + l) * k_spatial + k_idx] = sum_1[l]; + dw[(i_c + 8 + l) * k_spatial + k_idx] = sum_2[l]; + dw[(i_c + 12 + l) * k_spatial + k_idx] = sum_3[l]; + dw[(i_c + 16 + l) * k_spatial + k_idx] = sum_4[l]; + dw[(i_c + 20 + l) * k_spatial + k_idx] = sum_5[l]; + dw[(i_c + 24 + l) * k_spatial + k_idx] = sum_6[l]; + dw[(i_c + 28 + l) * k_spatial + k_idx] = sum_7[l]; + } + } + return i_c; +} + +static int FilterGrad16Arm(const float16_t *x, const float16_t *dy, int i_c, int k_idx, float16_t *dw, + const ConvParameter *conv_param) { + int in_h = conv_param->input_h_; + int in_w = conv_param->input_w_; + int 
k_h = conv_param->kernel_h_; + int k_w = conv_param->kernel_w_; + int batch = conv_param->output_batch_; + int out_ch = conv_param->output_channel_; + int in_ch = conv_param->input_channel_; + int out_h = conv_param->output_h_; + int out_w = conv_param->output_w_; + + int m = out_h * out_w; + int x_size = in_h * in_w * in_ch; + int y_size = out_ch * out_h * out_w; + int k_spatial = k_w * k_h; + int i_kh = k_idx / k_w; + int i_kw = k_idx % k_w; + for (; i_c < (out_ch & ~15); i_c += 16) { + float32x4_t sum_0 = vdupq_n_f32(0.0f); + float32x4_t sum_1 = vdupq_n_f32(0.0f); + float32x4_t sum_2 = vdupq_n_f32(0.0f); + float32x4_t sum_3 = vdupq_n_f32(0.0f); + for (int b = 0; b < batch; ++b) { + const float16_t *x_addr = &x[b * x_size]; + const float16_t *dy_addr = &dy[b * y_size]; + for (int i = 0; i < m; i++) { + int idx = i; + int input_h = idx / out_w * conv_param->stride_h_; + int input_w = idx % out_w * conv_param->stride_w_; + int input_row = -conv_param->pad_u_ + i_kh + input_h; + int input_col = -conv_param->pad_l_ + i_kw + input_w; + if (((unsigned)(input_row) < (unsigned)(in_h)) && ((unsigned)(input_col) < (unsigned)(in_w))) { + int offset_x = (input_row * in_w + input_col) * out_ch + i_c; + int offset_dy = idx * out_ch + i_c; + + float16x8_t x_0 = vld1q_f16(x_addr + offset_x); + float16x8_t dy_0 = vld1q_f16(dy_addr + offset_dy); + sum_0 = MS_VMLAL_F16(vget_low_f16(x_0), vget_low_f16(dy_0), sum_0); + sum_1 = MS_VMLAL_F16(vget_high_f16(x_0), vget_high_f16(dy_0), sum_1); + + float16x8_t x_1 = vld1q_f16(x_addr + offset_x + 8); + float16x8_t dy_1 = vld1q_f16(dy_addr + offset_dy + 8); + sum_2 = MS_VMLAL_F16(vget_low_f16(x_1), vget_low_f16(dy_1), sum_2); + sum_3 = MS_VMLAL_F16(vget_high_f16(x_1), vget_high_f16(dy_1), sum_3); + } + } + } + for (int l = 0; l < 4; l++) { + dw[(i_c + l) * k_spatial + k_idx] = sum_0[l]; + dw[(i_c + l + 4) * k_spatial + k_idx] = sum_1[l]; + dw[(i_c + l + 8) * k_spatial + k_idx] = sum_2[l]; + dw[(i_c + l + 12) * k_spatial + k_idx] = sum_3[l]; + } + } + return i_c; +} + +static int FilterGrad8Arm(const float16_t *x, const float16_t *dy, int i_c, int k_idx, float16_t *dw, + const ConvParameter *conv_param) { + int in_h = conv_param->input_h_; + int in_w = conv_param->input_w_; + int k_h = conv_param->kernel_h_; + int k_w = conv_param->kernel_w_; + int batch = conv_param->output_batch_; + int out_ch = conv_param->output_channel_; + int in_ch = conv_param->input_channel_; + int out_h = conv_param->output_h_; + int out_w = conv_param->output_w_; + + int m = out_h * out_w; + int x_size = in_h * in_w * in_ch; + int y_size = out_ch * out_h * out_w; + int k_spatial = k_w * k_h; + int i_kh = k_idx / k_w; + int i_kw = k_idx % k_w; + for (; i_c < (out_ch & ~7); i_c += 8) { + float32x4_t sum_0 = vdupq_n_f32(0.0f); + float32x4_t sum_1 = vdupq_n_f32(0.0f); + + for (int b = 0; b < batch; ++b) { + const float16_t *x_addr = &x[b * x_size]; + const float16_t *dy_addr = &dy[b * y_size]; + for (int i = 0; i < m; i++) { + int idx = i; + int input_h = idx / out_w * conv_param->stride_h_; + int input_w = idx % out_w * conv_param->stride_w_; + int input_row = -conv_param->pad_u_ + i_kh + input_h; + int input_col = -conv_param->pad_l_ + i_kw + input_w; + if (((unsigned)(input_row) < (unsigned)(in_h)) && ((unsigned)(input_col) < (unsigned)(in_w))) { + int offset_x = (input_row * in_w + input_col) * out_ch + i_c; + int offset_dy = idx * out_ch + i_c; + float16x8_t x_0 = vld1q_f16(x_addr + offset_x); + float16x8_t dy_0 = vld1q_f16(dy_addr + offset_dy); + sum_0 = MS_VMLAL_F16(vget_low_f16(x_0), 
vget_low_f16(dy_0), sum_0); + sum_1 = MS_VMLAL_F16(vget_high_f16(x_0), vget_high_f16(dy_0), sum_1); + } + } + } + for (int l = 0; l < 4; l++) { + dw[(i_c + l) * k_spatial + k_idx] = sum_0[l]; + dw[(i_c + 4 + l) * k_spatial + k_idx] = sum_1[l]; + } + } + return i_c; +} + +static int FilterGrad4Arm(const float16_t *x, const float16_t *dy, int i_c, int k_idx, float16_t *dw, + const ConvParameter *conv_param) { + int in_h = conv_param->input_h_; + int in_w = conv_param->input_w_; + int k_h = conv_param->kernel_h_; + int k_w = conv_param->kernel_w_; + int batch = conv_param->output_batch_; + int out_ch = conv_param->output_channel_; + int in_ch = conv_param->input_channel_; + int out_h = conv_param->output_h_; + int out_w = conv_param->output_w_; + + int m = out_h * out_w; + int x_size = in_h * in_w * in_ch; + int y_size = out_ch * out_h * out_w; + int k_spatial = k_w * k_h; + int i_kh = k_idx / k_w; + int i_kw = k_idx % k_w; + for (; i_c < (out_ch & ~3); i_c += 4) { + float32x4_t sum_0 = vdupq_n_f32(0.0f); + + for (int b = 0; b < batch; ++b) { + const float16_t *x_addr = &x[b * x_size]; + const float16_t *dy_addr = &dy[b * y_size]; + + for (int i = 0; i < m; i++) { + int idx = i; + int input_h = idx / out_w * conv_param->stride_h_; + int input_w = idx % out_w * conv_param->stride_w_; + int input_row = -conv_param->pad_u_ + i_kh + input_h; + int input_col = -conv_param->pad_l_ + i_kw + input_w; + if (((unsigned)(input_row) < (unsigned)(in_h)) && ((unsigned)(input_col) < (unsigned)(in_w))) { + int offset_x = (input_row * in_w + input_col) * out_ch + i_c; + int offset_dy = idx * out_ch + i_c; + float16x4_t x_0 = vld1_f16(x_addr + offset_x); + float16x4_t dy_0 = vld1_f16(dy_addr + offset_dy); + sum_0 = MS_VMLAL_F16(x_0, dy_0, sum_0); + } + } + } + dw[(i_c + 0) * k_spatial + k_idx] = sum_0[0]; + dw[(i_c + 1) * k_spatial + k_idx] = sum_0[1]; + dw[(i_c + 2) * k_spatial + k_idx] = sum_0[2]; + dw[(i_c + 3) * k_spatial + k_idx] = sum_0[3]; + } + return i_c; +} + +static int FilterGradLeftoverArm(const float16_t *x, const float16_t *dy, int i_c, int k_idx, float16_t *dw, + const ConvParameter *conv_param) { + int in_h = conv_param->input_h_; + int in_w = conv_param->input_w_; + int k_h = conv_param->kernel_h_; + int k_w = conv_param->kernel_w_; + int batch = conv_param->output_batch_; + int out_ch = conv_param->output_channel_; + int in_ch = conv_param->input_channel_; + int out_h = conv_param->output_h_; + int out_w = conv_param->output_w_; + + int m = out_h * out_w; + int x_size = in_h * in_w * in_ch; + int y_size = out_ch * out_h * out_w; + int k_spatial = k_w * k_h; + int i_kh = k_idx / k_w; + int i_kw = k_idx % k_w; + int leftover = out_ch - i_c; + if (leftover > 0) { + float32x4_t sum_0 = vdupq_n_f32(0.0f); + + for (int b = 0; b < batch; ++b) { + const float16_t *x_addr = &x[b * x_size]; + const float16_t *dy_addr = &dy[b * y_size]; + + for (int i = 0; i < m; i++) { + int idx = i; + int input_h = idx / out_w * conv_param->stride_h_; + int input_w = idx % out_w * conv_param->stride_w_; + int input_row = -conv_param->pad_u_ + i_kh + input_h; + int input_col = -conv_param->pad_l_ + i_kw + input_w; + if (((unsigned)(input_row) < (unsigned)(in_h)) && ((unsigned)(input_col) < (unsigned)(in_w))) { + int offset_x = (input_row * in_w + input_col) * out_ch + i_c; + int offset_dy = idx * out_ch + i_c; + float16x4_t x_0 = vld1_f16(x_addr + offset_x); + float16x4_t dy_0 = vld1_f16(dy_addr + offset_dy); + sum_0 = MS_VMLAL_F16(x_0, dy_0, sum_0); + } + } + } + for (int l = 0; l < leftover; l++) { + dw[(i_c + l) * 
k_spatial + k_idx] = sum_0[l]; + } + } + return out_ch; +} + +#endif + +int ConvDwFilterFp16Grad(const float16_t *x, const float16_t *dy, float16_t *dw, int start, int count, + const ConvParameter *conv_param) { + int in_h = conv_param->input_h_; + int in_w = conv_param->input_w_; + int k_h = conv_param->kernel_h_; + int k_w = conv_param->kernel_w_; + int batch = conv_param->output_batch_; + int out_ch = conv_param->output_channel_; + int in_ch = conv_param->input_channel_; + int out_h = conv_param->output_h_; + int out_w = conv_param->output_w_; + + int m = out_h * out_w; + int x_size = in_h * in_w * in_ch; + int y_size = out_ch * out_h * out_w; + int k_spatial = k_w * k_h; + + for (int i_k = 0; i_k < count; i_k++) { + int k_idx = start + i_k; + int i_kh = k_idx / k_w; + int i_kw = k_idx % k_w; + int i_c = 0; +#ifdef ENABLE_NEON + i_c = FilterGrad32Arm(x, dy, i_c, k_idx, dw, conv_param); + i_c = FilterGrad16Arm(x, dy, i_c, k_idx, dw, conv_param); + i_c = FilterGrad8Arm(x, dy, i_c, k_idx, dw, conv_param); + i_c = FilterGrad4Arm(x, dy, i_c, k_idx, dw, conv_param); + i_c = FilterGradLeftoverArm(x, dy, i_c, k_idx, dw, conv_param); +#endif + for (; i_c < out_ch; i_c++) { + float sum = 0; + for (int b = 0; b < batch; ++b) { + const float16_t *x_addr = &x[b * x_size]; + const float16_t *dy_addr = &dy[b * y_size]; + + for (int i = 0; i < m; i++) { + int idx = i; + int input_h = idx / out_w * conv_param->stride_h_; + int input_w = idx % out_w * conv_param->stride_w_; + int input_row = -conv_param->pad_u_ + i_kh + input_h; + int input_col = -conv_param->pad_l_ + i_kw + input_w; + if (((unsigned)(input_row) < (unsigned)(in_h)) && ((unsigned)(input_col) < (unsigned)(in_w))) { + int offset_x = (input_row * in_w + input_col) * out_ch + i_c; + int offset_dy = idx * out_ch + i_c; + sum += x_addr[offset_x] * dy_addr[offset_dy]; + } + } + } + dw[i_c * k_spatial + k_idx] = sum; + } + } + return 0; +} diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/convolution_grad_filter.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/convolution_grad_filter.h new file mode 100644 index 00000000000..2b6ddc96f3c --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/convolution_grad_filter.h @@ -0,0 +1,33 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
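ConvDwFilterFp16Grad accumulates dw[c][kh][kw] as the sum over the batch and all output positions of x * dy for NHWC depthwise data, with the NEON helpers peeling 32-, 16-, 8- and 4-channel blocks before the scalar tail. A hedged sketch of how a caller might divide the kernel-spatial positions across threads through start/count (the wrapper name and split policy are assumptions):

#include "nnacl/fp16_grad/convolution_grad_filter.h"

/* Hypothetical per-thread driver: each task handles a contiguous slice of the k_h * k_w positions. */
void ConvDwFilterGradByThread(const float16_t *x, const float16_t *dy, float16_t *dw, int task_id, int thread_num,
                              const ConvParameter *conv_param) {
  int k_spatial = conv_param->kernel_h_ * conv_param->kernel_w_;
  int per_thread = (k_spatial + thread_num - 1) / thread_num; /* ceiling split */
  int start = task_id * per_thread;
  int count = (k_spatial - start) < per_thread ? (k_spatial - start) : per_thread;
  if (count > 0) {
    ConvDwFilterFp16Grad(x, dy, dw, start, count, conv_param);
  }
}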
+ */ + +#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_CONVOLUTION_GRAD_FILTER_H_ +#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_CONVOLUTION_GRAD_FILTER_H_ + +#include +#include "nnacl/conv_parameter.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int ConvDwFilterFp16Grad(const float16_t *x, const float16_t *dy, float16_t *dw, int start, int count, + const ConvParameter *conv_param); +#ifdef __cplusplus +} +#endif + +#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_CONVOLUTION_GRAD_FILTER_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/dropout_grad.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/dropout_grad.c new file mode 100644 index 00000000000..aee7a0fa273 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/dropout_grad.c @@ -0,0 +1,24 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nnacl/fp16_grad/dropout_grad.h" + +void DropoutFp16Grad(const float16_t *yt_ptr, const float16_t *mask, float16_t *output_ptr, int length, + float16_t scale) { + for (int i = 0; i < length; i++) { + output_ptr[i] = yt_ptr[i] * mask[i] * scale; + } +} diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/dropout_grad.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/dropout_grad.h new file mode 100644 index 00000000000..f5ead0f3791 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/dropout_grad.h @@ -0,0 +1,32 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
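DropoutFp16Grad simply rescales the masked incoming gradient; note that the implementation names the last parameter scale while the header below declares it as ratio. A hedged usage sketch, assuming the caller passes the inverted keep-probability as that argument (example function is illustrative only):

#include "nnacl/fp16_grad/dropout_grad.h"

/* Hypothetical caller: converts the drop ratio into the scale the kernel multiplies by. */
void DropoutGradExample(const float16_t *dy, const float16_t *mask, float16_t *dx, int length, float ratio) {
  float16_t scale = (float16_t)(1.0f / (1.0f - ratio));
  DropoutFp16Grad(dy, mask, dx, length, scale);
}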
+ */ + +#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_DROPOUT_GRAD_H_ +#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_DROPOUT_GRAD_H_ + +#include "nnacl/op_base.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void DropoutFp16Grad(const float16_t *yt_ptr, const float16_t *mask, float16_t *output_ptr, int length, + float16_t ratio); +#ifdef __cplusplus +} +#endif + +#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_DROPOUT_GRAD_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/gemm_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/gemm_fp16.c new file mode 100644 index 00000000000..36622420ac0 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/gemm_fp16.c @@ -0,0 +1,373 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nnacl/fp16_grad/gemm_fp16.h" +#include +#ifdef __ARM_NEON +#include +#endif +#include "nnacl/fp16/matmul_fp16.h" +#include "nnacl/fp16/pack_fp16.h" + +#ifdef ENABLE_ARM64 +static void Row2Col16Block16(const float16_t *src_ptr, float16_t *dst_ptr, size_t col) { + size_t stride = col * 2; + asm volatile( + "mov x10, %[src_c]\n" + "mov x11, %[dst_c]\n" + + "ld1 {v0.8h}, [x10], %[stride]\n" + "ld1 {v1.8h}, [x10], %[stride]\n" + "ld1 {v2.8h}, [x10], %[stride]\n" + "ld1 {v3.8h}, [x10], %[stride]\n" + "ld1 {v4.8h}, [x10], %[stride]\n" + "ld1 {v5.8h}, [x10], %[stride]\n" + "ld1 {v6.8h}, [x10], %[stride]\n" + "ld1 {v7.8h}, [x10], %[stride]\n" + + "zip1 v16.8h, v0.8h, v1.8h\n" + "zip1 v17.8h, v2.8h, v3.8h\n" + "zip1 v18.8h, v4.8h, v5.8h\n" + "zip1 v19.8h, v6.8h, v7.8h\n" + + "ld1 {v8.8h}, [x10], %[stride]\n" + "ld1 {v9.8h}, [x10], %[stride]\n" + "ld1 {v10.8h}, [x10], %[stride]\n" + "ld1 {v11.8h}, [x10], %[stride]\n" + "ld1 {v12.8h}, [x10], %[stride]\n" + "ld1 {v13.8h}, [x10], %[stride]\n" + "ld1 {v14.8h}, [x10], %[stride]\n" + "ld1 {v15.8h}, [x10], %[stride]\n" + + "trn1 v20.4s, v16.4s, v17.4s\n" + "trn2 v21.4s, v16.4s, v17.4s\n" + "trn1 v22.4s, v18.4s, v19.4s\n" + "trn2 v23.4s, v18.4s, v19.4s\n" + + "trn1 v24.2d, v20.2d, v22.2d\n" + "trn2 v25.2d, v20.2d, v22.2d\n" + "trn1 v26.2d, v21.2d, v23.2d\n" + "trn2 v27.2d, v21.2d, v23.2d\n" + + "zip1 v16.8h, v8.8h, v9.8h\n" + "zip1 v17.8h, v10.8h, v11.8h\n" + "zip1 v18.8h, v12.8h, v13.8h\n" + "zip1 v19.8h, v14.8h, v15.8h\n" + + "trn1 v20.4s, v16.4s, v17.4s\n" + "trn2 v21.4s, v16.4s, v17.4s\n" + "trn1 v22.4s, v18.4s, v19.4s\n" + "trn2 v23.4s, v18.4s, v19.4s\n" + + "trn1 v28.2d, v20.2d, v22.2d\n" + "trn2 v29.2d, v20.2d, v22.2d\n" + "trn1 v30.2d, v21.2d, v23.2d\n" + "trn2 v31.2d, v21.2d, v23.2d\n" + + "st1 {v24.8h}, [x11], #16\n" + "st1 {v28.8h}, [x11], #16\n" + "st1 {v26.8h}, [x11], #16\n" + "st1 {v30.8h}, [x11], #16\n" + "st1 {v25.8h}, [x11], #16\n" + "st1 {v29.8h}, [x11], #16\n" + "st1 {v27.8h}, [x11], #16\n" + "st1 {v31.8h}, [x11], #16\n" + + "zip2 v16.8h, v0.8h, v1.8h\n" + "zip2 v17.8h, v2.8h, v3.8h\n" + "zip2 v18.8h, v4.8h, 
v5.8h\n" + "zip2 v19.8h, v6.8h, v7.8h\n" + + "trn1 v20.4s, v16.4s, v17.4s\n" + "trn2 v21.4s, v16.4s, v17.4s\n" + "trn1 v22.4s, v18.4s, v19.4s\n" + "trn2 v23.4s, v18.4s, v19.4s\n" + + "trn1 v24.2d, v20.2d, v22.2d\n" + "trn2 v25.2d, v20.2d, v22.2d\n" + "trn1 v26.2d, v21.2d, v23.2d\n" + "trn2 v27.2d, v21.2d, v23.2d\n" + + "zip2 v16.8h, v8.8h, v9.8h\n" + "zip2 v17.8h, v10.8h, v11.8h\n" + "zip2 v18.8h, v12.8h, v13.8h\n" + "zip2 v19.8h, v14.8h, v15.8h\n" + + "trn1 v20.4s, v16.4s, v17.4s\n" + "trn2 v21.4s, v16.4s, v17.4s\n" + "trn1 v22.4s, v18.4s, v19.4s\n" + "trn2 v23.4s, v18.4s, v19.4s\n" + + "trn1 v28.2d, v20.2d, v22.2d\n" + "trn2 v29.2d, v20.2d, v22.2d\n" + "trn1 v30.2d, v21.2d, v23.2d\n" + "trn2 v31.2d, v21.2d, v23.2d\n" + + "st1 {v24.8h}, [x11], #16\n" + "st1 {v28.8h}, [x11], #16\n" + "st1 {v26.8h}, [x11], #16\n" + "st1 {v30.8h}, [x11], #16\n" + "st1 {v25.8h}, [x11], #16\n" + "st1 {v29.8h}, [x11], #16\n" + "st1 {v27.8h}, [x11], #16\n" + "st1 {v31.8h}, [x11], #16\n" + : + : [ dst_c ] "r"(dst_ptr), [ src_c ] "r"(src_ptr), [ stride ] "r"(stride) + : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); +} +#endif + +void AddMatrixFp16(const float16_t *restrict v1, float16_t *restrict v2, float16_t beta, int row, int col, int stride) { + const float16_t *src_ptr = v1; + float16_t *dst_ptr = v2; + for (int r = 0; r < row; r++) { + for (int c = 0; c < col; c++) { + dst_ptr[c] += beta * src_ptr[c]; + } + src_ptr += stride; + dst_ptr += stride; + } +} + +int MatSizeFp16(int row, int col, int round) { + int res = UP_ROUND(row, round) * col; + return res; +} + +int MatSizeTotalFp16(int row, int col, int deep, int stride) { +#ifdef ENABLE_ARM64 + const int num = C16NUM; +#else + const int num = C12NUM; +#endif + int res = MatSizeFp16(row, deep, num) + MatSizeFp16(col, deep, C8NUM); + if (stride > 0) res += row * stride; + return res; +} + +#ifdef ENABLE_ARM64 +static void RowMajor2Col16MajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) { + size_t row_up_16 = UP_ROUND(row, C16NUM); + size_t row16 = row / C16NUM * C16NUM; + size_t col8 = col / C8NUM * C8NUM; + const float16_t *src_r = src; + float16_t *dst_r = dst; + size_t ri = 0; + // find 16 block unit + for (; ri < row16; ri += C16NUM) { + size_t ci = 0; + for (; ci < col8; ci += C8NUM) { + const float16_t *src_c = src_r + ci; + float16_t *dst_c = dst_r + ci * C16NUM; +#ifdef ENABLE_ARM64 + Row2Col16Block16(src_c, dst_c, stride); +#else + for (int tr = 0; tr < C16NUM; tr++) { + for (int tc = 0; tc < C8NUM; tc++) { + dst_c[tc * C16NUM + tr] = src_c[tr * stride + tc]; + } + } +#endif + } + for (; ci < col; ci++) { + const float16_t *src_c = src_r + ci; + float16_t *dst_c = dst_r + ci * C16NUM; + for (size_t i = 0; i < C16NUM; i++) { + dst_c[i] = src_c[i * stride]; + } + } + src_r += C16NUM * stride; + dst_r += C16NUM * col; + } + for (; ri < row; ri++) { + for (size_t i = 0; i < col; ++i) { + dst_r[i * C16NUM] = src_r[i]; + } + src_r += stride; + dst_r += 1; + } + for (; ri < row_up_16; ri++) { + for (size_t i = 0; i < col; i++) { + dst_r[i * C16NUM] = 0; + } + dst_r += 1; + } + return; +} +#endif + +void RowMajor2Row16MajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) { + for (int r = 0; r < row; r++) { + for (int c = 0; c < col; c++) { + int c_div16 = c / 16; + int c_mod16 = c % 16; + dst[c_div16 * 16 * row + 
r * 16 + c_mod16] = src[r * stride + c]; + } + } +} + +void RowMajor2Col12MajorStrideFp16(const float16_t *src, float16_t *dst, size_t row, size_t col, int stride) { + size_t row_up_12 = UP_ROUND(row, C12NUM); + size_t row12 = row / C12NUM * C12NUM; + size_t col8 = col / C8NUM * C8NUM; + const float16_t *src_r = src; + float16_t *dst_r = dst; + size_t ri = 0; + // transpose 12x8 + for (; ri < row12; ri += C12NUM) { + size_t ci = 0; + for (; ci < col8; ci += C8NUM) { + const float16_t *src_c = src_r + ci; + float16_t *dst_c = dst_r + ci * C12NUM; +#ifdef ENABLE_ARM82_A32 + Transpose12x8A32Fp16(src_c, dst_c, stride * sizeof(float16_t), 24); +#else + for (int tr = 0; tr < C12NUM; tr++) { + for (int tc = 0; tc < C8NUM; tc++) { + dst_c[tc * C12NUM + tr] = src_c[tr * stride + tc]; + } + } +#endif + } + for (; ci < col; ci++) { + const float16_t *src_c = src_r + ci; + float16_t *dst_c = dst_r + ci * C12NUM; + for (size_t i = 0; i < C12NUM; i++) { + dst_c[i] = src_c[i * stride]; + } + } + src_r += C12NUM * stride; + dst_r += C12NUM * col; + } + for (; ri < row; ri++) { + for (size_t i = 0; i < col; ++i) { + dst_r[i * C12NUM] = src_r[i]; + } + src_r += stride; + dst_r += 1; + } + for (; ri < row_up_12; ri++) { + for (size_t i = 0; i < col; i++) { + dst_r[i * C12NUM] = 0; + } + dst_r += 1; + } +} + +void RowMajor2Row12MajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) { + for (int r = 0; r < row; r++) { + for (int c = 0; c < col; c++) { + int c_div12 = c / 12; + int c_mod12 = c % 12; + dst[c_div12 * 12 * row + r * 12 + c_mod12] = src[r * stride + c]; + } + } +} + +static void RowMajor2Col8MajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) { + for (int r = 0; r < row; r++) { + for (int c = 0; c < col; c++) { + int r_div8 = r / 8; + int r_mod8 = r % 8; + dst[r_div8 * 8 * col + c * 8 + r_mod8] = src[r * stride + c]; + } + } +} + +static void RowMajor2Row8MajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) { + for (int r = 0; r < row; r++) { + const float16_t *src_ptr = src + r * stride; + int c = 0; + for (; c < col; c++) { + int cd8 = c / C8NUM; + int cm8 = c % C8NUM; + dst[cd8 * C8NUM * row + r * C8NUM + cm8] = src_ptr[c]; + } + for (; c < UP_ROUND(col, C8NUM); c++) { + int cd8 = c / C8NUM; + int cm8 = c % C8NUM; + dst[cd8 * C8NUM * row + r * C8NUM + cm8] = 0; + } + } + return; +} + +static void RowMajor2ColXMajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) { +#ifdef ENABLE_ARM64 + RowMajor2Col16MajorStrideFp16(src, dst, row, col, stride); +#else + RowMajor2Col12MajorStrideFp16(src, dst, row, col, stride); +#endif +} + +static void RowMajor2RowXMajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) { +#ifdef ENABLE_ARM64 + RowMajor2Row16MajorStrideFp16(src, dst, row, col, stride); +#else + RowMajor2Row12MajorStrideFp16(src, dst, row, col, stride); +#endif +} + +void GemmMatmulFp16(int ta, int tb, int M, int N, int K, float16_t alpha, const float16_t *mat_a, int lda, + const float16_t *mat_b, int ldb, float16_t beta, float16_t *mat_c, int ldc, float16_t *workspace) { + GemmCbFp16 gcb; + gcb.atype = ActType_No; + gcb.ca = 0; + gcb.cb = 0; + gcb.bias = NULL; + GemmMatmulPlusFp16(ta, tb, M, N, K, alpha, mat_a, lda, mat_b, ldb, beta, mat_c, ldc, workspace, &gcb); +} + +void GemmMatmulPlusFp16(int ta, int tb, int M, int N, int K, float16_t alpha, const float16_t *mat_a, int lda, + const float16_t *mat_b, int ldb, float16_t beta, float16_t *mat_c, 
int ldc, + float16_t *workspace, GemmCbFp16 *gcb) { +#ifdef ENABLE_ARM64 + const int num = C16NUM; +#else + const int num = C12NUM; +#endif + float16_t *output = mat_c; + float16_t *fworkspace = workspace; + int incremental = (beta < 0.f) || (beta > 0.f); + float16_t *mat_a_input = (float16_t *)mat_a; + float16_t *mat_b_input = (float16_t *)mat_b; + + if (!gcb->ca) { + mat_a_input = fworkspace; + fworkspace += MatSizeFp16(M, K, num); + if (ta) { + RowMajor2RowXMajorStrideFp16(mat_a, mat_a_input, K, M, lda); + } else { + RowMajor2ColXMajorStrideFp16(mat_a, mat_a_input, M, K, lda); + } + } + if (!gcb->cb) { + mat_b_input = fworkspace; + fworkspace += MatSizeFp16(N, K, C8NUM); + if (tb) { + RowMajor2Col8MajorStrideFp16(mat_b, mat_b_input, N, K, ldb); + } else { + RowMajor2Row8MajorStrideFp16(mat_b, mat_b_input, K, N, ldb); + } + } + if (incremental) output = fworkspace; + MatMulFp16(mat_a_input, mat_b_input, output, gcb->bias, gcb->atype, K, M, N, ldc, OutType_Nhwc); + if (incremental) AddMatrixFp16(output, mat_c, beta, M, N, ldc); + gcb->mat_a = mat_a_input; + gcb->mat_b = mat_b_input; +} diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/gemm_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/gemm_fp16.h new file mode 100644 index 00000000000..f342e2e1135 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/gemm_fp16.h @@ -0,0 +1,46 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
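GemmMatmulFp16 packs any operand that is not already packed into the caller-provided workspace (16- or 12-row blocks for A, 8-wide blocks for B) before invoking MatMulFp16, and MatSizeTotalFp16 reports how many float16 elements that workspace must hold. A hedged usage sketch for a plain C = A * B with no transposes and beta = 0 (the example function and leading dimensions are assumptions):

#include <stdlib.h>
#include "nnacl/fp16_grad/gemm_fp16.h"

/* Hypothetical caller: A is M x K, B is K x N, C is M x N, all row-major and unpacked. */
void GemmExample(const float16_t *a, const float16_t *b, float16_t *c, int M, int N, int K) {
  int ws_elems = MatSizeTotalFp16(M, N, K, 0); /* no extra output stride needed when beta == 0 */
  float16_t *ws = (float16_t *)malloc(ws_elems * sizeof(float16_t));
  if (ws == NULL) {
    return;
  }
  /* ta = 0, tb = 0, alpha = 1, lda = K, ldb = N, beta = 0, ldc = N */
  GemmMatmulFp16(0, 0, M, N, K, (float16_t)1.0f, a, K, b, N, (float16_t)0.0f, c, N, ws);
  free(ws);
}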
+ */ + +#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_GEMM_FP16_H_ +#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_GEMM_FP16_H_ + +#include +#include "nnacl/op_base.h" +#ifdef __cplusplus +extern "C" { +#endif +typedef struct { + int ca; + int cb; + ActType atype; + float16_t *bias; + float16_t *mat_a; + float16_t *mat_b; +} GemmCbFp16; + +void GemmMatmulFp16(int ta, int tb, int M, int N, int K, float16_t alpha, const float16_t *mat_a, int lda, + const float16_t *mat_b, int ldb, float16_t beta, float16_t *mat_c, int ldc, float16_t *workspace); +void GemmMatmulPlusFp16(int ta, int tb, int M, int N, int K, float16_t alpha, const float16_t *mat_a, int lda, + const float16_t *mat_b, int ldb, float16_t beta, float16_t *mat_c, int ldc, + float16_t *workspace, GemmCbFp16 *gcb); +int MatSizeFp16(int row, int col, int round); +int MatSizeTotalFp16(int row, int col, int deep, int inc); +void AddMatrixFp16(const float16_t *v1, float16_t *v2, float16_t beta, int row, int col, int stride); +#ifdef __cplusplus +} +#endif + +#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_GEMM_FP16_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/layernorm_grad.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/layernorm_grad.c new file mode 100644 index 00000000000..2e5e7baf084 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/layernorm_grad.c @@ -0,0 +1,59 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "nnacl/fp16_grad/layernorm_grad.h" +#include +#include + +void LayerNormFp16Grad(const float16_t *x, const float16_t *dy, const float16_t *var, const float16_t *mean, + const float16_t *gamma, int param_num, int param_size, int block_num, int block_size, + float16_t *dx, float16_t *dg, float16_t *db) { + // var is actually 1/sqrf(var)-> var^0.5 + const float16_t *var_sqrt_rev = var; + for (size_t i = 0; i < param_num; ++i) { + float dgamma = 0.0f; + float dbeta = 0.0f; + for (size_t j = i; j < param_size * param_num; j += param_num) { + int norm_shift = (int)(j / block_size); + dgamma += dy[j] * var_sqrt_rev[norm_shift] * (x[j] - mean[norm_shift]); + dbeta += dy[j]; + } + dg[i] = (float16_t)dgamma; + db[i] = (float16_t)dbeta; + } + for (size_t i = 0; i < block_num; ++i) { + float sum1 = 0.0f; + float sum2 = 0.0f; + float sum3 = 0.0f; + for (size_t j = i * block_size; j < (i + 1) * block_size; ++j) { + int param_shift = j % param_num; + int norm_shift = (int)(j / block_size); + float16_t dxm = x[j] - mean[norm_shift]; + float16_t dyg = dy[j] * gamma[param_shift]; + sum1 += -0.5f * dyg * dxm * var_sqrt_rev[norm_shift] * var_sqrt_rev[norm_shift] * var_sqrt_rev[norm_shift]; + sum2 += dyg; + sum3 += -2.0f * dxm; + } + for (size_t j = i * block_size; j < (i + 1) * block_size; ++j) { + int param_shift = j % param_num; + int norm_shift = (int)(j / block_size); + float16_t var_sqrt = var_sqrt_rev[norm_shift]; + float dx1 = dy[j] * gamma[param_shift] * var_sqrt; + float dx2 = sum1 * 2.0f / block_size * (x[j] - mean[norm_shift]); + float dx3 = (-1.0f * var_sqrt * sum2 + (1.0f / block_size) * sum1 * sum3) * (1.0f / block_size); + dx[j] = (float16_t)(dx1 + dx2 + dx3); + } + } +} diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/layernorm_grad.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/layernorm_grad.h new file mode 100644 index 00000000000..9d8740fec2c --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/layernorm_grad.h @@ -0,0 +1,32 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
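LayerNormFp16Grad expects var to already hold 1/sqrt(var + eps), as the comment above notes, and walks the data as block_num blocks of block_size elements with param_num affine parameters. A hedged sketch of the parameter mapping for an (N, C) input normalized over its last axis (the mapping and wrapper are assumptions for illustration):

#include "nnacl/fp16_grad/layernorm_grad.h"

/* Hypothetical wrapper: per-channel gamma/beta of size C, one mean/invstd per row of length C. */
void LayerNormGradExample(const float16_t *x, const float16_t *dy, const float16_t *invstd, const float16_t *mean,
                          const float16_t *gamma, int N, int C, float16_t *dx, float16_t *dg, float16_t *db) {
  /* param_num = C, param_size = N, block_num = N, block_size = C */
  LayerNormFp16Grad(x, dy, invstd, mean, gamma, C, N, N, C, dx, dg, db);
}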
+ */ +#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_LAYERNORM_GRAD_H_ +#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_LAYERNORM_GRAD_H_ + +#include "nnacl/op_base.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void LayerNormFp16Grad(const float16_t *x, const float16_t *dy, const float16_t *var, const float16_t *mean, + const float16_t *gamma, int param_num, int param_size, int block_num, int block_size, + float16_t *dx, float16_t *dg, float16_t *db); + +#ifdef __cplusplus +} +#endif +#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_LAYERNORM_GRAD_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/pack_fp16_ext.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/pack_fp16_ext.c new file mode 100644 index 00000000000..8d7b1bbc7b3 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/pack_fp16_ext.c @@ -0,0 +1,201 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "nnacl/fp16_grad/pack_fp16_ext.h" + +void RollingIm2ColPackDwUnitFp16(const float16_t *in_data, const ConvParameter *conv_param, float16_t *data_col_orig, + int real_cal_num, int start) { + const int pad_left = conv_param->pad_l_; + const int pad_up = conv_param->pad_u_; + + const int stride_h = conv_param->stride_h_; + const int stride_w = conv_param->stride_w_; + + const int dilation_h = conv_param->dilation_h_; + const int dilation_w = conv_param->dilation_w_; + + const int kernel_h = conv_param->kernel_h_; + const int kernel_w = conv_param->kernel_w_; + + const int in_height = conv_param->input_h_; + const int in_width = conv_param->input_w_; + + const int output_w = conv_param->output_w_; + + const int channels = conv_param->input_channel_; + const int stride = kernel_h * kernel_w; + + int kernel_row, kernel_col; + + for (int i = 0; i < real_cal_num; i++) { + int block_start = start + i; + int input_h = block_start / output_w * stride_h; + int input_w = block_start % output_w * stride_w; + float16_t *data_col = data_col_orig + i * channels * stride; + for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) { + int input_row = -pad_up + kernel_row * dilation_h + input_h; + for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) { + int input_col = -pad_left + kernel_col * dilation_w + input_w; + if (((unsigned)(input_row) < (unsigned)(in_height)) && ((unsigned)(input_col) < (unsigned)(in_width))) { + const int offset = (input_row * in_width + input_col) * channels; + for (int c = 0; c < channels; c++) { + data_col[c * stride] = in_data[offset + c]; + } + data_col++; + } else { + for (int c = 0; c < channels; c++) { + data_col[c * stride] = 0; + } + data_col++; + } + } + } + } +} + +void RollingIm2ColPackUnitFp16(const float16_t *input_data, const ConvParameter *conv_param, float16_t *packed_input, + int real_cal_num, int block_index) { + const int pad_left = conv_param->pad_l_; + const int pad_up = 
conv_param->pad_u_; + + const int stride_h = conv_param->stride_h_; + const int stride_w = conv_param->stride_w_; + + const int dilation_h = conv_param->dilation_h_; + const int dilation_w = conv_param->dilation_w_; + + const int kernel_h = conv_param->kernel_h_; + const int kernel_w = conv_param->kernel_w_; + + const int in_height = conv_param->input_h_; + const int in_width = conv_param->input_w_; + + const int output_w = conv_param->output_w_; + + const int channels = conv_param->input_channel_ / conv_param->group_; + const int tot_channels = conv_param->input_channel_; + + int kernel_row, kernel_col; + + if (channels == 1) { + for (int i = 0; i < real_cal_num; i++) { + int block_start = block_index + i; + int input_h = block_start / output_w * stride_h; + int input_w = block_start % output_w * stride_w; + for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) { + int input_row = -pad_up + kernel_row * dilation_h + input_h; + for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) { + int input_col = -pad_left + kernel_col * dilation_w + input_w; + if (((unsigned)(input_row) < (unsigned)(in_height)) && ((unsigned)(input_col) < (unsigned)(in_width))) { + const int offset = (input_row * in_width + input_col) * tot_channels; + *packed_input = input_data[offset]; + packed_input++; + } else { + *packed_input = 0; + packed_input++; + } + } + } + } + } else { + for (int i = 0; i < real_cal_num; i++) { + int block_start = block_index + i; + int input_h = block_start / output_w * stride_h; + int input_w = block_start % output_w * stride_w; + for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) { + int input_row = -pad_up + kernel_row * dilation_h + input_h; + for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) { + int input_col = -pad_left + kernel_col * dilation_w + input_w; + if (((unsigned)(input_row) < (unsigned)(in_height)) && ((unsigned)(input_col) < (unsigned)(in_width))) { + const int offset = (input_row * in_width + input_col) * tot_channels; + memcpy(packed_input, input_data + offset, sizeof(float16_t) * channels); + packed_input += channels; + } else { + memset(packed_input, 0, sizeof(float16_t) * channels); + packed_input += channels; + } + } + } + } + } +} + +void RollingCol2ImPackUnitFp16(const float16_t *data_col, float16_t *data_im, const ConvParameter *conv_param, + int real_cal_num, int block_index) { + const int pad_left = conv_param->pad_l_; + const int pad_up = conv_param->pad_u_; + + const int stride_h = conv_param->stride_h_; + const int stride_w = conv_param->stride_w_; + + const int dilation_h = conv_param->dilation_h_; + const int dilation_w = conv_param->dilation_w_; + + const int kernel_h = conv_param->kernel_h_; + const int kernel_w = conv_param->kernel_w_; + + const int in_height = conv_param->input_h_; + const int in_width = conv_param->input_w_; + + const int output_w = conv_param->output_w_; + const int channels = conv_param->input_channel_ / conv_param->group_; + const int tot_channels = conv_param->input_channel_; + + int kernel_row, kernel_col; + + if (channels == 1) { + for (int r = 0; r < real_cal_num; r++) { + int output_col = (block_index + r) % output_w; + int output_row = (block_index + r) / output_w; + int row_stride_offset = output_row * stride_h; + int col_stride_offset = output_col * stride_w; + for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) { + int input_row = -pad_up + kernel_row * dilation_h + row_stride_offset; + for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) { + int input_col = -pad_left + kernel_col * 
dilation_w + col_stride_offset; + if (((unsigned)(input_row) < (unsigned)(in_height)) && ((unsigned)(input_col) < (unsigned)(in_width))) { + int offset = (input_row * in_width + input_col) * tot_channels; + float16_t *data_im_ptr = data_im + offset; + *data_im_ptr += *data_col; + } + data_col++; + } + } + } + } else { + for (int r = 0; r < real_cal_num; r++) { + int output_col = (block_index + r) % output_w; + int output_row = (block_index + r) / output_w; + int row_stride_offset = output_row * stride_h; + int col_stride_offset = output_col * stride_w; + for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) { + int input_row = -pad_up + kernel_row * dilation_h + row_stride_offset; + for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) { + int input_col = -pad_left + kernel_col * dilation_w + col_stride_offset; + if (((unsigned)(input_row) < (unsigned)(in_height)) && ((unsigned)(input_col) < (unsigned)(in_width))) { + int offset = (input_row * in_width + input_col) * tot_channels; + float16_t *data_im_ptr = &data_im[offset]; + for (int i = 0; i < channels; i++) { + data_im_ptr[i] += data_col[i]; + } + } + data_col += channels; + } + } + } + } +} diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/pack_fp16_ext.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/pack_fp16_ext.h new file mode 100644 index 00000000000..64fd9a22ca9 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/pack_fp16_ext.h @@ -0,0 +1,37 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_PACK_FP16_EXT_H_ +#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_PACK_FP16_EXT_H_ + +#include +#include "nnacl/conv_parameter.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void RollingIm2ColPackUnitFp16(const float16_t *input_data, const ConvParameter *conv_param, float16_t *packed_input, + int real_cal_num, int block_index); +void RollingIm2ColPackDwUnitFp16(const float16_t *input_data, const ConvParameter *conv_param, float16_t *packed_input, + int real_cal_num, int block_index); +void RollingCol2ImPackUnitFp16(const float16_t *data_col, float16_t *data_im, const ConvParameter *conv_param, + int real_cal_num, int block_index); +#ifdef __cplusplus +} +#endif + +#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_PACK_FP16_EXT_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/pooling_grad.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/pooling_grad.c new file mode 100644 index 00000000000..a5e996fe376 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/pooling_grad.c @@ -0,0 +1,189 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
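/* Illustrative aside (not part of the patch): the unsigned-compare bounds trick used by
 * the packing routines above. Casting a possibly negative row/column index to unsigned
 * folds the "index >= 0 && index < limit" test into a single comparison. Single channel,
 * stride 1, dilation 1, plain float; names are illustrative. */
#include <stdio.h>

/* Pack one kernel-sized patch into a column buffer, writing zeros for taps that
 * fall outside the input, in the spirit of the rolling im2col above. */
static void Im2ColPatchRef(const float *in, int in_h, int in_w, int pad, int kernel,
                           int out_y, int out_x, float *col) {
  for (int kr = 0; kr < kernel; ++kr) {
    int row = -pad + kr + out_y;
    for (int kc = 0; kc < kernel; ++kc) {
      int col_idx = -pad + kc + out_x;
      /* one unsigned compare covers both row < 0 and row >= in_h */
      if ((unsigned)row < (unsigned)in_h && (unsigned)col_idx < (unsigned)in_w) {
        *col++ = in[row * in_w + col_idx];
      } else {
        *col++ = 0.0f;  /* implicit zero padding */
      }
    }
  }
}

int main(void) {
  float in[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};  /* 3x3 input */
  float col[9];
  Im2ColPatchRef(in, 3, 3, 1, 3, 0, 0, col);  /* top-left output position, pad = 1 */
  for (int i = 0; i < 9; ++i) printf("%g ", col[i]);
  printf("\n");  /* expected: 0 0 0 0 1 2 0 4 5 */
  return 0;
}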
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include "nnacl/fp16_grad/pooling_grad.h" + +void AvgPoolingFp16Grad(const float16_t *input_ptr, float16_t *output_ptr, int count, PoolingParameter *pooling_param) { + int stride_w = pooling_param->stride_w_; + int stride_h = pooling_param->stride_h_; + int pad_w = pooling_param->pad_l_; + int pad_h = pooling_param->pad_u_; + int win_w = pooling_param->window_w_; + int win_h = pooling_param->window_h_; + int channel = pooling_param->input_channel_; + int in_w = pooling_param->input_w_; + int in_h = pooling_param->input_h_; + int output_w = pooling_param->output_w_; + int output_h = pooling_param->output_h_; + + const float16_t kk = 1.0f / (float16_t)(win_h * win_w); +#if ENABLE_NEON + const float16x4_t factor = vdup_n_f16(kk); +#endif + for (int ib = 0; ib < count; ib++) { + float16_t *out = &output_ptr[(ib * in_h * in_w * channel)]; + const float16_t *inPtr = &input_ptr[(ib * output_h * output_w * channel)]; + // iterate over yt + for (int yh = 0; yh < output_h; yh++) { + int over_h = pad_h - yh * stride_h; + int kh_s = MSMAX(0, over_h); + int kh_e = MSMIN(win_h, in_h + over_h); + for (int yw = 0; yw < output_w; yw++) { + int over_w = pad_w - yw * stride_w; + int kw_s = MSMAX(0, over_w); + int kw_e = MSMIN(win_w, in_w + over_w); + int ic = 0; + for (; ic < channel - 4; ic += 4) { + int idx = (yw + yh * output_w) * channel + ic; +#ifdef ENABLE_NEON + float16x4_t in = vld1_f16(inPtr + idx); + float16x4_t delta = vmul_f16(in, factor); +#else + float16_t delta[4] = {inPtr[idx], inPtr[idx + 1], inPtr[idx + 2], inPtr[idx + 3]}; + for (int i = 0; i < 4; i++) delta[i] *= kk; +#endif + for (int kh = kh_s; kh < kh_e; kh++) { + int xh = yh * stride_h + kh - pad_h; + for (int kw = kw_s; kw < kw_e; kw++) { + int xw = yw * stride_w + kw - pad_w; +#ifdef ENABLE_NEON + float16_t *out_vec = out + (xw + in_w * xh) * channel + ic; + float16x4_t outr = vld1_f16(out + (xw + in_w * xh) * channel + ic); + float16x4_t outs = vadd_f16(outr, delta); + vst1_f16(out_vec, outs); +#else + + for (int i = 0; i < 4; i++) { + out[(xw + in_w * xh) * channel + ic + i] += ((float16_t *)&delta)[i]; + } +#endif + } + } + } + for (; ic < channel; ic++) { + int idx = (yw + yh * output_w) * channel + ic; + float16_t delta = inPtr[idx] * kk; + for (int kh = kh_s; kh < kh_e; kh++) { + int xh = yh * stride_h + kh - pad_h; + for (int kw = kw_s; kw < kw_e; kw++) { + int xw = yw * stride_w + kw - pad_w; + out[(xw + in_w * xh) * channel + ic] += delta; + } + } + } + } + } + } +} + +#ifdef ENABLE_NEON +static int32x4_t MaxIndex(float16x4_t in, float16x4_t *max, uint32x4_t index, uint32x4_t prev_index) { + uint16x4_t res = vcgt_f16(in, *max); + int16x4_t tmp = vreinterpret_s16_u16(res); + uint32x4_t res_tmp = vreinterpretq_u32_s32(vmovl_s16(tmp)); + int32x4_t m_index = vbslq_s32(res_tmp, index, prev_index); + *max = vbsl_f16(res, in, *max); + return m_index; +} +#endif + +void MaxPoolingFp16Grad(const float16_t *input_ptr, const float16_t *dy_ptr, float16_t *output_ptr, int output_batch, + PoolingParameter *pooling_param) { + int stride_w = pooling_param->stride_w_; + int stride_h = 
pooling_param->stride_h_;
+  int pad_w = pooling_param->pad_l_;
+  int pad_h = pooling_param->pad_u_;
+  int win_w = pooling_param->window_w_;
+  int win_h = pooling_param->window_h_;
+  int channel = pooling_param->input_channel_;
+  int in_w = pooling_param->input_w_;
+  int in_h = pooling_param->input_h_;
+  int output_w = pooling_param->output_w_;
+  int output_h = pooling_param->output_h_;
+
+  for (int ib = 0; ib < output_batch; ib++) {
+    float16_t *out = &output_ptr[(ib * in_h * in_w * channel)];
+    const float16_t *inPtr = &input_ptr[(ib * in_h * in_w * channel)];
+    const float16_t *dyPtr = &dy_ptr[(ib * output_h * output_w * channel)];
+    for (int yh = 0; yh < output_h; yh++) {
+      int over_h = pad_h - yh * stride_h;
+      int kh_s = MSMAX(0, over_h);
+      int kh_e = MSMIN(win_h, in_h + over_h);
+      for (int yw = 0; yw < output_w; yw++) {
+        int over_w = pad_w - yw * stride_w;
+        int kw_s = MSMAX(0, over_w);
+        int kw_e = MSMIN(win_w, in_w + over_w);
+        int ic = 0;
+        for (; ic < (channel & ~3); ic += 4) {
+          int idx = (yw + yh * output_w) * channel + ic;
+#ifdef ENABLE_NEON
+          uint32x4_t max_idx = vdupq_n_u32(0);
+          float16x4_t max_val = vdup_n_f16(-FLT16_MAX);
+          float16x4_t delta = vld1_f16(dyPtr + idx);
+#else
+          float16_t delta[4] = {dyPtr[idx], dyPtr[idx + 1], dyPtr[idx + 2], dyPtr[idx + 3]};
+          float16_t max_val[4] = {-FLT16_MAX, -FLT16_MAX, -FLT16_MAX, -FLT16_MAX};
+          uint max_idx[4] = {0};
+#endif
+          for (int kh = kh_s; kh < kh_e; kh++) {
+            int xh = yh * stride_h + kh - pad_h;
+            for (int kw = kw_s; kw < kw_e; kw++) {
+              int xw = yw * stride_w + kw - pad_w;
+              int val_idx = (xw + in_w * xh) * channel + ic;
+#ifdef ENABLE_NEON
+              uint32x4_t index = {val_idx, val_idx + 1, val_idx + 2, val_idx + 3};
+              float16x4_t in = vld1_f16(inPtr + val_idx);
+              max_idx = MaxIndex(in, &max_val, index, max_idx);
+#else
+              float16_t val[4] = {inPtr[val_idx], inPtr[val_idx + 1], inPtr[val_idx + 2], inPtr[val_idx + 3]};
+              for (int i = 0; i < 4; i++) {
+                if (val[i] > max_val[i]) {
+                  max_val[i] = val[i];
+                  max_idx[i] = val_idx + i;
+                }
+              }
+#endif
+            }
+          }
+          for (int i = 0; i < 4; i++) {
+            out[((int *)&max_idx)[i]] += ((float16_t *)&delta)[i];
+          }
+        }
+        for (; ic < channel; ic++) {
+          float16_t max_val = -FLT16_MAX;
+          int max_idx = 0;
+          int idx = (yw + yh * output_w) * channel + ic;
+          float16_t delta = dyPtr[idx];
+          for (int kh = kh_s; kh < kh_e; kh++) {
+            int xh = yh * stride_h + kh - pad_h;
+            for (int kw = kw_s; kw < kw_e; kw++) {
+              int xw = yw * stride_w + kw - pad_w;
+              int val_idx = (xw + in_w * xh) * channel + ic;
+              float16_t val = inPtr[val_idx];
+              if (val > max_val) {
+                max_val = val;
+                max_idx = val_idx;
+              }
+            }
+          }
+          out[max_idx] += delta;
+        }
+      }
+    }
+  }
+}
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/pooling_grad.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/pooling_grad.h
new file mode 100644
index 00000000000..fa3da0850d7
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/pooling_grad.h
@@ -0,0 +1,32 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
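/* Illustrative aside (not part of the patch): the routing rule MaxPoolingFp16Grad
 * implements, shown for one pooling window in plain float. The upstream gradient is
 * added only at the input position that produced the pooled maximum; every other
 * position receives nothing. Names are illustrative. */
#include <stdio.h>
#include <float.h>

/* Route dy for one pooled output back to the argmax of its window. */
static void MaxPoolGradWindowRef(const float *x, int in_w, int win_h, int win_w,
                                 int y0, int x0, float dy, float *dx) {
  float max_val = -FLT_MAX;
  int max_idx = 0;
  for (int kh = 0; kh < win_h; ++kh) {
    for (int kw = 0; kw < win_w; ++kw) {
      int idx = (y0 + kh) * in_w + (x0 + kw);
      if (x[idx] > max_val) {
        max_val = x[idx];
        max_idx = idx;   /* remember where the maximum came from */
      }
    }
  }
  dx[max_idx] += dy;     /* only the argmax receives gradient */
}

int main(void) {
  float x[4] = {0.5f, 2.0f, 1.0f, -1.0f};   /* 2x2 input, max at index 1 */
  float dx[4] = {0};
  MaxPoolGradWindowRef(x, 2, 2, 2, 0, 0, 3.0f, dx);
  printf("%g %g %g %g\n", dx[0], dx[1], dx[2], dx[3]);  /* 0 3 0 0 */
  return 0;
}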
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_POOLING_GRAD_H_ +#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_POOLING_GRAD_H_ + +#include "nnacl/fp16/pooling_fp16.h" + +#ifdef __cplusplus +extern "C" { +#endif +void AvgPoolingFp16Grad(const float16_t *input_ptr, float16_t *output_ptr, int count, PoolingParameter *pooling_param); +void MaxPoolingFp16Grad(const float16_t *input_ptr, const float16_t *dy_ptr, float16_t *output_ptr, int output_batch, + PoolingParameter *pooling_param); +#ifdef __cplusplus +} +#endif + +#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_POOLING_GRAD_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/resize_grad.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/resize_grad.c new file mode 100644 index 00000000000..600fea3f8c9 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/resize_grad.c @@ -0,0 +1,142 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "nnacl/fp16_grad/resize_grad.h" +#include +#include "nnacl/infer/common_infer.h" + +void ResizeNearestNeighborFp16Grad(float16_t *in_addr, float16_t *out_addr, int batch_size, int channel, int format, + ResizeFp16GradParameter *param) { + bool align_corners = param->align_corners_; + size_t in_hw_size = param->in_width_ * param->in_height_; + size_t out_hw_size = param->out_width_ * param->out_height_; + + if (format == Format_NHWC) { + for (int32_t b = 0; b < batch_size; ++b) { + for (size_t i = 0; i < in_hw_size; ++i) { + size_t in_y = i / param->in_width_; + size_t in_x = i % param->in_width_; + for (int32_t c = 0; c < channel; ++c) { + size_t out_y = MSMIN( + (align_corners) ? (size_t)roundf(in_y * param->height_scale_) : (size_t)floorf(in_y * param->height_scale_), + param->out_height_ - 1); + size_t out_x = MSMIN( + (align_corners) ? (size_t)roundf(in_x * param->width_scale_) : (size_t)floorf(in_x * param->width_scale_), + param->out_width_ - 1); + size_t out_offset = out_y * (param->out_width_ * channel) + (out_x * channel) + c; + size_t in_offset = in_y * (param->in_width_ * channel) + (in_x * channel) + c; + out_addr[out_offset] += in_addr[in_offset]; + } + } + out_addr += out_hw_size * channel; + in_addr += in_hw_size * channel; + } + } else if (format == Format_NCHW) { + for (int32_t b = 0; b < batch_size; ++b) { + for (int32_t c = 0; c < channel; ++c) { + for (size_t h = 0; h < param->in_height_; ++h) { + size_t out_y = + MSMIN((align_corners) ? (size_t)roundf(h * param->height_scale_) : (size_t)floorf(h * param->height_scale_), + param->out_height_ - 1); + for (size_t w = 0; w < param->in_width_; ++w) { + size_t out_x = + MSMIN((align_corners) ? 
(size_t)roundf(w * param->width_scale_) : (size_t)floorf(w * param->width_scale_), + param->out_width_ - 1); + out_addr[out_y * param->out_width_ + out_x] += in_addr[h * param->in_width_ + w]; + } + } + out_addr += out_hw_size; + in_addr += in_hw_size; + } + } + } +} + +void ResizeBiLinearFp16Grad(float16_t *in_addr, float16_t *out_addr, int batch_size, int channel, int format, + ResizeFp16GradParameter *param) { + size_t in_hw_size = param->in_width_ * param->in_height_; + size_t out_hw_size = param->out_width_ * param->out_height_; + + if (format == Format_NHWC) { + for (int32_t b = 0; b < batch_size; ++b) { + for (size_t i = 0; i < in_hw_size; ++i) { + size_t h = i / param->in_width_; + size_t w = i % param->in_width_; + for (int32_t c = 0; c < channel; ++c) { + float16_t in_y = (float16_t)h * param->height_scale_; + size_t top_y_index = MSMAX((size_t)(floorf(in_y)), (size_t)(0)); + size_t bottom_y_index = MSMIN((size_t)(ceilf(in_y)), param->out_height_ - 1); + float16_t y_lerp = in_y - floorf(in_y); + float16_t inverse_y_lerp = 1.0 - y_lerp; + + float16_t in_x = (float16_t)w * param->width_scale_; + size_t left_x_index = MSMAX((size_t)(floorf(in_x)), (size_t)(0)); + size_t right_x_index = MSMIN((size_t)(ceilf(in_x)), param->out_width_ - 1); + float16_t x_lerp = in_x - floorf(in_x); + float16_t inverse_x_lerp = 1.0 - x_lerp; + + size_t in_offset = h * (param->in_width_ * channel) + (w * channel) + c; + size_t out_offset_top_y_left_x = top_y_index * (param->out_width_ * channel) + (left_x_index * channel) + c; + size_t out_offset_top_y_right_x = top_y_index * (param->out_width_ * channel) + (right_x_index * channel) + c; + size_t out_offset_bottom_y_left_x = + bottom_y_index * (param->out_width_ * channel) + (left_x_index * channel) + c; + size_t out_offset_bottom_y_right_x = + bottom_y_index * (param->out_width_ * channel) + (right_x_index * channel) + c; + + out_addr[out_offset_top_y_left_x] += in_addr[in_offset] * (float16_t)(inverse_y_lerp * inverse_x_lerp); + out_addr[out_offset_top_y_right_x] += in_addr[in_offset] * (float16_t)(inverse_y_lerp * x_lerp); + out_addr[out_offset_bottom_y_left_x] += in_addr[in_offset] * (float16_t)(y_lerp * inverse_x_lerp); + out_addr[out_offset_bottom_y_right_x] += in_addr[in_offset] * (float16_t)(y_lerp * x_lerp); + } + } + out_addr += out_hw_size * channel; + in_addr += in_hw_size * channel; + } + } else if (format == Format_NCHW) { + size_t in_height = param->in_height_; + size_t in_width = param->in_width_; + size_t out_height = param->out_height_; + size_t out_width = param->out_width_; + + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channel; ++c) { + for (size_t h = 0; h < in_height; ++h) { + const float16_t in_y = (float16_t)(h)*param->height_scale_; + const size_t top_y_index = MSMAX((size_t)floorf(in_y), 0); + const size_t bottom_y_index = MSMIN((size_t)ceilf(in_y), out_height - 1); + const float16_t y_lerp = in_y - floorf(in_y); + const float16_t inverse_y_lerp = 1.0 - y_lerp; + for (size_t w = 0; w < in_width; ++w) { + const float16_t in_x = (float16_t)(w)*param->width_scale_; + const size_t left_x_index = MSMAX((size_t)floorf(in_x), 0); + const size_t right_x_index = MSMIN((size_t)ceilf(in_x), out_width - 1); + const float16_t x_lerp = in_x - floorf(in_x); + const float16_t inverse_x_lerp = 1.0 - x_lerp; + out_addr[top_y_index * out_width + left_x_index] += + in_addr[h * in_width + w] * (float16_t)(inverse_y_lerp * inverse_x_lerp); + out_addr[top_y_index * out_width + right_x_index] += + in_addr[h * in_width + w] * 
(float16_t)(inverse_y_lerp * x_lerp); + out_addr[bottom_y_index * out_width + left_x_index] += + in_addr[h * in_width + w] * (float16_t)(y_lerp * inverse_x_lerp); + out_addr[bottom_y_index * out_width + right_x_index] += + in_addr[h * in_width + w] * (float16_t)(y_lerp * x_lerp); + } + } + out_addr += out_hw_size; + in_addr += in_hw_size; + } + } + } +} diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/resize_grad.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/resize_grad.h new file mode 100644 index 00000000000..a4b22c0191f --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/resize_grad.h @@ -0,0 +1,45 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_RESIZE_GRAD_H_ +#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_RESIZE_GRAD_H_ + +#include "nnacl/op_base.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct ResizeFp16GradParameter { + OpParameter op_parameter_; + bool align_corners_; + int method; + size_t in_height_; + size_t in_width_; + size_t out_height_; + size_t out_width_; + float16_t height_scale_; + float16_t width_scale_; +} ResizeFp16GradParameter; + +void ResizeNearestNeighborFp16Grad(float16_t *in_addr, float16_t *out_addr, int batch_size, int channel, int format, + ResizeFp16GradParameter *param); +void ResizeBiLinearFp16Grad(float16_t *in_addr, float16_t *out_addr, int batch_size, int channel, int format, + ResizeFp16GradParameter *param); +#ifdef __cplusplus +} +#endif +#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_RESIZE_GRAD_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/strided_slice_grad.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/strided_slice_grad.c new file mode 100644 index 00000000000..75facd1d305 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/strided_slice_grad.c @@ -0,0 +1,62 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
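/* Illustrative aside (not part of the patch): how ResizeBiLinearFp16Grad distributes one
 * upstream gradient value over its four bilinear neighbours. The four lerp weights sum
 * to 1, so no gradient mass is lost. Plain float; names are illustrative. */
#include <stdio.h>
#include <math.h>

static void BilinearGradScatterRef(float grad, float in_y, float in_x,
                                   int out_h, int out_w, float *out) {
  int top = (int)floorf(in_y), left = (int)floorf(in_x);
  int bottom = (int)ceilf(in_y), right = (int)ceilf(in_x);
  if (bottom > out_h - 1) bottom = out_h - 1;   /* clamp to the coarse grid */
  if (right > out_w - 1) right = out_w - 1;
  float y_lerp = in_y - floorf(in_y), x_lerp = in_x - floorf(in_x);
  out[top * out_w + left] += grad * (1.0f - y_lerp) * (1.0f - x_lerp);
  out[top * out_w + right] += grad * (1.0f - y_lerp) * x_lerp;
  out[bottom * out_w + left] += grad * y_lerp * (1.0f - x_lerp);
  out[bottom * out_w + right] += grad * y_lerp * x_lerp;
}

int main(void) {
  float out[4] = {0};                     /* 2x2 coarse gradient buffer */
  BilinearGradScatterRef(1.0f, 0.25f, 0.75f, 2, 2, out);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 0.1875 0.5625 0.0625 0.1875 */
  return 0;
}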
+ */ + +#include "nnacl/fp16_grad/strided_slice_grad.h" +#include "nnacl/errorcode.h" + +static size_t CalcIndex(const int *shape, size_t size, int i, size_t pos) { + size_t res = 1; + for (size_t j = 0; j < size; j++) { + res *= shape[(i + 1) + j]; + } + return (pos / res % shape[i]); +} + +int DoStridedSliceFp16Grad(const float16_t *inputs, float16_t *output, const int *dx_shape, + StridedSliceParameter *param) { + if (inputs == NULL || output == NULL || param == NULL) { + return NNACL_NULL_PTR; + } + if (param->num_axes_ > DIMENSION_7D) { + return NNACL_PARAM_INVALID; + } + + size_t size = 1; + int *s = param->strides_; + int *b = param->begins_; + for (int i = 0; i < DIMENSION_7D; i++) { + size *= param->in_shape_[i]; + } + + for (size_t pos = 0; pos < size; pos++) { + size_t i = CalcIndex(param->in_shape_, 6, 0, pos); + size_t j = CalcIndex(param->in_shape_, 5, 1, pos); + size_t k = CalcIndex(param->in_shape_, 4, 2, pos); + size_t l = CalcIndex(param->in_shape_, 3, 3, pos); + size_t m = CalcIndex(param->in_shape_, 2, 4, pos); + size_t n = CalcIndex(param->in_shape_, 1, 5, pos); + size_t o = CalcIndex(param->in_shape_, 0, 6, pos); + + size_t input_idx = + (i * s[0] + b[0]) * dx_shape[1] * dx_shape[2] * dx_shape[3] * dx_shape[4] * dx_shape[5] * dx_shape[6] + + (j * s[1] + b[1]) * dx_shape[2] * dx_shape[3] * dx_shape[4] * dx_shape[5] * dx_shape[6] + + (k * s[2] + b[2]) * dx_shape[3] * dx_shape[4] * dx_shape[5] * dx_shape[6] + + (l * s[3] + b[3]) * dx_shape[4] * dx_shape[5] * dx_shape[6] + (m * s[4] + b[4]) * dx_shape[5] * dx_shape[6] + + (n * s[5] + b[5]) * dx_shape[6] + (o * s[6] + b[6]); + output[input_idx] = inputs[pos]; + } + return NNACL_OK; +} diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/strided_slice_grad.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/strided_slice_grad.h new file mode 100644 index 00000000000..0823db6dbf0 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/strided_slice_grad.h @@ -0,0 +1,31 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
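/* Illustrative aside (not part of the patch): a 1-D version of the scatter performed by
 * DoStridedSliceFp16Grad. Each element of the sliced gradient is written back to position
 * begin + i * stride of the full-sized gradient; the buffer is zeroed here only to make
 * the example self-contained. Names are illustrative. */
#include <stdio.h>

static void StridedSliceGrad1DRef(const float *dy, int dy_len, int begin, int stride,
                                  float *dx, int dx_len) {
  for (int i = 0; i < dx_len; ++i) dx[i] = 0.0f;   /* positions not covered by the slice stay zero */
  for (int i = 0; i < dy_len; ++i) dx[begin + i * stride] = dy[i];
}

int main(void) {
  float dy[3] = {1.0f, 2.0f, 3.0f};
  float dx[8];
  StridedSliceGrad1DRef(dy, 3, 1, 2, dx, 8);       /* forward slice was x[1:7:2] */
  for (int i = 0; i < 8; ++i) printf("%g ", dx[i]);
  printf("\n");  /* 0 1 0 2 0 3 0 0 */
  return 0;
}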
+ */ +#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_STRIDED_SLICE_GRAD_H_ +#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_STRIDED_SLICE_GRAD_H_ + +#include "nnacl/op_base.h" +#include "nnacl/strided_slice_parameter.h" + +#ifdef __cplusplus +extern "C" { +#endif +int DoStridedSliceFp16Grad(const float16_t *inputs, float16_t *output, const int *dx_shape, + StridedSliceParameter *param); +#ifdef __cplusplus +} +#endif + +#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_STRIDED_SLICE_GRAD_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/unsorted_segment_sum.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/unsorted_segment_sum.c new file mode 100644 index 00000000000..9d8c4aa1f19 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/unsorted_segment_sum.c @@ -0,0 +1,33 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "nnacl/fp16_grad/unsorted_segment_sum.h" +#include "nnacl/errorcode.h" + +int UnsortedSegmentSumFp16(const float16_t *input, int unit_num, int input_dim1, const int *indices, float16_t *output, + int output_dim0, int output_dim1) { + for (int i = 0; i < unit_num; ++i) { + int j = i / input_dim1; + int k = i % input_dim1; + + int index = indices[j]; + if (index < 0 || index >= output_dim0) { + continue; + } + int output_index = index * output_dim1 + k; + output[output_index] += input[i]; + } + return NNACL_OK; +} diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/unsorted_segment_sum.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/unsorted_segment_sum.h new file mode 100644 index 00000000000..9ea09e31294 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16_grad/unsorted_segment_sum.h @@ -0,0 +1,31 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
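/* Illustrative aside (not part of the patch): the same reduction as UnsortedSegmentSumFp16
 * in plain float. Rows of `input` are summed into the output row selected by `indices`;
 * out-of-range segment ids are skipped. The output is cleared here only to keep the
 * example self-contained. Names are illustrative. */
#include <stdio.h>

static void UnsortedSegmentSumRef(const float *input, int num_rows, int row_len,
                                  const int *indices, float *output, int num_segments) {
  for (int i = 0; i < num_segments * row_len; ++i) output[i] = 0.0f;
  for (int r = 0; r < num_rows; ++r) {
    int seg = indices[r];
    if (seg < 0 || seg >= num_segments) continue;  /* ignore invalid segment ids */
    for (int c = 0; c < row_len; ++c) output[seg * row_len + c] += input[r * row_len + c];
  }
}

int main(void) {
  const float input[6] = {1, 2, 3, 4, 5, 6};   /* 3 rows x 2 cols */
  const int indices[3] = {0, 1, 0};
  float output[4];                              /* 2 segments x 2 cols */
  UnsortedSegmentSumRef(input, 3, 2, indices, output, 2);
  printf("%g %g | %g %g\n", output[0], output[1], output[2], output[3]);  /* 6 8 | 3 4 */
  return 0;
}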
+ */ + +#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_UNSORTED_SEGMENT_SUM_H_ +#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_UNSORTED_SEGMENT_SUM_H_ + +#include "nnacl/op_base.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int UnsortedSegmentSumFp16(const float16_t *input, int unit_num, int input_dim1, const int *indices, float16_t *output, + int output_dim0, int output_dim1); +#ifdef __cplusplus +} +#endif +#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_UNSORTED_SEGMENT_SUM_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/convolution_grad_filter.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/convolution_grad_filter.c index 3fdf8570a4b..24214089150 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/convolution_grad_filter.c +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/convolution_grad_filter.c @@ -321,7 +321,7 @@ static int Filtergrad2Arm(const float *x, const float *dy, int i_c, int k_idx, f dw[(i_c + 1) * k_spatial + k_idx] = sum_2[1]; i_c += 2; } - return i_c += 2; + return i_c; } #endif int ConvDwFilterGrad(const float *x, const float *dy, float *dw, int start, int count, diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions_fp16.h index 890711a0ef5..fe83cafbd5f 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions_fp16.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ms_simd_instructions_fp16.h @@ -118,4 +118,11 @@ static inline float16x8_t MS_ERFX8_F16(float16x8_t src) { dst[7] = erff(src[7]); return dst; } + +static inline float32x4_t MS_VMLAL_F16(float16x4_t x, float16x4_t dy, float32x4_t sum) { + float32x4_t x_fp32 = MS_CVT_F32_F16(x); + float32x4_t dy_fp32 = MS_CVT_F32_F16(dy); + return vmlaq_f32(sum, x_fp32, dy_fp32); +} + #endif // MINDSPORE_NNACL_INTRINSICS_MS_SIMD_INSTRUCTIONS_FP16_H_ diff --git a/mindspore/lite/src/ops/populate/v0/partial_populate_v0.cc b/mindspore/lite/src/ops/populate/v0/partial_populate_v0.cc index 3eae49ef8f0..158701b70bf 100644 --- a/mindspore/lite/src/ops/populate/v0/partial_populate_v0.cc +++ b/mindspore/lite/src/ops/populate/v0/partial_populate_v0.cc @@ -39,7 +39,7 @@ OpParameter *PopulatePartialParameter(const void *prim) { MS_LOG(ERROR) << "malloc partial parameter failed."; return nullptr; } - memset(partial_parameter, 0, sizeof(PartialParameter)); + memset(reinterpret_cast(partial_parameter), 0, sizeof(PartialParameter)); partial_parameter->op_parameter_.type_ = schema::PrimitiveType_PartialFusion; partial_parameter->sub_graph_index_ = partial_prim->subGraphIndex(); diff --git a/mindspore/lite/src/ops/populate/v0/while_populate_v0.cc b/mindspore/lite/src/ops/populate/v0/while_populate_v0.cc index 443ace2c25e..47f8417bd80 100644 --- a/mindspore/lite/src/ops/populate/v0/while_populate_v0.cc +++ b/mindspore/lite/src/ops/populate/v0/while_populate_v0.cc @@ -39,7 +39,7 @@ OpParameter *PopulateWhileParameter(const void *prim) { MS_LOG(ERROR) << "malloc WhileParemeter failed."; return nullptr; } - memset(while_paremeter, 0, sizeof(WhileParemeter)); + memset(reinterpret_cast(while_paremeter), 0, sizeof(WhileParemeter)); while_paremeter->op_parameter_.type_ = schema::PrimitiveType_While; while_paremeter->body_subgraph_index = while_prim->bodySubgraphIndex(); diff --git 
a/mindspore/lite/src/ops/populate/while_populate.cc b/mindspore/lite/src/ops/populate/while_populate.cc index ea0f12bb66d..742fba6a8c9 100644 --- a/mindspore/lite/src/ops/populate/while_populate.cc +++ b/mindspore/lite/src/ops/populate/while_populate.cc @@ -38,7 +38,7 @@ OpParameter *PopulateWhileParemeter(const void *prim) { MS_LOG(ERROR) << "malloc WhileParemeter failed."; return nullptr; } - memset(param, 0, sizeof(WhileParemeter)); + memset(reinterpret_cast(param), 0, sizeof(WhileParemeter)); param->op_parameter_.type_ = primitive->value_type(); param->body_subgraph_index = value->body_subgraph_index(); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/activation_fp16_grad.cc b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/activation_fp16_grad.cc index 0acff65b01c..5091d5b1a55 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/activation_fp16_grad.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/activation_fp16_grad.cc @@ -52,10 +52,26 @@ int ActivationGradCPUKernelFp16::DoActivation(int task_id) { auto error_code = RET_OK; if (param_act_grad_->type_ == schema::ActivationType_RELU) { - error_code = Fp16ReluGrad(yt_addr + start, input_addr + start, count, output_addr + start); + error_code = ReluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start); + } else if (param_act_grad_->type_ == schema::ActivationType_RELU6) { + error_code = Relu6Fp16Grad(yt_addr + start, input_addr + start, count, output_addr + start); + } else if (param_act_grad_->type_ == schema::ActivationType_LEAKY_RELU) { + error_code = LReluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start, + (float16_t)param_act_grad_->alpha_); } else if (param_act_grad_->type_ == schema::ActivationType_SIGMOID) { // Sigmoid gets the input tensors in reverse order! - error_code = Fp16SigmoidGrad(input_addr + start, yt_addr + start, count, output_addr + start); + error_code = SigmoidFp16Grad(input_addr + start, yt_addr + start, count, output_addr + start); + } else if (param_act_grad_->type_ == schema::ActivationType_TANH) { + error_code = TanhFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start); + } else if (param_act_grad_->type_ == schema::ActivationType_HSWISH) { + error_code = HSwishFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start); + } else if (param_act_grad_->type_ == schema::ActivationType_HSIGMOID) { + error_code = HSigmoidFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start); + } else if (param_act_grad_->type_ == schema::ActivationType_ELU) { + error_code = + EluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start, (float16_t)param_act_grad_->alpha_); + } else if (param_act_grad_->type_ == schema::ActivationType_GELU) { + error_code = GeluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start); } else { MS_LOG(ERROR) << "Activation type error"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/activation_fp16_grad.h b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/activation_fp16_grad.h index 5390875b2a7..982794820cf 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/activation_fp16_grad.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/activation_fp16_grad.h @@ -14,12 +14,13 @@ * limitations under the License. 
*/ -#ifndef MINDSPORE_ACTIVATION_FP16_GRAD_H -#define MINDSPORE_ACTIVATION_FP16_GRAD_H +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_ACTIVATION_FP16_GRAD_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_ACTIVATION_FP16_GRAD_H_ #include #include "src/inner_kernel.h" #include "nnacl/fp16_grad/activation_grad.h" +#include "nnacl/fp32_grad/activation_grad.h" namespace mindspore::kernel { class ActivationGradCPUKernelFp16 : public InnerKernel { @@ -27,7 +28,7 @@ class ActivationGradCPUKernelFp16 : public InnerKernel { explicit ActivationGradCPUKernelFp16(OpParameter *param, const std::vector &inputs, const std::vector &outputs, const lite::InnerContext *ctx) : InnerKernel(param, inputs, outputs, ctx), thread_count_(ctx->thread_num_) { - param_act_grad_ = reinterpret_cast(param); + param_act_grad_ = reinterpret_cast(param); } ~ActivationGradCPUKernelFp16() override = default; @@ -37,9 +38,9 @@ class ActivationGradCPUKernelFp16 : public InnerKernel { int DoActivation(int task_id); private: - ActivationGradParameterFp16 *param_act_grad_; + ActivationGradParameter *param_act_grad_; int thread_count_; }; } // namespace mindspore::kernel -#endif // MINDSPORE_ACTIVATION_FP16_GRAD_H +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_ACTIVATION_FP16_GRAD_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/arithmetic_fp16_grad.cc b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/arithmetic_fp16_grad.cc new file mode 100644 index 00000000000..4a23150b3bf --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/arithmetic_fp16_grad.cc @@ -0,0 +1,89 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/kernel/arm/fp16_grad/arithmetic_fp16_grad.h" +#include "schema/model_generated.h" +#include "src/kernel_registry.h" +#include "nnacl/fp16_grad/arithmetic_grad.h" +#include "include/errorcode.h" + +using mindspore::kernel::KERNEL_ARCH::kCPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; + +namespace mindspore::kernel { + +int ArithmeticGradCPUKernelFp16::Init() { return RET_OK; } + +void ArithmeticGradCPUKernelFp16::ArithmeticGradMaximum(float16_t *dy, int dy_size, float16_t *dx1, int dx1_size, + float16_t *dx2, int dx2_size) { + auto x1 = reinterpret_cast(in_tensors_[0]->data_c()); + auto x2 = reinterpret_cast(in_tensors_[1]->data_c()); + dy = reinterpret_cast(in_tensors_[2]->data_c()); + + MaximumByAxesFp16(x1, x2, dy, arithmeticParameter_->in_shape0_, arithmeticParameter_->in_shape1_, + arithmeticParameter_->out_shape_, dx1, dx2, arithmeticParameter_->ndim_); +} + +void ArithmeticGradCPUKernelFp16::ArithmeticGradMinimum(float16_t *dy, int dy_size, float16_t *dx1, int dx1_size, + float16_t *dx2, int dx2_size) { + auto x1 = reinterpret_cast(in_tensors_[0]->data_c()); + auto x2 = reinterpret_cast(in_tensors_[1]->data_c()); + dy = reinterpret_cast(in_tensors_[2]->data_c()); + + MinimumByAxesFp16(x1, x2, dy, arithmeticParameter_->in_shape0_, arithmeticParameter_->in_shape1_, + arithmeticParameter_->out_shape_, dx1, dx2, arithmeticParameter_->ndim_); +} + +int ArithmeticGradCPUKernelFp16::ReSize() { return RET_OK; } + +int ArithmeticGradCPUKernelFp16::Execute(int task_id) { + auto dy = reinterpret_cast(in_tensors_[0]->data_c()); + auto dx1 = reinterpret_cast(out_tensors_[0]->data_c()); + auto dx2 = reinterpret_cast(out_tensors_[1]->data_c()); + + size_t dy_size = in_tensors_.at(0)->ElementsNum(); + size_t dx1_size = out_tensors_.at(0)->ElementsNum(); + size_t dx2_size = out_tensors_.at(1)->ElementsNum(); + (this->*arithmetic_grad_)(dy, dy_size, dx1, dx1_size, dx2, dx2_size); + return RET_OK; +} + +int ArithmeticGradRunFp16(void *cdata, int task_id, float lhs_scale, float rhs_scale) { + MS_ASSERT(cdata != nullptr); + auto Arithmetic_kernel = reinterpret_cast(cdata); + auto error_code = Arithmetic_kernel->Execute(task_id); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "ArithmeticGradRunFp16 error task_id[" << task_id << "] error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +int ArithmeticGradCPUKernelFp16::Run() { + int error_code = static_cast(this->context_) + ->thread_pool_->ParallelLaunch(ArithmeticGradRunFp16, this, 1); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "Arithmetic Grad function error error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_MaximumGrad, LiteKernelCreator) +REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_MinimumGrad, LiteKernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/arithmetic_fp16_grad.h b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/arithmetic_fp16_grad.h new file mode 100644 index 00000000000..f74430c590e --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/arithmetic_fp16_grad.h @@ -0,0 +1,80 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_ARITHMETIC_FP16_GRAD_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_ARITHMETIC_FP16_GRAD_H_ + +#include +#include "src/inner_kernel.h" +#include "nnacl/fp16/arithmetic_fp16.h" +#include "schema/model_generated.h" + +using mindspore::schema::PrimitiveType_AddGrad; +using mindspore::schema::PrimitiveType_DivGrad; +using mindspore::schema::PrimitiveType_MaximumGrad; +using mindspore::schema::PrimitiveType_MinimumGrad; +using mindspore::schema::PrimitiveType_MulGrad; +using mindspore::schema::PrimitiveType_SubGrad; + +namespace mindspore::kernel { + +class ArithmeticGradCPUKernelFp16; + +class ArithmeticGradCPUKernelFp16 : public InnerKernel { + typedef void (ArithmeticGradCPUKernelFp16::*ArithmeticGradOperation)(float16_t *, int, float16_t *, int, float16_t *, + int); + + public: + explicit ArithmeticGradCPUKernelFp16(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::InnerContext *ctx) + : InnerKernel(parameter, inputs, outputs, ctx), tile_data0(NULL), tile_data1(NULL), tile_data2(NULL) { + switch (type()) { + case PrimitiveType_MaximumGrad: + arithmetic_grad_ = &ArithmeticGradCPUKernelFp16::ArithmeticGradMaximum; + break; + case PrimitiveType_MinimumGrad: + arithmetic_grad_ = &ArithmeticGradCPUKernelFp16::ArithmeticGradMinimum; + break; + default: + MS_LOG(ERROR) << "Error Operator type " << parameter->type_; + break; + } + arithmeticParameter_ = reinterpret_cast(parameter); + } + ~ArithmeticGradCPUKernelFp16() override { + if (tile_data0) delete[] tile_data0; + if (tile_data1) delete[] tile_data1; + if (tile_data2) delete[] tile_data2; + } + + int Init() override; + int InferShape(); + int ReSize() override; + int Run() override; + int Execute(int task_id); + + private: + void ArithmeticGradMaximum(float16_t *dy, int dy_size, float16_t *dx1, int dx1_size, float16_t *dx2, int dx2_size); + void ArithmeticGradMinimum(float16_t *dy, int dy_size, float16_t *dx1, int dx1_size, float16_t *dx2, int dx2_size); + ArithmeticParameter *arithmeticParameter_; + ArithmeticGradOperation arithmetic_grad_; + float16_t *tile_data0; + float16_t *tile_data1; + float16_t *tile_data2; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_ARITHMETIC_FP16_GRAD_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/bias_fp16_grad.cc b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/bias_fp16_grad.cc new file mode 100644 index 00000000000..7b74f607e63 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/bias_fp16_grad.cc @@ -0,0 +1,96 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "src/runtime/kernel/arm/fp16_grad/bias_fp16_grad.h" +#include "schema/model_generated.h" +#include "src/kernel_registry.h" +#include "include/errorcode.h" + +using mindspore::kernel::KERNEL_ARCH::kCPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_BiasAddGrad; + +namespace mindspore::kernel { + +int BiasGradCPUKernelFp16::ReSize() { + auto dims = in_tensors_[0]->shape(); + bias_param->ndim_ = dims.size(); + for (unsigned int i = 0; i < bias_param->ndim_; i++) { + bias_param->in_shape0_[i] = dims[i]; + bias_param->out_shape_[i] = 1; // 1 dimension for N,H,W, + } + bias_param->out_shape_[bias_param->ndim_ - 1] = dims[bias_param->ndim_ - 1]; + for (auto i = bias_param->ndim_; i < 4; i++) { + bias_param->in_shape0_[i] = 0; + bias_param->out_shape_[i] = 0; + } + return RET_OK; +} + +int BiasGradCPUKernelFp16::Init() { + if (!InferShapeDone()) { + return RET_OK; + } + return ReSize(); +} + +int BiasGradCPUKernelFp16::Execute(int task_id) { + auto in = reinterpret_cast(in_tensors_.at(0)->data_c()); + auto out = reinterpret_cast(out_tensors_.at(0)->data_c()); + + size_t nhw_size = 1; + size_t channels = bias_param->in_shape0_[bias_param->ndim_ - 1]; // C in NHWC + for (unsigned int i = 0; i < bias_param->ndim_ - 1; i++) { + nhw_size *= bias_param->in_shape0_[i]; + } + + size_t total_size = channels * nhw_size; + for (size_t c = 0; c < channels; ++c) { + float sum = 0; + for (size_t offset = 0; offset < total_size; offset += channels) { + sum += in[offset + c]; + } + out[c] = (float16_t)sum; + } + + return RET_OK; +} + +int BiasGradFp16Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) { + auto bias_kernel = reinterpret_cast(cdata); + auto error_code = bias_kernel->Execute(task_id); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "bias error task_id[" << task_id << "] error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +int BiasGradCPUKernelFp16::Run() { + int error_code = + static_cast(this->context_)->thread_pool_->ParallelLaunch(BiasGradFp16Run, this, 1); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "bias function error error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_BiasAddGrad, LiteKernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/bias_fp16_grad.h b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/bias_fp16_grad.h new file mode 100644 index 00000000000..9ed10b1bb34 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/bias_fp16_grad.h @@ -0,0 +1,44 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
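/* Illustrative aside (not part of the patch): the reduction BiasGradCPUKernelFp16::Execute
 * performs. For NHWC data the bias gradient is dy summed over N, H and W for each channel,
 * accumulated in float before narrowing, as the kernel does. Plain float sketch with
 * illustrative names. */
#include <stdio.h>

static void BiasGradNHWCRef(const float *dy, int nhw, int channels, float *db) {
  for (int c = 0; c < channels; ++c) {
    float sum = 0.0f;                 /* accumulate in float to limit rounding error */
    for (int i = 0; i < nhw; ++i) sum += dy[i * channels + c];
    db[c] = sum;
  }
}

int main(void) {
  const float dy[6] = {1, 10, 2, 20, 3, 30};  /* nhw = 3, channels = 2 */
  float db[2];
  BiasGradNHWCRef(dy, 3, 2, db);
  printf("%g %g\n", db[0], db[1]);            /* 6 60 */
  return 0;
}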
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_BIAS_FP16_GRAD_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_BIAS_FP16_GRAD_H_ + +#include +#include "src/lite_kernel.h" +#include "nnacl/fp16/arithmetic_fp16.h" + +namespace mindspore::kernel { +class BiasGradCPUKernelFp16 : public InnerKernel { + public: + explicit BiasGradCPUKernelFp16(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::InnerContext *ctx) + : InnerKernel(parameter, inputs, outputs, ctx) { + bias_param = reinterpret_cast(parameter); + } + ~BiasGradCPUKernelFp16() override = default; + + int Init() override; + int ReSize() override; + int Run() override; + int Execute(int task_id); + + private: + ArithmeticParameter *bias_param; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_BIAS_FP16_GRAD_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/bn_fp16_grad.cc b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/bn_fp16_grad.cc new file mode 100644 index 00000000000..85d74646295 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/bn_fp16_grad.cc @@ -0,0 +1,166 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/kernel/arm/fp16_grad/bn_fp16_grad.h" +#include +#include +#include +#include +#include +#include + +#include "schema/model_generated.h" +#include "src/kernel_registry.h" +#include "nnacl/fp16_grad/batch_norm.h" +#include "include/errorcode.h" + +using mindspore::kernel::KERNEL_ARCH::kCPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_BatchNormGrad; + +namespace mindspore::kernel { +int BNGradCPUKernelFp16::ReSize() { + auto *input_x = in_tensors_.at(1); + int channels = input_x->shape().at(kNHWC_C); + ws_size_ = 2 * channels; + set_workspace_size(ws_size_ * sizeof(float16_t)); + return RET_OK; +} + +int BNGradCPUKernelFp16::Init() { + for (int i = 0; i < in_tensors_.size(); i++) { + if (in_tensors_.at(i)->data_type() != kNumberTypeFloat16) { + MS_LOG(ERROR) << "BNGradCPUKernelFp16 type error in_tensor_[" << i << "]"; + } + } + return ReSize(); +} + +int BNGradCPUKernelFp16::Execute(int task_id) { + auto *input_yt = in_tensors_.at(0); + auto *input_x = in_tensors_.at(1); + auto *input_scale = in_tensors_.at(2); + auto *input_mean = in_tensors_.at(3); + auto *input_var = in_tensors_.at(4); + + auto kernel_name = this->name(); + if (kernel_name.find("FusedBatchNormGradCPU") != std::string::npos) { + input_mean = in_tensors_.at(4); + input_var = in_tensors_.at(5); + } + auto bn_param = reinterpret_cast(op_parameter_); + int stage = stage_; + int thread_num = thread_num_; + float16_t *save_mean = reinterpret_cast(input_mean->data_c()); + float16_t *save_var = reinterpret_cast(input_var->data_c()); + + auto *output_dx = out_tensors_.at(0); + auto *output_scale = out_tensors_.at(1); + auto *output_bias = out_tensors_.at(2); + int32_t batch = input_x->Batch(); + int32_t channels = input_x->Channel(); + int32_t spatial = input_x->Height() * input_x->Width(); + + float *workspace_temp = static_cast(workspace()); + float *dxhat_sum = workspace_temp; + float *dxhathat_sum = dxhat_sum + channels; + float16_t *x = reinterpret_cast(input_x->data_c()); + float16_t *yt = reinterpret_cast(input_yt->data_c()); + float16_t *scale = reinterpret_cast(input_scale->data_c()); + float16_t *dx = reinterpret_cast(output_dx->data_c()); + float16_t *dbias = reinterpret_cast(output_bias->data_c()); + float16_t *dscale = reinterpret_cast(output_scale->data_c()); + int total = spatial * batch; + int stride = UP_DIV(total, thread_num); + int count = MSMIN(stride, total - stride * task_id); + count = (count < 0) ? 
0 : count; + switch (stage) { + case 0: { + for (int job = task_id; job < 4; job += thread_num) { + switch (job) { + case 0: + var2InvarFp16(save_var, input_var->ElementsNum(), bn_param->epsilon_); + break; + case 1: + std::fill(workspace_temp, workspace_temp + ws_size_, 0.f); + break; + case 2: + std::fill(dbias, dbias + channels, 0.f); + break; + case 3: + std::fill(dscale, dscale + channels, 0.f); + break; + } + } + if (thread_num == 1) { + backwardAllFp16(x, yt, save_mean, save_var, scale, total, channels, dxhat_sum, dxhathat_sum, dbias, dscale, dx); + } + break; + } + case 1: { + backwardP1Fp16(x, yt, save_mean, save_var, scale, total, channels, dxhat_sum, dxhathat_sum, dbias, dscale); + break; + } + case 2: { + backwardP2Fp16(x + task_id * stride * channels, yt + task_id * stride * channels, save_mean, save_var, scale, + count, total, channels, dxhat_sum, dxhathat_sum, dx + task_id * stride * channels); + break; + } + } + + return RET_OK; +} + +int BNGradFp16Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) { + MS_ASSERT(cdata != nullptr); + auto bn_kernel = reinterpret_cast(cdata); + auto error_code = bn_kernel->Execute(task_id); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "BNGradRun error task_id[" << task_id << "] error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +int BNGradCPUKernelFp16::Run() { + stage_ = 0; + thread_num_ = context_->thread_num_; + if (thread_num_ == 1) { + int error_code = static_cast(this->context_) + ->thread_pool_->ParallelLaunch(BNGradFp16Run, this, thread_num_); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "BN function error error_code[" << error_code << "]"; + return RET_ERROR; + } + } else { + const std::vector threads = {thread_num_, 1, thread_num_}; + for (size_t stage = 0; stage < threads.size(); stage++) { + stage_ = static_cast(stage); + int error_code = static_cast(this->context_) + ->thread_pool_->ParallelLaunch(BNGradFp16Run, this, threads.at(stage)); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "BN function error error_code[" << error_code << "]"; + return RET_ERROR; + } + } + } + return RET_OK; +} + +REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_BatchNormGrad, LiteKernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/bn_fp16_grad.h b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/bn_fp16_grad.h new file mode 100644 index 00000000000..fe29e28335f --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/bn_fp16_grad.h @@ -0,0 +1,43 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
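/* Illustrative aside (not part of the patch): a textbook single-channel batch-norm backward
 * in plain float, showing the quantities the staged kernel above computes: 1 / sqrt(var + eps),
 * the reductions sum(dy) and sum(dy * x_hat), and finally dx. The actual nnacl helpers
 * (var2InvarFp16, backwardP1Fp16, backwardP2Fp16) live in batch_norm.h and may differ in
 * detail; the names below are illustrative only. */
#include <stdio.h>
#include <math.h>

static void BatchNormGradChannelRef(const float *x, const float *dy, int n,
                                    float mean, float var, float eps, float gamma,
                                    float *dx, float *dgamma, float *dbeta) {
  float invstd = 1.0f / sqrtf(var + eps);
  float sum_dy = 0.0f, sum_dy_xhat = 0.0f;
  for (int i = 0; i < n; ++i) {
    float xhat = (x[i] - mean) * invstd;
    sum_dy += dy[i];               /* feeds dbeta and the mean term of dx */
    sum_dy_xhat += dy[i] * xhat;   /* feeds dgamma and the variance term of dx */
  }
  *dbeta = sum_dy;
  *dgamma = sum_dy_xhat;
  for (int i = 0; i < n; ++i) {
    float xhat = (x[i] - mean) * invstd;
    dx[i] = gamma * invstd / n * (n * dy[i] - sum_dy - xhat * sum_dy_xhat);
  }
}

int main(void) {
  const float x[4] = {1, 2, 3, 4};
  const float dy[4] = {0.1f, 0.2f, 0.3f, 0.4f};
  float dx[4], dgamma, dbeta;
  BatchNormGradChannelRef(x, dy, 4, 2.5f, 1.25f, 1e-5f, 1.0f, dx, &dgamma, &dbeta);
  printf("dbeta=%g dgamma=%g dx0=%g\n", dbeta, dgamma, dx[0]);
  return 0;
}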
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_BN_FP16_GRAD_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_BN_FP16_GRAD_H_ + +#include +#include "src/lite_kernel.h" +#include "nnacl/fp32_grad/batch_norm.h" + +namespace mindspore::kernel { + +class BNGradCPUKernelFp16 : public InnerKernel { + public: + explicit BNGradCPUKernelFp16(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::InnerContext *ctx) + : InnerKernel(parameter, inputs, outputs, ctx) {} + ~BNGradCPUKernelFp16() override {} + int Init() override; + int ReSize() override; + int Run() override; + int Execute(int task_id); + + private: + int thread_num_ = 1; + int stage_ = 0; + size_t ws_size_ = 0; +}; +} // namespace mindspore::kernel +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_BN_FP16_GRAD_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/convolution_fp16_grad_filter.cc b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/convolution_fp16_grad_filter.cc new file mode 100644 index 00000000000..6b9f47874fd --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/convolution_fp16_grad_filter.cc @@ -0,0 +1,205 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/kernel/arm/fp16_grad/convolution_fp16_grad_filter.h" +#include "src/kernel_registry.h" +#include "nnacl/pack.h" +#include "nnacl/fp16_grad/convolution_grad_filter.h" +#include "nnacl/fp16_grad/pack_fp16_ext.h" +#include "nnacl/fp16_grad/gemm_fp16.h" +#include "include/errorcode.h" + +using mindspore::kernel::KERNEL_ARCH::kCPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_Conv2DBackpropFilterFusion; + +namespace mindspore::kernel { +int ConvolutionGradFilterCPUKernelFp16::ReSize() { + // dy is in input 0 + // x is in input 1 + // dw is output 0 + + auto *x_tensor = in_tensors_.at(1); + MS_ASSERT(x_tensor != nullptr); + auto *dy_tensor = in_tensors_.at(0); + MS_ASSERT(dy_tensor != nullptr); + + auto conv_param = reinterpret_cast(op_parameter_); + conv_param->output_batch_ = dy_tensor->shape().at(kNHWC_N); + conv_param->input_batch_ = x_tensor->shape().at(kNHWC_N); + conv_param->input_h_ = x_tensor->shape().at(kNHWC_H); + conv_param->input_w_ = x_tensor->shape().at(kNHWC_W); + // assume OutCh|kh|kw|InCh + conv_param->input_channel_ = x_tensor->shape().at(kNHWC_C); + conv_param->output_channel_ = dy_tensor->shape().at(kNHWC_C); + + conv_param->output_h_ = dy_tensor->shape()[kNHWC_H]; + conv_param->output_w_ = dy_tensor->shape()[kNHWC_W]; + + do_img2col_ = (conv_param->kernel_h_ == 1) && (conv_param->kernel_w_ == 1) && (conv_param->pad_d_ == 0) && + (conv_param->pad_u_ == 0) && (conv_param->pad_l_ == 0) && (conv_param->pad_r_ == 0) && + (conv_param->dilation_h_ == 1) && (conv_param->dilation_w_ == 1) && (conv_param->stride_h_ == 1) && + (conv_param->stride_w_ == 1) && (conv_param->group_ == 1) + ? false + : true; + do_dw_ = (conv_param->output_channel_ == conv_param->group_) && + (conv_param->input_channel_ == conv_param->output_channel_) && (conv_param->dilation_h_ == 1) && + (conv_param->dilation_w_ == 1) + ? true + : false; + + ws_size_ = chunk_ * conv_param->kernel_h_ * conv_param->kernel_w_ * conv_param->input_channel_; + ws_size_ = do_dw_ ? 
ws_size_ : ws_size_ / conv_param->group_; + int n = conv_param->kernel_h_ * conv_param->kernel_w_ * conv_param->input_channel_ / conv_param->group_; + int k = conv_param->output_channel_ / conv_param->group_; + int thread_num = context_->thread_num_; + mat_alloc_ = MatSizeTotalFp16(k, n, chunk_, 0); + set_workspace_size((ws_size_ + mat_alloc_ + (k * n)) * thread_num * sizeof(float16_t)); + + return RET_OK; +} + +int ConvolutionGradFilterCPUKernelFp16::Init() { return ReSize(); } + +int ConvolutionGradFilterCPUKernelFp16::Execute(int task_id) { + auto conv_param = reinterpret_cast(op_parameter_); + auto *input_dy = in_tensors_.at(0); + auto *input_x = in_tensors_.at(1); + auto *out_dw = out_tensors_.at(0); + + auto x_addr = reinterpret_cast(input_x->data_c()); + auto dy_addr = reinterpret_cast(input_dy->data_c()); + auto dw_addr = reinterpret_cast(out_dw->data_c()); + + int nweights = out_dw->ElementsNum(); + int in_ch = conv_param->input_channel_; + int in_h = conv_param->input_h_; + int in_w = conv_param->input_w_; + int k_h = conv_param->kernel_h_; + int k_w = conv_param->kernel_w_; + int batch = conv_param->output_batch_; + int out_ch = conv_param->output_channel_; + int groups = conv_param->group_; + int out_h = conv_param->output_h_; + int out_w = conv_param->output_w_; + + int m = out_h * out_w; + int n = k_h * k_w * in_ch / groups; + int k = out_ch / groups; + int thread_num = context_->thread_num_; + float16_t *workspace_temp = reinterpret_cast(workspace()); + float16_t *mat_workspace = workspace_temp + ws_size_ * thread_num + task_id * (mat_alloc_ + k * n); + float16_t *mat_tmp = mat_workspace + mat_alloc_; + int stride = UP_DIV(batch, thread_num); + int count = MSMIN(stride, batch - stride * task_id); + count = (count < 0) ? 0 : count; + int start = stride * task_id; + int end = start + count; + + if (do_dw_) { +#ifdef ENABLE_ARM + stride = UP_DIV(k_h * k_w, thread_num); + count = MSMIN(stride, k_h * k_w - stride * task_id); + count = (count < 0) ? 
0 : count; + start = stride * task_id; + ConvDwFilterFp16Grad(x_addr, dy_addr, dw_addr, start, count, conv_param); +#else + stride = UP_DIV(groups, thread_num); + count = MSMIN(stride, groups - stride * task_id); + start = stride * task_id; + end = start + count; + + const int kernel_spatial = k_h * k_w; + for (int i = 0; i < batch; ++i) { + for (int ci = 0; ci < m; ci += chunk_) { + int real_chunk = MSMIN(m - ci, chunk_); + float16_t *mat_b = workspace_temp + task_id * ws_size_; + float16_t *im = x_addr + (i * in_ch * in_h * in_w); + RollingIm2ColPackDwUnitFp16(im, conv_param, mat_b, real_chunk, ci); + for (int j = start; j < end; ++j) { + float16_t *mat_a = dy_addr + (i * groups) * m * k + j * (out_ch / groups) + ci * out_ch; + float16_t *mat_c = dw_addr + j * nweights / groups; + GemmMatmulFp16(1, 0, k, n, real_chunk, 1, mat_a, out_ch, mat_b + (j * kernel_spatial), n * groups, 1, mat_c, + n, mat_workspace); + } + } + } +#endif + } else if (do_img2col_) { + for (int i = start; i < end; ++i) { + for (int ci = 0; ci < m; ci += chunk_) { + for (int j = 0; j < groups; ++j) { + int real_chunk = MSMIN(m - ci, chunk_); + float16_t *mat_a = dy_addr + (i * groups) * m * k + j * (out_ch / groups) + ci * out_ch; + float16_t *mat_b = workspace_temp + task_id * ws_size_; + float16_t *mat_c = dw_addr + j * nweights / groups; + float16_t *im = x_addr + (i * in_ch * in_h * in_w) + j * (in_ch / groups); + RollingIm2ColPackUnitFp16(im, conv_param, mat_b, real_chunk, ci); + GemmMatmulFp16(1, 0, k, n, real_chunk, 1, mat_a, out_ch, mat_b, n, 0, mat_tmp, n, mat_workspace); + std::unique_lock merge_lock(lock_); + AddMatrixFp16(mat_tmp, mat_c, 1, k, n, n); + } + } + } + } else { + float16_t *mat_c = dw_addr; + const size_t in_plane_size = in_ch * in_h * in_w; + for (int i = start; i < end; ++i) { + for (int ci = 0; ci < m; ci += chunk_) { + int real_chunk = MSMIN(m - ci, chunk_); + float16_t *mat_a = dy_addr + i * m * k + ci * out_ch; + float16_t *im = x_addr + i * in_plane_size; + int input_h = ci / out_w * conv_param->stride_h_; + int input_w = ci % out_w * conv_param->stride_w_; + int offset = (input_h * in_w + input_w) * in_ch; + GemmMatmulFp16(1, 0, k, n, real_chunk, 1, mat_a, out_ch, im + offset, n, 0, mat_tmp, n, mat_workspace); + std::unique_lock merge_lock(lock_); + AddMatrixFp16(mat_tmp, mat_c, 1, k, n, n); + } + } + } + return RET_OK; +} + +int ConvolutionGradFilterFp16Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) { + MS_ASSERT(cdata != nullptr); + auto convfilter_kernel = reinterpret_cast(cdata); + auto error_code = convfilter_kernel->Execute(task_id); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "ConvolutionGradFilterRun error task_id[" << task_id << "] error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +int ConvolutionGradFilterCPUKernelFp16::Run() { + auto *out_dw = out_tensors_.at(0); + auto dw_addr = reinterpret_cast(out_dw->data_c()); + memset(dw_addr, 0, out_dw->Size()); + int error_code = static_cast(this->context_) + ->thread_pool_->ParallelLaunch(ConvolutionGradFilterFp16Run, this, context_->thread_num_); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "conv filter function error error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Conv2DBackpropFilterFusion, + LiteKernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/convolution_fp16_grad_filter.h 
b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/convolution_fp16_grad_filter.h new file mode 100644 index 00000000000..3bfbacf8ed0 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/convolution_fp16_grad_filter.h @@ -0,0 +1,51 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_CONVOLUTION_FP16_GRAD_FILTER_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_CONVOLUTION_FP16_GRAD_FILTER_H_ + +#include +#include "src/inner_kernel.h" + +namespace mindspore::kernel { +class ConvolutionGradFilterCPUKernelFp16 : public InnerKernel { + public: + explicit ConvolutionGradFilterCPUKernelFp16(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::InnerContext *ctx) + + : InnerKernel(parameter, inputs, outputs, ctx) {} + ~ConvolutionGradFilterCPUKernelFp16() override {} + + int Init() override; + int ReSize() override; + int Run() override; + int Execute(int task_id); + + private: + size_t ws_size_ = 0; + bool do_img2col_ = true; + bool do_dw_ = false; + std::mutex lock_; + size_t mat_alloc_ = 0; +#ifdef ENABLE_ARM32 + const int chunk_ = C4NUM * 2; +#else + const int chunk_ = C12NUM * 2; +#endif +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_CONVOLUTION_FP16_GRAD_FILTER_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/convolution_fp16_grad_input.cc b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/convolution_fp16_grad_input.cc new file mode 100644 index 00000000000..41a52d4c3cc --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/convolution_fp16_grad_input.cc @@ -0,0 +1,172 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/kernel/arm/fp16_grad/convolution_fp16_grad_input.h" +#include "src/kernel_registry.h" +#include "nnacl/pack.h" +#include "nnacl/fp16_grad/pack_fp16_ext.h" +#include "nnacl/fp16_grad/gemm_fp16.h" +#include "include/errorcode.h" + +using mindspore::kernel::KERNEL_ARCH::kCPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_Conv2DBackpropInputFusion; + +namespace mindspore::kernel { +int ConvolutionGradInputCPUKernelFp16::ReSize() { + auto *dy_tensor = in_tensors_.at(kInputIndex); + MS_ASSERT(dy_tensor != nullptr); + auto *weight_tensor = in_tensors_.at(kWeightIndex); + MS_ASSERT(weight_tensor != nullptr); + auto *dx_tensor = out_tensors_.at(kOutputIndex); + MS_ASSERT(dx_tensor != nullptr); + + auto conv_param = reinterpret_cast(op_parameter_); + conv_param->output_batch_ = dx_tensor->shape()[(kNHWC_N)]; + conv_param->input_batch_ = dy_tensor->shape()[(kNHWC_N)]; + + conv_param->input_h_ = dx_tensor->shape()[(kNHWC_H)]; + conv_param->input_w_ = dx_tensor->shape()[(kNHWC_W)]; + + // assume OutCh|kh|kw|In + conv_param->input_channel_ = dx_tensor->shape()[(kNHWC_C)]; + conv_param->output_channel_ = weight_tensor->shape()[(kNHWC_N)]; + + conv_param->output_h_ = dy_tensor->shape()[kNHWC_H]; + conv_param->output_w_ = dy_tensor->shape()[kNHWC_W]; + ws_size_ = chunk_ * conv_param->kernel_h_ * conv_param->kernel_w_ * conv_param->input_channel_ / conv_param->group_; + + int n = conv_param->kernel_w_ * conv_param->kernel_h_ * conv_param->input_channel_ / conv_param->group_; + int k = conv_param->output_channel_ / conv_param->group_; + int thread_num = context_->thread_num_; + mat_alloc_ = MatSizeTotalFp16(chunk_, n, k, 0); + set_workspace_size((ws_size_ + mat_alloc_) * sizeof(float16_t) * thread_num); + + do_img2col_ = (conv_param->kernel_h_ == 1) && (conv_param->kernel_w_ == 1) && (conv_param->pad_d_ == 0) && + (conv_param->pad_u_ == 0) && (conv_param->pad_l_ == 0) && (conv_param->pad_r_ == 0) && + (conv_param->dilation_h_ == 1) && (conv_param->dilation_w_ == 1) && (conv_param->stride_h_ == 1) && + (conv_param->stride_w_ == 1) && (conv_param->group_ == 1) + ? 
false + : true; + + return RET_OK; +} + +int ConvolutionGradInputCPUKernelFp16::Init() { return ReSize(); } + +int ConvolutionGradInputCPUKernelFp16::Execute(int task_id) { + auto conv_param = reinterpret_cast(op_parameter_); + auto *input_dy = in_tensors_.at(0); + auto *input_w = in_tensors_.at(1); + auto *out_dx = out_tensors_.at(0); + + auto dy_addr = reinterpret_cast(input_dy->data_c()); + auto w_addr = reinterpret_cast(input_w->data_c()); + auto dx_addr = reinterpret_cast(out_dx->data_c()); + + int i, j; + int nweights = input_w->ElementsNum(); + int in_ch = conv_param->input_channel_; + int in_h = conv_param->input_h_; + int in_w = conv_param->input_w_; + int k_h = conv_param->kernel_h_; // out_dw->shape()[1]; + int k_w = conv_param->kernel_w_; // out_dw->shape()[2]; + int batch = conv_param->output_batch_; + int out_ch = conv_param->output_channel_; + int groups = conv_param->group_; + int out_h = conv_param->output_h_; + int out_w = conv_param->output_w_; + int thread_num = context_->thread_num_; + int m = out_h * out_w; + int n = k_w * k_h * in_ch / groups; + int k = out_ch / groups; + float16_t *workspace_temp = reinterpret_cast(workspace()) + task_id * (mat_alloc_ + ws_size_); + float16_t *mat_workspace = workspace_temp + ws_size_; + int stride = UP_DIV(batch, thread_num); + int count = MSMIN(stride, batch - stride * task_id); + count = (count < 0) ? 0 : count; + int start = stride * task_id; + int end = start + count; + + for (i = start; i < end; ++i) { + for (j = 0; j < groups; ++j) { + GemmCbFp16 gcb; + for (int ci = 0; ci < m; ci += chunk_) { + float16_t *mat_b = nullptr; + if (ci == 0) { + mat_b = w_addr + j * nweights / groups; + gcb.ca = 0; + gcb.cb = 0; + gcb.bias = nullptr; + gcb.atype = ActType_No; + } else { + mat_b = gcb.mat_b; + gcb.cb = 1; + } + int real_chunk = MSMIN(m - ci, chunk_); + float16_t *mat_a = dy_addr + (i * groups) * m * k + j * (out_ch / groups) + ci * out_ch; + if (do_img2col_) { + float16_t *mat_c = workspace_temp; + GemmMatmulPlusFp16(0, 0, real_chunk, n, k, 1, mat_a, out_ch, mat_b, n, 0, mat_c, n, mat_workspace, &gcb); + RollingCol2ImPackUnitFp16(mat_c, + dx_addr + (i * groups) * (in_ch / groups) * in_h * in_w + j * (in_ch / groups), + conv_param, real_chunk, ci); + } else { + float16_t *mat_c = + dx_addr + (i * groups) * (in_ch / groups) * in_h * in_w + j * (in_ch / groups) + ci * (in_ch / groups); + GemmMatmulPlusFp16(0, 0, real_chunk, n, k, 1, mat_a, out_ch, mat_b, n, 0, mat_c, n, mat_workspace, &gcb); + } + } + } + } + + return RET_OK; +} + +int ConvolutionGradInputFp16Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) { + MS_ASSERT(cdata != nullptr); + auto convinput_kernel = reinterpret_cast(cdata); + auto error_code = convinput_kernel->Execute(task_id); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "conv input error task_id[" << task_id << "] error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +int ConvolutionGradInputCPUKernelFp16::Run() { + auto conv_param = reinterpret_cast(op_parameter_); + int batch = conv_param->output_batch_; + int in_ch = conv_param->input_channel_; + int in_h = conv_param->input_h_; + int in_w = conv_param->input_w_; + auto *out_dx = out_tensors_.at(0); + auto dx_addr = reinterpret_cast(out_dx->data_c()); + memset(dx_addr, 0, sizeof(float16_t) * batch * in_ch * in_h * in_w); + int error_code = static_cast(this->context_) + ->thread_pool_->ParallelLaunch(ConvolutionGradInputFp16Run, this, context_->thread_num_); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "bias 
function error error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Conv2DBackpropInputFusion, + LiteKernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/convolution_fp16_grad_input.h b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/convolution_fp16_grad_input.h new file mode 100644 index 00000000000..fe8f57ecb3f --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/convolution_fp16_grad_input.h @@ -0,0 +1,44 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_CONVOLUTION_FP16_GRAD_INPUT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_CONVOLUTION_FP16_GRAD_INPUT_H_ + +#include +#include "src/inner_kernel.h" + +namespace mindspore::kernel { +class ConvolutionGradInputCPUKernelFp16 : public InnerKernel { + public: + explicit ConvolutionGradInputCPUKernelFp16(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::InnerContext *ctx) + : InnerKernel(parameter, inputs, outputs, ctx) {} + ~ConvolutionGradInputCPUKernelFp16() override {} + + int Init() override; + int ReSize() override; + int Run() override; + int Execute(int task_id); + + private: + size_t ws_size_ = 0; + size_t mat_alloc_ = 0; + bool do_img2col_ = true; + const int chunk_ = C12NUM; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_CONVOLUTION_FP16_GRAD_INPUT_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/dropout_fp16_grad.cc b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/dropout_fp16_grad.cc new file mode 100644 index 00000000000..67648d59450 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/dropout_fp16_grad.cc @@ -0,0 +1,95 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include "src/runtime/kernel/arm/fp16_grad/dropout_fp16_grad.h" +#include "nnacl/fp16_grad/dropout_grad.h" +#include "schema/model_generated.h" +#include "src/kernel_registry.h" +#include "include/errorcode.h" +#include "nnacl/fp32_grad/dropout_parameter.h" + +using mindspore::kernel::KERNEL_ARCH::kCPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_NULL_PTR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_DropoutGrad; + +namespace mindspore::kernel { + +int DropoutGradCPUKernelFp16::Init() { + auto param = reinterpret_cast(op_parameter_); + if (param == nullptr) { + MS_LOG(ERROR) << "Dropout op_parameter_ nullptr"; + return RET_NULL_PTR; + } + auto ratio = param->ratio_; + if ((ratio > 1.0f) || (ratio < 0.0f)) { + MS_LOG(ERROR) << "unsupported ratio value - Dropout ratio should be between zero to one"; + return RET_ERROR; + } + + if (ratio >= 1.0f) { + scale_ = 1.0f; + } else { + scale_ = 1. / (1. - ratio); + } + if (!InferShapeDone()) { + return RET_OK; + } + return ReSize(); +} + +int DropoutGradCPUKernelFp16::ReSize() { return RET_OK; } + +int DropoutGradCPUKernelFp16::Execute(int task_id) { + auto yt_ptr = reinterpret_cast(in_tensors_.at(kInputIndex)->data_c()); + auto mask_ptr = reinterpret_cast(in_tensors_.at(1)->data_c()); + auto output_ptr = reinterpret_cast(out_tensors_.at(kOutputIndex)->data_c()); + auto length = in_tensors_.at(kInputIndex)->ElementsNum(); + int stride = UP_DIV(length, thread_count_); + int count = MSMIN(stride, length - stride * task_id); + + if (count > 0) { + int start = stride * task_id; + DropoutFp16Grad(&(yt_ptr[start]), &(mask_ptr[start]), &(output_ptr[start]), count, (float16_t)scale_); + } + return RET_OK; +} + +int RunDropoutFp16Grad(void *cdata, int task_id, float lhs_scale, float rhs_scale) { + auto dropout = reinterpret_cast(cdata); + auto error_code = dropout->Execute(task_id); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "Dropout Grad Run error task_id[" << task_id << "] error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +int DropoutGradCPUKernelFp16::Run() { + int error_code = static_cast(this->context_) + ->thread_pool_->ParallelLaunch(RunDropoutFp16Grad, this, thread_count_); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "Dropout Grad function error error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_DropoutGrad, LiteKernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/dropout_fp16_grad.h b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/dropout_fp16_grad.h new file mode 100644 index 00000000000..8a55c20ff77 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/dropout_fp16_grad.h @@ -0,0 +1,43 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_DROPOUT_FP16_GRAD_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_DROPOUT_FP16_GRAD_H_ + +#include +#include "src/inner_kernel.h" + +namespace mindspore::kernel { +class DropoutGradCPUKernelFp16 : public InnerKernel { + public: + DropoutGradCPUKernelFp16(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::InnerContext *ctx) + : InnerKernel(parameter, inputs, outputs, ctx), thread_count_(ctx->thread_num_) {} + + ~DropoutGradCPUKernelFp16() override = default; + + int Init() override; + int ReSize() override; + int Run() override; + int Execute(int task_id); + + private: + float scale_; + int thread_count_ = 1; +}; + +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_DROPOUT_FP16_GRAD_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/layernorm_fp16_grad.cc b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/layernorm_fp16_grad.cc new file mode 100644 index 00000000000..d56d94500e4 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/layernorm_fp16_grad.cc @@ -0,0 +1,109 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/kernel/arm/fp16_grad/layernorm_fp16_grad.h" +#include + +#include "schema/model_generated.h" +#include "src/kernel_registry.h" +#include "nnacl/fp16_grad/layernorm_grad.h" +#include "nnacl/fp32_grad/layernormgrad_parameter.h" +#include "include/errorcode.h" + +using mindspore::kernel::KERNEL_ARCH::kCPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_LayerNormGrad; + +namespace mindspore::kernel { +int LayerNormGradCPUKernelFp16::ReSize() { return RET_OK; } + +int LayerNormGradCPUKernelFp16::Init() { + auto lngrad_param = reinterpret_cast(op_parameter_); + auto *input_x = in_tensors_.at(0); + std::vector x_shape = input_x->shape(); + int begin_norm_axis = lngrad_param->begin_norm_axis_; + if (begin_norm_axis < 0) { + begin_norm_axis += x_shape.size(); + } + auto begin_params_axis = lngrad_param->begin_params_axis_; + if (begin_params_axis < 0) { + begin_params_axis += x_shape.size(); + } + for (size_t i = 0; i < static_cast(begin_norm_axis); i++) { + block_num_ *= x_shape[i]; + } + for (size_t i = static_cast(begin_norm_axis); i < x_shape.size(); i++) { + block_size_ *= x_shape[i]; + } + for (size_t i = 0; i < static_cast(begin_params_axis); i++) { + param_size_ *= x_shape[i]; + } + for (size_t i = begin_params_axis; i < x_shape.size(); i++) { + param_num_ *= x_shape[i]; + } + if (block_num_ <= 0 || block_size_ <= 0) { + MS_LOG(ERROR) << "LayerNormGradCPUKernelFp16 input shape error, input shape: " << x_shape; + } + return RET_OK; +} + +int LayerNormGradCPUKernelFp16::Execute(int task_id) { + auto input_x = in_tensors_.at(0); + auto input_dy = in_tensors_.at(1); + auto input_var = in_tensors_.at(2); + auto input_mean = in_tensors_.at(3); + auto input_gamma = in_tensors_.at(4); + auto output_dx = out_tensors_.at(0); + auto output_dg = out_tensors_.at(1); + auto output_db = out_tensors_.at(2); + + float16_t *x = reinterpret_cast(input_x->data_c()); + float16_t *dy = reinterpret_cast(input_dy->data_c()); + float16_t *var = reinterpret_cast(input_var->data_c()); + float16_t *mean = reinterpret_cast(input_mean->data_c()); + float16_t *gamma = reinterpret_cast(input_gamma->data_c()); + float16_t *dx = reinterpret_cast(output_dx->data_c()); + float16_t *dg = reinterpret_cast(output_dg->data_c()); + float16_t *db = reinterpret_cast(output_db->data_c()); + LayerNormFp16Grad(x, dy, var, mean, gamma, param_num_, param_size_, block_num_, block_size_, dx, dg, db); + return RET_OK; +} + +int LayerNormF16GradRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) { + MS_ASSERT(cdata != nullptr); + auto ln_kernel = reinterpret_cast(cdata); + auto error_code = ln_kernel->Execute(task_id); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "LayerNormGradRun error task_id[" << task_id << "] error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +int LayerNormGradCPUKernelFp16::Run() { + int error_code = + static_cast(this->context_)->thread_pool_->ParallelLaunch(LayerNormF16GradRun, this, 1); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "LayerNorm function error error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_LayerNormGrad, LiteKernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/layernorm_fp16_grad.h b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/layernorm_fp16_grad.h new file mode 100644 index 
00000000000..f2cdb1e4efb --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/layernorm_fp16_grad.h @@ -0,0 +1,43 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_LAYERNORM_FP16_GRAD_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_LAYERNORM_FP16_GRAD_H_ + +#include +#include "src/inner_kernel.h" + +namespace mindspore::kernel { + +class LayerNormGradCPUKernelFp16 : public InnerKernel { + public: + explicit LayerNormGradCPUKernelFp16(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::InnerContext *ctx) + : InnerKernel(parameter, inputs, outputs, ctx) {} + ~LayerNormGradCPUKernelFp16() override {} + int Init() override; + int ReSize() override; + int Run() override; + int Execute(int task_id); + + private: + int block_num_ = 1; + int block_size_ = 1; + int param_num_ = 1; + int param_size_ = 1; +}; +} // namespace mindspore::kernel +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_LAYERNORM_FP16_GRAD_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/neg_fp16_grad.cc b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/neg_fp16_grad.cc new file mode 100644 index 00000000000..a04b6ff02b7 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/neg_fp16_grad.cc @@ -0,0 +1,69 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/kernel/arm/fp16_grad/neg_fp16_grad.h" +#include "schema/model_generated.h" +#include "src/kernel_registry.h" +#include "include/errorcode.h" +#include "nnacl/fp16/arithmetic_self_fp16.h" + +using mindspore::kernel::KERNEL_ARCH::kCPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_NegGrad; + +namespace mindspore::kernel { +namespace { +int NegGradRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) { + MS_ASSERT(cdata != nullptr); + auto kernel = reinterpret_cast(cdata); + MS_ASSERT(kernel != nullptr); + return kernel->DoNegGrad(task_id); +} +} // namespace + +int NegGradCPUKernelFp16::Init() { return RET_OK; } + +int NegGradCPUKernelFp16::DoNegGrad(int task_id) { + auto dy = reinterpret_cast(in_tensors_.at(0)->data_c()); + auto dx = reinterpret_cast(out_tensors_.at(0)->data_c()); + int length = in_tensors_.at(0)->ElementsNum(); + + int stride = UP_DIV(length, thread_count_); + int count = MSMIN(stride, length - stride * task_id); + count = (count < 0) ? 0 : count; + int start = stride * task_id; + + ElementNegativeFp16(dy + start, dx + start, count); + return RET_OK; +} + +int NegGradCPUKernelFp16::ReSize() { return RET_OK; } + +int NegGradCPUKernelFp16::Run() { + int ret = static_cast(this->context_) + ->thread_pool_->ParallelLaunch(NegGradRun, this, thread_count_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "parallel launch fail!ret: " << ret; + return ret; + } + + return RET_OK; +} + +REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_NegGrad, LiteKernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/neg_fp16_grad.h b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/neg_fp16_grad.h new file mode 100644 index 00000000000..583f9f38628 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/neg_fp16_grad.h @@ -0,0 +1,42 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_NEG_FP16_GRAD_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_NEG_FP16_GRAD_H_ + +#include +#include "src/inner_kernel.h" +#include "schema/model_generated.h" + +namespace mindspore::kernel { + +class NegGradCPUKernelFp16 : public InnerKernel { + public: + explicit NegGradCPUKernelFp16(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::InnerContext *ctx) + : InnerKernel(parameter, inputs, outputs, ctx), thread_count_(ctx->thread_num_) {} + ~NegGradCPUKernelFp16() override {} + int Init() override; + int ReSize() override; + int Run() override; + int DoNegGrad(int thread_id); + + private: + int thread_count_; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_NEG_FP16_GRAD_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/pooling_fp16_grad.cc b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/pooling_fp16_grad.cc new file mode 100644 index 00000000000..367aa8dfe80 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/pooling_fp16_grad.cc @@ -0,0 +1,113 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/kernel/arm/fp16_grad/pooling_fp16_grad.h" +#include "schema/model_generated.h" +#include "src/kernel_registry.h" +#include "nnacl/fp16/pooling_fp16.h" +#include "nnacl/fp16_grad/pooling_grad.h" +#include "include/errorcode.h" + +using mindspore::kernel::KERNEL_ARCH::kCPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_AvgPoolGrad; +using mindspore::schema::PrimitiveType_MaxPoolGrad; + +namespace mindspore::kernel { +int PoolingGradCPUKernelFp16::ReSize() { + PoolingParameter *pool_param = reinterpret_cast(op_parameter_); + + auto in_shape = in_tensors_.at(0)->shape(); + auto out_shape = in_tensors_.at(1)->shape(); + + if (pool_param->pool_mode_ == PoolMode_AvgPool) { + out_shape = in_tensors_.at(2)->shape(); + } + + int input_h = in_shape.at(1); + int input_w = in_shape.at(2); + + if (pool_param->global_) { + pool_param->window_w_ = input_w; + pool_param->window_h_ = input_h; + } + + pool_param->input_h_ = in_shape[kNHWC_H]; + pool_param->input_w_ = in_shape[kNHWC_W]; + pool_param->input_batch_ = in_shape[kNHWC_N]; + pool_param->input_channel_ = in_shape[kNHWC_C]; + pool_param->output_h_ = out_shape[kNHWC_H]; + pool_param->output_w_ = out_shape[kNHWC_W]; + pool_param->output_batch_ = out_shape[kNHWC_N]; + pool_param->output_channel_ = out_shape[kNHWC_C]; + + return RET_OK; +} + +int PoolingGradCPUKernelFp16::Init() { return ReSize(); } + +int PoolingGradCPUKernelFp16::Execute(int task_id) { + PoolingParameter *pool_param = reinterpret_cast(op_parameter_); + auto input_ptr = reinterpret_cast(in_tensors_.at(0)->data_c()); + auto output_ptr = reinterpret_cast(out_tensors_.at(0)->data_c()); + int stride = UP_DIV(pool_param->output_batch_, thread_num_); + int count = MSMIN(stride, pool_param->output_batch_ - stride * task_id); + + if (count > 0) { + int in_batch_size = pool_param->input_h_ * pool_param->input_w_ * pool_param->input_channel_; + int out_batch_size = pool_param->output_h_ * pool_param->output_w_ * pool_param->input_channel_; + std::fill(output_ptr + task_id * stride * in_batch_size, output_ptr + ((task_id * stride) + count) * in_batch_size, + 0.f); + if (pool_param->pool_mode_ == PoolMode_MaxPool) { + auto dy_ptr = reinterpret_cast(in_tensors_.at(2)->data_c()); + MaxPoolingFp16Grad(input_ptr + task_id * stride * in_batch_size, dy_ptr + task_id * stride * out_batch_size, + output_ptr + task_id * stride * in_batch_size, count, pool_param); + } else { + input_ptr = reinterpret_cast(in_tensors_.at(2)->data_c()); + AvgPoolingFp16Grad(input_ptr + task_id * stride * out_batch_size, output_ptr + task_id * stride * in_batch_size, + count, pool_param); + } + } + return RET_OK; +} + +int PoolingFp16GradImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) { + MS_ASSERT(cdata != nullptr); + auto pooling = reinterpret_cast(cdata); + auto error_code = pooling->Execute(task_id); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "Pooling Run error task_id[" << task_id << "] error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +int PoolingGradCPUKernelFp16::Run() { + thread_num_ = context_->thread_num_; + int error_code = static_cast(this->context_) + ->thread_pool_->ParallelLaunch(PoolingFp16GradImpl, this, thread_num_); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "pooling error error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_AvgPoolGrad, 
LiteKernelCreator) +REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_MaxPoolGrad, LiteKernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/pooling_fp16_grad.h b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/pooling_fp16_grad.h new file mode 100644 index 00000000000..195a76b48ca --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/pooling_fp16_grad.h @@ -0,0 +1,47 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_POOLING_FP16_GRAD_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_POOLING_FP16_GRAD_H_ + +#include +#include "src/inner_kernel.h" + +namespace mindspore::kernel { +using mindspore::schema::PadMode; +using mindspore::schema::PoolMode; +using mindspore::schema::QuantType; +using mindspore::schema::RoundMode; + +class PoolingGradCPUKernelFp16 : public InnerKernel { + public: + explicit PoolingGradCPUKernelFp16(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::InnerContext *ctx) + : InnerKernel(parameter, inputs, outputs, ctx) {} + ~PoolingGradCPUKernelFp16() override = default; + + int Init() override; + int ReSize() override; + int Run() override; + int Execute(int task_id); + + private: + int thread_num_ = 1; +}; + +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_POOLING_FP16_GRAD_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/resize_fp16_grad.cc b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/resize_fp16_grad.cc new file mode 100644 index 00000000000..63d554326be --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/resize_fp16_grad.cc @@ -0,0 +1,104 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "src/runtime/kernel/arm/fp16_grad/resize_fp16_grad.h" +#include "nnacl/fp16_grad/resize_grad.h" +#include "schema/model_generated.h" +#include "src/kernel_registry.h" +#include "include/errorcode.h" + +using mindspore::kernel::KERNEL_ARCH::kCPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_ResizeGrad; + +namespace mindspore::kernel { + +float16_t ScalingFp16(size_t in_size, size_t out_size, bool align_corners) { + return (align_corners && out_size > 1) ? 
(in_size - 1) / static_cast(out_size - 1) + : in_size / static_cast(out_size); +} + +int ResizeGradCPUKernelFp16::ReSize() { + auto param = reinterpret_cast(op_parameter_); + if (param == nullptr) { + MS_LOG(ERROR) << "ResizeGradCPUKernelFp16 op_parameter_ is nullptr"; + return RET_ERROR; + } + bool align_corners = param->align_corners_; + param->in_height_ = static_cast(in_tensors_.at(0)->Height()); + param->in_width_ = static_cast(in_tensors_.at(0)->Width()); + param->out_height_ = static_cast(out_tensors_.at(0)->Height()); + param->out_width_ = static_cast(out_tensors_.at(0)->Width()); + param->height_scale_ = ScalingFp16(param->out_height_, param->in_height_, align_corners); + param->width_scale_ = ScalingFp16(param->out_width_, param->in_width_, align_corners); + + return RET_OK; +} + +int ResizeGradCPUKernelFp16::Init() { + if (!InferShapeDone()) { + return RET_OK; + } + return ReSize(); +} + +int ResizeGradCPUKernelFp16::Execute(int task_id) { + auto in_addr = reinterpret_cast(in_tensors_.at(0)->data_c()); + auto out_addr = reinterpret_cast(out_tensors_.at(0)->data_c()); + auto param = reinterpret_cast(op_parameter_); + if (param == nullptr) { + MS_LOG(ERROR) << "ResizeGradCPUKernelFp16 op_parameter_ is nullptr"; + return RET_ERROR; + } + auto batch_size = in_tensors_.at(0)->Batch(); + auto channel = in_tensors_.at(0)->Channel(); + + if (param->method == static_cast(schema::ResizeMethod_NEAREST)) { + ResizeNearestNeighborFp16Grad(in_addr, out_addr, batch_size, channel, in_tensors_.at(0)->format(), param); + } else { + ResizeBiLinearFp16Grad(in_addr, out_addr, batch_size, channel, in_tensors_.at(0)->format(), param); + } + return RET_OK; +} + +int ResizeFp16GradRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) { + auto resize_grad_kernel = reinterpret_cast(cdata); + auto error_code = resize_grad_kernel->Execute(task_id); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "resize grad error task_id[" << task_id << "] error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +int ResizeGradCPUKernelFp16::Run() { + auto out_addr = reinterpret_cast(out_tensors_.at(0)->data_c()); + size_t elem_number = out_tensors_.at(0)->ElementsNum(); + std::fill(out_addr, out_addr + elem_number, 0.f); + int error_code = + static_cast(this->context_)->thread_pool_->ParallelLaunch(ResizeFp16GradRun, this, 1); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "ResizeGradCPUKernelFp16 function error error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_ResizeGrad, LiteKernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/resize_fp16_grad.h b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/resize_fp16_grad.h new file mode 100644 index 00000000000..8091c1b004e --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/resize_fp16_grad.h @@ -0,0 +1,38 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_RESIZE_FP16_GRAD_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_RESIZE_FP16_GRAD_H_ + +#include +#include "src/inner_kernel.h" + +namespace mindspore::kernel { +class ResizeGradCPUKernelFp16 : public InnerKernel { + public: + explicit ResizeGradCPUKernelFp16(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::InnerContext *ctx) + : InnerKernel(parameter, inputs, outputs, ctx) {} + ~ResizeGradCPUKernelFp16() override = default; + int Init() override; + int ReSize() override; + int Run() override; + int ExecuteInit(int task_id); + int Execute(int task_id); +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_RESIZE_FP16_GRAD_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/strided_slice_fp16_grad.cc b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/strided_slice_fp16_grad.cc new file mode 100644 index 00000000000..213ca109a36 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/strided_slice_fp16_grad.cc @@ -0,0 +1,154 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/kernel/arm/fp16_grad/strided_slice_fp16_grad.h" +#include +#include +#include "schema/model_generated.h" +#include "src/kernel_registry.h" +#include "nnacl/fp16_grad/strided_slice_grad.h" +#include "src/ops/populate/strided_slice_populate.h" +#include "include/errorcode.h" + +using mindspore::kernel::KERNEL_ARCH::kCPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_StridedSliceGrad; + +namespace mindspore::kernel { + +int StridedSliceGradCPUKernelFp16::Init() { + if (!InferShapeDone()) { + return RET_OK; + } + param_ = reinterpret_cast(op_parameter_); + auto input = in_tensors_.at(0); + MS_ASSERT(input); + switch (input->data_type()) { + case kNumberTypeFloat16: + param_->data_type = kDataTypeFloat16; + break; + default: + MS_LOG(ERROR) << "Not supported data type: " << input->data_type(); + return RET_ERROR; + } + FillEmptyDims(); + FillOutputDim(); + return ReSize(); +} + +void StridedSliceGradCPUKernelFp16::FillEmptyDims() { + int32_t begins[DIMENSION_7D]; + int32_t ends[DIMENSION_7D]; + int32_t strides[DIMENSION_7D]; + int32_t input_shape[DIMENSION_7D]; + int32_t i; + for (i = 0; i < param_->num_axes_; ++i) { + begins[i] = param_->begins_[i]; + ends[i] = MSMIN(param_->ends_[i], param_->in_shape_[i]); + strides[i] = param_->strides_[i]; + input_shape[i] = param_->in_shape_[i]; + } + for (i = param_->num_axes_; i < param_->in_shape_length_; ++i) { + input_shape[i] = param_->in_shape_[i]; + begins[i] = 0; + ends[i] = param_->in_shape_[i]; + strides[i] = 1; + } + + int32_t real_index = param_->in_shape_length_ - 1; + for (i = DIMENSION_7D - 1; i >= 0; --i) { + if (real_index >= 0) { + param_->begins_[i] = begins[real_index]; + param_->ends_[i] = ends[real_index]; + param_->strides_[i] = strides[real_index]; + param_->in_shape_[i] = input_shape[real_index--]; + } else { + param_->begins_[i] = 0; + param_->ends_[i] = 1; + param_->strides_[i] = 1; + param_->in_shape_[i] = 1; + } + } + param_->num_axes_ = DIMENSION_7D; + param_->in_shape_length_ = DIMENSION_7D; + + for (i = 0; i < DIMENSION_7D; ++i) { + if (param_->begins_[i] < 0) { + param_->begins_[i] += param_->in_shape_[i]; + } + if (param_->ends_[i] < 0) { + param_->ends_[i] += param_->in_shape_[i]; + } + } +} + +void StridedSliceGradCPUKernelFp16::FillOutputDim() { + auto output = out_tensors_.at(0); + size_t out_size = output->shape().size(); + for (size_t i = 0; i < DIMENSION_7D; i++) { + if (i < out_size) { + output_shape_.push_back(output->shape()[i]); + } else { + output_shape_.insert(output_shape_.begin(), 1); + } + } +} + +int StridedSliceGradCPUKernelFp16::ReSize() { return RET_OK; } + +int StridedSliceFp16GradImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) { + MS_ASSERT(cdata != nullptr); + auto slice = reinterpret_cast(cdata); + auto error_code = slice->Execute(task_id); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "StridedSliceGrad Run error task_id[" << task_id << "] error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +int StridedSliceGradCPUKernelFp16::Run() { + int error_code = static_cast(this->context_) + ->thread_pool_->ParallelLaunch(StridedSliceFp16GradImpl, this, 1); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "Strided slice error error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +int StridedSliceGradCPUKernelFp16::Execute(int task_id) { + auto input = in_tensors_.at(0); + auto output = 
out_tensors_.at(0); + MS_ASSERT(output); + + int *po = output_shape_.data(); + auto dx = reinterpret_cast(output->data_c()); + auto dy = reinterpret_cast(input->data_c()); + + std::fill(dx, dx + output->ElementsNum(), 0.f); + auto ret = DoStridedSliceFp16Grad(dy, dx, po, param_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "StridedSliceGrad error error_code[" << ret << "]"; + return RET_ERROR; + } + return RET_OK; +} + +REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_StridedSliceGrad, LiteKernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/strided_slice_fp16_grad.h b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/strided_slice_fp16_grad.h new file mode 100644 index 00000000000..62dc1efb4f9 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/strided_slice_fp16_grad.h @@ -0,0 +1,49 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_STRIDED_SLICE_FP16_GRAD_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_STRIDED_SLICE_FP16_GRAD_H_ + +#include +#include "nnacl/fp16_grad/strided_slice_grad.h" +#include "src/lite_kernel.h" + +namespace mindspore::kernel { +class StridedSliceGradCPUKernelFp16 : public InnerKernel { + public: + StridedSliceGradCPUKernelFp16(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::InnerContext *ctx) + : InnerKernel(parameter, inputs, outputs, ctx) { + param_ = reinterpret_cast(parameter); + } + ~StridedSliceGradCPUKernelFp16() override = default; + + int Init() override; + int ReSize() override; + int Run() override; + int Execute(int task_id); + + private: + void FillEmptyDims(); + void FillOutputDim(); + void ParseMasks(); + + StridedSliceParameter *param_; + std::vector output_shape_; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_STRIDED_SLICE_FP16_GRAD_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/unsorted_segment_sum_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/unsorted_segment_sum_fp16.cc new file mode 100644 index 00000000000..95a26470efb --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/unsorted_segment_sum_fp16.cc @@ -0,0 +1,98 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/kernel/arm/fp16_grad/unsorted_segment_sum_fp16.h" +#include +#include +#include "schema/model_generated.h" +#include "src/kernel_registry.h" +#include "nnacl/fp16_grad/unsorted_segment_sum.h" +#include "include/errorcode.h" + +using mindspore::kernel::KERNEL_ARCH::kCPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_UnsortedSegmentSum; + +namespace mindspore::kernel { + +int UnsortedSegmentSumCPUKernelFp16::Init() { + if (!InferShapeDone()) { + return RET_OK; + } + auto input_shape = in_tensors_.at(0)->shape(); + auto segment_ids_shape = in_tensors_.at(1)->shape(); + auto output_shape = out_tensors_.at(0)->shape(); + unit_num_ = 1; + input_dim1_ = 1; + for (size_t i = 0; i < input_shape.size(); ++i) { + unit_num_ *= input_shape[i]; + if (i >= segment_ids_shape.size()) { + input_dim1_ *= input_shape[i]; + } + } + output_dim0_ = output_shape[0]; + output_dim1_ = 1; + for (size_t j = 1; j < output_shape.size(); j++) { + output_dim1_ *= output_shape[j]; + } + return RET_OK; +} + +int UnsortedSegmentSumCPUKernelFp16::ReSize() { return RET_OK; } + +int UnsortedSegmentSumFp16Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) { + MS_ASSERT(cdata != nullptr); + auto kernel = reinterpret_cast(cdata); + auto error_code = kernel->Execute(task_id); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "UnsortedSegmentSum Run error task_id[" << task_id << "] error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +int UnsortedSegmentSumCPUKernelFp16::Run() { + int error_code = static_cast(this->context_) + ->thread_pool_->ParallelLaunch(UnsortedSegmentSumFp16Run, this, 1); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "Strided slice error error_code[" << error_code << "]"; + return RET_ERROR; + } + return RET_OK; +} + +int UnsortedSegmentSumCPUKernelFp16::Execute(int task_id) { + int ret; + auto input_tensor = in_tensors_.at(0); + auto indices_tensor = in_tensors_.at(1); + auto output_tensor = out_tensors_.at(0); + float16_t *input = reinterpret_cast(input_tensor->data_c()); + int *indices = reinterpret_cast(indices_tensor->data_c()); + float16_t *output = reinterpret_cast(output_tensor->data_c()); + std::fill(output, output + output_tensor->ElementsNum(), 0.f); + ret = UnsortedSegmentSumFp16(input, unit_num_, input_dim1_, indices, output, output_dim0_, output_dim1_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "UnsortedSegmentSumFp16 error error_code[" << ret << "]"; + return RET_ERROR; + } + return RET_OK; +} + +REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_UnsortedSegmentSum, + LiteKernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/unsorted_segment_sum_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/unsorted_segment_sum_fp16.h new file mode 100644 index 00000000000..8108802a70c --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/unsorted_segment_sum_fp16.h @@ -0,0 +1,44 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_UNSORTED_SEGMENT_SUM_FP16_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_UNSORTED_SEGMENT_SUM_FP16_H_ + +#include +#include "src/lite_kernel.h" + +namespace mindspore::kernel { +class UnsortedSegmentSumCPUKernelFp16 : public InnerKernel { + public: + UnsortedSegmentSumCPUKernelFp16(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::InnerContext *ctx) + : InnerKernel(parameter, inputs, outputs, ctx) {} + ~UnsortedSegmentSumCPUKernelFp16() override = default; + + int Init() override; + int ReSize() override; + int Run() override; + int Execute(int task_id); + size_t unit_num_ = 0; + size_t input_dim1_ = 0; + size_t output_dim0_ = 0; + size_t output_dim1_ = 0; + + private: +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_UNSORTED_SEGMENT_SUM_FP16_H_ diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp16_grad/activation_grad_fp16_test.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp16_grad/activation_grad_fp16_test.cc index 0eec475d2ba..1d7a17bdb4e 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp16_grad/activation_grad_fp16_test.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp16_grad/activation_grad_fp16_test.cc @@ -58,7 +58,7 @@ TEST_F(TestActGradFp16, ReluGradFp16) { input_buf[i] = (float16_t)input_data[i]; } - Fp16ReluGrad(yt_buf, input_buf, 50, output_buf); + ReluFp16Grad(yt_buf, input_buf, 50, output_buf); int res = 0; float error = 0; @@ -113,7 +113,7 @@ TEST_F(TestActGradFp16, SigmoidGradFp16) { input_buf[i] = (float16_t)input_data[i]; } - Fp16SigmoidGrad(yt_buf, input_buf, 50, output_buf); + SigmoidFp16Grad(yt_buf, input_buf, 50, output_buf); int res = 0; float error = 0;
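Note: every kernel in this patch partitions its work the same way before calling into nnacl — stride = UP_DIV(total, thread_num), count = MSMIN(stride, total - stride * task_id), with count clamped to zero for surplus tasks. The standalone sketch below is only an illustration of that slicing pattern, not library code: UpDiv stands in for the UP_DIV macro from nnacl/op_base.h, std::min stands in for MSMIN, and the concrete total/thread_num values are hypothetical.

// Minimal sketch of the per-task partitioning used by the fp16 grad kernels
// above (BN grad, conv grad filter/input, pooling grad, neg grad, ...).
// UpDiv mirrors the assumed UP_DIV macro; values below are illustrative.
#include <algorithm>
#include <cstdio>

static int UpDiv(int a, int b) { return (a + b - 1) / b; }

int main() {
  const int total = 10;      // e.g. batch (or spatial * batch) elements
  const int thread_num = 4;  // corresponds to context_->thread_num_
  const int stride = UpDiv(total, thread_num);
  for (int task_id = 0; task_id < thread_num; ++task_id) {
    int count = std::min(stride, total - stride * task_id);
    count = (count < 0) ? 0 : count;  // tasks past the end get an empty range
    const int start = stride * task_id;
    std::printf("task %d: start=%d count=%d\n", task_id, start, count);
  }
  return 0;
}

With total = 10 and thread_num = 4 this prints the ranges [0,3), [3,6), [6,9) and [9,10), which is how each Execute(task_id) above selects the slice of the batch it processes before handing it to the corresponding nnacl fp16 grad routine.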