float16 kernels

yoni 2021-06-09 10:41:14 +03:00
parent 346a35ba7d
commit 394a8398e2
56 changed files with 4276 additions and 40 deletions

View File

@@ -14,21 +14,26 @@
* limitations under the License.
*/
#include <math.h>
#include "nnacl/op_base.h"
#include "nnacl/fp16_grad/activation_grad.h"
#include <math.h>
#include <float.h>
#ifdef ENABLE_NEON
#include <arm_neon.h>
#include "nnacl/fp32/exp_fp32.h"
#endif
#include "nnacl/op_base.h"
#include "nnacl/errorcode.h"
int Fp16ReluGrad(const float16_t *src0, const float16_t *src1, size_t length, float16_t *dst) {
int ReluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) {
int i = 0;
#ifdef ENABLE_NEON
float16x8_t zero_4 = vdupq_n_f16(0);
for (; i < length - 4; i += 4) {
float16x8_t src0_4 = vld1q_f16(src0 + i);
float16x8_t src1_4 = vld1q_f16(src1 + i);
uint16x8_t mask_4 = vcgtq_f16(src1_4, zero_4);
float16x8_t dst_4 = vbslq_f16(mask_4, src0_4, zero_4);
vst1q_f16(dst + i, dst_4);
float16x8_t zero_v = vdupq_n_f16(0);
for (; i <= length - C8NUM; i += C8NUM) {
float16x8_t src0_v = vld1q_f16(src0 + i);
float16x8_t src1_v = vld1q_f16(src1 + i);
uint16x8_t mask_v = vcgtq_f16(src1_v, zero_v);
float16x8_t dst_v = vbslq_f16(mask_v, src0_v, zero_v);
vst1q_f16(dst + i, dst_v);
}
#endif
for (; i < length; i++) {
@@ -37,15 +42,43 @@ int Fp16ReluGrad(const float16_t *src0, const float16_t *src1, size_t length, fl
return NNACL_OK;
}
int Fp16SigmoidGrad(const float16_t *src0, const float16_t *src1, size_t length, float16_t *dst) {
int Relu6Fp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) {
int i = 0;
#ifdef ENABLE_NEON
float16x8_t one_4 = vdupq_n_f16(1);
for (; i < length - 4; i += 4) {
float16x8_t src0_4 = vld1q_f16(src0 + i);
float16x8_t src1_4 = vld1q_f16(src1 + i);
float16x8_t dst_4 = vmulq_f16(src0_4, vmulq_f16(src1_4, vsubq_f16(one_4, src1_4)));
vst1q_f16(dst + i, dst_4);
float16x8_t zero_8 = vdupq_n_f16(0);
float16x8_t six_8 = vdupq_n_f16(6);
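// relu6 passes the gradient exactly where clamp(x, 0, 6) == x, so compare the clamped value against the input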
for (; i <= length - C8NUM; i += C8NUM) {
float16x8_t src1_8 = vld1q_f16(src1 + i);
float16x8_t src0_8 = vld1q_f16(src0 + i);
float16x8_t max_8 = vmaxq_f16(src1_8, zero_8);
float16x8_t min_max_8 = vminq_f16(max_8, six_8);
uint16x8_t mask_8 = vceqq_f16(min_max_8, src1_8);
float16x8_t dst_8 = vbslq_f16(mask_8, src0_8, zero_8);
vst1q_f16(dst + i, dst_8);
}
#endif
for (; i < length; ++i) {
dst[i] = (src1[i] > 0.0f && src1[i] <= 6.0f) ? src0[i] : 0.0f;
}
return NNACL_OK;
}
int LReluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst, float16_t alpha) {
// leaky relu: dy passes through where the input was positive and is scaled by alpha elsewhere
for (int i = 0; i < length; ++i) {
dst[i] = src1[i] > 0.0f ? src0[i] : src0[i] * alpha;
}
return NNACL_OK;
}
int SigmoidFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) {
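// sigmoid'(x) = s * (1 - s); src1 holds s = sigmoid(x) and src0 holds the incoming gradient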
int i = 0;
#ifdef ENABLE_NEON
float16x8_t one_8 = vdupq_n_f16(1);
for (; i <= length - C8NUM; i += C8NUM) {
float16x8_t src0_8 = vld1q_f16(src0 + i);
float16x8_t src1_8 = vld1q_f16(src1 + i);
float16x8_t dst_8 = vmulq_f16(src0_8, vmulq_f16(src1_8, vsubq_f16(one_8, src1_8)));
vst1q_f16(dst + i, dst_8);
}
#endif
for (; i < length; i++) {
@@ -53,3 +86,56 @@ int Fp16SigmoidGrad(const float16_t *src0, const float16_t *src1, size_t length,
}
return NNACL_OK;
}
int TanhFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) {
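// tanh'(x) = 1 - tanh(x)^2; src1 holds tanh(x), computed in fp32 to limit rounding error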
for (int i = 0; i < length; ++i) {
dst[i] = (float16_t)((1.0f - ((float)src1[i] * (float)src1[i])) * (float)src0[i]);
}
return NNACL_OK;
}
int HSwishFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) {
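// hswish(x) = x * relu6(x + 3) / 6, so the derivative is 0 below -3, 1 above 3 and (2x + 3) / 6 in between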
for (int i = 0; i < length; ++i) {
float16_t tmp = (src1[i] > 3.0f ? 1.0f : (src1[i] < -3.0f ? 0.0f : (2.0f * src1[i] + 3.0f) / 6.0f));
dst[i] = tmp * src0[i];
}
return NNACL_OK;
}
int HSigmoidFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) {
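// hsigmoid'(x) = 1/6 inside (-3, 3) and 0 outside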
for (int i = 0; i < length; ++i) {
float16_t tmp = (src1[i] > 3.0f ? 0.0f : (src1[i] < -3.0f ? 0.0f : 1.0f / 6.0f));
dst[i] = tmp * src0[i];
}
return NNACL_OK;
}
int EluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst, float16_t alpha) {
int i = 0;
#ifdef ENABLE_NEON
float16x4_t zero_4 = vdup_n_f16(0);
float16x4_t one_4 = vdup_n_f16(1);
float16x4_t alpha_4 = vdup_n_f16(alpha);
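// x > 0 passes dy through; otherwise dy * alpha * expm1(x), with exp evaluated in fp32 via simd_exp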
for (; i <= length - C4NUM; i += C4NUM) {
float16x4_t src0_4 = vld1_f16(src0 + i);
float16x4_t src1_4 = vld1_f16(src1 + i);
uint16x4_t mask_4 = vcgt_f16(src1_4, zero_4);
float32x4_t tmp;
simd_exp(vcvt_f32_f16(src1_4), (float *)&tmp);
float16x4_t expm1_4 = vsub_f16(vcvt_f16_f32(tmp), one_4);
float16x4_t dst_4 = vbsl_f16(mask_4, src0_4, vmul_f16(alpha_4, vmul_f16(expm1_4, src0_4)));
vst1_f16(dst + i, dst_4);
}
#endif
for (; i < length; ++i) {
dst[i] = (src1[i] > 0.0f ? src0[i] : alpha * expm1(src1[i]) * src0[i]);
}
return NNACL_OK;
}
int GeluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) {
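// dGELU/dx = Phi(x) + x * phi(x); 1.4142135623730951 is sqrt(2) and 2.5066282746 approximates sqrt(2 * pi)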
for (int i = 0; i < length; ++i) {
dst[i] = src0[i] * ((0.5 * (1.0 + erf(src1[i] / 1.4142135623730951))) +
(src1[i] * exp(-0.5 * src1[i] * src1[i]) / 2.5066282746));
}
return NNACL_OK;
}

View File

@@ -13,8 +13,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_NNACL_FP16_GRAD_ACTIVATION_GRAD_H_
#define MINDSPORE_NNACL_FP16_GRAD_ACTIVATION_GRAD_H_
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_ACTIVATION_GRAD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_ACTIVATION_GRAD_H_
#ifdef ENABLE_NEON
#include <arm_neon.h>
@@ -23,20 +23,22 @@
#include "nnacl/op_base.h"
#include "nnacl/int8/fixed_point.h"
typedef struct ActivationGradParameterFp16 {
OpParameter op_parameter;
int type_;
float alpha_;
} ActivationGradParameterFp16;
#ifdef __cplusplus
extern "C" {
#endif
int Fp16ReluGrad(const float16_t *src0, const float16_t *src1, size_t length, float16_t *dst);
int Fp16SigmoidGrad(const float16_t *src0, const float16_t *src1, size_t length, float16_t *dst);
int ReluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst);
int Relu6Fp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst);
int LReluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst, float16_t alpha);
int SigmoidFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst);
int TanhFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst);
int HSwishFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst);
int HSigmoidFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst);
int EluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst, float16_t alpha);
int GeluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_NNACL_FP16_GRAD_ACTIVATION_GRAD_H_
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_ACTIVATION_GRAD_H_

View File

@@ -0,0 +1,157 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/fp16_grad/arithmetic_grad.h"
#include <string.h>
#include <math.h>
#include "nnacl/fp32_grad/utils.h"
#include "nnacl/errorcode.h"
void ElementDivNegSquareFp16(const float16_t *nom, const float16_t *denom, float16_t *output, int element_size) {
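// gradient of a quotient w.r.t. its denominator: d(nom / denom)/d(denom) = -nom / denom^2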
for (int i = 0; i < element_size; i++) {
output[i] = -nom[i] / (denom[i] * denom[i]);
}
}
void ElementMulAndDivNegSquareFp16(const float16_t *a, const float16_t *b, const float16_t *denom, float16_t *output,
int element_size) {
for (int i = 0; i < element_size; i++) {
output[i] = -a[i] * b[i] / (denom[i] * denom[i]);
}
}
int ElementAbsGradFp16(const float16_t *in1, const float16_t *in2, float16_t *out, int element_size) {
for (int i = 0; i < element_size; i++) {
out[i] = (in1[i] < 0.f) ? -in2[i] : ((in1[i] > 0.f) ? in2[i] : 0);
}
return NNACL_OK;
}
void MaximumByAxesFp16(const float16_t *input0, const float16_t *input1, const float16_t *dy, const int *input0_dims,
const int *input1_dims, const int *dy_dims, float16_t *output0, float16_t *output1,
int num_dims) {
int num_output0 = 1;
int num_output1 = 1;
bool same_shape = true;
for (int idx = 0; idx < num_dims; ++idx) {
num_output0 *= input0_dims[idx];
num_output1 *= input1_dims[idx];
if (input0_dims[idx] != input1_dims[idx]) {
same_shape = false;
}
}
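// equal shapes: route dy elementwise to whichever input won; broadcast shapes: accumulate dy into the size-1 axes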
if (same_shape) {
int input_iter[8] = {0};
// Iterate through input_data.
do {
size_t offset = GetInputOffset(num_dims, input0_dims, input_iter);
output0[offset] = input0[offset] > input1[offset] ? dy[offset] : 0.;
output1[offset] = input1[offset] >= input0[offset] ? dy[offset] : 0.;
} while (NextIndex(num_dims, input0_dims, input_iter));
} else {
memset(output0, 0, num_output0 * sizeof(float16_t)); // zero output
memset(output1, 0, num_output1 * sizeof(float16_t)); // zero output
int input_iter[8] = {0};
int axes0[5] = {0};
int axes1[5] = {0};
int num_axes0 = 0;
int num_axes1 = 0;
for (int i = 0; i < num_dims; i++) {
if (input0_dims[i] == 1) {
axes0[num_axes0++] = i;
}
if (input1_dims[i] == 1) {
axes1[num_axes1++] = i;
}
}
do {
size_t offset0 = GetOutputOffset(num_dims, input0_dims, input_iter, num_axes0, axes0);
size_t offset1 = GetOutputOffset(num_dims, input1_dims, input_iter, num_axes1, axes1);
size_t yt_offset = GetInputOffset(num_dims, input0_dims, input_iter);
output0[offset0] += input0[offset0] > input1[offset1] ? dy[yt_offset] : 0.;
output1[offset1] += input1[offset1] >= input0[offset0] ? dy[yt_offset] : 0.;
} while (NextIndex(num_dims, dy_dims, input_iter));
}
}
void MinimumByAxesFp16(const float16_t *input0, const float16_t *input1, const float16_t *dy, const int *input0_dims,
const int *input1_dims, const int *dy_dims, float16_t *output0, float16_t *output1,
int num_dims) {
int num_output0 = 1;
int num_output1 = 1;
bool same_shape = true;
for (int idx = 0; idx < num_dims; ++idx) {
num_output0 *= input0_dims[idx];
num_output1 *= input1_dims[idx];
if (input0_dims[idx] != input1_dims[idx]) {
same_shape = false;
}
}
if (same_shape) {
int input_iter[8] = {0};
// Iterate through input_data.
do {
size_t offset = GetInputOffset(num_dims, input0_dims, input_iter);
output0[offset] = input0[offset] < input1[offset] ? dy[offset] : 0.;
output1[offset] = input1[offset] <= input0[offset] ? dy[offset] : 0.;
} while (NextIndex(num_dims, input0_dims, input_iter));
} else {
memset(output0, 0, num_output0 * sizeof(float16_t)); // zero output
memset(output1, 0, num_output1 * sizeof(float16_t)); // zero output
int input_iter[8] = {0};
int axes0[5] = {0};
int axes1[5] = {0};
int num_axes0 = 0;
int num_axes1 = 0;
for (int i = 0; i < num_dims; i++) {
if (input0_dims[i] == 1) {
axes0[num_axes0++] = i;
}
if (input1_dims[i] == 1) {
axes1[num_axes1++] = i;
}
}
do {
size_t offset0 = GetOutputOffset(num_dims, input0_dims, input_iter, num_axes0, axes0);
size_t offset1 = GetOutputOffset(num_dims, input1_dims, input_iter, num_axes1, axes1);
size_t yt_offset = GetInputOffset(num_dims, input0_dims, input_iter);
output0[offset0] += input0[offset0] < input1[offset1] ? dy[yt_offset] : 0.;
output1[offset1] += input1[offset1] <= input0[offset0] ? dy[yt_offset] : 0.;
} while (NextIndex(num_dims, dy_dims, input_iter));
}
}
int ElementSqrtGradFp16(const float16_t *in1, const float16_t *in2, float16_t *out, const int element_size) {
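// in1 holds sqrt(x); d sqrt(x)/dx = 1 / (2 * sqrt(x))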
for (int i = 0; i < element_size; i++) {
out[i] = 0.5f * in2[i] / in1[i];
}
return NNACL_OK;
}
int ElementRsqrtGradFp16(const float16_t *in1, const float16_t *in2, float16_t *out, const int element_size) {
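// in1 holds y = x^(-1/2); dy/dx = -0.5 * x^(-3/2) = -0.5 * y^3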
for (int i = 0; i < element_size; i++) {
out[i] = -0.5f * in2[i] * in1[i] * in1[i] * in1[i];
}
return NNACL_OK;
}

View File

@@ -0,0 +1,41 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_ARITHMETIC_GRAD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_ARITHMETIC_GRAD_H_
#include "nnacl/op_base.h"
#ifdef __cplusplus
extern "C" {
#endif
void ElementDivNegSquareFp16(const float16_t *nom, const float16_t *denom, float16_t *output, int element_size);
void ElementMulAndDivNegSquareFp16(const float16_t *a, const float16_t *b, const float16_t *denom, float16_t *output,
int element_size);
int ElementAbsGradFp16(const float16_t *in1, const float16_t *in2, float16_t *out, int element_size);
void MaximumByAxesFp16(const float16_t *input0, const float16_t *input1, const float16_t *dy, const int *input0_dims,
const int *input1_dims, const int *dy_dims, float16_t *output0, float16_t *output1,
int num_dims);
void MinimumByAxesFp16(const float16_t *input0, const float16_t *input1, const float16_t *dy, const int *input0_dims,
const int *input1_dims, const int *dy_dims, float16_t *output0, float16_t *output1,
int num_dims);
int ElementSqrtGradFp16(const float16_t *in1, const float16_t *in2, float16_t *out, const int element_size);
int ElementRsqrtGradFp16(const float16_t *in1, const float16_t *in2, float16_t *out, const int element_size);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_ARITHMETIC_GRAD_H_

View File

@@ -0,0 +1,86 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <math.h>
#include <string.h>
#include "nnacl/fp16_grad/batch_norm.h"
void var2InvarFp16(float16_t *save_var, int size, float eps) {
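// converts the saved variance to 1 / sqrt(var + eps) in place, as consumed by the backward kernels below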
for (int i = 0; i < size; i++) {
save_var[i] = (float16_t)(1.0f / sqrtf((float)save_var[i] + eps));
}
}
void backwardAllFp16(const float16_t *restrict in, const float16_t *restrict yt, const float16_t *restrict mean,
const float16_t *restrict invar, const float16_t *restrict scale, int size, int ch,
float *restrict dxhat_sum, float *restrict dxhathat_sum, float16_t *restrict dbias,
float16_t *restrict dscale, float16_t *restrict dx) {
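// standard batchnorm backward: the first pass accumulates dbias, dscale and the two x_hat reductions,
// the second forms dx = invar / N * (N * dx_hat - sum(dx_hat) - x_hat * sum(dx_hat * x_hat))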
float N = (float)size;
for (int i = 0; i < size; i++) {
for (int c = 0; c < ch; c++) {
int ix = i * ch + c;
dbias[c] += yt[ix];
// dscale
float16_t x_hat = (in[ix] - mean[c]) * invar[c];
dscale[c] += (yt[ix] * x_hat);
// dx_1
float dx_hat = (float)(yt[ix] * scale[c]);
dxhat_sum[c] += dx_hat;
dxhathat_sum[c] += (float)(dx_hat * x_hat);
}
}
for (int i = 0; i < size; i++) {
for (int c = 0; c < ch; c++) {
// dx_2
int ix = i * ch + c;
float16_t x_hat = (in[ix] - mean[c]) * invar[c];
float16_t dx_hat = yt[ix] * scale[c];
dx[ix] = 1.0f / N * (float)((invar[c]) * (N * dx_hat - dxhat_sum[c] - x_hat * dxhathat_sum[c]));
}
}
}
void backwardP1Fp16(const float16_t *restrict in, const float16_t *restrict yt, const float16_t *restrict mean,
const float16_t *restrict invar, const float16_t *restrict scale, int size, int ch,
float *restrict dxhat_sum, float *restrict dxhathat_sum, float16_t *restrict dbias,
float16_t *restrict dscale) {
for (int i = 0; i < size; i++) {
for (int c = 0; c < ch; c++) {
int ix = i * ch + c;
dbias[c] += yt[ix];
// dscale
float x_hat = (float)((in[ix] - mean[c]) * invar[c]);
dscale[c] += (yt[ix] * x_hat);
// dx_1
float dx_hat = (float)(yt[ix] * scale[c]);
dxhat_sum[c] += dx_hat;
dxhathat_sum[c] += dx_hat * x_hat;
}
}
}
void backwardP2Fp16(const float16_t *restrict in, const float16_t *restrict yt, const float16_t *restrict mean,
const float16_t *restrict invar, const float16_t *restrict scale, int size, int total_size, int ch,
const float *dxhat_sum, const float *dxhathat_sum, float16_t *restrict dx) {
const float N = (float)total_size;
for (int i = 0; i < size; i++) {
for (int c = 0; c < ch; c++) {
// dx_2
int ix = i * ch + c;
float x_hat = (float)((in[ix] - mean[c]) * invar[c]);
float dx_hat = (float)(yt[ix] * scale[c]);
dx[ix] = (float16_t)(1.0f / N * (float)(invar[c]) * (N * dx_hat - dxhat_sum[c] - x_hat * dxhathat_sum[c]));
}
}
}

View File

@@ -0,0 +1,40 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_BATCH_NORM_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_BATCH_NORM_H_
#include "nnacl/op_base.h"
#ifdef __cplusplus
extern "C" {
#endif
void var2InvarFp16(float16_t *save_var, int size, float eps);
void backwardAllFp16(const float16_t *in, const float16_t *yt, const float16_t *mean, const float16_t *invar,
const float16_t *scale, int size, int ch, float *dxhat_sum, float *dxhathat_sum, float16_t *dbias,
float16_t *dscale, float16_t *dx);
void backwardP1Fp16(const float16_t *in, const float16_t *yt, const float16_t *mean, const float16_t *invar,
const float16_t *scale, int size, int ch, float *dxhat_sum, float *dxhathat_sum, float16_t *dbias,
float16_t *dscale);
void backwardP2Fp16(const float16_t *in, const float16_t *yt, const float16_t *mean, const float16_t *invar,
const float16_t *scale, int size, int total_size, int ch, const float *dxhat_sum,
const float *dxhathat_sum, float16_t *dx);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_BATCH_NORM_H_

View File

@@ -0,0 +1,360 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/fp16_grad/convolution_grad_filter.h"
#include "nnacl/intrinsics/ms_simd_instructions_fp16.h"
#ifdef ENABLE_NEON
#include <arm_neon.h>
#endif
#ifdef ENABLE_NEON
static int FilterGrad32Arm(const float16_t *x, const float16_t *dy, int i_c, int k_idx, float16_t *dw,
const ConvParameter *conv_param) {
int in_h = conv_param->input_h_;
int in_w = conv_param->input_w_;
int k_h = conv_param->kernel_h_;
int k_w = conv_param->kernel_w_;
int batch = conv_param->output_batch_;
int out_ch = conv_param->output_channel_;
int in_ch = conv_param->input_channel_;
int out_h = conv_param->output_h_;
int out_w = conv_param->output_w_;
int m = out_h * out_w;
int x_size = in_h * in_w * in_ch;
int y_size = out_ch * out_h * out_w;
int k_spatial = k_w * k_h;
int i_kh = k_idx / k_w;
int i_kw = k_idx % k_w;
for (; i_c < (out_ch & ~31); i_c += 32) {
float32x4_t sum_0 = vdupq_n_f32(0.0f);
float32x4_t sum_1 = vdupq_n_f32(0.0f);
float32x4_t sum_2 = vdupq_n_f32(0.0f);
float32x4_t sum_3 = vdupq_n_f32(0.0f);
float32x4_t sum_4 = vdupq_n_f32(0.0f);
float32x4_t sum_5 = vdupq_n_f32(0.0f);
float32x4_t sum_6 = vdupq_n_f32(0.0f);
float32x4_t sum_7 = vdupq_n_f32(0.0f);
for (int b = 0; b < batch; ++b) {
const float16_t *x_addr = &x[b * x_size];
const float16_t *dy_addr = &dy[b * y_size];
for (int i = 0; i < m; i++) {
int idx = i;
int input_h = idx / out_w * conv_param->stride_h_;
int input_w = idx % out_w * conv_param->stride_w_;
int input_row = -conv_param->pad_u_ + i_kh + input_h;
int input_col = -conv_param->pad_l_ + i_kw + input_w;
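// the unsigned casts fold the two bounds checks (row >= 0 && row < in_h) into a single compare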
if (((unsigned)(input_row) < (unsigned)(in_h)) && ((unsigned)(input_col) < (unsigned)(in_w))) {
int offset_x = (input_row * in_w + input_col) * out_ch + i_c;
int offset_dy = idx * out_ch + i_c;
float16x8_t x_0 = vld1q_f16(x_addr + offset_x);
float16x8_t dy_0 = vld1q_f16(dy_addr + offset_dy);
sum_0 = MS_VMLAL_F16(vget_low_f16(x_0), vget_low_f16(dy_0), sum_0);
sum_1 = MS_VMLAL_F16(vget_high_f16(x_0), vget_high_f16(dy_0), sum_1);
float16x8_t x_1 = vld1q_f16(x_addr + offset_x + 8);
float16x8_t dy_1 = vld1q_f16(dy_addr + offset_dy + 8);
sum_2 = MS_VMLAL_F16(vget_low_f16(x_1), vget_low_f16(dy_1), sum_2);
sum_3 = MS_VMLAL_F16(vget_high_f16(x_1), vget_high_f16(dy_1), sum_3);
float16x8_t x_2 = vld1q_f16(x_addr + offset_x + 16);
float16x8_t dy_2 = vld1q_f16(dy_addr + offset_dy + 16);
sum_4 = MS_VMLAL_F16(vget_low_f16(x_2), vget_low_f16(dy_2), sum_4);
sum_5 = MS_VMLAL_F16(vget_high_f16(x_2), vget_high_f16(dy_2), sum_5);
float16x8_t x_3 = vld1q_f16(x_addr + offset_x + 24);
float16x8_t dy_3 = vld1q_f16(dy_addr + offset_dy + 24);
sum_6 = MS_VMLAL_F16(vget_low_f16(x_3), vget_low_f16(dy_3), sum_6);
sum_7 = MS_VMLAL_F16(vget_high_f16(x_3), vget_high_f16(dy_3), sum_7);
}
}
}
// store into memory
for (int l = 0; l < 4; l++) {
dw[(i_c + l) * k_spatial + k_idx] = sum_0[l];
dw[(i_c + 4 + l) * k_spatial + k_idx] = sum_1[l];
dw[(i_c + 8 + l) * k_spatial + k_idx] = sum_2[l];
dw[(i_c + 12 + l) * k_spatial + k_idx] = sum_3[l];
dw[(i_c + 16 + l) * k_spatial + k_idx] = sum_4[l];
dw[(i_c + 20 + l) * k_spatial + k_idx] = sum_5[l];
dw[(i_c + 24 + l) * k_spatial + k_idx] = sum_6[l];
dw[(i_c + 28 + l) * k_spatial + k_idx] = sum_7[l];
}
}
return i_c;
}
static int FilterGrad16Arm(const float16_t *x, const float16_t *dy, int i_c, int k_idx, float16_t *dw,
const ConvParameter *conv_param) {
int in_h = conv_param->input_h_;
int in_w = conv_param->input_w_;
int k_h = conv_param->kernel_h_;
int k_w = conv_param->kernel_w_;
int batch = conv_param->output_batch_;
int out_ch = conv_param->output_channel_;
int in_ch = conv_param->input_channel_;
int out_h = conv_param->output_h_;
int out_w = conv_param->output_w_;
int m = out_h * out_w;
int x_size = in_h * in_w * in_ch;
int y_size = out_ch * out_h * out_w;
int k_spatial = k_w * k_h;
int i_kh = k_idx / k_w;
int i_kw = k_idx % k_w;
for (; i_c < (out_ch & ~15); i_c += 16) {
float32x4_t sum_0 = vdupq_n_f32(0.0f);
float32x4_t sum_1 = vdupq_n_f32(0.0f);
float32x4_t sum_2 = vdupq_n_f32(0.0f);
float32x4_t sum_3 = vdupq_n_f32(0.0f);
for (int b = 0; b < batch; ++b) {
const float16_t *x_addr = &x[b * x_size];
const float16_t *dy_addr = &dy[b * y_size];
for (int i = 0; i < m; i++) {
int idx = i;
int input_h = idx / out_w * conv_param->stride_h_;
int input_w = idx % out_w * conv_param->stride_w_;
int input_row = -conv_param->pad_u_ + i_kh + input_h;
int input_col = -conv_param->pad_l_ + i_kw + input_w;
if (((unsigned)(input_row) < (unsigned)(in_h)) && ((unsigned)(input_col) < (unsigned)(in_w))) {
int offset_x = (input_row * in_w + input_col) * out_ch + i_c;
int offset_dy = idx * out_ch + i_c;
float16x8_t x_0 = vld1q_f16(x_addr + offset_x);
float16x8_t dy_0 = vld1q_f16(dy_addr + offset_dy);
sum_0 = MS_VMLAL_F16(vget_low_f16(x_0), vget_low_f16(dy_0), sum_0);
sum_1 = MS_VMLAL_F16(vget_high_f16(x_0), vget_high_f16(dy_0), sum_1);
float16x8_t x_1 = vld1q_f16(x_addr + offset_x + 8);
float16x8_t dy_1 = vld1q_f16(dy_addr + offset_dy + 8);
sum_2 = MS_VMLAL_F16(vget_low_f16(x_1), vget_low_f16(dy_1), sum_2);
sum_3 = MS_VMLAL_F16(vget_high_f16(x_1), vget_high_f16(dy_1), sum_3);
}
}
}
for (int l = 0; l < 4; l++) {
dw[(i_c + l) * k_spatial + k_idx] = sum_0[l];
dw[(i_c + l + 4) * k_spatial + k_idx] = sum_1[l];
dw[(i_c + l + 8) * k_spatial + k_idx] = sum_2[l];
dw[(i_c + l + 12) * k_spatial + k_idx] = sum_3[l];
}
}
return i_c;
}
static int FilterGrad8Arm(const float16_t *x, const float16_t *dy, int i_c, int k_idx, float16_t *dw,
const ConvParameter *conv_param) {
int in_h = conv_param->input_h_;
int in_w = conv_param->input_w_;
int k_h = conv_param->kernel_h_;
int k_w = conv_param->kernel_w_;
int batch = conv_param->output_batch_;
int out_ch = conv_param->output_channel_;
int in_ch = conv_param->input_channel_;
int out_h = conv_param->output_h_;
int out_w = conv_param->output_w_;
int m = out_h * out_w;
int x_size = in_h * in_w * in_ch;
int y_size = out_ch * out_h * out_w;
int k_spatial = k_w * k_h;
int i_kh = k_idx / k_w;
int i_kw = k_idx % k_w;
for (; i_c < (out_ch & ~7); i_c += 8) {
float32x4_t sum_0 = vdupq_n_f32(0.0f);
float32x4_t sum_1 = vdupq_n_f32(0.0f);
for (int b = 0; b < batch; ++b) {
const float16_t *x_addr = &x[b * x_size];
const float16_t *dy_addr = &dy[b * y_size];
for (int i = 0; i < m; i++) {
int idx = i;
int input_h = idx / out_w * conv_param->stride_h_;
int input_w = idx % out_w * conv_param->stride_w_;
int input_row = -conv_param->pad_u_ + i_kh + input_h;
int input_col = -conv_param->pad_l_ + i_kw + input_w;
if (((unsigned)(input_row) < (unsigned)(in_h)) && ((unsigned)(input_col) < (unsigned)(in_w))) {
int offset_x = (input_row * in_w + input_col) * out_ch + i_c;
int offset_dy = idx * out_ch + i_c;
float16x8_t x_0 = vld1q_f16(x_addr + offset_x);
float16x8_t dy_0 = vld1q_f16(dy_addr + offset_dy);
sum_0 = MS_VMLAL_F16(vget_low_f16(x_0), vget_low_f16(dy_0), sum_0);
sum_1 = MS_VMLAL_F16(vget_high_f16(x_0), vget_high_f16(dy_0), sum_1);
}
}
}
for (int l = 0; l < 4; l++) {
dw[(i_c + l) * k_spatial + k_idx] = sum_0[l];
dw[(i_c + 4 + l) * k_spatial + k_idx] = sum_1[l];
}
}
return i_c;
}
static int FilterGrad4Arm(const float16_t *x, const float16_t *dy, int i_c, int k_idx, float16_t *dw,
const ConvParameter *conv_param) {
int in_h = conv_param->input_h_;
int in_w = conv_param->input_w_;
int k_h = conv_param->kernel_h_;
int k_w = conv_param->kernel_w_;
int batch = conv_param->output_batch_;
int out_ch = conv_param->output_channel_;
int in_ch = conv_param->input_channel_;
int out_h = conv_param->output_h_;
int out_w = conv_param->output_w_;
int m = out_h * out_w;
int x_size = in_h * in_w * in_ch;
int y_size = out_ch * out_h * out_w;
int k_spatial = k_w * k_h;
int i_kh = k_idx / k_w;
int i_kw = k_idx % k_w;
for (; i_c < (out_ch & ~3); i_c += 4) {
float32x4_t sum_0 = vdupq_n_f32(0.0f);
for (int b = 0; b < batch; ++b) {
const float16_t *x_addr = &x[b * x_size];
const float16_t *dy_addr = &dy[b * y_size];
for (int i = 0; i < m; i++) {
int idx = i;
int input_h = idx / out_w * conv_param->stride_h_;
int input_w = idx % out_w * conv_param->stride_w_;
int input_row = -conv_param->pad_u_ + i_kh + input_h;
int input_col = -conv_param->pad_l_ + i_kw + input_w;
if (((unsigned)(input_row) < (unsigned)(in_h)) && ((unsigned)(input_col) < (unsigned)(in_w))) {
int offset_x = (input_row * in_w + input_col) * out_ch + i_c;
int offset_dy = idx * out_ch + i_c;
float16x4_t x_0 = vld1_f16(x_addr + offset_x);
float16x4_t dy_0 = vld1_f16(dy_addr + offset_dy);
sum_0 = MS_VMLAL_F16(x_0, dy_0, sum_0);
}
}
}
dw[(i_c + 0) * k_spatial + k_idx] = sum_0[0];
dw[(i_c + 1) * k_spatial + k_idx] = sum_0[1];
dw[(i_c + 2) * k_spatial + k_idx] = sum_0[2];
dw[(i_c + 3) * k_spatial + k_idx] = sum_0[3];
}
return i_c;
}
static int FilterGradLeftoverArm(const float16_t *x, const float16_t *dy, int i_c, int k_idx, float16_t *dw,
const ConvParameter *conv_param) {
int in_h = conv_param->input_h_;
int in_w = conv_param->input_w_;
int k_h = conv_param->kernel_h_;
int k_w = conv_param->kernel_w_;
int batch = conv_param->output_batch_;
int out_ch = conv_param->output_channel_;
int in_ch = conv_param->input_channel_;
int out_h = conv_param->output_h_;
int out_w = conv_param->output_w_;
int m = out_h * out_w;
int x_size = in_h * in_w * in_ch;
int y_size = out_ch * out_h * out_w;
int k_spatial = k_w * k_h;
int i_kh = k_idx / k_w;
int i_kw = k_idx % k_w;
int leftover = out_ch - i_c;
if (leftover > 0) {
float32x4_t sum_0 = vdupq_n_f32(0.0f);
for (int b = 0; b < batch; ++b) {
const float16_t *x_addr = &x[b * x_size];
const float16_t *dy_addr = &dy[b * y_size];
for (int i = 0; i < m; i++) {
int idx = i;
int input_h = idx / out_w * conv_param->stride_h_;
int input_w = idx % out_w * conv_param->stride_w_;
int input_row = -conv_param->pad_u_ + i_kh + input_h;
int input_col = -conv_param->pad_l_ + i_kw + input_w;
if (((unsigned)(input_row) < (unsigned)(in_h)) && ((unsigned)(input_col) < (unsigned)(in_w))) {
int offset_x = (input_row * in_w + input_col) * out_ch + i_c;
int offset_dy = idx * out_ch + i_c;
float16x4_t x_0 = vld1_f16(x_addr + offset_x);
float16x4_t dy_0 = vld1_f16(dy_addr + offset_dy);
sum_0 = MS_VMLAL_F16(x_0, dy_0, sum_0);
}
}
}
for (int l = 0; l < leftover; l++) {
dw[(i_c + l) * k_spatial + k_idx] = sum_0[l];
}
}
return out_ch;
}
#endif
int ConvDwFilterFp16Grad(const float16_t *x, const float16_t *dy, float16_t *dw, int start, int count,
const ConvParameter *conv_param) {
int in_h = conv_param->input_h_;
int in_w = conv_param->input_w_;
int k_h = conv_param->kernel_h_;
int k_w = conv_param->kernel_w_;
int batch = conv_param->output_batch_;
int out_ch = conv_param->output_channel_;
int in_ch = conv_param->input_channel_;
int out_h = conv_param->output_h_;
int out_w = conv_param->output_w_;
int m = out_h * out_w;
int x_size = in_h * in_w * in_ch;
int y_size = out_ch * out_h * out_w;
int k_spatial = k_w * k_h;
for (int i_k = 0; i_k < count; i_k++) {
int k_idx = start + i_k;
int i_kh = k_idx / k_w;
int i_kw = k_idx % k_w;
int i_c = 0;
#ifdef ENABLE_NEON
i_c = FilterGrad32Arm(x, dy, i_c, k_idx, dw, conv_param);
i_c = FilterGrad16Arm(x, dy, i_c, k_idx, dw, conv_param);
i_c = FilterGrad8Arm(x, dy, i_c, k_idx, dw, conv_param);
i_c = FilterGrad4Arm(x, dy, i_c, k_idx, dw, conv_param);
i_c = FilterGradLeftoverArm(x, dy, i_c, k_idx, dw, conv_param);
#endif
for (; i_c < out_ch; i_c++) {
float sum = 0;
for (int b = 0; b < batch; ++b) {
const float16_t *x_addr = &x[b * x_size];
const float16_t *dy_addr = &dy[b * y_size];
for (int i = 0; i < m; i++) {
int idx = i;
int input_h = idx / out_w * conv_param->stride_h_;
int input_w = idx % out_w * conv_param->stride_w_;
int input_row = -conv_param->pad_u_ + i_kh + input_h;
int input_col = -conv_param->pad_l_ + i_kw + input_w;
if (((unsigned)(input_row) < (unsigned)(in_h)) && ((unsigned)(input_col) < (unsigned)(in_w))) {
int offset_x = (input_row * in_w + input_col) * out_ch + i_c;
int offset_dy = idx * out_ch + i_c;
sum += x_addr[offset_x] * dy_addr[offset_dy];
}
}
}
dw[i_c * k_spatial + k_idx] = sum;
}
}
return 0;
}

View File

@@ -0,0 +1,33 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_CONVOLUTION_GRAD_FILTER_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_CONVOLUTION_GRAD_FILTER_H_
#include <stddef.h>
#include "nnacl/conv_parameter.h"
#ifdef __cplusplus
extern "C" {
#endif
int ConvDwFilterFp16Grad(const float16_t *x, const float16_t *dy, float16_t *dw, int start, int count,
const ConvParameter *conv_param);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_CONVOLUTION_GRAD_FILTER_H_

View File

@@ -0,0 +1,24 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/fp16_grad/dropout_grad.h"
void DropoutFp16Grad(const float16_t *yt_ptr, const float16_t *mask, float16_t *output_ptr, int length,
float16_t scale) {
for (int i = 0; i < length; i++) {
output_ptr[i] = yt_ptr[i] * mask[i] * scale;
}
}

View File

@@ -0,0 +1,32 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_DROPOUT_GRAD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_DROPOUT_GRAD_H_
#include "nnacl/op_base.h"
#ifdef __cplusplus
extern "C" {
#endif
void DropoutFp16Grad(const float16_t *yt_ptr, const float16_t *mask, float16_t *output_ptr, int length,
float16_t scale);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_DROPOUT_GRAD_H_

View File

@@ -0,0 +1,373 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/fp16_grad/gemm_fp16.h"
#include <string.h>
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
#include "nnacl/fp16/matmul_fp16.h"
#include "nnacl/fp16/pack_fp16.h"
#ifdef ENABLE_ARM64
static void Row2Col16Block16(const float16_t *src_ptr, float16_t *dst_ptr, size_t col) {
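// transposes a 16x8 fp16 tile into packed 16-wide column blocks with zip/trn; the stride below is in bytes (col * sizeof(float16_t))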
size_t stride = col * 2;
asm volatile(
"mov x10, %[src_c]\n"
"mov x11, %[dst_c]\n"
"ld1 {v0.8h}, [x10], %[stride]\n"
"ld1 {v1.8h}, [x10], %[stride]\n"
"ld1 {v2.8h}, [x10], %[stride]\n"
"ld1 {v3.8h}, [x10], %[stride]\n"
"ld1 {v4.8h}, [x10], %[stride]\n"
"ld1 {v5.8h}, [x10], %[stride]\n"
"ld1 {v6.8h}, [x10], %[stride]\n"
"ld1 {v7.8h}, [x10], %[stride]\n"
"zip1 v16.8h, v0.8h, v1.8h\n"
"zip1 v17.8h, v2.8h, v3.8h\n"
"zip1 v18.8h, v4.8h, v5.8h\n"
"zip1 v19.8h, v6.8h, v7.8h\n"
"ld1 {v8.8h}, [x10], %[stride]\n"
"ld1 {v9.8h}, [x10], %[stride]\n"
"ld1 {v10.8h}, [x10], %[stride]\n"
"ld1 {v11.8h}, [x10], %[stride]\n"
"ld1 {v12.8h}, [x10], %[stride]\n"
"ld1 {v13.8h}, [x10], %[stride]\n"
"ld1 {v14.8h}, [x10], %[stride]\n"
"ld1 {v15.8h}, [x10], %[stride]\n"
"trn1 v20.4s, v16.4s, v17.4s\n"
"trn2 v21.4s, v16.4s, v17.4s\n"
"trn1 v22.4s, v18.4s, v19.4s\n"
"trn2 v23.4s, v18.4s, v19.4s\n"
"trn1 v24.2d, v20.2d, v22.2d\n"
"trn2 v25.2d, v20.2d, v22.2d\n"
"trn1 v26.2d, v21.2d, v23.2d\n"
"trn2 v27.2d, v21.2d, v23.2d\n"
"zip1 v16.8h, v8.8h, v9.8h\n"
"zip1 v17.8h, v10.8h, v11.8h\n"
"zip1 v18.8h, v12.8h, v13.8h\n"
"zip1 v19.8h, v14.8h, v15.8h\n"
"trn1 v20.4s, v16.4s, v17.4s\n"
"trn2 v21.4s, v16.4s, v17.4s\n"
"trn1 v22.4s, v18.4s, v19.4s\n"
"trn2 v23.4s, v18.4s, v19.4s\n"
"trn1 v28.2d, v20.2d, v22.2d\n"
"trn2 v29.2d, v20.2d, v22.2d\n"
"trn1 v30.2d, v21.2d, v23.2d\n"
"trn2 v31.2d, v21.2d, v23.2d\n"
"st1 {v24.8h}, [x11], #16\n"
"st1 {v28.8h}, [x11], #16\n"
"st1 {v26.8h}, [x11], #16\n"
"st1 {v30.8h}, [x11], #16\n"
"st1 {v25.8h}, [x11], #16\n"
"st1 {v29.8h}, [x11], #16\n"
"st1 {v27.8h}, [x11], #16\n"
"st1 {v31.8h}, [x11], #16\n"
"zip2 v16.8h, v0.8h, v1.8h\n"
"zip2 v17.8h, v2.8h, v3.8h\n"
"zip2 v18.8h, v4.8h, v5.8h\n"
"zip2 v19.8h, v6.8h, v7.8h\n"
"trn1 v20.4s, v16.4s, v17.4s\n"
"trn2 v21.4s, v16.4s, v17.4s\n"
"trn1 v22.4s, v18.4s, v19.4s\n"
"trn2 v23.4s, v18.4s, v19.4s\n"
"trn1 v24.2d, v20.2d, v22.2d\n"
"trn2 v25.2d, v20.2d, v22.2d\n"
"trn1 v26.2d, v21.2d, v23.2d\n"
"trn2 v27.2d, v21.2d, v23.2d\n"
"zip2 v16.8h, v8.8h, v9.8h\n"
"zip2 v17.8h, v10.8h, v11.8h\n"
"zip2 v18.8h, v12.8h, v13.8h\n"
"zip2 v19.8h, v14.8h, v15.8h\n"
"trn1 v20.4s, v16.4s, v17.4s\n"
"trn2 v21.4s, v16.4s, v17.4s\n"
"trn1 v22.4s, v18.4s, v19.4s\n"
"trn2 v23.4s, v18.4s, v19.4s\n"
"trn1 v28.2d, v20.2d, v22.2d\n"
"trn2 v29.2d, v20.2d, v22.2d\n"
"trn1 v30.2d, v21.2d, v23.2d\n"
"trn2 v31.2d, v21.2d, v23.2d\n"
"st1 {v24.8h}, [x11], #16\n"
"st1 {v28.8h}, [x11], #16\n"
"st1 {v26.8h}, [x11], #16\n"
"st1 {v30.8h}, [x11], #16\n"
"st1 {v25.8h}, [x11], #16\n"
"st1 {v29.8h}, [x11], #16\n"
"st1 {v27.8h}, [x11], #16\n"
"st1 {v31.8h}, [x11], #16\n"
:
: [ dst_c ] "r"(dst_ptr), [ src_c ] "r"(src_ptr), [ stride ] "r"(stride)
: "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
#endif
void AddMatrixFp16(const float16_t *restrict v1, float16_t *restrict v2, float16_t beta, int row, int col, int stride) {
const float16_t *src_ptr = v1;
float16_t *dst_ptr = v2;
for (int r = 0; r < row; r++) {
for (int c = 0; c < col; c++) {
dst_ptr[c] += beta * src_ptr[c];
}
src_ptr += stride;
dst_ptr += stride;
}
}
int MatSizeFp16(int row, int col, int round) {
int res = UP_ROUND(row, round) * col;
return res;
}
int MatSizeTotalFp16(int row, int col, int deep, int stride) {
#ifdef ENABLE_ARM64
const int num = C16NUM;
#else
const int num = C12NUM;
#endif
int res = MatSizeFp16(row, deep, num) + MatSizeFp16(col, deep, C8NUM);
if (stride > 0) res += row * stride;
return res;
}
#ifdef ENABLE_ARM64
static void RowMajor2Col16MajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) {
size_t row_up_16 = UP_ROUND(row, C16NUM);
size_t row16 = row / C16NUM * C16NUM;
size_t col8 = col / C8NUM * C8NUM;
const float16_t *src_r = src;
float16_t *dst_r = dst;
size_t ri = 0;
// pack full blocks of 16 rows
for (; ri < row16; ri += C16NUM) {
size_t ci = 0;
for (; ci < col8; ci += C8NUM) {
const float16_t *src_c = src_r + ci;
float16_t *dst_c = dst_r + ci * C16NUM;
#ifdef ENABLE_ARM64
Row2Col16Block16(src_c, dst_c, stride);
#else
for (int tr = 0; tr < C16NUM; tr++) {
for (int tc = 0; tc < C8NUM; tc++) {
dst_c[tc * C16NUM + tr] = src_c[tr * stride + tc];
}
}
#endif
}
for (; ci < col; ci++) {
const float16_t *src_c = src_r + ci;
float16_t *dst_c = dst_r + ci * C16NUM;
for (size_t i = 0; i < C16NUM; i++) {
dst_c[i] = src_c[i * stride];
}
}
src_r += C16NUM * stride;
dst_r += C16NUM * col;
}
for (; ri < row; ri++) {
for (size_t i = 0; i < col; ++i) {
dst_r[i * C16NUM] = src_r[i];
}
src_r += stride;
dst_r += 1;
}
for (; ri < row_up_16; ri++) {
for (size_t i = 0; i < col; i++) {
dst_r[i * C16NUM] = 0;
}
dst_r += 1;
}
return;
}
#endif
void RowMajor2Row16MajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) {
for (int r = 0; r < row; r++) {
for (int c = 0; c < col; c++) {
int c_div16 = c / 16;
int c_mod16 = c % 16;
dst[c_div16 * 16 * row + r * 16 + c_mod16] = src[r * stride + c];
}
}
}
void RowMajor2Col12MajorStrideFp16(const float16_t *src, float16_t *dst, size_t row, size_t col, int stride) {
size_t row_up_12 = UP_ROUND(row, C12NUM);
size_t row12 = row / C12NUM * C12NUM;
size_t col8 = col / C8NUM * C8NUM;
const float16_t *src_r = src;
float16_t *dst_r = dst;
size_t ri = 0;
// transpose 12x8
for (; ri < row12; ri += C12NUM) {
size_t ci = 0;
for (; ci < col8; ci += C8NUM) {
const float16_t *src_c = src_r + ci;
float16_t *dst_c = dst_r + ci * C12NUM;
#ifdef ENABLE_ARM82_A32
Transpose12x8A32Fp16(src_c, dst_c, stride * sizeof(float16_t), 24);
#else
for (int tr = 0; tr < C12NUM; tr++) {
for (int tc = 0; tc < C8NUM; tc++) {
dst_c[tc * C12NUM + tr] = src_c[tr * stride + tc];
}
}
#endif
}
for (; ci < col; ci++) {
const float16_t *src_c = src_r + ci;
float16_t *dst_c = dst_r + ci * C12NUM;
for (size_t i = 0; i < C12NUM; i++) {
dst_c[i] = src_c[i * stride];
}
}
src_r += C12NUM * stride;
dst_r += C12NUM * col;
}
for (; ri < row; ri++) {
for (size_t i = 0; i < col; ++i) {
dst_r[i * C12NUM] = src_r[i];
}
src_r += stride;
dst_r += 1;
}
for (; ri < row_up_12; ri++) {
for (size_t i = 0; i < col; i++) {
dst_r[i * C12NUM] = 0;
}
dst_r += 1;
}
}
void RowMajor2Row12MajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) {
for (int r = 0; r < row; r++) {
for (int c = 0; c < col; c++) {
int c_div12 = c / 12;
int c_mod12 = c % 12;
dst[c_div12 * 12 * row + r * 12 + c_mod12] = src[r * stride + c];
}
}
}
static void RowMajor2Col8MajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) {
for (int r = 0; r < row; r++) {
for (int c = 0; c < col; c++) {
int r_div8 = r / 8;
int r_mod8 = r % 8;
dst[r_div8 * 8 * col + c * 8 + r_mod8] = src[r * stride + c];
}
}
}
static void RowMajor2Row8MajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) {
for (int r = 0; r < row; r++) {
const float16_t *src_ptr = src + r * stride;
int c = 0;
for (; c < col; c++) {
int cd8 = c / C8NUM;
int cm8 = c % C8NUM;
dst[cd8 * C8NUM * row + r * C8NUM + cm8] = src_ptr[c];
}
for (; c < UP_ROUND(col, C8NUM); c++) {
int cd8 = c / C8NUM;
int cm8 = c % C8NUM;
dst[cd8 * C8NUM * row + r * C8NUM + cm8] = 0;
}
}
return;
}
static void RowMajor2ColXMajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) {
#ifdef ENABLE_ARM64
RowMajor2Col16MajorStrideFp16(src, dst, row, col, stride);
#else
RowMajor2Col12MajorStrideFp16(src, dst, row, col, stride);
#endif
}
static void RowMajor2RowXMajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) {
#ifdef ENABLE_ARM64
RowMajor2Row16MajorStrideFp16(src, dst, row, col, stride);
#else
RowMajor2Row12MajorStrideFp16(src, dst, row, col, stride);
#endif
}
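// thin wrapper over GemmMatmulPlusFp16: no bias, no activation, both inputs treated as unpacked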
void GemmMatmulFp16(int ta, int tb, int M, int N, int K, float16_t alpha, const float16_t *mat_a, int lda,
const float16_t *mat_b, int ldb, float16_t beta, float16_t *mat_c, int ldc, float16_t *workspace) {
GemmCbFp16 gcb;
gcb.atype = ActType_No;
gcb.ca = 0;
gcb.cb = 0;
gcb.bias = NULL;
GemmMatmulPlusFp16(ta, tb, M, N, K, alpha, mat_a, lda, mat_b, ldb, beta, mat_c, ldc, workspace, &gcb);
}
void GemmMatmulPlusFp16(int ta, int tb, int M, int N, int K, float16_t alpha, const float16_t *mat_a, int lda,
const float16_t *mat_b, int ldb, float16_t beta, float16_t *mat_c, int ldc,
float16_t *workspace, GemmCbFp16 *gcb) {
#ifdef ENABLE_ARM64
const int num = C16NUM;
#else
const int num = C12NUM;
#endif
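// pack A into Col16 (arm64) or Col12 tiles and B into 8-wide tiles unless the caller marks them pre-packed
// via gcb->ca / gcb->cb; a nonzero beta computes into workspace, then accumulates beta * product into mat_c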
float16_t *output = mat_c;
float16_t *fworkspace = workspace;
int incremental = (beta < 0.f) || (beta > 0.f);
float16_t *mat_a_input = (float16_t *)mat_a;
float16_t *mat_b_input = (float16_t *)mat_b;
if (!gcb->ca) {
mat_a_input = fworkspace;
fworkspace += MatSizeFp16(M, K, num);
if (ta) {
RowMajor2RowXMajorStrideFp16(mat_a, mat_a_input, K, M, lda);
} else {
RowMajor2ColXMajorStrideFp16(mat_a, mat_a_input, M, K, lda);
}
}
if (!gcb->cb) {
mat_b_input = fworkspace;
fworkspace += MatSizeFp16(N, K, C8NUM);
if (tb) {
RowMajor2Col8MajorStrideFp16(mat_b, mat_b_input, N, K, ldb);
} else {
RowMajor2Row8MajorStrideFp16(mat_b, mat_b_input, K, N, ldb);
}
}
if (incremental) output = fworkspace;
MatMulFp16(mat_a_input, mat_b_input, output, gcb->bias, gcb->atype, K, M, N, ldc, OutType_Nhwc);
if (incremental) AddMatrixFp16(output, mat_c, beta, M, N, ldc);
gcb->mat_a = mat_a_input;
gcb->mat_b = mat_b_input;
}

View File

@@ -0,0 +1,46 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_GEMM_FP16_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_GEMM_FP16_H_
#include <stdlib.h>
#include "nnacl/op_base.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct {
int ca;
int cb;
ActType atype;
float16_t *bias;
float16_t *mat_a;
float16_t *mat_b;
} GemmCbFp16;
void GemmMatmulFp16(int ta, int tb, int M, int N, int K, float16_t alpha, const float16_t *mat_a, int lda,
const float16_t *mat_b, int ldb, float16_t beta, float16_t *mat_c, int ldc, float16_t *workspace);
void GemmMatmulPlusFp16(int ta, int tb, int M, int N, int K, float16_t alpha, const float16_t *mat_a, int lda,
const float16_t *mat_b, int ldb, float16_t beta, float16_t *mat_c, int ldc,
float16_t *workspace, GemmCbFp16 *gcb);
int MatSizeFp16(int row, int col, int round);
int MatSizeTotalFp16(int row, int col, int deep, int stride);
void AddMatrixFp16(const float16_t *v1, float16_t *v2, float16_t beta, int row, int col, int stride);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_GEMM_FP16_H_

View File

@@ -0,0 +1,59 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/fp16_grad/layernorm_grad.h"
#include <stddef.h>
#include <math.h>
void LayerNormFp16Grad(const float16_t *x, const float16_t *dy, const float16_t *var, const float16_t *mean,
const float16_t *gamma, int param_num, int param_size, int block_num, int block_size,
float16_t *dx, float16_t *dg, float16_t *db) {
// var actually holds 1/sqrt(var), i.e. var^-0.5
const float16_t *var_sqrt_rev = var;
for (size_t i = 0; i < param_num; ++i) {
float dgamma = 0.0f;
float dbeta = 0.0f;
for (size_t j = i; j < param_size * param_num; j += param_num) {
int norm_shift = (int)(j / block_size);
dgamma += dy[j] * var_sqrt_rev[norm_shift] * (x[j] - mean[norm_shift]);
dbeta += dy[j];
}
dg[i] = (float16_t)dgamma;
db[i] = (float16_t)dbeta;
}
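// second stage: per block, sum1 carries the variance gradient and sum2/sum3 the mean gradient; dx combines all three terms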
for (size_t i = 0; i < block_num; ++i) {
float sum1 = 0.0f;
float sum2 = 0.0f;
float sum3 = 0.0f;
for (size_t j = i * block_size; j < (i + 1) * block_size; ++j) {
int param_shift = j % param_num;
int norm_shift = (int)(j / block_size);
float16_t dxm = x[j] - mean[norm_shift];
float16_t dyg = dy[j] * gamma[param_shift];
sum1 += -0.5f * dyg * dxm * var_sqrt_rev[norm_shift] * var_sqrt_rev[norm_shift] * var_sqrt_rev[norm_shift];
sum2 += dyg;
sum3 += -2.0f * dxm;
}
for (size_t j = i * block_size; j < (i + 1) * block_size; ++j) {
int param_shift = j % param_num;
int norm_shift = (int)(j / block_size);
float16_t var_sqrt = var_sqrt_rev[norm_shift];
float dx1 = dy[j] * gamma[param_shift] * var_sqrt;
float dx2 = sum1 * 2.0f / block_size * (x[j] - mean[norm_shift]);
float dx3 = (-1.0f * var_sqrt * sum2 + (1.0f / block_size) * sum1 * sum3) * (1.0f / block_size);
dx[j] = (float16_t)(dx1 + dx2 + dx3);
}
}
}

View File

@@ -0,0 +1,32 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_LAYERNORM_GRAD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_LAYERNORM_GRAD_H_
#include "nnacl/op_base.h"
#ifdef __cplusplus
extern "C" {
#endif
void LayerNormFp16Grad(const float16_t *x, const float16_t *dy, const float16_t *var, const float16_t *mean,
const float16_t *gamma, int param_num, int param_size, int block_num, int block_size,
float16_t *dx, float16_t *dg, float16_t *db);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_LAYERNORM_GRAD_H_

View File

@@ -0,0 +1,201 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <string.h>
#include "nnacl/fp16_grad/pack_fp16_ext.h"
void RollingIm2ColPackDwUnitFp16(const float16_t *in_data, const ConvParameter *conv_param, float16_t *data_col_orig,
int real_cal_num, int start) {
const int pad_left = conv_param->pad_l_;
const int pad_up = conv_param->pad_u_;
const int stride_h = conv_param->stride_h_;
const int stride_w = conv_param->stride_w_;
const int dilation_h = conv_param->dilation_h_;
const int dilation_w = conv_param->dilation_w_;
const int kernel_h = conv_param->kernel_h_;
const int kernel_w = conv_param->kernel_w_;
const int in_height = conv_param->input_h_;
const int in_width = conv_param->input_w_;
const int output_w = conv_param->output_w_;
const int channels = conv_param->input_channel_;
const int stride = kernel_h * kernel_w;
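// depthwise layout: for each output pixel, channel c owns a contiguous kernel_h * kernel_w patch at data_col[c * stride]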
int kernel_row, kernel_col;
for (int i = 0; i < real_cal_num; i++) {
int block_start = start + i;
int input_h = block_start / output_w * stride_h;
int input_w = block_start % output_w * stride_w;
float16_t *data_col = data_col_orig + i * channels * stride;
for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
int input_row = -pad_up + kernel_row * dilation_h + input_h;
for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
int input_col = -pad_left + kernel_col * dilation_w + input_w;
if (((unsigned)(input_row) < (unsigned)(in_height)) && ((unsigned)(input_col) < (unsigned)(in_width))) {
const int offset = (input_row * in_width + input_col) * channels;
for (int c = 0; c < channels; c++) {
data_col[c * stride] = in_data[offset + c];
}
data_col++;
} else {
for (int c = 0; c < channels; c++) {
data_col[c * stride] = 0;
}
data_col++;
}
}
}
}
}
void RollingIm2ColPackUnitFp16(const float16_t *input_data, const ConvParameter *conv_param, float16_t *packed_input,
int real_cal_num, int block_index) {
const int pad_left = conv_param->pad_l_;
const int pad_up = conv_param->pad_u_;
const int stride_h = conv_param->stride_h_;
const int stride_w = conv_param->stride_w_;
const int dilation_h = conv_param->dilation_h_;
const int dilation_w = conv_param->dilation_w_;
const int kernel_h = conv_param->kernel_h_;
const int kernel_w = conv_param->kernel_w_;
const int in_height = conv_param->input_h_;
const int in_width = conv_param->input_w_;
const int output_w = conv_param->output_w_;
const int channels = conv_param->input_channel_ / conv_param->group_;
const int tot_channels = conv_param->input_channel_;
int kernel_row, kernel_col;
if (channels == 1) {
for (int i = 0; i < real_cal_num; i++) {
int block_start = block_index + i;
int input_h = block_start / output_w * stride_h;
int input_w = block_start % output_w * stride_w;
for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
int input_row = -pad_up + kernel_row * dilation_h + input_h;
for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
int input_col = -pad_left + kernel_col * dilation_w + input_w;
if (((unsigned)(input_row) < (unsigned)(in_height)) && ((unsigned)(input_col) < (unsigned)(in_width))) {
const int offset = (input_row * in_width + input_col) * tot_channels;
*packed_input = input_data[offset];
packed_input++;
} else {
*packed_input = 0;
packed_input++;
}
}
}
}
} else {
for (int i = 0; i < real_cal_num; i++) {
int block_start = block_index + i;
int input_h = block_start / output_w * stride_h;
int input_w = block_start % output_w * stride_w;
for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
int input_row = -pad_up + kernel_row * dilation_h + input_h;
for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
int input_col = -pad_left + kernel_col * dilation_w + input_w;
if (((unsigned)(input_row) < (unsigned)(in_height)) && ((unsigned)(input_col) < (unsigned)(in_width))) {
const int offset = (input_row * in_width + input_col) * tot_channels;
memcpy(packed_input, input_data + offset, sizeof(float16_t) * channels);
packed_input += channels;
} else {
memset(packed_input, 0, sizeof(float16_t) * channels);
packed_input += channels;
}
}
}
}
}
}
void RollingCol2ImPackUnitFp16(const float16_t *data_col, float16_t *data_im, const ConvParameter *conv_param,
int real_cal_num, int block_index) {
const int pad_left = conv_param->pad_l_;
const int pad_up = conv_param->pad_u_;
const int stride_h = conv_param->stride_h_;
const int stride_w = conv_param->stride_w_;
const int dilation_h = conv_param->dilation_h_;
const int dilation_w = conv_param->dilation_w_;
const int kernel_h = conv_param->kernel_h_;
const int kernel_w = conv_param->kernel_w_;
const int in_height = conv_param->input_h_;
const int in_width = conv_param->input_w_;
const int output_w = conv_param->output_w_;
const int channels = conv_param->input_channel_ / conv_param->group_;
const int tot_channels = conv_param->input_channel_;
int kernel_row, kernel_col;
if (channels == 1) {
for (int r = 0; r < real_cal_num; r++) {
int output_col = (block_index + r) % output_w;
int output_row = (block_index + r) / output_w;
int row_stride_offset = output_row * stride_h;
int col_stride_offset = output_col * stride_w;
for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
int input_row = -pad_up + kernel_row * dilation_h + row_stride_offset;
for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
int input_col = -pad_left + kernel_col * dilation_w + col_stride_offset;
if (((unsigned)(input_row) < (unsigned)(in_height)) && ((unsigned)(input_col) < (unsigned)(in_width))) {
int offset = (input_row * in_width + input_col) * tot_channels;
float16_t *data_im_ptr = data_im + offset;
*data_im_ptr += *data_col;
}
data_col++;
}
}
}
} else {
for (int r = 0; r < real_cal_num; r++) {
int output_col = (block_index + r) % output_w;
int output_row = (block_index + r) / output_w;
int row_stride_offset = output_row * stride_h;
int col_stride_offset = output_col * stride_w;
for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
int input_row = -pad_up + kernel_row * dilation_h + row_stride_offset;
for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
int input_col = -pad_left + kernel_col * dilation_w + col_stride_offset;
if (((unsigned)(input_row) < (unsigned)(in_height)) && ((unsigned)(input_col) < (unsigned)(in_width))) {
int offset = (input_row * in_width + input_col) * tot_channels;
float16_t *data_im_ptr = &data_im[offset];
for (int i = 0; i < channels; i++) {
data_im_ptr[i] += data_col[i];
}
}
data_col += channels;
}
}
}
}
}

View File

@ -0,0 +1,37 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_PACK_FP16_EXT_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_PACK_FP16_EXT_H_
#include <stddef.h>
#include "nnacl/conv_parameter.h"
#ifdef __cplusplus
extern "C" {
#endif
void RollingIm2ColPackUnitFp16(const float16_t *input_data, const ConvParameter *conv_param, float16_t *packed_input,
int real_cal_num, int block_index);
void RollingIm2ColPackDwUnitFp16(const float16_t *input_data, const ConvParameter *conv_param, float16_t *packed_input,
int real_cal_num, int block_index);
void RollingCol2ImPackUnitFp16(const float16_t *data_col, float16_t *data_im, const ConvParameter *conv_param,
int real_cal_num, int block_index);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_PACK_FP16_EXT_H_

View File

@ -0,0 +1,189 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stdint.h>
#include <string.h>
#include <float.h>
#include "nnacl/fp16_grad/pooling_grad.h"
void AvgPoolingFp16Grad(const float16_t *input_ptr, float16_t *output_ptr, int count, PoolingParameter *pooling_param) {
int stride_w = pooling_param->stride_w_;
int stride_h = pooling_param->stride_h_;
int pad_w = pooling_param->pad_l_;
int pad_h = pooling_param->pad_u_;
int win_w = pooling_param->window_w_;
int win_h = pooling_param->window_h_;
int channel = pooling_param->input_channel_;
int in_w = pooling_param->input_w_;
int in_h = pooling_param->input_h_;
int output_w = pooling_param->output_w_;
int output_h = pooling_param->output_h_;
const float16_t kk = 1.0f / (float16_t)(win_h * win_w);
#ifdef ENABLE_NEON
const float16x4_t factor = vdup_n_f16(kk);
#endif
for (int ib = 0; ib < count; ib++) {
float16_t *out = &output_ptr[(ib * in_h * in_w * channel)];
const float16_t *inPtr = &input_ptr[(ib * output_h * output_w * channel)];
// iterate over the dy (forward-output) spatial grid
for (int yh = 0; yh < output_h; yh++) {
int over_h = pad_h - yh * stride_h;
int kh_s = MSMAX(0, over_h);
int kh_e = MSMIN(win_h, in_h + over_h);
for (int yw = 0; yw < output_w; yw++) {
int over_w = pad_w - yw * stride_w;
int kw_s = MSMAX(0, over_w);
int kw_e = MSMIN(win_w, in_w + over_w);
int ic = 0;
for (; ic < (channel & ~3); ic += 4) {
int idx = (yw + yh * output_w) * channel + ic;
#ifdef ENABLE_NEON
float16x4_t in = vld1_f16(inPtr + idx);
float16x4_t delta = vmul_f16(in, factor);
#else
float16_t delta[4] = {inPtr[idx], inPtr[idx + 1], inPtr[idx + 2], inPtr[idx + 3]};
for (int i = 0; i < 4; i++) delta[i] *= kk;
#endif
for (int kh = kh_s; kh < kh_e; kh++) {
int xh = yh * stride_h + kh - pad_h;
for (int kw = kw_s; kw < kw_e; kw++) {
int xw = yw * stride_w + kw - pad_w;
#ifdef ENABLE_NEON
float16_t *out_vec = out + (xw + in_w * xh) * channel + ic;
float16x4_t outr = vld1_f16(out_vec);
float16x4_t outs = vadd_f16(outr, delta);
vst1_f16(out_vec, outs);
#else
for (int i = 0; i < 4; i++) {
out[(xw + in_w * xh) * channel + ic + i] += delta[i];
}
#endif
}
}
}
for (; ic < channel; ic++) {
int idx = (yw + yh * output_w) * channel + ic;
float16_t delta = inPtr[idx] * kk;
for (int kh = kh_s; kh < kh_e; kh++) {
int xh = yh * stride_h + kh - pad_h;
for (int kw = kw_s; kw < kw_e; kw++) {
int xw = yw * stride_w + kw - pad_w;
out[(xw + in_w * xh) * channel + ic] += delta;
}
}
}
}
}
}
}
#ifdef ENABLE_NEON
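// Lane-wise argmax helper: the 16-bit compare mask is sign-extended to
// 32-bit lanes (all-ones stays all-ones) so it can steer a 32-bit bitwise
// select between the candidate indices and the previous best indices.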
static uint32x4_t MaxIndex(float16x4_t in, float16x4_t *max, uint32x4_t index, uint32x4_t prev_index) {
uint16x4_t res = vcgt_f16(in, *max);
int16x4_t tmp = vreinterpret_s16_u16(res);
uint32x4_t res_tmp = vreinterpretq_u32_s32(vmovl_s16(tmp));
uint32x4_t m_index = vbslq_u32(res_tmp, index, prev_index);
*max = vbsl_f16(res, in, *max);
return m_index;
}
#endif
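// Max-pooling backward: each dy element is routed to the single input
// position that held the window maximum in the forward pass.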
void MaxPoolingFp16Grad(const float16_t *input_ptr, const float16_t *dy_ptr, float16_t *output_ptr, int output_batch,
PoolingParameter *pooling_param) {
int stride_w = pooling_param->stride_w_;
int stride_h = pooling_param->stride_h_;
int pad_w = pooling_param->pad_l_;
int pad_h = pooling_param->pad_u_;
int win_w = pooling_param->window_w_;
int win_h = pooling_param->window_h_;
int channel = pooling_param->input_channel_;
int in_w = pooling_param->input_w_;
int in_h = pooling_param->input_h_;
int output_w = pooling_param->output_w_;
int output_h = pooling_param->output_h_;
for (int ib = 0; ib < output_batch; ib++) {
float16_t *out = &output_ptr[(ib * in_h * in_w * channel)];
const float16_t *inPtr = &input_ptr[(ib * in_h * in_w * channel)];
const float16_t *dyPtr = &dy_ptr[(ib * output_h * output_w * channel)];
for (int yh = 0; yh < output_h; yh++) {
int over_h = pad_h - yh * stride_h;
int kh_s = MSMAX(0, over_h);
int kh_e = MSMIN(win_h, in_h + over_h);
for (int yw = 0; yw < output_w; yw++) {
int over_w = pad_w - yw * stride_w;
int kw_s = MSMAX(0, over_w);
int kw_e = MSMIN(win_w, in_w + over_w);
int ic = 0;
for (; ic < (channel & ~3); ic += 4) {
int idx = (yw + yh * output_w) * channel + ic;
#ifdef ENABLE_NEON
uint32x4_t max_idx = vdupq_n_u32(0);
float16x4_t max_val = vdup_n_f16(-FLT16_MAX);
float16x4_t delta = vld1_f16(dyPtr + idx);
#else
float16_t delta[4] = {dyPtr[idx], dyPtr[idx + 1], dyPtr[idx + 2], dyPtr[idx + 3]};
float16_t max_val[4] = {-FLT16_MAX, -FLT16_MAX, -FLT16_MAX, -FLT16_MAX};
uint32_t max_idx[4] = {0};
#endif
for (int kh = kh_s; kh < kh_e; kh++) {
int xh = yh * stride_h + kh - pad_h;
for (int kw = kw_s; kw < kw_e; kw++) {
int xw = yw * stride_w + kw - pad_w;
int val_idx = (xw + in_w * xh) * channel + ic;
#ifdef ENABLE_NEON
uint32x4_t index = {val_idx, val_idx + 1, val_idx + 2, val_idx + 3};
float16x4_t in = vld1_f16(inPtr + val_idx);
max_idx = MaxIndex(in, &max_val, index, max_idx);
#else
float16_t val[4] = {inPtr[val_idx], inPtr[val_idx + 1], inPtr[val_idx + 2], inPtr[val_idx + 3]};
for (int i = 0; i < 4; i++) {
if (val[i] > max_val[i]) {
max_val[i] = val[i];
max_idx[i] = val_idx + i;
}
}
#endif
}
}
for (int i = 0; i < 4; i++) {
out[((int *)&max_idx)[i]] += ((float16_t *)&delta)[i];
}
}
for (; ic < channel; ic++) {
float16_t max_val = -FLT16_MAX;
int max_idx = 0;
int idx = (yw + yh * output_w) * channel + ic;
float16_t delta = dyPtr[idx];
for (int kh = kh_s; kh < kh_e; kh++) {
int xh = yh * stride_h + kh - pad_h;
for (int kw = kw_s; kw < kw_e; kw++) {
int xw = yw * stride_w + kw - pad_w;
int val_idx = (xw + in_w * xh) * channel + ic;
float16_t val = inPtr[val_idx];
if (val > max_val) {
max_val = val;
max_idx = val_idx;
}
}
}
out[max_idx] += delta;
}
}
}
}
}

View File

@ -0,0 +1,32 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_POOLING_GRAD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_POOLING_GRAD_H_
#include "nnacl/fp16/pooling_fp16.h"
#ifdef __cplusplus
extern "C" {
#endif
void AvgPoolingFp16Grad(const float16_t *input_ptr, float16_t *output_ptr, int count, PoolingParameter *pooling_param);
void MaxPoolingFp16Grad(const float16_t *input_ptr, const float16_t *dy_ptr, float16_t *output_ptr, int output_batch,
PoolingParameter *pooling_param);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_POOLING_GRAD_H_

View File

@ -0,0 +1,142 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/fp16_grad/resize_grad.h"
#include <math.h>
#include "nnacl/infer/common_infer.h"
void ResizeNearestNeighborFp16Grad(float16_t *in_addr, float16_t *out_addr, int batch_size, int channel, int format,
ResizeFp16GradParameter *param) {
bool align_corners = param->align_corners_;
size_t in_hw_size = param->in_width_ * param->in_height_;
size_t out_hw_size = param->out_width_ * param->out_height_;
if (format == Format_NHWC) {
for (int32_t b = 0; b < batch_size; ++b) {
for (size_t i = 0; i < in_hw_size; ++i) {
size_t in_y = i / param->in_width_;
size_t in_x = i % param->in_width_;
size_t out_y = MSMIN(
(align_corners) ? (size_t)roundf(in_y * param->height_scale_) : (size_t)floorf(in_y * param->height_scale_),
param->out_height_ - 1);
size_t out_x = MSMIN(
(align_corners) ? (size_t)roundf(in_x * param->width_scale_) : (size_t)floorf(in_x * param->width_scale_),
param->out_width_ - 1);
// out_y / out_x depend only on the pixel, not on c.
for (int32_t c = 0; c < channel; ++c) {
size_t out_offset = out_y * (param->out_width_ * channel) + (out_x * channel) + c;
size_t in_offset = in_y * (param->in_width_ * channel) + (in_x * channel) + c;
out_addr[out_offset] += in_addr[in_offset];
}
}
out_addr += out_hw_size * channel;
in_addr += in_hw_size * channel;
}
} else if (format == Format_NCHW) {
for (int32_t b = 0; b < batch_size; ++b) {
for (int32_t c = 0; c < channel; ++c) {
for (size_t h = 0; h < param->in_height_; ++h) {
size_t out_y =
MSMIN((align_corners) ? (size_t)roundf(h * param->height_scale_) : (size_t)floorf(h * param->height_scale_),
param->out_height_ - 1);
for (size_t w = 0; w < param->in_width_; ++w) {
size_t out_x =
MSMIN((align_corners) ? (size_t)roundf(w * param->width_scale_) : (size_t)floorf(w * param->width_scale_),
param->out_width_ - 1);
out_addr[out_y * param->out_width_ + out_x] += in_addr[h * param->in_width_ + w];
}
}
out_addr += out_hw_size;
in_addr += in_hw_size;
}
}
}
}
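// Bilinear resize backward: each gradient element is split across the four
// neighboring source pixels using the same lerp weights as the forward
// interpolation (x_lerp / y_lerp and their complements).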
void ResizeBiLinearFp16Grad(float16_t *in_addr, float16_t *out_addr, int batch_size, int channel, int format,
ResizeFp16GradParameter *param) {
size_t in_hw_size = param->in_width_ * param->in_height_;
size_t out_hw_size = param->out_width_ * param->out_height_;
if (format == Format_NHWC) {
for (int32_t b = 0; b < batch_size; ++b) {
for (size_t i = 0; i < in_hw_size; ++i) {
size_t h = i / param->in_width_;
size_t w = i % param->in_width_;
for (int32_t c = 0; c < channel; ++c) {
float16_t in_y = (float16_t)h * param->height_scale_;
size_t top_y_index = MSMAX((size_t)(floorf(in_y)), (size_t)(0));
size_t bottom_y_index = MSMIN((size_t)(ceilf(in_y)), param->out_height_ - 1);
float16_t y_lerp = in_y - floorf(in_y);
float16_t inverse_y_lerp = 1.0 - y_lerp;
float16_t in_x = (float16_t)w * param->width_scale_;
size_t left_x_index = MSMAX((size_t)(floorf(in_x)), (size_t)(0));
size_t right_x_index = MSMIN((size_t)(ceilf(in_x)), param->out_width_ - 1);
float16_t x_lerp = in_x - floorf(in_x);
float16_t inverse_x_lerp = 1.0 - x_lerp;
size_t in_offset = h * (param->in_width_ * channel) + (w * channel) + c;
size_t out_offset_top_y_left_x = top_y_index * (param->out_width_ * channel) + (left_x_index * channel) + c;
size_t out_offset_top_y_right_x = top_y_index * (param->out_width_ * channel) + (right_x_index * channel) + c;
size_t out_offset_bottom_y_left_x =
bottom_y_index * (param->out_width_ * channel) + (left_x_index * channel) + c;
size_t out_offset_bottom_y_right_x =
bottom_y_index * (param->out_width_ * channel) + (right_x_index * channel) + c;
out_addr[out_offset_top_y_left_x] += in_addr[in_offset] * (float16_t)(inverse_y_lerp * inverse_x_lerp);
out_addr[out_offset_top_y_right_x] += in_addr[in_offset] * (float16_t)(inverse_y_lerp * x_lerp);
out_addr[out_offset_bottom_y_left_x] += in_addr[in_offset] * (float16_t)(y_lerp * inverse_x_lerp);
out_addr[out_offset_bottom_y_right_x] += in_addr[in_offset] * (float16_t)(y_lerp * x_lerp);
}
}
out_addr += out_hw_size * channel;
in_addr += in_hw_size * channel;
}
} else if (format == Format_NCHW) {
size_t in_height = param->in_height_;
size_t in_width = param->in_width_;
size_t out_height = param->out_height_;
size_t out_width = param->out_width_;
for (int32_t b = 0; b < batch_size; ++b) {
for (int32_t c = 0; c < channel; ++c) {
for (size_t h = 0; h < in_height; ++h) {
const float16_t in_y = (float16_t)h * param->height_scale_;
const size_t top_y_index = MSMAX((size_t)floorf(in_y), 0);
const size_t bottom_y_index = MSMIN((size_t)ceilf(in_y), out_height - 1);
const float16_t y_lerp = in_y - floorf(in_y);
const float16_t inverse_y_lerp = 1.0 - y_lerp;
for (size_t w = 0; w < in_width; ++w) {
const float16_t in_x = (float16_t)w * param->width_scale_;
const size_t left_x_index = MSMAX((size_t)floorf(in_x), 0);
const size_t right_x_index = MSMIN((size_t)ceilf(in_x), out_width - 1);
const float16_t x_lerp = in_x - floorf(in_x);
const float16_t inverse_x_lerp = 1.0 - x_lerp;
out_addr[top_y_index * out_width + left_x_index] +=
in_addr[h * in_width + w] * (float16_t)(inverse_y_lerp * inverse_x_lerp);
out_addr[top_y_index * out_width + right_x_index] +=
in_addr[h * in_width + w] * (float16_t)(inverse_y_lerp * x_lerp);
out_addr[bottom_y_index * out_width + left_x_index] +=
in_addr[h * in_width + w] * (float16_t)(y_lerp * inverse_x_lerp);
out_addr[bottom_y_index * out_width + right_x_index] +=
in_addr[h * in_width + w] * (float16_t)(y_lerp * x_lerp);
}
}
out_addr += out_hw_size;
in_addr += in_hw_size;
}
}
}
}

View File

@ -0,0 +1,45 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_RESIZE_GRAD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_RESIZE_GRAD_H_
#include "nnacl/op_base.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct ResizeFp16GradParameter {
OpParameter op_parameter_;
bool align_corners_;
int method;
size_t in_height_;
size_t in_width_;
size_t out_height_;
size_t out_width_;
float16_t height_scale_;
float16_t width_scale_;
} ResizeFp16GradParameter;
void ResizeNearestNeighborFp16Grad(float16_t *in_addr, float16_t *out_addr, int batch_size, int channel, int format,
ResizeFp16GradParameter *param);
void ResizeBiLinearFp16Grad(float16_t *in_addr, float16_t *out_addr, int batch_size, int channel, int format,
ResizeFp16GradParameter *param);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_RESIZE_GRAD_H_

View File

@ -0,0 +1,62 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/fp16_grad/strided_slice_grad.h"
#include "nnacl/errorcode.h"
static size_t CalcIndex(const int *shape, size_t size, int i, size_t pos) {
size_t res = 1;
for (size_t j = 0; j < size; j++) {
res *= shape[(i + 1) + j];
}
return (pos / res % shape[i]);
}
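// StridedSlice backward: every element of the incoming (7D-padded) gradient
// is scattered to its source position in dx via begins/strides. Positions
// outside the slice are left untouched, so the output buffer is assumed to
// be zero-initialized by the caller.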
int DoStridedSliceFp16Grad(const float16_t *inputs, float16_t *output, const int *dx_shape,
StridedSliceParameter *param) {
if (inputs == NULL || output == NULL || param == NULL) {
return NNACL_NULL_PTR;
}
if (param->num_axes_ > DIMENSION_7D) {
return NNACL_PARAM_INVALID;
}
size_t size = 1;
int *s = param->strides_;
int *b = param->begins_;
for (int i = 0; i < DIMENSION_7D; i++) {
size *= param->in_shape_[i];
}
for (size_t pos = 0; pos < size; pos++) {
size_t i = CalcIndex(param->in_shape_, 6, 0, pos);
size_t j = CalcIndex(param->in_shape_, 5, 1, pos);
size_t k = CalcIndex(param->in_shape_, 4, 2, pos);
size_t l = CalcIndex(param->in_shape_, 3, 3, pos);
size_t m = CalcIndex(param->in_shape_, 2, 4, pos);
size_t n = CalcIndex(param->in_shape_, 1, 5, pos);
size_t o = CalcIndex(param->in_shape_, 0, 6, pos);
size_t input_idx =
(i * s[0] + b[0]) * dx_shape[1] * dx_shape[2] * dx_shape[3] * dx_shape[4] * dx_shape[5] * dx_shape[6] +
(j * s[1] + b[1]) * dx_shape[2] * dx_shape[3] * dx_shape[4] * dx_shape[5] * dx_shape[6] +
(k * s[2] + b[2]) * dx_shape[3] * dx_shape[4] * dx_shape[5] * dx_shape[6] +
(l * s[3] + b[3]) * dx_shape[4] * dx_shape[5] * dx_shape[6] + (m * s[4] + b[4]) * dx_shape[5] * dx_shape[6] +
(n * s[5] + b[5]) * dx_shape[6] + (o * s[6] + b[6]);
output[input_idx] = inputs[pos];
}
return NNACL_OK;
}

View File

@ -0,0 +1,31 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_STRIDED_SLICE_GRAD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_STRIDED_SLICE_GRAD_H_
#include "nnacl/op_base.h"
#include "nnacl/strided_slice_parameter.h"
#ifdef __cplusplus
extern "C" {
#endif
int DoStridedSliceFp16Grad(const float16_t *inputs, float16_t *output, const int *dx_shape,
StridedSliceParameter *param);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_STRIDED_SLICE_GRAD_H_

View File

@ -0,0 +1,33 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/fp16_grad/unsorted_segment_sum.h"
#include "nnacl/errorcode.h"
int UnsortedSegmentSumFp16(const float16_t *input, int unit_num, int input_dim1, const int *indices, float16_t *output,
int output_dim0, int output_dim1) {
for (int i = 0; i < unit_num; ++i) {
int j = i / input_dim1;
int k = i % input_dim1;
int index = indices[j];
if (index < 0 || index >= output_dim0) {
continue;
}
int output_index = index * output_dim1 + k;
output[output_index] += input[i];
}
return NNACL_OK;
}

View File

@ -0,0 +1,31 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_UNSORTED_SEGMENT_SUM_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_UNSORTED_SEGMENT_SUM_H_
#include "nnacl/op_base.h"
#ifdef __cplusplus
extern "C" {
#endif
int UnsortedSegmentSumFp16(const float16_t *input, int unit_num, int input_dim1, const int *indices, float16_t *output,
int output_dim0, int output_dim1);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_UNSORTED_SEGMENT_SUM_H_

View File

@ -321,7 +321,7 @@ static int Filtergrad2Arm(const float *x, const float *dy, int i_c, int k_idx, f
dw[(i_c + 1) * k_spatial + k_idx] = sum_2[1];
i_c += 2;
}
return i_c += 2;
return i_c;
}
#endif
int ConvDwFilterGrad(const float *x, const float *dy, float *dw, int start, int count,

View File

@ -118,4 +118,11 @@ static inline float16x8_t MS_ERFX8_F16(float16x8_t src) {
dst[7] = erff(src[7]);
return dst;
}
static inline float32x4_t MS_VMLAL_F16(float16x4_t x, float16x4_t dy, float32x4_t sum) {
float32x4_t x_fp32 = MS_CVT_F32_F16(x);
float32x4_t dy_fp32 = MS_CVT_F32_F16(dy);
return vmlaq_f32(sum, x_fp32, dy_fp32);
}
#endif // MINDSPORE_NNACL_INTRINSICS_MS_SIMD_INSTRUCTIONS_FP16_H_

View File

@ -39,7 +39,7 @@ OpParameter *PopulatePartialParameter(const void *prim) {
MS_LOG(ERROR) << "malloc partial parameter failed.";
return nullptr;
}
memset(partial_parameter, 0, sizeof(PartialParameter));
memset(reinterpret_cast<void *>(partial_parameter), 0, sizeof(PartialParameter));
partial_parameter->op_parameter_.type_ = schema::PrimitiveType_PartialFusion;
partial_parameter->sub_graph_index_ = partial_prim->subGraphIndex();

View File

@ -39,7 +39,7 @@ OpParameter *PopulateWhileParameter(const void *prim) {
MS_LOG(ERROR) << "malloc WhileParemeter failed.";
return nullptr;
}
memset(while_paremeter, 0, sizeof(WhileParemeter));
memset(reinterpret_cast<void *>(while_paremeter), 0, sizeof(WhileParemeter));
while_paremeter->op_parameter_.type_ = schema::PrimitiveType_While;
while_paremeter->body_subgraph_index = while_prim->bodySubgraphIndex();

View File

@ -38,7 +38,7 @@ OpParameter *PopulateWhileParemeter(const void *prim) {
MS_LOG(ERROR) << "malloc WhileParemeter failed.";
return nullptr;
}
memset(param, 0, sizeof(WhileParemeter));
memset(reinterpret_cast<void *>(param), 0, sizeof(WhileParemeter));
param->op_parameter_.type_ = primitive->value_type();
param->body_subgraph_index = value->body_subgraph_index();

View File

@ -52,10 +52,26 @@ int ActivationGradCPUKernelFp16::DoActivation(int task_id) {
auto error_code = RET_OK;
if (param_act_grad_->type_ == schema::ActivationType_RELU) {
error_code = Fp16ReluGrad(yt_addr + start, input_addr + start, count, output_addr + start);
error_code = ReluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start);
} else if (param_act_grad_->type_ == schema::ActivationType_RELU6) {
error_code = Relu6Fp16Grad(yt_addr + start, input_addr + start, count, output_addr + start);
} else if (param_act_grad_->type_ == schema::ActivationType_LEAKY_RELU) {
error_code = LReluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start,
(float16_t)param_act_grad_->alpha_);
} else if (param_act_grad_->type_ == schema::ActivationType_SIGMOID) {
// Sigmoid gets the input tensors in reverse order!
error_code = Fp16SigmoidGrad(input_addr + start, yt_addr + start, count, output_addr + start);
error_code = SigmoidFp16Grad(input_addr + start, yt_addr + start, count, output_addr + start);
} else if (param_act_grad_->type_ == schema::ActivationType_TANH) {
error_code = TanhFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start);
} else if (param_act_grad_->type_ == schema::ActivationType_HSWISH) {
error_code = HSwishFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start);
} else if (param_act_grad_->type_ == schema::ActivationType_HSIGMOID) {
error_code = HSigmoidFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start);
} else if (param_act_grad_->type_ == schema::ActivationType_ELU) {
error_code =
EluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start, (float16_t)param_act_grad_->alpha_);
} else if (param_act_grad_->type_ == schema::ActivationType_GELU) {
error_code = GeluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start);
} else {
MS_LOG(ERROR) << "Activation type error";
return RET_ERROR;

View File

@ -14,12 +14,13 @@
* limitations under the License.
*/
#ifndef MINDSPORE_ACTIVATION_FP16_GRAD_H
#define MINDSPORE_ACTIVATION_FP16_GRAD_H
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_ACTIVATION_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_ACTIVATION_FP16_GRAD_H_
#include <vector>
#include "src/inner_kernel.h"
#include "nnacl/fp16_grad/activation_grad.h"
#include "nnacl/fp32_grad/activation_grad.h"
namespace mindspore::kernel {
class ActivationGradCPUKernelFp16 : public InnerKernel {
@ -27,7 +28,7 @@ class ActivationGradCPUKernelFp16 : public InnerKernel {
explicit ActivationGradCPUKernelFp16(OpParameter *param, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(param, inputs, outputs, ctx), thread_count_(ctx->thread_num_) {
param_act_grad_ = reinterpret_cast<ActivationGradParameterFp16 *>(param);
param_act_grad_ = reinterpret_cast<ActivationGradParameter *>(param);
}
~ActivationGradCPUKernelFp16() override = default;
@ -37,9 +38,9 @@ class ActivationGradCPUKernelFp16 : public InnerKernel {
int DoActivation(int task_id);
private:
ActivationGradParameterFp16 *param_act_grad_;
ActivationGradParameter *param_act_grad_;
int thread_count_;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_ACTIVATION_FP16_GRAD_H
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_ACTIVATION_FP16_GRAD_H_

View File

@ -0,0 +1,89 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16_grad/arithmetic_fp16_grad.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "nnacl/fp16_grad/arithmetic_grad.h"
#include "include/errorcode.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
namespace mindspore::kernel {
int ArithmeticGradCPUKernelFp16::Init() { return RET_OK; }
void ArithmeticGradCPUKernelFp16::ArithmeticGradMaximum(float16_t *dy, int dy_size, float16_t *dx1, int dx1_size,
float16_t *dx2, int dx2_size) {
auto x1 = reinterpret_cast<float16_t *>(in_tensors_[0]->data_c());
auto x2 = reinterpret_cast<float16_t *>(in_tensors_[1]->data_c());
dy = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c());
MaximumByAxesFp16(x1, x2, dy, arithmeticParameter_->in_shape0_, arithmeticParameter_->in_shape1_,
arithmeticParameter_->out_shape_, dx1, dx2, arithmeticParameter_->ndim_);
}
void ArithmeticGradCPUKernelFp16::ArithmeticGradMinimum(float16_t *dy, int dy_size, float16_t *dx1, int dx1_size,
float16_t *dx2, int dx2_size) {
auto x1 = reinterpret_cast<float16_t *>(in_tensors_[0]->data_c());
auto x2 = reinterpret_cast<float16_t *>(in_tensors_[1]->data_c());
dy = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c());
MinimumByAxesFp16(x1, x2, dy, arithmeticParameter_->in_shape0_, arithmeticParameter_->in_shape1_,
arithmeticParameter_->out_shape_, dx1, dx2, arithmeticParameter_->ndim_);
}
int ArithmeticGradCPUKernelFp16::ReSize() { return RET_OK; }
int ArithmeticGradCPUKernelFp16::Execute(int task_id) {
auto dy = reinterpret_cast<float16_t *>(in_tensors_[0]->data_c());
auto dx1 = reinterpret_cast<float16_t *>(out_tensors_[0]->data_c());
auto dx2 = reinterpret_cast<float16_t *>(out_tensors_[1]->data_c());
size_t dy_size = in_tensors_.at(0)->ElementsNum();
size_t dx1_size = out_tensors_.at(0)->ElementsNum();
size_t dx2_size = out_tensors_.at(1)->ElementsNum();
(this->*arithmetic_grad_)(dy, dy_size, dx1, dx1_size, dx2, dx2_size);
return RET_OK;
}
int ArithmeticGradRunFp16(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
MS_ASSERT(cdata != nullptr);
auto Arithmetic_kernel = reinterpret_cast<ArithmeticGradCPUKernelFp16 *>(cdata);
auto error_code = Arithmetic_kernel->Execute(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "ArithmeticGradRunFp16 error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int ArithmeticGradCPUKernelFp16::Run() {
int error_code = static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(ArithmeticGradRunFp16, this, 1);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "Arithmetic Grad function error error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_MaximumGrad, LiteKernelCreator<ArithmeticGradCPUKernelFp16>)
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_MinimumGrad, LiteKernelCreator<ArithmeticGradCPUKernelFp16>)
} // namespace mindspore::kernel

View File

@ -0,0 +1,80 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_ARITHMETIC_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_ARITHMETIC_FP16_GRAD_H_
#include <vector>
#include "src/inner_kernel.h"
#include "nnacl/fp16/arithmetic_fp16.h"
#include "schema/model_generated.h"
using mindspore::schema::PrimitiveType_AddGrad;
using mindspore::schema::PrimitiveType_DivGrad;
using mindspore::schema::PrimitiveType_MaximumGrad;
using mindspore::schema::PrimitiveType_MinimumGrad;
using mindspore::schema::PrimitiveType_MulGrad;
using mindspore::schema::PrimitiveType_SubGrad;
namespace mindspore::kernel {
class ArithmeticGradCPUKernelFp16;
class ArithmeticGradCPUKernelFp16 : public InnerKernel {
typedef void (ArithmeticGradCPUKernelFp16::*ArithmeticGradOperation)(float16_t *, int, float16_t *, int, float16_t *,
int);
public:
explicit ArithmeticGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(parameter, inputs, outputs, ctx), tile_data0(NULL), tile_data1(NULL), tile_data2(NULL) {
switch (type()) {
case PrimitiveType_MaximumGrad:
arithmetic_grad_ = &ArithmeticGradCPUKernelFp16::ArithmeticGradMaximum;
break;
case PrimitiveType_MinimumGrad:
arithmetic_grad_ = &ArithmeticGradCPUKernelFp16::ArithmeticGradMinimum;
break;
default:
MS_LOG(ERROR) << "Error Operator type " << parameter->type_;
break;
}
arithmeticParameter_ = reinterpret_cast<ArithmeticParameter *>(parameter);
}
~ArithmeticGradCPUKernelFp16() override {
if (tile_data0) delete[] tile_data0;
if (tile_data1) delete[] tile_data1;
if (tile_data2) delete[] tile_data2;
}
int Init() override;
int InferShape();
int ReSize() override;
int Run() override;
int Execute(int task_id);
private:
void ArithmeticGradMaximum(float16_t *dy, int dy_size, float16_t *dx1, int dx1_size, float16_t *dx2, int dx2_size);
void ArithmeticGradMinimum(float16_t *dy, int dy_size, float16_t *dx1, int dx1_size, float16_t *dx2, int dx2_size);
ArithmeticParameter *arithmeticParameter_;
ArithmeticGradOperation arithmetic_grad_;
float16_t *tile_data0;
float16_t *tile_data1;
float16_t *tile_data2;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_ARITHMETIC_FP16_GRAD_H_

View File

@ -0,0 +1,96 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <vector>
#include "src/runtime/kernel/arm/fp16_grad/bias_fp16_grad.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_BiasAddGrad;
namespace mindspore::kernel {
int BiasGradCPUKernelFp16::ReSize() {
auto dims = in_tensors_[0]->shape();
bias_param->ndim_ = dims.size();
for (unsigned int i = 0; i < bias_param->ndim_; i++) {
bias_param->in_shape0_[i] = dims[i];
bias_param->out_shape_[i] = 1;  // N, H, W reduce to 1; only C is kept below
}
bias_param->out_shape_[bias_param->ndim_ - 1] = dims[bias_param->ndim_ - 1];
for (auto i = bias_param->ndim_; i < 4; i++) {
bias_param->in_shape0_[i] = 0;
bias_param->out_shape_[i] = 0;
}
return RET_OK;
}
int BiasGradCPUKernelFp16::Init() {
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
}
int BiasGradCPUKernelFp16::Execute(int task_id) {
auto in = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
auto out = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
size_t nhw_size = 1;
size_t channels = bias_param->in_shape0_[bias_param->ndim_ - 1]; // C in NHWC
for (unsigned int i = 0; i < bias_param->ndim_ - 1; i++) {
nhw_size *= bias_param->in_shape0_[i];
}
size_t total_size = channels * nhw_size;
for (size_t c = 0; c < channels; ++c) {
float sum = 0;
for (size_t offset = 0; offset < total_size; offset += channels) {
sum += in[offset + c];
}
out[c] = (float16_t)sum;
}
return RET_OK;
}
int BiasGradFp16Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto bias_kernel = reinterpret_cast<BiasGradCPUKernelFp16 *>(cdata);
auto error_code = bias_kernel->Execute(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "bias error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int BiasGradCPUKernelFp16::Run() {
int error_code =
static_cast<const lite::InnerContext *>(this->context_)->thread_pool_->ParallelLaunch(BiasGradFp16Run, this, 1);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "bias function error error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_BiasAddGrad, LiteKernelCreator<BiasGradCPUKernelFp16>)
} // namespace mindspore::kernel

View File

@ -0,0 +1,44 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_BIAS_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_BIAS_FP16_GRAD_H_
#include <vector>
#include "src/lite_kernel.h"
#include "nnacl/fp16/arithmetic_fp16.h"
namespace mindspore::kernel {
class BiasGradCPUKernelFp16 : public InnerKernel {
public:
explicit BiasGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(parameter, inputs, outputs, ctx) {
bias_param = reinterpret_cast<ArithmeticParameter *>(parameter);
}
~BiasGradCPUKernelFp16() override = default;
int Init() override;
int ReSize() override;
int Run() override;
int Execute(int task_id);
private:
ArithmeticParameter *bias_param;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_BIAS_FP16_GRAD_H_

View File

@ -0,0 +1,166 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16_grad/bn_fp16_grad.h"
#include <math.h>
#include <algorithm>
#include <vector>
#include <string>
#include <thread>
#include <fstream>
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "nnacl/fp16_grad/batch_norm.h"
#include "include/errorcode.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_BatchNormGrad;
namespace mindspore::kernel {
int BNGradCPUKernelFp16::ReSize() {
auto *input_x = in_tensors_.at(1);
int channels = input_x->shape().at(kNHWC_C);
ws_size_ = 2 * channels;
// dxhat_sum / dxhathat_sum are accumulated in fp32, so the workspace must
// hold ws_size_ floats, not float16s.
set_workspace_size(ws_size_ * sizeof(float));
return RET_OK;
}
int BNGradCPUKernelFp16::Init() {
for (size_t i = 0; i < in_tensors_.size(); i++) {
if (in_tensors_.at(i)->data_type() != kNumberTypeFloat16) {
MS_LOG(ERROR) << "BNGradCPUKernelFp16 type error in_tensor_[" << i << "]";
return RET_ERROR;
}
}
return ReSize();
}
int BNGradCPUKernelFp16::Execute(int task_id) {
auto *input_yt = in_tensors_.at(0);
auto *input_x = in_tensors_.at(1);
auto *input_scale = in_tensors_.at(2);
auto *input_mean = in_tensors_.at(3);
auto *input_var = in_tensors_.at(4);
auto kernel_name = this->name();
if (kernel_name.find("FusedBatchNormGradCPU") != std::string::npos) {
input_mean = in_tensors_.at(4);
input_var = in_tensors_.at(5);
}
auto bn_param = reinterpret_cast<BNGradParameter *>(op_parameter_);
int stage = stage_;
int thread_num = thread_num_;
float16_t *save_mean = reinterpret_cast<float16_t *>(input_mean->data_c());
float16_t *save_var = reinterpret_cast<float16_t *>(input_var->data_c());
auto *output_dx = out_tensors_.at(0);
auto *output_scale = out_tensors_.at(1);
auto *output_bias = out_tensors_.at(2);
int32_t batch = input_x->Batch();
int32_t channels = input_x->Channel();
int32_t spatial = input_x->Height() * input_x->Width();
float *workspace_temp = static_cast<float *>(workspace());
float *dxhat_sum = workspace_temp;
float *dxhathat_sum = dxhat_sum + channels;
float16_t *x = reinterpret_cast<float16_t *>(input_x->data_c());
float16_t *yt = reinterpret_cast<float16_t *>(input_yt->data_c());
float16_t *scale = reinterpret_cast<float16_t *>(input_scale->data_c());
float16_t *dx = reinterpret_cast<float16_t *>(output_dx->data_c());
float16_t *dbias = reinterpret_cast<float16_t *>(output_bias->data_c());
float16_t *dscale = reinterpret_cast<float16_t *>(output_scale->data_c());
int total = spatial * batch;
int stride = UP_DIV(total, thread_num);
int count = MSMIN(stride, total - stride * task_id);
count = (count < 0) ? 0 : count;
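// Staged backward pass: stage 0 prepares buffers (and, when single-threaded,
// runs the whole backward); stage 1 reduces the dxhat sums plus dbias and
// dscale; stage 2 computes dx over parallel batch/spatial slices.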
switch (stage) {
case 0: {
for (int job = task_id; job < 4; job += thread_num) {
switch (job) {
case 0:
var2InvarFp16(save_var, input_var->ElementsNum(), bn_param->epsilon_);
break;
case 1:
std::fill(workspace_temp, workspace_temp + ws_size_, 0.f);
break;
case 2:
std::fill(dbias, dbias + channels, 0.f);
break;
case 3:
std::fill(dscale, dscale + channels, 0.f);
break;
}
}
if (thread_num == 1) {
backwardAllFp16(x, yt, save_mean, save_var, scale, total, channels, dxhat_sum, dxhathat_sum, dbias, dscale, dx);
}
break;
}
case 1: {
backwardP1Fp16(x, yt, save_mean, save_var, scale, total, channels, dxhat_sum, dxhathat_sum, dbias, dscale);
break;
}
case 2: {
backwardP2Fp16(x + task_id * stride * channels, yt + task_id * stride * channels, save_mean, save_var, scale,
count, total, channels, dxhat_sum, dxhathat_sum, dx + task_id * stride * channels);
break;
}
}
return RET_OK;
}
int BNGradFp16Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
MS_ASSERT(cdata != nullptr);
auto bn_kernel = reinterpret_cast<BNGradCPUKernelFp16 *>(cdata);
auto error_code = bn_kernel->Execute(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "BNGradRun error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int BNGradCPUKernelFp16::Run() {
stage_ = 0;
thread_num_ = context_->thread_num_;
if (thread_num_ == 1) {
int error_code = static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(BNGradFp16Run, this, thread_num_);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "BN function error error_code[" << error_code << "]";
return RET_ERROR;
}
} else {
const std::vector<int> threads = {thread_num_, 1, thread_num_};
for (size_t stage = 0; stage < threads.size(); stage++) {
stage_ = static_cast<int>(stage);
int error_code = static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(BNGradFp16Run, this, threads.at(stage));
if (error_code != RET_OK) {
MS_LOG(ERROR) << "BN function error error_code[" << error_code << "]";
return RET_ERROR;
}
}
}
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_BatchNormGrad, LiteKernelCreator<BNGradCPUKernelFp16>)
} // namespace mindspore::kernel

View File

@ -0,0 +1,43 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_BN_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_BN_FP16_GRAD_H_
#include <vector>
#include "src/lite_kernel.h"
#include "nnacl/fp32_grad/batch_norm.h"
namespace mindspore::kernel {
class BNGradCPUKernelFp16 : public InnerKernel {
public:
explicit BNGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(parameter, inputs, outputs, ctx) {}
~BNGradCPUKernelFp16() override {}
int Init() override;
int ReSize() override;
int Run() override;
int Execute(int task_id);
private:
int thread_num_ = 1;
int stage_ = 0;
size_t ws_size_ = 0;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_BN_FP16_GRAD_H_

View File

@ -0,0 +1,205 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16_grad/convolution_fp16_grad_filter.h"
#include "src/kernel_registry.h"
#include "nnacl/pack.h"
#include "nnacl/fp16_grad/convolution_grad_filter.h"
#include "nnacl/fp16_grad/pack_fp16_ext.h"
#include "nnacl/fp16_grad/gemm_fp16.h"
#include "include/errorcode.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Conv2DBackpropFilterFusion;
namespace mindspore::kernel {
int ConvolutionGradFilterCPUKernelFp16::ReSize() {
// dy is in input 0
// x is in input 1
// dw is output 0
auto *x_tensor = in_tensors_.at(1);
MS_ASSERT(x_tensor != nullptr);
auto *dy_tensor = in_tensors_.at(0);
MS_ASSERT(dy_tensor != nullptr);
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter_);
conv_param->output_batch_ = dy_tensor->shape().at(kNHWC_N);
conv_param->input_batch_ = x_tensor->shape().at(kNHWC_N);
conv_param->input_h_ = x_tensor->shape().at(kNHWC_H);
conv_param->input_w_ = x_tensor->shape().at(kNHWC_W);
// assume OutCh|kh|kw|InCh
conv_param->input_channel_ = x_tensor->shape().at(kNHWC_C);
conv_param->output_channel_ = dy_tensor->shape().at(kNHWC_C);
conv_param->output_h_ = dy_tensor->shape()[kNHWC_H];
conv_param->output_w_ = dy_tensor->shape()[kNHWC_W];
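// 1x1, stride-1, pad-free, non-grouped convolutions can feed the raw input
// to GEMM directly; everything else goes through rolling im2col. Depthwise
// shapes (group == in_ch == out_ch, dilation 1) take a dedicated path.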
do_img2col_ = (conv_param->kernel_h_ == 1) && (conv_param->kernel_w_ == 1) && (conv_param->pad_d_ == 0) &&
(conv_param->pad_u_ == 0) && (conv_param->pad_l_ == 0) && (conv_param->pad_r_ == 0) &&
(conv_param->dilation_h_ == 1) && (conv_param->dilation_w_ == 1) && (conv_param->stride_h_ == 1) &&
(conv_param->stride_w_ == 1) && (conv_param->group_ == 1)
? false
: true;
do_dw_ = (conv_param->output_channel_ == conv_param->group_) &&
(conv_param->input_channel_ == conv_param->output_channel_) && (conv_param->dilation_h_ == 1) &&
(conv_param->dilation_w_ == 1)
? true
: false;
ws_size_ = chunk_ * conv_param->kernel_h_ * conv_param->kernel_w_ * conv_param->input_channel_;
ws_size_ = do_dw_ ? ws_size_ : ws_size_ / conv_param->group_;
int n = conv_param->kernel_h_ * conv_param->kernel_w_ * conv_param->input_channel_ / conv_param->group_;
int k = conv_param->output_channel_ / conv_param->group_;
int thread_num = context_->thread_num_;
mat_alloc_ = MatSizeTotalFp16(k, n, chunk_, 0);
set_workspace_size((ws_size_ + mat_alloc_ + (k * n)) * thread_num * sizeof(float16_t));
return RET_OK;
}
int ConvolutionGradFilterCPUKernelFp16::Init() { return ReSize(); }
int ConvolutionGradFilterCPUKernelFp16::Execute(int task_id) {
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter_);
auto *input_dy = in_tensors_.at(0);
auto *input_x = in_tensors_.at(1);
auto *out_dw = out_tensors_.at(0);
auto x_addr = reinterpret_cast<float16_t *>(input_x->data_c());
auto dy_addr = reinterpret_cast<float16_t *>(input_dy->data_c());
auto dw_addr = reinterpret_cast<float16_t *>(out_dw->data_c());
int nweights = out_dw->ElementsNum();
int in_ch = conv_param->input_channel_;
int in_h = conv_param->input_h_;
int in_w = conv_param->input_w_;
int k_h = conv_param->kernel_h_;
int k_w = conv_param->kernel_w_;
int batch = conv_param->output_batch_;
int out_ch = conv_param->output_channel_;
int groups = conv_param->group_;
int out_h = conv_param->output_h_;
int out_w = conv_param->output_w_;
int m = out_h * out_w;
int n = k_h * k_w * in_ch / groups;
int k = out_ch / groups;
int thread_num = context_->thread_num_;
float16_t *workspace_temp = reinterpret_cast<float16_t *>(workspace());
float16_t *mat_workspace = workspace_temp + ws_size_ * thread_num + task_id * (mat_alloc_ + k * n);
float16_t *mat_tmp = mat_workspace + mat_alloc_;
int stride = UP_DIV(batch, thread_num);
int count = MSMIN(stride, batch - stride * task_id);
count = (count < 0) ? 0 : count;
int start = stride * task_id;
int end = start + count;
if (do_dw_) {
#ifdef ENABLE_ARM
stride = UP_DIV(k_h * k_w, thread_num);
count = MSMIN(stride, k_h * k_w - stride * task_id);
count = (count < 0) ? 0 : count;
start = stride * task_id;
ConvDwFilterFp16Grad(x_addr, dy_addr, dw_addr, start, count, conv_param);
#else
stride = UP_DIV(groups, thread_num);
count = MSMIN(stride, groups - stride * task_id);
start = stride * task_id;
end = start + count;
const int kernel_spatial = k_h * k_w;
for (int i = 0; i < batch; ++i) {
for (int ci = 0; ci < m; ci += chunk_) {
int real_chunk = MSMIN(m - ci, chunk_);
float16_t *mat_b = workspace_temp + task_id * ws_size_;
float16_t *im = x_addr + (i * in_ch * in_h * in_w);
RollingIm2ColPackDwUnitFp16(im, conv_param, mat_b, real_chunk, ci);
for (int j = start; j < end; ++j) {
float16_t *mat_a = dy_addr + (i * groups) * m * k + j * (out_ch / groups) + ci * out_ch;
float16_t *mat_c = dw_addr + j * nweights / groups;
GemmMatmulFp16(1, 0, k, n, real_chunk, 1, mat_a, out_ch, mat_b + (j * kernel_spatial), n * groups, 1, mat_c,
n, mat_workspace);
}
}
}
#endif
} else if (do_img2col_) {
for (int i = start; i < end; ++i) {
for (int ci = 0; ci < m; ci += chunk_) {
for (int j = 0; j < groups; ++j) {
int real_chunk = MSMIN(m - ci, chunk_);
float16_t *mat_a = dy_addr + (i * groups) * m * k + j * (out_ch / groups) + ci * out_ch;
float16_t *mat_b = workspace_temp + task_id * ws_size_;
float16_t *mat_c = dw_addr + j * nweights / groups;
float16_t *im = x_addr + (i * in_ch * in_h * in_w) + j * (in_ch / groups);
RollingIm2ColPackUnitFp16(im, conv_param, mat_b, real_chunk, ci);
GemmMatmulFp16(1, 0, k, n, real_chunk, 1, mat_a, out_ch, mat_b, n, 0, mat_tmp, n, mat_workspace);
std::unique_lock<std::mutex> merge_lock(lock_);
AddMatrixFp16(mat_tmp, mat_c, 1, k, n, n);
}
}
}
} else {
float16_t *mat_c = dw_addr;
const size_t in_plane_size = in_ch * in_h * in_w;
for (int i = start; i < end; ++i) {
for (int ci = 0; ci < m; ci += chunk_) {
int real_chunk = MSMIN(m - ci, chunk_);
float16_t *mat_a = dy_addr + i * m * k + ci * out_ch;
float16_t *im = x_addr + i * in_plane_size;
int input_h = ci / out_w * conv_param->stride_h_;
int input_w = ci % out_w * conv_param->stride_w_;
int offset = (input_h * in_w + input_w) * in_ch;
GemmMatmulFp16(1, 0, k, n, real_chunk, 1, mat_a, out_ch, im + offset, n, 0, mat_tmp, n, mat_workspace);
std::unique_lock<std::mutex> merge_lock(lock_);
AddMatrixFp16(mat_tmp, mat_c, 1, k, n, n);
}
}
}
return RET_OK;
}
int ConvolutionGradFilterFp16Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
MS_ASSERT(cdata != nullptr);
auto convfilter_kernel = reinterpret_cast<ConvolutionGradFilterCPUKernelFp16 *>(cdata);
auto error_code = convfilter_kernel->Execute(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "ConvolutionGradFilterRun error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int ConvolutionGradFilterCPUKernelFp16::Run() {
auto *out_dw = out_tensors_.at(0);
auto dw_addr = reinterpret_cast<float16_t *>(out_dw->data_c());
memset(dw_addr, 0, out_dw->Size());
int error_code = static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(ConvolutionGradFilterFp16Run, this, context_->thread_num_);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "conv filter function error error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Conv2DBackpropFilterFusion,
LiteKernelCreator<ConvolutionGradFilterCPUKernelFp16>)
} // namespace mindspore::kernel

View File

@ -0,0 +1,51 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_CONVOLUTION_FP16_GRAD_FILTER_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_CONVOLUTION_FP16_GRAD_FILTER_H_
#include <vector>
#include "src/inner_kernel.h"
namespace mindspore::kernel {
class ConvolutionGradFilterCPUKernelFp16 : public InnerKernel {
public:
explicit ConvolutionGradFilterCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(parameter, inputs, outputs, ctx) {}
~ConvolutionGradFilterCPUKernelFp16() override {}
int Init() override;
int ReSize() override;
int Run() override;
int Execute(int task_id);
private:
size_t ws_size_ = 0;
bool do_img2col_ = true;
bool do_dw_ = false;
std::mutex lock_;
size_t mat_alloc_ = 0;
#ifdef ENABLE_ARM32
const int chunk_ = C4NUM * 2;
#else
const int chunk_ = C12NUM * 2;
#endif
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_CONVOLUTION_FP16_GRAD_FILTER_H_

View File

@ -0,0 +1,172 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16_grad/convolution_fp16_grad_input.h"
#include "src/kernel_registry.h"
#include "nnacl/pack.h"
#include "nnacl/fp16_grad/pack_fp16_ext.h"
#include "nnacl/fp16_grad/gemm_fp16.h"
#include "include/errorcode.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Conv2DBackpropInputFusion;
namespace mindspore::kernel {
int ConvolutionGradInputCPUKernelFp16::ReSize() {
auto *dy_tensor = in_tensors_.at(kInputIndex);
MS_ASSERT(dy_tensor != nullptr);
auto *weight_tensor = in_tensors_.at(kWeightIndex);
MS_ASSERT(weight_tensor != nullptr);
auto *dx_tensor = out_tensors_.at(kOutputIndex);
MS_ASSERT(dx_tensor != nullptr);
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter_);
conv_param->output_batch_ = dx_tensor->shape()[(kNHWC_N)];
conv_param->input_batch_ = dy_tensor->shape()[(kNHWC_N)];
conv_param->input_h_ = dx_tensor->shape()[(kNHWC_H)];
conv_param->input_w_ = dx_tensor->shape()[(kNHWC_W)];
// assume OutCh|kh|kw|In
conv_param->input_channel_ = dx_tensor->shape()[(kNHWC_C)];
conv_param->output_channel_ = weight_tensor->shape()[(kNHWC_N)];
conv_param->output_h_ = dy_tensor->shape()[kNHWC_H];
conv_param->output_w_ = dy_tensor->shape()[kNHWC_W];
ws_size_ = chunk_ * conv_param->kernel_h_ * conv_param->kernel_w_ * conv_param->input_channel_ / conv_param->group_;
int n = conv_param->kernel_w_ * conv_param->kernel_h_ * conv_param->input_channel_ / conv_param->group_;
int k = conv_param->output_channel_ / conv_param->group_;
int thread_num = context_->thread_num_;
mat_alloc_ = MatSizeTotalFp16(chunk_, n, k, 0);
set_workspace_size((ws_size_ + mat_alloc_) * sizeof(float16_t) * thread_num);
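// im2col can be skipped only for a 1x1, unpadded, unit-stride, unit-dilation, single-group convolution.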
do_img2col_ = !((conv_param->kernel_h_ == 1) && (conv_param->kernel_w_ == 1) && (conv_param->pad_d_ == 0) &&
(conv_param->pad_u_ == 0) && (conv_param->pad_l_ == 0) && (conv_param->pad_r_ == 0) &&
(conv_param->dilation_h_ == 1) && (conv_param->dilation_w_ == 1) && (conv_param->stride_h_ == 1) &&
(conv_param->stride_w_ == 1) && (conv_param->group_ == 1));
return RET_OK;
}
int ConvolutionGradInputCPUKernelFp16::Init() { return ReSize(); }
int ConvolutionGradInputCPUKernelFp16::Execute(int task_id) {
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter_);
auto *input_dy = in_tensors_.at(0);
auto *input_w = in_tensors_.at(1);
auto *out_dx = out_tensors_.at(0);
auto dy_addr = reinterpret_cast<float16_t *>(input_dy->data_c());
auto w_addr = reinterpret_cast<float16_t *>(input_w->data_c());
auto dx_addr = reinterpret_cast<float16_t *>(out_dx->data_c());
int nweights = input_w->ElementsNum();
int in_ch = conv_param->input_channel_;
int in_h = conv_param->input_h_;
int in_w = conv_param->input_w_;
int k_h = conv_param->kernel_h_; // out_dw->shape()[1];
int k_w = conv_param->kernel_w_; // out_dw->shape()[2];
int batch = conv_param->output_batch_;
int out_ch = conv_param->output_channel_;
int groups = conv_param->group_;
int out_h = conv_param->output_h_;
int out_w = conv_param->output_w_;
int thread_num = context_->thread_num_;
int m = out_h * out_w;
int n = k_w * k_h * in_ch / groups;
int k = out_ch / groups;
float16_t *workspace_temp = reinterpret_cast<float16_t *>(workspace()) + task_id * (mat_alloc_ + ws_size_);
float16_t *mat_workspace = workspace_temp + ws_size_;
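// Split the batch dimension across threads; this task handles batches [start, end).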
int stride = UP_DIV(batch, thread_num);
int count = MSMIN(stride, batch - stride * task_id);
count = (count < 0) ? 0 : count;
int start = stride * task_id;
int end = start + count;
for (int i = start; i < end; ++i) {
for (int j = 0; j < groups; ++j) {
GemmCbFp16 gcb;
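// Walk the output rows in chunks of chunk_; after the first chunk the GEMM callback reuses its cached packed copy of the weight matrix (gcb.mat_b).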
for (int ci = 0; ci < m; ci += chunk_) {
float16_t *mat_b = nullptr;
if (ci == 0) {
mat_b = w_addr + j * nweights / groups;
gcb.ca = 0;
gcb.cb = 0;
gcb.bias = nullptr;
gcb.atype = ActType_No;
} else {
mat_b = gcb.mat_b;
gcb.cb = 1;
}
int real_chunk = MSMIN(m - ci, chunk_);
float16_t *mat_a = dy_addr + (i * groups) * m * k + j * (out_ch / groups) + ci * out_ch;
if (do_img2col_) {
float16_t *mat_c = workspace_temp;
GemmMatmulPlusFp16(0, 0, real_chunk, n, k, 1, mat_a, out_ch, mat_b, n, 0, mat_c, n, mat_workspace, &gcb);
RollingCol2ImPackUnitFp16(mat_c,
dx_addr + (i * groups) * (in_ch / groups) * in_h * in_w + j * (in_ch / groups),
conv_param, real_chunk, ci);
} else {
float16_t *mat_c =
dx_addr + (i * groups) * (in_ch / groups) * in_h * in_w + j * (in_ch / groups) + ci * (in_ch / groups);
GemmMatmulPlusFp16(0, 0, real_chunk, n, k, 1, mat_a, out_ch, mat_b, n, 0, mat_c, n, mat_workspace, &gcb);
}
}
}
}
return RET_OK;
}
int ConvolutionGradInputFp16Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
MS_ASSERT(cdata != nullptr);
auto convinput_kernel = reinterpret_cast<ConvolutionGradInputCPUKernelFp16 *>(cdata);
auto error_code = convinput_kernel->Execute(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "conv input error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int ConvolutionGradInputCPUKernelFp16::Run() {
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter_);
int batch = conv_param->output_batch_;
int in_ch = conv_param->input_channel_;
int in_h = conv_param->input_h_;
int in_w = conv_param->input_w_;
auto *out_dx = out_tensors_.at(0);
auto dx_addr = reinterpret_cast<float16_t *>(out_dx->data_c());
memset(dx_addr, 0, sizeof(float16_t) * batch * in_ch * in_h * in_w);
int error_code = static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(ConvolutionGradInputFp16Run, this, context_->thread_num_);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "bias function error error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Conv2DBackpropInputFusion,
LiteKernelCreator<ConvolutionGradInputCPUKernelFp16>)
} // namespace mindspore::kernel

View File

@ -0,0 +1,44 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_CONVOLUTION_FP16_GRAD_INPUT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_CONVOLUTION_FP16_GRAD_INPUT_H_
#include <vector>
#include "src/inner_kernel.h"
namespace mindspore::kernel {
class ConvolutionGradInputCPUKernelFp16 : public InnerKernel {
public:
explicit ConvolutionGradInputCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(parameter, inputs, outputs, ctx) {}
~ConvolutionGradInputCPUKernelFp16() override {}
int Init() override;
int ReSize() override;
int Run() override;
int Execute(int task_id);
private:
size_t ws_size_ = 0;
size_t mat_alloc_ = 0;
bool do_img2col_ = true;
const int chunk_ = C12NUM;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_CONVOLUTION_FP16_GRAD_INPUT_H_

View File

@ -0,0 +1,95 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <random>
#include "src/runtime/kernel/arm/fp16_grad/dropout_fp16_grad.h"
#include "nnacl/fp16_grad/dropout_grad.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "nnacl/fp32_grad/dropout_parameter.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_NULL_PTR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_DropoutGrad;
namespace mindspore::kernel {
int DropoutGradCPUKernelFp16::Init() {
auto param = reinterpret_cast<DropoutParameter *>(op_parameter_);
if (param == nullptr) {
MS_LOG(ERROR) << "Dropout op_parameter_ nullptr";
return RET_NULL_PTR;
}
auto ratio = param->ratio_;
if ((ratio > 1.0f) || (ratio < 0.0f)) {
MS_LOG(ERROR) << "unsupported ratio value - Dropout ratio should be between zero to one";
return RET_ERROR;
}
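// Inverted dropout: the forward pass scales kept units by 1/(1-ratio), so the gradient applies the same scale.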
if (ratio >= 1.0f) {
scale_ = 1.0f;
} else {
scale_ = 1.0f / (1.0f - ratio);
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
}
int DropoutGradCPUKernelFp16::ReSize() { return RET_OK; }
int DropoutGradCPUKernelFp16::Execute(int task_id) {
auto yt_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(kInputIndex)->data_c());
auto mask_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c());
auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(kOutputIndex)->data_c());
auto length = in_tensors_.at(kInputIndex)->ElementsNum();
int stride = UP_DIV(length, thread_count_);
int count = MSMIN(stride, length - stride * task_id);
if (count > 0) {
int start = stride * task_id;
DropoutFp16Grad(&(yt_ptr[start]), &(mask_ptr[start]), &(output_ptr[start]), count, (float16_t)scale_);
}
return RET_OK;
}
int RunDropoutFp16Grad(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto dropout = reinterpret_cast<DropoutGradCPUKernelFp16 *>(cdata);
auto error_code = dropout->Execute(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "Dropout Grad Run error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int DropoutGradCPUKernelFp16::Run() {
int error_code = static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(RunDropoutFp16Grad, this, thread_count_);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "Dropout Grad function error error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_DropoutGrad, LiteKernelCreator<DropoutGradCPUKernelFp16>)
} // namespace mindspore::kernel

View File

@ -0,0 +1,43 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_DROPOUT_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_DROPOUT_FP16_GRAD_H_
#include <vector>
#include "src/inner_kernel.h"
namespace mindspore::kernel {
class DropoutGradCPUKernelFp16 : public InnerKernel {
public:
DropoutGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(parameter, inputs, outputs, ctx), thread_count_(ctx->thread_num_) {}
~DropoutGradCPUKernelFp16() override = default;
int Init() override;
int ReSize() override;
int Run() override;
int Execute(int task_id);
private:
float scale_ = 1.0f;
int thread_count_ = 1;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_DROPOUT_FP16_GRAD_H_

View File

@ -0,0 +1,109 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16_grad/layernorm_fp16_grad.h"
#include <vector>
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "nnacl/fp16_grad/layernorm_grad.h"
#include "nnacl/fp32_grad/layernormgrad_parameter.h"
#include "include/errorcode.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_LayerNormGrad;
namespace mindspore::kernel {
int LayerNormGradCPUKernelFp16::ReSize() { return RET_OK; }
int LayerNormGradCPUKernelFp16::Init() {
auto lngrad_param = reinterpret_cast<LayerNormGradParameter *>(op_parameter_);
auto *input_x = in_tensors_.at(0);
std::vector<int> x_shape = input_x->shape();
int begin_norm_axis = lngrad_param->begin_norm_axis_;
if (begin_norm_axis < 0) {
begin_norm_axis += x_shape.size();
}
auto begin_params_axis = lngrad_param->begin_params_axis_;
if (begin_params_axis < 0) {
begin_params_axis += x_shape.size();
}
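// Axes before begin_norm_axis form independent normalization blocks; axes from begin_params_axis on hold gamma/beta.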
for (size_t i = 0; i < static_cast<size_t>(begin_norm_axis); i++) {
block_num_ *= x_shape[i];
}
for (size_t i = static_cast<size_t>(begin_norm_axis); i < x_shape.size(); i++) {
block_size_ *= x_shape[i];
}
for (size_t i = 0; i < static_cast<size_t>(begin_params_axis); i++) {
param_size_ *= x_shape[i];
}
for (size_t i = static_cast<size_t>(begin_params_axis); i < x_shape.size(); i++) {
param_num_ *= x_shape[i];
}
if (block_num_ <= 0 || block_size_ <= 0) {
MS_LOG(ERROR) << "LayerNormGradCPUKernelFp16 input shape error, input shape: " << x_shape;
return RET_ERROR;
}
return RET_OK;
}
int LayerNormGradCPUKernelFp16::Execute(int task_id) {
auto input_x = in_tensors_.at(0);
auto input_dy = in_tensors_.at(1);
auto input_var = in_tensors_.at(2);
auto input_mean = in_tensors_.at(3);
auto input_gamma = in_tensors_.at(4);
auto output_dx = out_tensors_.at(0);
auto output_dg = out_tensors_.at(1);
auto output_db = out_tensors_.at(2);
float16_t *x = reinterpret_cast<float16_t *>(input_x->data_c());
float16_t *dy = reinterpret_cast<float16_t *>(input_dy->data_c());
float16_t *var = reinterpret_cast<float16_t *>(input_var->data_c());
float16_t *mean = reinterpret_cast<float16_t *>(input_mean->data_c());
float16_t *gamma = reinterpret_cast<float16_t *>(input_gamma->data_c());
float16_t *dx = reinterpret_cast<float16_t *>(output_dx->data_c());
float16_t *dg = reinterpret_cast<float16_t *>(output_dg->data_c());
float16_t *db = reinterpret_cast<float16_t *>(output_db->data_c());
LayerNormFp16Grad(x, dy, var, mean, gamma, param_num_, param_size_, block_num_, block_size_, dx, dg, db);
return RET_OK;
}
int LayerNormF16GradRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
MS_ASSERT(cdata != nullptr);
auto ln_kernel = reinterpret_cast<LayerNormGradCPUKernelFp16 *>(cdata);
auto error_code = ln_kernel->Execute(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "LayerNormGradRun error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int LayerNormGradCPUKernelFp16::Run() {
int error_code =
static_cast<const lite::InnerContext *>(this->context_)->thread_pool_->ParallelLaunch(LayerNormF16GradRun, this, 1);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "LayerNorm function error error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_LayerNormGrad, LiteKernelCreator<LayerNormGradCPUKernelFp16>)
} // namespace mindspore::kernel

View File

@ -0,0 +1,43 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_LAYERNORM_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_LAYERNORM_FP16_GRAD_H_
#include <vector>
#include "src/inner_kernel.h"
namespace mindspore::kernel {
class LayerNormGradCPUKernelFp16 : public InnerKernel {
public:
explicit LayerNormGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(parameter, inputs, outputs, ctx) {}
~LayerNormGradCPUKernelFp16() override {}
int Init() override;
int ReSize() override;
int Run() override;
int Execute(int task_id);
private:
int block_num_ = 1;
int block_size_ = 1;
int param_num_ = 1;
int param_size_ = 1;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_LAYERNORM_FP16_GRAD_H_

View File

@ -0,0 +1,69 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16_grad/neg_fp16_grad.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "nnacl/fp16/arithmetic_self_fp16.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_NegGrad;
namespace mindspore::kernel {
namespace {
int NegGradRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
MS_ASSERT(cdata != nullptr);
auto kernel = reinterpret_cast<NegGradCPUKernelFp16 *>(cdata);
MS_ASSERT(kernel != nullptr);
return kernel->DoNegGrad(task_id);
}
} // namespace
int NegGradCPUKernelFp16::Init() { return RET_OK; }
int NegGradCPUKernelFp16::DoNegGrad(int task_id) {
auto dy = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
auto dx = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
int length = in_tensors_.at(0)->ElementsNum();
int stride = UP_DIV(length, thread_count_);
int count = MSMIN(stride, length - stride * task_id);
count = (count < 0) ? 0 : count;
int start = stride * task_id;
ElementNegativeFp16(dy + start, dx + start, count);
return RET_OK;
}
int NegGradCPUKernelFp16::ReSize() { return RET_OK; }
int NegGradCPUKernelFp16::Run() {
int ret = static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(NegGradRun, this, thread_count_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "parallel launch fail!ret: " << ret;
return ret;
}
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_NegGrad, LiteKernelCreator<NegGradCPUKernelFp16>)
} // namespace mindspore::kernel

View File

@ -0,0 +1,42 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_NEG_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_NEG_FP16_GRAD_H_
#include <vector>
#include "src/inner_kernel.h"
#include "schema/model_generated.h"
namespace mindspore::kernel {
class NegGradCPUKernelFp16 : public InnerKernel {
public:
explicit NegGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(parameter, inputs, outputs, ctx), thread_count_(ctx->thread_num_) {}
~NegGradCPUKernelFp16() override {}
int Init() override;
int ReSize() override;
int Run() override;
int DoNegGrad(int thread_id);
private:
int thread_count_;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_NEG_FP16_GRAD_H_

View File

@ -0,0 +1,113 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16_grad/pooling_fp16_grad.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "nnacl/fp16/pooling_fp16.h"
#include "nnacl/fp16_grad/pooling_grad.h"
#include "include/errorcode.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_AvgPoolGrad;
using mindspore::schema::PrimitiveType_MaxPoolGrad;
namespace mindspore::kernel {
int PoolingGradCPUKernelFp16::ReSize() {
PoolingParameter *pool_param = reinterpret_cast<PoolingParameter *>(op_parameter_);
auto in_shape = in_tensors_.at(0)->shape();
auto out_shape = in_tensors_.at(1)->shape();
if (pool_param->pool_mode_ == PoolMode_AvgPool) {
out_shape = in_tensors_.at(2)->shape();
}
int input_h = in_shape.at(1);
int input_w = in_shape.at(2);
if (pool_param->global_) {
pool_param->window_w_ = input_w;
pool_param->window_h_ = input_h;
}
pool_param->input_h_ = in_shape[kNHWC_H];
pool_param->input_w_ = in_shape[kNHWC_W];
pool_param->input_batch_ = in_shape[kNHWC_N];
pool_param->input_channel_ = in_shape[kNHWC_C];
pool_param->output_h_ = out_shape[kNHWC_H];
pool_param->output_w_ = out_shape[kNHWC_W];
pool_param->output_batch_ = out_shape[kNHWC_N];
pool_param->output_channel_ = out_shape[kNHWC_C];
return RET_OK;
}
int PoolingGradCPUKernelFp16::Init() { return ReSize(); }
int PoolingGradCPUKernelFp16::Execute(int task_id) {
PoolingParameter *pool_param = reinterpret_cast<PoolingParameter *>(op_parameter_);
auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
int stride = UP_DIV(pool_param->output_batch_, thread_num_);
int count = MSMIN(stride, pool_param->output_batch_ - stride * task_id);
if (count > 0) {
int in_batch_size = pool_param->input_h_ * pool_param->input_w_ * pool_param->input_channel_;
int out_batch_size = pool_param->output_h_ * pool_param->output_w_ * pool_param->input_channel_;
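// The gradient is scattered/accumulated into dx, so this task's slice must be zeroed first.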
std::fill(output_ptr + task_id * stride * in_batch_size, output_ptr + ((task_id * stride) + count) * in_batch_size,
0.f);
if (pool_param->pool_mode_ == PoolMode_MaxPool) {
auto dy_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(2)->data_c());
MaxPoolingFp16Grad(input_ptr + task_id * stride * in_batch_size, dy_ptr + task_id * stride * out_batch_size,
output_ptr + task_id * stride * in_batch_size, count, pool_param);
} else {
input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(2)->data_c());
AvgPoolingFp16Grad(input_ptr + task_id * stride * out_batch_size, output_ptr + task_id * stride * in_batch_size,
count, pool_param);
}
}
return RET_OK;
}
int PoolingFp16GradImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
MS_ASSERT(cdata != nullptr);
auto pooling = reinterpret_cast<PoolingGradCPUKernelFp16 *>(cdata);
auto error_code = pooling->Execute(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "Pooling Run error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int PoolingGradCPUKernelFp16::Run() {
thread_num_ = context_->thread_num_;
int error_code = static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(PoolingFp16GradImpl, this, thread_num_);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "pooling error error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_AvgPoolGrad, LiteKernelCreator<PoolingGradCPUKernelFp16>)
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_MaxPoolGrad, LiteKernelCreator<PoolingGradCPUKernelFp16>)
} // namespace mindspore::kernel

View File

@ -0,0 +1,47 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_POOLING_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_POOLING_FP16_GRAD_H_
#include <vector>
#include "src/inner_kernel.h"
namespace mindspore::kernel {
using mindspore::schema::PadMode;
using mindspore::schema::PoolMode;
using mindspore::schema::QuantType;
using mindspore::schema::RoundMode;
class PoolingGradCPUKernelFp16 : public InnerKernel {
public:
explicit PoolingGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(parameter, inputs, outputs, ctx) {}
~PoolingGradCPUKernelFp16() override = default;
int Init() override;
int ReSize() override;
int Run() override;
int Execute(int task_id);
private:
int thread_num_ = 1;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_POOLING_FP16_GRAD_H_

View File

@ -0,0 +1,104 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <vector>
#include "src/runtime/kernel/arm/fp16_grad/resize_fp16_grad.h"
#include "nnacl/fp16_grad/resize_grad.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_ResizeGrad;
namespace mindspore::kernel {
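// Maps output coordinates back to input coordinates; with align_corners the corner pixels of both grids coincide.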
float16_t ScalingFp16(size_t in_size, size_t out_size, bool align_corners) {
return (align_corners && out_size > 1) ? (in_size - 1) / static_cast<float16_t>(out_size - 1)
: in_size / static_cast<float16_t>(out_size);
}
int ResizeGradCPUKernelFp16::ReSize() {
auto param = reinterpret_cast<ResizeFp16GradParameter *>(op_parameter_);
if (param == nullptr) {
MS_LOG(ERROR) << "ResizeGradCPUKernelFp16 op_parameter_ is nullptr";
return RET_ERROR;
}
bool align_corners = param->align_corners_;
param->in_height_ = static_cast<size_t>(in_tensors_.at(0)->Height());
param->in_width_ = static_cast<size_t>(in_tensors_.at(0)->Width());
param->out_height_ = static_cast<size_t>(out_tensors_.at(0)->Height());
param->out_width_ = static_cast<size_t>(out_tensors_.at(0)->Width());
param->height_scale_ = ScalingFp16(param->out_height_, param->in_height_, align_corners);
param->width_scale_ = ScalingFp16(param->out_width_, param->in_width_, align_corners);
return RET_OK;
}
int ResizeGradCPUKernelFp16::Init() {
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
}
int ResizeGradCPUKernelFp16::Execute(int task_id) {
auto in_addr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
auto out_addr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
auto param = reinterpret_cast<ResizeFp16GradParameter *>(op_parameter_);
if (param == nullptr) {
MS_LOG(ERROR) << "ResizeGradCPUKernelFp16 op_parameter_ is nullptr";
return RET_ERROR;
}
auto batch_size = in_tensors_.at(0)->Batch();
auto channel = in_tensors_.at(0)->Channel();
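// Dispatch on the resize method recorded by the forward op: nearest-neighbor or bilinear.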
if (param->method == static_cast<int>(schema::ResizeMethod_NEAREST)) {
ResizeNearestNeighborFp16Grad(in_addr, out_addr, batch_size, channel, in_tensors_.at(0)->format(), param);
} else {
ResizeBiLinearFp16Grad(in_addr, out_addr, batch_size, channel, in_tensors_.at(0)->format(), param);
}
return RET_OK;
}
int ResizeFp16GradRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
auto resize_grad_kernel = reinterpret_cast<ResizeGradCPUKernelFp16 *>(cdata);
auto error_code = resize_grad_kernel->Execute(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "resize grad error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int ResizeGradCPUKernelFp16::Run() {
auto out_addr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
size_t elem_number = out_tensors_.at(0)->ElementsNum();
std::fill(out_addr, out_addr + elem_number, 0.f);
int error_code =
static_cast<const lite::InnerContext *>(this->context_)->thread_pool_->ParallelLaunch(ResizeFp16GradRun, this, 1);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "ResizeGradCPUKernelFp16 function error error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_ResizeGrad, LiteKernelCreator<ResizeGradCPUKernelFp16>)
} // namespace mindspore::kernel

View File

@ -0,0 +1,38 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_RESIZE_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_RESIZE_FP16_GRAD_H_
#include <vector>
#include "src/inner_kernel.h"
namespace mindspore::kernel {
class ResizeGradCPUKernelFp16 : public InnerKernel {
public:
explicit ResizeGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(parameter, inputs, outputs, ctx) {}
~ResizeGradCPUKernelFp16() override = default;
int Init() override;
int ReSize() override;
int Run() override;
int ExecuteInit(int task_id);
int Execute(int task_id);
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_RESIZE_FP16_GRAD_H_

View File

@ -0,0 +1,154 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16_grad/strided_slice_fp16_grad.h"
#include <vector>
#include <algorithm>
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "nnacl/fp16_grad/strided_slice_grad.h"
#include "src/ops/populate/strided_slice_populate.h"
#include "include/errorcode.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_StridedSliceGrad;
namespace mindspore::kernel {
int StridedSliceGradCPUKernelFp16::Init() {
if (!InferShapeDone()) {
return RET_OK;
}
param_ = reinterpret_cast<StridedSliceParameter *>(op_parameter_);
auto input = in_tensors_.at(0);
MS_ASSERT(input);
switch (input->data_type()) {
case kNumberTypeFloat16:
param_->data_type = kDataTypeFloat16;
break;
default:
MS_LOG(ERROR) << "Not supported data type: " << input->data_type();
return RET_ERROR;
}
FillEmptyDims();
FillOutputDim();
return ReSize();
}
void StridedSliceGradCPUKernelFp16::FillEmptyDims() {
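// Expand the slice description to a full 7-D form so a single kernel handles every input rank.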
int32_t begins[DIMENSION_7D];
int32_t ends[DIMENSION_7D];
int32_t strides[DIMENSION_7D];
int32_t input_shape[DIMENSION_7D];
for (int32_t i = 0; i < param_->num_axes_; ++i) {
begins[i] = param_->begins_[i];
ends[i] = MSMIN(param_->ends_[i], param_->in_shape_[i]);
strides[i] = param_->strides_[i];
input_shape[i] = param_->in_shape_[i];
}
for (int32_t i = param_->num_axes_; i < param_->in_shape_length_; ++i) {
input_shape[i] = param_->in_shape_[i];
begins[i] = 0;
ends[i] = param_->in_shape_[i];
strides[i] = 1;
}
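// Right-align the real axes inside the 7-D arrays and pad the leading axes with size-1 dims.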
int32_t real_index = param_->in_shape_length_ - 1;
for (int32_t i = DIMENSION_7D - 1; i >= 0; --i) {
if (real_index >= 0) {
param_->begins_[i] = begins[real_index];
param_->ends_[i] = ends[real_index];
param_->strides_[i] = strides[real_index];
param_->in_shape_[i] = input_shape[real_index--];
} else {
param_->begins_[i] = 0;
param_->ends_[i] = 1;
param_->strides_[i] = 1;
param_->in_shape_[i] = 1;
}
}
param_->num_axes_ = DIMENSION_7D;
param_->in_shape_length_ = DIMENSION_7D;
for (int32_t i = 0; i < DIMENSION_7D; ++i) {
if (param_->begins_[i] < 0) {
param_->begins_[i] += param_->in_shape_[i];
}
if (param_->ends_[i] < 0) {
param_->ends_[i] += param_->in_shape_[i];
}
}
}
void StridedSliceGradCPUKernelFp16::FillOutputDim() {
auto output = out_tensors_.at(0);
size_t out_size = output->shape().size();
for (size_t i = 0; i < DIMENSION_7D; i++) {
if (i < out_size) {
output_shape_.push_back(output->shape()[i]);
} else {
output_shape_.insert(output_shape_.begin(), 1);
}
}
}
int StridedSliceGradCPUKernelFp16::ReSize() { return RET_OK; }
int StridedSliceFp16GradImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
MS_ASSERT(cdata != nullptr);
auto slice = reinterpret_cast<StridedSliceGradCPUKernelFp16 *>(cdata);
auto error_code = slice->Execute(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "StridedSliceGrad Run error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int StridedSliceGradCPUKernelFp16::Run() {
int error_code = static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(StridedSliceFp16GradImpl, this, 1);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "Strided slice error error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int StridedSliceGradCPUKernelFp16::Execute(int task_id) {
auto input = in_tensors_.at(0);
auto output = out_tensors_.at(0);
MS_ASSERT(output);
int *po = output_shape_.data();
auto dx = reinterpret_cast<float16_t *>(output->data_c());
auto dy = reinterpret_cast<float16_t *>(input->data_c());
std::fill(dx, dx + output->ElementsNum(), 0.f);
auto ret = DoStridedSliceFp16Grad(dy, dx, po, param_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "StridedSliceGrad error error_code[" << ret << "]";
return RET_ERROR;
}
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_StridedSliceGrad, LiteKernelCreator<StridedSliceGradCPUKernelFp16>)
} // namespace mindspore::kernel

View File

@ -0,0 +1,49 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_STRIDED_SLICE_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_STRIDED_SLICE_FP16_GRAD_H_
#include <vector>
#include "nnacl/fp16_grad/strided_slice_grad.h"
#include "src/lite_kernel.h"
namespace mindspore::kernel {
class StridedSliceGradCPUKernelFp16 : public InnerKernel {
public:
StridedSliceGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(parameter, inputs, outputs, ctx) {
param_ = reinterpret_cast<StridedSliceParameter *>(parameter);
}
~StridedSliceGradCPUKernelFp16() override = default;
int Init() override;
int ReSize() override;
int Run() override;
int Execute(int task_id);
private:
void FillEmptyDims();
void FillOutputDim();
void ParseMasks();
StridedSliceParameter *param_;
std::vector<int> output_shape_;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_STRIDED_SLICE_FP16_GRAD_H_

View File

@ -0,0 +1,98 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16_grad/unsorted_segment_sum_fp16.h"
#include <vector>
#include <algorithm>
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "nnacl/fp16_grad/unsorted_segment_sum.h"
#include "include/errorcode.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_UnsortedSegmentSum;
namespace mindspore::kernel {
int UnsortedSegmentSumCPUKernelFp16::Init() {
if (!InferShapeDone()) {
return RET_OK;
}
auto input_shape = in_tensors_.at(0)->shape();
auto segment_ids_shape = in_tensors_.at(1)->shape();
auto output_shape = out_tensors_.at(0)->shape();
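// unit_num_: total input elements; input_dim1_: elements per segment id (inner dims not covered by segment_ids).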
unit_num_ = 1;
input_dim1_ = 1;
for (size_t i = 0; i < input_shape.size(); ++i) {
unit_num_ *= input_shape[i];
if (i >= segment_ids_shape.size()) {
input_dim1_ *= input_shape[i];
}
}
output_dim0_ = output_shape[0];
output_dim1_ = 1;
for (size_t j = 1; j < output_shape.size(); j++) {
output_dim1_ *= output_shape[j];
}
return RET_OK;
}
int UnsortedSegmentSumCPUKernelFp16::ReSize() { return RET_OK; }
int UnsortedSegmentSumFp16Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
MS_ASSERT(cdata != nullptr);
auto kernel = reinterpret_cast<UnsortedSegmentSumCPUKernelFp16 *>(cdata);
auto error_code = kernel->Execute(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "UnsortedSegmentSum Run error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int UnsortedSegmentSumCPUKernelFp16::Run() {
int error_code = static_cast<const lite::InnerContext *>(this->context_)
->thread_pool_->ParallelLaunch(UnsortedSegmentSumFp16Run, this, 1);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "Strided slice error error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
int UnsortedSegmentSumCPUKernelFp16::Execute(int task_id) {
auto input_tensor = in_tensors_.at(0);
auto indices_tensor = in_tensors_.at(1);
auto output_tensor = out_tensors_.at(0);
float16_t *input = reinterpret_cast<float16_t *>(input_tensor->data_c());
int *indices = reinterpret_cast<int *>(indices_tensor->data_c());
float16_t *output = reinterpret_cast<float16_t *>(output_tensor->data_c());
std::fill(output, output + output_tensor->ElementsNum(), 0.f);
auto ret = UnsortedSegmentSumFp16(input, unit_num_, input_dim1_, indices, output, output_dim0_, output_dim1_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "UnsortedSegmentSumFp16 error error_code[" << ret << "]";
return RET_ERROR;
}
return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_UnsortedSegmentSum,
LiteKernelCreator<UnsortedSegmentSumCPUKernelFp16>)
} // namespace mindspore::kernel

View File

@ -0,0 +1,44 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_UNSORTED_SEGMENT_SUM_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_UNSORTED_SEGMENT_SUM_FP16_H_
#include <vector>
#include "src/lite_kernel.h"
namespace mindspore::kernel {
class UnsortedSegmentSumCPUKernelFp16 : public InnerKernel {
public:
UnsortedSegmentSumCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: InnerKernel(parameter, inputs, outputs, ctx) {}
~UnsortedSegmentSumCPUKernelFp16() override = default;
int Init() override;
int ReSize() override;
int Run() override;
int Execute(int task_id);
private:
size_t unit_num_ = 0;
size_t input_dim1_ = 0;
size_t output_dim0_ = 0;
size_t output_dim1_ = 0;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_UNSORTED_SEGMENT_SUM_FP16_H_

View File

@ -58,7 +58,7 @@ TEST_F(TestActGradFp16, ReluGradFp16) {
input_buf[i] = (float16_t)input_data[i];
}
Fp16ReluGrad(yt_buf, input_buf, 50, output_buf);
ReluFp16Grad(yt_buf, input_buf, 50, output_buf);
int res = 0;
float error = 0;
@ -113,7 +113,7 @@ TEST_F(TestActGradFp16, SigmoidGradFp16) {
input_buf[i] = (float16_t)input_data[i];
}
Fp16SigmoidGrad(yt_buf, input_buf, 50, output_buf);
SigmoidFp16Grad(yt_buf, input_buf, 50, output_buf);
int res = 0;
float error = 0;