forked from mindspore-Ecosystem/mindspore

commit 394a8398e2 ("float16 kernels")
parent 346a35ba7d
nnacl/fp16_grad/activation_grad.c
@@ -14,21 +14,26 @@
  * limitations under the License.
  */
 
-#include <math.h>
-#include "nnacl/op_base.h"
 #include "nnacl/fp16_grad/activation_grad.h"
+#include <math.h>
+#include <float.h>
+#ifdef ENABLE_NEON
+#include <arm_neon.h>
+#include "nnacl/fp32/exp_fp32.h"
+#endif
+#include "nnacl/op_base.h"
 #include "nnacl/errorcode.h"
 
-int Fp16ReluGrad(const float16_t *src0, const float16_t *src1, size_t length, float16_t *dst) {
+int ReluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) {
   int i = 0;
 #ifdef ENABLE_NEON
-  float16x8_t zero_4 = vdupq_n_f16(0);
-  for (; i < length - 4; i += 4) {
-    float16x8_t src0_4 = vld1q_f16(src0 + i);
-    float16x8_t src1_4 = vld1q_f16(src1 + i);
-    uint16x8_t mask_4 = vcgtq_f16(src1_4, zero_4);
-    float16x8_t dst_4 = vbslq_f16(mask_4, src0_4, zero_4);
-    vst1q_f16(dst + i, dst_4);
+  float16x8_t zero_v = vdupq_n_f16(0);
+  for (; i <= length - C8NUM; i += C8NUM) {
+    float16x8_t src0_v = vld1q_f16(src0 + i);
+    float16x8_t src1_v = vld1q_f16(src1 + i);
+    uint16x8_t mask_v = vcgtq_f16(src1_v, zero_v);
+    float16x8_t dst_v = vbslq_f16(mask_v, src0_v, zero_v);
+    vst1q_f16(dst + i, dst_v);
   }
 #endif
   for (; i < length; i++) {
@@ -37,15 +42,43 @@ int Fp16ReluGrad(const float16_t *src0, const float16_t *src1, size_t length, fl
   return NNACL_OK;
 }
 
-int Fp16SigmoidGrad(const float16_t *src0, const float16_t *src1, size_t length, float16_t *dst) {
+int Relu6Fp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) {
   int i = 0;
 #ifdef ENABLE_NEON
-  float16x8_t one_4 = vdupq_n_f16(1);
-  for (; i < length - 4; i += 4) {
-    float16x8_t src0_4 = vld1q_f16(src0 + i);
-    float16x8_t src1_4 = vld1q_f16(src1 + i);
-    float16x8_t dst_4 = vmulq_f16(src0_4, vmulq_f16(src1_4, vsubq_f16(one_4, src1_4)));
-    vst1q_f16(dst + i, dst_4);
+  float16x8_t zero_8 = vdupq_n_f16(0);
+  float16x8_t six_8 = vdupq_n_f16(6);
+  for (; i < length - 8; i += 8) {
+    float16x8_t src1_8 = vld1q_f16(src1 + i);
+    float16x8_t src0_8 = vld1q_f16(src0 + i);
+    float16x8_t max_8 = vmaxq_f16(src1_8, zero_8);
+    float16x8_t min_max_8 = vminq_f16(max_8, six_8);
+    uint16x8_t mask_8 = vceqq_f16(min_max_8, src1_8);
+    float16x8_t dst_8 = vbslq_f16(mask_8, src0_8, zero_8);
+    vst1q_f16(dst + i, dst_8);
   }
 #endif
+  for (; i < length; ++i) {
+    dst[i] = (src1[i] > 0.0f && src1[i] <= 6.0f) ? src0[i] : 0.0f;
+  }
+  return NNACL_OK;
+}
+
+int LReluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst, float16_t alpha) {
+  for (int i = 0; i < length; ++i) {
+    // leaky relu: pass the gradient through for positive activations, scale by alpha otherwise
+    dst[i] = src1[i] > 0.0f ? src0[i] : alpha * src0[i];
+  }
+  return NNACL_OK;
+}
+
+int SigmoidFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) {
+  int i = 0;
+#ifdef ENABLE_NEON
+  float16x8_t one_8 = vdupq_n_f16(1);
+  for (; i < length - 8; i += 8) {
+    float16x8_t src0_8 = vld1q_f16(src0 + i);
+    float16x8_t src1_8 = vld1q_f16(src1 + i);
+    float16x8_t dst_8 = vmulq_f16(src0_8, vmulq_f16(src1_8, vsubq_f16(one_8, src1_8)));
+    vst1q_f16(dst + i, dst_8);
+  }
+#endif
   for (; i < length; i++) {
@@ -53,3 +86,56 @@ int Fp16SigmoidGrad(const float16_t *src0, const float16_t *src1, size_t length,
   }
   return NNACL_OK;
 }
+
+int TanhFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) {
+  for (int i = 0; i < length; ++i) {
+    dst[i] = (float16_t)((1.0f - ((float)src1[i] * (float)src1[i])) * (float)src0[i]);
+  }
+  return NNACL_OK;
+}
+
+int HSwishFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) {
+  for (int i = 0; i < length; ++i) {
+    float16_t tmp = (src1[i] > 3.0f ? 1.0f : (src1[i] < -3.0f ? 0.0f : (2.0f * src1[i] + 3.0f) / 6.0f));
+    dst[i] = tmp * src0[i];
+  }
+  return NNACL_OK;
+}
+
+int HSigmoidFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) {
+  for (int i = 0; i < length; ++i) {
+    float16_t tmp = (src1[i] > 3.0f ? 0.0f : (src1[i] < -3.0f ? 0.0f : 1.0f / 6.0f));
+    dst[i] = tmp * src0[i];
+  }
+  return NNACL_OK;
+}
+
+int EluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst, float16_t alpha) {
+  int i = 0;
+#ifdef ENABLE_NEON
+  float16x4_t zero_4 = vdup_n_f16(0);
+  float16x4_t one_4 = vdup_n_f16(1);
+  float16x4_t alpha_4 = vdup_n_f16(alpha);
+  for (; i < length - 4; i += 4) {
+    float16x4_t src0_4 = vld1_f16(src0 + i);
+    float16x4_t src1_4 = vld1_f16(src1 + i);
+    uint16x4_t mask_4 = vcgt_f16(src1_4, zero_4);
+    float32x4_t tmp;
+    simd_exp(vcvt_f32_f16(src1_4), (float *)&tmp);
+    float16x4_t expm1_4 = vsub_f16(vcvt_f16_f32(tmp), one_4);
+    float16x4_t dst_4 = vbsl_f16(mask_4, src0_4, vmul_f16(alpha_4, vmul_f16(expm1_4, src0_4)));
+    vst1_f16(dst + i, dst_4);
+  }
+#endif
+  for (; i < length; ++i) {
+    dst[i] = (src1[i] > 0.0f ? src0[i] : alpha * expm1(src1[i]) * src0[i]);
+  }
+  return NNACL_OK;
+}
+
+int GeluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst) {
+  for (int i = 0; i < length; ++i) {
+    dst[i] = src0[i] * ((0.5 * (1.0 + erf(src1[i] / 1.4142135623730951))) +
+                        (src1[i] * exp(-0.5 * src1[i] * src1[i]) / 2.5066282746));
+  }
+  return NNACL_OK;
+}
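For reference, the NEON path of ReluFp16Grad masks the incoming gradient by the sign of the forward output, and the scalar tail (not shown in the hunk) does the same element-wise. A minimal sketch of a harness that drives it — a hypothetical standalone file, assuming an aarch64 toolchain where float16_t is available and the nnacl headers are on the include path:

#include <stdio.h>
#include "nnacl/fp16_grad/activation_grad.h"

int main(void) {
  float16_t y[5] = {-2, -1, 0, 1, 2};  /* forward outputs (src1) */
  float16_t dy[5] = {1, 1, 1, 1, 1};   /* incoming gradients (src0) */
  float16_t dx[5] = {0};
  ReluFp16Grad(dy, y, 5, dx);          /* dx[i] = y[i] > 0 ? dy[i] : 0 */
  for (int i = 0; i < 5; ++i) {
    printf("%.1f ", (float)dx[i]);     /* expected: 0.0 0.0 0.0 1.0 1.0 */
  }
  printf("\n");
  return 0;
}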
nnacl/fp16_grad/activation_grad.h
@@ -13,8 +13,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifndef MINDSPORE_NNACL_FP16_GRAD_ACTIVATION_GRAD_H_
-#define MINDSPORE_NNACL_FP16_GRAD_ACTIVATION_GRAD_H_
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_ACTIVATION_GRAD_H_
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_ACTIVATION_GRAD_H_
 
 #ifdef ENABLE_NEON
 #include <arm_neon.h>
@@ -23,20 +23,22 @@
 #include "nnacl/op_base.h"
 #include "nnacl/int8/fixed_point.h"
 
-typedef struct ActivationGradParameterFp16 {
-  OpParameter op_parameter;
-  int type_;
-  float alpha_;
-} ActivationGradParameterFp16;
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-int Fp16ReluGrad(const float16_t *src0, const float16_t *src1, size_t length, float16_t *dst);
-int Fp16SigmoidGrad(const float16_t *src0, const float16_t *src1, size_t length, float16_t *dst);
+int ReluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst);
+int Relu6Fp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst);
+int LReluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst, float16_t alpha);
+int SigmoidFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst);
+int TanhFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst);
+int HSwishFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst);
+int HSigmoidFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst);
+int EluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst, float16_t alpha);
+int GeluFp16Grad(const float16_t *src0, const float16_t *src1, int length, float16_t *dst);
 
 #ifdef __cplusplus
 }
 #endif
 
-#endif  // MINDSPORE_NNACL_FP16_GRAD_ACTIVATION_GRAD_H_
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_ACTIVATION_GRAD_H_
nnacl/fp16_grad/arithmetic_grad.c (new file)
@@ -0,0 +1,157 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "nnacl/fp16_grad/arithmetic_grad.h"
#include <string.h>
#include <math.h>
#include "nnacl/fp32_grad/utils.h"
#include "nnacl/errorcode.h"

void ElementDivNegSquareFp16(const float16_t *nom, const float16_t *denom, float16_t *output, int element_size) {
  for (int i = 0; i < element_size; i++) {
    output[i] = -nom[i] / (denom[i] * denom[i]);
  }
}

void ElementMulAndDivNegSquareFp16(const float16_t *a, const float16_t *b, const float16_t *denom, float16_t *output,
                                   int element_size) {
  for (int i = 0; i < element_size; i++) {
    output[i] = -a[i] * b[i] / (denom[i] * denom[i]);
  }
}

int ElementAbsGradFp16(const float16_t *in1, const float16_t *in2, float16_t *out, int element_size) {
  for (int i = 0; i < element_size; i++) {
    out[i] = (in1[i] < 0.f) ? -in2[i] : ((in1[i] > 0.f) ? in2[i] : 0);
  }
  return NNACL_OK;
}

void MaximumByAxesFp16(const float16_t *input0, const float16_t *input1, const float16_t *dy, const int *input0_dims,
                       const int *input1_dims, const int *dy_dims, float16_t *output0, float16_t *output1,
                       int num_dims) {
  int num_output0 = 1;
  int num_output1 = 1;
  bool same_shape = true;
  for (int idx = 0; idx < num_dims; ++idx) {
    num_output0 *= input0_dims[idx];
    num_output1 *= input1_dims[idx];
    if (input0_dims[idx] != input1_dims[idx]) {
      same_shape = false;
    }
  }

  if (same_shape) {
    int input_iter[8] = {0};

    // Iterate through input_data.
    do {
      size_t offset = GetInputOffset(num_dims, input0_dims, input_iter);
      output0[offset] = input0[offset] > input1[offset] ? dy[offset] : 0.;
      output1[offset] = input1[offset] >= input0[offset] ? dy[offset] : 0.;
    } while (NextIndex(num_dims, input0_dims, input_iter));
  } else {
    memset(output0, 0, num_output0 * sizeof(float16_t));  // zero output
    memset(output1, 0, num_output1 * sizeof(float16_t));  // zero output

    int input_iter[8] = {0};
    int axes0[5] = {0};
    int axes1[5] = {0};
    int num_axes0 = 0;
    int num_axes1 = 0;
    for (int i = 0; i < num_dims; i++) {
      if (input0_dims[i] == 1) {
        axes0[num_axes0++] = i;
      }
      if (input1_dims[i] == 1) {
        axes1[num_axes1++] = i;
      }
    }

    do {
      size_t offset0 = GetOutputOffset(num_dims, input0_dims, input_iter, num_axes0, axes0);
      size_t offset1 = GetOutputOffset(num_dims, input1_dims, input_iter, num_axes1, axes1);
      size_t yt_offset = GetInputOffset(num_dims, input0_dims, input_iter);
      output0[offset0] += input0[offset0] > input1[offset1] ? dy[yt_offset] : 0.;
      output1[offset1] += input1[offset1] >= input0[offset0] ? dy[yt_offset] : 0.;
    } while (NextIndex(num_dims, dy_dims, input_iter));
  }
}

void MinimumByAxesFp16(const float16_t *input0, const float16_t *input1, const float16_t *dy, const int *input0_dims,
                       const int *input1_dims, const int *dy_dims, float16_t *output0, float16_t *output1,
                       int num_dims) {
  int num_output0 = 1;
  int num_output1 = 1;
  bool same_shape = true;
  for (int idx = 0; idx < num_dims; ++idx) {
    num_output0 *= input0_dims[idx];
    num_output1 *= input1_dims[idx];
    if (input0_dims[idx] != input1_dims[idx]) {
      same_shape = false;
    }
  }

  if (same_shape) {
    int input_iter[8] = {0};

    // Iterate through input_data.
    do {
      size_t offset = GetInputOffset(num_dims, input0_dims, input_iter);
      output0[offset] = input0[offset] < input1[offset] ? dy[offset] : 0.;
      output1[offset] = input1[offset] <= input0[offset] ? dy[offset] : 0.;
    } while (NextIndex(num_dims, input0_dims, input_iter));
  } else {
    memset(output0, 0, num_output0 * sizeof(float16_t));  // zero output
    memset(output1, 0, num_output1 * sizeof(float16_t));  // zero output

    int input_iter[8] = {0};
    int axes0[5] = {0};
    int axes1[5] = {0};
    int num_axes0 = 0;
    int num_axes1 = 0;
    for (int i = 0; i < num_dims; i++) {
      if (input0_dims[i] == 1) {
        axes0[num_axes0++] = i;
      }
      if (input1_dims[i] == 1) {
        axes1[num_axes1++] = i;
      }
    }

    do {
      size_t offset0 = GetOutputOffset(num_dims, input0_dims, input_iter, num_axes0, axes0);
      size_t offset1 = GetOutputOffset(num_dims, input1_dims, input_iter, num_axes1, axes1);
      size_t yt_offset = GetInputOffset(num_dims, input0_dims, input_iter);
      output0[offset0] += input0[offset0] < input1[offset1] ? dy[yt_offset] : 0.;
      output1[offset1] += input1[offset1] <= input0[offset0] ? dy[yt_offset] : 0.;
    } while (NextIndex(num_dims, dy_dims, input_iter));
  }
}

int ElementSqrtGradFp16(const float16_t *in1, const float16_t *in2, float16_t *out, const int element_size) {
  for (int i = 0; i < element_size; i++) {
    out[i] = 0.5f * in2[i] / in1[i];
  }
  return NNACL_OK;
}

int ElementRsqrtGradFp16(const float16_t *in1, const float16_t *in2, float16_t *out, const int element_size) {
  for (int i = 0; i < element_size; i++) {
    // in1 holds rsqrt(x) = x^-1/2, so the gradient is -0.5 * x^-3/2 * in2 = -0.5 * in2 * in1^3
    out[i] = -0.5f * in2[i] * in1[i] * in1[i] * in1[i];
  }
  return NNACL_OK;
}
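ElementDivNegSquareFp16 is the quotient-rule term for the denominator of a division, since d(a/b)/db = -a/b². A hypothetical sanity check comparing it against a central finite difference (assumes the same toolchain and include path as above):

#include <stdio.h>
#include "nnacl/fp16_grad/arithmetic_grad.h"

int main(void) {
  float16_t a = 3.0f, b = 2.0f, g = 0.0f;
  ElementDivNegSquareFp16(&a, &b, &g, 1);  /* analytic: -a / b^2 = -0.75 */
  float eps = 1e-2f;
  float fd = (3.0f / (2.0f + eps) - 3.0f / (2.0f - eps)) / (2.0f * eps);
  printf("analytic=%f finite-diff=%f\n", (float)g, fd);
  return 0;
}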
nnacl/fp16_grad/arithmetic_grad.h (new file)
@@ -0,0 +1,41 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_ARITHMETIC_GRAD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_ARITHMETIC_GRAD_H_

#include "nnacl/op_base.h"

#ifdef __cplusplus
extern "C" {
#endif
void ElementDivNegSquareFp16(const float16_t *nom, const float16_t *denom, float16_t *output, int element_size);
void ElementMulAndDivNegSquareFp16(const float16_t *a, const float16_t *b, const float16_t *denom, float16_t *output,
                                   int element_size);
int ElementAbsGradFp16(const float16_t *in1, const float16_t *in2, float16_t *out, int element_size);
void MaximumByAxesFp16(const float16_t *input0, const float16_t *input1, const float16_t *dy, const int *input0_dims,
                       const int *input1_dims, const int *dy_dims, float16_t *output0, float16_t *output1,
                       int num_dims);
void MinimumByAxesFp16(const float16_t *input0, const float16_t *input1, const float16_t *dy, const int *input0_dims,
                       const int *input1_dims, const int *dy_dims, float16_t *output0, float16_t *output1,
                       int num_dims);
int ElementSqrtGradFp16(const float16_t *in1, const float16_t *in2, float16_t *out, const int element_size);
int ElementRsqrtGradFp16(const float16_t *in1, const float16_t *in2, float16_t *out, const int element_size);

#ifdef __cplusplus
}
#endif

#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_ARITHMETIC_GRAD_H_
nnacl/fp16_grad/batch_norm.c (new file)
@@ -0,0 +1,86 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <math.h>
#include <string.h>
#include "nnacl/fp16_grad/batch_norm.h"

void var2InvarFp16(float16_t *save_var, int size, float eps) {
  for (int i = 0; i < size; i++) {
    save_var[i] = (float16_t)(1.0f / sqrtf((float)save_var[i] + eps));
  }
}

void backwardAllFp16(const float16_t *restrict in, const float16_t *restrict yt, const float16_t *restrict mean,
                     const float16_t *restrict invar, const float16_t *restrict scale, int size, int ch,
                     float *restrict dxhat_sum, float *restrict dxhathat_sum, float16_t *restrict dbias,
                     float16_t *restrict dscale, float16_t *restrict dx) {
  float16_t N = (float16_t)size;
  for (int i = 0; i < size; i++) {
    for (int c = 0; c < ch; c++) {
      int ix = i * ch + c;
      dbias[c] += yt[ix];
      // dscale
      float16_t x_hat = (in[ix] - mean[c]) * invar[c];
      dscale[c] += (yt[ix] * x_hat);
      // dx_1
      float dx_hat = (float)(yt[ix] * scale[c]);
      dxhat_sum[c] += dx_hat;
      dxhathat_sum[c] += (float)(dx_hat * x_hat);
    }
  }
  for (int i = 0; i < size; i++) {
    for (int c = 0; c < ch; c++) {
      // dx_2
      int ix = i * ch + c;
      float16_t x_hat = (in[ix] - mean[c]) * invar[c];
      float16_t dx_hat = yt[ix] * scale[c];
      dx[ix] = 1.0f / N * (float)((invar[c]) * (N * dx_hat - dxhat_sum[c] - x_hat * dxhathat_sum[c]));
    }
  }
}

void backwardP1Fp16(const float16_t *restrict in, const float16_t *restrict yt, const float16_t *restrict mean,
                    const float16_t *restrict invar, const float16_t *restrict scale, int size, int ch,
                    float *restrict dxhat_sum, float *restrict dxhathat_sum, float16_t *restrict dbias,
                    float16_t *restrict dscale) {
  for (int i = 0; i < size; i++) {
    for (int c = 0; c < ch; c++) {
      int ix = i * ch + c;
      dbias[c] += yt[ix];
      // dscale
      float x_hat = (float)((in[ix] - mean[c]) * invar[c]);
      dscale[c] += (yt[ix] * x_hat);
      // dx_1
      float dx_hat = (float)(yt[ix] * scale[c]);
      dxhat_sum[c] += dx_hat;
      dxhathat_sum[c] += dx_hat * x_hat;
    }
  }
}

void backwardP2Fp16(const float16_t *restrict in, const float16_t *restrict yt, const float16_t *restrict mean,
                    const float16_t *restrict invar, const float16_t *restrict scale, int size, int total_size, int ch,
                    const float *dxhat_sum, const float *dxhathat_sum, float16_t *restrict dx) {
  const float N = (float)total_size;
  for (int i = 0; i < size; i++) {
    for (int c = 0; c < ch; c++) {
      // dx_2
      int ix = i * ch + c;
      float x_hat = (float)((in[ix] - mean[c]) * invar[c]);
      float dx_hat = (float)(yt[ix] * scale[c]);
      dx[ix] = (float16_t)(1.0f / N * (float)(invar[c]) * (N * dx_hat - dxhat_sum[c] - x_hat * dxhathat_sum[c]));
    }
  }
}
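The dx computed by backwardAllFp16 and backwardP2Fp16 is the standard batch-norm input gradient, with dxhat_sum and dxhathat_sum the per-channel reductions accumulated in the first pass; up to fp16 rounding the loop body matches

\frac{\partial L}{\partial x_i}
  = \frac{1}{N}\,\hat\sigma^{-1}\Bigl(N\,\frac{\partial L}{\partial \hat x_i}
    - \sum_j \frac{\partial L}{\partial \hat x_j}
    - \hat x_i \sum_j \frac{\partial L}{\partial \hat x_j}\,\hat x_j\Bigr),
\qquad \hat x_i = (x_i - \mu)\,\hat\sigma^{-1},

where \hat\sigma^{-1} is the invar array produced by var2InvarFp16.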
nnacl/fp16_grad/batch_norm.h (new file)
@@ -0,0 +1,40 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_BATCH_NORM_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_BATCH_NORM_H_

#include "nnacl/op_base.h"

#ifdef __cplusplus
extern "C" {
#endif

void var2InvarFp16(float16_t *save_var, int size, float eps);
void backwardAllFp16(const float16_t *in, const float16_t *yt, const float16_t *mean, const float16_t *invar,
                     const float16_t *scale, int size, int ch, float *dxhat_sum, float *dxhathat_sum, float16_t *dbias,
                     float16_t *dscale, float16_t *dx);
void backwardP1Fp16(const float16_t *in, const float16_t *yt, const float16_t *mean, const float16_t *invar,
                    const float16_t *scale, int size, int ch, float *dxhat_sum, float *dxhathat_sum, float16_t *dbias,
                    float16_t *dscale);
void backwardP2Fp16(const float16_t *in, const float16_t *yt, const float16_t *mean, const float16_t *invar,
                    const float16_t *scale, int size, int total_size, int ch, const float *dxhat_sum,
                    const float *dxhathat_sum, float16_t *dx);
#ifdef __cplusplus
}
#endif

#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_BATCH_NORM_H_
nnacl/fp16_grad/convolution_grad_filter.c (new file)
@@ -0,0 +1,360 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "nnacl/fp16_grad/convolution_grad_filter.h"
#include "nnacl/intrinsics/ms_simd_instructions_fp16.h"
#ifdef ENABLE_NEON
#include <arm_neon.h>
#endif

#ifdef ENABLE_NEON

static int FilterGrad32Arm(const float16_t *x, const float16_t *dy, int i_c, int k_idx, float16_t *dw,
                           const ConvParameter *conv_param) {
  int in_h = conv_param->input_h_;
  int in_w = conv_param->input_w_;
  int k_h = conv_param->kernel_h_;
  int k_w = conv_param->kernel_w_;
  int batch = conv_param->output_batch_;
  int out_ch = conv_param->output_channel_;
  int in_ch = conv_param->input_channel_;
  int out_h = conv_param->output_h_;
  int out_w = conv_param->output_w_;

  int m = out_h * out_w;
  int x_size = in_h * in_w * in_ch;
  int y_size = out_ch * out_h * out_w;
  int k_spatial = k_w * k_h;
  int i_kh = k_idx / k_w;
  int i_kw = k_idx % k_w;
  for (; i_c < (out_ch & ~31); i_c += 32) {
    float32x4_t sum_0 = vdupq_n_f32(0.0f);
    float32x4_t sum_1 = vdupq_n_f32(0.0f);
    float32x4_t sum_2 = vdupq_n_f32(0.0f);
    float32x4_t sum_3 = vdupq_n_f32(0.0f);
    float32x4_t sum_4 = vdupq_n_f32(0.0f);
    float32x4_t sum_5 = vdupq_n_f32(0.0f);
    float32x4_t sum_6 = vdupq_n_f32(0.0f);
    float32x4_t sum_7 = vdupq_n_f32(0.0f);

    for (int b = 0; b < batch; ++b) {
      const float16_t *x_addr = &x[b * x_size];
      const float16_t *dy_addr = &dy[b * y_size];
      for (int i = 0; i < m; i++) {
        int idx = i;
        int input_h = idx / out_w * conv_param->stride_h_;
        int input_w = idx % out_w * conv_param->stride_w_;
        int input_row = -conv_param->pad_u_ + i_kh + input_h;
        int input_col = -conv_param->pad_l_ + i_kw + input_w;
        if (((unsigned)(input_row) < (unsigned)(in_h)) && ((unsigned)(input_col) < (unsigned)(in_w))) {
          int offset_x = (input_row * in_w + input_col) * out_ch + i_c;
          int offset_dy = idx * out_ch + i_c;

          float16x8_t x_0 = vld1q_f16(x_addr + offset_x);
          float16x8_t dy_0 = vld1q_f16(dy_addr + offset_dy);
          sum_0 = MS_VMLAL_F16(vget_low_f16(x_0), vget_low_f16(dy_0), sum_0);
          sum_1 = MS_VMLAL_F16(vget_high_f16(x_0), vget_high_f16(dy_0), sum_1);

          float16x8_t x_1 = vld1q_f16(x_addr + offset_x + 8);
          float16x8_t dy_1 = vld1q_f16(dy_addr + offset_dy + 8);
          sum_2 = MS_VMLAL_F16(vget_low_f16(x_1), vget_low_f16(dy_1), sum_2);
          sum_3 = MS_VMLAL_F16(vget_high_f16(x_1), vget_high_f16(dy_1), sum_3);

          float16x8_t x_2 = vld1q_f16(x_addr + offset_x + 16);
          float16x8_t dy_2 = vld1q_f16(dy_addr + offset_dy + 16);
          sum_4 = MS_VMLAL_F16(vget_low_f16(x_2), vget_low_f16(dy_2), sum_4);
          sum_5 = MS_VMLAL_F16(vget_high_f16(x_2), vget_high_f16(dy_2), sum_5);

          float16x8_t x_3 = vld1q_f16(x_addr + offset_x + 24);
          float16x8_t dy_3 = vld1q_f16(dy_addr + offset_dy + 24);
          sum_6 = MS_VMLAL_F16(vget_low_f16(x_3), vget_low_f16(dy_3), sum_6);
          sum_7 = MS_VMLAL_F16(vget_high_f16(x_3), vget_high_f16(dy_3), sum_7);
        }
      }
    }
    // store into memory
    for (int l = 0; l < 4; l++) {
      dw[(i_c + l) * k_spatial + k_idx] = sum_0[l];
      dw[(i_c + 4 + l) * k_spatial + k_idx] = sum_1[l];
      dw[(i_c + 8 + l) * k_spatial + k_idx] = sum_2[l];
      dw[(i_c + 12 + l) * k_spatial + k_idx] = sum_3[l];
      dw[(i_c + 16 + l) * k_spatial + k_idx] = sum_4[l];
      dw[(i_c + 20 + l) * k_spatial + k_idx] = sum_5[l];
      dw[(i_c + 24 + l) * k_spatial + k_idx] = sum_6[l];
      dw[(i_c + 28 + l) * k_spatial + k_idx] = sum_7[l];
    }
  }
  return i_c;
}

static int FilterGrad16Arm(const float16_t *x, const float16_t *dy, int i_c, int k_idx, float16_t *dw,
                           const ConvParameter *conv_param) {
  int in_h = conv_param->input_h_;
  int in_w = conv_param->input_w_;
  int k_h = conv_param->kernel_h_;
  int k_w = conv_param->kernel_w_;
  int batch = conv_param->output_batch_;
  int out_ch = conv_param->output_channel_;
  int in_ch = conv_param->input_channel_;
  int out_h = conv_param->output_h_;
  int out_w = conv_param->output_w_;

  int m = out_h * out_w;
  int x_size = in_h * in_w * in_ch;
  int y_size = out_ch * out_h * out_w;
  int k_spatial = k_w * k_h;
  int i_kh = k_idx / k_w;
  int i_kw = k_idx % k_w;
  for (; i_c < (out_ch & ~15); i_c += 16) {
    float32x4_t sum_0 = vdupq_n_f32(0.0f);
    float32x4_t sum_1 = vdupq_n_f32(0.0f);
    float32x4_t sum_2 = vdupq_n_f32(0.0f);
    float32x4_t sum_3 = vdupq_n_f32(0.0f);
    for (int b = 0; b < batch; ++b) {
      const float16_t *x_addr = &x[b * x_size];
      const float16_t *dy_addr = &dy[b * y_size];
      for (int i = 0; i < m; i++) {
        int idx = i;
        int input_h = idx / out_w * conv_param->stride_h_;
        int input_w = idx % out_w * conv_param->stride_w_;
        int input_row = -conv_param->pad_u_ + i_kh + input_h;
        int input_col = -conv_param->pad_l_ + i_kw + input_w;
        if (((unsigned)(input_row) < (unsigned)(in_h)) && ((unsigned)(input_col) < (unsigned)(in_w))) {
          int offset_x = (input_row * in_w + input_col) * out_ch + i_c;
          int offset_dy = idx * out_ch + i_c;

          float16x8_t x_0 = vld1q_f16(x_addr + offset_x);
          float16x8_t dy_0 = vld1q_f16(dy_addr + offset_dy);
          sum_0 = MS_VMLAL_F16(vget_low_f16(x_0), vget_low_f16(dy_0), sum_0);
          sum_1 = MS_VMLAL_F16(vget_high_f16(x_0), vget_high_f16(dy_0), sum_1);

          float16x8_t x_1 = vld1q_f16(x_addr + offset_x + 8);
          float16x8_t dy_1 = vld1q_f16(dy_addr + offset_dy + 8);
          sum_2 = MS_VMLAL_F16(vget_low_f16(x_1), vget_low_f16(dy_1), sum_2);
          sum_3 = MS_VMLAL_F16(vget_high_f16(x_1), vget_high_f16(dy_1), sum_3);
        }
      }
    }
    for (int l = 0; l < 4; l++) {
      dw[(i_c + l) * k_spatial + k_idx] = sum_0[l];
      dw[(i_c + l + 4) * k_spatial + k_idx] = sum_1[l];
      dw[(i_c + l + 8) * k_spatial + k_idx] = sum_2[l];
      dw[(i_c + l + 12) * k_spatial + k_idx] = sum_3[l];
    }
  }
  return i_c;
}

static int FilterGrad8Arm(const float16_t *x, const float16_t *dy, int i_c, int k_idx, float16_t *dw,
                          const ConvParameter *conv_param) {
  int in_h = conv_param->input_h_;
  int in_w = conv_param->input_w_;
  int k_h = conv_param->kernel_h_;
  int k_w = conv_param->kernel_w_;
  int batch = conv_param->output_batch_;
  int out_ch = conv_param->output_channel_;
  int in_ch = conv_param->input_channel_;
  int out_h = conv_param->output_h_;
  int out_w = conv_param->output_w_;

  int m = out_h * out_w;
  int x_size = in_h * in_w * in_ch;
  int y_size = out_ch * out_h * out_w;
  int k_spatial = k_w * k_h;
  int i_kh = k_idx / k_w;
  int i_kw = k_idx % k_w;
  for (; i_c < (out_ch & ~7); i_c += 8) {
    float32x4_t sum_0 = vdupq_n_f32(0.0f);
    float32x4_t sum_1 = vdupq_n_f32(0.0f);

    for (int b = 0; b < batch; ++b) {
      const float16_t *x_addr = &x[b * x_size];
      const float16_t *dy_addr = &dy[b * y_size];
      for (int i = 0; i < m; i++) {
        int idx = i;
        int input_h = idx / out_w * conv_param->stride_h_;
        int input_w = idx % out_w * conv_param->stride_w_;
        int input_row = -conv_param->pad_u_ + i_kh + input_h;
        int input_col = -conv_param->pad_l_ + i_kw + input_w;
        if (((unsigned)(input_row) < (unsigned)(in_h)) && ((unsigned)(input_col) < (unsigned)(in_w))) {
          int offset_x = (input_row * in_w + input_col) * out_ch + i_c;
          int offset_dy = idx * out_ch + i_c;
          float16x8_t x_0 = vld1q_f16(x_addr + offset_x);
          float16x8_t dy_0 = vld1q_f16(dy_addr + offset_dy);
          sum_0 = MS_VMLAL_F16(vget_low_f16(x_0), vget_low_f16(dy_0), sum_0);
          sum_1 = MS_VMLAL_F16(vget_high_f16(x_0), vget_high_f16(dy_0), sum_1);
        }
      }
    }
    for (int l = 0; l < 4; l++) {
      dw[(i_c + l) * k_spatial + k_idx] = sum_0[l];
      dw[(i_c + 4 + l) * k_spatial + k_idx] = sum_1[l];
    }
  }
  return i_c;
}

static int FilterGrad4Arm(const float16_t *x, const float16_t *dy, int i_c, int k_idx, float16_t *dw,
                          const ConvParameter *conv_param) {
  int in_h = conv_param->input_h_;
  int in_w = conv_param->input_w_;
  int k_h = conv_param->kernel_h_;
  int k_w = conv_param->kernel_w_;
  int batch = conv_param->output_batch_;
  int out_ch = conv_param->output_channel_;
  int in_ch = conv_param->input_channel_;
  int out_h = conv_param->output_h_;
  int out_w = conv_param->output_w_;

  int m = out_h * out_w;
  int x_size = in_h * in_w * in_ch;
  int y_size = out_ch * out_h * out_w;
  int k_spatial = k_w * k_h;
  int i_kh = k_idx / k_w;
  int i_kw = k_idx % k_w;
  for (; i_c < (out_ch & ~3); i_c += 4) {
    float32x4_t sum_0 = vdupq_n_f32(0.0f);

    for (int b = 0; b < batch; ++b) {
      const float16_t *x_addr = &x[b * x_size];
      const float16_t *dy_addr = &dy[b * y_size];

      for (int i = 0; i < m; i++) {
        int idx = i;
        int input_h = idx / out_w * conv_param->stride_h_;
        int input_w = idx % out_w * conv_param->stride_w_;
        int input_row = -conv_param->pad_u_ + i_kh + input_h;
        int input_col = -conv_param->pad_l_ + i_kw + input_w;
        if (((unsigned)(input_row) < (unsigned)(in_h)) && ((unsigned)(input_col) < (unsigned)(in_w))) {
          int offset_x = (input_row * in_w + input_col) * out_ch + i_c;
          int offset_dy = idx * out_ch + i_c;
          float16x4_t x_0 = vld1_f16(x_addr + offset_x);
          float16x4_t dy_0 = vld1_f16(dy_addr + offset_dy);
          sum_0 = MS_VMLAL_F16(x_0, dy_0, sum_0);
        }
      }
    }
    dw[(i_c + 0) * k_spatial + k_idx] = sum_0[0];
    dw[(i_c + 1) * k_spatial + k_idx] = sum_0[1];
    dw[(i_c + 2) * k_spatial + k_idx] = sum_0[2];
    dw[(i_c + 3) * k_spatial + k_idx] = sum_0[3];
  }
  return i_c;
}

static int FilterGradLeftoverArm(const float16_t *x, const float16_t *dy, int i_c, int k_idx, float16_t *dw,
                                 const ConvParameter *conv_param) {
  int in_h = conv_param->input_h_;
  int in_w = conv_param->input_w_;
  int k_h = conv_param->kernel_h_;
  int k_w = conv_param->kernel_w_;
  int batch = conv_param->output_batch_;
  int out_ch = conv_param->output_channel_;
  int in_ch = conv_param->input_channel_;
  int out_h = conv_param->output_h_;
  int out_w = conv_param->output_w_;

  int m = out_h * out_w;
  int x_size = in_h * in_w * in_ch;
  int y_size = out_ch * out_h * out_w;
  int k_spatial = k_w * k_h;
  int i_kh = k_idx / k_w;
  int i_kw = k_idx % k_w;
  int leftover = out_ch - i_c;
  if (leftover > 0) {
    float32x4_t sum_0 = vdupq_n_f32(0.0f);

    for (int b = 0; b < batch; ++b) {
      const float16_t *x_addr = &x[b * x_size];
      const float16_t *dy_addr = &dy[b * y_size];

      for (int i = 0; i < m; i++) {
        int idx = i;
        int input_h = idx / out_w * conv_param->stride_h_;
        int input_w = idx % out_w * conv_param->stride_w_;
        int input_row = -conv_param->pad_u_ + i_kh + input_h;
        int input_col = -conv_param->pad_l_ + i_kw + input_w;
        if (((unsigned)(input_row) < (unsigned)(in_h)) && ((unsigned)(input_col) < (unsigned)(in_w))) {
          int offset_x = (input_row * in_w + input_col) * out_ch + i_c;
          int offset_dy = idx * out_ch + i_c;
          float16x4_t x_0 = vld1_f16(x_addr + offset_x);
          float16x4_t dy_0 = vld1_f16(dy_addr + offset_dy);
          sum_0 = MS_VMLAL_F16(x_0, dy_0, sum_0);
        }
      }
    }
    for (int l = 0; l < leftover; l++) {
      dw[(i_c + l) * k_spatial + k_idx] = sum_0[l];
    }
  }
  return out_ch;
}

#endif

int ConvDwFilterFp16Grad(const float16_t *x, const float16_t *dy, float16_t *dw, int start, int count,
                         const ConvParameter *conv_param) {
  int in_h = conv_param->input_h_;
  int in_w = conv_param->input_w_;
  int k_h = conv_param->kernel_h_;
  int k_w = conv_param->kernel_w_;
  int batch = conv_param->output_batch_;
  int out_ch = conv_param->output_channel_;
  int in_ch = conv_param->input_channel_;
  int out_h = conv_param->output_h_;
  int out_w = conv_param->output_w_;

  int m = out_h * out_w;
  int x_size = in_h * in_w * in_ch;
  int y_size = out_ch * out_h * out_w;
  int k_spatial = k_w * k_h;

  for (int i_k = 0; i_k < count; i_k++) {
    int k_idx = start + i_k;
    int i_kh = k_idx / k_w;
    int i_kw = k_idx % k_w;
    int i_c = 0;
#ifdef ENABLE_NEON
    i_c = FilterGrad32Arm(x, dy, i_c, k_idx, dw, conv_param);
    i_c = FilterGrad16Arm(x, dy, i_c, k_idx, dw, conv_param);
    i_c = FilterGrad8Arm(x, dy, i_c, k_idx, dw, conv_param);
    i_c = FilterGrad4Arm(x, dy, i_c, k_idx, dw, conv_param);
    i_c = FilterGradLeftoverArm(x, dy, i_c, k_idx, dw, conv_param);
#endif
    for (; i_c < out_ch; i_c++) {
      float sum = 0;
      for (int b = 0; b < batch; ++b) {
        const float16_t *x_addr = &x[b * x_size];
        const float16_t *dy_addr = &dy[b * y_size];

        for (int i = 0; i < m; i++) {
          int idx = i;
          int input_h = idx / out_w * conv_param->stride_h_;
          int input_w = idx % out_w * conv_param->stride_w_;
          int input_row = -conv_param->pad_u_ + i_kh + input_h;
          int input_col = -conv_param->pad_l_ + i_kw + input_w;
          if (((unsigned)(input_row) < (unsigned)(in_h)) && ((unsigned)(input_col) < (unsigned)(in_w))) {
            int offset_x = (input_row * in_w + input_col) * out_ch + i_c;
            int offset_dy = idx * out_ch + i_c;
            sum += x_addr[offset_x] * dy_addr[offset_dy];
          }
        }
      }
      dw[i_c * k_spatial + k_idx] = sum;
    }
  }
  return 0;
}
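The NEON helpers peel the channel dimension in decreasing tile widths; each one returns the next unprocessed channel index, so the dispatch chain in ConvDwFilterFp16Grad covers 32-, 16-, 8-, and 4-wide tiles plus a scalar tail. The loop bounds rely on the round-down bitmask idiom, illustrated by this small self-contained snippet:

#include <stdio.h>

int main(void) {
  /* (n & ~31) clears the low 5 bits, rounding n down to a multiple of 32;
   * this is how FilterGrad32Arm bounds its vectorized channel loop before
   * handing the remainder to the 16/8/4-wide and leftover helpers. */
  int n = 70;
  printf("n=%d  &~31=%d  &~15=%d  &~7=%d  &~3=%d\n", n, n & ~31, n & ~15, n & ~7, n & ~3);
  return 0;  /* prints: n=70  &~31=64  &~15=64  &~7=64  &~3=68 */
}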
nnacl/fp16_grad/convolution_grad_filter.h (new file)
@@ -0,0 +1,33 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_CONVOLUTION_GRAD_FILTER_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_CONVOLUTION_GRAD_FILTER_H_

#include <stddef.h>
#include "nnacl/conv_parameter.h"

#ifdef __cplusplus
extern "C" {
#endif

int ConvDwFilterFp16Grad(const float16_t *x, const float16_t *dy, float16_t *dw, int start, int count,
                         const ConvParameter *conv_param);
#ifdef __cplusplus
}
#endif

#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_CONVOLUTION_GRAD_FILTER_H_
nnacl/fp16_grad/dropout_grad.c (new file)
@@ -0,0 +1,24 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "nnacl/fp16_grad/dropout_grad.h"

void DropoutFp16Grad(const float16_t *yt_ptr, const float16_t *mask, float16_t *output_ptr, int length,
                     float16_t scale) {
  for (int i = 0; i < length; i++) {
    output_ptr[i] = yt_ptr[i] * mask[i] * scale;
  }
}
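A hypothetical call site, assuming inverted dropout where the forward pass kept each element with probability p and the backward pass therefore rescales by 1/p (here p = 0.5, so scale = 2):

#include <stdio.h>
#include "nnacl/fp16_grad/dropout_grad.h"

int main(void) {
  float16_t dy[4] = {1, 2, 3, 4};
  float16_t mask[4] = {1, 0, 1, 0};  /* 0/1 keep mask from the forward pass */
  float16_t dx[4] = {0};
  DropoutFp16Grad(dy, mask, dx, 4, (float16_t)2.0f);
  for (int i = 0; i < 4; ++i) {
    printf("%.1f ", (float)dx[i]);   /* expected: 2.0 0.0 6.0 0.0 */
  }
  printf("\n");
  return 0;
}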
nnacl/fp16_grad/dropout_grad.h (new file)
@@ -0,0 +1,32 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_DROPOUT_GRAD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_DROPOUT_GRAD_H_

#include "nnacl/op_base.h"

#ifdef __cplusplus
extern "C" {
#endif

void DropoutFp16Grad(const float16_t *yt_ptr, const float16_t *mask, float16_t *output_ptr, int length,
                     float16_t ratio);
#ifdef __cplusplus
}
#endif

#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_DROPOUT_GRAD_H_
nnacl/fp16_grad/gemm_fp16.c (new file)
@@ -0,0 +1,373 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "nnacl/fp16_grad/gemm_fp16.h"
#include <string.h>
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
#include "nnacl/fp16/matmul_fp16.h"
#include "nnacl/fp16/pack_fp16.h"

#ifdef ENABLE_ARM64
static void Row2Col16Block16(const float16_t *src_ptr, float16_t *dst_ptr, size_t col) {
  size_t stride = col * 2;
  asm volatile(
    "mov x10, %[src_c]\n"
    "mov x11, %[dst_c]\n"

    "ld1 {v0.8h}, [x10], %[stride]\n"
    "ld1 {v1.8h}, [x10], %[stride]\n"
    "ld1 {v2.8h}, [x10], %[stride]\n"
    "ld1 {v3.8h}, [x10], %[stride]\n"
    "ld1 {v4.8h}, [x10], %[stride]\n"
    "ld1 {v5.8h}, [x10], %[stride]\n"
    "ld1 {v6.8h}, [x10], %[stride]\n"
    "ld1 {v7.8h}, [x10], %[stride]\n"

    "zip1 v16.8h, v0.8h, v1.8h\n"
    "zip1 v17.8h, v2.8h, v3.8h\n"
    "zip1 v18.8h, v4.8h, v5.8h\n"
    "zip1 v19.8h, v6.8h, v7.8h\n"

    "ld1 {v8.8h}, [x10], %[stride]\n"
    "ld1 {v9.8h}, [x10], %[stride]\n"
    "ld1 {v10.8h}, [x10], %[stride]\n"
    "ld1 {v11.8h}, [x10], %[stride]\n"
    "ld1 {v12.8h}, [x10], %[stride]\n"
    "ld1 {v13.8h}, [x10], %[stride]\n"
    "ld1 {v14.8h}, [x10], %[stride]\n"
    "ld1 {v15.8h}, [x10], %[stride]\n"

    "trn1 v20.4s, v16.4s, v17.4s\n"
    "trn2 v21.4s, v16.4s, v17.4s\n"
    "trn1 v22.4s, v18.4s, v19.4s\n"
    "trn2 v23.4s, v18.4s, v19.4s\n"

    "trn1 v24.2d, v20.2d, v22.2d\n"
    "trn2 v25.2d, v20.2d, v22.2d\n"
    "trn1 v26.2d, v21.2d, v23.2d\n"
    "trn2 v27.2d, v21.2d, v23.2d\n"

    "zip1 v16.8h, v8.8h, v9.8h\n"
    "zip1 v17.8h, v10.8h, v11.8h\n"
    "zip1 v18.8h, v12.8h, v13.8h\n"
    "zip1 v19.8h, v14.8h, v15.8h\n"

    "trn1 v20.4s, v16.4s, v17.4s\n"
    "trn2 v21.4s, v16.4s, v17.4s\n"
    "trn1 v22.4s, v18.4s, v19.4s\n"
    "trn2 v23.4s, v18.4s, v19.4s\n"

    "trn1 v28.2d, v20.2d, v22.2d\n"
    "trn2 v29.2d, v20.2d, v22.2d\n"
    "trn1 v30.2d, v21.2d, v23.2d\n"
    "trn2 v31.2d, v21.2d, v23.2d\n"

    "st1 {v24.8h}, [x11], #16\n"
    "st1 {v28.8h}, [x11], #16\n"
    "st1 {v26.8h}, [x11], #16\n"
    "st1 {v30.8h}, [x11], #16\n"
    "st1 {v25.8h}, [x11], #16\n"
    "st1 {v29.8h}, [x11], #16\n"
    "st1 {v27.8h}, [x11], #16\n"
    "st1 {v31.8h}, [x11], #16\n"

    "zip2 v16.8h, v0.8h, v1.8h\n"
    "zip2 v17.8h, v2.8h, v3.8h\n"
    "zip2 v18.8h, v4.8h, v5.8h\n"
    "zip2 v19.8h, v6.8h, v7.8h\n"

    "trn1 v20.4s, v16.4s, v17.4s\n"
    "trn2 v21.4s, v16.4s, v17.4s\n"
    "trn1 v22.4s, v18.4s, v19.4s\n"
    "trn2 v23.4s, v18.4s, v19.4s\n"

    "trn1 v24.2d, v20.2d, v22.2d\n"
    "trn2 v25.2d, v20.2d, v22.2d\n"
    "trn1 v26.2d, v21.2d, v23.2d\n"
    "trn2 v27.2d, v21.2d, v23.2d\n"

    "zip2 v16.8h, v8.8h, v9.8h\n"
    "zip2 v17.8h, v10.8h, v11.8h\n"
    "zip2 v18.8h, v12.8h, v13.8h\n"
    "zip2 v19.8h, v14.8h, v15.8h\n"

    "trn1 v20.4s, v16.4s, v17.4s\n"
    "trn2 v21.4s, v16.4s, v17.4s\n"
    "trn1 v22.4s, v18.4s, v19.4s\n"
    "trn2 v23.4s, v18.4s, v19.4s\n"

    "trn1 v28.2d, v20.2d, v22.2d\n"
    "trn2 v29.2d, v20.2d, v22.2d\n"
    "trn1 v30.2d, v21.2d, v23.2d\n"
    "trn2 v31.2d, v21.2d, v23.2d\n"

    "st1 {v24.8h}, [x11], #16\n"
    "st1 {v28.8h}, [x11], #16\n"
    "st1 {v26.8h}, [x11], #16\n"
    "st1 {v30.8h}, [x11], #16\n"
    "st1 {v25.8h}, [x11], #16\n"
    "st1 {v29.8h}, [x11], #16\n"
    "st1 {v27.8h}, [x11], #16\n"
    "st1 {v31.8h}, [x11], #16\n"
    :
    : [ dst_c ] "r"(dst_ptr), [ src_c ] "r"(src_ptr), [ stride ] "r"(stride)
    : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
      "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31");
}
#endif

void AddMatrixFp16(const float16_t *restrict v1, float16_t *restrict v2, float16_t beta, int row, int col,
                   int stride) {
  const float16_t *src_ptr = v1;
  float16_t *dst_ptr = v2;
  for (int r = 0; r < row; r++) {
    for (int c = 0; c < col; c++) {
      dst_ptr[c] += beta * src_ptr[c];
    }
    src_ptr += stride;
    dst_ptr += stride;
  }
}

int MatSizeFp16(int row, int col, int round) {
  int res = UP_ROUND(row, round) * col;
  return res;
}

int MatSizeTotalFp16(int row, int col, int deep, int stride) {
#ifdef ENABLE_ARM64
  const int num = C16NUM;
#else
  const int num = C12NUM;
#endif
  int res = MatSizeFp16(row, deep, num) + MatSizeFp16(col, deep, C8NUM);
  if (stride > 0) res += row * stride;
  return res;
}

#ifdef ENABLE_ARM64
static void RowMajor2Col16MajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) {
  size_t row_up_16 = UP_ROUND(row, C16NUM);
  size_t row16 = row / C16NUM * C16NUM;
  size_t col8 = col / C8NUM * C8NUM;
  const float16_t *src_r = src;
  float16_t *dst_r = dst;
  size_t ri = 0;
  // find 16 block unit
  for (; ri < row16; ri += C16NUM) {
    size_t ci = 0;
    for (; ci < col8; ci += C8NUM) {
      const float16_t *src_c = src_r + ci;
      float16_t *dst_c = dst_r + ci * C16NUM;
#ifdef ENABLE_ARM64
      Row2Col16Block16(src_c, dst_c, stride);
#else
      for (int tr = 0; tr < C16NUM; tr++) {
        for (int tc = 0; tc < C8NUM; tc++) {
          dst_c[tc * C16NUM + tr] = src_c[tr * stride + tc];
        }
      }
#endif
    }
    for (; ci < col; ci++) {
      const float16_t *src_c = src_r + ci;
      float16_t *dst_c = dst_r + ci * C16NUM;
      for (size_t i = 0; i < C16NUM; i++) {
        dst_c[i] = src_c[i * stride];
      }
    }
    src_r += C16NUM * stride;
    dst_r += C16NUM * col;
  }
  for (; ri < row; ri++) {
    for (size_t i = 0; i < col; ++i) {
      dst_r[i * C16NUM] = src_r[i];
    }
    src_r += stride;
    dst_r += 1;
  }
  for (; ri < row_up_16; ri++) {
    for (size_t i = 0; i < col; i++) {
      dst_r[i * C16NUM] = 0;
    }
    dst_r += 1;
  }
  return;
}
#endif

void RowMajor2Row16MajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) {
  for (int r = 0; r < row; r++) {
    for (int c = 0; c < col; c++) {
      int c_div16 = c / 16;
      int c_mod16 = c % 16;
      dst[c_div16 * 16 * row + r * 16 + c_mod16] = src[r * stride + c];
    }
  }
}

void RowMajor2Col12MajorStrideFp16(const float16_t *src, float16_t *dst, size_t row, size_t col, int stride) {
  size_t row_up_12 = UP_ROUND(row, C12NUM);
  size_t row12 = row / C12NUM * C12NUM;
  size_t col8 = col / C8NUM * C8NUM;
  const float16_t *src_r = src;
  float16_t *dst_r = dst;
  size_t ri = 0;
  // transpose 12x8
  for (; ri < row12; ri += C12NUM) {
    size_t ci = 0;
    for (; ci < col8; ci += C8NUM) {
      const float16_t *src_c = src_r + ci;
      float16_t *dst_c = dst_r + ci * C12NUM;
#ifdef ENABLE_ARM82_A32
      Transpose12x8A32Fp16(src_c, dst_c, stride * sizeof(float16_t), 24);
#else
      for (int tr = 0; tr < C12NUM; tr++) {
        for (int tc = 0; tc < C8NUM; tc++) {
          dst_c[tc * C12NUM + tr] = src_c[tr * stride + tc];
        }
      }
#endif
    }
    for (; ci < col; ci++) {
      const float16_t *src_c = src_r + ci;
      float16_t *dst_c = dst_r + ci * C12NUM;
      for (size_t i = 0; i < C12NUM; i++) {
        dst_c[i] = src_c[i * stride];
      }
    }
    src_r += C12NUM * stride;
    dst_r += C12NUM * col;
  }
  for (; ri < row; ri++) {
    for (size_t i = 0; i < col; ++i) {
      dst_r[i * C12NUM] = src_r[i];
    }
    src_r += stride;
    dst_r += 1;
  }
  for (; ri < row_up_12; ri++) {
    for (size_t i = 0; i < col; i++) {
      dst_r[i * C12NUM] = 0;
    }
    dst_r += 1;
  }
}

void RowMajor2Row12MajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) {
  for (int r = 0; r < row; r++) {
    for (int c = 0; c < col; c++) {
      int c_div12 = c / 12;
      int c_mod12 = c % 12;
      dst[c_div12 * 12 * row + r * 12 + c_mod12] = src[r * stride + c];
    }
  }
}

static void RowMajor2Col8MajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) {
  for (int r = 0; r < row; r++) {
    for (int c = 0; c < col; c++) {
      int r_div8 = r / 8;
      int r_mod8 = r % 8;
      dst[r_div8 * 8 * col + c * 8 + r_mod8] = src[r * stride + c];
    }
  }
}

static void RowMajor2Row8MajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) {
  for (int r = 0; r < row; r++) {
    const float16_t *src_ptr = src + r * stride;
    int c = 0;
    for (; c < col; c++) {
      int cd8 = c / C8NUM;
      int cm8 = c % C8NUM;
      dst[cd8 * C8NUM * row + r * C8NUM + cm8] = src_ptr[c];
    }
    for (; c < UP_ROUND(col, C8NUM); c++) {
      int cd8 = c / C8NUM;
      int cm8 = c % C8NUM;
      dst[cd8 * C8NUM * row + r * C8NUM + cm8] = 0;
    }
  }
  return;
}

static void RowMajor2ColXMajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) {
#ifdef ENABLE_ARM64
  RowMajor2Col16MajorStrideFp16(src, dst, row, col, stride);
#else
  RowMajor2Col12MajorStrideFp16(src, dst, row, col, stride);
#endif
}

static void RowMajor2RowXMajorStrideFp16(const float16_t *src, float16_t *dst, int row, int col, int stride) {
#ifdef ENABLE_ARM64
  RowMajor2Row16MajorStrideFp16(src, dst, row, col, stride);
#else
  RowMajor2Row12MajorStrideFp16(src, dst, row, col, stride);
#endif
}

void GemmMatmulFp16(int ta, int tb, int M, int N, int K, float16_t alpha, const float16_t *mat_a, int lda,
                    const float16_t *mat_b, int ldb, float16_t beta, float16_t *mat_c, int ldc,
                    float16_t *workspace) {
  GemmCbFp16 gcb;
  gcb.atype = ActType_No;
  gcb.ca = 0;
  gcb.cb = 0;
  gcb.bias = NULL;
  GemmMatmulPlusFp16(ta, tb, M, N, K, alpha, mat_a, lda, mat_b, ldb, beta, mat_c, ldc, workspace, &gcb);
}

void GemmMatmulPlusFp16(int ta, int tb, int M, int N, int K, float16_t alpha, const float16_t *mat_a, int lda,
                        const float16_t *mat_b, int ldb, float16_t beta, float16_t *mat_c, int ldc,
                        float16_t *workspace, GemmCbFp16 *gcb) {
#ifdef ENABLE_ARM64
  const int num = C16NUM;
#else
  const int num = C12NUM;
#endif
  float16_t *output = mat_c;
  float16_t *fworkspace = workspace;
  int incremental = (beta < 0.f) || (beta > 0.f);
  float16_t *mat_a_input = (float16_t *)mat_a;
  float16_t *mat_b_input = (float16_t *)mat_b;

  if (!gcb->ca) {
    mat_a_input = fworkspace;
    fworkspace += MatSizeFp16(M, K, num);
    if (ta) {
      RowMajor2RowXMajorStrideFp16(mat_a, mat_a_input, K, M, lda);
    } else {
      RowMajor2ColXMajorStrideFp16(mat_a, mat_a_input, M, K, lda);
    }
  }
  if (!gcb->cb) {
    mat_b_input = fworkspace;
    fworkspace += MatSizeFp16(N, K, C8NUM);
    if (tb) {
      RowMajor2Col8MajorStrideFp16(mat_b, mat_b_input, N, K, ldb);
    } else {
      RowMajor2Row8MajorStrideFp16(mat_b, mat_b_input, K, N, ldb);
    }
  }
  if (incremental) output = fworkspace;
  MatMulFp16(mat_a_input, mat_b_input, output, gcb->bias, gcb->atype, K, M, N, ldc, OutType_Nhwc);
  if (incremental) AddMatrixFp16(output, mat_c, beta, M, N, ldc);
  gcb->mat_a = mat_a_input;
  gcb->mat_b = mat_b_input;
}
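A sketch of a call site: the caller sizes the packing workspace with MatSizeTotalFp16 (last argument 0 because beta = 0 means no incremental output buffer is needed) and then runs C[M x N] = A[M x K] * B[K x N]. Hypothetical wrapper, assuming row-major, non-transposed inputs:

#include <stdlib.h>
#include "nnacl/fp16_grad/gemm_fp16.h"

void example_gemm(const float16_t *a, const float16_t *b, float16_t *c, int M, int N, int K) {
  int ws_elems = MatSizeTotalFp16(M, N, K, 0);  /* packed-A plus packed-B elements */
  float16_t *ws = (float16_t *)malloc(ws_elems * sizeof(float16_t));
  if (ws == NULL) return;
  /* ta = tb = 0, lda = K, ldb = N, ldc = N for tight row-major layouts */
  GemmMatmulFp16(0, 0, M, N, K, 1.0f, a, K, b, N, 0.0f, c, N, ws);
  free(ws);
}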
nnacl/fp16_grad/gemm_fp16.h (new file)
@@ -0,0 +1,46 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_GEMM_FP16_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_GEMM_FP16_H_

#include <stdlib.h>
#include "nnacl/op_base.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct {
  int ca;
  int cb;
  ActType atype;
  float16_t *bias;
  float16_t *mat_a;
  float16_t *mat_b;
} GemmCbFp16;

void GemmMatmulFp16(int ta, int tb, int M, int N, int K, float16_t alpha, const float16_t *mat_a, int lda,
                    const float16_t *mat_b, int ldb, float16_t beta, float16_t *mat_c, int ldc, float16_t *workspace);
void GemmMatmulPlusFp16(int ta, int tb, int M, int N, int K, float16_t alpha, const float16_t *mat_a, int lda,
                        const float16_t *mat_b, int ldb, float16_t beta, float16_t *mat_c, int ldc,
                        float16_t *workspace, GemmCbFp16 *gcb);
int MatSizeFp16(int row, int col, int round);
int MatSizeTotalFp16(int row, int col, int deep, int inc);
void AddMatrixFp16(const float16_t *v1, float16_t *v2, float16_t beta, int row, int col, int stride);
#ifdef __cplusplus
}
#endif

#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_GEMM_FP16_H_
@@ -0,0 +1,59 @@
/* Copyright 2021 Huawei Technologies Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0). */
#include "nnacl/fp16_grad/layernorm_grad.h"
#include <stddef.h>
#include <math.h>
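
/* The code below follows the standard LayerNorm backward derivation: with
 * xhat = (x - mean) * rstd (here rstd = 1/sqrt(var) arrives via the `var` argument),
 *   dgamma = sum(dy * xhat)   (per parameter, across blocks)
 *   dbeta  = sum(dy)
 * and dx is assembled per block from three terms: the direct term dy * gamma * rstd,
 * the variance term (sum1), and the mean term (sum2, sum3). All sums accumulate in
 * fp32 to limit rounding error before narrowing back to float16. */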

void LayerNormFp16Grad(const float16_t *x, const float16_t *dy, const float16_t *var, const float16_t *mean,
                       const float16_t *gamma, int param_num, int param_size, int block_num, int block_size,
                       float16_t *dx, float16_t *dg, float16_t *db) {
  // var actually holds 1/sqrt(var), the reciprocal standard deviation.
  const float16_t *var_sqrt_rev = var;
  for (int i = 0; i < param_num; ++i) {
    float dgamma = 0.0f;
    float dbeta = 0.0f;
    for (int j = i; j < param_size * param_num; j += param_num) {
      int norm_shift = j / block_size;
      dgamma += dy[j] * var_sqrt_rev[norm_shift] * (x[j] - mean[norm_shift]);
      dbeta += dy[j];
    }
    dg[i] = (float16_t)dgamma;
    db[i] = (float16_t)dbeta;
  }
  for (int i = 0; i < block_num; ++i) {
    float sum1 = 0.0f;
    float sum2 = 0.0f;
    float sum3 = 0.0f;
    for (int j = i * block_size; j < (i + 1) * block_size; ++j) {
      int param_shift = j % param_num;
      int norm_shift = j / block_size;
      float16_t dxm = x[j] - mean[norm_shift];
      float16_t dyg = dy[j] * gamma[param_shift];
      sum1 += -0.5f * dyg * dxm * var_sqrt_rev[norm_shift] * var_sqrt_rev[norm_shift] * var_sqrt_rev[norm_shift];
      sum2 += dyg;
      sum3 += -2.0f * dxm;
    }
    for (int j = i * block_size; j < (i + 1) * block_size; ++j) {
      int param_shift = j % param_num;
      int norm_shift = j / block_size;
      float16_t var_sqrt = var_sqrt_rev[norm_shift];
      float dx1 = dy[j] * gamma[param_shift] * var_sqrt;
      float dx2 = sum1 * 2.0f / block_size * (x[j] - mean[norm_shift]);
      float dx3 = (-1.0f * var_sqrt * sum2 + (1.0f / block_size) * sum1 * sum3) * (1.0f / block_size);
      dx[j] = (float16_t)(dx1 + dx2 + dx3);
    }
  }
}

@@ -0,0 +1,32 @@
/* Copyright 2021 Huawei Technologies Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0). */
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_LAYERNORM_GRAD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_LAYERNORM_GRAD_H_

#include "nnacl/op_base.h"

#ifdef __cplusplus
extern "C" {
#endif

void LayerNormFp16Grad(const float16_t *x, const float16_t *dy, const float16_t *var, const float16_t *mean,
                       const float16_t *gamma, int param_num, int param_size, int block_num, int block_size,
                       float16_t *dx, float16_t *dg, float16_t *db);

#ifdef __cplusplus
}
#endif
#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_LAYERNORM_GRAD_H_

@@ -0,0 +1,201 @@
/* Copyright 2020 Huawei Technologies Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0). */

#include <string.h>
#include "nnacl/fp16_grad/pack_fp16_ext.h"

void RollingIm2ColPackDwUnitFp16(const float16_t *in_data, const ConvParameter *conv_param, float16_t *data_col_orig,
                                 int real_cal_num, int start) {
  const int pad_left = conv_param->pad_l_;
  const int pad_up = conv_param->pad_u_;

  const int stride_h = conv_param->stride_h_;
  const int stride_w = conv_param->stride_w_;

  const int dilation_h = conv_param->dilation_h_;
  const int dilation_w = conv_param->dilation_w_;

  const int kernel_h = conv_param->kernel_h_;
  const int kernel_w = conv_param->kernel_w_;

  const int in_height = conv_param->input_h_;
  const int in_width = conv_param->input_w_;

  const int output_w = conv_param->output_w_;

  const int channels = conv_param->input_channel_;
  const int stride = kernel_h * kernel_w;

  int kernel_row, kernel_col;

  for (int i = 0; i < real_cal_num; i++) {
    int block_start = start + i;
    int input_h = block_start / output_w * stride_h;
    int input_w = block_start % output_w * stride_w;
    float16_t *data_col = data_col_orig + i * channels * stride;
    for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
      int input_row = -pad_up + kernel_row * dilation_h + input_h;
      for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
        int input_col = -pad_left + kernel_col * dilation_w + input_w;
        if (((unsigned)(input_row) < (unsigned)(in_height)) && ((unsigned)(input_col) < (unsigned)(in_width))) {
          const int offset = (input_row * in_width + input_col) * channels;
          for (int c = 0; c < channels; c++) {
            data_col[c * stride] = in_data[offset + c];
          }
          data_col++;
        } else {
          for (int c = 0; c < channels; c++) {
            data_col[c * stride] = 0;
          }
          data_col++;
        }
      }
    }
  }
}

void RollingIm2ColPackUnitFp16(const float16_t *input_data, const ConvParameter *conv_param, float16_t *packed_input,
                               int real_cal_num, int block_index) {
  const int pad_left = conv_param->pad_l_;
  const int pad_up = conv_param->pad_u_;

  const int stride_h = conv_param->stride_h_;
  const int stride_w = conv_param->stride_w_;

  const int dilation_h = conv_param->dilation_h_;
  const int dilation_w = conv_param->dilation_w_;

  const int kernel_h = conv_param->kernel_h_;
  const int kernel_w = conv_param->kernel_w_;

  const int in_height = conv_param->input_h_;
  const int in_width = conv_param->input_w_;

  const int output_w = conv_param->output_w_;

  const int channels = conv_param->input_channel_ / conv_param->group_;
  const int tot_channels = conv_param->input_channel_;

  int kernel_row, kernel_col;

  if (channels == 1) {
    for (int i = 0; i < real_cal_num; i++) {
      int block_start = block_index + i;
      int input_h = block_start / output_w * stride_h;
      int input_w = block_start % output_w * stride_w;
      for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
        int input_row = -pad_up + kernel_row * dilation_h + input_h;
        for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
          int input_col = -pad_left + kernel_col * dilation_w + input_w;
          if (((unsigned)(input_row) < (unsigned)(in_height)) && ((unsigned)(input_col) < (unsigned)(in_width))) {
            const int offset = (input_row * in_width + input_col) * tot_channels;
            *packed_input = input_data[offset];
            packed_input++;
          } else {
            *packed_input = 0;
            packed_input++;
          }
        }
      }
    }
  } else {
    for (int i = 0; i < real_cal_num; i++) {
      int block_start = block_index + i;
      int input_h = block_start / output_w * stride_h;
      int input_w = block_start % output_w * stride_w;
      for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
        int input_row = -pad_up + kernel_row * dilation_h + input_h;
        for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
          int input_col = -pad_left + kernel_col * dilation_w + input_w;
          if (((unsigned)(input_row) < (unsigned)(in_height)) && ((unsigned)(input_col) < (unsigned)(in_width))) {
            const int offset = (input_row * in_width + input_col) * tot_channels;
            memcpy(packed_input, input_data + offset, sizeof(float16_t) * channels);
            packed_input += channels;
          } else {
            memset(packed_input, 0, sizeof(float16_t) * channels);
            packed_input += channels;
          }
        }
      }
    }
  }
}

void RollingCol2ImPackUnitFp16(const float16_t *data_col, float16_t *data_im, const ConvParameter *conv_param,
                               int real_cal_num, int block_index) {
  const int pad_left = conv_param->pad_l_;
  const int pad_up = conv_param->pad_u_;

  const int stride_h = conv_param->stride_h_;
  const int stride_w = conv_param->stride_w_;

  const int dilation_h = conv_param->dilation_h_;
  const int dilation_w = conv_param->dilation_w_;

  const int kernel_h = conv_param->kernel_h_;
  const int kernel_w = conv_param->kernel_w_;

  const int in_height = conv_param->input_h_;
  const int in_width = conv_param->input_w_;

  const int output_w = conv_param->output_w_;
  const int channels = conv_param->input_channel_ / conv_param->group_;
  const int tot_channels = conv_param->input_channel_;

  int kernel_row, kernel_col;

  if (channels == 1) {
    for (int r = 0; r < real_cal_num; r++) {
      int output_col = (block_index + r) % output_w;
      int output_row = (block_index + r) / output_w;
      int row_stride_offset = output_row * stride_h;
      int col_stride_offset = output_col * stride_w;
      for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
        int input_row = -pad_up + kernel_row * dilation_h + row_stride_offset;
        for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
          int input_col = -pad_left + kernel_col * dilation_w + col_stride_offset;
          if (((unsigned)(input_row) < (unsigned)(in_height)) && ((unsigned)(input_col) < (unsigned)(in_width))) {
            int offset = (input_row * in_width + input_col) * tot_channels;
            float16_t *data_im_ptr = data_im + offset;
            *data_im_ptr += *data_col;
          }
          data_col++;
        }
      }
    }
  } else {
    for (int r = 0; r < real_cal_num; r++) {
      int output_col = (block_index + r) % output_w;
      int output_row = (block_index + r) / output_w;
      int row_stride_offset = output_row * stride_h;
      int col_stride_offset = output_col * stride_w;
      for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
        int input_row = -pad_up + kernel_row * dilation_h + row_stride_offset;
        for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
          int input_col = -pad_left + kernel_col * dilation_w + col_stride_offset;
          if (((unsigned)(input_row) < (unsigned)(in_height)) && ((unsigned)(input_col) < (unsigned)(in_width))) {
            int offset = (input_row * in_width + input_col) * tot_channels;
            float16_t *data_im_ptr = &data_im[offset];
            for (int i = 0; i < channels; i++) {
              data_im_ptr[i] += data_col[i];
            }
          }
          data_col += channels;
        }
      }
    }
  }
}

@@ -0,0 +1,37 @@
/* Copyright 2021 Huawei Technologies Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0). */

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_PACK_FP16_EXT_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_PACK_FP16_EXT_H_

#include <stddef.h>
#include "nnacl/conv_parameter.h"

#ifdef __cplusplus
extern "C" {
#endif

void RollingIm2ColPackUnitFp16(const float16_t *input_data, const ConvParameter *conv_param, float16_t *packed_input,
                               int real_cal_num, int block_index);
void RollingIm2ColPackDwUnitFp16(const float16_t *input_data, const ConvParameter *conv_param, float16_t *packed_input,
                                 int real_cal_num, int block_index);
void RollingCol2ImPackUnitFp16(const float16_t *data_col, float16_t *data_im, const ConvParameter *conv_param,
                               int real_cal_num, int block_index);
#ifdef __cplusplus
}
#endif

#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_PACK_FP16_EXT_H_

@@ -0,0 +1,189 @@
/* Copyright 2021 Huawei Technologies Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0). */
#include <stdint.h>
#include <string.h>
#include <float.h>
#include "nnacl/fp16_grad/pooling_grad.h"

void AvgPoolingFp16Grad(const float16_t *input_ptr, float16_t *output_ptr, int count, PoolingParameter *pooling_param) {
  int stride_w = pooling_param->stride_w_;
  int stride_h = pooling_param->stride_h_;
  int pad_w = pooling_param->pad_l_;
  int pad_h = pooling_param->pad_u_;
  int win_w = pooling_param->window_w_;
  int win_h = pooling_param->window_h_;
  int channel = pooling_param->input_channel_;
  int in_w = pooling_param->input_w_;
  int in_h = pooling_param->input_h_;
  int output_w = pooling_param->output_w_;
  int output_h = pooling_param->output_h_;

  const float16_t kk = 1.0f / (float16_t)(win_h * win_w);
#ifdef ENABLE_NEON
  const float16x4_t factor = vdup_n_f16(kk);
#endif
  for (int ib = 0; ib < count; ib++) {
    float16_t *out = &output_ptr[(ib * in_h * in_w * channel)];
    const float16_t *inPtr = &input_ptr[(ib * output_h * output_w * channel)];
    // iterate over yt
    for (int yh = 0; yh < output_h; yh++) {
      int over_h = pad_h - yh * stride_h;
      int kh_s = MSMAX(0, over_h);
      int kh_e = MSMIN(win_h, in_h + over_h);
      for (int yw = 0; yw < output_w; yw++) {
        int over_w = pad_w - yw * stride_w;
        int kw_s = MSMAX(0, over_w);
        int kw_e = MSMIN(win_w, in_w + over_w);
        int ic = 0;
        for (; ic <= channel - 4; ic += 4) {
          int idx = (yw + yh * output_w) * channel + ic;
#ifdef ENABLE_NEON
          float16x4_t in = vld1_f16(inPtr + idx);
          float16x4_t delta = vmul_f16(in, factor);
#else
          float16_t delta[4] = {inPtr[idx], inPtr[idx + 1], inPtr[idx + 2], inPtr[idx + 3]};
          for (int i = 0; i < 4; i++) delta[i] *= kk;
#endif
          for (int kh = kh_s; kh < kh_e; kh++) {
            int xh = yh * stride_h + kh - pad_h;
            for (int kw = kw_s; kw < kw_e; kw++) {
              int xw = yw * stride_w + kw - pad_w;
#ifdef ENABLE_NEON
              float16_t *out_vec = out + (xw + in_w * xh) * channel + ic;
              float16x4_t outr = vld1_f16(out_vec);
              float16x4_t outs = vadd_f16(outr, delta);
              vst1_f16(out_vec, outs);
#else
              for (int i = 0; i < 4; i++) {
                out[(xw + in_w * xh) * channel + ic + i] += delta[i];
              }
#endif
            }
          }
        }
        for (; ic < channel; ic++) {
          int idx = (yw + yh * output_w) * channel + ic;
          float16_t delta = inPtr[idx] * kk;
          for (int kh = kh_s; kh < kh_e; kh++) {
            int xh = yh * stride_h + kh - pad_h;
            for (int kw = kw_s; kw < kw_e; kw++) {
              int xw = yw * stride_w + kw - pad_w;
              out[(xw + in_w * xh) * channel + ic] += delta;
            }
          }
        }
      }
    }
  }
}

#ifdef ENABLE_NEON
static uint32x4_t MaxIndex(float16x4_t in, float16x4_t *max, uint32x4_t index, uint32x4_t prev_index) {
  uint16x4_t res = vcgt_f16(in, *max);
  // Sign-extend the 16-bit all-ones comparison mask to 32 bits so every bit selects.
  int32x4_t widened = vmovl_s16(vreinterpret_s16_u16(res));
  uint32x4_t res_tmp = vreinterpretq_u32_s32(widened);
  uint32x4_t m_index = vbslq_u32(res_tmp, index, prev_index);
  *max = vbsl_f16(res, in, *max);
  return m_index;
}
#endif
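
/* Max-pooling backward: for each output window, re-run the forward argmax over the window
 * and add dy only at the winning input position. MaxIndex keeps a running per-lane maximum
 * and the flat index where it occurred. */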

void MaxPoolingFp16Grad(const float16_t *input_ptr, const float16_t *dy_ptr, float16_t *output_ptr, int output_batch,
                        PoolingParameter *pooling_param) {
  int stride_w = pooling_param->stride_w_;
  int stride_h = pooling_param->stride_h_;
  int pad_w = pooling_param->pad_l_;
  int pad_h = pooling_param->pad_u_;
  int win_w = pooling_param->window_w_;
  int win_h = pooling_param->window_h_;
  int channel = pooling_param->input_channel_;
  int in_w = pooling_param->input_w_;
  int in_h = pooling_param->input_h_;
  int output_w = pooling_param->output_w_;
  int output_h = pooling_param->output_h_;

  for (int ib = 0; ib < output_batch; ib++) {
    float16_t *out = &output_ptr[(ib * in_h * in_w * channel)];
    const float16_t *inPtr = &input_ptr[(ib * in_h * in_w * channel)];
    const float16_t *dyPtr = &dy_ptr[(ib * output_h * output_w * channel)];
    for (int yh = 0; yh < output_h; yh++) {
      int over_h = pad_h - yh * stride_h;
      int kh_s = MSMAX(0, over_h);
      int kh_e = MSMIN(win_h, in_h + over_h);
      for (int yw = 0; yw < output_w; yw++) {
        int over_w = pad_w - yw * stride_w;
        int kw_s = MSMAX(0, over_w);
        int kw_e = MSMIN(win_w, in_w + over_w);
        int ic = 0;
        for (; ic < (channel & ~3); ic += 4) {
          int idx = (yw + yh * output_w) * channel + ic;
#ifdef ENABLE_NEON
          uint32x4_t max_idx = vdupq_n_u32(0);
          float16x4_t max_val = vdup_n_f16(-FLT16_MAX);
          float16x4_t delta = vld1_f16(dyPtr + idx);
#else
          float16_t delta[4] = {dyPtr[idx], dyPtr[idx + 1], dyPtr[idx + 2], dyPtr[idx + 3]};
          float16_t max_val[4] = {-FLT16_MAX, -FLT16_MAX, -FLT16_MAX, -FLT16_MAX};
          uint32_t max_idx[4] = {0};
#endif
          for (int kh = kh_s; kh < kh_e; kh++) {
            int xh = yh * stride_h + kh - pad_h;
            for (int kw = kw_s; kw < kw_e; kw++) {
              int xw = yw * stride_w + kw - pad_w;
              int val_idx = (xw + in_w * xh) * channel + ic;
#ifdef ENABLE_NEON
              uint32x4_t index = {val_idx, val_idx + 1, val_idx + 2, val_idx + 3};
              float16x4_t in = vld1_f16(inPtr + val_idx);
              max_idx = MaxIndex(in, &max_val, index, max_idx);
#else
              float16_t val[4] = {inPtr[val_idx], inPtr[val_idx + 1], inPtr[val_idx + 2], inPtr[val_idx + 3]};
              for (int i = 0; i < 4; i++) {
                if (val[i] > max_val[i]) {
                  max_val[i] = val[i];
                  max_idx[i] = val_idx + i;
                }
              }
#endif
            }
          }
          for (int i = 0; i < 4; i++) {
            out[((uint32_t *)&max_idx)[i]] += ((float16_t *)&delta)[i];
          }
        }
        for (; ic < channel; ic++) {
          float16_t max_val = -FLT16_MAX;
          int max_idx = 0;
          int idx = (yw + yh * output_w) * channel + ic;
          float16_t delta = dyPtr[idx];
          for (int kh = kh_s; kh < kh_e; kh++) {
            int xh = yh * stride_h + kh - pad_h;
            for (int kw = kw_s; kw < kw_e; kw++) {
              int xw = yw * stride_w + kw - pad_w;
              int val_idx = (xw + in_w * xh) * channel + ic;
              float16_t val = inPtr[val_idx];
              if (val > max_val) {
                max_val = val;
                max_idx = val_idx;
              }
            }
          }
          out[max_idx] += delta;
        }
      }
    }
  }
}

@@ -0,0 +1,32 @@
/* Copyright 2021 Huawei Technologies Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0). */

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_POOLING_GRAD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_POOLING_GRAD_H_

#include "nnacl/fp16/pooling_fp16.h"

#ifdef __cplusplus
extern "C" {
#endif
void AvgPoolingFp16Grad(const float16_t *input_ptr, float16_t *output_ptr, int count, PoolingParameter *pooling_param);
void MaxPoolingFp16Grad(const float16_t *input_ptr, const float16_t *dy_ptr, float16_t *output_ptr, int output_batch,
                        PoolingParameter *pooling_param);
#ifdef __cplusplus
}
#endif

#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_POOLING_GRAD_H_

@@ -0,0 +1,142 @@
/* Copyright 2021 Huawei Technologies Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0). */
#include "nnacl/fp16_grad/resize_grad.h"
#include <math.h>
#include "nnacl/infer/common_infer.h"

void ResizeNearestNeighborFp16Grad(float16_t *in_addr, float16_t *out_addr, int batch_size, int channel, int format,
                                   ResizeFp16GradParameter *param) {
  bool align_corners = param->align_corners_;
  size_t in_hw_size = param->in_width_ * param->in_height_;
  size_t out_hw_size = param->out_width_ * param->out_height_;

  if (format == Format_NHWC) {
    for (int32_t b = 0; b < batch_size; ++b) {
      for (size_t i = 0; i < in_hw_size; ++i) {
        size_t in_y = i / param->in_width_;
        size_t in_x = i % param->in_width_;
        for (int32_t c = 0; c < channel; ++c) {
          size_t out_y = MSMIN(
            (align_corners) ? (size_t)roundf(in_y * param->height_scale_) : (size_t)floorf(in_y * param->height_scale_),
            param->out_height_ - 1);
          size_t out_x = MSMIN(
            (align_corners) ? (size_t)roundf(in_x * param->width_scale_) : (size_t)floorf(in_x * param->width_scale_),
            param->out_width_ - 1);
          size_t out_offset = out_y * (param->out_width_ * channel) + (out_x * channel) + c;
          size_t in_offset = in_y * (param->in_width_ * channel) + (in_x * channel) + c;
          out_addr[out_offset] += in_addr[in_offset];
        }
      }
      out_addr += out_hw_size * channel;
      in_addr += in_hw_size * channel;
    }
  } else if (format == Format_NCHW) {
    for (int32_t b = 0; b < batch_size; ++b) {
      for (int32_t c = 0; c < channel; ++c) {
        for (size_t h = 0; h < param->in_height_; ++h) {
          size_t out_y =
            MSMIN((align_corners) ? (size_t)roundf(h * param->height_scale_) : (size_t)floorf(h * param->height_scale_),
                  param->out_height_ - 1);
          for (size_t w = 0; w < param->in_width_; ++w) {
            size_t out_x =
              MSMIN((align_corners) ? (size_t)roundf(w * param->width_scale_) : (size_t)floorf(w * param->width_scale_),
                    param->out_width_ - 1);
            out_addr[out_y * param->out_width_ + out_x] += in_addr[h * param->in_width_ + w];
          }
        }
        out_addr += out_hw_size;
        in_addr += in_hw_size;
      }
    }
  }
}
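
/* Bilinear backward: each input gradient is split across the four neighbouring output
 * pixels with the standard lerp weights
 *   (1-yl)*(1-xl), (1-yl)*xl, yl*(1-xl), yl*xl
 * where xl/yl are the fractional parts of the scaled coordinates. */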

void ResizeBiLinearFp16Grad(float16_t *in_addr, float16_t *out_addr, int batch_size, int channel, int format,
                            ResizeFp16GradParameter *param) {
  size_t in_hw_size = param->in_width_ * param->in_height_;
  size_t out_hw_size = param->out_width_ * param->out_height_;

  if (format == Format_NHWC) {
    for (int32_t b = 0; b < batch_size; ++b) {
      for (size_t i = 0; i < in_hw_size; ++i) {
        size_t h = i / param->in_width_;
        size_t w = i % param->in_width_;
        for (int32_t c = 0; c < channel; ++c) {
          float16_t in_y = (float16_t)h * param->height_scale_;
          size_t top_y_index = MSMAX((size_t)(floorf(in_y)), (size_t)(0));
          size_t bottom_y_index = MSMIN((size_t)(ceilf(in_y)), param->out_height_ - 1);
          float16_t y_lerp = in_y - floorf(in_y);
          float16_t inverse_y_lerp = 1.0 - y_lerp;

          float16_t in_x = (float16_t)w * param->width_scale_;
          size_t left_x_index = MSMAX((size_t)(floorf(in_x)), (size_t)(0));
          size_t right_x_index = MSMIN((size_t)(ceilf(in_x)), param->out_width_ - 1);
          float16_t x_lerp = in_x - floorf(in_x);
          float16_t inverse_x_lerp = 1.0 - x_lerp;

          size_t in_offset = h * (param->in_width_ * channel) + (w * channel) + c;
          size_t out_offset_top_y_left_x = top_y_index * (param->out_width_ * channel) + (left_x_index * channel) + c;
          size_t out_offset_top_y_right_x = top_y_index * (param->out_width_ * channel) + (right_x_index * channel) + c;
          size_t out_offset_bottom_y_left_x =
            bottom_y_index * (param->out_width_ * channel) + (left_x_index * channel) + c;
          size_t out_offset_bottom_y_right_x =
            bottom_y_index * (param->out_width_ * channel) + (right_x_index * channel) + c;

          out_addr[out_offset_top_y_left_x] += in_addr[in_offset] * (float16_t)(inverse_y_lerp * inverse_x_lerp);
          out_addr[out_offset_top_y_right_x] += in_addr[in_offset] * (float16_t)(inverse_y_lerp * x_lerp);
          out_addr[out_offset_bottom_y_left_x] += in_addr[in_offset] * (float16_t)(y_lerp * inverse_x_lerp);
          out_addr[out_offset_bottom_y_right_x] += in_addr[in_offset] * (float16_t)(y_lerp * x_lerp);
        }
      }
      out_addr += out_hw_size * channel;
      in_addr += in_hw_size * channel;
    }
  } else if (format == Format_NCHW) {
    size_t in_height = param->in_height_;
    size_t in_width = param->in_width_;
    size_t out_height = param->out_height_;
    size_t out_width = param->out_width_;

    for (int32_t b = 0; b < batch_size; ++b) {
      for (int32_t c = 0; c < channel; ++c) {
        for (size_t h = 0; h < in_height; ++h) {
          const float16_t in_y = (float16_t)h * param->height_scale_;
          const size_t top_y_index = MSMAX((size_t)floorf(in_y), 0);
          const size_t bottom_y_index = MSMIN((size_t)ceilf(in_y), out_height - 1);
          const float16_t y_lerp = in_y - floorf(in_y);
          const float16_t inverse_y_lerp = 1.0 - y_lerp;
          for (size_t w = 0; w < in_width; ++w) {
            const float16_t in_x = (float16_t)w * param->width_scale_;
            const size_t left_x_index = MSMAX((size_t)floorf(in_x), 0);
            const size_t right_x_index = MSMIN((size_t)ceilf(in_x), out_width - 1);
            const float16_t x_lerp = in_x - floorf(in_x);
            const float16_t inverse_x_lerp = 1.0 - x_lerp;
            out_addr[top_y_index * out_width + left_x_index] +=
              in_addr[h * in_width + w] * (float16_t)(inverse_y_lerp * inverse_x_lerp);
            out_addr[top_y_index * out_width + right_x_index] +=
              in_addr[h * in_width + w] * (float16_t)(inverse_y_lerp * x_lerp);
            out_addr[bottom_y_index * out_width + left_x_index] +=
              in_addr[h * in_width + w] * (float16_t)(y_lerp * inverse_x_lerp);
            out_addr[bottom_y_index * out_width + right_x_index] +=
              in_addr[h * in_width + w] * (float16_t)(y_lerp * x_lerp);
          }
        }
        out_addr += out_hw_size;
        in_addr += in_hw_size;
      }
    }
  }
}

@@ -0,0 +1,45 @@
/* Copyright 2021 Huawei Technologies Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0). */

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_RESIZE_GRAD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_RESIZE_GRAD_H_

#include "nnacl/op_base.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct ResizeFp16GradParameter {
  OpParameter op_parameter_;
  bool align_corners_;
  int method;
  size_t in_height_;
  size_t in_width_;
  size_t out_height_;
  size_t out_width_;
  float16_t height_scale_;
  float16_t width_scale_;
} ResizeFp16GradParameter;

void ResizeNearestNeighborFp16Grad(float16_t *in_addr, float16_t *out_addr, int batch_size, int channel, int format,
                                   ResizeFp16GradParameter *param);
void ResizeBiLinearFp16Grad(float16_t *in_addr, float16_t *out_addr, int batch_size, int channel, int format,
                            ResizeFp16GradParameter *param);
#ifdef __cplusplus
}
#endif
#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_RESIZE_GRAD_H_

@@ -0,0 +1,62 @@
/* Copyright 2021 Huawei Technologies Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0). */

#include "nnacl/fp16_grad/strided_slice_grad.h"
#include "nnacl/errorcode.h"

static size_t CalcIndex(const int *shape, size_t size, int i, size_t pos) {
  size_t res = 1;
  for (size_t j = 0; j < size; j++) {
    res *= shape[(i + 1) + j];
  }
  return (pos / res % shape[i]);
}
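
/* CalcIndex recovers the i-th coordinate of flat offset `pos` in a 7D tensor: divide by
 * the product of the `size` faster-varying dimensions that follow axis i, then reduce
 * modulo the axis length itself. */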

int DoStridedSliceFp16Grad(const float16_t *inputs, float16_t *output, const int *dx_shape,
                           StridedSliceParameter *param) {
  if (inputs == NULL || output == NULL || param == NULL) {
    return NNACL_NULL_PTR;
  }
  if (param->num_axes_ > DIMENSION_7D) {
    return NNACL_PARAM_INVALID;
  }

  size_t size = 1;
  int *s = param->strides_;
  int *b = param->begins_;
  for (int i = 0; i < DIMENSION_7D; i++) {
    size *= param->in_shape_[i];
  }

  for (size_t pos = 0; pos < size; pos++) {
    size_t i = CalcIndex(param->in_shape_, 6, 0, pos);
    size_t j = CalcIndex(param->in_shape_, 5, 1, pos);
    size_t k = CalcIndex(param->in_shape_, 4, 2, pos);
    size_t l = CalcIndex(param->in_shape_, 3, 3, pos);
    size_t m = CalcIndex(param->in_shape_, 2, 4, pos);
    size_t n = CalcIndex(param->in_shape_, 1, 5, pos);
    size_t o = CalcIndex(param->in_shape_, 0, 6, pos);

    size_t input_idx =
      (i * s[0] + b[0]) * dx_shape[1] * dx_shape[2] * dx_shape[3] * dx_shape[4] * dx_shape[5] * dx_shape[6] +
      (j * s[1] + b[1]) * dx_shape[2] * dx_shape[3] * dx_shape[4] * dx_shape[5] * dx_shape[6] +
      (k * s[2] + b[2]) * dx_shape[3] * dx_shape[4] * dx_shape[5] * dx_shape[6] +
      (l * s[3] + b[3]) * dx_shape[4] * dx_shape[5] * dx_shape[6] + (m * s[4] + b[4]) * dx_shape[5] * dx_shape[6] +
      (n * s[5] + b[5]) * dx_shape[6] + (o * s[6] + b[6]);
    output[input_idx] = inputs[pos];
  }
  return NNACL_OK;
}

@@ -0,0 +1,31 @@
/* Copyright 2021 Huawei Technologies Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0). */
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_STRIDED_SLICE_GRAD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_STRIDED_SLICE_GRAD_H_

#include "nnacl/op_base.h"
#include "nnacl/strided_slice_parameter.h"

#ifdef __cplusplus
extern "C" {
#endif
int DoStridedSliceFp16Grad(const float16_t *inputs, float16_t *output, const int *dx_shape,
                           StridedSliceParameter *param);
#ifdef __cplusplus
}
#endif

#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_STRIDED_SLICE_GRAD_H_

@@ -0,0 +1,33 @@
/* Copyright 2021 Huawei Technologies Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0). */
#include "nnacl/fp16_grad/unsorted_segment_sum.h"
#include "nnacl/errorcode.h"

int UnsortedSegmentSumFp16(const float16_t *input, int unit_num, int input_dim1, const int *indices, float16_t *output,
                           int output_dim0, int output_dim1) {
  for (int i = 0; i < unit_num; ++i) {
    int j = i / input_dim1;
    int k = i % input_dim1;

    int index = indices[j];
    if (index < 0 || index >= output_dim0) {
      continue;
    }
    int output_index = index * output_dim1 + k;
    output[output_index] += input[i];
  }
  return NNACL_OK;
}

@@ -0,0 +1,31 @@
/* Copyright 2021 Huawei Technologies Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0). */

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_UNSORTED_SEGMENT_SUM_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_UNSORTED_SEGMENT_SUM_H_

#include "nnacl/op_base.h"

#ifdef __cplusplus
extern "C" {
#endif

int UnsortedSegmentSumFp16(const float16_t *input, int unit_num, int input_dim1, const int *indices, float16_t *output,
                           int output_dim0, int output_dim1);
#ifdef __cplusplus
}
#endif
#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NNACL_FP16_GRAD_UNSORTED_SEGMENT_SUM_H_

@@ -321,7 +321,7 @@ static int Filtergrad2Arm(const float *x, const float *dy, int i_c, int k_idx, f
    dw[(i_c + 1) * k_spatial + k_idx] = sum_2[1];
    i_c += 2;
  }
  return i_c += 2;
  return i_c;
}
#endif
int ConvDwFilterGrad(const float *x, const float *dy, float *dw, int start, int count,
@@ -118,4 +118,11 @@ static inline float16x8_t MS_ERFX8_F16(float16x8_t src) {
  dst[7] = erff(src[7]);
  return dst;
}
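
/* MS_VMLAL_F16 emulates a widening multiply-accumulate: both fp16 vectors are converted
 * to fp32 and folded into the fp32 accumulator with vmlaq_f32, which keeps long
 * reductions from losing precision in half floats. */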

static inline float32x4_t MS_VMLAL_F16(float16x4_t x, float16x4_t dy, float32x4_t sum) {
  float32x4_t x_fp32 = MS_CVT_F32_F16(x);
  float32x4_t dy_fp32 = MS_CVT_F32_F16(dy);
  return vmlaq_f32(sum, x_fp32, dy_fp32);
}

#endif  // MINDSPORE_NNACL_INTRINSICS_MS_SIMD_INSTRUCTIONS_FP16_H_

@@ -39,7 +39,7 @@ OpParameter *PopulatePartialParameter(const void *prim) {
    MS_LOG(ERROR) << "malloc partial parameter failed.";
    return nullptr;
  }
  memset(partial_parameter, 0, sizeof(PartialParameter));
  memset(reinterpret_cast<void *>(partial_parameter), 0, sizeof(PartialParameter));
  partial_parameter->op_parameter_.type_ = schema::PrimitiveType_PartialFusion;

  partial_parameter->sub_graph_index_ = partial_prim->subGraphIndex();

@@ -39,7 +39,7 @@ OpParameter *PopulateWhileParameter(const void *prim) {
    MS_LOG(ERROR) << "malloc WhileParemeter failed.";
    return nullptr;
  }
  memset(while_paremeter, 0, sizeof(WhileParemeter));
  memset(reinterpret_cast<void *>(while_paremeter), 0, sizeof(WhileParemeter));

  while_paremeter->op_parameter_.type_ = schema::PrimitiveType_While;
  while_paremeter->body_subgraph_index = while_prim->bodySubgraphIndex();

@@ -38,7 +38,7 @@ OpParameter *PopulateWhileParemeter(const void *prim) {
    MS_LOG(ERROR) << "malloc WhileParemeter failed.";
    return nullptr;
  }
  memset(param, 0, sizeof(WhileParemeter));
  memset(reinterpret_cast<void *>(param), 0, sizeof(WhileParemeter));

  param->op_parameter_.type_ = primitive->value_type();
  param->body_subgraph_index = value->body_subgraph_index();

@@ -52,10 +52,26 @@ int ActivationGradCPUKernelFp16::DoActivation(int task_id) {
  auto error_code = RET_OK;

  if (param_act_grad_->type_ == schema::ActivationType_RELU) {
    error_code = Fp16ReluGrad(yt_addr + start, input_addr + start, count, output_addr + start);
    error_code = ReluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start);
  } else if (param_act_grad_->type_ == schema::ActivationType_RELU6) {
    error_code = Relu6Fp16Grad(yt_addr + start, input_addr + start, count, output_addr + start);
  } else if (param_act_grad_->type_ == schema::ActivationType_LEAKY_RELU) {
    error_code = LReluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start,
                               (float16_t)param_act_grad_->alpha_);
  } else if (param_act_grad_->type_ == schema::ActivationType_SIGMOID) {
    // Sigmoid gets the input tensors in reverse order!
    error_code = Fp16SigmoidGrad(input_addr + start, yt_addr + start, count, output_addr + start);
    error_code = SigmoidFp16Grad(input_addr + start, yt_addr + start, count, output_addr + start);
  } else if (param_act_grad_->type_ == schema::ActivationType_TANH) {
    error_code = TanhFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start);
  } else if (param_act_grad_->type_ == schema::ActivationType_HSWISH) {
    error_code = HSwishFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start);
  } else if (param_act_grad_->type_ == schema::ActivationType_HSIGMOID) {
    error_code = HSigmoidFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start);
  } else if (param_act_grad_->type_ == schema::ActivationType_ELU) {
    error_code =
      EluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start, (float16_t)param_act_grad_->alpha_);
  } else if (param_act_grad_->type_ == schema::ActivationType_GELU) {
    error_code = GeluFp16Grad(yt_addr + start, input_addr + start, count, output_addr + start);
  } else {
    MS_LOG(ERROR) << "Activation type error";
    return RET_ERROR;

@@ -14,12 +14,13 @@
 * limitations under the License.
 */

#ifndef MINDSPORE_ACTIVATION_FP16_GRAD_H
#define MINDSPORE_ACTIVATION_FP16_GRAD_H
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_ACTIVATION_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_ACTIVATION_FP16_GRAD_H_

#include <vector>
#include "src/inner_kernel.h"
#include "nnacl/fp16_grad/activation_grad.h"
#include "nnacl/fp32_grad/activation_grad.h"

namespace mindspore::kernel {
class ActivationGradCPUKernelFp16 : public InnerKernel {

@@ -27,7 +28,7 @@ class ActivationGradCPUKernelFp16 : public InnerKernel {
  explicit ActivationGradCPUKernelFp16(OpParameter *param, const std::vector<lite::Tensor *> &inputs,
                                       const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
      : InnerKernel(param, inputs, outputs, ctx), thread_count_(ctx->thread_num_) {
    param_act_grad_ = reinterpret_cast<ActivationGradParameterFp16 *>(param);
    param_act_grad_ = reinterpret_cast<ActivationGradParameter *>(param);
  }
  ~ActivationGradCPUKernelFp16() override = default;

@@ -37,9 +38,9 @@ class ActivationGradCPUKernelFp16 : public InnerKernel {
  int DoActivation(int task_id);

 private:
  ActivationGradParameterFp16 *param_act_grad_;
  ActivationGradParameter *param_act_grad_;
  int thread_count_;
};
}  // namespace mindspore::kernel

#endif  // MINDSPORE_ACTIVATION_FP16_GRAD_H
#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_ACTIVATION_FP16_GRAD_H_
@@ -0,0 +1,89 @@
/* Copyright 2021 Huawei Technologies Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0). */

#include "src/runtime/kernel/arm/fp16_grad/arithmetic_fp16_grad.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "nnacl/fp16_grad/arithmetic_grad.h"
#include "include/errorcode.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;

namespace mindspore::kernel {

int ArithmeticGradCPUKernelFp16::Init() { return RET_OK; }
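
/* MaximumGrad/MinimumGrad route the incoming gradient dy to whichever operand won the
 * elementwise comparison; judging by their shape arguments, MaximumByAxesFp16 /
 * MinimumByAxesFp16 also reduce over broadcast axes so dx1/dx2 come out in the original
 * input shapes. */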

void ArithmeticGradCPUKernelFp16::ArithmeticGradMaximum(float16_t *dy, int dy_size, float16_t *dx1, int dx1_size,
                                                        float16_t *dx2, int dx2_size) {
  auto x1 = reinterpret_cast<float16_t *>(in_tensors_[0]->data_c());
  auto x2 = reinterpret_cast<float16_t *>(in_tensors_[1]->data_c());
  dy = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c());

  MaximumByAxesFp16(x1, x2, dy, arithmeticParameter_->in_shape0_, arithmeticParameter_->in_shape1_,
                    arithmeticParameter_->out_shape_, dx1, dx2, arithmeticParameter_->ndim_);
}

void ArithmeticGradCPUKernelFp16::ArithmeticGradMinimum(float16_t *dy, int dy_size, float16_t *dx1, int dx1_size,
                                                        float16_t *dx2, int dx2_size) {
  auto x1 = reinterpret_cast<float16_t *>(in_tensors_[0]->data_c());
  auto x2 = reinterpret_cast<float16_t *>(in_tensors_[1]->data_c());
  dy = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c());

  MinimumByAxesFp16(x1, x2, dy, arithmeticParameter_->in_shape0_, arithmeticParameter_->in_shape1_,
                    arithmeticParameter_->out_shape_, dx1, dx2, arithmeticParameter_->ndim_);
}

int ArithmeticGradCPUKernelFp16::ReSize() { return RET_OK; }

int ArithmeticGradCPUKernelFp16::Execute(int task_id) {
  auto dy = reinterpret_cast<float16_t *>(in_tensors_[0]->data_c());
  auto dx1 = reinterpret_cast<float16_t *>(out_tensors_[0]->data_c());
  auto dx2 = reinterpret_cast<float16_t *>(out_tensors_[1]->data_c());

  size_t dy_size = in_tensors_.at(0)->ElementsNum();
  size_t dx1_size = out_tensors_.at(0)->ElementsNum();
  size_t dx2_size = out_tensors_.at(1)->ElementsNum();
  (this->*arithmetic_grad_)(dy, dy_size, dx1, dx1_size, dx2, dx2_size);
  return RET_OK;
}

int ArithmeticGradRunFp16(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
  MS_ASSERT(cdata != nullptr);
  auto Arithmetic_kernel = reinterpret_cast<ArithmeticGradCPUKernelFp16 *>(cdata);
  auto error_code = Arithmetic_kernel->Execute(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "ArithmeticGradRunFp16 error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

int ArithmeticGradCPUKernelFp16::Run() {
  int error_code = static_cast<const lite::InnerContext *>(this->context_)
                     ->thread_pool_->ParallelLaunch(ArithmeticGradRunFp16, this, 1);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "Arithmetic Grad function error error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_MaximumGrad, LiteKernelCreator<ArithmeticGradCPUKernelFp16>)
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_MinimumGrad, LiteKernelCreator<ArithmeticGradCPUKernelFp16>)
}  // namespace mindspore::kernel

@@ -0,0 +1,80 @@
/* Copyright 2020 Huawei Technologies Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0). */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_ARITHMETIC_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_ARITHMETIC_FP16_GRAD_H_

#include <vector>
#include "src/inner_kernel.h"
#include "nnacl/fp16/arithmetic_fp16.h"
#include "schema/model_generated.h"

using mindspore::schema::PrimitiveType_AddGrad;
using mindspore::schema::PrimitiveType_DivGrad;
using mindspore::schema::PrimitiveType_MaximumGrad;
using mindspore::schema::PrimitiveType_MinimumGrad;
using mindspore::schema::PrimitiveType_MulGrad;
using mindspore::schema::PrimitiveType_SubGrad;

namespace mindspore::kernel {

class ArithmeticGradCPUKernelFp16;

class ArithmeticGradCPUKernelFp16 : public InnerKernel {
  typedef void (ArithmeticGradCPUKernelFp16::*ArithmeticGradOperation)(float16_t *, int, float16_t *, int, float16_t *,
                                                                       int);

 public:
  explicit ArithmeticGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                                       const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
      : InnerKernel(parameter, inputs, outputs, ctx), tile_data0(NULL), tile_data1(NULL), tile_data2(NULL) {
    switch (type()) {
      case PrimitiveType_MaximumGrad:
        arithmetic_grad_ = &ArithmeticGradCPUKernelFp16::ArithmeticGradMaximum;
        break;
      case PrimitiveType_MinimumGrad:
        arithmetic_grad_ = &ArithmeticGradCPUKernelFp16::ArithmeticGradMinimum;
        break;
      default:
        MS_LOG(ERROR) << "Error Operator type " << parameter->type_;
        break;
    }
    arithmeticParameter_ = reinterpret_cast<ArithmeticParameter *>(parameter);
  }
  ~ArithmeticGradCPUKernelFp16() override {
    if (tile_data0) delete[] tile_data0;
    if (tile_data1) delete[] tile_data1;
    if (tile_data2) delete[] tile_data2;
  }

  int Init() override;
  int InferShape();
  int ReSize() override;
  int Run() override;
  int Execute(int task_id);

 private:
  void ArithmeticGradMaximum(float16_t *dy, int dy_size, float16_t *dx1, int dx1_size, float16_t *dx2, int dx2_size);
  void ArithmeticGradMinimum(float16_t *dy, int dy_size, float16_t *dx1, int dx1_size, float16_t *dx2, int dx2_size);
  ArithmeticParameter *arithmeticParameter_;
  ArithmeticGradOperation arithmetic_grad_;
  float16_t *tile_data0;
  float16_t *tile_data1;
  float16_t *tile_data2;
};
}  // namespace mindspore::kernel

#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_ARITHMETIC_FP16_GRAD_H_

@@ -0,0 +1,96 @@
/* Copyright 2021 Huawei Technologies Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0). */

#include <vector>
#include "src/runtime/kernel/arm/fp16_grad/bias_fp16_grad.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_BiasAddGrad;

namespace mindspore::kernel {

int BiasGradCPUKernelFp16::ReSize() {
  auto dims = in_tensors_[0]->shape();
  bias_param->ndim_ = dims.size();
  for (unsigned int i = 0; i < bias_param->ndim_; i++) {
    bias_param->in_shape0_[i] = dims[i];
    bias_param->out_shape_[i] = 1;  // reduce the N, H and W dimensions to 1
  }
  bias_param->out_shape_[bias_param->ndim_ - 1] = dims[bias_param->ndim_ - 1];
  for (auto i = bias_param->ndim_; i < 4; i++) {
    bias_param->in_shape0_[i] = 0;
    bias_param->out_shape_[i] = 0;
  }
  return RET_OK;
}

int BiasGradCPUKernelFp16::Init() {
  if (!InferShapeDone()) {
    return RET_OK;
  }
  return ReSize();
}

int BiasGradCPUKernelFp16::Execute(int task_id) {
  auto in = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
  auto out = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());

  size_t nhw_size = 1;
  size_t channels = bias_param->in_shape0_[bias_param->ndim_ - 1];  // C in NHWC
  for (unsigned int i = 0; i < bias_param->ndim_ - 1; i++) {
    nhw_size *= bias_param->in_shape0_[i];
  }

  size_t total_size = channels * nhw_size;
  for (size_t c = 0; c < channels; ++c) {
    float sum = 0;
    for (size_t offset = 0; offset < total_size; offset += channels) {
      sum += in[offset + c];
    }
    out[c] = (float16_t)sum;
  }

  return RET_OK;
}
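
/* BiasAddGrad reduces the incoming NHWC gradient over all batch and spatial positions:
 * out[c] = sum over n,h,w of in[n,h,w,c], accumulated in fp32 before narrowing to
 * float16. */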

int BiasGradFp16Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
  auto bias_kernel = reinterpret_cast<BiasGradCPUKernelFp16 *>(cdata);
  auto error_code = bias_kernel->Execute(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "bias error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

int BiasGradCPUKernelFp16::Run() {
  int error_code =
    static_cast<const lite::InnerContext *>(this->context_)->thread_pool_->ParallelLaunch(BiasGradFp16Run, this, 1);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "bias function error error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_BiasAddGrad, LiteKernelCreator<BiasGradCPUKernelFp16>)
}  // namespace mindspore::kernel

@@ -0,0 +1,44 @@
/* Copyright 2021 Huawei Technologies Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0). */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_BIAS_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_BIAS_FP16_GRAD_H_

#include <vector>
#include "src/lite_kernel.h"
#include "nnacl/fp16/arithmetic_fp16.h"

namespace mindspore::kernel {
class BiasGradCPUKernelFp16 : public InnerKernel {
 public:
  explicit BiasGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                                 const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
      : InnerKernel(parameter, inputs, outputs, ctx) {
    bias_param = reinterpret_cast<ArithmeticParameter *>(parameter);
  }
  ~BiasGradCPUKernelFp16() override = default;

  int Init() override;
  int ReSize() override;
  int Run() override;
  int Execute(int task_id);

 private:
  ArithmeticParameter *bias_param;
};
}  // namespace mindspore::kernel

#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_BIAS_FP16_GRAD_H_
|
|
@@ -0,0 +1,166 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/runtime/kernel/arm/fp16_grad/bn_fp16_grad.h"
#include <math.h>
#include <algorithm>
#include <vector>
#include <string>
#include <thread>
#include <fstream>

#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "nnacl/fp16_grad/batch_norm.h"
#include "include/errorcode.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_BatchNormGrad;

namespace mindspore::kernel {
int BNGradCPUKernelFp16::ReSize() {
  auto *input_x = in_tensors_.at(1);
  int channels = input_x->shape().at(kNHWC_C);
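  // The workspace holds two per-channel fp32 reduction buffers (dxhat_sum and
  // dxhathat_sum), hence 2 * channels elements.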
  ws_size_ = 2 * channels;
  set_workspace_size(ws_size_ * sizeof(float));
  return RET_OK;
}

int BNGradCPUKernelFp16::Init() {
  for (size_t i = 0; i < in_tensors_.size(); i++) {
    if (in_tensors_.at(i)->data_type() != kNumberTypeFloat16) {
      MS_LOG(ERROR) << "BNGradCPUKernelFp16 type error in_tensor_[" << i << "]";
      return RET_ERROR;
    }
  }
  return ReSize();
}

int BNGradCPUKernelFp16::Execute(int task_id) {
  auto *input_yt = in_tensors_.at(0);
  auto *input_x = in_tensors_.at(1);
  auto *input_scale = in_tensors_.at(2);
  auto *input_mean = in_tensors_.at(3);
  auto *input_var = in_tensors_.at(4);

  auto kernel_name = this->name();
  if (kernel_name.find("FusedBatchNormGradCPU") != std::string::npos) {
    input_mean = in_tensors_.at(4);
    input_var = in_tensors_.at(5);
  }
  auto bn_param = reinterpret_cast<BNGradParameter *>(op_parameter_);
  int stage = stage_;
  int thread_num = thread_num_;
  float16_t *save_mean = reinterpret_cast<float16_t *>(input_mean->data_c());
  float16_t *save_var = reinterpret_cast<float16_t *>(input_var->data_c());

  auto *output_dx = out_tensors_.at(0);
  auto *output_scale = out_tensors_.at(1);
  auto *output_bias = out_tensors_.at(2);
  int32_t batch = input_x->Batch();
  int32_t channels = input_x->Channel();
  int32_t spatial = input_x->Height() * input_x->Width();

  float *workspace_temp = static_cast<float *>(workspace());
  float *dxhat_sum = workspace_temp;
  float *dxhathat_sum = dxhat_sum + channels;
  float16_t *x = reinterpret_cast<float16_t *>(input_x->data_c());
  float16_t *yt = reinterpret_cast<float16_t *>(input_yt->data_c());
  float16_t *scale = reinterpret_cast<float16_t *>(input_scale->data_c());
  float16_t *dx = reinterpret_cast<float16_t *>(output_dx->data_c());
  float16_t *dbias = reinterpret_cast<float16_t *>(output_bias->data_c());
  float16_t *dscale = reinterpret_cast<float16_t *>(output_scale->data_c());
  int total = spatial * batch;
  int stride = UP_DIV(total, thread_num);
  int count = MSMIN(stride, total - stride * task_id);
  count = (count < 0) ? 0 : count;
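  // Three-stage pipeline: stage 0 converts the saved variance to an inverse
  // stddev and zero-fills the accumulators (a single-threaded run does the
  // whole backward pass here); stage 1 reduces the dxhat sums, dbias and
  // dscale; stage 2 computes dx over per-thread batch slices.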
  switch (stage) {
    case 0: {
      for (int job = task_id; job < 4; job += thread_num) {
        switch (job) {
          case 0:
            var2InvarFp16(save_var, input_var->ElementsNum(), bn_param->epsilon_);
            break;
          case 1:
            std::fill(workspace_temp, workspace_temp + ws_size_, 0.f);
            break;
          case 2:
            std::fill(dbias, dbias + channels, 0.f);
            break;
          case 3:
            std::fill(dscale, dscale + channels, 0.f);
            break;
        }
      }
      if (thread_num == 1) {
        backwardAllFp16(x, yt, save_mean, save_var, scale, total, channels, dxhat_sum, dxhathat_sum, dbias, dscale, dx);
      }
      break;
    }
    case 1: {
      backwardP1Fp16(x, yt, save_mean, save_var, scale, total, channels, dxhat_sum, dxhathat_sum, dbias, dscale);
      break;
    }
    case 2: {
      backwardP2Fp16(x + task_id * stride * channels, yt + task_id * stride * channels, save_mean, save_var, scale,
                     count, total, channels, dxhat_sum, dxhathat_sum, dx + task_id * stride * channels);
      break;
    }
  }

  return RET_OK;
}

int BNGradFp16Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
  MS_ASSERT(cdata != nullptr);
  auto bn_kernel = reinterpret_cast<BNGradCPUKernelFp16 *>(cdata);
  auto error_code = bn_kernel->Execute(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "BNGradRun error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

int BNGradCPUKernelFp16::Run() {
  stage_ = 0;
  thread_num_ = context_->thread_num_;
  if (thread_num_ == 1) {
    int error_code = static_cast<const lite::InnerContext *>(this->context_)
                       ->thread_pool_->ParallelLaunch(BNGradFp16Run, this, thread_num_);
    if (error_code != RET_OK) {
      MS_LOG(ERROR) << "BN function error error_code[" << error_code << "]";
      return RET_ERROR;
    }
  } else {
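    // Multi-threaded path: stage 0 (init) and stage 2 (dx) fan out across all
    // threads, while stage 1 keeps the dbias/dscale reductions on a single
    // thread so they need no locking.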
    const std::vector<int> threads = {thread_num_, 1, thread_num_};
    for (size_t stage = 0; stage < threads.size(); stage++) {
      stage_ = static_cast<int>(stage);
      int error_code = static_cast<const lite::InnerContext *>(this->context_)
                         ->thread_pool_->ParallelLaunch(BNGradFp16Run, this, threads.at(stage));
      if (error_code != RET_OK) {
        MS_LOG(ERROR) << "BN function error error_code[" << error_code << "]";
        return RET_ERROR;
      }
    }
  }
  return RET_OK;
}

REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_BatchNormGrad, LiteKernelCreator<BNGradCPUKernelFp16>)
}  // namespace mindspore::kernel

@@ -0,0 +1,43 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_BN_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_BN_FP16_GRAD_H_

#include <vector>
#include "src/lite_kernel.h"
#include "nnacl/fp32_grad/batch_norm.h"

namespace mindspore::kernel {

class BNGradCPUKernelFp16 : public InnerKernel {
 public:
  explicit BNGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                               const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
      : InnerKernel(parameter, inputs, outputs, ctx) {}
  ~BNGradCPUKernelFp16() override {}
  int Init() override;
  int ReSize() override;
  int Run() override;
  int Execute(int task_id);

 private:
  int thread_num_ = 1;
  int stage_ = 0;
  size_t ws_size_ = 0;
};
}  // namespace mindspore::kernel
#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_BN_FP16_GRAD_H_
@@ -0,0 +1,205 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/runtime/kernel/arm/fp16_grad/convolution_fp16_grad_filter.h"
#include "src/kernel_registry.h"
#include "nnacl/pack.h"
#include "nnacl/fp16_grad/convolution_grad_filter.h"
#include "nnacl/fp16_grad/pack_fp16_ext.h"
#include "nnacl/fp16_grad/gemm_fp16.h"
#include "include/errorcode.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Conv2DBackpropFilterFusion;

namespace mindspore::kernel {
int ConvolutionGradFilterCPUKernelFp16::ReSize() {
  // dy is in input 0
  // x is in input 1
  // dw is output 0

  auto *x_tensor = in_tensors_.at(1);
  MS_ASSERT(x_tensor != nullptr);
  auto *dy_tensor = in_tensors_.at(0);
  MS_ASSERT(dy_tensor != nullptr);

  auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter_);
  conv_param->output_batch_ = dy_tensor->shape().at(kNHWC_N);
  conv_param->input_batch_ = x_tensor->shape().at(kNHWC_N);
  conv_param->input_h_ = x_tensor->shape().at(kNHWC_H);
  conv_param->input_w_ = x_tensor->shape().at(kNHWC_W);
  // assume OutCh|kh|kw|InCh
  conv_param->input_channel_ = x_tensor->shape().at(kNHWC_C);
  conv_param->output_channel_ = dy_tensor->shape().at(kNHWC_C);

  conv_param->output_h_ = dy_tensor->shape()[kNHWC_H];
  conv_param->output_w_ = dy_tensor->shape()[kNHWC_W];

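  // im2col is skipped only for the plain 1x1 case (stride 1, no padding or
  // dilation, single group), where dy can be multiplied against the raw input
  // directly; depthwise convs (group == in == out channels) take a dedicated
  // path in Execute().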
  do_img2col_ = !((conv_param->kernel_h_ == 1) && (conv_param->kernel_w_ == 1) && (conv_param->pad_d_ == 0) &&
                  (conv_param->pad_u_ == 0) && (conv_param->pad_l_ == 0) && (conv_param->pad_r_ == 0) &&
                  (conv_param->dilation_h_ == 1) && (conv_param->dilation_w_ == 1) && (conv_param->stride_h_ == 1) &&
                  (conv_param->stride_w_ == 1) && (conv_param->group_ == 1));
  do_dw_ = (conv_param->output_channel_ == conv_param->group_) &&
           (conv_param->input_channel_ == conv_param->output_channel_) && (conv_param->dilation_h_ == 1) &&
           (conv_param->dilation_w_ == 1);

  ws_size_ = chunk_ * conv_param->kernel_h_ * conv_param->kernel_w_ * conv_param->input_channel_;
  ws_size_ = do_dw_ ? ws_size_ : ws_size_ / conv_param->group_;
  int n = conv_param->kernel_h_ * conv_param->kernel_w_ * conv_param->input_channel_ / conv_param->group_;
  int k = conv_param->output_channel_ / conv_param->group_;
  int thread_num = context_->thread_num_;
  mat_alloc_ = MatSizeTotalFp16(k, n, chunk_, 0);
  set_workspace_size((ws_size_ + mat_alloc_ + (k * n)) * thread_num * sizeof(float16_t));

  return RET_OK;
}

int ConvolutionGradFilterCPUKernelFp16::Init() { return ReSize(); }

int ConvolutionGradFilterCPUKernelFp16::Execute(int task_id) {
  auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter_);
  auto *input_dy = in_tensors_.at(0);
  auto *input_x = in_tensors_.at(1);
  auto *out_dw = out_tensors_.at(0);

  auto x_addr = reinterpret_cast<float16_t *>(input_x->data_c());
  auto dy_addr = reinterpret_cast<float16_t *>(input_dy->data_c());
  auto dw_addr = reinterpret_cast<float16_t *>(out_dw->data_c());

  int nweights = out_dw->ElementsNum();
  int in_ch = conv_param->input_channel_;
  int in_h = conv_param->input_h_;
  int in_w = conv_param->input_w_;
  int k_h = conv_param->kernel_h_;
  int k_w = conv_param->kernel_w_;
  int batch = conv_param->output_batch_;
  int out_ch = conv_param->output_channel_;
  int groups = conv_param->group_;
  int out_h = conv_param->output_h_;
  int out_w = conv_param->output_w_;

  int m = out_h * out_w;
  int n = k_h * k_w * in_ch / groups;
  int k = out_ch / groups;
  int thread_num = context_->thread_num_;
  float16_t *workspace_temp = reinterpret_cast<float16_t *>(workspace());
  float16_t *mat_workspace = workspace_temp + ws_size_ * thread_num + task_id * (mat_alloc_ + k * n);
  float16_t *mat_tmp = mat_workspace + mat_alloc_;
  int stride = UP_DIV(batch, thread_num);
  int count = MSMIN(stride, batch - stride * task_id);
  count = (count < 0) ? 0 : count;
  int start = stride * task_id;
  int end = start + count;

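  // All three branches work on chunk_-row slabs of dy: pack the matching input
  // patch into the per-thread workspace where needed, run GemmMatmulFp16 on the
  // slab, and accumulate the partial result into dw (taking lock_ when threads
  // split over the batch rather than over groups or kernel taps).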
  if (do_dw_) {
#ifdef ENABLE_ARM
    stride = UP_DIV(k_h * k_w, thread_num);
    count = MSMIN(stride, k_h * k_w - stride * task_id);
    count = (count < 0) ? 0 : count;
    start = stride * task_id;
    ConvDwFilterFp16Grad(x_addr, dy_addr, dw_addr, start, count, conv_param);
#else
    stride = UP_DIV(groups, thread_num);
    count = MSMIN(stride, groups - stride * task_id);
    start = stride * task_id;
    end = start + count;

    const int kernel_spatial = k_h * k_w;
    for (int i = 0; i < batch; ++i) {
      for (int ci = 0; ci < m; ci += chunk_) {
        int real_chunk = MSMIN(m - ci, chunk_);
        float16_t *mat_b = workspace_temp + task_id * ws_size_;
        float16_t *im = x_addr + (i * in_ch * in_h * in_w);
        RollingIm2ColPackDwUnitFp16(im, conv_param, mat_b, real_chunk, ci);
        for (int j = start; j < end; ++j) {
          float16_t *mat_a = dy_addr + (i * groups) * m * k + j * (out_ch / groups) + ci * out_ch;
          float16_t *mat_c = dw_addr + j * nweights / groups;
          GemmMatmulFp16(1, 0, k, n, real_chunk, 1, mat_a, out_ch, mat_b + (j * kernel_spatial), n * groups, 1, mat_c,
                         n, mat_workspace);
        }
      }
    }
#endif
  } else if (do_img2col_) {
    for (int i = start; i < end; ++i) {
      for (int ci = 0; ci < m; ci += chunk_) {
        for (int j = 0; j < groups; ++j) {
          int real_chunk = MSMIN(m - ci, chunk_);
          float16_t *mat_a = dy_addr + (i * groups) * m * k + j * (out_ch / groups) + ci * out_ch;
          float16_t *mat_b = workspace_temp + task_id * ws_size_;
          float16_t *mat_c = dw_addr + j * nweights / groups;
          float16_t *im = x_addr + (i * in_ch * in_h * in_w) + j * (in_ch / groups);
          RollingIm2ColPackUnitFp16(im, conv_param, mat_b, real_chunk, ci);
          GemmMatmulFp16(1, 0, k, n, real_chunk, 1, mat_a, out_ch, mat_b, n, 0, mat_tmp, n, mat_workspace);
          std::unique_lock<std::mutex> merge_lock(lock_);
          AddMatrixFp16(mat_tmp, mat_c, 1, k, n, n);
        }
      }
    }
  } else {
    float16_t *mat_c = dw_addr;
    const size_t in_plane_size = in_ch * in_h * in_w;
    for (int i = start; i < end; ++i) {
      for (int ci = 0; ci < m; ci += chunk_) {
        int real_chunk = MSMIN(m - ci, chunk_);
        float16_t *mat_a = dy_addr + i * m * k + ci * out_ch;
        float16_t *im = x_addr + i * in_plane_size;
        int input_h = ci / out_w * conv_param->stride_h_;
        int input_w = ci % out_w * conv_param->stride_w_;
        int offset = (input_h * in_w + input_w) * in_ch;
        GemmMatmulFp16(1, 0, k, n, real_chunk, 1, mat_a, out_ch, im + offset, n, 0, mat_tmp, n, mat_workspace);
        std::unique_lock<std::mutex> merge_lock(lock_);
        AddMatrixFp16(mat_tmp, mat_c, 1, k, n, n);
      }
    }
  }
  return RET_OK;
}

int ConvolutionGradFilterFp16Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
  MS_ASSERT(cdata != nullptr);
  auto convfilter_kernel = reinterpret_cast<ConvolutionGradFilterCPUKernelFp16 *>(cdata);
  auto error_code = convfilter_kernel->Execute(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "ConvolutionGradFilterRun error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

int ConvolutionGradFilterCPUKernelFp16::Run() {
  auto *out_dw = out_tensors_.at(0);
  auto dw_addr = reinterpret_cast<float16_t *>(out_dw->data_c());
  memset(dw_addr, 0, out_dw->Size());
  int error_code = static_cast<const lite::InnerContext *>(this->context_)
                     ->thread_pool_->ParallelLaunch(ConvolutionGradFilterFp16Run, this, context_->thread_num_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "conv filter function error error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Conv2DBackpropFilterFusion,
           LiteKernelCreator<ConvolutionGradFilterCPUKernelFp16>)
}  // namespace mindspore::kernel

@@ -0,0 +1,51 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_CONVOLUTION_FP16_GRAD_FILTER_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_CONVOLUTION_FP16_GRAD_FILTER_H_

#include <mutex>
#include <vector>
#include "src/inner_kernel.h"

namespace mindspore::kernel {
class ConvolutionGradFilterCPUKernelFp16 : public InnerKernel {
 public:
  explicit ConvolutionGradFilterCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                                              const std::vector<lite::Tensor *> &outputs,
                                              const lite::InnerContext *ctx)
      : InnerKernel(parameter, inputs, outputs, ctx) {}
  ~ConvolutionGradFilterCPUKernelFp16() override {}

  int Init() override;
  int ReSize() override;
  int Run() override;
  int Execute(int task_id);

 private:
  size_t ws_size_ = 0;
  bool do_img2col_ = true;
  bool do_dw_ = false;
  std::mutex lock_;
  size_t mat_alloc_ = 0;
#ifdef ENABLE_ARM32
  const int chunk_ = C4NUM * 2;
#else
  const int chunk_ = C12NUM * 2;
#endif
};
}  // namespace mindspore::kernel

#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_CONVOLUTION_FP16_GRAD_FILTER_H_
@@ -0,0 +1,172 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/runtime/kernel/arm/fp16_grad/convolution_fp16_grad_input.h"
#include "src/kernel_registry.h"
#include "nnacl/pack.h"
#include "nnacl/fp16_grad/pack_fp16_ext.h"
#include "nnacl/fp16_grad/gemm_fp16.h"
#include "include/errorcode.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Conv2DBackpropInputFusion;

namespace mindspore::kernel {
int ConvolutionGradInputCPUKernelFp16::ReSize() {
  auto *dy_tensor = in_tensors_.at(kInputIndex);
  MS_ASSERT(dy_tensor != nullptr);
  auto *weight_tensor = in_tensors_.at(kWeightIndex);
  MS_ASSERT(weight_tensor != nullptr);
  auto *dx_tensor = out_tensors_.at(kOutputIndex);
  MS_ASSERT(dx_tensor != nullptr);

  auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter_);
  conv_param->output_batch_ = dx_tensor->shape()[kNHWC_N];
  conv_param->input_batch_ = dy_tensor->shape()[kNHWC_N];

  conv_param->input_h_ = dx_tensor->shape()[kNHWC_H];
  conv_param->input_w_ = dx_tensor->shape()[kNHWC_W];

  // assume OutCh|kh|kw|InCh
  conv_param->input_channel_ = dx_tensor->shape()[kNHWC_C];
  conv_param->output_channel_ = weight_tensor->shape()[kNHWC_N];

  conv_param->output_h_ = dy_tensor->shape()[kNHWC_H];
  conv_param->output_w_ = dy_tensor->shape()[kNHWC_W];
  ws_size_ = chunk_ * conv_param->kernel_h_ * conv_param->kernel_w_ * conv_param->input_channel_ / conv_param->group_;

  int n = conv_param->kernel_w_ * conv_param->kernel_h_ * conv_param->input_channel_ / conv_param->group_;
  int k = conv_param->output_channel_ / conv_param->group_;
  int thread_num = context_->thread_num_;
  mat_alloc_ = MatSizeTotalFp16(chunk_, n, k, 0);
  set_workspace_size((ws_size_ + mat_alloc_) * sizeof(float16_t) * thread_num);

  do_img2col_ = !((conv_param->kernel_h_ == 1) && (conv_param->kernel_w_ == 1) && (conv_param->pad_d_ == 0) &&
                  (conv_param->pad_u_ == 0) && (conv_param->pad_l_ == 0) && (conv_param->pad_r_ == 0) &&
                  (conv_param->dilation_h_ == 1) && (conv_param->dilation_w_ == 1) && (conv_param->stride_h_ == 1) &&
                  (conv_param->stride_w_ == 1) && (conv_param->group_ == 1));

  return RET_OK;
}

int ConvolutionGradInputCPUKernelFp16::Init() { return ReSize(); }

int ConvolutionGradInputCPUKernelFp16::Execute(int task_id) {
  auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter_);
  auto *input_dy = in_tensors_.at(0);
  auto *input_w = in_tensors_.at(1);
  auto *out_dx = out_tensors_.at(0);

  auto dy_addr = reinterpret_cast<float16_t *>(input_dy->data_c());
  auto w_addr = reinterpret_cast<float16_t *>(input_w->data_c());
  auto dx_addr = reinterpret_cast<float16_t *>(out_dx->data_c());

  int nweights = input_w->ElementsNum();
  int in_ch = conv_param->input_channel_;
  int in_h = conv_param->input_h_;
  int in_w = conv_param->input_w_;
  int k_h = conv_param->kernel_h_;
  int k_w = conv_param->kernel_w_;
  int batch = conv_param->output_batch_;
  int out_ch = conv_param->output_channel_;
  int groups = conv_param->group_;
  int out_h = conv_param->output_h_;
  int out_w = conv_param->output_w_;
  int thread_num = context_->thread_num_;
  int m = out_h * out_w;
  int n = k_w * k_h * in_ch / groups;
  int k = out_ch / groups;
  float16_t *workspace_temp = reinterpret_cast<float16_t *>(workspace()) + task_id * (mat_alloc_ + ws_size_);
  float16_t *mat_workspace = workspace_temp + ws_size_;
  int stride = UP_DIV(batch, thread_num);
  int count = MSMIN(stride, batch - stride * task_id);
  count = (count < 0) ? 0 : count;
  int start = stride * task_id;
  int end = start + count;

  for (int i = start; i < end; ++i) {
    for (int j = 0; j < groups; ++j) {
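      // GemmCbFp16 carries GEMM callback state across chunks: the first chunk
      // (ci == 0) feeds the raw weights, after which gcb.mat_b points at the
      // packed copy that later chunks reuse (gcb.cb == 1).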
      GemmCbFp16 gcb;
      for (int ci = 0; ci < m; ci += chunk_) {
        float16_t *mat_b = nullptr;
        if (ci == 0) {
          mat_b = w_addr + j * nweights / groups;
          gcb.ca = 0;
          gcb.cb = 0;
          gcb.bias = nullptr;
          gcb.atype = ActType_No;
        } else {
          mat_b = gcb.mat_b;
          gcb.cb = 1;
        }
        int real_chunk = MSMIN(m - ci, chunk_);
        float16_t *mat_a = dy_addr + (i * groups) * m * k + j * (out_ch / groups) + ci * out_ch;
        if (do_img2col_) {
          float16_t *mat_c = workspace_temp;
          GemmMatmulPlusFp16(0, 0, real_chunk, n, k, 1, mat_a, out_ch, mat_b, n, 0, mat_c, n, mat_workspace, &gcb);
          RollingCol2ImPackUnitFp16(mat_c,
                                    dx_addr + (i * groups) * (in_ch / groups) * in_h * in_w + j * (in_ch / groups),
                                    conv_param, real_chunk, ci);
        } else {
          float16_t *mat_c =
            dx_addr + (i * groups) * (in_ch / groups) * in_h * in_w + j * (in_ch / groups) + ci * (in_ch / groups);
          GemmMatmulPlusFp16(0, 0, real_chunk, n, k, 1, mat_a, out_ch, mat_b, n, 0, mat_c, n, mat_workspace, &gcb);
        }
      }
    }
  }

  return RET_OK;
}

int ConvolutionGradInputFp16Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
  MS_ASSERT(cdata != nullptr);
  auto convinput_kernel = reinterpret_cast<ConvolutionGradInputCPUKernelFp16 *>(cdata);
  auto error_code = convinput_kernel->Execute(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "conv input error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

int ConvolutionGradInputCPUKernelFp16::Run() {
  auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter_);
  int batch = conv_param->output_batch_;
  int in_ch = conv_param->input_channel_;
  int in_h = conv_param->input_h_;
  int in_w = conv_param->input_w_;
  auto *out_dx = out_tensors_.at(0);
  auto dx_addr = reinterpret_cast<float16_t *>(out_dx->data_c());
  memset(dx_addr, 0, sizeof(float16_t) * batch * in_ch * in_h * in_w);
  int error_code = static_cast<const lite::InnerContext *>(this->context_)
                     ->thread_pool_->ParallelLaunch(ConvolutionGradInputFp16Run, this, context_->thread_num_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "conv input function error error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Conv2DBackpropInputFusion,
           LiteKernelCreator<ConvolutionGradInputCPUKernelFp16>)
}  // namespace mindspore::kernel

@@ -0,0 +1,44 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_CONVOLUTION_FP16_GRAD_INPUT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_CONVOLUTION_FP16_GRAD_INPUT_H_

#include <vector>
#include "src/inner_kernel.h"

namespace mindspore::kernel {
class ConvolutionGradInputCPUKernelFp16 : public InnerKernel {
 public:
  explicit ConvolutionGradInputCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                                             const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
      : InnerKernel(parameter, inputs, outputs, ctx) {}
  ~ConvolutionGradInputCPUKernelFp16() override {}

  int Init() override;
  int ReSize() override;
  int Run() override;
  int Execute(int task_id);

 private:
  size_t ws_size_ = 0;
  size_t mat_alloc_ = 0;
  bool do_img2col_ = true;
  const int chunk_ = C12NUM;
};
}  // namespace mindspore::kernel

#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_CONVOLUTION_FP16_GRAD_INPUT_H_

@@ -0,0 +1,95 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <random>
#include "src/runtime/kernel/arm/fp16_grad/dropout_fp16_grad.h"
#include "nnacl/fp16_grad/dropout_grad.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "nnacl/fp32_grad/dropout_parameter.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_NULL_PTR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_DropoutGrad;

namespace mindspore::kernel {

int DropoutGradCPUKernelFp16::Init() {
  auto param = reinterpret_cast<DropoutParameter *>(op_parameter_);
  if (param == nullptr) {
    MS_LOG(ERROR) << "Dropout op_parameter_ nullptr";
    return RET_NULL_PTR;
  }
  auto ratio = param->ratio_;
  if ((ratio > 1.0f) || (ratio < 0.0f)) {
    MS_LOG(ERROR) << "unsupported ratio value - Dropout ratio should be between zero and one";
    return RET_ERROR;
  }

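  // Inverted dropout: kept activations were scaled by 1 / (1 - ratio) in the
  // forward pass, so the incoming gradient is scaled by the same factor here.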
  if (ratio >= 1.0f) {
    scale_ = 1.0f;
  } else {
    scale_ = 1.0f / (1.0f - ratio);
  }
  if (!InferShapeDone()) {
    return RET_OK;
  }
  return ReSize();
}

int DropoutGradCPUKernelFp16::ReSize() { return RET_OK; }

int DropoutGradCPUKernelFp16::Execute(int task_id) {
  auto yt_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(kInputIndex)->data_c());
  auto mask_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c());
  auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(kOutputIndex)->data_c());
  auto length = in_tensors_.at(kInputIndex)->ElementsNum();
  int stride = UP_DIV(length, thread_count_);
  int count = MSMIN(stride, length - stride * task_id);

  if (count > 0) {
    int start = stride * task_id;
    DropoutFp16Grad(&(yt_ptr[start]), &(mask_ptr[start]), &(output_ptr[start]), count, static_cast<float16_t>(scale_));
  }
  return RET_OK;
}

int RunDropoutFp16Grad(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
  auto dropout = reinterpret_cast<DropoutGradCPUKernelFp16 *>(cdata);
  auto error_code = dropout->Execute(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "Dropout Grad Run error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

int DropoutGradCPUKernelFp16::Run() {
  int error_code = static_cast<const lite::InnerContext *>(this->context_)
                     ->thread_pool_->ParallelLaunch(RunDropoutFp16Grad, this, thread_count_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "Dropout Grad function error error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_DropoutGrad, LiteKernelCreator<DropoutGradCPUKernelFp16>)
}  // namespace mindspore::kernel

@@ -0,0 +1,43 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_DROPOUT_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_DROPOUT_FP16_GRAD_H_

#include <vector>
#include "src/inner_kernel.h"

namespace mindspore::kernel {
class DropoutGradCPUKernelFp16 : public InnerKernel {
 public:
  DropoutGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                           const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
      : InnerKernel(parameter, inputs, outputs, ctx), thread_count_(ctx->thread_num_) {}

  ~DropoutGradCPUKernelFp16() override = default;

  int Init() override;
  int ReSize() override;
  int Run() override;
  int Execute(int task_id);

 private:
  float scale_ = 1.0f;
  int thread_count_ = 1;
};

}  // namespace mindspore::kernel

#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_DROPOUT_FP16_GRAD_H_

@@ -0,0 +1,109 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/runtime/kernel/arm/fp16_grad/layernorm_fp16_grad.h"
#include <vector>

#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "nnacl/fp16_grad/layernorm_grad.h"
#include "nnacl/fp32_grad/layernormgrad_parameter.h"
#include "include/errorcode.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_LayerNormGrad;

namespace mindspore::kernel {
int LayerNormGradCPUKernelFp16::ReSize() { return RET_OK; }

int LayerNormGradCPUKernelFp16::Init() {
  auto lngrad_param = reinterpret_cast<LayerNormGradParameter *>(op_parameter_);
  auto *input_x = in_tensors_.at(0);
  std::vector<int> x_shape = input_x->shape();
  int begin_norm_axis = lngrad_param->begin_norm_axis_;
  if (begin_norm_axis < 0) {
    begin_norm_axis += x_shape.size();
  }
  auto begin_params_axis = lngrad_param->begin_params_axis_;
  if (begin_params_axis < 0) {
    begin_params_axis += x_shape.size();
  }
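  // Axes in front of begin_norm_axis enumerate independent normalization
  // blocks; the remaining axes are reduced inside each block. The same split at
  // begin_params_axis gives the replication count and size of gamma/beta.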
  for (size_t i = 0; i < static_cast<size_t>(begin_norm_axis); i++) {
    block_num_ *= x_shape[i];
  }
  for (size_t i = static_cast<size_t>(begin_norm_axis); i < x_shape.size(); i++) {
    block_size_ *= x_shape[i];
  }
  for (size_t i = 0; i < static_cast<size_t>(begin_params_axis); i++) {
    param_size_ *= x_shape[i];
  }
  for (size_t i = static_cast<size_t>(begin_params_axis); i < x_shape.size(); i++) {
    param_num_ *= x_shape[i];
  }
  if (block_num_ <= 0 || block_size_ <= 0) {
    MS_LOG(ERROR) << "LayerNormGradCPUKernelFp16 input shape error, input shape: " << x_shape;
    return RET_ERROR;
  }
  return RET_OK;
}

int LayerNormGradCPUKernelFp16::Execute(int task_id) {
  auto input_x = in_tensors_.at(0);
  auto input_dy = in_tensors_.at(1);
  auto input_var = in_tensors_.at(2);
  auto input_mean = in_tensors_.at(3);
  auto input_gamma = in_tensors_.at(4);
  auto output_dx = out_tensors_.at(0);
  auto output_dg = out_tensors_.at(1);
  auto output_db = out_tensors_.at(2);

  float16_t *x = reinterpret_cast<float16_t *>(input_x->data_c());
  float16_t *dy = reinterpret_cast<float16_t *>(input_dy->data_c());
  float16_t *var = reinterpret_cast<float16_t *>(input_var->data_c());
  float16_t *mean = reinterpret_cast<float16_t *>(input_mean->data_c());
  float16_t *gamma = reinterpret_cast<float16_t *>(input_gamma->data_c());
  float16_t *dx = reinterpret_cast<float16_t *>(output_dx->data_c());
  float16_t *dg = reinterpret_cast<float16_t *>(output_dg->data_c());
  float16_t *db = reinterpret_cast<float16_t *>(output_db->data_c());
  LayerNormFp16Grad(x, dy, var, mean, gamma, param_num_, param_size_, block_num_, block_size_, dx, dg, db);
  return RET_OK;
}

int LayerNormF16GradRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
  MS_ASSERT(cdata != nullptr);
  auto ln_kernel = reinterpret_cast<LayerNormGradCPUKernelFp16 *>(cdata);
  auto error_code = ln_kernel->Execute(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "LayerNormGradRun error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

int LayerNormGradCPUKernelFp16::Run() {
  int error_code =
    static_cast<const lite::InnerContext *>(this->context_)->thread_pool_->ParallelLaunch(LayerNormF16GradRun, this, 1);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "LayerNorm function error error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_LayerNormGrad, LiteKernelCreator<LayerNormGradCPUKernelFp16>)
}  // namespace mindspore::kernel

@@ -0,0 +1,43 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_LAYERNORM_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_LAYERNORM_FP16_GRAD_H_

#include <vector>
#include "src/inner_kernel.h"

namespace mindspore::kernel {

class LayerNormGradCPUKernelFp16 : public InnerKernel {
 public:
  explicit LayerNormGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                                      const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
      : InnerKernel(parameter, inputs, outputs, ctx) {}
  ~LayerNormGradCPUKernelFp16() override {}
  int Init() override;
  int ReSize() override;
  int Run() override;
  int Execute(int task_id);

 private:
  int block_num_ = 1;
  int block_size_ = 1;
  int param_num_ = 1;
  int param_size_ = 1;
};
}  // namespace mindspore::kernel
#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_LAYERNORM_FP16_GRAD_H_

@@ -0,0 +1,69 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/runtime/kernel/arm/fp16_grad/neg_fp16_grad.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "nnacl/fp16/arithmetic_self_fp16.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_NegGrad;

namespace mindspore::kernel {
namespace {
int NegGradRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
  MS_ASSERT(cdata != nullptr);
  auto kernel = reinterpret_cast<NegGradCPUKernelFp16 *>(cdata);
  MS_ASSERT(kernel != nullptr);
  return kernel->DoNegGrad(task_id);
}
}  // namespace

int NegGradCPUKernelFp16::Init() { return RET_OK; }

int NegGradCPUKernelFp16::DoNegGrad(int task_id) {
  auto dy = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
  auto dx = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
  int length = in_tensors_.at(0)->ElementsNum();

  int stride = UP_DIV(length, thread_count_);
  int count = MSMIN(stride, length - stride * task_id);
  count = (count < 0) ? 0 : count;
  int start = stride * task_id;

  ElementNegativeFp16(dy + start, dx + start, count);
  return RET_OK;
}

int NegGradCPUKernelFp16::ReSize() { return RET_OK; }

int NegGradCPUKernelFp16::Run() {
  int ret = static_cast<const lite::InnerContext *>(this->context_)
              ->thread_pool_->ParallelLaunch(NegGradRun, this, thread_count_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "parallel launch fail! ret: " << ret;
    return ret;
  }

  return RET_OK;
}

REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_NegGrad, LiteKernelCreator<NegGradCPUKernelFp16>)
}  // namespace mindspore::kernel

@@ -0,0 +1,42 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_NEG_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_NEG_FP16_GRAD_H_

#include <vector>
#include "src/inner_kernel.h"
#include "schema/model_generated.h"

namespace mindspore::kernel {

class NegGradCPUKernelFp16 : public InnerKernel {
 public:
  explicit NegGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                                const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
      : InnerKernel(parameter, inputs, outputs, ctx), thread_count_(ctx->thread_num_) {}
  ~NegGradCPUKernelFp16() override {}
  int Init() override;
  int ReSize() override;
  int Run() override;
  int DoNegGrad(int task_id);

 private:
  int thread_count_;
};
}  // namespace mindspore::kernel

#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_NEG_FP16_GRAD_H_

@@ -0,0 +1,113 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/runtime/kernel/arm/fp16_grad/pooling_fp16_grad.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "nnacl/fp16/pooling_fp16.h"
#include "nnacl/fp16_grad/pooling_grad.h"
#include "include/errorcode.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_AvgPoolGrad;
using mindspore::schema::PrimitiveType_MaxPoolGrad;

namespace mindspore::kernel {
int PoolingGradCPUKernelFp16::ReSize() {
  PoolingParameter *pool_param = reinterpret_cast<PoolingParameter *>(op_parameter_);

  auto in_shape = in_tensors_.at(0)->shape();
  auto out_shape = in_tensors_.at(1)->shape();

  if (pool_param->pool_mode_ == PoolMode_AvgPool) {
    out_shape = in_tensors_.at(2)->shape();
  }

  int input_h = in_shape.at(kNHWC_H);
  int input_w = in_shape.at(kNHWC_W);

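  // Global pooling treats the whole spatial plane as one window.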
  if (pool_param->global_) {
    pool_param->window_w_ = input_w;
    pool_param->window_h_ = input_h;
  }

  pool_param->input_h_ = in_shape[kNHWC_H];
  pool_param->input_w_ = in_shape[kNHWC_W];
  pool_param->input_batch_ = in_shape[kNHWC_N];
  pool_param->input_channel_ = in_shape[kNHWC_C];
  pool_param->output_h_ = out_shape[kNHWC_H];
  pool_param->output_w_ = out_shape[kNHWC_W];
  pool_param->output_batch_ = out_shape[kNHWC_N];
  pool_param->output_channel_ = out_shape[kNHWC_C];

  return RET_OK;
}

int PoolingGradCPUKernelFp16::Init() { return ReSize(); }

int PoolingGradCPUKernelFp16::Execute(int task_id) {
  PoolingParameter *pool_param = reinterpret_cast<PoolingParameter *>(op_parameter_);
  auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
  auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
  int stride = UP_DIV(pool_param->output_batch_, thread_num_);
  int count = MSMIN(stride, pool_param->output_batch_ - stride * task_id);

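  // Each thread zero-fills its own slice of dx, then scatters gradients into
  // it: through the max positions for MaxPool, spread uniformly over the
  // window for AvgPool.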
  if (count > 0) {
    int in_batch_size = pool_param->input_h_ * pool_param->input_w_ * pool_param->input_channel_;
    int out_batch_size = pool_param->output_h_ * pool_param->output_w_ * pool_param->input_channel_;
    std::fill(output_ptr + task_id * stride * in_batch_size, output_ptr + ((task_id * stride) + count) * in_batch_size,
              0.f);
    if (pool_param->pool_mode_ == PoolMode_MaxPool) {
      auto dy_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(2)->data_c());
      MaxPoolingFp16Grad(input_ptr + task_id * stride * in_batch_size, dy_ptr + task_id * stride * out_batch_size,
                         output_ptr + task_id * stride * in_batch_size, count, pool_param);
    } else {
      input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(2)->data_c());
      AvgPoolingFp16Grad(input_ptr + task_id * stride * out_batch_size, output_ptr + task_id * stride * in_batch_size,
                         count, pool_param);
    }
  }
  return RET_OK;
}

int PoolingFp16GradImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
  MS_ASSERT(cdata != nullptr);
  auto pooling = reinterpret_cast<PoolingGradCPUKernelFp16 *>(cdata);
  auto error_code = pooling->Execute(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "Pooling Run error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

int PoolingGradCPUKernelFp16::Run() {
  thread_num_ = context_->thread_num_;
  int error_code = static_cast<const lite::InnerContext *>(this->context_)
                     ->thread_pool_->ParallelLaunch(PoolingFp16GradImpl, this, thread_num_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "pooling error error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_AvgPoolGrad, LiteKernelCreator<PoolingGradCPUKernelFp16>)
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_MaxPoolGrad, LiteKernelCreator<PoolingGradCPUKernelFp16>)
}  // namespace mindspore::kernel

@@ -0,0 +1,47 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_POOLING_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_POOLING_FP16_GRAD_H_

#include <vector>
#include "src/inner_kernel.h"

namespace mindspore::kernel {
using mindspore::schema::PadMode;
using mindspore::schema::PoolMode;
using mindspore::schema::QuantType;
using mindspore::schema::RoundMode;

class PoolingGradCPUKernelFp16 : public InnerKernel {
 public:
  explicit PoolingGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                                    const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
      : InnerKernel(parameter, inputs, outputs, ctx) {}
  ~PoolingGradCPUKernelFp16() override = default;

  int Init() override;
  int ReSize() override;
  int Run() override;
  int Execute(int task_id);

 private:
  int thread_num_ = 1;
};

}  // namespace mindspore::kernel

#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_POOLING_FP16_GRAD_H_

@@ -0,0 +1,104 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <vector>
#include "src/runtime/kernel/arm/fp16_grad/resize_fp16_grad.h"
#include "nnacl/fp16_grad/resize_grad.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_ResizeGrad;

namespace mindspore::kernel {

float16_t ScalingFp16(size_t in_size, size_t out_size, bool align_corners) {
  return (align_corners && out_size > 1) ? (in_size - 1) / static_cast<float16_t>(out_size - 1)
                                         : in_size / static_cast<float16_t>(out_size);
}

int ResizeGradCPUKernelFp16::ReSize() {
  auto param = reinterpret_cast<ResizeFp16GradParameter *>(op_parameter_);
  if (param == nullptr) {
    MS_LOG(ERROR) << "ResizeGradCPUKernelFp16 op_parameter_ is nullptr";
    return RET_ERROR;
  }
  bool align_corners = param->align_corners_;
  param->in_height_ = static_cast<size_t>(in_tensors_.at(0)->Height());
  param->in_width_ = static_cast<size_t>(in_tensors_.at(0)->Width());
  param->out_height_ = static_cast<size_t>(out_tensors_.at(0)->Height());
  param->out_width_ = static_cast<size_t>(out_tensors_.at(0)->Width());
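  // ScalingFp16 is called with (out, in) swapped relative to the forward
  // resize: the scales here map dy coordinates back onto the dx plane.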
param->height_scale_ = ScalingFp16(param->out_height_, param->in_height_, align_corners);
|
||||
param->width_scale_ = ScalingFp16(param->out_width_, param->in_width_, align_corners);
|
||||
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ResizeGradCPUKernelFp16::Init() {
|
||||
if (!InferShapeDone()) {
|
||||
return RET_OK;
|
||||
}
|
||||
return ReSize();
|
||||
}
|
||||
|
||||
int ResizeGradCPUKernelFp16::Execute(int task_id) {
|
||||
auto in_addr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
|
||||
auto out_addr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
|
||||
auto param = reinterpret_cast<ResizeFp16GradParameter *>(op_parameter_);
|
||||
if (param == nullptr) {
|
||||
MS_LOG(ERROR) << "ResizeGradCPUKernelFp16 op_parameter_ is nullptr";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto batch_size = in_tensors_.at(0)->Batch();
|
||||
auto channel = in_tensors_.at(0)->Channel();
|
||||
|
||||
if (param->method == static_cast<int>(schema::ResizeMethod_NEAREST)) {
|
||||
ResizeNearestNeighborFp16Grad(in_addr, out_addr, batch_size, channel, in_tensors_.at(0)->format(), param);
|
||||
} else {
|
||||
ResizeBiLinearFp16Grad(in_addr, out_addr, batch_size, channel, in_tensors_.at(0)->format(), param);
|
||||
}
|
||||
return RET_OK;
|
||||
}
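// Note: everything other than ResizeMethod_NEAREST falls through to the bilinear
// path above. That is harmless while nearest and bilinear are the only fp16 resize
// grads provided by nnacl/fp16_grad/resize_grad.h, but an explicit method check
// would fail faster if more methods are ever registered.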

int ResizeFp16GradRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
  auto resize_grad_kernel = reinterpret_cast<ResizeGradCPUKernelFp16 *>(cdata);
  auto error_code = resize_grad_kernel->Execute(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "resize grad error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

int ResizeGradCPUKernelFp16::Run() {
  auto out_addr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
  size_t elem_number = out_tensors_.at(0)->ElementsNum();
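  // Zeroing first is required: the resize grad kernels accumulate into out_addr,
  // since several forward-output pixels can map back to the same input pixel.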
  std::fill(out_addr, out_addr + elem_number, 0.f);
  int error_code =
    static_cast<const lite::InnerContext *>(this->context_)->thread_pool_->ParallelLaunch(ResizeFp16GradRun, this, 1);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "ResizeGradCPUKernelFp16 Run error, error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_ResizeGrad, LiteKernelCreator<ResizeGradCPUKernelFp16>)
} // namespace mindspore::kernel

@ -0,0 +1,38 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_RESIZE_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_RESIZE_FP16_GRAD_H_

#include <vector>
#include "src/inner_kernel.h"

namespace mindspore::kernel {
class ResizeGradCPUKernelFp16 : public InnerKernel {
 public:
  explicit ResizeGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                                   const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
      : InnerKernel(parameter, inputs, outputs, ctx) {}
  ~ResizeGradCPUKernelFp16() override = default;
  int Init() override;
  int ReSize() override;
  int Run() override;
  int ExecuteInit(int task_id);
  int Execute(int task_id);
};
} // namespace mindspore::kernel

#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_RESIZE_FP16_GRAD_H_

@ -0,0 +1,154 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/runtime/kernel/arm/fp16_grad/strided_slice_fp16_grad.h"
#include <vector>
#include <algorithm>
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "nnacl/fp16_grad/strided_slice_grad.h"
#include "src/ops/populate/strided_slice_populate.h"
#include "include/errorcode.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_StridedSliceGrad;

namespace mindspore::kernel {

int StridedSliceGradCPUKernelFp16::Init() {
  if (!InferShapeDone()) {
    return RET_OK;
  }
  param_ = reinterpret_cast<StridedSliceParameter *>(op_parameter_);
  auto input = in_tensors_.at(0);
  MS_ASSERT(input);
  switch (input->data_type()) {
    case kNumberTypeFloat16:
      param_->data_type = kDataTypeFloat16;
      break;
    default:
      MS_LOG(ERROR) << "Not supported data type: " << input->data_type();
      return RET_ERROR;
  }
  FillEmptyDims();
  FillOutputDim();
  return ReSize();
}

void StridedSliceGradCPUKernelFp16::FillEmptyDims() {
  int32_t begins[DIMENSION_7D];
  int32_t ends[DIMENSION_7D];
  int32_t strides[DIMENSION_7D];
  int32_t input_shape[DIMENSION_7D];
  for (int i = 0; i < param_->num_axes_; ++i) {
    begins[i] = param_->begins_[i];
    ends[i] = MSMIN(param_->ends_[i], param_->in_shape_[i]);
    strides[i] = param_->strides_[i];
    input_shape[i] = param_->in_shape_[i];
  }
  for (int i = param_->num_axes_; i < param_->in_shape_length_; ++i) {
    input_shape[i] = param_->in_shape_[i];
    begins[i] = 0;
    ends[i] = param_->in_shape_[i];
    strides[i] = 1;
  }

  int32_t real_index = param_->in_shape_length_ - 1;
  for (int i = DIMENSION_7D - 1; i >= 0; --i) {
    if (real_index >= 0) {
      param_->begins_[i] = begins[real_index];
      param_->ends_[i] = ends[real_index];
      param_->strides_[i] = strides[real_index];
      param_->in_shape_[i] = input_shape[real_index--];
    } else {
      param_->begins_[i] = 0;
      param_->ends_[i] = 1;
      param_->strides_[i] = 1;
      param_->in_shape_[i] = 1;
    }
  }
  param_->num_axes_ = DIMENSION_7D;
  param_->in_shape_length_ = DIMENSION_7D;

  for (int i = 0; i < DIMENSION_7D; ++i) {
    if (param_->begins_[i] < 0) {
      param_->begins_[i] += param_->in_shape_[i];
    }
    if (param_->ends_[i] < 0) {
      param_->ends_[i] += param_->in_shape_[i];
    }
  }
}
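// Worked example of the right-alignment above, assuming DIMENSION_7D == 7 (its
// value in nnacl): slicing a 2-D tensor with in_shape_ = {8, 4}, num_axes_ = 1,
// begins_ = {2}, ends_ = {4}, strides_ = {1} is rewritten as
//   in_shape_ = {1, 1, 1, 1, 1, 8, 4}
//   begins_   = {0, 0, 0, 0, 0, 2, 0}
//   ends_     = {1, 1, 1, 1, 1, 4, 4}
//   strides_  = {1, 1, 1, 1, 1, 1, 1}
// so the grad kernel can always run a fixed 7-level loop nest. Negative begins_
// and ends_ are then wrapped once by adding the dimension size.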

void StridedSliceGradCPUKernelFp16::FillOutputDim() {
  auto output = out_tensors_.at(0);
  size_t out_size = output->shape().size();
  for (size_t i = 0; i < DIMENSION_7D; i++) {
    if (i < out_size) {
      output_shape_.push_back(output->shape()[i]);
    } else {
      output_shape_.insert(output_shape_.begin(), 1);
    }
  }
}
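// FillOutputDim pads output_shape_ with leading 1s for the same reason: the
// gradient (forward-input) shape must sit right-aligned in the fixed 7-D frame
// consumed by DoStridedSliceFp16Grad.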

int StridedSliceGradCPUKernelFp16::ReSize() { return RET_OK; }

int StridedSliceFp16GradImpl(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
  MS_ASSERT(cdata != nullptr);
  auto slice = reinterpret_cast<StridedSliceGradCPUKernelFp16 *>(cdata);
  auto error_code = slice->Execute(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "StridedSliceGrad Run error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

int StridedSliceGradCPUKernelFp16::Run() {
  int error_code = static_cast<const lite::InnerContext *>(this->context_)
                     ->thread_pool_->ParallelLaunch(StridedSliceFp16GradImpl, this, 1);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "StridedSliceGrad Run error, error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

int StridedSliceGradCPUKernelFp16::Execute(int task_id) {
  auto input = in_tensors_.at(0);
  auto output = out_tensors_.at(0);
  MS_ASSERT(output);

  int *po = output_shape_.data();
  auto dx = reinterpret_cast<float16_t *>(output->data_c());
  auto dy = reinterpret_cast<float16_t *>(input->data_c());
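  // dx must start from zeros: the backward pass only scatters dy into the
  // positions the forward slice read, and every element outside the slice
  // keeps a zero gradient.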
  std::fill(dx, dx + output->ElementsNum(), 0.f);
  auto ret = DoStridedSliceFp16Grad(dy, dx, po, param_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "StridedSliceGrad error, error_code[" << ret << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_StridedSliceGrad, LiteKernelCreator<StridedSliceGradCPUKernelFp16>)
} // namespace mindspore::kernel

@ -0,0 +1,49 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_STRIDED_SLICE_FP16_GRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_STRIDED_SLICE_FP16_GRAD_H_

#include <vector>
#include "nnacl/fp16_grad/strided_slice_grad.h"
#include "src/lite_kernel.h"

namespace mindspore::kernel {
class StridedSliceGradCPUKernelFp16 : public InnerKernel {
 public:
  StridedSliceGradCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                                const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
      : InnerKernel(parameter, inputs, outputs, ctx) {
    param_ = reinterpret_cast<StridedSliceParameter *>(parameter);
  }
  ~StridedSliceGradCPUKernelFp16() override = default;

  int Init() override;
  int ReSize() override;
  int Run() override;
  int Execute(int task_id);

 private:
  void FillEmptyDims();
  void FillOutputDim();
  void ParseMasks();

  StridedSliceParameter *param_;
  std::vector<int> output_shape_;
};
} // namespace mindspore::kernel

#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_STRIDED_SLICE_FP16_GRAD_H_

@ -0,0 +1,98 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/runtime/kernel/arm/fp16_grad/unsorted_segment_sum_fp16.h"
#include <vector>
#include <algorithm>
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "nnacl/fp16_grad/unsorted_segment_sum.h"
#include "include/errorcode.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_UnsortedSegmentSum;

namespace mindspore::kernel {

int UnsortedSegmentSumCPUKernelFp16::Init() {
  if (!InferShapeDone()) {
    return RET_OK;
  }
  auto input_shape = in_tensors_.at(0)->shape();
  auto segment_ids_shape = in_tensors_.at(1)->shape();
  auto output_shape = out_tensors_.at(0)->shape();
  unit_num_ = 1;
  input_dim1_ = 1;
  for (size_t i = 0; i < input_shape.size(); ++i) {
    unit_num_ *= input_shape[i];
    if (i >= segment_ids_shape.size()) {
      input_dim1_ *= input_shape[i];
    }
  }
  output_dim0_ = output_shape[0];
  output_dim1_ = 1;
  for (size_t j = 1; j < output_shape.size(); j++) {
    output_dim1_ *= output_shape[j];
  }
  return RET_OK;
}
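// Shape bookkeeping in Init(), as a worked example: for input [4, 3] with
// segment_ids [4] and output [2, 3] (i.e. num_segments = 2),
//   unit_num_    = 12  (total input elements)
//   input_dim1_  = 3   (elements per segment id: the axes not covered by the ids)
//   output_dim0_ = 2   (number of segments)
//   output_dim1_ = 3   (inner slice of each output row)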

int UnsortedSegmentSumCPUKernelFp16::ReSize() { return RET_OK; }

int UnsortedSegmentSumFp16Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
  MS_ASSERT(cdata != nullptr);
  auto kernel = reinterpret_cast<UnsortedSegmentSumCPUKernelFp16 *>(cdata);
  auto error_code = kernel->Execute(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "UnsortedSegmentSum Run error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

int UnsortedSegmentSumCPUKernelFp16::Run() {
  int error_code = static_cast<const lite::InnerContext *>(this->context_)
                     ->thread_pool_->ParallelLaunch(UnsortedSegmentSumFp16Run, this, 1);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "UnsortedSegmentSum Run error, error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

int UnsortedSegmentSumCPUKernelFp16::Execute(int task_id) {
  auto input_tensor = in_tensors_.at(0);
  auto indices_tensor = in_tensors_.at(1);
  auto output_tensor = out_tensors_.at(0);
  float16_t *input = reinterpret_cast<float16_t *>(input_tensor->data_c());
  int *indices = reinterpret_cast<int *>(indices_tensor->data_c());
  float16_t *output = reinterpret_cast<float16_t *>(output_tensor->data_c());
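  // The kernel scatter-adds (output[indices[i]] += input slice i for every inner
  // slice), so the output buffer has to be cleared before accumulation starts.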
  std::fill(output, output + output_tensor->ElementsNum(), 0.f);
  auto ret = UnsortedSegmentSumFp16(input, unit_num_, input_dim1_, indices, output, output_dim0_, output_dim1_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "UnsortedSegmentSumFp16 error, error_code[" << ret << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_UnsortedSegmentSum,
           LiteKernelCreator<UnsortedSegmentSumCPUKernelFp16>)
} // namespace mindspore::kernel

@ -0,0 +1,44 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_UNSORTED_SEGMENT_SUM_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_UNSORTED_SEGMENT_SUM_FP16_H_

#include <vector>
#include "src/lite_kernel.h"

namespace mindspore::kernel {
class UnsortedSegmentSumCPUKernelFp16 : public InnerKernel {
 public:
  UnsortedSegmentSumCPUKernelFp16(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                                  const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
      : InnerKernel(parameter, inputs, outputs, ctx) {}
  ~UnsortedSegmentSumCPUKernelFp16() override = default;

  int Init() override;
  int ReSize() override;
  int Run() override;
  int Execute(int task_id);
  size_t unit_num_ = 0;
  size_t input_dim1_ = 0;
  size_t output_dim0_ = 0;
  size_t output_dim1_ = 0;
};
} // namespace mindspore::kernel

#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_GRAD_UNSORTED_SEGMENT_SUM_FP16_H_

@ -58,7 +58,7 @@ TEST_F(TestActGradFp16, ReluGradFp16) {
    input_buf[i] = (float16_t)input_data[i];
  }

- Fp16ReluGrad(yt_buf, input_buf, 50, output_buf);
+ ReluFp16Grad(yt_buf, input_buf, 50, output_buf);

  int res = 0;
  float error = 0;

@ -113,7 +113,7 @@ TEST_F(TestActGradFp16, SigmoidGradFp16) {
    input_buf[i] = (float16_t)input_data[i];
  }

- Fp16SigmoidGrad(yt_buf, input_buf, 50, output_buf);
+ SigmoidFp16Grad(yt_buf, input_buf, 50, output_buf);

  int res = 0;
  float error = 0;