forked from mindspore-Ecosystem/mindspore

fp16 deconv winograd

parent 6dd56e8d8b
commit f6aa35b12b
@@ -40,6 +40,7 @@ typedef enum ErrorCodeFp32OpEnum {

 typedef enum ErrorCodeFp16OpEnum {
   NNACL_ERRCODE_OP_FP16_START = 20000,
+  NNACL_ERRCODE_OP_FP16_WINOGRAD_GENERATOR,
   NNACL_ERRCODE_OP_FP16_END = 29999
 } ErrorCodeFp16OpEnum;
@@ -0,0 +1,54 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "nnacl/fp16/common_func_fp16.h"

void PostConvFuncCommFp16(float16_t *out_ptr, const float16_t *src_ptr_, const float16_t *bias_ptr,
                          size_t output_channel, size_t plane_size, size_t oc_stride, size_t hw_stride,
                          ActType act_type, int size) {
  if (size == 0) {
    return;
  }
  for (int oc = 0; oc < output_channel; oc++) {
    int oc_div = oc / size, oc_mod = oc % size;
    for (int hw = 0; hw < plane_size; hw++) {
      int src_index = oc_div * size * hw_stride + hw * size + oc_mod;
      int dst_index = hw * oc_stride + oc;
      float16_t value = src_ptr_[src_index];
      if (bias_ptr != NULL) {
        value = value + bias_ptr[oc];
      }
      value = (act_type == ActType_Relu || act_type == ActType_Relu6) ? (MSMAX(0.f, value)) : (value);
      value = (act_type == ActType_Relu6) ? (MSMIN(6.f, value)) : (value);
      out_ptr[dst_index] = value;
    }
  }
  return;
}

void PostConvFuncFp16C8(const float16_t *c8_out, float16_t *nhwc_out, const float16_t *bias, size_t oc, size_t plane,
                        size_t oc_stride, ActType act_type) {
  size_t oc8mod = oc % C8NUM;
  size_t oc8div = oc - oc8mod;
  size_t stride_size = oc_stride * sizeof(float16_t);
  PostFuncBiasReluC8Fp16(nhwc_out, c8_out, bias, oc8div, oc8mod, plane, stride_size, act_type);
  return;
}

void PostConvFuncFp16C4(const float16_t *c4_out, float16_t *nhwc_out, const float16_t *bias, size_t oc, size_t plane,
                        size_t plane_stride, ActType act_type) {
  PostConvFuncCommFp16(nhwc_out, c4_out, bias, oc, plane, oc, plane_stride, act_type, C4NUM);
}
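The index arithmetic in PostConvFuncCommFp16 is the whole point of the function: it walks a channel-blocked NC4HW4/NC8HW8 buffer and scatters values into plain NHWC while applying bias and activation. A minimal standalone sketch of the same mapping, assuming C4 blocking (size = 4), hw_stride equal to plane_size, and made-up sizes, shows where each (hw, oc) element comes from:

#include <stdio.h>

/* Hypothetical demo of the src/dst index mapping used above. */
int main(void) {
  const int size = 4;       /* C4NUM block width */
  const int plane_size = 2; /* two output pixels */
  const int oc_stride = 6;  /* 6 output channels: NHWC pixel stride */
  const int hw_stride = plane_size;
  for (int oc = 0; oc < 6; oc++) {
    int oc_div = oc / size, oc_mod = oc % size;
    for (int hw = 0; hw < plane_size; hw++) {
      int src_index = oc_div * size * hw_stride + hw * size + oc_mod;
      int dst_index = hw * oc_stride + oc;
      printf("oc=%d hw=%d : src %d -> dst %d\n", oc, hw, src_index, dst_index);
    }
  }
  return 0;
}

Channels 0..3 come from the first C4 block and channels 4..5 from the second, which is exactly why the C4 path can be reused for the Winograd deconv output that is produced in NC4HW4 order.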
@@ -0,0 +1,39 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_LITE_NNACL_FP16_COMMON_FUNC_FP16_H_
#define MINDSPORE_LITE_NNACL_FP16_COMMON_FUNC_FP16_H_

#include <arm_neon.h>
#include "nnacl/op_base.h"

#ifdef __cplusplus
extern "C" {
#endif

/* deconv common */
void PostConvFuncFp16C8(const float16_t *c8_out_ptr, float16_t *out_ptr, const float16_t *bias_ptr,
                        size_t output_channel, size_t plane_size, size_t stride, ActType act_type);
void PostFuncBiasReluC8Fp16(float16_t *dst, const float16_t *src, const float16_t *bias, size_t oc8div, size_t oc8mod,
                            size_t plane_size, size_t stride, size_t relu_type);

/* deconv winograd */
void PostConvFuncFp16C4(const float16_t *c4_out, float16_t *nhwc_out, const float16_t *bias, size_t output_channel,
                        size_t plane_size, size_t plane_stride, ActType act_type);

#ifdef __cplusplus
}
#endif
#endif  // MINDSPORE_LITE_NNACL_FP16_COMMON_FUNC_FP16_H_
@@ -13,42 +13,9 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #include "nnacl/fp16/deconv_fp16.h"

-void PostConvFuncCommFp16(float16_t *out_ptr, const float16_t *src_ptr_, const float16_t *bias_ptr,
-                          size_t output_channel, size_t plane_size, size_t stride, bool is_relu, bool is_relu6,
-                          int size) {
-  if (size == 0) {
-    return;
-  }
-  for (int oc = 0; oc < output_channel; oc++) {
-    int oc_div = oc / size, oc_mod = oc % size;
-    for (int hw = 0; hw < plane_size; hw++) {
-      int src_index = oc_div * size * plane_size + hw * size + oc_mod;
-      int dst_index = hw * stride + oc;
-      float16_t value = src_ptr_[src_index];
-      if (bias_ptr != NULL) {
-        value = value + bias_ptr[oc];
-      }
-      value = (is_relu || is_relu6) ? (MSMAX(0.f, value)) : (value);
-      value = (is_relu6) ? (MSMIN(6.f, value)) : (value);
-      out_ptr[dst_index] = value;
-    }
-  }
-  return;
-}
-
-void PostConvFuncFp16C8(const float16_t *c8_out_ptr, float16_t *out_ptr, const float16_t *bias_ptr,
-                        size_t output_channel, size_t plane_size, size_t stride, bool is_relu, bool is_relu6) {
-  size_t oc8mod = output_channel % C8NUM;
-  size_t oc8div = output_channel - oc8mod;
-  size_t stride_size = stride * sizeof(float16_t);
-  size_t relu_type = is_relu ? 1 : 0;
-  relu_type = is_relu6 ? 3 : relu_type;
-  PostFuncBiasReluC8Fp16(out_ptr, c8_out_ptr, bias_ptr, oc8div, oc8mod, plane_size, stride_size, relu_type);
-  return;
-}
-
 int DeConvPostFp16(const float16_t *src, float16_t *tmp, const float16_t *bias, float16_t *dst, int output_channel,
                    ConvParameter *conv_param) {
   /* row8x8-major(ih*iw x oc*kh*kw) -> row8-major(oh*ow x oc) */

@@ -112,7 +79,6 @@ int DeConvPostFp16(const float16_t *src, float16_t *tmp, const float16_t *bias,
   } /*ih*/
   } /*oc8*/

-  PostConvFuncFp16C8(tmp, dst, bias, output_channel, output_plane, conv_param->output_channel_,
-                     conv_param->act_type_ == ActType_Relu, conv_param->act_type_ == ActType_Relu6);
+  PostConvFuncFp16C8(tmp, dst, bias, output_channel, output_plane, conv_param->output_channel_, conv_param->act_type_);
   return NNACL_OK;
 }
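The refactor drops the two bool flags in favor of passing ActType straight through to the assembly post-function. That only works if the enum's integer values line up with the relu_type codes the old code computed (1 for ReLU, 3 for ReLU6). If nnacl's ActType is ordered No, Relu, Sigmod, Relu6 (which the 1/3 codes suggest, but treat as an assumption), the two encodings coincide; a hypothetical standalone check:

/* Sketch, assuming nnacl's ActType ordering: No=0, Relu=1, Sigmod=2, Relu6=3. */
#include <assert.h>

typedef enum ActType { ActType_No, ActType_Relu, ActType_Sigmod, ActType_Relu6 } ActType;

static size_t ReluTypeFromFlags(int is_relu, int is_relu6) {
  size_t relu_type = is_relu ? 1 : 0;  /* old bool-flag encoding */
  relu_type = is_relu6 ? 3 : relu_type;
  return relu_type;
}

int main(void) {
  /* Passing act_type directly is then equivalent to the old flag arithmetic. */
  assert(ReluTypeFromFlags(1, 0) == (size_t)ActType_Relu);
  assert(ReluTypeFromFlags(0, 1) == (size_t)ActType_Relu6);
  assert(ReluTypeFromFlags(0, 0) == (size_t)ActType_No);
  return 0;
}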
@@ -13,27 +13,23 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #ifndef MINDSPORE_LITE_NNACL_FP16_DECONV_FP16_H_
 #define MINDSPORE_LITE_NNACL_FP16_DECONV_FP16_H_

-#include <string.h>
 #include <arm_neon.h>
+#include <string.h>
 #include "nnacl/conv_parameter.h"
 #include "nnacl/matmul_parameter.h"
 #include "nnacl/fp16/matmul_fp16.h"
 #include "nnacl/errorcode.h"
+#include "nnacl/fp16/common_func_fp16.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

 int DeConvPostFp16(const float16_t *src, float16_t *tmp, const float16_t *bias, float16_t *dst, int output_channel,
                    ConvParameter *conv_param);

-void PostConvFuncFp16C8(const float16_t *c8_out_ptr, float16_t *out_ptr, const float16_t *bias_ptr,
-                        size_t output_channel, size_t plane_size, size_t stride, bool is_relu, bool is_relu6);
-
-void PostFuncBiasReluC8Fp16(float16_t *dst, const float16_t *src, const float16_t *bias, size_t oc8div, size_t oc8mod,
-                            size_t plane_size, size_t stride, size_t relu_type);
-
 #ifdef __cplusplus
 }
 #endif
@@ -0,0 +1,329 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "nnacl/fp16/deconv_winograd_fp16.h"
#include "nnacl/minimal_filtering_generator.h"

void DeConvWgInputPackFp16(float16_t *src_ptr, float16_t *dst_ptr, int channel, int stride) {
  int ic4div = channel / C4NUM;
  int ic4mod = channel % C4NUM;
  float16_t *src = src_ptr;
  float16_t *dst = dst_ptr;

  for (int ic = 0; ic < ic4div; ic++) {
    memcpy(dst, src, C4NUM * sizeof(float16_t));
    dst += stride;
    src += C4NUM;
  }

  if (ic4mod != 0) {
    int ic_res = 0;
    for (; ic_res < ic4mod; ic_res++) {
      dst[ic_res] = src[ic_res];
    }
    for (; ic_res < C4NUM; ic_res++) {
      dst[ic_res] = 0;
    }
  }
  return;
}
void C4GemmFp16(float16_t *dst, const float16_t *src, const float16_t *weight, size_t src_depth_quad, size_t dst_step,
                size_t dst_depth_quad, size_t width, size_t weight_depth_offset) {
  int dx, sz, dz;
  int src_depth_step = 4 * width;
  for (dz = 0; dz < dst_depth_quad; ++dz) {
    float16_t *dst_z = dst + dz * dst_step;
    const float16_t *weight_dz = weight + dz * (src_depth_quad * 16 + weight_depth_offset);
    for (dx = 0; dx < width; ++dx) {
      float16_t *dst_x = dst_z + dx * 4;
      dst_x[0] = 0.0f;
      dst_x[1] = 0.0f;
      dst_x[2] = 0.0f;
      dst_x[3] = 0.0f;
      const float16_t *src_dx = src + 4 * dx;
      for (sz = 0; sz < src_depth_quad; ++sz) {
        const float16_t *src_z = src_dx + sz * src_depth_step;
        const float16_t *weight_z = weight_dz + sz * 16;
        for (int i = 0; i < 4; ++i) {
          for (int j = 0; j < 4; ++j) {
            dst_x[j] += src_z[i] * weight_z[4 * i + j];
          }
        }
      }
    }
  }
}
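C4GemmFp16 is the scalar micro-kernel of this path: for each destination channel quad dz and each tile column dx it accumulates a 1x4 input quad against a 4x4 weight block, over src_depth_quad input quads. A minimal call, with made-up sizes (one input quad, one output quad, one column) and an identity weight block, reproduces the input; everything in this sketch beyond the function itself is hypothetical:

#include "nnacl/fp16/deconv_winograd_fp16.h"

/* Hypothetical smoke test: with a 4x4 identity weight block, dst == src. */
void C4GemmFp16IdentityDemo(void) {
  float16_t src[4] = {1, 2, 3, 4}; /* one C4 quad, width = 1 */
  float16_t weight[16] = {0};
  for (int i = 0; i < 4; i++) {
    weight[i * 4 + i] = 1; /* identity block */
  }
  float16_t dst[4] = {0};
  C4GemmFp16(dst, src, weight, /*src_depth_quad=*/1, /*dst_step=*/4,
             /*dst_depth_quad=*/1, /*width=*/1, /*weight_depth_offset=*/0);
  /* dst is now {1, 2, 3, 4} */
}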
void DeConvWgMergeFp16(const float16_t *src, float16_t *dst, size_t src_stride, size_t dst_stride, size_t count) {
  for (int i = 0; i < count; ++i) {
    const float16_t *s = src + i * src_stride;
    float16_t *d = dst + i * dst_stride;
    for (int j = 0; j < 4; ++j) {
      d[j] += s[j];
    }
  }
}

void _deConvWinogradFp16(float16_t *tile_in, float16_t *tile_out, float16_t *weight_buf, float16_t *tmp_buf,
                         float16_t *at_buf, float16_t *a_mid_buf, float16_t *trans_a_buf, bool a_trans,
                         float16_t *bt_buf, float16_t *b_tmp_buf, int unit_size, int w_start, int h_start,
                         ConvParameter *conv_param, DeConvParam *deconv_param) {
  int winograd_plane = unit_size * unit_size;
  if (!a_trans) {
    WinogradMatrixProductLeftFp16(tile_in, at_buf, a_mid_buf, DECONV_WINOGRAD_DEFAULT_UNIT, unit_size,
                                  DECONV_WINOGRAD_DEFAULT_UNIT, deconv_param->ic_div4_ * DECONV_WINOGRAD_DEFAULT_TILE);
    WinogradMatrixProductRightFp16(a_mid_buf, at_buf, trans_a_buf, unit_size, unit_size, DECONV_WINOGRAD_DEFAULT_UNIT,
                                   deconv_param->ic_div4_ * DECONV_WINOGRAD_DEFAULT_TILE);
  }

  for (int index = 0; index < winograd_plane; index++) {
    float16_t *src = trans_a_buf + index * DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->ic_up4_;
    float16_t *dst = tmp_buf + index * deconv_param->oc_up4_ * DECONV_WINOGRAD_DEFAULT_TILE;
    float16_t *weight = weight_buf + index * deconv_param->ic_up4_ * deconv_param->oc_up4_;
    C4GemmFp16(dst, src, weight, deconv_param->ic_div4_, DECONV_WINOGRAD_DEFAULT_TILE * C4NUM, deconv_param->oc_div4_,
               DECONV_WINOGRAD_DEFAULT_TILE, 0);
  }

  WinogradMatrixProductLeftFp16(tmp_buf, bt_buf, b_tmp_buf, unit_size, unit_size, unit_size,
                                deconv_param->oc_div4_ * DECONV_WINOGRAD_DEFAULT_TILE);
  WinogradMatrixProductRightFp16(b_tmp_buf, bt_buf, tmp_buf, unit_size, unit_size, unit_size,
                                 deconv_param->oc_div4_ * DECONV_WINOGRAD_DEFAULT_TILE);

  // Add to dest
  for (int uhi = 0; uhi < unit_size; uhi++) {
    int h_index = uhi * conv_param->stride_h_ + h_start;
    for (int uwi = 0; uwi < unit_size; uwi++) {
      int w_index = uwi * conv_param->stride_w_ + w_start;

      float16_t *dst = tile_out + w_index * DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->oc_up4_ +
                       h_index * deconv_param->out_tile_w_ * DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->oc_up4_;
      float16_t *src = tmp_buf + (uwi + uhi * unit_size) * DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->oc_up4_;
      DeConvWgMergeFp16(src, dst, 4, 4, DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->oc_div4_);
    }
  }
  return;
}

void _deConvCommonFp16(float16_t *tile_in, float16_t *tile_out, float16_t *weight, float16_t *tmp_buf, int h_start,
                       int w_start, int h_size, int w_size, ConvParameter *conv_param, DeConvParam *deconv_param) {
  int count = deconv_param->oc_div4_ * w_size * h_size;
  int in_stride = DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->ic_up4_;
  int out_stride = DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->oc_up4_;

  for (int hi = 0; hi < DECONV_WINOGRAD_DEFAULT_UNIT; hi++) {
    for (int wi = 0; wi < DECONV_WINOGRAD_DEFAULT_UNIT; wi++) {
      float16_t *src_in = tile_in + (wi + hi * DECONV_WINOGRAD_DEFAULT_UNIT) * in_stride;
      C4GemmFp16(tmp_buf, src_in, weight, deconv_param->ic_div4_, DECONV_WINOGRAD_DEFAULT_TILE * 4, count,
                 DECONV_WINOGRAD_DEFAULT_TILE, 0);

      for (int uhi = 0; uhi < h_size; uhi++) {
        for (int uwi = 0; uwi < w_size; uwi++) {
          int w_index = (wi + uwi) * conv_param->stride_w_ + w_start;
          int h_index = (hi + uhi) * conv_param->stride_h_ + h_start;
          float16_t *dst = tile_out + h_index * out_stride * deconv_param->out_tile_w_ + w_index * out_stride;
          float16_t *src = tmp_buf + (uwi + uhi * w_size) * out_stride;
          DeConvWgMergeFp16(src, dst, 4, 4, DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->oc_div4_);
        }
      }
    }
  }
  return;
}

int PackDeConvWgDataFp16(float16_t *nhwc_weight, DeConvComputeUnit *unit, ConvParameter *conv_param,
                         DeConvParam *deconv_param) {
  int tmp_kernel_plane = unit->w_size_ * unit->h_size_;
  int output_channel = conv_param->output_channel_;
  int size = conv_param->input_channel_ * output_channel * tmp_kernel_plane;
  float16_t *current_unit_weight = (float16_t *)malloc(size * sizeof(float16_t));
  if (current_unit_weight == NULL) {
    return NNACL_NULL_PTR;
  }
  for (int ic = 0; ic < conv_param->input_channel_; ic++) {
    float16_t *src_ic = nhwc_weight + deconv_param->kernel_plane_ * output_channel * ic;
    float16_t *dst_ic = current_unit_weight + tmp_kernel_plane * output_channel * ic;
    for (int uhi = 0; uhi < unit->h_size_; uhi++) {
      for (int uwi = 0; uwi < unit->w_size_; uwi++) {
        int src_h_offset = unit->h_start_ + uhi * conv_param->stride_h_;
        int src_w_offset = unit->w_start_ + uwi * conv_param->stride_w_;
        float16_t *src_hw = src_ic + (src_h_offset * conv_param->kernel_w_ + src_w_offset) * output_channel;
        float16_t *dst_hw = dst_ic + (uhi * unit->w_size_ + uwi) * output_channel;
        memcpy(dst_hw, src_hw, output_channel * sizeof(float16_t));
      }
    }
  }

  if (unit->use_winograd_) {
    /* Generate winograd */
    float matrix_g[64];
    float matrix_gt[64];
    float matrix_a[64];
    float matrix_at[64];
    float matrix_b[64];
    float matrix_bt[64];
    int ret = CookToomFilter(matrix_a, matrix_at, matrix_b, matrix_bt, matrix_g, matrix_gt, 0.5f,
                             DECONV_WINOGRAD_DEFAULT_UNIT, unit->h_size_);
    if (ret != NNACL_OK) {
      return NNACL_ERRCODE_WINOGRAD_GENERATOR_ERROR;
    }

    /* winograd AT */
    unit->winograd_.AT_ = malloc(unit->winograd_.i_ * unit->winograd_.o_ * sizeof(float16_t));
    if (unit->winograd_.AT_ == NULL) {
      return NNACL_NULL_PTR;
    }
    Float32ToFloat16(matrix_at, unit->winograd_.AT_, unit->winograd_.i_ * unit->winograd_.o_);

    /* winograd BT */
    unit->winograd_.BT_ = malloc(unit->winograd_.o_ * unit->winograd_.o_ * sizeof(float16_t));
    if (unit->winograd_.BT_ == NULL) {
      return NNACL_NULL_PTR;
    }
    Float32ToFloat16(matrix_bt, unit->winograd_.BT_, unit->winograd_.o_ * unit->winograd_.o_);

    /* winograd Weight */
    size = conv_param->input_channel_ * output_channel * unit->winograd_.kh_ * unit->winograd_.kw_;
    float16_t *winograd_unit_weight = (float16_t *)malloc(size * sizeof(float16_t));
    if (winograd_unit_weight == NULL) {
      return NNACL_NULL_PTR;
    }

    WinogradWeightTransformFp16(current_unit_weight, winograd_unit_weight, matrix_g, matrix_gt, C4NUM,
                                unit->winograd_.kh_, unit->h_size_, output_channel, conv_param->input_channel_, false);

    /* reset weight data & info */
    tmp_kernel_plane = unit->winograd_.kh_ * unit->winograd_.kw_;
    free(current_unit_weight);
    current_unit_weight = winograd_unit_weight;
    winograd_unit_weight = NULL;
  }

  /* trans mhwc -> hw1:k1-knc0-c4:k1-knc5-c8:hw2:k1-knc0-c4:k1 */
  float16_t *dst_weight = (float16_t *)unit->weight_;
  size = deconv_param->ic_up4_ * deconv_param->oc_up4_ * tmp_kernel_plane;
  memset(dst_weight, 0, size * sizeof(float16_t));
  for (int ic = 0; ic < conv_param->input_channel_; ic++) {
    for (int oc = 0; oc < output_channel; oc++) {
      int oc4div = oc / C4NUM, oc4mod = oc % C4NUM;
      for (int upi = 0; upi < tmp_kernel_plane; upi++) {
        int src_index = ic * output_channel * tmp_kernel_plane + upi * output_channel + oc;
        int dst_index = upi * deconv_param->oc_up4_ * deconv_param->ic_up4_ + oc4div * C4NUM * deconv_param->ic_up4_ +
                        ic * C4NUM + oc4mod;
        dst_weight[dst_index] = current_unit_weight[src_index];
      }
    }
  }

  free(current_unit_weight);
  return NNACL_OK;
}
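The relayout loop at the end of PackDeConvWgDataFp16 turns plane-ordered weights into the plane-major, C4-blocked layout the C4 gemm expects: transform-plane index outermost, then output-channel quad, then input channel, then the slot inside the quad. A hypothetical trace, assuming C4NUM = 4 and padded channel counts ic_up4 = 4 and oc_up4 = 8, makes the dst_index arithmetic concrete:

#include <stdio.h>

/* Hypothetical trace of the dst_index arithmetic above. */
int main(void) {
  const int C4 = 4, ic_up4 = 4, oc_up4 = 8;
  int ic = 1, oc = 5, upi = 2; /* input chan 1, output chan 5, plane 2 */
  int oc4div = oc / C4, oc4mod = oc % C4;
  int dst_index = upi * oc_up4 * ic_up4 /* skip two whole planes      */
                + oc4div * C4 * ic_up4  /* second output-channel quad */
                + ic * C4               /* row for input channel 1    */
                + oc4mod;               /* slot 1 inside the quad     */
  printf("dst_index = %d\n", dst_index); /* 64 + 16 + 4 + 1 = 85 */
  return 0;
}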
void DeconvWgFp16(float16_t *nhwc_input_, float16_t *tile_in, float16_t *tile_out, int start_index,
                  int calculate_count, ConvParameter *conv_param, DeConvParam *deconv_param, int task_id) {
  /* pack tile input */
  int tile_in_unit_stride = deconv_param->ic_up4_ * DECONV_WINOGRAD_DEFAULT_TILE;
  float16x4_t zero = vdup_n_f16(0.0f);

  for (int unit_index = 0; unit_index < calculate_count; unit_index++) {
    int plane_index = start_index + unit_index;
    int w_unit_index = plane_index % deconv_param->in_tile_w_count_;
    int h_unit_index = plane_index / deconv_param->in_tile_w_count_;
    int w_start = w_unit_index * DECONV_WINOGRAD_DEFAULT_UNIT;
    int h_start = h_unit_index * DECONV_WINOGRAD_DEFAULT_UNIT;

    float16_t *dst_unit = tile_in + unit_index * C4NUM;
    for (int hi = 0; hi < DECONV_WINOGRAD_DEFAULT_UNIT; hi++) {
      for (int wi = 0; wi < DECONV_WINOGRAD_DEFAULT_UNIT; wi++) {
        float16_t *dst = dst_unit + (wi + hi * DECONV_WINOGRAD_DEFAULT_UNIT) * tile_in_unit_stride;
        int w_index = w_start + wi;
        int h_index = h_start + hi;
        if (w_index >= conv_param->input_w_ || h_index >= conv_param->input_h_) {
          for (int ic4_index = 0; ic4_index < deconv_param->ic_div4_; ic4_index++) {
            vst1_f16(dst + ic4_index * DECONV_WINOGRAD_DEFAULT_TILE * C4NUM, zero);
          }
          continue;
        }

        float16_t *src = nhwc_input_ + (w_index + h_index * conv_param->input_w_) * conv_param->input_channel_;
        DeConvWgInputPackFp16(src, dst, conv_param->input_channel_, DECONV_WINOGRAD_DEFAULT_TILE * C4NUM);
      }
    }
  }

  /* compute */
  for (int i = 0; i < deconv_param->compute_size_; i++) {
    DeConvComputeUnit *unit = &deconv_param->compute_units_[i];
    if (unit->use_winograd_) {
      float16_t *tmp_buf = (float16_t *)unit->tmp_buffer_ + task_id * unit->winograd_.kh_ * unit->winograd_.kw_ *
                                                              deconv_param->oc_up4_ * DECONV_WINOGRAD_DEFAULT_TILE;

      /* winograd a buffer */
      DeConvWgABuffer *tmp_a = &deconv_param->a_buffer_[unit->winograd_.kh_];
      float16_t *mid_a = (float16_t *)tmp_a->middle_buffer_ + task_id * unit->winograd_.kw_ * unit->winograd_.kh_ *
                                                                DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->ic_up4_;
      float16_t *dst_a = (float16_t *)tmp_a->dest_buffer_ + task_id * unit->winograd_.kw_ * unit->winograd_.kh_ *
                                                              DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->ic_up4_;
      float16_t *tmp_b = (float16_t *)unit->winograd_.b_buffer_ + task_id * unit->winograd_.kh_ * unit->winograd_.kw_ *
                                                                    DECONV_WINOGRAD_DEFAULT_TILE *
                                                                    deconv_param->oc_up4_;
      _deConvWinogradFp16(tile_in, tile_out, (float16_t *)unit->weight_, tmp_buf, unit->winograd_.AT_, mid_a, dst_a,
                          tmp_a->trans_formed_, unit->winograd_.BT_, tmp_b, unit->winograd_.kh_, unit->w_start_,
                          unit->h_start_, conv_param, deconv_param);
      tmp_a->trans_formed_ = true;
    } else {
      float16_t *tmp_buf = (float16_t *)unit->tmp_buffer_ + task_id * deconv_param->oc_div4_ * unit->w_size_ *
                                                              unit->h_size_ * DECONV_WINOGRAD_DEFAULT_TILE * C4NUM;
      _deConvCommonFp16(tile_in, tile_out, (float16_t *)unit->weight_, tmp_buf, unit->h_start_, unit->w_start_,
                        unit->h_size_, unit->w_size_, conv_param, deconv_param);
    }
  }
  return;
}

void DeconvWgPostFp16(float16_t *tile_out, float16_t *nc4hw4_output, ConvParameter *conv_param,
                      DeConvParam *deconv_param, int calculate_count, int tile_index) {
  /* merge */
  int src_unit_stride = deconv_param->oc_up4_ * DECONV_WINOGRAD_DEFAULT_TILE;

  int src_stride = DECONV_WINOGRAD_DEFAULT_TILE * C4NUM;
  int dst_stride = conv_param->output_w_ * conv_param->output_h_ * C4NUM;

  for (int index = 0; index < calculate_count; ++index) {
    float16_t *src_start = tile_out + index * C4NUM;

    int plane_index = tile_index * DECONV_WINOGRAD_DEFAULT_TILE + index;
    int w_unit_index = plane_index % deconv_param->in_tile_w_count_;
    int h_unit_index = plane_index / deconv_param->in_tile_w_count_;
    int w_start = w_unit_index * DECONV_WINOGRAD_DEFAULT_UNIT * conv_param->stride_w_ - conv_param->pad_l_;
    int h_start = h_unit_index * DECONV_WINOGRAD_DEFAULT_UNIT * conv_param->stride_h_ - conv_param->pad_u_;
    float16_t *dst_start = nc4hw4_output + h_start * conv_param->output_w_ * C4NUM + w_start * C4NUM;

    int merge_w_start = MSMAX(-w_start, 0);
    int merge_h_start = MSMAX(-h_start, 0);
    int merge_h_end = MSMIN(deconv_param->out_tile_h_, conv_param->output_h_ - h_start);
    int merge_w_end = MSMIN(deconv_param->out_tile_w_, conv_param->output_w_ - w_start);

    for (int hi = merge_h_start; hi < merge_h_end; hi++) {
      for (int wi = merge_w_start; wi < merge_w_end; wi++) {
        float16_t *src = src_start + (hi * deconv_param->out_tile_w_ + wi) * src_unit_stride;
        float16_t *dst = dst_start + (hi * conv_param->output_w_ + wi) * C4NUM;
        DeConvWgMergeFp16(src, dst, src_stride, dst_stride, deconv_param->oc_div4_);
      }
    }
  }
  return;
}
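DeconvWgPostFp16 clips each output tile against the padded output before merging: w_start and h_start can go negative by pad_l_ and pad_u_, and the MSMAX/MSMIN window drops the rows and columns that fall outside the real output. A small trace with made-up sizes shows the clipping:

/* Hypothetical trace of the clip window, assuming pad_l = 1, out_tile_w = 12,
 * output_w = 20, and a tile whose w_start lands at -1 (left edge). */
#include <stdio.h>
#define MSMAX(a, b) ((a) > (b) ? (a) : (b))
#define MSMIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
  int w_start = -1, out_tile_w = 12, output_w = 20;
  int merge_w_start = MSMAX(-w_start, 0);                  /* 1: skip the padded column */
  int merge_w_end = MSMIN(out_tile_w, output_w - w_start); /* 12: tile fits on the right */
  printf("merge columns [%d, %d)\n", merge_w_start, merge_w_end);
  return 0;
}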
@@ -0,0 +1,39 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_NNACL_FP16_DECONV_WINOGRAD_FP16_H_
#define MINDSPORE_LITE_NNACL_FP16_DECONV_WINOGRAD_FP16_H_

#include "nnacl/fp16/winograd_transform_fp16.h"

#ifdef __cplusplus
extern "C" {
#endif

int PackDeConvWgDataFp16(float16_t *nhwc_weight, DeConvComputeUnit *unit, ConvParameter *conv_param,
                         DeConvParam *deconv_param);

void DeconvWgFp16(float16_t *nhwc_input_, float16_t *tile_in, float16_t *tile_out, int start_index,
                  int calculate_count, ConvParameter *conv_param, DeConvParam *deconv_param, int task_id);

void DeconvWgPostFp16(float16_t *tile_out, float16_t *nc4hw4_output, ConvParameter *conv_param,
                      DeConvParam *deconv_param, int calculate_count, int tile_index);

#ifdef __cplusplus
}
#endif

#endif  // MINDSPORE_LITE_NNACL_FP16_DECONV_WINOGRAD_FP16_H_
@@ -81,3 +81,51 @@ void MatrixMultiplyVecFp16(const float16x8_t *matrix_a, const float16x8_t *matri
     }
   }
 }

void WinogradMatrixProductLeftFp16(const float16_t *S, const float16_t *B, float16_t *M, size_t w, size_t h, size_t k,
                                   size_t length) {
  int unitStep = 4 * length;
  for (int y = 0; y < h; ++y) {
    float16_t *dstY = M + y * w * unitStep;
    for (int x = 0; x < w; ++x) {
      float16_t *dstX = dstY + x * unitStep;
      const float16_t *srcX = S + x * unitStep;
      memset(dstX, 0, unitStep * sizeof(float16_t));
      for (int i = 0; i < k; ++i) {
        float16_t b = B[i * h + y];
        const float16_t *srcY = srcX + i * w * unitStep;
        if (0.0f == b) {
          continue;
        }
        for (int j = 0; j < unitStep; ++j) {
          dstX[j] += srcY[j] * b;
        }
      }
    }
  }
}

// M = S * B , M = w*h * l, S = k*h * l, B = w*k
void WinogradMatrixProductRightFp16(const float16_t *S, const float16_t *B, float16_t *M, size_t w, size_t h, size_t k,
                                    size_t length) {
  int unitStep = 4 * length;
  for (int y = 0; y < h; ++y) {
    float16_t *dstY = M + y * w * unitStep;
    const float16_t *srcY = S + y * k * unitStep;

    for (int x = 0; x < w; ++x) {
      float16_t *dstX = dstY + x * unitStep;
      memset(dstX, 0, unitStep * sizeof(float16_t));
      for (int i = 0; i < k; ++i) {
        const float16_t *srcX = srcY + i * unitStep;
        float16_t b = B[i * h + x];
        if (0.0f == b) {
          continue;
        }
        for (int j = 0; j < unitStep; ++j) {
          dstX[j] += srcX[j] * b;
        }
      }
    }
  }
}
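Both product routines treat each matrix element as a vector of 4 * length fp16 values, so the "matrices" here are small Winograd transform matrices applied across whole channel tiles. A quick way to see the Left variant's semantics (effectively M = Bᵀ·S, with B indexed as B[i * h + y]): with B a 2x2 identity and a single unit column, M must equal S. A hypothetical check:

#include "nnacl/fp16/matrix_fp16.h"

/* Hypothetical sanity check: identity B reproduces S (h = k = 2, w = 1, length = 1). */
void WinogradProductLeftIdentityDemo(void) {
  float16_t S[8] = {1, 2, 3, 4, 5, 6, 7, 8}; /* two rows of one 4-wide unit */
  float16_t B[4] = {1, 0, 0, 1};             /* 2x2 identity */
  float16_t M[8] = {0};
  WinogradMatrixProductLeftFp16(S, B, M, 1, 2, 2, 1);
  /* M now equals S element-for-element. */
}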
@@ -18,6 +18,7 @@
 #define MINDSPORE_LITE_NNACL_FP16_MATRIX_FP16_H_

 #include <arm_neon.h>
+#include <string.h>

 #ifdef __cplusplus
 extern "C" {

@@ -28,6 +29,13 @@ void MatrixMultiplyVecFp16(const float16x8_t *matrix_a, const float16x8_t *matri
                            const float16_t *bias, int m, int k, int n);
 void MatrixMultiplyWinogradFp16(const float16_t *matix_a, const float16_t *matrix_b, float16_t *matrix_c, int m, int k,
                                 int n, int in_channel);

+void WinogradMatrixProductLeftFp16(const float16_t *S, const float16_t *B, float16_t *M, size_t w, size_t h, size_t k,
+                                   size_t length);
+
+void WinogradMatrixProductRightFp16(const float16_t *S, const float16_t *B, float16_t *M, size_t w, size_t h, size_t k,
+                                    size_t length);
+
 #ifdef __cplusplus
 }
 #endif
@@ -712,3 +712,102 @@ void WinogradOutputTransformFp16(const float16_t *gemm_out, float16_t *tmp_out_d
     out_tile_index++;
   }
 }

int WinogradWeightTransformFp16(const float16_t *weight_data, float16_t *winograd_data, float *matrix_g,
                                float *matrix_gt, int oc_block, int input_unit, int kernel_unit, int filter_channel,
                                int filter_batch, bool pack) {
  // original weight format : ohwi
  int oc_block_num = UP_DIV(filter_batch, oc_block);
  int block_stride = filter_channel * oc_block;
  int block_num_stride = block_stride * oc_block_num;

  float16_t *matrix_gt_data_fp16 = (float16_t *)(malloc(input_unit * kernel_unit * sizeof(float16_t)));
  if (matrix_gt_data_fp16 == NULL) {
    return NNACL_ERRCODE_OP_FP16_WINOGRAD_GENERATOR;
  }
  Float32ToFloat16(matrix_gt, matrix_gt_data_fp16, input_unit * kernel_unit);

  // trans_filter = G*g*GT (g represents weight_data) = [(g * (G)T)T * (G)T]T
  // separate into two steps ===> tmp = (g * (G)T)T ===> out = [tmp * (G)T]T
  float16_t *tmp_data = (float16_t *)(malloc(filter_channel * input_unit * kernel_unit * sizeof(float16_t)));
  if (tmp_data == NULL) {
    free(matrix_gt_data_fp16);
    return NNACL_ERRCODE_OP_FP16_WINOGRAD_GENERATOR;
  }
  float16_t *trans_out_data = (float16_t *)(malloc(filter_channel * input_unit * input_unit * sizeof(float16_t)));
  if (trans_out_data == NULL) {
    free(tmp_data);
    free(matrix_gt_data_fp16);
    return NNACL_ERRCODE_OP_FP16_WINOGRAD_GENERATOR;
  }

#ifndef ENABLE_ARM64
  float16_t *tmp_data1 = (float16_t *)(malloc(filter_channel * input_unit * kernel_unit * sizeof(float16_t)));
  if (tmp_data1 == NULL) {
    free(tmp_data);
    free(matrix_gt_data_fp16);
    free(trans_out_data);
    return NNACL_ERRCODE_OP_FP16_WINOGRAD_GENERATOR;
  }
  float16_t *trans_out_data1 = (float16_t *)(malloc(filter_channel * input_unit * input_unit * sizeof(float16_t)));
  if (trans_out_data1 == NULL) {
    free(tmp_data);
    free(tmp_data1);
    free(matrix_gt_data_fp16);
    free(trans_out_data);
    return NNACL_ERRCODE_OP_FP16_WINOGRAD_GENERATOR;
  }
#endif

  int input_oz_offset = kernel_unit * kernel_unit * filter_channel;
  for (int i = 0; i < filter_batch; i++) {
    int out_c_block = i / oc_block;
    int out_c_res = i % oc_block;
    int output_oz_offset = out_c_block * block_stride + out_c_res;

#ifndef ENABLE_ARM64
    // tmp_data = g * GT
    MatrixMultiplyWinogradFp16(weight_data + i * input_oz_offset, matrix_gt_data_fp16, tmp_data, kernel_unit,
                               kernel_unit, input_unit, filter_channel);
    // tmp_data1 = (tmp_data)T
    PackHWCToWHCFp16(tmp_data, tmp_data1, kernel_unit, input_unit, filter_channel);
    // trans_out_data1 = tmp * GT
    MatrixMultiplyWinogradFp16(tmp_data1, matrix_gt_data_fp16, trans_out_data1, input_unit, kernel_unit, input_unit,
                               filter_channel);
    // trans_out_data = (trans_out_data1)T
    PackHWCToWHCFp16(trans_out_data1, trans_out_data, input_unit, input_unit, filter_channel);
#else
    // tmp = (g * GT)T
    MatrixMultiplyWinogradFp16(weight_data + i * input_oz_offset, matrix_gt_data_fp16, tmp_data, kernel_unit,
                               kernel_unit, input_unit, filter_channel);
    // trans = (tmp * GT)T
    MatrixMultiplyWinogradFp16(tmp_data, matrix_gt_data_fp16, trans_out_data, input_unit, kernel_unit, input_unit,
                               filter_channel);
#endif

    if (pack) {
      int in_offset = 0;
      for (int j = 0; j < input_unit; ++j) {
        for (int k = 0; k < input_unit; ++k) {
          for (int c = 0; c < filter_channel; ++c) {
            *(winograd_data + output_oz_offset + c * oc_block) = trans_out_data[in_offset + c];
          }
          in_offset += filter_channel;
          output_oz_offset += block_num_stride;
        }
      }
    } else {
      memcpy(winograd_data + i * filter_channel * input_unit * input_unit, trans_out_data,
             filter_channel * input_unit * input_unit * sizeof(float16_t));
    }
  }

#ifndef ENABLE_ARM64
  free(tmp_data1);
  free(trans_out_data1);
#endif
  free(tmp_data);
  free(trans_out_data);
  free(matrix_gt_data_fp16);
  return NNACL_OK;
}
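When pack is true, WinogradWeightTransformFp16 writes transformed weights in oc-blocked order: output channels are grouped into blocks of oc_block, and within each transform-plane slice the per-channel values sit oc_block apart. A hypothetical trace of the offsets, assuming oc_block = 4 (the C4NUM value the deconv path passes) and 10 output channels:

#include <stdio.h>

#define UP_DIV(x, y) (((x) + (y) - 1) / (y))

int main(void) {
  const int oc_block = 4, filter_batch = 10, filter_channel = 3;
  int oc_block_num = UP_DIV(filter_batch, oc_block);  /* 3 blocks */
  int block_stride = filter_channel * oc_block;       /* 12 */
  int block_num_stride = block_stride * oc_block_num; /* 36 */
  int i = 9;                                          /* output channel 9 */
  int out_c_block = i / oc_block, out_c_res = i % oc_block;
  int output_oz_offset = out_c_block * block_stride + out_c_res;
  printf("channel 9 starts at offset %d; transform planes step by %d\n",
         output_oz_offset, block_num_stride); /* 25, 36 */
  return 0;
}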
@@ -19,9 +19,10 @@

 #include <arm_neon.h>
 #include <string.h>
 #include "nnacl/fp16/pack_fp16.h"
 #include "nnacl/errorcode.h"
 #include "nnacl/fp16/cast_fp16.h"
 #include "nnacl/fp16/conv_fp16.h"
 #include "nnacl/fp16/winograd_utils_fp16.h"
 #include "nnacl/fp16/matrix_fp16.h"

 #ifdef __cplusplus
 extern "C" {

@@ -49,6 +50,12 @@ void WinogradInputTransformFp16(const float16_t *input_data, float16_t *trans_in
 void WinogradOutputTransformFp16(const float16_t *gemm_out, float16_t *tmp_out_data, const float16_t *bias_data,
                                  int cal_num, int out_tile_index, int output_unit_num, ConvParameter *conv_param,
                                  OutputTransFp16Func func);

+// fp16 winograd weight trans
+int WinogradWeightTransformFp16(const float16_t *weight_data, float16_t *winograd_data, float *matrix_g,
+                                float *matrix_gt, int oc_block, int input_unit, int kernel_unit, int filter_channel,
+                                int filter_batch, bool pack);
+
 #ifdef __cplusplus
 }
 #endif
@@ -15,23 +15,10 @@
  */

 #include "src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h"
 #include "nnacl/fp16/matrix_fp16.h"
 #include "nnacl/fp16/conv_fp16.h"
 #include "nnacl/fp16/cast_fp16.h"
 #include "nnacl/fp16/pack_fp16.h"
 #include "nnacl/fp16/winograd_transform_fp16.h"
 #include "nnacl/fp16/winograd_utils_fp16.h"
 #include "src/runtime/kernel/arm/fp16/layout_transform_fp16.h"
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
 #include "src/runtime/runtime_api.h"

 using mindspore::kernel::KERNEL_ARCH::kCPU;
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Conv2D;

 namespace mindspore::kernel {
 int ConvolutionWinogradFP16CPUKernel::WinogradFilterTransformFp16(const float16_t *weight_data, float *matrix_g,

@@ -40,104 +27,9 @@ int ConvolutionWinogradFP16CPUKernel::WinogradFilterTransformFp16(const float16_
     MS_LOG(ERROR) << "Divide by zero";
     return RET_ERROR;
   }
-  // original weight format : ohwi
-  auto channel_in = conv_param_->input_channel_;
-  auto channel_out = conv_param_->output_channel_;
-  int oc_block_num = UP_DIV(channel_out, oc_block);
-  int block_stride = channel_in * oc_block;
-  int block_num_stride = block_stride * oc_block_num;
-
-  auto matrix_gt_data_fp16 = reinterpret_cast<float16_t *>(malloc(input_unit_ * kernel_unit_ * sizeof(float16_t)));
-  if (matrix_gt_data_fp16 == nullptr) {
-    MS_LOG(ERROR) << "malloc matrix_gt_data_fp16 failed.";
-    return RET_ERROR;
-  }
-  Float32ToFloat16(matrix_gt, matrix_gt_data_fp16, input_unit_ * kernel_unit_);
-
-  // trans_filter = G*g*GT (g represents weight_data) = [(g * (G)T)T * (G)T]T
-  // separate into two steps ===> tmp = (g * (G)T)T ===> out = [tmp * (G)T]T
-  auto tmp_data = reinterpret_cast<float16_t *>(malloc(channel_in * input_unit_ * kernel_unit_ * sizeof(float16_t)));
-  if (tmp_data == nullptr) {
-    free(matrix_gt_data_fp16);
-    MS_LOG(ERROR) << "malloc tmp_data failed.";
-    return RET_ERROR;
-  }
-  auto trans_out_data =
-    reinterpret_cast<float16_t *>(malloc(channel_in * input_unit_ * input_unit_ * sizeof(float16_t)));
-  if (trans_out_data == nullptr) {
-    free(tmp_data);
-    free(matrix_gt_data_fp16);
-    MS_LOG(ERROR) << "malloc trans_out_data failed.";
-    return RET_ERROR;
-  }
-
-#ifndef ENABLE_ARM64
-  auto tmp_data1 = reinterpret_cast<float16_t *>(malloc(channel_in * input_unit_ * kernel_unit_ * sizeof(float16_t)));
-  if (tmp_data1 == nullptr) {
-    free(tmp_data);
-    free(matrix_gt_data_fp16);
-    free(trans_out_data);
-    MS_LOG(ERROR) << "malloc tmp_data1 failed.";
-    return RET_ERROR;
-  }
-  auto trans_out_data1 =
-    reinterpret_cast<float16_t *>(malloc(channel_in * input_unit_ * input_unit_ * sizeof(float16_t)));
-  if (trans_out_data1 == nullptr) {
-    free(tmp_data);
-    free(tmp_data1);
-    free(matrix_gt_data_fp16);
-    free(trans_out_data);
-    MS_LOG(ERROR) << "malloc trans_out_data1 failed.";
-    return RET_ERROR;
-  }
-#endif
-
-  int input_oz_offset = kernel_unit_ * kernel_unit_ * channel_in;
-  for (int i = 0; i < channel_out; i++) {
-    int out_c_block = i / oc_block;
-    int out_c_res = i % oc_block;
-    int output_oz_offset = out_c_block * block_stride + out_c_res;
-
-#ifndef ENABLE_ARM64
-    // tmp_data = g * GT
-    MatrixMultiplyWinogradFp16(weight_data + i * input_oz_offset, matrix_gt_data_fp16, tmp_data, kernel_unit_,
-                               kernel_unit_, input_unit_, channel_in);
-    // tmp_data1 = (tmp_data)T
-    PackHWCToWHCFp16(tmp_data, tmp_data1, kernel_unit_, input_unit_, channel_in);
-    // trans_out_data1 = tmp * GT
-    MatrixMultiplyWinogradFp16(tmp_data1, matrix_gt_data_fp16, trans_out_data1, input_unit_, kernel_unit_, input_unit_,
-                               channel_in);
-    // trans_out_data = (trans_out_data1)T
-    PackHWCToWHCFp16(trans_out_data1, trans_out_data, input_unit_, input_unit_, channel_in);
-#else
-    // tmp = (g * GT)T
-    MatrixMultiplyWinogradFp16(weight_data + i * input_oz_offset, matrix_gt_data_fp16, tmp_data, kernel_unit_,
-                               kernel_unit_, input_unit_, channel_in);
-    // trans = (tmp * GT)T
-    MatrixMultiplyWinogradFp16(tmp_data, matrix_gt_data_fp16, trans_out_data, input_unit_, kernel_unit_, input_unit_,
-                               channel_in);
-#endif
-
-    int in_offset = 0;
-    for (int j = 0; j < input_unit_; ++j) {
-      for (int k = 0; k < input_unit_; ++k) {
-        for (int c = 0; c < channel_in; ++c) {
-          *(trans_weight_ + output_oz_offset + c * oc_block) = trans_out_data[in_offset + c];
-        }
-        in_offset += channel_in;
-        output_oz_offset += block_num_stride;
-      }
-    }
-  }
-
-#ifndef ENABLE_ARM64
-  free(tmp_data1);
-  free(trans_out_data1);
-#endif
-  free(tmp_data);
-  free(trans_out_data);
-  free(matrix_gt_data_fp16);
-  return RET_OK;
+  return WinogradWeightTransformFp16(weight_data, trans_weight_, matrix_g, matrix_gt, oc_block, input_unit_,
+                                     kernel_unit_, conv_param_->input_channel_, conv_param_->output_channel_, true);
 }

 int ConvolutionWinogradFP16CPUKernel::InitWeightBias() {
@@ -15,6 +15,7 @@
  */

 #include "src/runtime/kernel/arm/fp16/deconvolution_fp16.h"
+#include "src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.h"
 #include "src/runtime/runtime_api.h"

 using mindspore::kernel::KERNEL_ARCH::kCPU;

@@ -63,7 +64,7 @@ int DeConvolutionFp16CPUKernel::InitWeightBias() {
   memset(bias_data_, 0, UP_ROUND(output_channel, C4NUM) * sizeof(float16_t));
   if (in_tensors_.size() == 3) {
     Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->MutableData()),
-                     reinterpret_cast<float16_t *>(bias_data_), conv_param_->output_channel_);
+                     reinterpret_cast<float16_t *>(bias_data_), output_channel);
   }

   size_t weight_pack_size = input_channel * kernel_w * kernel_h * UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);

@@ -157,9 +158,10 @@ int DeConvolutionFp16CPUKernel::DoDeconv(int task_id) {
   MatMulFp16(pack_input_, execute_weight_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->deep_,
              tmp_buf, nullptr, ActType_No, matmul_param_->deep_, matmul_param_->row_, oc * C8NUM * kernel_plane_, 0,
              OutType_C8);

   DeConvPostFp16(tmp_buf, pack_output_ + task_id * thread_stride_ * C8NUM * output_plane_,
                  reinterpret_cast<float16_t *>(bias_data_) + task_id * thread_stride_ * C8NUM,
-                 execute_output_ + task_id * thread_stride_ * C8NUM, oc_res, conv_param_);
+                 batch_output_ + task_id * thread_stride_ * C8NUM, oc_res, conv_param_);
   return RET_OK;
 }

@@ -190,7 +192,10 @@ int DeConvolutionFp16CPUKernel::Run() {
   }

   for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) {
-    RowMajor2Col16MajorFp16Opt(execute_input_, pack_input_, input_plane_, conv_param_->input_channel_);
+    batch_input_ = execute_input_ + batch_index * conv_param_->input_channel_ * input_plane_;
+    batch_output_ = execute_output_ + batch_index * conv_param_->output_channel_ * output_plane_;
+
+    RowMajor2Col16MajorFp16Opt(batch_input_, pack_input_, input_plane_, conv_param_->input_channel_);

     error_code = ParallelLaunch(this->context_->thread_pool_, DeConvFp16Run, this, thread_count_);
     if (error_code != RET_OK) {

@@ -227,7 +232,16 @@ kernel::LiteKernel *CpuDeConvFp16KernelCreator(const std::vector<lite::Tensor *>
     weight_tensor->SetData(dequant_weight);
   }

-  auto kernel = new (std::nothrow) DeConvolutionFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
+  kernel::LiteKernel *kernel;
+  auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
+  if ((conv_param->stride_h_ != 1 || conv_param->stride_w_ != 1) &&
+      (conv_param->dilation_w_ == 1 && conv_param->dilation_h_ == 1)) {
+    /* DeConvWinogradFp16CPUKernel */
+    kernel = new (std::nothrow) kernel::DeConvolutionFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
+  } else {
+    kernel = new (std::nothrow) kernel::DeConvolutionFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
+  }

   if (kernel == nullptr) {
     MS_LOG(ERROR) << "kernel is nullptr.";
     if (dequant_flag) {
@@ -17,17 +17,11 @@
 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_DECONVOLUTION_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_DECONVOLUTION_H_

 #include <float.h>
 #include <vector>
 #include "src/lite_kernel.h"
-#include "src/kernel_registry.h"
-#include "include/errorcode.h"
-#include "schema/model_generated.h"
-#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
 #include "nnacl/fp16/deconv_fp16.h"
 #include "nnacl/fp16/matmul_fp16.h"
 #include "nnacl/fp16/pack_fp16.h"
 #include "nnacl/fp16/cast_fp16.h"
+#include "src/kernel_registry.h"
+#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"

 namespace mindspore::kernel {
 class DeConvolutionFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {

@@ -65,6 +59,8 @@ class DeConvolutionFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
   float16_t *pack_input_;
   float16_t *pack_output_;
   float16_t *tmp_buffer_;
+  float16_t *batch_input_;
+  float16_t *batch_output_;
 };
 }  // namespace mindspore::kernel
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_DECONVOLUTION_H_
@@ -0,0 +1,311 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.h"
#include "src/runtime/runtime_api.h"

using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_NULL_PTR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_DeConv2D;
using mindspore::schema::Format::Format_NHWC;

namespace mindspore::kernel {

DeConvWinogradFp16CPUKernel::~DeConvWinogradFp16CPUKernel() {
  FreeResizeBuf();
  FreeDeconvParam();
  return;
}

void DeConvWinogradFp16CPUKernel::FreeResizeBuf() { return; }

void DeConvWinogradFp16CPUKernel::FreeDeconvParam() {
  if (deconv_param_ != nullptr) {
    delete deconv_param_;
    deconv_param_ = nullptr;
  }
  return;
}

int DeConvWinogradFp16CPUKernel::InitParameter() {
  deconv_param_->input_plane_ = conv_param_->input_h_ * conv_param_->input_w_;
  deconv_param_->output_plane_ = conv_param_->output_h_ * conv_param_->output_w_;

  nc4hw4_output_ =
    reinterpret_cast<float16_t *>(malloc(deconv_param_->oc_up4_ * deconv_param_->output_plane_ * sizeof(float16_t)));

  deconv_param_->in_tile_w_count_ = UP_DIV(conv_param_->input_w_, DECONV_WINOGRAD_DEFAULT_UNIT);
  deconv_param_->in_tile_h_count_ = UP_DIV(conv_param_->input_h_, DECONV_WINOGRAD_DEFAULT_UNIT);

  deconv_param_->in_tile_count_ =
    UP_DIV(deconv_param_->in_tile_w_count_ * deconv_param_->in_tile_h_count_, DECONV_WINOGRAD_DEFAULT_TILE);
  deconv_param_->thread_num_ = MSMAX(1, op_parameter_->thread_num_);
  deconv_param_->thread_num_ = MSMIN(deconv_param_->thread_num_, deconv_param_->in_tile_count_);

  thread_num_hw_ = MSMIN(op_parameter_->thread_num_, deconv_param_->output_plane_);
  thread_stride_hw_ = UP_DIV(deconv_param_->output_plane_, thread_num_hw_);

  int size = deconv_param_->thread_num_ * DECONV_WINOGRAD_DEFAULT_UNIT * DECONV_WINOGRAD_DEFAULT_UNIT *
             DECONV_WINOGRAD_DEFAULT_TILE * deconv_param_->ic_up4_;
  tile_input_ = reinterpret_cast<float16_t *>(malloc(size * sizeof(float16_t)));
  memset(tile_input_, 0, size * sizeof(float16_t));

  deconv_param_->out_tile_w_ = (DECONV_WINOGRAD_DEFAULT_UNIT - 1) * conv_param_->stride_w_ + conv_param_->kernel_w_;
  deconv_param_->out_tile_h_ = (DECONV_WINOGRAD_DEFAULT_UNIT - 1) * conv_param_->stride_h_ + conv_param_->kernel_h_;
  size = deconv_param_->thread_num_ * deconv_param_->out_tile_w_ * deconv_param_->out_tile_h_ *
         DECONV_WINOGRAD_DEFAULT_TILE * deconv_param_->oc_up4_;
  tile_output_ = reinterpret_cast<float16_t *>(malloc(size * sizeof(float16_t)));

  for (int i = 0; i < deconv_param_->compute_size_; i++) {
    DeConvComputeUnit &unit = deconv_param_->compute_units_[i];
    if (unit.use_winograd_) {
      if (deconv_param_->a_buffer_[unit.winograd_.kh_].buf_init_ == false) {
        deconv_param_->a_buffer_[unit.winograd_.kh_].buf_init_ = true;
        deconv_param_->a_buffer_[unit.winograd_.kh_].trans_formed_ = false;

        size = unit.winograd_.kh_ * unit.winograd_.kw_ * DECONV_WINOGRAD_DEFAULT_TILE * deconv_param_->ic_up4_;
        deconv_param_->a_buffer_[unit.winograd_.kh_].middle_buffer_ =
          malloc(deconv_param_->thread_num_ * size * sizeof(float16_t));
        deconv_param_->a_buffer_[unit.winograd_.kh_].dest_buffer_ =
          malloc(deconv_param_->thread_num_ * size * sizeof(float16_t));
      }

      unit.winograd_.b_buffer_ = malloc(deconv_param_->thread_num_ * unit.winograd_.kh_ * unit.winograd_.kw_ *
                                        deconv_param_->oc_up4_ * DECONV_WINOGRAD_DEFAULT_TILE * sizeof(float16_t));
      unit.tmp_buffer_ = malloc(deconv_param_->thread_num_ * unit.winograd_.kh_ * unit.winograd_.kw_ *
                                deconv_param_->oc_div4_ * DECONV_WINOGRAD_DEFAULT_TILE * C4NUM * sizeof(float16_t));

    } else {
      unit.tmp_buffer_ = malloc(deconv_param_->thread_num_ * deconv_param_->oc_div4_ * unit.w_size_ * unit.h_size_ *
                                DECONV_WINOGRAD_DEFAULT_TILE * C4NUM * sizeof(float16_t));
    }
  }

  return RET_OK;
}
int DeConvWinogradFp16CPUKernel::DoDeconv(int task_id) {
  for (int tile_index = task_id; tile_index < deconv_param_->in_tile_count_; tile_index += deconv_param_->thread_num_) {
    float16_t *tile_in = tile_input_ + task_id * DECONV_WINOGRAD_DEFAULT_UNIT * DECONV_WINOGRAD_DEFAULT_UNIT *
                                         DECONV_WINOGRAD_DEFAULT_TILE * deconv_param_->ic_up4_;
    int size = deconv_param_->out_tile_w_ * deconv_param_->out_tile_h_ * DECONV_WINOGRAD_DEFAULT_TILE *
               deconv_param_->oc_div4_ * C4NUM;
    float16_t *tile_out = tile_output_ + task_id * size;
    memset(tile_out, 0, size * sizeof(float16_t));

    int start_index = tile_index * DECONV_WINOGRAD_DEFAULT_TILE;
    int calculate_count = MSMIN(DECONV_WINOGRAD_DEFAULT_TILE,
                                deconv_param_->in_tile_w_count_ * deconv_param_->in_tile_h_count_ - start_index);

    for (int i = 0; i < DECONV_WINOGRAD_BUFFER_COUNT; i++) {
      deconv_param_->a_buffer_[i].trans_formed_ = false;
    }
    DeconvWgFp16(nhwc_input_, tile_in, tile_out, start_index, calculate_count, conv_param_, deconv_param_, task_id);

    std::unique_lock<std::mutex> merge_lock(lock_);
    DeconvWgPostFp16(tile_out, nc4hw4_output_, conv_param_, deconv_param_, calculate_count, tile_index);
  }
  return RET_OK;
}

int DeConvWinogradFp16CPUKernel::DeDeconvPost(int task_id) {
  int rest_plane = deconv_param_->output_plane_ - task_id * thread_stride_hw_;
  int current_plane = MSMIN(rest_plane, thread_stride_hw_);
  if (current_plane <= 0) {
    return RET_OK;
  }

  PostConvFuncFp16C4(nc4hw4_output_ + task_id * thread_stride_hw_ * C4NUM,
                     nhwc_output_ + task_id * thread_stride_hw_ * conv_param_->output_channel_,
                     reinterpret_cast<float16_t *>(bias_data_), conv_param_->output_channel_, current_plane,
                     deconv_param_->output_plane_, conv_param_->act_type_);
  return RET_OK;
}

int DeConvWgFp16Run(void *cdata, int task_id) {
  auto deconvWg = reinterpret_cast<DeConvWinogradFp16CPUKernel *>(cdata);
  deconvWg->DoDeconv(task_id);
  return RET_OK;
}

int DeConvWgPostFp16Run(void *cdata, int task_id) {
  auto deconvWg = reinterpret_cast<DeConvWinogradFp16CPUKernel *>(cdata);
  deconvWg->DeDeconvPost(task_id);
  return RET_OK;
}

int DeConvWinogradFp16CPUKernel::InitComputeParam() {
  auto weight_tensor = in_tensors_[1];

  conv_param_->input_channel_ = weight_tensor->Batch();
  conv_param_->output_channel_ = weight_tensor->Channel();
  conv_param_->kernel_w_ = weight_tensor->Width();
  conv_param_->kernel_h_ = weight_tensor->Height();

  deconv_param_->kernel_plane_ = conv_param_->kernel_w_ * conv_param_->kernel_h_;
  deconv_param_->ic_div4_ = UP_DIV(conv_param_->input_channel_, C4NUM);
  deconv_param_->oc_div4_ = UP_DIV(conv_param_->output_channel_, C4NUM);
  deconv_param_->ic_up4_ = deconv_param_->ic_div4_ * C4NUM;
  deconv_param_->oc_up4_ = deconv_param_->oc_div4_ * C4NUM;

  deconv_param_->compute_size_ = 0;
  for (int si_h = 0; si_h < conv_param_->stride_h_; si_h++) {
    for (int si_w = 0; si_w < conv_param_->stride_w_; si_w++) {
      if (si_h < conv_param_->kernel_h_ && si_w < conv_param_->kernel_w_) {
        deconv_param_->compute_size_++;
      }
    }
  }

  int size = deconv_param_->compute_size_ * sizeof(DeConvComputeUnit);
  deconv_param_->compute_units_ = reinterpret_cast<DeConvComputeUnit *>(malloc(size));
  if (deconv_param_->compute_units_ == nullptr) {
    return RET_NULL_PTR;
  }
  int cur_count = 0;
  for (int si_h = 0; si_h < conv_param_->stride_h_; si_h++) {
    if (si_h >= conv_param_->kernel_h_) {
      continue;
    }
    for (int si_w = 0; si_w < conv_param_->stride_w_; si_w++) {
      if (si_w >= conv_param_->kernel_w_) {
        continue;
      }

      int h_size = 1 + (conv_param_->kernel_h_ - si_h - 1) / conv_param_->stride_h_;
      int w_size = 1 + (conv_param_->kernel_w_ - si_w - 1) / conv_param_->stride_w_;

      DeConvComputeUnit unit;

      unit.h_start_ = si_h;
      unit.w_start_ = si_w;
      unit.h_size_ = h_size;
      unit.w_size_ = w_size;

      if (h_size == w_size) {
        unit.use_winograd_ = true;

        unit.winograd_.k_ = unit.h_size_;
        unit.winograd_.i_ = DECONV_WINOGRAD_DEFAULT_UNIT;
        unit.winograd_.o_ = DECONV_WINOGRAD_DEFAULT_UNIT + unit.h_size_ - 1;
        unit.winograd_.kh_ = unit.h_size_ + DECONV_WINOGRAD_DEFAULT_UNIT - 1;
        unit.winograd_.kw_ = unit.w_size_ + DECONV_WINOGRAD_DEFAULT_UNIT - 1;

        unit.winograd_.b_buffer_ = nullptr;
        unit.weight_ = malloc(unit.winograd_.kh_ * unit.winograd_.kw_ * deconv_param_->oc_up4_ *
                              deconv_param_->ic_up4_ * sizeof(float16_t));
      } else {
        unit.use_winograd_ = false;
        unit.weight_ = malloc(h_size * w_size * deconv_param_->ic_up4_ * deconv_param_->oc_up4_ * sizeof(float16_t));
      }
      unit.tmp_buffer_ = nullptr;
      deconv_param_->compute_units_[cur_count] = unit;
      cur_count++;
    }
  }

  return RET_OK;
}
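InitComputeParam decomposes a strided deconvolution into one compute unit per (si_h, si_w) phase of the stride, and a unit takes the Winograd path only when its kernel footprint is square. For stride 2 with a 3x3 kernel this yields four units, two of them square; the hypothetical enumeration below just replays the same arithmetic:

#include <stdio.h>

int main(void) {
  const int stride = 2, kernel = 3;
  for (int si_h = 0; si_h < stride; si_h++) {
    for (int si_w = 0; si_w < stride; si_w++) {
      if (si_h >= kernel || si_w >= kernel) continue;
      int h_size = 1 + (kernel - si_h - 1) / stride;
      int w_size = 1 + (kernel - si_w - 1) / stride;
      printf("unit (%d,%d): %dx%d %s\n", si_h, si_w, h_size, w_size,
             h_size == w_size ? "winograd" : "common");
    }
  }
  /* unit (0,0): 2x2 winograd, (0,1): 2x1 common, (1,0): 1x2 common, (1,1): 1x1 winograd */
  return 0;
}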
int DeConvWinogradFp16CPUKernel::InitDataParam() {
  /* unit data : weight & winograd data */
  auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteFilter();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Get Execute filter failed.";
    return ret;
  }
  for (int i = 0; i < deconv_param_->compute_size_; i++) {
    DeConvComputeUnit *unit = &deconv_param_->compute_units_[i];
    ret = PackDeConvWgDataFp16(fp16_weight_, unit, conv_param_, deconv_param_);
    if (ret != RET_OK) {
      return ret;
    }
  }

  /* bias */
  bias_data_ = malloc(deconv_param_->oc_up4_ * sizeof(float16_t));
  if (bias_data_ == nullptr) {
    MS_LOG(ERROR) << "malloc bias_data_ failed.";
    return RET_ERROR;
  }
  memset(bias_data_, 0, deconv_param_->oc_up4_ * sizeof(float16_t));
  auto fp16_bias_data = reinterpret_cast<float16_t *>(bias_data_);
  if (in_tensors_.size() == kInputSize2) {
    auto src_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->MutableData());
    for (int i = 0; i < conv_param_->output_channel_; ++i) {
      fp16_bias_data[i] = (float16_t)src_bias[i];
    }
  } else {
    MS_ASSERT(inputs_.size() == kInputSize1);
  }

  return RET_OK;
}

int DeConvWinogradFp16CPUKernel::ReSize() {
  FreeResizeBuf();
  ConvolutionBaseCPUKernel::Init();
  InitParameter();
  return RET_OK;
}

int DeConvWinogradFp16CPUKernel::Init() {
  int error_code = InitComputeParam();
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "InitComputeParam error! ret: " << error_code;
    return error_code;
  }

  error_code = InitDataParam();
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "InitWeightBias error! ret: " << error_code;
    return error_code;
  }

  if (!InferShapeDone()) {
    return RET_OK;
  }

  return ReSize();
}

int DeConvWinogradFp16CPUKernel::Run() {
  auto prepare_ret = Prepare();
  if (prepare_ret != RET_OK) {
    MS_LOG(ERROR) << "Prepare fail! ret: " << prepare_ret;
    return prepare_ret;
  }

  ConvolutionBaseFP16CPUKernel::GetExecuteTensor();

  for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) {
    nhwc_input_ = execute_input_ + batch_index * deconv_param_->input_plane_ * conv_param_->input_channel_;
    nhwc_output_ = execute_output_ + batch_index * deconv_param_->output_plane_ * conv_param_->output_channel_;

    ::memset(nc4hw4_output_, 0, deconv_param_->output_plane_ * deconv_param_->oc_div4_ * C4NUM * sizeof(float16_t));
    ParallelLaunch(this->context_->thread_pool_, DeConvWgFp16Run, this, deconv_param_->thread_num_);

    /* post bias activate and nhwc */
    ParallelLaunch(this->context_->thread_pool_, DeConvWgPostFp16Run, this, thread_num_hw_);
  }

  ConvolutionBaseFP16CPUKernel::IfCastOutput();
  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();

  return RET_OK;
}
}  // namespace mindspore::kernel
@@ -0,0 +1,67 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_DECONVOLUTION_WINOGRAD_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_DECONVOLUTION_WINOGRAD_H_

#include <mutex>  // needed for the std::mutex member below
#include <vector>
#include "include/errorcode.h"
#include "nnacl/fp16/common_func_fp16.h"
#include "nnacl/fp16/deconv_winograd_fp16.h"
#include "nnacl/fp16/pack_fp16.h"
#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"

namespace mindspore::kernel {
class DeConvWinogradFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
 public:
  DeConvWinogradFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                              const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
                              const mindspore::lite::PrimitiveC *primitive)
      : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {
    deconv_param_ = new DeConvParam();
    for (auto &wg : deconv_param_->a_buffer_) {
      wg.buf_init_ = false;
    }
  }
  ~DeConvWinogradFp16CPUKernel() override;
  int Init() override;
  int Run() override;
  int ReSize() override;

 public:
  int DoDeconv(int task_id);
  int DeDeconvPost(int task_id);

 private:
  int InitComputeParam();
  int InitDataParam();
  int InitParameter();
  void FreeDeconvParam();
  void FreeResizeBuf();

 private:
  DeConvParam *deconv_param_;
  std::mutex lock_;
  float16_t *nhwc_input_ = nullptr;
  float16_t *nhwc_output_ = nullptr;
  float16_t *nc4hw4_output_ = nullptr;
  float16_t *tile_input_ = nullptr;
  float16_t *tile_output_ = nullptr;
  int thread_num_hw_;
  int thread_stride_hw_;
};
}  // namespace mindspore::kernel
#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_DECONVOLUTION_WINOGRAD_H_
@@ -17,16 +17,11 @@
 #include "src/runtime/kernel/arm/fp32/deconvolution_winograd.h"
 #include "src/runtime/runtime_api.h"

 using mindspore::kernel::KERNEL_ARCH::kCPU;
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_NULL_PTR;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_DeConv2D;
 using mindspore::schema::Format::Format_NHWC;

 namespace mindspore::kernel {

 DeConvolutionWinogradCPUKernel::~DeConvolutionWinogradCPUKernel() {
   FreeResizeBuf();
   FreeDeconvParam();

@@ -352,10 +347,7 @@ int DeConvolutionWinogradCPUKernel::Run() {
     nhwc_output_ = src_out + batch_index * deconv_param_->output_plane_ * conv_param_->output_channel_;

     ::memset(nc4hw4_output_, 0, deconv_param_->output_plane_ * deconv_param_->oc_div4_ * C4NUM * sizeof(float));
-    for (int i = 0; i < deconv_param_->thread_num_; i++) {
-      DoDeconv(i);
-    }
-    // ParallelLaunch(this->context_->thread_pool_, DeConvWgFp32Run, this, deconv_param_->thread_num_);
+    ParallelLaunch(this->context_->thread_pool_, DeConvWgFp32Run, this, deconv_param_->thread_num_);

     /* post bias activate and nhwc */
     ParallelLaunch(this->context_->thread_pool_, DeConvWgPostFp32Run, this, thread_num_hw_);