forked from mindspore-Ecosystem/mindspore
!8277 [MS][LITE][CPU]optimize int8 reduce mean && extract functions
From: @fuzhiye Reviewed-by: @zhang_xue_tong Signed-off-by:
commit 4330052001

@@ -20,6 +20,160 @@
#include "nnacl/quantization/fixed_point.h"
#include "nnacl/common_func.h"

int ReduceMeanN(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}
int ReduceMeanH(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}
int ReduceMeanW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}
int ReduceMeanC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}
int ReduceMeanNH(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}
int ReduceMeanNW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}
int ReduceMeanNC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}
int ReduceMeanHW(int n, int plane, int count, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg,
                 int32_t bias) {
  int stride = plane * UP_ROUND(c, C4NUM);
  for (int batch = 0; batch < n; ++batch) {
    int8_t *in_ptr = in_data + batch * stride;
    int8_t *out_ptr = out_data + batch * c;
    for (int i = 0; i < count; ++i) {
      int32_t sum_array = 0;
      int j = 0;
#ifdef ENABLE_ARM64
      for (; j < plane; j += 16) {
        int8x16_t in_data_vec = vld1q_s8(in_ptr);
        sum_array += vaddlvq_s8(in_data_vec);
        in_ptr += 16;
      }
      for (; j < plane; j += 8) {
        int8x8_t in_data_vec = vld1_s8(in_ptr);
        sum_array += vaddlv_s8(in_data_vec);
        in_ptr += 8;
      }
      for (; j < plane; j += 4) {
        int32x4_t in_data_vec;
        in_data_vec[0] = in_ptr[0];
        in_data_vec[1] = in_ptr[1];
        in_data_vec[2] = in_ptr[2];
        in_data_vec[3] = in_ptr[3];
        sum_array += vaddvq_s32(in_data_vec);
        in_ptr += 4;
      }
#elif ENABLE_ARM32
      int32x4_t accum = vmovq_n_s32(0);
      for (; j < plane; j += 16) {
        int32x4_t in_data_vec1;
        int32x4_t in_data_vec2;
        int32x4_t in_data_vec3;
        int32x4_t in_data_vec4;
        in_data_vec1[0] = in_ptr[0];
        in_data_vec1[1] = in_ptr[1];
        in_data_vec1[2] = in_ptr[2];
        in_data_vec1[3] = in_ptr[3];
        in_data_vec2[0] = in_ptr[4];
        in_data_vec2[1] = in_ptr[5];
        in_data_vec2[2] = in_ptr[6];
        in_data_vec2[3] = in_ptr[7];
        in_data_vec3[0] = in_ptr[8];
        in_data_vec3[1] = in_ptr[9];
        in_data_vec3[2] = in_ptr[10];
        in_data_vec3[3] = in_ptr[11];
        in_data_vec4[0] = in_ptr[12];
        in_data_vec4[1] = in_ptr[13];
        in_data_vec4[2] = in_ptr[14];
        in_data_vec4[3] = in_ptr[15];
        accum = vaddq_s32(accum, in_data_vec1);
        accum = vaddq_s32(accum, in_data_vec2);
        accum = vaddq_s32(accum, in_data_vec3);
        accum = vaddq_s32(accum, in_data_vec4);
        in_ptr += 16;
      }
      for (; j < plane; j += 8) {
        int32x4_t in_data_vec1;
        int32x4_t in_data_vec2;
        in_data_vec1[0] = in_ptr[0];
        in_data_vec1[1] = in_ptr[1];
        in_data_vec1[2] = in_ptr[2];
        in_data_vec1[3] = in_ptr[3];
        in_data_vec2[0] = in_ptr[4];
        in_data_vec2[1] = in_ptr[5];
        in_data_vec2[2] = in_ptr[6];
        in_data_vec2[3] = in_ptr[7];
        accum = vaddq_s32(accum, in_data_vec1);
        accum = vaddq_s32(accum, in_data_vec2);
        in_ptr += 8;
      }
      for (; j < plane; j += 4) {
        int32x4_t in_data_vec;
        in_data_vec[0] = in_ptr[0];
        in_data_vec[1] = in_ptr[1];
        in_data_vec[2] = in_ptr[2];
        in_data_vec[3] = in_ptr[3];
        accum = vaddq_s32(accum, in_data_vec);
        in_ptr += 4;
      }
      sum_array += accum[0];
      sum_array += accum[1];
      sum_array += accum[2];
      sum_array += accum[3];
#endif
      for (; j < plane; j++) {
        sum_array += in_ptr[0];
        in_ptr++;
      }
      int32_t mean =
        RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(sum_array * (1 << (unsigned int)quant_arg.left_shift_),
                                                              quant_arg.multiplier_),
                            quant_arg.right_shift_);
      mean += bias;
      mean = MSMIN(mean, INT8_MAX);
      mean = MSMAX(mean, INT8_MIN);
      out_ptr[0] = mean;
      out_ptr++;
    }
  }
  return NNACL_OK;
}
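For orientation, the fixed-point path above can be checked against a plain floating-point reference. The sketch below is not part of the commit; the function name and parameters are hypothetical, and it assumes the quant args are prepared so that multiplier_/left_shift_/right_shift_ approximate in_scale / (out_scale * plane) and bias equals out_zp - in_zp * in_scale / out_scale, as the kernel code later in this commit computes.

/* Hedged float reference for one output element of ReduceMeanHW:
 * mean of `plane` int8 inputs, requantized into the output scale. */
#include <stdint.h>

static int8_t ReduceMeanRef(int32_t sum, int plane, float in_scale, int in_zp, float out_scale, int out_zp) {
  float mean_real = (sum - (float)plane * in_zp) * in_scale / (float)plane; /* mean in real units */
  float requant = mean_real / out_scale + (float)out_zp;                    /* back to the int8 domain */
  int32_t rounded = (int32_t)(requant >= 0.0f ? requant + 0.5f : requant - 0.5f);
  if (rounded > INT8_MAX) rounded = INT8_MAX;
  if (rounded < INT8_MIN) rounded = INT8_MIN;
  return (int8_t)rounded;
}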

int ReduceMeanHC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}

int ReduceMeanWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}

int ReduceMeanNHW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}

int ReduceMeanNHC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}

int ReduceMeanNWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}

int ReduceMeanHWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}

int ReduceMeanNHWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
  return NNACL_OK;
}

// Get x such that (x - zp_in) * scale_in = mean
// Assuming n axes are reduced, this handles the first n-1 reductions. One call per reduction.
int ReduceMeanInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,

@@ -21,6 +21,23 @@
extern "C" {
#endif

int ReduceMeanN(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanH(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNH(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanHW(int n, int plane, int count, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg,
                 int32_t bias);
int ReduceMeanHC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNHW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNHC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanHWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNHWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);

int ReduceMeanInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                   int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
int ReduceMeanLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,

@@ -929,13 +929,144 @@ void PackNHWCToNCHWFp32(const void *src, void *dst, int batches, int plane, int
  return;
}

void PackNHWCToNCHWInt8(const void *src, void *dst, int batch, int plane, int channel) {
  for (int n = 0; n < batch; n++) {
    for (int c = 0; c < channel; c++) {
      for (int hw = 0; hw < plane; hw++) {
        int nhwc_index = n * channel * plane + hw * channel + c;
        int nchw_index = n * channel * plane + c * plane + hw;
        ((int8_t *)dst)[nchw_index] = ((int8_t *)src)[nhwc_index];
void PackNHWCToNCHWInt8(const void *src, void *dst, int batches, int plane, int channel) {
  int hw8 = plane / C8NUM * C8NUM;
  int c8 = channel / C8NUM * C8NUM;
  int batch = plane * channel;
  for (int n = 0; n < batches; n++) {
    const int8_t *src_batch = (const int8_t *)src + n * batch;
    int8_t *dst_batch = (int8_t *)dst + n * batch;
    int hw = 0;
    for (; hw < hw8; hw += C8NUM) {
      int c = 0;
      for (; c < c8; c += C8NUM) {
        const int8_t *src_ptr = src_batch + hw * channel + c;
        int8_t *dst_ptr = dst_batch + c * plane + hw;
#ifdef ENABLE_ARM64
        size_t srcStride = channel * sizeof(int8_t);
        size_t dstStride = plane * sizeof(int8_t);
        asm volatile(
          "mov x10, %[src_ptr]\n"
          "mov x11, %[dst_ptr]\n"

          "ld1 {v0.8b}, [x10], %[srcStride]\n"
          "ld1 {v1.8b}, [x10], %[srcStride]\n"
          "ld1 {v2.8b}, [x10], %[srcStride]\n"
          "ld1 {v3.8b}, [x10], %[srcStride]\n"

          "trn1 v4.8b, v0.8b, v1.8b\n"
          "trn2 v5.8b, v0.8b, v1.8b\n"
          "trn1 v6.8b, v2.8b, v3.8b\n"
          "trn2 v7.8b, v2.8b, v3.8b\n"

          "ld1 {v0.8b}, [x10], %[srcStride]\n"
          "ld1 {v1.8b}, [x10], %[srcStride]\n"
          "ld1 {v2.8b}, [x10], %[srcStride]\n"
          "ld1 {v3.8b}, [x10], %[srcStride]\n"

          "trn1 v8.4h, v4.4h, v6.4h\n"
          "trn2 v9.4h, v4.4h, v6.4h\n"
          "trn1 v10.4h, v5.4h, v7.4h\n"
          "trn2 v11.4h, v5.4h, v7.4h\n"

          "trn1 v4.8b, v0.8b, v1.8b\n"
          "trn2 v5.8b, v0.8b, v1.8b\n"
          "trn1 v6.8b, v2.8b, v3.8b\n"
          "trn2 v7.8b, v2.8b, v3.8b\n"

          "trn1 v12.4h, v4.4h, v6.4h\n"
          "trn2 v13.4h, v4.4h, v6.4h\n"
          "trn1 v14.4h, v5.4h, v7.4h\n"
          "trn2 v15.4h, v5.4h, v7.4h\n"

          "trn1 v0.2s, v8.2s, v12.2s\n"
          "trn2 v4.2s, v8.2s, v12.2s\n"
          "trn1 v1.2s, v10.2s, v14.2s\n"
          "trn2 v5.2s, v10.2s, v14.2s\n"
          "trn1 v2.2s, v9.2s, v13.2s\n"
          "trn2 v6.2s, v9.2s, v13.2s\n"
          "trn1 v3.2s, v11.2s, v15.2s\n"
          "trn2 v7.2s, v11.2s, v15.2s\n"

          "st1 {v0.8b}, [x11], %[dstStride]\n"
          "st1 {v1.8b}, [x11], %[dstStride]\n"
          "st1 {v2.8b}, [x11], %[dstStride]\n"
          "st1 {v3.8b}, [x11], %[dstStride]\n"
          "st1 {v4.8b}, [x11], %[dstStride]\n"
          "st1 {v5.8b}, [x11], %[dstStride]\n"
          "st1 {v6.8b}, [x11], %[dstStride]\n"
          "st1 {v7.8b}, [x11], %[dstStride]\n"
          :
          :
          [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride)
          : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
            "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
            "v30", "v31");
#elif ENABLE_ARM32
        size_t srcStride = channel * sizeof(int8_t);
        size_t dstStride = plane * sizeof(int8_t);
        asm volatile(
          "mov r10, %[src_ptr]\n"
          "mov r12, %[dst_ptr]\n"

          "vld1.8 {d0}, [r10], %[srcStride]\n"
          "vld1.8 {d1}, [r10], %[srcStride]\n"
          "vld1.8 {d2}, [r10], %[srcStride]\n"
          "vld1.8 {d3}, [r10], %[srcStride]\n"
          "vld1.8 {d4}, [r10], %[srcStride]\n"
          "vld1.8 {d5}, [r10], %[srcStride]\n"
          "vld1.8 {d6}, [r10], %[srcStride]\n"
          "vld1.8 {d7}, [r10], %[srcStride]\n"

          "vtrn.8 d0, d1\n"
          "vtrn.8 d2, d3\n"
          "vtrn.8 d4, d5\n"
          "vtrn.8 d6, d7\n"

          "vtrn.16 d0, d2\n"
          "vtrn.16 d1, d3\n"
          "vtrn.16 d4, d6\n"
          "vtrn.16 d5, d7\n"

          "vtrn.32 d0, d4\n"
          "vtrn.32 d1, d5\n"
          "vtrn.32 d2, d6\n"
          "vtrn.32 d3, d7\n"

          "vst1.8 {d0}, [r12], %[dstStride]\n"
          "vst1.8 {d1}, [r12], %[dstStride]\n"
          "vst1.8 {d2}, [r12], %[dstStride]\n"
          "vst1.8 {d3}, [r12], %[dstStride]\n"
          "vst1.8 {d4}, [r12], %[dstStride]\n"
          "vst1.8 {d5}, [r12], %[dstStride]\n"
          "vst1.8 {d6}, [r12], %[dstStride]\n"
          "vst1.8 {d7}, [r12], %[dstStride]\n"
          :
          :
          [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride)
          : "r10", "r12", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
            "q15");
#else
        for (int tr = 0; tr < C8NUM; tr++) {
          for (int tc = 0; tc < C8NUM; tc++) {
            dst_ptr[tc * plane + tr] = src_ptr[tr * channel + tc];
          }
        }
#endif
      }
      for (; c < channel; c++) {
        const int8_t *src_ptr = src_batch + hw * channel + c;
        int8_t *dst_ptr = dst_batch + c * plane + hw;
        for (size_t i = 0; i < C8NUM; i++) {
          dst_ptr[i] = src_ptr[i * channel];
        }
      }
    }
    for (; hw < plane; hw++) {
      const int8_t *src_ptr = src_batch + hw * channel;
      int8_t *dst_ptr = dst_batch + hw;
      for (size_t i = 0; i < channel; i++) {
        dst_ptr[i * plane] = src_ptr[i];
      }
    }
  }
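A quick way to sanity-check the tiled/NEON paths above is to compare them with the plain index remap dst[n][c][hw] = src[n][hw][c]. The harness below is only a sketch, assumes nothing beyond the PackNHWCToNCHWInt8 signature shown in this hunk, and deliberately uses sizes that are not multiples of 8 so the scalar tail paths are exercised too.

/* Hedged check harness for PackNHWCToNCHWInt8 (illustrative only). */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  const int batches = 2, plane = 13, channel = 11;
  int8_t *src = (int8_t *)malloc(batches * plane * channel);
  int8_t *dst = (int8_t *)malloc(batches * plane * channel);
  for (int i = 0; i < batches * plane * channel; ++i) src[i] = (int8_t)(i * 7);
  PackNHWCToNCHWInt8(src, dst, batches, plane, channel);
  int mismatch = 0;
  for (int n = 0; n < batches; ++n)
    for (int hw = 0; hw < plane; ++hw)
      for (int c = 0; c < channel; ++c)
        mismatch += dst[(n * channel + c) * plane + hw] != src[(n * plane + hw) * channel + c];
  printf("mismatches: %d\n", mismatch); /* expected: 0 */
  free(src);
  free(dst);
  return 0;
}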

@@ -108,7 +108,6 @@ int ReduceBaseCPUKernel::Init() {
  }

  mode_ = reduce_param->mode_;
  memcpy(axes_, reduce_param->axes_, sizeof(reduce_param->axes_));
  reduce_to_end_ = reduce_param->reduce_to_end_;

  auto ret = CheckInputsOutputs();

@@ -201,8 +201,8 @@ kernel::LiteKernel *CpuConvFp16KernelSelect(const std::vector<lite::Tensor *> &i
  return nullptr;
}

void FreeMemoryFp16(std::vector<kernel::LiteKernel *> group_convs, std::vector<lite::Tensor *> new_inputs,
                    std::vector<lite::Tensor *> new_outputs) {
void FreeMemoryFp16(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
                    const std::vector<lite::Tensor *> &new_outputs) {
  for (auto sub_conv : group_convs) {
    if (sub_conv != nullptr) {
      delete sub_conv;

@@ -220,49 +220,131 @@ void FreeMemoryFp16(std::vector<kernel::LiteKernel *> group_convs, std::vector<l
  }
}

lite::Tensor *CreateInputTensor(TypeId data_type, std::vector<int> in_shape, bool infered_flag) {
  auto in_tensor = new (std::nothrow) lite::Tensor(data_type, in_shape, Format_NHWC, lite::Tensor::Category::VAR);
  if (in_tensor == nullptr) {
    MS_LOG(ERROR) << "new in_tensor failed.";
    return nullptr;
  }
  if (infered_flag) {
    auto ret = in_tensor->MallocData();
    if (ret != RET_OK) {
      delete in_tensor;
      MS_LOG(ERROR) << "in tensor malloc failed.";
      return nullptr;
    }
  }
  return in_tensor;
}

lite::Tensor *CreateFilterTensor(TypeId data_type, std::vector<int> filter_shape,
                                 const std::vector<lite::Tensor *> &inputs, int copy_length, int index) {
  auto filter_tensor =
    new (std::nothrow) lite::Tensor(data_type, filter_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
  if (filter_tensor == nullptr) {
    MS_LOG(ERROR) << "new filter_tensor failed.";
    return nullptr;
  }
  auto ret = filter_tensor->MallocData();
  if (ret != RET_OK) {
    delete filter_tensor;
    MS_LOG(ERROR) << "filter_tensor malloc failed.";
    return nullptr;
  }
  if (data_type == kNumberTypeFloat16) {
    auto *origin_weight = reinterpret_cast<float16_t *>(inputs.at(kWeightIndex)->data_c());
    memcpy(filter_tensor->data_c(), origin_weight + index * copy_length, copy_length * sizeof(float16_t));
  } else {
    MS_ASSERT(data_type == kNumberTypeFloat32);
    auto *origin_weight = reinterpret_cast<float *>(inputs.at(kWeightIndex)->data_c());
    memcpy(filter_tensor->data_c(), origin_weight + index * copy_length, copy_length * sizeof(float));
  }
  return filter_tensor;
}

lite::Tensor *CreateBiasTensor(TypeId data_type, std::vector<int> bias_shape, const std::vector<lite::Tensor *> &inputs,
                               int new_out_channel, int index) {
  auto *origin_bias = inputs.at(kBiasIndex)->data_c();
  auto bias_tensor =
    new (std::nothrow) lite::Tensor(data_type, bias_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
  if (bias_tensor == nullptr) {
    MS_LOG(ERROR) << "new bias_tensor failed.";
    return nullptr;
  }
  auto ret = bias_tensor->MallocData();
  if (ret != RET_OK) {
    delete bias_tensor;
    MS_LOG(ERROR) << "bias_tensor malloc failed.";
    return nullptr;
  }
  if (data_type == kNumberTypeFloat16) {
    auto bias_data = reinterpret_cast<float16_t *>(origin_bias);
    memcpy(bias_tensor->data_c(), bias_data + index * new_out_channel, new_out_channel * sizeof(float16_t));
  } else {
    MS_ASSERT(data_type == kNumberTypeFloat32);
    auto bias_data = reinterpret_cast<float *>(origin_bias);
    memcpy(bias_tensor->data_c(), bias_data + index * new_out_channel, new_out_channel * sizeof(float));
  }
  return bias_tensor;
}

lite::Tensor *CreateOutputTensor(std::vector<int> out_shape, const std::vector<lite::Tensor *> &outputs,
                                 bool infered_flag, int index) {
  auto out_tensor = new (std::nothrow) lite::Tensor();
  if (out_tensor == nullptr) {
    MS_LOG(ERROR) << "new tmp_out_tensor failed.";
    return nullptr;
  }
  out_tensor->set_data_type(outputs.at(index)->data_type());
  out_tensor->SetFormat(outputs.at(index)->GetFormat());
  if (infered_flag) {
    out_tensor->set_shape(out_shape);
    auto ret = out_tensor->MallocData();
    if (ret != RET_OK) {
      delete out_tensor;
      MS_LOG(ERROR) << "out_tensor malloc data failed.";
      return nullptr;
    }
  }
  return out_tensor;
}

kernel::LiteKernel *CpuGroupConvFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
                                                  const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
                                                  const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
                                                  int group) {
  std::vector<kernel::LiteKernel *> group_convs;
  std::vector<int> in_shape;
  std::vector<int> filter_shape;
  std::vector<int> bias_shape;
  std::vector<int> out_shape;

  int out_unit;
  bool has_bias = inputs.size() == 3;
  bool use_winograd = false;
  bool infered_flag = (primitive != nullptr && primitive->GetInferFlag());
  auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
  int out_channel = inputs.at(kWeightIndex)->Batch();

  // update new shape info for each sub kernel
  int new_in_channel = inputs.at(kWeightIndex)->Channel();
  int new_out_channel = 0;
  if (group == 0) {
    MS_LOG(ERROR) << "Divisor 'group' cannot be 0.";
    return nullptr;
  } else {
    new_out_channel = out_channel / group;
    new_out_channel = inputs.at(kWeightIndex)->Batch() / group;
  }
  int kernel_h = conv_param->kernel_h_;
  int kernel_w = conv_param->kernel_w_;
  int input_num = inputs.size();
  int output_num = outputs.size();
  bool has_bias = input_num == 3;
  bool use_winograd = false;
  int out_unit;
  bool infered_flag = (primitive != nullptr && primitive->GetInferFlag());

  std::vector<int> in_shape;
  std::vector<int> out_shape;
  if (infered_flag) {
    int batch = inputs.front()->Batch();
    int in_h = inputs.front()->Height();
    int in_w = inputs.front()->Width();
    conv_param->input_channel_ = new_in_channel;
    conv_param->output_channel_ = new_out_channel;
    CheckIfUseWinogradFp16(&use_winograd, &out_unit, conv_param);
    in_shape = {batch, in_h, in_w, new_in_channel};
    in_shape = {batch, inputs.front()->Height(), inputs.front()->Width(), new_in_channel};
    out_shape = {batch, conv_param->output_h_, conv_param->output_w_, new_out_channel};
  }
  std::vector<int> filter_shape = {new_out_channel, conv_param->kernel_h_, conv_param->kernel_w_, new_in_channel};
  std::vector<int> bias_shape = {new_out_channel};

  filter_shape = {new_out_channel, kernel_h, kernel_w, new_in_channel};
  bias_shape = {new_out_channel};

  // new group conv op
  std::vector<kernel::LiteKernel *> group_convs;
  // create tensors for every sub conv kernel
  for (int i = 0; i < group; ++i) {
    std::vector<lite::Tensor *> new_inputs;
    std::vector<lite::Tensor *> new_outputs;

@@ -272,116 +354,56 @@ kernel::LiteKernel *CpuGroupConvFp16KernelCreator(const std::vector<lite::Tensor
      MS_LOG(ERROR) << "Get new conv parameter failed.";
      return nullptr;
    }
    // get new input for each group
    auto in_tensor =
      new (std::nothrow) lite::Tensor(inputs.front()->data_type(), in_shape, Format_NHWC, lite::Tensor::Category::VAR);

    // create new input for each group
    auto in_tensor = CreateInputTensor(inputs.front()->data_type(), in_shape, infered_flag);
    if (in_tensor == nullptr) {
      delete new_conv_parameter;
      FreeMemoryFp16(group_convs, new_inputs, new_outputs);
      MS_LOG(ERROR) << "new in_tensor failed.";
      MS_LOG(ERROR) << "create input tensor failed.";
      return nullptr;
    }
    if (infered_flag) {
      auto ret = in_tensor->MallocData();
      if (ret != RET_OK) {
        delete new_conv_parameter;
        delete in_tensor;
        FreeMemoryFp16(group_convs, new_inputs, new_outputs);
        MS_LOG(ERROR) << "in tensor malloc failed.";
        return nullptr;
      }
    }
    new_inputs.emplace_back(in_tensor);

    // new weight
    auto filter_tensor = new (std::nothrow) lite::Tensor(inputs.at(kWeightIndex)->data_type(), filter_shape,
                                                         Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
    // create new weight
    int copy_length = conv_param->kernel_h_ * conv_param->kernel_w_ * new_in_channel * new_out_channel;
    auto filter_tensor = CreateFilterTensor(inputs.at(kWeightIndex)->data_type(), filter_shape, inputs, copy_length, i);
    if (filter_tensor == nullptr) {
      delete new_conv_parameter;
      FreeMemoryFp16(group_convs, new_inputs, new_outputs);
      MS_LOG(ERROR) << "new filter_tensor failed.";
      MS_LOG(ERROR) << "create filter tensor failed.";
      return nullptr;
    }
    auto ret = filter_tensor->MallocData();
    if (ret != RET_OK) {
      delete new_conv_parameter;
      delete filter_tensor;
      FreeMemoryFp16(group_convs, new_inputs, new_outputs);
      MS_LOG(ERROR) << "filter_tensor malloc failed.";
      return nullptr;
    }
    int copy_length = kernel_h * kernel_w * new_in_channel * new_out_channel;
    auto filter_data_type = inputs.at(kWeightIndex)->data_type();
    if (filter_data_type == kNumberTypeFloat16) {
      auto *origin_weight = reinterpret_cast<float16_t *>(inputs.at(kWeightIndex)->data_c());
      memcpy(filter_tensor->data_c(), origin_weight + i * copy_length, copy_length * sizeof(float16_t));
    } else {
      MS_ASSERT(filter_data_type == kNumberTypeFloat32);
      auto *origin_weight = reinterpret_cast<float *>(inputs.at(kWeightIndex)->data_c());
      memcpy(filter_tensor->data_c(), origin_weight + i * copy_length, copy_length * sizeof(float));
    }
    new_inputs.emplace_back(filter_tensor);

    // if has bias, set new bias
    // if has bias, create new bias
    if (has_bias) {
      auto *origin_bias = inputs.at(kBiasIndex)->data_c();
      auto bias_data_type = inputs.at(kBiasIndex)->data_type();
      auto bias_tensor = new (std::nothrow)
        lite::Tensor(inputs.at(kBiasIndex)->data_type(), bias_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
      auto bias_tensor = CreateBiasTensor(inputs.at(kBiasIndex)->data_type(), bias_shape, inputs, new_out_channel, i);
      if (bias_tensor == nullptr) {
        delete new_conv_parameter;
        FreeMemoryFp16(group_convs, new_inputs, new_outputs);
        MS_LOG(ERROR) << "new bias_tensor failed.";
        MS_LOG(ERROR) << "create bias_tensor failed.";
        return nullptr;
      }
      ret = bias_tensor->MallocData();
      if (ret != RET_OK) {
        delete new_conv_parameter;
        delete bias_tensor;
        FreeMemoryFp16(group_convs, new_inputs, new_outputs);
        MS_LOG(ERROR) << "bias_tensor malloc failed.";
        return nullptr;
      }
      if (bias_data_type == kNumberTypeFloat16) {
        auto bias_data = reinterpret_cast<float16_t *>(origin_bias);
        memcpy(bias_tensor->data_c(), bias_data + i * new_out_channel, new_out_channel * sizeof(float16_t));
      } else {
        MS_ASSERT(bias_data_type == kNumberTypeFloat32);
        auto bias_data = reinterpret_cast<float *>(origin_bias);
        memcpy(bias_tensor->data_c(), bias_data + i * new_out_channel, new_out_channel * sizeof(float));
      }
      new_inputs.emplace_back(bias_tensor);
    }

    // set new output tensor
    for (int j = 0; j < output_num; ++j) {
      auto tmp_out_tensor = new (std::nothrow) lite::Tensor();
      if (tmp_out_tensor == nullptr) {
    // create new output tensors
    for (size_t j = 0; j < outputs.size(); ++j) {
      auto out_tensor = CreateOutputTensor(out_shape, outputs, infered_flag, j);
      if (out_tensor == nullptr) {
        delete new_conv_parameter;
        FreeMemoryFp16(group_convs, new_inputs, new_outputs);
        MS_LOG(ERROR) << "new tmp_out_tensor failed.";
        MS_LOG(ERROR) << "new out_tensor failed.";
        return nullptr;
      }
      tmp_out_tensor->set_data_type(outputs.at(j)->data_type());
      tmp_out_tensor->SetFormat(outputs.at(j)->GetFormat());
      if (infered_flag) {
        tmp_out_tensor->set_shape(out_shape);
        ret = tmp_out_tensor->MallocData();
        if (ret != RET_OK) {
          delete new_conv_parameter;
          delete tmp_out_tensor;
          FreeMemoryFp16(group_convs, new_inputs, new_outputs);
          MS_LOG(ERROR) << "tmp_out_tensor malloc data failed.";
          return nullptr;
        }
      }
      new_outputs.emplace_back(tmp_out_tensor);
      new_outputs.emplace_back(out_tensor);
    }

    group_convs.emplace_back(CpuConvFp16KernelSelect(new_inputs, new_outputs,
                                                     reinterpret_cast<OpParameter *>(new_conv_parameter), ctx,
                                                     primitive, use_winograd, out_unit));
  }

  return new (std::nothrow)
    GroupConvolutionFP16CPUKernel(op_parameter, inputs, outputs, ctx, primitive, group_convs, group);
}

@@ -169,8 +169,8 @@ ConvParameter *CreateNewConvParameter(ConvParameter *parameter) {
  return conv_parameter;
}

void FreeMemoryFp32(std::vector<kernel::LiteKernel *> group_convs, std::vector<lite::Tensor *> new_inputs,
                    std::vector<lite::Tensor *> new_outputs) {
void FreeMemoryFp32(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
                    const std::vector<lite::Tensor *> &new_outputs) {
  for (auto sub_conv : group_convs) {
    if (sub_conv != nullptr) {
      delete sub_conv;

@@ -188,6 +188,87 @@ void FreeMemoryFp32(std::vector<kernel::LiteKernel *> group_convs, std::vector<l
  }
}

lite::Tensor *CreateInputTensorFp32(TypeId data_type, std::vector<int> in_shape, bool infered_flag) {
  auto in_tensor = new (std::nothrow) lite::Tensor(data_type, in_shape, Format_NHWC, lite::Tensor::Category::VAR);
  if (in_tensor == nullptr) {
    MS_LOG(ERROR) << "new in_tensor failed.";
    return nullptr;
  }
  if (infered_flag) {
    auto ret = in_tensor->MallocData();
    if (ret != RET_OK) {
      delete in_tensor;
      MS_LOG(ERROR) << "in tensor malloc failed.";
      return nullptr;
    }
  }
  return in_tensor;
}

lite::Tensor *CreateFilterTensorFp32(TypeId data_type, std::vector<int> filter_shape,
                                     const std::vector<lite::Tensor *> &inputs, int copy_length, int index) {
  auto filter_tensor =
    new (std::nothrow) lite::Tensor(data_type, filter_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
  if (filter_tensor == nullptr) {
    MS_LOG(ERROR) << "new filter_tensor failed.";
    return nullptr;
  }
  auto ret = filter_tensor->MallocData();
  if (ret != RET_OK) {
    delete filter_tensor;
    MS_LOG(ERROR) << "filter_tensor malloc failed.";
    return nullptr;
  }

  MS_ASSERT(data_type == kNumberTypeFloat32);
  auto *origin_weight = reinterpret_cast<float *>(inputs.at(kWeightIndex)->data_c());
  memcpy(filter_tensor->data_c(), origin_weight + index * copy_length, copy_length * sizeof(float));
  return filter_tensor;
}

lite::Tensor *CreateBiasTensorFp32(TypeId data_type, std::vector<int> bias_shape,
                                   const std::vector<lite::Tensor *> &inputs, int new_out_channel, int index) {
  auto *origin_bias = inputs.at(kBiasIndex)->data_c();
  auto bias_tensor =
    new (std::nothrow) lite::Tensor(data_type, bias_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
  if (bias_tensor == nullptr) {
    MS_LOG(ERROR) << "new bias_tensor failed.";
    return nullptr;
  }
  auto ret = bias_tensor->MallocData();
  if (ret != RET_OK) {
    delete bias_tensor;
    MS_LOG(ERROR) << "bias_tensor malloc failed.";
    return nullptr;
  }
  MS_ASSERT(data_type == kNumberTypeFloat32);
  auto bias_data = reinterpret_cast<float *>(origin_bias);
  memcpy(bias_tensor->data_c(), bias_data + index * new_out_channel, new_out_channel * sizeof(float));

  return bias_tensor;
}

lite::Tensor *CreateOutputTensorFp32(std::vector<int> out_shape, const std::vector<lite::Tensor *> &outputs,
                                     bool infered_flag, int index) {
  auto out_tensor = new (std::nothrow) lite::Tensor();
  if (out_tensor == nullptr) {
    MS_LOG(ERROR) << "new tmp_out_tensor failed.";
    return nullptr;
  }
  out_tensor->set_data_type(outputs.at(index)->data_type());
  out_tensor->SetFormat(outputs.at(index)->GetFormat());
  if (infered_flag) {
    out_tensor->set_shape(out_shape);
    auto ret = out_tensor->MallocData();
    if (ret != RET_OK) {
      delete out_tensor;
      MS_LOG(ERROR) << "out_tensor malloc data failed.";
      return nullptr;
    }
  }
  return out_tensor;
}

kernel::LiteKernel *CpuConvFp32KernelSelect(const std::vector<lite::Tensor *> &inputs,
                                            const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
                                            const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,

@@ -208,31 +289,22 @@ kernel::LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor
                                                  const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
                                                  const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
                                                  int group) {
  std::vector<kernel::LiteKernel *> group_convs;
  std::vector<int> in_shape;
  std::vector<int> filter_shape;
  std::vector<int> bias_shape;
  std::vector<int> out_shape;

  int out_unit;
  bool has_bias = inputs.size() == 3;
  bool use_winograd = false;
  bool infered_flag = primitive != nullptr && primitive->GetInferFlag();
  auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
  int out_channel = inputs.at(kWeightIndex)->Batch();

  std::vector<int> in_shape;
  std::vector<int> out_shape;
  int new_in_channel = inputs.at(kWeightIndex)->Channel();
  int new_out_channel = 0;
  if (group == 0) {
    MS_LOG(ERROR) << "Divisor 'group' cannot be 0.";
    return nullptr;
  } else {
    new_out_channel = out_channel / group;
    new_out_channel = inputs.at(kWeightIndex)->Batch() / group;
  }
  int kernel_h = conv_param->kernel_h_;
  int kernel_w = conv_param->kernel_w_;
  int input_num = inputs.size();
  int output_num = outputs.size();
  bool has_bias = input_num == 3;
  bool use_winograd = false;
  int out_unit;
  bool infered_flag = primitive != nullptr && primitive->GetInferFlag();

  if (infered_flag) {
    int batch = inputs.front()->Batch();
    int in_h = inputs.front()->Height();

@@ -243,11 +315,11 @@ kernel::LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor
    in_shape = {batch, in_h, in_w, new_in_channel};
    out_shape = {batch, conv_param->output_h_, conv_param->output_w_, new_out_channel};
  }
  std::vector<int> filter_shape = {new_out_channel, conv_param->kernel_h_, conv_param->kernel_w_, new_in_channel};
  std::vector<int> bias_shape = {new_out_channel};

  filter_shape = {new_out_channel, kernel_h, kernel_w, new_in_channel};
  bias_shape = {new_out_channel};
  auto *origin_weight = reinterpret_cast<float *>(inputs.at(kWeightIndex)->data_c());

  // create sub kernels
  std::vector<kernel::LiteKernel *> group_convs;
  for (int i = 0; i < group; ++i) {
    std::vector<lite::Tensor *> new_inputs;
    std::vector<lite::Tensor *> new_outputs;

@@ -257,100 +329,58 @@ kernel::LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor
      MS_LOG(ERROR) << "Get new conv parameter failed.";
      return nullptr;
    }
    // get new input for each group
    auto in_tensor =
      new (std::nothrow) lite::Tensor(inputs.front()->data_type(), in_shape, Format_NHWC, lite::Tensor::Category::VAR);

    // create new input for each group
    auto in_tensor = CreateInputTensorFp32(inputs.front()->data_type(), in_shape, infered_flag);
    if (in_tensor == nullptr) {
      delete new_conv_parameter;
      FreeMemoryFp32(group_convs, new_inputs, new_outputs);
      MS_LOG(ERROR) << "new in_tensor failed.";
      MS_LOG(ERROR) << "create input tensor failed.";
      return nullptr;
    }
    if (infered_flag) {
      auto ret = in_tensor->MallocData();
      if (ret != RET_OK) {
        delete new_conv_parameter;
        delete in_tensor;
        FreeMemoryFp32(group_convs, new_inputs, new_outputs);
        MS_LOG(ERROR) << "in tensor malloc failed.";
        return nullptr;
      }
    }
    new_inputs.emplace_back(in_tensor);

    // new weight
    auto filter_tensor = new (std::nothrow) lite::Tensor(inputs.at(kWeightIndex)->data_type(), filter_shape,
                                                         Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
    // create new weight
    int copy_length = conv_param->kernel_h_ * conv_param->kernel_w_ * new_in_channel * new_out_channel;
    auto filter_tensor =
      CreateFilterTensorFp32(inputs.at(kWeightIndex)->data_type(), filter_shape, inputs, copy_length, i);
    if (filter_tensor == nullptr) {
      delete new_conv_parameter;
      FreeMemoryFp32(group_convs, new_inputs, new_outputs);
      MS_LOG(ERROR) << "new filter_tensor failed.";
      MS_LOG(ERROR) << "create filter tensor failed.";
      return nullptr;
    }
    auto ret = filter_tensor->MallocData();
    if (ret != RET_OK) {
      delete new_conv_parameter;
      delete filter_tensor;
      FreeMemoryFp32(group_convs, new_inputs, new_outputs);
      MS_LOG(ERROR) << "filter_tensor malloc failed.";
      return nullptr;
    }
    int copy_length = kernel_h * kernel_w * new_in_channel * new_out_channel;
    memcpy(filter_tensor->data_c(), origin_weight + i * copy_length, copy_length * sizeof(float));
    new_inputs.emplace_back(filter_tensor);

    // if has bias, set new bias
    // if has bias, create new bias
    if (has_bias) {
      auto *origin_bias = reinterpret_cast<float *>(inputs.at(kBiasIndex)->data_c());
      auto bias_tensor = new (std::nothrow)
        lite::Tensor(inputs.at(kBiasIndex)->data_type(), bias_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
      auto bias_tensor =
        CreateBiasTensorFp32(inputs.at(kBiasIndex)->data_type(), bias_shape, inputs, new_out_channel, i);
      if (bias_tensor == nullptr) {
        delete new_conv_parameter;
        FreeMemoryFp32(group_convs, new_inputs, new_outputs);
        MS_LOG(ERROR) << "new bias_tensor failed.";
        MS_LOG(ERROR) << "create bias_tensor failed.";
        return nullptr;
      }
      ret = bias_tensor->MallocData();
      if (ret != RET_OK) {
        delete new_conv_parameter;
        delete bias_tensor;
        FreeMemoryFp32(group_convs, new_inputs, new_outputs);
        MS_LOG(ERROR) << "bias_tensor malloc failed.";
        return nullptr;
      }
      memcpy(bias_tensor->data_c(), origin_bias + i * new_out_channel, new_out_channel * sizeof(float));
      new_inputs.emplace_back(bias_tensor);
    }

    // set new output tensor
    for (int j = 0; j < output_num; ++j) {
      auto tmp_out_tensor = new (std::nothrow) lite::Tensor();
      if (tmp_out_tensor == nullptr) {
    // create new output tensor
    for (size_t j = 0; j < outputs.size(); ++j) {
      auto out_tensor = CreateOutputTensorFp32(out_shape, outputs, infered_flag, j);
      if (out_tensor == nullptr) {
        delete new_conv_parameter;
        FreeMemoryFp32(group_convs, new_inputs, new_outputs);
        MS_LOG(ERROR) << "new tmp_out_tensor failed.";
        MS_LOG(ERROR) << "new out_tensor failed.";
        return nullptr;
      }
      tmp_out_tensor->set_data_type(outputs.at(j)->data_type());
      tmp_out_tensor->SetFormat(outputs.at(j)->GetFormat());
      if (infered_flag) {
        tmp_out_tensor->set_shape(out_shape);
        ret = tmp_out_tensor->MallocData();
        if (ret != RET_OK) {
          delete new_conv_parameter;
          delete tmp_out_tensor;
          FreeMemoryFp32(group_convs, new_inputs, new_outputs);
          MS_LOG(ERROR) << "tmp_out_tensor malloc data failed.";
          return nullptr;
        }
      }
      new_outputs.emplace_back(tmp_out_tensor);
      new_outputs.emplace_back(out_tensor);
    }

    group_convs.emplace_back(CpuConvFp32KernelSelect(new_inputs, new_outputs,
                                                     reinterpret_cast<OpParameter *>(new_conv_parameter), ctx,
                                                     primitive, use_winograd, out_unit));
  }

  return new (std::nothrow)
    GroupConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive, group_convs, group);
}

@@ -19,6 +19,7 @@
#include "src/kernel_registry.h"
#include "src/runtime/runtime_api.h"
#include "nnacl/quantization/quantize.h"
#include "nnacl/pack.h"
#include "include/errorcode.h"

using mindspore::lite::KernelRegistrar;

@@ -38,11 +39,77 @@ using mindspore::schema::PrimitiveType_Mean;
using mindspore::schema::PrimitiveType_Reduce;

namespace mindspore::kernel {
void ReduceInt8CPUKernel::OneAxis() {
  auto axis_info = axes_[0];
  if (axis_info == 0) {
    pattern_ = kernel::N;
  } else if (axis_info == 1) {
    pattern_ = kernel::H;
  } else if (axis_info == 2) {
    pattern_ = kernel::W;
  } else {
    pattern_ = kernel::C;
  }
}

void ReduceInt8CPUKernel::TwoAxes() {
  auto axis_info1 = axes_[0];
  auto axis_info2 = axes_[1];
  auto axis_sum = axis_info1 + axis_info2;
  if (axis_sum == 1) {
    pattern_ = kernel::NH;
  } else if (axis_sum == 2) {
    pattern_ = kernel::NW;
  } else if (axis_sum == 3) {
    if (axis_info1 == 0) {
      pattern_ = kernel::NC;
    } else {
      pattern_ = kernel::HW;
    }
  } else if (axis_sum == 4) {
    pattern_ = kernel::HC;
  } else {
    MS_ASSERT(axis_sum == 5);
    pattern_ = kernel::WC;
  }
}

void ReduceInt8CPUKernel::ThreeAxes() {
  auto axis_info1 = axes_[0];
  auto axis_info2 = axes_[1];
  auto axis_info3 = axes_[2];
  auto axis_sum = axis_info1 + axis_info2 + axis_info3;
  if (axis_sum == 3) {
    pattern_ = kernel::NHW;
  } else if (axis_sum == 4) {
    pattern_ = kernel::NHC;
  } else if (axis_sum == 5) {
    pattern_ = kernel::NWC;
  } else {
    MS_ASSERT(axis_sum == 6);
    pattern_ = kernel::HWC;
  }
}

void ReduceInt8CPUKernel::Match4DReducePattern() {
  if (num_axes_ == 1) {
    OneAxis();
  } else if (num_axes_ == 2) {
    TwoAxes();
  } else if (num_axes_ == 3) {
    ThreeAxes();
  } else {
    MS_ASSERT(num_axes_ == 4);
    pattern_ = kernel::NHWC;
  }
}
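A hedged illustration of the dispatch implemented by the three helpers above (axes are NHWC indices 0..3; the axis sums are unambiguous only because the axes are distinct values in [0, 4)):

/* axes {2}       -> W    (single axis 2)
 * axes {0, 1}    -> NH   (sum == 1)
 * axes {1, 2}    -> HW   (sum == 3, first axis != 0)
 * axes {0, 3}    -> NC   (sum == 3, first axis == 0)
 * axes {1, 3}    -> HC   (sum == 4)
 * axes {1, 2, 3} -> HWC  (sum == 6)
 */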

int ReduceInt8CPUKernel::Init() {
  auto ret = ReduceBaseCPUKernel::Init();
  if (ret != RET_OK) {
    return ret;
  }
  Match4DReducePattern();
  if (!this->in_tensors_[0]->shape().empty()) {
    this->valid_shape_ = true;
    ret = CalculateQuantArgs();

@@ -96,6 +163,64 @@ int ReduceInt8CPUKernel::Init() {
  return ReSize();
}

void ReduceInt8CPUKernel::ReduceMean4DCalQuantParam() {
  int reduce_num = 1;
  auto in_shape = in_tensors_.front()->shape();
  switch (pattern_) {
    case N:
      reduce_num = in_shape[0];
      break;
    case H:
      reduce_num = in_shape[1];
      break;
    case W:
      reduce_num = in_shape[2];
      break;
    case C:
      reduce_num = in_shape[3];
      break;
    case NH:
      reduce_num = in_shape[0] * in_shape[1];
      break;
    case NW:
      reduce_num = in_shape[0] * in_shape[2];
      break;
    case NC:
      reduce_num = in_shape[0] * in_shape[3];
      break;
    case HW:
      reduce_num = in_shape[1] * in_shape[2];
      break;
    case HC:
      reduce_num = in_shape[1] * in_shape[3];
      break;
    case WC:
      reduce_num = in_shape[2] * in_shape[3];
      break;
    case NHW:
      reduce_num = in_shape[0] * in_shape[1] * in_shape[2];
      break;
    case NHC:
      reduce_num = in_shape[0] * in_shape[1] * in_shape[3];
      break;
    case NWC:
      reduce_num = in_shape[0] * in_shape[2] * in_shape[3];
      break;
    case HWC:
      reduce_num = in_shape[1] * in_shape[2] * in_shape[3];
      break;
    case NHWC:
      reduce_num = in_shape[0] * in_shape[1] * in_shape[2] * in_shape[3];
      break;
  }
  bias_ = quant_arg_.out_zp_ - quant_arg_.in_zp_ * quant_arg_.in_scale_ / quant_arg_.out_scale_;
  int shift;
  double reciprocal = quant_arg_.in_scale_ / (quant_arg_.out_scale_ * reduce_num);
  QuantizeMultiplierSmallerThanOne(reciprocal, &reduce_mean_quant_param_.multiplier_, &shift);
  reduce_mean_quant_param_.left_shift_ = shift < 0 ? -shift : 0;
  reduce_mean_quant_param_.right_shift_ = shift > 0 ? shift : 0;
}
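QuantizeMultiplierSmallerThanOne is not shown in this diff; the sketch below illustrates the usual gemmlowp/TFLite-style decomposition it appears to perform, so that real ≈ quantized * 2^-31 * 2^-shift and the caller can split the shift into left_shift_/right_shift_ exactly as the code above does. This is an assumption about its contract, not the library's source.

/* Hedged sketch of a Q31 multiplier/shift decomposition (illustrative only). */
#include <math.h>
#include <stdint.h>

static void DecomposeMultiplier(double real_multiplier, int32_t *quantized, int *shift) {
  int exponent = 0;
  const double q = frexp(real_multiplier, &exponent); /* real = q * 2^exponent, q in [0.5, 1) */
  int64_t q_fixed = (int64_t)round(q * (1ll << 31));
  if (q_fixed == (1ll << 31)) { /* rounding pushed q up to 1.0 */
    q_fixed /= 2;
    ++exponent;
  }
  *quantized = (int32_t)q_fixed;
  *shift = -exponent; /* real ≈ quantized * 2^-31 * 2^-shift; positive means right shift */
}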

int ReduceInt8CPUKernel::CalculateQuantArgs() {
  lite::Tensor *input = in_tensors_.at(0);
  lite::Tensor *output = out_tensors_.at(0);

@@ -117,18 +242,24 @@ int ReduceInt8CPUKernel::CalculateQuantArgs() {
  // (quant_out - zp_out) * scale_out = sum((quant_in - zp) * scale_in) * (1/num) for each axis in axes
  // quant_out = sum(quant_in - zp) * (scale_in/scale_out) * (1/num)
  if (mode_ == static_cast<int>(schema::ReduceMode_ReduceMean)) {
    for (auto i = 0; i < num_axes_; i++) {
      auto axis = axes_[i];
      double reciprocal = 1.0 / in_tensors_.at(0)->shape()[axis];
      QuantMulArg *qm = new (std::nothrow) QuantMulArg;
      if (qm == nullptr) {
        MS_LOG(ERROR) << "Reduce new QuantMulArg failed.";
        return RET_NULL_PTR;
    if (input->shape().size() == 4 && pattern_ == kernel::HW) {
      // special case, can use the pattern implementation
      ReduceMean4DCalQuantParam();
      pattern_impl_ = true;
    } else {
      for (auto i = 0; i < num_axes_; i++) {
        auto axis = axes_[i];
        double reciprocal = 1.0 / in_tensors_.at(0)->shape()[axis];
        QuantMulArg *qm = new (std::nothrow) QuantMulArg;
        if (qm == nullptr) {
          MS_LOG(ERROR) << "Reduce new QuantMulArg failed.";
          return RET_NULL_PTR;
        }
        QuantizeMultiplierSmallerThanOne(reciprocal, &qm->multiplier_, &shift);
        qm->left_shift_ = shift < 0 ? -shift : 0;
        qm->right_shift_ = shift > 0 ? shift : 0;
        mean_multipliers_.push_back(qm);
      }
      QuantizeMultiplierSmallerThanOne(reciprocal, &qm->multiplier_, &shift);
      qm->left_shift_ = shift < 0 ? -shift : 0;
      qm->right_shift_ = shift > 0 ? shift : 0;
      mean_multipliers_.push_back(qm);
    }
  }

@@ -230,6 +361,16 @@ int ReduceInt8Impl(void *cdata, int task_id) {
  return RET_OK;
}

int ReduceMeanPatternInt8Impl(void *cdata, int task_id) {
  auto reduce = reinterpret_cast<ReduceInt8CPUKernel *>(cdata);
  auto error_code = reduce->Reduce4DExecute(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "Reduce Run error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

void ReduceInt8CPUKernel::GetQuantArgs(size_t i) {
  MS_ASSERT(i < static_cast<size_t>(num_axis_));
  if (mode_ == static_cast<int>(schema::ReduceMode_ReduceMean)) {

@@ -250,6 +391,78 @@ void ReduceInt8CPUKernel::GetQuantArgs(size_t i) {
  }
}

int ReduceInt8CPUKernel::Reduce4DExecute(int task_id) {
  auto input = in_tensors_.at(0);
  auto in_data = reinterpret_cast<int8_t *>(input->data_c());
  auto in_shape = input->shape();
  MS_ASSERT(in_shape.size() == 4);
  int n = in_shape.at(0);
  int h = in_shape.at(1);
  int w = in_shape.at(2);
  int c = in_shape.at(3);
  auto output_data = reinterpret_cast<int8_t *>(out_tensors_.at(0)->data_c());
  switch (pattern_) {
    case N:
      return ReduceMeanN(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
    case H:
      return ReduceMeanH(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
    case W:
      return ReduceMeanW(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
    case C:
      return ReduceMeanC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
    case NH:
      return ReduceMeanNH(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
    case NW:
      return ReduceMeanNW(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
    case NC:
      return ReduceMeanNC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
    case HW: {
      // data has been converted into NCHW format for efficiency
      int num = UP_DIV(c, ctx_->thread_num_);
      int count = c - task_id * num;
      count = count > num ? num : count;
      int plane = h * w;
      return ReduceMeanHW(n, plane, count, c, nchw_in_data_ + task_id * num * plane, output_data + task_id * num,
                          reduce_mean_quant_param_, bias_);
    }
    case HC:
      return ReduceMeanHC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
    case WC:
      return ReduceMeanWC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
    case NHW:
      return ReduceMeanNHW(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
    case NHC:
      return ReduceMeanNHC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
    case NWC:
      return ReduceMeanNWC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
    case HWC:
      return ReduceMeanHWC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
    case NHWC:
      return ReduceMeanNHWC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
  }
  return RET_OK;
}
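The HW branch above parallelizes over channels of the packed NCHW buffer rather than over batches. A hedged illustration of the slicing arithmetic, with made-up sizes:

/* With c = 10 channels and thread_num = 4, UP_DIV gives num = 3, so the four
 * tasks handle 3, 3, 3 and 1 channels and read disjoint plane-sized slices. */
#include <stdio.h>

int main(void) {
  const int c = 10, thread_num = 4, plane = 32 * 32;
  const int num = (c + thread_num - 1) / thread_num; /* UP_DIV(c, thread_num) */
  for (int task_id = 0; task_id < thread_num; ++task_id) {
    int count = c - task_id * num;
    count = count > num ? num : count;
    printf("task %d: %d channels, in offset %d, out offset %d\n", task_id, count, task_id * num * plane,
           task_id * num);
  }
  return 0;
}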

int ReduceInt8CPUKernel::Fast4DReduceMeanHWImpl() {
  auto input = in_tensors_.at(0);
  auto input_data = reinterpret_cast<int8_t *>(input->data_c());
  nchw_in_data_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(input->ElementsNum()));
  if (nchw_in_data_ == nullptr) {
    MS_LOG(ERROR) << "malloc nchw_in_data_ failed.";
    return RET_ERROR;
  }
  PackNHWCToNCHWInt8(reinterpret_cast<void *>(input_data), reinterpret_cast<void *>(nchw_in_data_), input->Batch(),
                     input->Height() * input->Width(), input->Channel());
  auto ret = ParallelLaunch(this->context_->thread_pool_, ReduceMeanPatternInt8Impl, this, context_->thread_num_);
  if (ret != RET_OK) {
    ctx_->allocator->Free(nchw_in_data_);
    MS_LOG(ERROR) << "Reduce run error, error_code[" << ret << "]";
    return RET_ERROR;
  }
  ctx_->allocator->Free(nchw_in_data_);
  return RET_OK;
}

int ReduceInt8CPUKernel::Run() {
  if (!this->valid_shape_) {
    auto ret = CalculateQuantArgs();

@@ -257,6 +470,11 @@ int ReduceInt8CPUKernel::Run() {
      return ret;
    }
  }
  // currently only the reduce-mean 4D HW case has a fast pattern path; otherwise fall back to the reference impl
  if (mode_ == static_cast<int>(schema::ReduceMode_ReduceMean) && pattern_impl_ && pattern_ == kernel::HW) {
    return Fast4DReduceMeanHWImpl();
  }

  auto ret = MallocTmpBuffer();
  if (ret != RET_OK) {
    FreeTmpBuffer();

@@ -313,6 +531,7 @@ int ReduceInt8CPUKernel::CallReduceUnit(int task_id) {
  }
  return ret;
}

kernel::LiteKernel *CpuReduceInt8KernelCreator(const std::vector<lite::Tensor *> &inputs,
                                               const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                               const lite::InnerContext *ctx, const kernel::KernelKey &desc,

@@ -28,6 +28,7 @@
using mindspore::schema::ReduceMode;

namespace mindspore::kernel {
enum Four_DIMENSION_REDUCE_TEMPLATE { N, H, W, C, NH, NW, NC, HW, HC, WC, NHW, NHC, NWC, HWC, NHWC };
class ReduceInt8CPUKernel : public ReduceBaseCPUKernel {
  typedef int (*Reducer)(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                         int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);

@@ -38,7 +39,7 @@ class ReduceInt8CPUKernel : public ReduceBaseCPUKernel {
  ReduceInt8CPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs,
                      const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
                      const mindspore::lite::PrimitiveC *primitive)
      : ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive) {}
      : ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive), ctx_(ctx) {}
  ~ReduceInt8CPUKernel() {
    for (auto qm : mean_multipliers_) {
      delete qm;

@@ -59,6 +60,8 @@ class ReduceInt8CPUKernel : public ReduceBaseCPUKernel {
  int Init() override;
  int ReSize() override;
  int Run() override;
  int Fast4DReduceMeanHWImpl();
  int Reduce4DExecute(int task_id);
  int CallReduceUnit(int task_id);
  int ReduceLastAxis(int task_id);

@@ -68,22 +71,31 @@ class ReduceInt8CPUKernel : public ReduceBaseCPUKernel {
 private:
  int MallocTmpBuffer();
  void FreeTmpBuffer();

  void Match4DReducePattern();
  void OneAxis();
  void TwoAxes();
  void ThreeAxes();
  void ReduceMean4DCalQuantParam();
  int CalculateQuantArgs();
  void GetQuantArgs(size_t i);

 private:
  ReduceParameter *param_ = nullptr;
  ReduceQuantArg quant_arg_;
  int8_t *nchw_in_data_ = nullptr;
  int32_t bias_;

 private:
  const lite::InnerContext *ctx_;
  int32_t *begin_src_data_ = nullptr;
  int8_t *last_dst_data_ = nullptr;
  std::vector<int32_t *> data_buffers_;
  const int32_t *src_data_ = nullptr;
  int32_t *dst_data_ = nullptr;
  bool valid_shape_ = false;

  bool pattern_impl_ = false;
  Four_DIMENSION_REDUCE_TEMPLATE pattern_;
  QuantMulArg reduce_mean_quant_param_;  // used in the reduce-mean 4D situation
  Reducer reducer_ = nullptr;
  LastReducer last_reducer_ = nullptr;
  std::vector<QuantMulArg *> mean_multipliers_;