!8277 [MS][LITE][CPU]optimize int8 reduce mean && extract functions

From: @fuzhiye
Reviewed-by: @zhang_xue_tong
Signed-off-by:
mindspore-ci-bot 2020-11-09 09:21:27 +08:00 committed by Gitee
commit 4330052001
8 changed files with 793 additions and 209 deletions


@@ -20,6 +20,160 @@
#include "nnacl/quantization/fixed_point.h"
#include "nnacl/common_func.h"
int ReduceMeanN(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
int ReduceMeanH(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
int ReduceMeanW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
int ReduceMeanC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
int ReduceMeanNH(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
int ReduceMeanNW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
int ReduceMeanNC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
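// ReduceMeanHW expects NCHW input: plane = H * W, count = the number of
// channels handled by this call (one thread's slice), c = the total channel
// count, and bias is the zero-point correction from ReduceMean4DCalQuantParam.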
int ReduceMeanHW(int n, int plane, int count, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg,
int32_t bias) {
int stride = plane * c;  // per-batch stride: PackNHWCToNCHWInt8 packs NCHW densely
for (int batch = 0; batch < n; ++batch) {
int8_t *in_ptr = in_data + batch * stride;
int8_t *out_ptr = out_data + batch * c;
for (int i = 0; i < count; ++i) {
int32_t sum_array = 0;
int j = 0;
#ifdef ENABLE_ARM64
for (; j <= plane - 16; j += 16) {  // full 16-byte blocks only; tails fall through below
int8x16_t in_data_vec = vld1q_s8(in_ptr);
sum_array += vaddlvq_s8(in_data_vec);
in_ptr += 16;
}
for (; j <= plane - 8; j += 8) {
int8x8_t in_data_vec = vld1_s8(in_ptr);
sum_array += vaddlv_s8(in_data_vec);
in_ptr += 8;
}
for (; j <= plane - 4; j += 4) {
int32x4_t in_data_vec;
in_data_vec[0] = in_ptr[0];
in_data_vec[1] = in_ptr[1];
in_data_vec[2] = in_ptr[2];
in_data_vec[3] = in_ptr[3];
sum_array += vaddvq_s32(in_data_vec);
in_ptr += 4;
}
#elif ENABLE_ARM32
int32x4_t accum = vmovq_n_s32(0);
for (; j <= plane - 16; j += 16) {  // full 16-byte blocks only; tails fall through below
int32x4_t in_data_vec1;
int32x4_t in_data_vec2;
int32x4_t in_data_vec3;
int32x4_t in_data_vec4;
in_data_vec1[0] = in_ptr[0];
in_data_vec1[1] = in_ptr[1];
in_data_vec1[2] = in_ptr[2];
in_data_vec1[3] = in_ptr[3];
in_data_vec2[0] = in_ptr[4];
in_data_vec2[1] = in_ptr[5];
in_data_vec2[2] = in_ptr[6];
in_data_vec2[3] = in_ptr[7];
in_data_vec3[0] = in_ptr[8];
in_data_vec3[1] = in_ptr[9];
in_data_vec3[2] = in_ptr[10];
in_data_vec3[3] = in_ptr[11];
in_data_vec4[0] = in_ptr[12];
in_data_vec4[1] = in_ptr[13];
in_data_vec4[2] = in_ptr[14];
in_data_vec4[3] = in_ptr[15];
accum = vaddq_s32(accum, in_data_vec1);
accum = vaddq_s32(accum, in_data_vec2);
accum = vaddq_s32(accum, in_data_vec3);
accum = vaddq_s32(accum, in_data_vec4);
in_ptr += 16;
}
for (; j <= plane - 8; j += 8) {
int32x4_t in_data_vec1;
int32x4_t in_data_vec2;
in_data_vec1[0] = in_ptr[0];
in_data_vec1[1] = in_ptr[1];
in_data_vec1[2] = in_ptr[2];
in_data_vec1[3] = in_ptr[3];
in_data_vec2[0] = in_ptr[4];
in_data_vec2[1] = in_ptr[5];
in_data_vec2[2] = in_ptr[6];
in_data_vec2[3] = in_ptr[7];
accum = vaddq_s32(accum, in_data_vec1);
accum = vaddq_s32(accum, in_data_vec2);
in_ptr += 8;
}
for (; j <= plane - 4; j += 4) {
int32x4_t in_data_vec;
in_data_vec[0] = in_ptr[0];
in_data_vec[1] = in_ptr[1];
in_data_vec[2] = in_ptr[2];
in_data_vec[3] = in_ptr[3];
accum = vaddq_s32(accum, in_data_vec);
in_ptr += 4;
}
sum_array += accum[0];
sum_array += accum[1];
sum_array += accum[2];
sum_array += accum[3];
#endif
for (; j < plane; j++) {
sum_array += in_ptr[0];
in_ptr++;
}
int32_t mean =
RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(sum_array * (1 << (unsigned int)quant_arg.left_shift_),
quant_arg.multiplier_),
quant_arg.right_shift_);
mean += bias;
mean = MSMIN(mean, INT8_MAX);
mean = MSMAX(mean, INT8_MIN);
out_ptr[0] = mean;
out_ptr++;
}
}
return NNACL_OK;
}
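// For reference, the two fixed-point helpers used above follow gemmlowp-style
// semantics (the real definitions live in nnacl/quantization/fixed_point.h;
// this commented sketch is illustrative only):
//
//   int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b) {
//     if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;  // lone overflow case
//     int64_t ab = (int64_t)a * (int64_t)b;
//     int64_t nudge = ab >= 0 ? (1ll << 30) : (1 - (1ll << 30));
//     return (int32_t)((ab + nudge) / (1ll << 31));  // rounded high half of 2*a*b
//   }
//
//   int32_t RoundingDivideByPOT(int32_t x, int exponent) {
//     int32_t mask = (1 << exponent) - 1;
//     int32_t remainder = x & mask;
//     int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
//     return (x >> exponent) + (remainder > threshold ? 1 : 0);  // round to nearest
//   }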
int ReduceMeanHC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
int ReduceMeanWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
int ReduceMeanNHW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
int ReduceMeanNHC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
int ReduceMeanNWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
int ReduceMeanHWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
int ReduceMeanNHWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg) {
return NNACL_OK;
}
// Get x such that (x-zp_in) * scale_in = mean
// When reducing over n axes, this handles each of the first n-1 reductions; one call per reduced axis.
int ReduceMeanInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,


@@ -21,6 +21,23 @@
extern "C" {
#endif
int ReduceMeanN(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanH(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNH(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanHW(int n, int plane, int count, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg,
int32_t bias);
int ReduceMeanHC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNHW(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNHC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanHWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanNHWC(int n, int h, int w, int c, int8_t *in_data, int8_t *out_data, QuantMulArg quant_arg);
int ReduceMeanInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
int ReduceMeanLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,


@@ -929,13 +929,144 @@ void PackNHWCToNCHWFp32(const void *src, void *dst, int batches, int plane, int
return;
}
void PackNHWCToNCHWInt8(const void *src, void *dst, int batch, int plane, int channel) {
for (int n = 0; n < batch; n++) {
for (int c = 0; c < channel; c++) {
for (int hw = 0; hw < plane; hw++) {
int nhwc_index = n * channel * plane + hw * channel + c;
int nchw_index = n * channel * plane + c * plane + hw;
((int8_t *)dst)[nchw_index] = ((int8_t *)src)[nhwc_index];
void PackNHWCToNCHWInt8(const void *src, void *dst, int batches, int plane, int channel) {
int hw8 = plane / C8NUM * C8NUM;
int c8 = channel / C8NUM * C8NUM;
int stride = plane * channel;
for (int n = 0; n < batches; n++) {
const int8_t *src_batch = (const int8_t *)src + n * stride;
int8_t *dst_batch = (int8_t *)dst + n * stride;
int hw = 0;
for (; hw < hw8; hw += C8NUM) {
int c = 0;
for (; c < c8; c += C8NUM) {
const int8_t *src_ptr = src_batch + hw * channel + c;
int8_t *dst_ptr = dst_batch + c * plane + hw;
#ifdef ENABLE_ARM64
size_t srcStride = channel * sizeof(int8_t);
size_t dstStride = plane * sizeof(int8_t);
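// The assembly below transposes an 8x8 int8 block entirely in registers:
// trn1/trn2 interleave row pairs at byte (.8b), halfword (.4h), and word
// (.2s) granularity; after the three rounds every element is transposed.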
asm volatile(
"mov x10, %[src_ptr]\n"
"mov x11, %[dst_ptr]\n"
"ld1 {v0.8b}, [x10], %[srcStride]\n"
"ld1 {v1.8b}, [x10], %[srcStride]\n"
"ld1 {v2.8b}, [x10], %[srcStride]\n"
"ld1 {v3.8b}, [x10], %[srcStride]\n"
"trn1 v4.8b, v0.8b, v1.8b\n"
"trn2 v5.8b, v0.8b, v1.8b\n"
"trn1 v6.8b, v2.8b, v3.8b\n"
"trn2 v7.8b, v2.8b, v3.8b\n"
"ld1 {v0.8b}, [x10], %[srcStride]\n"
"ld1 {v1.8b}, [x10], %[srcStride]\n"
"ld1 {v2.8b}, [x10], %[srcStride]\n"
"ld1 {v3.8b}, [x10], %[srcStride]\n"
"trn1 v8.4h, v4.4h, v6.4h\n"
"trn2 v9.4h, v4.4h, v6.4h\n"
"trn1 v10.4h, v5.4h, v7.4h\n"
"trn2 v11.4h, v5.4h, v7.4h\n"
"trn1 v4.8b, v0.8b, v1.8b\n"
"trn2 v5.8b, v0.8b, v1.8b\n"
"trn1 v6.8b, v2.8b, v3.8b\n"
"trn2 v7.8b, v2.8b, v3.8b\n"
"trn1 v12.4h, v4.4h, v6.4h\n"
"trn2 v13.4h, v4.4h, v6.4h\n"
"trn1 v14.4h, v5.4h, v7.4h\n"
"trn2 v15.4h, v5.4h, v7.4h\n"
"trn1 v0.2s, v8.2s, v12.2s\n"
"trn2 v4.2s, v8.2s, v12.2s\n"
"trn1 v1.2s, v10.2s, v14.2s\n"
"trn2 v5.2s, v10.2s, v14.2s\n"
"trn1 v2.2s, v9.2s, v13.2s\n"
"trn2 v6.2s, v9.2s, v13.2s\n"
"trn1 v3.2s, v11.2s, v15.2s\n"
"trn2 v7.2s, v11.2s, v15.2s\n"
"st1 {v0.8b}, [x11], %[dstStride]\n"
"st1 {v1.8b}, [x11], %[dstStride]\n"
"st1 {v2.8b}, [x11], %[dstStride]\n"
"st1 {v3.8b}, [x11], %[dstStride]\n"
"st1 {v4.8b}, [x11], %[dstStride]\n"
"st1 {v5.8b}, [x11], %[dstStride]\n"
"st1 {v6.8b}, [x11], %[dstStride]\n"
"st1 {v7.8b}, [x11], %[dstStride]\n"
:
:
[ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride)
: "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
"v30", "v31");
#elif ENABLE_ARM32
size_t srcStride = channel * sizeof(int8_t);
size_t dstStride = plane * sizeof(int8_t);
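// Same 8x8 byte transpose as the ARM64 path, done with vtrn.8 / vtrn.16 /
// vtrn.32 on the D registers.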
asm volatile(
"mov r10, %[src_ptr]\n"
"mov r12, %[dst_ptr]\n"
"vld1.8 {d0}, [r10], %[srcStride]\n"
"vld1.8 {d1}, [r10], %[srcStride]\n"
"vld1.8 {d2}, [r10], %[srcStride]\n"
"vld1.8 {d3}, [r10], %[srcStride]\n"
"vld1.8 {d4}, [r10], %[srcStride]\n"
"vld1.8 {d5}, [r10], %[srcStride]\n"
"vld1.8 {d6}, [r10], %[srcStride]\n"
"vld1.8 {d7}, [r10], %[srcStride]\n"
"vtrn.8 d0, d1\n"
"vtrn.8 d2, d3\n"
"vtrn.8 d4, d5\n"
"vtrn.8 d6, d7\n"
"vtrn.16 d0, d2\n"
"vtrn.16 d1, d3\n"
"vtrn.16 d4, d6\n"
"vtrn.16 d5, d7\n"
"vtrn.32 d0, d4\n"
"vtrn.32 d1, d5\n"
"vtrn.32 d2, d6\n"
"vtrn.32 d3, d7\n"
"vst1.8 {d0}, [r12], %[dstStride]\n"
"vst1.8 {d1}, [r12], %[dstStride]\n"
"vst1.8 {d2}, [r12], %[dstStride]\n"
"vst1.8 {d3}, [r12], %[dstStride]\n"
"vst1.8 {d4}, [r12], %[dstStride]\n"
"vst1.8 {d5}, [r12], %[dstStride]\n"
"vst1.8 {d6}, [r12], %[dstStride]\n"
"vst1.8 {d7}, [r12], %[dstStride]\n"
:
:
[ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride)
: "r10", "r12", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
"q15");
#else
for (int tr = 0; tr < C8NUM; tr++) {
for (int tc = 0; tc < C8NUM; tc++) {
dst_ptr[tc * plane + tr] = src_ptr[tr * channel + tc];
}
}
#endif
}
for (; c < channel; c++) {
const int8_t *src_ptr = src_batch + hw * channel + c;
int8_t *dst_ptr = dst_batch + c * plane + hw;
for (size_t i = 0; i < C8NUM; i++) {
dst_ptr[i] = src_ptr[i * channel];
}
}
}
for (; hw < plane; hw++) {
const int8_t *src_ptr = src_batch + hw * channel;
int8_t *dst_ptr = dst_batch + hw;
for (size_t i = 0; i < channel; i++) {
dst_ptr[i * plane] = src_ptr[i];
}
}
}


@@ -108,7 +108,6 @@ int ReduceBaseCPUKernel::Init() {
}
mode_ = reduce_param->mode_;
memcpy(axes_, reduce_param->axes_, sizeof(reduce_param->axes_));
reduce_to_end_ = reduce_param->reduce_to_end_;
auto ret = CheckInputsOutputs();


@@ -201,8 +201,8 @@ kernel::LiteKernel *CpuConvFp16KernelSelect(const std::vector<lite::Tensor *> &i
return nullptr;
}
void FreeMemoryFp16(std::vector<kernel::LiteKernel *> group_convs, std::vector<lite::Tensor *> new_inputs,
std::vector<lite::Tensor *> new_outputs) {
void FreeMemoryFp16(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
const std::vector<lite::Tensor *> &new_outputs) {
for (auto sub_conv : group_convs) {
if (sub_conv != nullptr) {
delete sub_conv;
@@ -220,49 +220,131 @@ void FreeMemoryFp16(std::vector<kernel::LiteKernel *> group_convs, std::vector<l
}
}
lite::Tensor *CreateInputTensor(TypeId data_type, std::vector<int> in_shape, bool infered_flag) {
auto in_tensor = new (std::nothrow) lite::Tensor(data_type, in_shape, Format_NHWC, lite::Tensor::Category::VAR);
if (in_tensor == nullptr) {
MS_LOG(ERROR) << "new in_tensor failed.";
return nullptr;
}
if (infered_flag) {
auto ret = in_tensor->MallocData();
if (ret != RET_OK) {
delete in_tensor;
MS_LOG(ERROR) << "in tensor malloc failed.";
return nullptr;
}
}
return in_tensor;
}
lite::Tensor *CreateFilterTensor(TypeId data_type, std::vector<int> filter_shape,
const std::vector<lite::Tensor *> &inputs, int copy_length, int index) {
auto filter_tensor =
new (std::nothrow) lite::Tensor(data_type, filter_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
if (filter_tensor == nullptr) {
MS_LOG(ERROR) << "new filter_tensor failed.";
return nullptr;
}
auto ret = filter_tensor->MallocData();
if (ret != RET_OK) {
delete filter_tensor;
MS_LOG(ERROR) << "filter_tensor malloc failed.";
return nullptr;
}
if (data_type == kNumberTypeFloat16) {
auto *origin_weight = reinterpret_cast<float16_t *>(inputs.at(kWeightIndex)->data_c());
memcpy(filter_tensor->data_c(), origin_weight + index * copy_length, copy_length * sizeof(float16_t));
} else {
MS_ASSERT(data_type == kNumberTypeFloat32);
auto *origin_weight = reinterpret_cast<float *>(inputs.at(kWeightIndex)->data_c());
memcpy(filter_tensor->data_c(), origin_weight + index * copy_length, copy_length * sizeof(float));
}
return filter_tensor;
}
lite::Tensor *CreateBiasTensor(TypeId data_type, std::vector<int> bias_shape, const std::vector<lite::Tensor *> &inputs,
int new_out_channel, int index) {
auto *origin_bias = inputs.at(kBiasIndex)->data_c();
auto bias_tensor =
new (std::nothrow) lite::Tensor(data_type, bias_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
if (bias_tensor == nullptr) {
MS_LOG(ERROR) << "new bias_tensor failed.";
return nullptr;
}
auto ret = bias_tensor->MallocData();
if (ret != RET_OK) {
delete bias_tensor;
MS_LOG(ERROR) << "bias_tensor malloc failed.";
return nullptr;
}
if (data_type == kNumberTypeFloat16) {
auto bias_data = reinterpret_cast<float16_t *>(origin_bias);
memcpy(bias_tensor->data_c(), bias_data + index * new_out_channel, new_out_channel * sizeof(float16_t));
} else {
MS_ASSERT(data_type == kNumberTypeFloat32);
auto bias_data = reinterpret_cast<float *>(origin_bias);
memcpy(bias_tensor->data_c(), bias_data + index * new_out_channel, new_out_channel * sizeof(float));
}
return bias_tensor;
}
lite::Tensor *CreateOutputTensor(std::vector<int> out_shape, const std::vector<lite::Tensor *> &outputs,
bool infered_flag, int index) {
auto out_tensor = new (std::nothrow) lite::Tensor();
if (out_tensor == nullptr) {
MS_LOG(ERROR) << "new tmp_out_tensor failed.";
return nullptr;
}
out_tensor->set_data_type(outputs.at(index)->data_type());
out_tensor->SetFormat(outputs.at(index)->GetFormat());
if (infered_flag) {
out_tensor->set_shape(out_shape);
auto ret = out_tensor->MallocData();
if (ret != RET_OK) {
delete out_tensor;
MS_LOG(ERROR) << "out_tensor malloc data failed.";
return nullptr;
}
}
return out_tensor;
}
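// Group convolution is lowered into `group` independent sub-convolutions:
// each sub-kernel gets a channel slice of the input, a 1/group share of the
// filters and bias, and its own output tensor, which the GroupConvolution
// kernel later stitches back together along the channel axis.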
kernel::LiteKernel *CpuGroupConvFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
int group) {
std::vector<kernel::LiteKernel *> group_convs;
std::vector<int> in_shape;
std::vector<int> filter_shape;
std::vector<int> bias_shape;
std::vector<int> out_shape;
int out_unit;
bool has_bias = inputs.size() == 3;
bool use_winograd = false;
bool infered_flag = (primitive != nullptr && primitive->GetInferFlag());
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
int out_channel = inputs.at(kWeightIndex)->Batch();
// update new shape info for each sub kernel
int new_in_channel = inputs.at(kWeightIndex)->Channel();
int new_out_channel = 0;
if (group == 0) {
MS_LOG(ERROR) << "Divisor 'group' cannot be 0.";
return nullptr;
} else {
new_out_channel = out_channel / group;
new_out_channel = inputs.at(kWeightIndex)->Batch() / group;
}
int kernel_h = conv_param->kernel_h_;
int kernel_w = conv_param->kernel_w_;
int input_num = inputs.size();
int output_num = outputs.size();
bool has_bias = input_num == 3;
bool use_winograd = false;
int out_unit;
bool infered_flag = (primitive != nullptr && primitive->GetInferFlag());
std::vector<int> in_shape;
std::vector<int> out_shape;
if (infered_flag) {
int batch = inputs.front()->Batch();
int in_h = inputs.front()->Height();
int in_w = inputs.front()->Width();
conv_param->input_channel_ = new_in_channel;
conv_param->output_channel_ = new_out_channel;
CheckIfUseWinogradFp16(&use_winograd, &out_unit, conv_param);
in_shape = {batch, in_h, in_w, new_in_channel};
in_shape = {batch, inputs.front()->Height(), inputs.front()->Width(), new_in_channel};
out_shape = {batch, conv_param->output_h_, conv_param->output_w_, new_out_channel};
}
std::vector<int> filter_shape = {new_out_channel, conv_param->kernel_h_, conv_param->kernel_w_, new_in_channel};
std::vector<int> bias_shape = {new_out_channel};
filter_shape = {new_out_channel, kernel_h, kernel_w, new_in_channel};
bias_shape = {new_out_channel};
// new group conv op
std::vector<kernel::LiteKernel *> group_convs;
// create tensors for every sub conv kernel
for (int i = 0; i < group; ++i) {
std::vector<lite::Tensor *> new_inputs;
std::vector<lite::Tensor *> new_outputs;
@@ -272,116 +354,56 @@ kernel::LiteKernel *CpuGroupConvFp16KernelCreator(const std::vector<lite::Tensor
MS_LOG(ERROR) << "Get new conv parameter failed.";
return nullptr;
}
// get new input for each group
auto in_tensor =
new (std::nothrow) lite::Tensor(inputs.front()->data_type(), in_shape, Format_NHWC, lite::Tensor::Category::VAR);
// create new input for each group
auto in_tensor = CreateInputTensor(inputs.front()->data_type(), in_shape, infered_flag);
if (in_tensor == nullptr) {
delete new_conv_parameter;
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "new in_tensor failed.";
MS_LOG(ERROR) << "create input tensor failed.";
return nullptr;
}
if (infered_flag) {
auto ret = in_tensor->MallocData();
if (ret != RET_OK) {
delete new_conv_parameter;
delete in_tensor;
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "in tensor malloc failed.";
return nullptr;
}
}
new_inputs.emplace_back(in_tensor);
// new weight
auto filter_tensor = new (std::nothrow) lite::Tensor(inputs.at(kWeightIndex)->data_type(), filter_shape,
Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
// create new weight
int copy_length = conv_param->kernel_h_ * conv_param->kernel_w_ * new_in_channel * new_out_channel;
auto filter_tensor = CreateFilterTensor(inputs.at(kWeightIndex)->data_type(), filter_shape, inputs, copy_length, i);
if (filter_tensor == nullptr) {
delete new_conv_parameter;
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "new filter_tensor failed.";
MS_LOG(ERROR) << "create filter tensor failed.";
return nullptr;
}
auto ret = filter_tensor->MallocData();
if (ret != RET_OK) {
delete new_conv_parameter;
delete filter_tensor;
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "filter_tensor malloc failed.";
return nullptr;
}
int copy_length = kernel_h * kernel_w * new_in_channel * new_out_channel;
auto filter_data_type = inputs.at(kWeightIndex)->data_type();
if (filter_data_type == kNumberTypeFloat16) {
auto *origin_weight = reinterpret_cast<float16_t *>(inputs.at(kWeightIndex)->data_c());
memcpy(filter_tensor->data_c(), origin_weight + i * copy_length, copy_length * sizeof(float16_t));
} else {
MS_ASSERT(filter_data_type == kNumberTypeFloat32);
auto *origin_weight = reinterpret_cast<float *>(inputs.at(kWeightIndex)->data_c());
memcpy(filter_tensor->data_c(), origin_weight + i * copy_length, copy_length * sizeof(float));
}
new_inputs.emplace_back(filter_tensor);
// if has bias, set new bias
// if has bias, create new bias
if (has_bias) {
auto *origin_bias = inputs.at(kBiasIndex)->data_c();
auto bias_data_type = inputs.at(kBiasIndex)->data_type();
auto bias_tensor = new (std::nothrow)
lite::Tensor(inputs.at(kBiasIndex)->data_type(), bias_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
auto bias_tensor = CreateBiasTensor(inputs.at(kBiasIndex)->data_type(), bias_shape, inputs, new_out_channel, i);
if (bias_tensor == nullptr) {
delete new_conv_parameter;
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "new bias_tensor failed.";
MS_LOG(ERROR) << "create bias_tensor failed.";
return nullptr;
}
ret = bias_tensor->MallocData();
if (ret != RET_OK) {
delete new_conv_parameter;
delete bias_tensor;
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "bias_tensor malloc failed.";
return nullptr;
}
if (bias_data_type == kNumberTypeFloat16) {
auto bias_data = reinterpret_cast<float16_t *>(origin_bias);
memcpy(bias_tensor->data_c(), bias_data + i * new_out_channel, new_out_channel * sizeof(float16_t));
} else {
MS_ASSERT(bias_data_type == kNumberTypeFloat32);
auto bias_data = reinterpret_cast<float *>(origin_bias);
memcpy(bias_tensor->data_c(), bias_data + i * new_out_channel, new_out_channel * sizeof(float));
}
new_inputs.emplace_back(bias_tensor);
}
// set new output tensor
for (int j = 0; j < output_num; ++j) {
auto tmp_out_tensor = new (std::nothrow) lite::Tensor();
if (tmp_out_tensor == nullptr) {
// create new output tensors
for (size_t j = 0; j < outputs.size(); ++j) {
auto out_tensor = CreateOutputTensor(out_shape, outputs, infered_flag, j);
if (out_tensor == nullptr) {
delete new_conv_parameter;
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "new tmp_out_tensor failed.";
MS_LOG(ERROR) << "new out_tensor failed.";
return nullptr;
}
tmp_out_tensor->set_data_type(outputs.at(j)->data_type());
tmp_out_tensor->SetFormat(outputs.at(j)->GetFormat());
if (infered_flag) {
tmp_out_tensor->set_shape(out_shape);
ret = tmp_out_tensor->MallocData();
if (ret != RET_OK) {
delete new_conv_parameter;
delete tmp_out_tensor;
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "tmp_out_tensor malloc data failed.";
return nullptr;
}
}
new_outputs.emplace_back(tmp_out_tensor);
new_outputs.emplace_back(out_tensor);
}
group_convs.emplace_back(CpuConvFp16KernelSelect(new_inputs, new_outputs,
reinterpret_cast<OpParameter *>(new_conv_parameter), ctx,
primitive, use_winograd, out_unit));
}
return new (std::nothrow)
GroupConvolutionFP16CPUKernel(op_parameter, inputs, outputs, ctx, primitive, group_convs, group);
}


@@ -169,8 +169,8 @@ ConvParameter *CreateNewConvParameter(ConvParameter *parameter) {
return conv_parameter;
}
void FreeMemoryFp32(std::vector<kernel::LiteKernel *> group_convs, std::vector<lite::Tensor *> new_inputs,
std::vector<lite::Tensor *> new_outputs) {
void FreeMemoryFp32(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
const std::vector<lite::Tensor *> &new_outputs) {
for (auto sub_conv : group_convs) {
if (sub_conv != nullptr) {
delete sub_conv;
@@ -188,6 +188,87 @@ void FreeMemoryFp32(std::vector<kernel::LiteKernel *> group_convs, std::vector<l
}
}
lite::Tensor *CreateInputTensorFp32(TypeId data_type, std::vector<int> in_shape, bool infered_flag) {
auto in_tensor = new (std::nothrow) lite::Tensor(data_type, in_shape, Format_NHWC, lite::Tensor::Category::VAR);
if (in_tensor == nullptr) {
MS_LOG(ERROR) << "new in_tensor failed.";
return nullptr;
}
if (infered_flag) {
auto ret = in_tensor->MallocData();
if (ret != RET_OK) {
delete in_tensor;
MS_LOG(ERROR) << "in tensor malloc failed.";
return nullptr;
}
}
return in_tensor;
}
lite::Tensor *CreateFilterTensorFp32(TypeId data_type, std::vector<int> filter_shape,
const std::vector<lite::Tensor *> &inputs, int copy_length, int index) {
auto filter_tensor =
new (std::nothrow) lite::Tensor(data_type, filter_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
if (filter_tensor == nullptr) {
MS_LOG(ERROR) << "new filter_tensor failed.";
return nullptr;
}
auto ret = filter_tensor->MallocData();
if (ret != RET_OK) {
delete filter_tensor;
MS_LOG(ERROR) << "filter_tensor malloc failed.";
return nullptr;
}
MS_ASSERT(data_type == kNumberTypeFloat32);
auto *origin_weight = reinterpret_cast<float *>(inputs.at(kWeightIndex)->data_c());
memcpy(filter_tensor->data_c(), origin_weight + index * copy_length, copy_length * sizeof(float));
return filter_tensor;
}
lite::Tensor *CreateBiasTensorFp32(TypeId data_type, std::vector<int> bias_shape,
const std::vector<lite::Tensor *> &inputs, int new_out_channel, int index) {
auto *origin_bias = inputs.at(kBiasIndex)->data_c();
auto bias_tensor =
new (std::nothrow) lite::Tensor(data_type, bias_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
if (bias_tensor == nullptr) {
MS_LOG(ERROR) << "new bias_tensor failed.";
return nullptr;
}
auto ret = bias_tensor->MallocData();
if (ret != RET_OK) {
delete bias_tensor;
MS_LOG(ERROR) << "bias_tensor malloc failed.";
return nullptr;
}
MS_ASSERT(data_type == kNumberTypeFloat32);
auto bias_data = reinterpret_cast<float *>(origin_bias);
memcpy(bias_tensor->data_c(), bias_data + index * new_out_channel, new_out_channel * sizeof(float));
return bias_tensor;
}
lite::Tensor *CreateOutputTensorFp32(std::vector<int> out_shape, const std::vector<lite::Tensor *> &outputs,
bool infered_flag, int index) {
auto out_tensor = new (std::nothrow) lite::Tensor();
if (out_tensor == nullptr) {
MS_LOG(ERROR) << "new tmp_out_tensor failed.";
return nullptr;
}
out_tensor->set_data_type(outputs.at(index)->data_type());
out_tensor->SetFormat(outputs.at(index)->GetFormat());
if (infered_flag) {
out_tensor->set_shape(out_shape);
auto ret = out_tensor->MallocData();
if (ret != RET_OK) {
delete out_tensor;
MS_LOG(ERROR) << "out_tensor malloc data failed.";
return nullptr;
}
}
return out_tensor;
}
kernel::LiteKernel *CpuConvFp32KernelSelect(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
@@ -208,31 +289,22 @@ kernel::LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
int group) {
std::vector<kernel::LiteKernel *> group_convs;
std::vector<int> in_shape;
std::vector<int> filter_shape;
std::vector<int> bias_shape;
std::vector<int> out_shape;
int out_unit;
bool has_bias = inputs.size() == 3;
bool use_winograd = false;
bool infered_flag = primitive != nullptr && primitive->GetInferFlag();
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
int out_channel = inputs.at(kWeightIndex)->Batch();
std::vector<int> in_shape;
std::vector<int> out_shape;
int new_in_channel = inputs.at(kWeightIndex)->Channel();
int new_out_channel = 0;
if (group == 0) {
MS_LOG(ERROR) << "Divisor 'group' cannot be 0.";
return nullptr;
} else {
new_out_channel = out_channel / group;
new_out_channel = inputs.at(kWeightIndex)->Batch() / group;
}
int kernel_h = conv_param->kernel_h_;
int kernel_w = conv_param->kernel_w_;
int input_num = inputs.size();
int output_num = outputs.size();
bool has_bias = input_num == 3;
bool use_winograd = false;
int out_unit;
bool infered_flag = primitive != nullptr && primitive->GetInferFlag();
if (infered_flag) {
int batch = inputs.front()->Batch();
int in_h = inputs.front()->Height();
@@ -243,11 +315,11 @@ kernel::LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor
in_shape = {batch, in_h, in_w, new_in_channel};
out_shape = {batch, conv_param->output_h_, conv_param->output_w_, new_out_channel};
}
std::vector<int> filter_shape = {new_out_channel, conv_param->kernel_h_, conv_param->kernel_w_, new_in_channel};
std::vector<int> bias_shape = {new_out_channel};
filter_shape = {new_out_channel, kernel_h, kernel_w, new_in_channel};
bias_shape = {new_out_channel};
auto *origin_weight = reinterpret_cast<float *>(inputs.at(kWeightIndex)->data_c());
// create sub kernels
std::vector<kernel::LiteKernel *> group_convs;
for (int i = 0; i < group; ++i) {
std::vector<lite::Tensor *> new_inputs;
std::vector<lite::Tensor *> new_outputs;
@@ -257,100 +329,58 @@ kernel::LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor
MS_LOG(ERROR) << "Get new conv parameter failed.";
return nullptr;
}
// get new input for each group
auto in_tensor =
new (std::nothrow) lite::Tensor(inputs.front()->data_type(), in_shape, Format_NHWC, lite::Tensor::Category::VAR);
// create new input for each group
auto in_tensor = CreateInputTensorFp32(inputs.front()->data_type(), in_shape, infered_flag);
if (in_tensor == nullptr) {
delete new_conv_parameter;
FreeMemoryFp32(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "new in_tensor failed.";
MS_LOG(ERROR) << "create input tensor failed.";
return nullptr;
}
if (infered_flag) {
auto ret = in_tensor->MallocData();
if (ret != RET_OK) {
delete new_conv_parameter;
delete in_tensor;
FreeMemoryFp32(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "in tensor malloc failed.";
return nullptr;
}
}
new_inputs.emplace_back(in_tensor);
// new weight
auto filter_tensor = new (std::nothrow) lite::Tensor(inputs.at(kWeightIndex)->data_type(), filter_shape,
Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
// create new weight
int copy_length = conv_param->kernel_h_ * conv_param->kernel_w_ * new_in_channel * new_out_channel;
auto filter_tensor =
CreateFilterTensorFp32(inputs.at(kWeightIndex)->data_type(), filter_shape, inputs, copy_length, i);
if (filter_tensor == nullptr) {
delete new_conv_parameter;
FreeMemoryFp32(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "new filter_tensor failed.";
MS_LOG(ERROR) << "create filter tensor failed.";
return nullptr;
}
auto ret = filter_tensor->MallocData();
if (ret != RET_OK) {
delete new_conv_parameter;
delete filter_tensor;
FreeMemoryFp32(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "filter_tensor malloc failed.";
return nullptr;
}
int copy_length = kernel_h * kernel_w * new_in_channel * new_out_channel;
memcpy(filter_tensor->data_c(), origin_weight + i * copy_length, copy_length * sizeof(float));
new_inputs.emplace_back(filter_tensor);
// if has bias, set new bias
// if has bias, create new bias
if (has_bias) {
auto *origin_bias = reinterpret_cast<float *>(inputs.at(kBiasIndex)->data_c());
auto bias_tensor = new (std::nothrow)
lite::Tensor(inputs.at(kBiasIndex)->data_type(), bias_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
auto bias_tensor =
CreateBiasTensorFp32(inputs.at(kBiasIndex)->data_type(), bias_shape, inputs, new_out_channel, i);
if (bias_tensor == nullptr) {
delete new_conv_parameter;
FreeMemoryFp32(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "new bias_tensor failed.";
MS_LOG(ERROR) << "create bias_tensor failed.";
return nullptr;
}
ret = bias_tensor->MallocData();
if (ret != RET_OK) {
delete new_conv_parameter;
delete bias_tensor;
FreeMemoryFp32(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "bias_tensor malloc failed.";
return nullptr;
}
memcpy(bias_tensor->data_c(), origin_bias + i * new_out_channel, new_out_channel * sizeof(float));
new_inputs.emplace_back(bias_tensor);
}
// set new output tensor
for (int j = 0; j < output_num; ++j) {
auto tmp_out_tensor = new (std::nothrow) lite::Tensor();
if (tmp_out_tensor == nullptr) {
// create new output tensor
for (size_t j = 0; j < outputs.size(); ++j) {
auto out_tensor = CreateOutputTensorFp32(out_shape, outputs, infered_flag, j);
if (out_tensor == nullptr) {
delete new_conv_parameter;
FreeMemoryFp32(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "new tmp_out_tensor failed.";
MS_LOG(ERROR) << "new out_tensor failed.";
return nullptr;
}
tmp_out_tensor->set_data_type(outputs.at(j)->data_type());
tmp_out_tensor->SetFormat(outputs.at(j)->GetFormat());
if (infered_flag) {
tmp_out_tensor->set_shape(out_shape);
ret = tmp_out_tensor->MallocData();
if (ret != RET_OK) {
delete new_conv_parameter;
delete tmp_out_tensor;
FreeMemoryFp32(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "tmp_out_tensor malloc data failed.";
return nullptr;
}
}
new_outputs.emplace_back(tmp_out_tensor);
new_outputs.emplace_back(out_tensor);
}
group_convs.emplace_back(CpuConvFp32KernelSelect(new_inputs, new_outputs,
reinterpret_cast<OpParameter *>(new_conv_parameter), ctx,
primitive, use_winograd, out_unit));
}
return new (std::nothrow)
GroupConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive, group_convs, group);
}


@@ -19,6 +19,7 @@
#include "src/kernel_registry.h"
#include "src/runtime/runtime_api.h"
#include "nnacl/quantization/quantize.h"
#include "nnacl/pack.h"
#include "include/errorcode.h"
using mindspore::lite::KernelRegistrar;
@@ -38,11 +39,77 @@ using mindspore::schema::PrimitiveType_Mean;
using mindspore::schema::PrimitiveType_Reduce;
namespace mindspore::kernel {
void ReduceInt8CPUKernel::OneAxis() {
auto axis_info = axes_[0];
if (axis_info == 0) {
pattern_ = kernel::N;
} else if (axis_info == 1) {
pattern_ = kernel::H;
} else if (axis_info == 2) {
pattern_ = kernel::W;
} else {
pattern_ = kernel::C;
}
}
void ReduceInt8CPUKernel::TwoAxes() {
auto axis_info1 = axes_[0];
auto axis_info2 = axes_[1];
auto axis_sum = axis_info1 + axis_info2;
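// For two distinct axes in [0, 3] the sum identifies the pair uniquely,
// except that {0, 3} and {1, 2} both sum to 3; that collision is resolved
// below by checking whether the first axis is 0.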
if (axis_sum == 1) {
pattern_ = kernel::NH;
} else if (axis_sum == 2) {
pattern_ = kernel::NW;
} else if (axis_sum == 3) {
if (axis_info1 == 0) {
pattern_ = kernel::NC;
} else {
pattern_ = kernel::HW;
}
} else if (axis_sum == 4) {
pattern_ = kernel::HC;
} else {
MS_ASSERT(axis_sum == 5);
pattern_ = kernel::WC;
}
}
void ReduceInt8CPUKernel::ThreeAxes() {
auto axis_info1 = axes_[0];
auto axis_info2 = axes_[1];
auto axis_info3 = axes_[2];
auto axis_sum = axis_info1 + axis_info2 + axis_info3;
if (axis_sum == 3) {
pattern_ = kernel::NHW;
} else if (axis_sum == 4) {
pattern_ = kernel::NHC;
} else if (axis_sum == 5) {
pattern_ = kernel::NWC;
} else {
MS_ASSERT(axis_sum == 6);
pattern_ = kernel::HWC;
}
}
void ReduceInt8CPUKernel::Match4DReducePattern() {
if (num_axes_ == 1) {
OneAxis();
} else if (num_axes_ == 2) {
TwoAxes();
} else if (num_axes_ == 3) {
ThreeAxes();
} else {
MS_ASSERT(num_axes_ == 4);
pattern_ = kernel::NHWC;
}
}
int ReduceInt8CPUKernel::Init() {
auto ret = ReduceBaseCPUKernel::Init();
if (ret != RET_OK) {
return ret;
}
Match4DReducePattern();
if (!this->in_tensors_[0]->shape().empty()) {
this->valid_shape_ = true;
ret = CalculateQuantArgs();
@@ -96,6 +163,64 @@ int ReduceInt8CPUKernel::Init() {
return ReSize();
}
void ReduceInt8CPUKernel::ReduceMean4DCalQuantParam() {
int reduce_num = 1;
auto in_shape = in_tensors_.front()->shape();
switch (pattern_) {
case N:
reduce_num = in_shape[0];
break;
case H:
reduce_num = in_shape[1];
break;
case W:
reduce_num = in_shape[2];
break;
case C:
reduce_num = in_shape[3];
break;
case NH:
reduce_num = in_shape[0] * in_shape[1];
break;
case NW:
reduce_num = in_shape[0] * in_shape[2];
break;
case NC:
reduce_num = in_shape[0] * in_shape[3];
break;
case HW:
reduce_num = in_shape[1] * in_shape[2];
break;
case HC:
reduce_num = in_shape[1] * in_shape[3];
break;
case WC:
reduce_num = in_shape[2] * in_shape[3];
break;
case NHW:
reduce_num = in_shape[0] * in_shape[1] * in_shape[2];
break;
case NHC:
reduce_num = in_shape[0] * in_shape[1] * in_shape[3];
break;
case NWC:
reduce_num = in_shape[0] * in_shape[2] * in_shape[3];
break;
case HWC:
reduce_num = in_shape[1] * in_shape[2] * in_shape[3];
break;
case NHWC:
reduce_num = in_shape[0] * in_shape[1] * in_shape[2] * in_shape[3];
break;
}
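// Derivation, following the comment in CalculateQuantArgs:
// (q_out - zp_out) * s_out = mean((q_in - zp_in) * s_in) rearranges to
// q_out = sum(q_in) * s_in / (s_out * num) + (zp_out - zp_in * s_in / s_out),
// so the affine part becomes an integer bias added after requantization.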
bias_ = quant_arg_.out_zp_ - quant_arg_.in_zp_ * quant_arg_.in_scale_ / quant_arg_.out_scale_;
int shift;
double reciprocal = quant_arg_.in_scale_ / (quant_arg_.out_scale_ * reduce_num);
QuantizeMultiplierSmallerThanOne(reciprocal, &reduce_mean_quant_param_.multiplier_, &shift);
reduce_mean_quant_param_.left_shift_ = shift < 0 ? -shift : 0;
reduce_mean_quant_param_.right_shift_ = shift > 0 ? shift : 0;
}
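// Illustrative numbers (assuming in/out scales are equal): reducing a 7x7 HW
// plane gives reduce_num = 49 and reciprocal ~= 0.0204, which quantizes to
// multiplier ~= 0.653 * 2^31 with right_shift_ = 5; the kernel then computes
// roughly q_out = (((sum * multiplier) >> 31) >> 5) + bias_.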
int ReduceInt8CPUKernel::CalculateQuantArgs() {
lite::Tensor *input = in_tensors_.at(0);
lite::Tensor *output = out_tensors_.at(0);
@@ -117,18 +242,24 @@ int ReduceInt8CPUKernel::CalculateQuantArgs() {
// (quant_out - zp_out) * scale_out = sum((quant_in - zp_in) * scale_in) * (1/num) for each axis in axes
// quant_out = sum(quant_in - zp_in) * (scale_in/scale_out) * (1/num)
if (mode_ == static_cast<int>(schema::ReduceMode_ReduceMean)) {
for (auto i = 0; i < num_axes_; i++) {
auto axis = axes_[i];
double reciprocal = 1.0 / in_tensors_.at(0)->shape()[axis];
QuantMulArg *qm = new (std::nothrow) QuantMulArg;
if (qm == nullptr) {
MS_LOG(ERROR) << "Reduce new QuantMulArg failed.";
return RET_NULL_PTR;
if (input->shape().size() == 4 && pattern_ == kernel::HW) {
// special case: the 4D HW pattern has a dedicated fast implementation
ReduceMean4DCalQuantParam();
pattern_impl_ = true;
} else {
for (auto i = 0; i < num_axes_; i++) {
auto axis = axes_[i];
double reciprocal = 1.0 / in_tensors_.at(0)->shape()[axis];
QuantMulArg *qm = new (std::nothrow) QuantMulArg;
if (qm == nullptr) {
MS_LOG(ERROR) << "Reduce new QuantMulArg failed.";
return RET_NULL_PTR;
}
QuantizeMultiplierSmallerThanOne(reciprocal, &qm->multiplier_, &shift);
qm->left_shift_ = shift < 0 ? -shift : 0;
qm->right_shift_ = shift > 0 ? shift : 0;
mean_multipliers_.push_back(qm);
}
QuantizeMultiplierSmallerThanOne(reciprocal, &qm->multiplier_, &shift);
qm->left_shift_ = shift < 0 ? -shift : 0;
qm->right_shift_ = shift > 0 ? shift : 0;
mean_multipliers_.push_back(qm);
}
}
@@ -230,6 +361,16 @@ int ReduceInt8Impl(void *cdata, int task_id) {
return RET_OK;
}
int ReduceMeanPatternInt8Impl(void *cdata, int task_id) {
auto reduce = reinterpret_cast<ReduceInt8CPUKernel *>(cdata);
auto error_code = reduce->Reduce4DExecute(task_id);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "Reduce Run error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;
}
return RET_OK;
}
void ReduceInt8CPUKernel::GetQuantArgs(size_t i) {
MS_ASSERT(i < static_cast<size_t>(num_axis_));
if (mode_ == static_cast<int>(schema::ReduceMode_ReduceMean)) {
@@ -250,6 +391,78 @@ void ReduceInt8CPUKernel::GetQuantArgs(size_t i) {
}
}
int ReduceInt8CPUKernel::Reduce4DExecute(int task_id) {
auto input = in_tensors_.at(0);
auto in_data = reinterpret_cast<int8_t *>(input->data_c());
auto in_shape = input->shape();
MS_ASSERT(in_shape.size() == 4);
int n = in_shape.at(0);
int h = in_shape.at(1);
int w = in_shape.at(2);
int c = in_shape.at(3);
auto output_data = reinterpret_cast<int8_t *>(out_tensors_.at(0)->data_c());
switch (pattern_) {
case N:
return ReduceMeanN(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
case H:
return ReduceMeanH(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
case W:
return ReduceMeanW(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
case C:
return ReduceMeanC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
case NH:
return ReduceMeanNH(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
case NW:
return ReduceMeanNW(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
case NC:
return ReduceMeanNC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
case HW: {
// data has been converted to NCHW beforehand so this reduction is efficient
int num = UP_DIV(c, ctx_->thread_num_);
int count = c - task_id * num;
count = count > num ? num : count;
int plane = h * w;
return ReduceMeanHW(n, plane, count, c, nchw_in_data_ + task_id * num * plane, output_data + task_id * num,
reduce_mean_quant_param_, bias_);
}
case HC:
return ReduceMeanHC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
case WC:
return ReduceMeanWC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
case NHW:
return ReduceMeanNHW(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
case NHC:
return ReduceMeanNHC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
case NWC:
return ReduceMeanNWC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
case HWC:
return ReduceMeanHWC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
case NHWC:
return ReduceMeanNHWC(n, h, w, c, in_data, output_data, reduce_mean_quant_param_);
}
return RET_OK;
}
int ReduceInt8CPUKernel::Fast4DReduceMeanHWImpl() {
auto input = in_tensors_.at(0);
auto input_data = reinterpret_cast<int8_t *>(input->data_c());
nchw_in_data_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(input->ElementsNum()));
if (nchw_in_data_ == nullptr) {
MS_LOG(ERROR) << "malloc nchw_in_data_ failed.";
return RET_ERROR;
}
PackNHWCToNCHWInt8(reinterpret_cast<void *>(input_data), reinterpret_cast<void *>(nchw_in_data_), input->Batch(),
input->Height() * input->Width(), input->Channel());
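// After the transpose each channel's H*W plane is contiguous, so the SIMD
// accumulation in ReduceMeanHW streams through sequential memory.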
auto ret = ParallelLaunch(this->context_->thread_pool_, ReduceMeanPatternInt8Impl, this, context_->thread_num_);
if (ret != RET_OK) {
ctx_->allocator->Free(nchw_in_data_);
MS_LOG(ERROR) << "Reduce run error, error_code[" << ret << "]";
return RET_ERROR;
}
ctx_->allocator->Free(nchw_in_data_);
return RET_OK;
}
int ReduceInt8CPUKernel::Run() {
if (!this->valid_shape_) {
auto ret = CalculateQuantArgs();
@@ -257,6 +470,11 @@ int ReduceInt8CPUKernel::Run() {
return ret;
}
}
// for now only the 4D reduce-mean HW pattern has a fast path; everything else falls back to the reference implementation
if (mode_ == static_cast<int>(schema::ReduceMode_ReduceMean) && pattern_impl_ && pattern_ == kernel::HW) {
return Fast4DReduceMeanHWImpl();
}
auto ret = MallocTmpBuffer();
if (ret != RET_OK) {
FreeTmpBuffer();
@@ -313,6 +531,7 @@ int ReduceInt8CPUKernel::CallReduceUnit(int task_id) {
}
return ret;
}
kernel::LiteKernel *CpuReduceInt8KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
const lite::InnerContext *ctx, const kernel::KernelKey &desc,


@@ -28,6 +28,7 @@
using mindspore::schema::ReduceMode;
namespace mindspore::kernel {
enum Four_DIMENSION_REDUCE_TEMPLATE { N, H, W, C, NH, NW, NC, HW, HC, WC, NHW, NHC, NWC, HWC, NHWC };
class ReduceInt8CPUKernel : public ReduceBaseCPUKernel {
typedef int (*Reducer)(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
@@ -38,7 +39,7 @@ class ReduceInt8CPUKernel : public ReduceBaseCPUKernel {
ReduceInt8CPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive) {}
: ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive), ctx_(ctx) {}
~ReduceInt8CPUKernel() {
for (auto qm : mean_multipliers_) {
delete qm;
@@ -59,6 +60,8 @@ class ReduceInt8CPUKernel : public ReduceBaseCPUKernel {
int Init() override;
int ReSize() override;
int Run() override;
int Fast4DReduceMeanHWImpl();
int Reduce4DExecute(int task_id);
int CallReduceUnit(int task_id);
int ReduceLastAxis(int task_id);
@@ -68,22 +71,31 @@ class ReduceInt8CPUKernel : public ReduceBaseCPUKernel {
private:
int MallocTmpBuffer();
void FreeTmpBuffer();
void Match4DReducePattern();
void OneAxis();
void TwoAxes();
void ThreeAxes();
void ReduceMean4DCalQuantParam();
int CalculateQuantArgs();
void GetQuantArgs(size_t i);
private:
ReduceParameter *param_ = nullptr;
ReduceQuantArg quant_arg_;
int8_t *nchw_in_data_ = nullptr;
int32_t bias_ = 0;
private:
const lite::InnerContext *ctx_;
int32_t *begin_src_data_ = nullptr;
int8_t *last_dst_data_ = nullptr;
std::vector<int32_t *> data_buffers_;
const int32_t *src_data_ = nullptr;
int32_t *dst_data_ = nullptr;
bool valid_shape_ = false;
bool pattern_impl_ = false;
Four_DIMENSION_REDUCE_TEMPLATE pattern_;
QuantMulArg reduce_mean_quant_param_; // used in reduce mean 4D situation
Reducer reducer_ = nullptr;
LastReducer last_reducer_ = nullptr;
std::vector<QuantMulArg *> mean_multipliers_;