diff --git a/mindspore/lite/nnacl/fp16/reduce_fp16.c b/mindspore/lite/nnacl/fp16/reduce_fp16.c index cf7665ac0e1..69a21a4c644 100644 --- a/mindspore/lite/nnacl/fp16/reduce_fp16.c +++ b/mindspore/lite/nnacl/fp16/reduce_fp16.c @@ -19,8 +19,8 @@ #include "nnacl/errorcode.h" int ReduceMeanFp16(const int outer_size, const int inner_size, const int axis_size, const float16_t *src_data, - const int *src_shape, float16_t *dst_data, const int tid, const int thread_num) { - if (src_data == NULL || src_shape == NULL || dst_data == NULL) { + float16_t *dst_data, const int tid, const int thread_num) { + if (src_data == NULL || dst_data == NULL) { return NNACL_NULL_PTR; } int i, j, k; diff --git a/mindspore/lite/nnacl/fp16/reduce_fp16.h b/mindspore/lite/nnacl/fp16/reduce_fp16.h index 8a9655abb14..c826e90ee02 100644 --- a/mindspore/lite/nnacl/fp16/reduce_fp16.h +++ b/mindspore/lite/nnacl/fp16/reduce_fp16.h @@ -26,7 +26,7 @@ extern "C" { #endif int ReduceMeanFp16(const int outer_size, const int inner_size, const int axis_size, const float16_t *src_data, - const int *src_shape, float16_t *dst_data, const int tid, const int thread_num); + float16_t *dst_data, const int tid, const int thread_num); #ifdef __cplusplus } diff --git a/mindspore/lite/nnacl/fp32/reduce.c b/mindspore/lite/nnacl/fp32/reduce.c index 03ba8b634be..fdda3f5da17 100644 --- a/mindspore/lite/nnacl/fp32/reduce.c +++ b/mindspore/lite/nnacl/fp32/reduce.c @@ -18,9 +18,9 @@ #include "nnacl/fp32/reduce.h" #include "nnacl/errorcode.h" -int ReduceMean(const int outer_size, const int inner_size, const int axis_size, const float *src_data, - const int *src_shape, float *dst_data, const int tid, const int thread_num) { - if (src_data == NULL || src_shape == NULL || dst_data == NULL) { +int ReduceMean(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data, + const int tid, const int thread_num) { + if (src_data == NULL || dst_data == NULL) { return NNACL_NULL_PTR; } int i, j, k; @@ -39,9 +39,9 @@ int ReduceMean(const int outer_size, const int inner_size, const int axis_size, } return NNACL_OK; } -int ReduceSum(const int outer_size, const int inner_size, const int axis_size, const float *src_data, - const int *src_shape, float *dst_data, const int tid, const int thread_num) { - if (src_data == NULL || src_shape == NULL || dst_data == NULL) { +int ReduceSum(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data, + const int tid, const int thread_num) { + if (src_data == NULL || dst_data == NULL) { return NNACL_NULL_PTR; } int i, j, k; @@ -60,9 +60,9 @@ int ReduceSum(const int outer_size, const int inner_size, const int axis_size, c } return NNACL_OK; } -int ReduceMax(const int outer_size, const int inner_size, const int axis_size, const float *src_data, - const int *src_shape, float *dst_data, const int tid, const int thread_num) { - if (src_data == NULL || src_shape == NULL || dst_data == NULL) { +int ReduceMax(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data, + const int tid, const int thread_num) { + if (src_data == NULL || dst_data == NULL) { return NNACL_NULL_PTR; } int i, j, k; @@ -81,9 +81,9 @@ int ReduceMax(const int outer_size, const int inner_size, const int axis_size, c } return NNACL_OK; } -int ReduceMin(const int outer_size, const int inner_size, const int axis_size, const float *src_data, - const int *src_shape, float *dst_data, const int tid, const int thread_num) { - if (src_data == NULL || src_shape == NULL || dst_data == NULL) { +int ReduceMin(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data, + const int tid, const int thread_num) { + if (src_data == NULL || dst_data == NULL) { return NNACL_NULL_PTR; } int i, j, k; @@ -102,9 +102,9 @@ int ReduceMin(const int outer_size, const int inner_size, const int axis_size, c } return NNACL_OK; } -int ReduceProd(const int outer_size, const int inner_size, const int axis_size, const float *src_data, - const int *src_shape, float *dst_data, const int tid, const int thread_num) { - if (src_data == NULL || src_shape == NULL || dst_data == NULL) { +int ReduceProd(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data, + const int tid, const int thread_num) { + if (src_data == NULL || dst_data == NULL) { return NNACL_NULL_PTR; } int i, j, k; @@ -124,8 +124,8 @@ int ReduceProd(const int outer_size, const int inner_size, const int axis_size, return NNACL_OK; } int ReduceSumSquare(const int outer_size, const int inner_size, const int axis_size, const float *src_data, - const int *src_shape, float *dst_data, const int tid, const int thread_num) { - if (src_data == NULL || src_shape == NULL || dst_data == NULL) { + float *dst_data, const int tid, const int thread_num) { + if (src_data == NULL || dst_data == NULL) { return NNACL_NULL_PTR; } int i, j, k; diff --git a/mindspore/lite/nnacl/fp32/reduce.h b/mindspore/lite/nnacl/fp32/reduce.h index 5844b23f32e..78fa15c1352 100644 --- a/mindspore/lite/nnacl/fp32/reduce.h +++ b/mindspore/lite/nnacl/fp32/reduce.h @@ -22,18 +22,18 @@ #ifdef __cplusplus extern "C" { #endif -int ReduceMean(const int outer_size, const int inner_size, const int axis_size, const float *src_data, - const int *src_shape, float *dst_data, const int tid, const int thread_num); -int ReduceSum(const int outer_size, const int inner_size, const int axis_size, const float *src_data, - const int *src_shape, float *dst_data, const int tid, const int thread_num); -int ReduceMax(const int outer_size, const int inner_size, const int axis_size, const float *src_data, - const int *src_shape, float *dst_data, const int tid, const int thread_num); -int ReduceMin(const int outer_size, const int inner_size, const int axis_size, const float *src_data, - const int *src_shape, float *dst_data, const int tid, const int thread_num); -int ReduceProd(const int outer_size, const int inner_size, const int axis_size, const float *src_data, - const int *src_shape, float *dst_data, const int tid, const int thread_num); +int ReduceMean(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data, + const int tid, const int thread_num); +int ReduceSum(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data, + const int tid, const int thread_num); +int ReduceMax(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data, + const int tid, const int thread_num); +int ReduceMin(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data, + const int tid, const int thread_num); +int ReduceProd(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data, + const int tid, const int thread_num); int ReduceSumSquare(const int outer_size, const int inner_size, const int axis_size, const float *src_data, - const int *src_shape, float *dst_data, const int tid, const int thread_num); + float *dst_data, const int tid, const int thread_num); #ifdef __cplusplus } #endif diff --git a/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc b/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc index 4024f3afe31..5731acc901a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc +++ b/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc @@ -120,7 +120,54 @@ int ReduceBaseCPUKernel::Init() { return RET_OK; } -int ReduceBaseCPUKernel::ReSize() { return CheckParameters(); } +void ReduceBaseCPUKernel::CalculateInnerOuterSize() { + outer_sizes_.clear(); + inner_sizes_.clear(); + axis_sizes_.clear(); + auto tmp_shape = in_tensors_.at(0)->shape(); + for (auto i = 0; i < num_axes_; ++i) { + int axis = axes_[i]; + auto outer_size = 1; + for (int j = 0; j < axis; j++) { + outer_size *= tmp_shape[j]; + } + outer_sizes_.emplace_back(outer_size); + auto inner_size = 1; + for (int k = axis + 1; k < static_cast(tmp_shape.size()); k++) { + inner_size *= tmp_shape[k]; + } + inner_sizes_.emplace_back(inner_size); + axis_sizes_.emplace_back(tmp_shape[axis]); + tmp_shape[axis] = 1; + } +} + +void ReduceBaseCPUKernel::CalculateTmpBufferSize() { + buffer_sizes_.clear(); + auto input_shape = in_tensors_.at(0)->shape(); + for (auto i = 0; i < num_axes_; i++) { + int axis = axes_[i]; + size_t size = 1; + for (size_t j = 0; j < input_shape.size(); j++) { + if (axis != static_cast(j)) { + size *= input_shape[j]; + } + } + MS_ASSERT(context_->allocator != nullptr); + buffer_sizes_.emplace_back(size); + input_shape[axis] = 1; + } +} + +int ReduceBaseCPUKernel::ReSize() { + auto ret = CheckParameters(); + if (ret != RET_OK) { + return ret; + } + CalculateTmpBufferSize(); + CalculateInnerOuterSize(); + return RET_OK; +} kernel::LiteKernel *CpuReduceFp32KernelCreator(const std::vector &inputs, const std::vector &outputs, OpParameter *opParameter, diff --git a/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.h b/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.h index 9a60f9a250c..f233385dd64 100644 --- a/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.h +++ b/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.h @@ -45,10 +45,15 @@ class ReduceBaseCPUKernel : public LiteKernel { bool reduce_to_end_; protected: + void CalculateTmpBufferSize(); + void CalculateInnerOuterSize(); + std::vector buffer_sizes_; + std::vector outer_sizes_; + std::vector inner_sizes_; + std::vector axis_sizes_; int outer_size_; int inner_size_; int axis_size_; - std::vector tmp_shape_; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.cc index cae5b1c4156..9cd329c012d 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.cc @@ -60,8 +60,8 @@ int ReduceFp16CPUKernel::Init() { int ReduceFp16CPUKernel::ReSize() { return ReduceBaseCPUKernel::ReSize(); } int ReduceFp16CPUKernel::CallReduceUnit(int task_id) { - auto ret = reducer_(outer_size_, inner_size_, axis_size_, fp16_src_data_, tmp_shape_.data(), fp16_dst_data_, task_id, - context_->thread_num_); + auto ret = + reducer_(outer_size_, inner_size_, axis_size_, fp16_src_data_, fp16_dst_data_, task_id, context_->thread_num_); return ret; } @@ -88,7 +88,6 @@ int ReduceFp16CPUKernel::Run() { return ret; } - tmp_shape_ = in_tensors_.at(0)->shape(); auto in_tensor = in_tensors_.at(0); if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) { auto input_data = reinterpret_cast(in_tensor->MutableData()); @@ -100,23 +99,15 @@ int ReduceFp16CPUKernel::Run() { fp16_src_data_ = fp16_input_; for (int i = 0; i < data_buffers_.size(); ++i) { fp16_dst_data_ = data_buffers_[i]; - int axis = axes_[i]; - outer_size_ = 1; - for (int j = 0; j < axis; j++) { - outer_size_ *= tmp_shape_[j]; - } - inner_size_ = 1; - for (int k = axis + 1; k < static_cast(tmp_shape_.size()); k++) { - inner_size_ *= tmp_shape_[k]; - } - axis_size_ = tmp_shape_[axis]; + outer_size_ = outer_sizes_[i]; + inner_size_ = inner_sizes_[i]; + axis_size_ = axis_sizes_[i]; auto error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ReduceImpl, this, context_->thread_num_); if (error_code != RET_OK) { FreeTmpBuffer(); MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]"; return RET_ERROR; } - tmp_shape_[axis] = 1; fp16_src_data_ = fp16_dst_data_; } @@ -151,22 +142,14 @@ void ReduceFp16CPUKernel::FreeTmpBuffer() { } int ReduceFp16CPUKernel::MallocTmpBuffer() { - auto input_shape = in_tensors_.at(0)->shape(); - for (auto i = 0; i < num_axes_; i++) { - int axis = axes_[i]; - size_t size = 1; - for (auto j = 0; j < input_shape.size(); j++) { - if (static_cast(axis) != j) { - size *= input_shape[j]; - } - } + data_buffers_.clear(); + for (auto size : buffer_sizes_) { float16_t *buffer = reinterpret_cast(context_->allocator->Malloc(size * sizeof(float16_t))); if (buffer == nullptr) { MS_LOG(ERROR) << "Malloc data failed"; return RET_ERROR; } data_buffers_.emplace_back(buffer); - input_shape[axis] = 1; } auto in_tensor = in_tensors_.front(); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.h index cb076c93758..ceb0b228eca 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.h @@ -27,7 +27,7 @@ using mindspore::schema::ReduceMode; namespace mindspore::kernel { class ReduceFp16CPUKernel : public ReduceBaseCPUKernel { typedef int (*Reducer)(const int outer_size, const int inner_size, const int axis_size, const float16_t *src_data, - const int *src_shape, float16_t *dst_data, const int tid, const int thread_num); + float16_t *dst_data, const int tid, const int thread_num); public: ReduceFp16CPUKernel(OpParameter *param, const std::vector &inputs, diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc index 41adc5be8eb..f98351e36ba 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc @@ -81,17 +81,10 @@ int ReduceCPUKernel::Init() { return ReSize(); } -int ReduceCPUKernel::ReSize() { - auto ret = ReduceBaseCPUKernel::ReSize(); - if (ret != RET_OK) { - return ret; - } - return MallocTmpBuffer(); -} +int ReduceCPUKernel::ReSize() { return ReduceBaseCPUKernel::ReSize(); } int ReduceCPUKernel::CallReduceUnit(int task_id) { - auto ret = reducer_(outer_size_, inner_size_, axis_size_, src_data_, tmp_shape_.data(), dst_data_, task_id, - context_->thread_num_); + auto ret = reducer_(outer_size_, inner_size_, axis_size_, src_data_, dst_data_, task_id, context_->thread_num_); return ret; } @@ -111,75 +104,55 @@ int ReduceCPUKernel::Run() { MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; return prepare_ret; } - tmp_shape_ = in_tensors_.at(0)->shape(); + auto ret = MallocTmpBuffer(); + if (ret != RET_OK) { + FreeTmpBuffer(); + return ret; + } + src_data_ = static_cast(in_tensors_.at(0)->MutableData()); - for (size_t i = 0; i < data_buffers_.size(); ++i) { - dst_data_ = data_buffers_[i]; - int axis = axes_[i]; - outer_size_ = 1; - for (int j = 0; j < axis; j++) { - outer_size_ *= tmp_shape_[j]; + for (size_t i = 0; i < static_cast(num_axes_); ++i) { + if (i != static_cast(num_axes_ - 1)) { + dst_data_ = data_buffers_[i]; + } else { + dst_data_ = reinterpret_cast(out_tensors_.at(0)->MutableData()); } - inner_size_ = 1; - for (int k = axis + 1; k < static_cast(tmp_shape_.size()); k++) { - inner_size_ *= tmp_shape_[k]; - } - axis_size_ = tmp_shape_[axis]; + outer_size_ = outer_sizes_[i]; + inner_size_ = inner_sizes_[i]; + axis_size_ = axis_sizes_[i]; auto error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ReduceImpl, this, context_->thread_num_); if (error_code != RET_OK) { MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]"; + FreeTmpBuffer(); return RET_ERROR; } - tmp_shape_[axis] = 1; src_data_ = dst_data_; } - - int last_reduce_axis = axes_[num_axes_ - 1]; - outer_size_ = 1; - for (int i = 0; i < last_reduce_axis; i++) { - outer_size_ *= tmp_shape_[i]; - } - inner_size_ = 1; - for (int i = last_reduce_axis + 1; i < static_cast(tmp_shape_.size()); i++) { - inner_size_ *= tmp_shape_[i]; - } - axis_size_ = tmp_shape_[last_reduce_axis]; - dst_data_ = reinterpret_cast(out_tensors_.at(0)->MutableData()); - auto error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ReduceImpl, this, context_->thread_num_); - if (error_code != RET_OK) { - MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]"; - return RET_ERROR; - } - + FreeTmpBuffer(); return RET_OK; } int ReduceCPUKernel::MallocTmpBuffer() { - for (auto buffer : data_buffers_) { - if (buffer != nullptr) { - free(buffer); - buffer = nullptr; - } - } data_buffers_.clear(); - - auto input_shape = in_tensors_.at(0)->shape(); - for (auto i = 0; i < num_axes_ - 1; i++) { - int axis = axes_[i]; - size_t size = 1; - for (size_t j = 0; j < input_shape.size(); j++) { - if (axis != static_cast(j)) { - size *= input_shape[j]; - } - } - float *buffer = reinterpret_cast(malloc(size * sizeof(float))); + for (auto size : buffer_sizes_) { + float *buffer = reinterpret_cast(context_->allocator->Malloc(size * sizeof(float))); if (buffer == nullptr) { MS_LOG(ERROR) << "Malloc data failed."; return RET_ERROR; } data_buffers_.emplace_back(buffer); - input_shape[axis] = 1; } return RET_OK; } + +void ReduceCPUKernel::FreeTmpBuffer() { + for (size_t i = 0; i < data_buffers_.size(); i++) { + float *buffer = data_buffers_[i]; + if (buffer != nullptr) { + context_->allocator->Free(buffer); + buffer = nullptr; + } + } + data_buffers_.clear(); +} } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.h b/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.h index 309d8e8cd50..6cf5856d612 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.h @@ -28,7 +28,7 @@ using mindspore::schema::ReduceMode; namespace mindspore::kernel { class ReduceCPUKernel : public ReduceBaseCPUKernel { typedef int (*Reducer)(const int outer_size, const int inner_size, const int axis_size, const float *src_data, - const int *src_shape, float *dst_data, const int tid, const int thread_num); + float *dst_data, const int tid, const int thread_num); public: ReduceCPUKernel(OpParameter *param, const std::vector &inputs, @@ -36,13 +36,7 @@ class ReduceCPUKernel : public ReduceBaseCPUKernel { const mindspore::lite::PrimitiveC *primitive) : ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive) {} ~ReduceCPUKernel() { - for (size_t i = 0; i < data_buffers_.size(); i++) { - float *buffer = data_buffers_[i]; - if (buffer != nullptr) { - free(buffer); - buffer = nullptr; - } - } + FreeTmpBuffer(); src_data_ = nullptr; dst_data_ = nullptr; } @@ -60,6 +54,7 @@ class ReduceCPUKernel : public ReduceBaseCPUKernel { private: int MallocTmpBuffer(); + void FreeTmpBuffer(); }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.cc index b3d02c874b7..9d6c4dd999d 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.cc @@ -39,10 +39,6 @@ int ReduceInt8CPUKernel::Init() { if (ret != RET_OK) { return ret; } - ret = MallocTmpBuffer(); - if (ret != RET_OK) { - return ret; - } ret = CalculateQuantArgs(); if (ret != RET_OK) { return ret; @@ -179,23 +175,15 @@ int ReduceInt8CPUKernel::CalculateQuantArgs() { } int ReduceInt8CPUKernel::MallocTmpBuffer() { - auto input_shape = in_tensors_.at(0)->shape(); - for (auto i = 0; i < num_axes_ - 1; i++) { - int axis = axes_[i]; - size_t size = 1; - for (size_t j = 0; j < input_shape.size(); j++) { - if (axis != static_cast(j)) { - size *= input_shape[j]; - } - } - MS_ASSERT(context_->allocator != nullptr); - int32_t *buffer = reinterpret_cast(context_->allocator->Malloc(size * sizeof(int32_t))); + data_buffers_.clear(); + MS_ASSERT(static_cast(buffer_sizes_.size()) == num_axes_ - 1); + for (auto buffer_size : buffer_sizes_) { + int32_t *buffer = reinterpret_cast(context_->allocator->Malloc(buffer_size * sizeof(int32_t))); if (buffer == nullptr) { MS_LOG(ERROR) << "Malloc data failed."; return RET_ERROR; } data_buffers_.emplace_back(buffer); - input_shape[axis] = 1; } auto input = in_tensors_.at(0); @@ -203,17 +191,13 @@ int ReduceInt8CPUKernel::MallocTmpBuffer() { if (begin_src_data_ == nullptr) { return RET_NULL_PTR; } - auto input_data = reinterpret_cast(input->MutableData()); - for (auto i = 0; i < input->ElementsNum(); i++) { - begin_src_data_[i] = static_cast(input_data[i]); - } + return RET_OK; } void ReduceInt8CPUKernel::FreeTmpBuffer() { for (auto buffer : data_buffers_) { if (buffer != nullptr) { - MS_ASSERT(context_->allocator != nullptr); context_->allocator->Free(buffer); buffer = nullptr; } @@ -221,20 +205,12 @@ void ReduceInt8CPUKernel::FreeTmpBuffer() { data_buffers_.clear(); if (begin_src_data_ != nullptr) { - MS_ASSERT(context_->allocator != nullptr); context_->allocator->Free(begin_src_data_); begin_src_data_ = nullptr; } } -int ReduceInt8CPUKernel::ReSize() { - FreeTmpBuffer(); - auto ret = MallocTmpBuffer(); - if (ret != RET_OK) { - FreeTmpBuffer(); - } - return ret; -} +int ReduceInt8CPUKernel::ReSize() { return ReduceBaseCPUKernel::ReSize(); } int ReduceInt8Impl(void *cdata, int task_id) { auto reduce = reinterpret_cast(cdata); @@ -246,80 +222,65 @@ int ReduceInt8Impl(void *cdata, int task_id) { return RET_OK; } +void ReduceInt8CPUKernel::GetQuantArgs(size_t i) { + MS_ASSERT(i < static_cast(num_axis_)); + if (mode_ == static_cast(schema::ReduceMode_ReduceMean)) { + quant_arg_.mean_multiplier_ = mean_multipliers_[i]->multiplier_; + quant_arg_.mean_left_shift_ = mean_multipliers_[i]->left_shift_; + quant_arg_.mean_right_shift_ = mean_multipliers_[i]->right_shift_; + } + + if (mode_ == static_cast(schema::ReduceMode_ReduceProd)) { + quant_arg_.prod_multiplier_ = prod_multipliers_[i]->multiplier_; + quant_arg_.prod_left_shift_ = prod_multipliers_[i]->left_shift_; + quant_arg_.prod_right_shift_ = prod_multipliers_[i]->right_shift_; + } + if (mode_ == static_cast(schema::ReduceMode_ReduceSumSquare)) { + quant_arg_.sum_square_multiplier_ = sum_square_multipliers_[i]->multiplier_; + quant_arg_.sum_square_left_shift_ = sum_square_multipliers_[i]->left_shift_; + quant_arg_.sum_square_right_shift_ = sum_square_multipliers_[i]->right_shift_; + } +} + int ReduceInt8CPUKernel::Run() { auto prepare_ret = Prepare(); if (prepare_ret != RET_OK) { MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; return prepare_ret; } + auto ret = MallocTmpBuffer(); + if (ret != RET_OK) { + FreeTmpBuffer(); + return ret; + } is_last_axis_ = false; - tmp_shape_ = in_tensors_.at(0)->shape(); + + auto input = in_tensors().at(0); + auto input_data = reinterpret_cast(input->MutableData()); + for (auto i = 0; i < input->ElementsNum(); i++) { + begin_src_data_[i] = static_cast(input_data[i]); + } src_data_ = begin_src_data_; - - for (size_t i = 0; i < data_buffers_.size(); ++i) { - if (mode_ == static_cast(schema::ReduceMode_ReduceMean)) { - quant_arg_.mean_multiplier_ = mean_multipliers_[i]->multiplier_; - quant_arg_.mean_left_shift_ = mean_multipliers_[i]->left_shift_; - quant_arg_.mean_right_shift_ = mean_multipliers_[i]->right_shift_; - } - - if (mode_ == static_cast(schema::ReduceMode_ReduceProd)) { - quant_arg_.prod_multiplier_ = prod_multipliers_[i]->multiplier_; - quant_arg_.prod_left_shift_ = prod_multipliers_[i]->left_shift_; - quant_arg_.prod_right_shift_ = prod_multipliers_[i]->right_shift_; - } - if (mode_ == static_cast(schema::ReduceMode_ReduceSumSquare)) { - quant_arg_.sum_square_multiplier_ = sum_square_multipliers_[i]->multiplier_; - quant_arg_.sum_square_left_shift_ = sum_square_multipliers_[i]->left_shift_; - quant_arg_.sum_square_right_shift_ = sum_square_multipliers_[i]->right_shift_; - } + for (size_t i = 0; i < data_buffers_.size() - 1; ++i) { + GetQuantArgs(i); dst_data_ = data_buffers_[i]; - int axis = axes_[i]; - outer_size_ = 1; - for (int j = 0; j < axis; j++) { - outer_size_ *= tmp_shape_[j]; - } - inner_size_ = 1; - for (int k = axis + 1; k < static_cast(tmp_shape_.size()); k++) { - inner_size_ *= tmp_shape_[k]; - } - axis_size_ = tmp_shape_[axis]; + outer_size_ = outer_sizes_[i]; + inner_size_ = inner_sizes_[i]; + axis_size_ = axis_sizes_[i]; auto error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ReduceInt8Impl, this, context_->thread_num_); if (error_code != RET_OK) { FreeTmpBuffer(); MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]"; return RET_ERROR; } - tmp_shape_[axis] = 1; src_data_ = dst_data_; } - if (mode_ == static_cast(schema::ReduceMode_ReduceMean)) { - quant_arg_.mean_multiplier_ = mean_multipliers_.back()->multiplier_; - quant_arg_.mean_left_shift_ = mean_multipliers_.back()->left_shift_; - quant_arg_.mean_right_shift_ = mean_multipliers_.back()->right_shift_; - } - if (mode_ == static_cast(schema::ReduceMode_ReduceProd)) { - quant_arg_.prod_multiplier_ = prod_multipliers_.back()->multiplier_; - quant_arg_.prod_left_shift_ = prod_multipliers_.back()->left_shift_; - quant_arg_.prod_right_shift_ = prod_multipliers_.back()->right_shift_; - } - if (mode_ == static_cast(schema::ReduceMode_ReduceSumSquare)) { - quant_arg_.sum_square_multiplier_ = sum_square_multipliers_.back()->multiplier_; - quant_arg_.sum_square_left_shift_ = sum_square_multipliers_.back()->left_shift_; - quant_arg_.sum_square_right_shift_ = sum_square_multipliers_.back()->right_shift_; - } - int last_reduce_axis = axes_[num_axes_ - 1]; - outer_size_ = 1; - for (int i = 0; i < last_reduce_axis; i++) { - outer_size_ *= tmp_shape_[i]; - } - inner_size_ = 1; - for (int i = last_reduce_axis + 1; i < static_cast(tmp_shape_.size()); i++) { - inner_size_ *= tmp_shape_[i]; - } - axis_size_ = tmp_shape_[last_reduce_axis]; + GetQuantArgs(static_cast(num_axes_ - 1)); + outer_size_ = outer_sizes_.back(); + inner_size_ = inner_sizes_.back(); + axis_size_ = axis_sizes_.back(); last_dst_data_ = reinterpret_cast(out_tensors_.at(0)->MutableData()); is_last_axis_ = true; auto error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ReduceInt8Impl, this, context_->thread_num_); @@ -328,7 +289,6 @@ int ReduceInt8CPUKernel::Run() { FreeTmpBuffer(); return RET_ERROR; } - FreeTmpBuffer(); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.h index f9aa5b231df..86d95a912e8 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.h @@ -68,7 +68,9 @@ class ReduceInt8CPUKernel : public ReduceBaseCPUKernel { private: int MallocTmpBuffer(); void FreeTmpBuffer(); + int CalculateQuantArgs(); + void GetQuantArgs(size_t i); private: ReduceParameter *param_ = nullptr; diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/reduce_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/reduce_fp32_tests.cc index 5940775d618..82db7d3418a 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/reduce_fp32_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/reduce_fp32_tests.cc @@ -46,7 +46,7 @@ TEST_F(TestReduceFp32, Mean) { int outer_size = 2; int inner_size = 12; int axis_size = 4; - (void)ReduceMean(outer_size, inner_size, axis_size, in, input_shape, out, tid, thread_num); + (void)ReduceMean(outer_size, inner_size, axis_size, in, out, tid, thread_num); int output_size = 24; CompareOutputData(out, correct, output_size, err_tol); @@ -72,9 +72,9 @@ TEST_F(TestReduceFp32, Mean2Thread) { int axis_size = 4; thread_num = 2; tid = 0; - (void)ReduceMean(outer_size, inner_size, axis_size, in, input_shape, out, tid, thread_num); + (void)ReduceMean(outer_size, inner_size, axis_size, in, out, tid, thread_num); tid = 1; - (void)ReduceMean(outer_size, inner_size, axis_size, in, input_shape, out, tid, thread_num); + (void)ReduceMean(outer_size, inner_size, axis_size, in, out, tid, thread_num); int output_size = 24; CompareOutputData(out, correct, output_size, err_tol); @@ -98,7 +98,7 @@ TEST_F(TestReduceFp32, MeanAllAxis) { float *src = in; float dst1[48] = {0}; MS_ASSERT(dst != nullptr); - (void)ReduceMean(outer_size, inner_size, axis_size, src, input_shape, dst1, tid, thread_num); + (void)ReduceMean(outer_size, inner_size, axis_size, src, dst1, tid, thread_num); input_shape[0] = 1; // 1 4 4 3 outer_size = 1; @@ -106,7 +106,7 @@ TEST_F(TestReduceFp32, MeanAllAxis) { axis_size = 4; src = dst1; float dst2[12] = {0}; - (void)ReduceMean(outer_size, inner_size, axis_size, src, input_shape, dst2, tid, thread_num); + (void)ReduceMean(outer_size, inner_size, axis_size, src, dst2, tid, thread_num); input_shape[1] = 1; // 1 1 4 3 outer_size = 1; @@ -114,14 +114,14 @@ TEST_F(TestReduceFp32, MeanAllAxis) { axis_size = 4; src = dst2; float dst3[3] = {0}; - (void)ReduceMean(outer_size, inner_size, axis_size, src, input_shape, dst3, tid, thread_num); + (void)ReduceMean(outer_size, inner_size, axis_size, src, dst3, tid, thread_num); input_shape[2] = 1; // 1 1 1 3 outer_size = 1; inner_size = 1; axis_size = 3; src = dst3; - (void)ReduceMean(outer_size, inner_size, axis_size, src, input_shape, out, tid, thread_num); + (void)ReduceMean(outer_size, inner_size, axis_size, src, out, tid, thread_num); int output_size = 1; CompareOutputData(out, correct, output_size, err_tol); @@ -145,7 +145,7 @@ TEST_F(TestReduceFp32, Sum) { int outer_size = 2; int inner_size = 12; int axis_size = 4; - (void)ReduceSum(outer_size, inner_size, axis_size, in, input_shape, out, tid, thread_num); + (void)ReduceSum(outer_size, inner_size, axis_size, in, out, tid, thread_num); int output_size = 24; CompareOutputData(out, correct, output_size, err_tol); @@ -171,9 +171,9 @@ TEST_F(TestReduceFp32, Sum2Thread) { int axis_size = 4; thread_num = 2; tid = 0; - (void)ReduceSum(outer_size, inner_size, axis_size, in, input_shape, out, tid, thread_num); + (void)ReduceSum(outer_size, inner_size, axis_size, in, out, tid, thread_num); tid = 1; - (void)ReduceSum(outer_size, inner_size, axis_size, in, input_shape, out, tid, thread_num); + (void)ReduceSum(outer_size, inner_size, axis_size, in, out, tid, thread_num); int output_size = 24; CompareOutputData(out, correct, output_size, err_tol); @@ -197,7 +197,7 @@ TEST_F(TestReduceFp32, SumAllAxis) { float *src = in; float dst1[48] = {0}; MS_ASSERT(dst != nullptr); - (void)ReduceSum(outer_size, inner_size, axis_size, src, input_shape, dst1, tid, thread_num); + (void)ReduceSum(outer_size, inner_size, axis_size, src, dst1, tid, thread_num); input_shape[0] = 1; // 1 4 4 3 outer_size = 1; @@ -205,7 +205,7 @@ TEST_F(TestReduceFp32, SumAllAxis) { axis_size = 4; src = dst1; float dst2[12] = {0}; - (void)ReduceSum(outer_size, inner_size, axis_size, src, input_shape, dst2, tid, thread_num); + (void)ReduceSum(outer_size, inner_size, axis_size, src, dst2, tid, thread_num); input_shape[1] = 1; // 1 1 4 3 outer_size = 1; @@ -213,14 +213,14 @@ TEST_F(TestReduceFp32, SumAllAxis) { axis_size = 4; src = dst2; float dst3[3] = {0}; - (void)ReduceSum(outer_size, inner_size, axis_size, src, input_shape, dst3, tid, thread_num); + (void)ReduceSum(outer_size, inner_size, axis_size, src, dst3, tid, thread_num); input_shape[2] = 1; // 1 1 1 3 outer_size = 1; inner_size = 1; axis_size = 3; src = dst3; - (void)ReduceSum(outer_size, inner_size, axis_size, src, input_shape, out, tid, thread_num); + (void)ReduceSum(outer_size, inner_size, axis_size, src, out, tid, thread_num); int output_size = 1; CompareOutputData(out, correct, output_size, err_tol); @@ -244,7 +244,7 @@ TEST_F(TestReduceFp32, Max) { int outer_size = 2; int inner_size = 12; int axis_size = 4; - (void)ReduceMax(outer_size, inner_size, axis_size, in, input_shape, out, tid, thread_num); + (void)ReduceMax(outer_size, inner_size, axis_size, in, out, tid, thread_num); int output_size = 24; CompareOutputData(out, correct, output_size, err_tol); @@ -268,7 +268,7 @@ TEST_F(TestReduceFp32, Min) { int outer_size = 2; int inner_size = 12; int axis_size = 4; - (void)ReduceMin(outer_size, inner_size, axis_size, in, input_shape, out, tid, thread_num); + (void)ReduceMin(outer_size, inner_size, axis_size, in, out, tid, thread_num); int output_size = 24; CompareOutputData(out, correct, output_size, err_tol); @@ -293,7 +293,7 @@ TEST_F(TestReduceFp32, Prod) { int outer_size = 2; int inner_size = 12; int axis_size = 4; - (void)ReduceProd(outer_size, inner_size, axis_size, in, input_shape, out, tid, thread_num); + (void)ReduceProd(outer_size, inner_size, axis_size, in, out, tid, thread_num); int output_size = 24; CompareOutputData(out, correct, output_size, err_tol); @@ -318,7 +318,7 @@ TEST_F(TestReduceFp32, SumSquare) { int outer_size = 2; int inner_size = 12; int axis_size = 4; - (void)ReduceSumSquare(outer_size, inner_size, axis_size, in, input_shape, out, tid, thread_num); + (void)ReduceSumSquare(outer_size, inner_size, axis_size, in, out, tid, thread_num); int output_size = 24; CompareOutputData(out, correct, output_size, err_tol);