fix reduce ops: make buffer_sizes_ and data_buffers_ sizes consistent (allocate num_axes_ - 1 buffers)

This commit is contained in:
zhaozhenlong 2020-12-26 14:31:40 +08:00
parent ca0bbd3d69
commit 77ddb17b1a
4 changed files with 15 additions and 35 deletions

View File

@ -146,7 +146,8 @@ void ReduceBaseCPUKernel::CalculateInnerOuterSize() {
void ReduceBaseCPUKernel::CalculateTmpBufferSize() { void ReduceBaseCPUKernel::CalculateTmpBufferSize() {
buffer_sizes_.clear(); buffer_sizes_.clear();
auto input_shape = in_tensors_.at(0)->shape(); auto input_shape = in_tensors_.at(0)->shape();
for (auto i = 0; i < num_axes_; i++) { // calculate size of buffer to malloc for each reducing axis
for (auto i = 0; i < num_axes_ - 1; i++) {
int axis = axes_[i]; int axis = axes_[i];
size_t size = 1; size_t size = 1;
for (size_t j = 0; j < input_shape.size(); j++) { for (size_t j = 0; j < input_shape.size(); j++) {

View File

@ -82,14 +82,7 @@ int ReduceFp16CPUKernel::Run() {
} }
auto in_tensor = in_tensors_.at(0); auto in_tensor = in_tensors_.at(0);
if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) { fp16_src_data_ = reinterpret_cast<float16_t *>(in_tensor->MutableData());
auto input_data = reinterpret_cast<float *>(in_tensor->MutableData());
Float32ToFloat16(input_data, fp16_input_, in_tensor->ElementsNum());
} else {
fp16_input_ = reinterpret_cast<float16_t *>(in_tensor->MutableData());
}
fp16_src_data_ = fp16_input_;
for (size_t i = 0; i < data_buffers_.size(); ++i) { for (size_t i = 0; i < data_buffers_.size(); ++i) {
fp16_dst_data_ = data_buffers_.at(i); fp16_dst_data_ = data_buffers_.at(i);
outer_size_ = outer_sizes_.at(i); outer_size_ = outer_sizes_.at(i);
@ -105,11 +98,16 @@ int ReduceFp16CPUKernel::Run() {
} }
auto out_tensor = out_tensors_.at(0); auto out_tensor = out_tensors_.at(0);
if (out_tensor->data_type() == kNumberTypeFloat32 || out_tensor->data_type() == kNumberTypeFloat) { fp16_dst_data_ = reinterpret_cast<float16_t *>(out_tensor->data_c());
dst_data_ = reinterpret_cast<float *>(out_tensor->MutableData()); MS_ASSERT(fp16_dst_data_ != nullptr);
Float16ToFloat32(fp16_dst_data_, dst_data_, out_tensor->ElementsNum()); outer_size_ = outer_sizes_.back();
} else { inner_size_ = inner_sizes_.back();
memcpy(out_tensor->MutableData(), fp16_dst_data_, out_tensor->ElementsNum() * sizeof(float16_t)); axis_size_ = axis_sizes_.back();
auto error_code = ParallelLaunch(this->context_->thread_pool_, ReduceFp16Impl, this, context_->thread_num_);
if (error_code != RET_OK) {
FreeTmpBuffer();
MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]";
return RET_ERROR;
} }
FreeTmpBuffer(); FreeTmpBuffer();
@ -124,14 +122,6 @@ void ReduceFp16CPUKernel::FreeTmpBuffer() {
} }
} }
data_buffers_.clear(); data_buffers_.clear();
auto in_tensor = in_tensors_.at(0);
if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) {
if (fp16_input_ != nullptr) {
context_->allocator->Free(fp16_input_);
fp16_input_ = nullptr;
}
}
} }
int ReduceFp16CPUKernel::MallocTmpBuffer() { int ReduceFp16CPUKernel::MallocTmpBuffer() {
@ -144,16 +134,6 @@ int ReduceFp16CPUKernel::MallocTmpBuffer() {
} }
data_buffers_.emplace_back(buffer); data_buffers_.emplace_back(buffer);
} }
auto in_tensor = in_tensors_.front();
if (in_tensor->data_type() == kNumberTypeFloat32 || in_tensor->data_type() == kNumberTypeFloat) {
fp16_input_ =
reinterpret_cast<float16_t *>(context_->allocator->Malloc(in_tensor->ElementsNum() * sizeof(float16_t)));
if (fp16_input_ == nullptr) {
MS_LOG(ERROR) << "Malloc data failed";
return RET_ERROR;
}
}
return RET_OK; return RET_OK;
} }

View File

@ -44,8 +44,6 @@ class ReduceFp16CPUKernel : public ReduceBaseCPUKernel {
private: private:
Reducer reducer_ = nullptr; Reducer reducer_ = nullptr;
std::vector<float16_t *> data_buffers_; std::vector<float16_t *> data_buffers_;
float *dst_data_ = nullptr;
float16_t *fp16_input_ = nullptr;
const float16_t *fp16_src_data_ = nullptr; const float16_t *fp16_src_data_ = nullptr;
float16_t *fp16_dst_data_ = nullptr; float16_t *fp16_dst_data_ = nullptr;

View File

@ -315,6 +315,7 @@ int ReduceInt8CPUKernel::CalculateQuantArgs() {
int ReduceInt8CPUKernel::MallocTmpBuffer() { int ReduceInt8CPUKernel::MallocTmpBuffer() {
data_buffers_.clear(); data_buffers_.clear();
MS_ASSERT(static_cast<int>(buffer_sizes_.size()) == num_axes_ - 1); MS_ASSERT(static_cast<int>(buffer_sizes_.size()) == num_axes_ - 1);
// malloc num_axes_ - 1 buffers; the reduce over the last axis writes its result directly to out_tensor, so no extra buffer is needed for it.
for (auto buffer_size : buffer_sizes_) { for (auto buffer_size : buffer_sizes_) {
int32_t *buffer = reinterpret_cast<int32_t *>(context_->allocator->Malloc(buffer_size * sizeof(int32_t))); int32_t *buffer = reinterpret_cast<int32_t *>(context_->allocator->Malloc(buffer_size * sizeof(int32_t)));
if (buffer == nullptr) { if (buffer == nullptr) {
@ -488,7 +489,7 @@ int ReduceInt8CPUKernel::Run() {
begin_src_data_[i] = static_cast<int32_t>(input_data[i]); begin_src_data_[i] = static_cast<int32_t>(input_data[i]);
} }
src_data_ = begin_src_data_; src_data_ = begin_src_data_;
for (size_t i = 0; i < data_buffers_.size() - 1; ++i) { for (size_t i = 0; i < data_buffers_.size(); ++i) {
GetQuantArgs(i); GetQuantArgs(i);
dst_data_ = data_buffers_[i]; dst_data_ = data_buffers_[i];
outer_size_ = outer_sizes_[i]; outer_size_ = outer_sizes_[i];