!5263 improve resize fp32 performance
Merge pull request !5263 from zhaozhenlong/lite/op/commit_optimize_resize
This commit is contained in:
commit
595d8c21a8
|
@ -17,17 +17,15 @@
|
|||
#include "nnacl/fp32/resize.h"
|
||||
#include "nnacl/common_func.h"
|
||||
#include "nnacl/errorcode.h"
|
||||
|
||||
int ResizeBilinear(const float *input_data, float *output_data, const int *input_shape, const int *output_shape,
|
||||
bool align_corners, int tid, int thread_num) {
|
||||
if (input_data == NULL || output_data == NULL || input_shape == NULL || output_shape == NULL) {
|
||||
int PrepareResizeBilinear(const int *input_shape, const int *output_shape, bool align_corners, int *y_bottoms,
|
||||
int *y_tops, int *x_lefts, int *x_rights, float *y_bottom_weights, float *x_left_weights) {
|
||||
if (input_shape == NULL || output_shape == NULL || y_bottoms == NULL || y_tops == NULL || x_lefts == NULL ||
|
||||
x_rights == NULL || y_bottom_weights == NULL || x_left_weights == NULL) {
|
||||
return NNACL_NULL_PTR;
|
||||
}
|
||||
|
||||
int in_n = input_shape[0];
|
||||
int in_h = input_shape[1];
|
||||
int in_w = input_shape[2];
|
||||
int in_c = input_shape[3];
|
||||
|
||||
int new_height = output_shape[1];
|
||||
int new_width = output_shape[2];
|
||||
|
@ -40,65 +38,119 @@ int ResizeBilinear(const float *input_data, float *output_data, const int *input
|
|||
width_scale = (float)(in_w - 1) / (new_width - 1);
|
||||
}
|
||||
|
||||
int n, h, w, c;
|
||||
for (n = 0; n < in_n; n++) {
|
||||
for (h = tid; h < new_height; h += thread_num) {
|
||||
float actual_y = (float)h * height_scale;
|
||||
int y_bottom = (int)(floor(actual_y));
|
||||
int y_top = y_bottom + 1 < in_h ? (y_bottom + 1) : (in_h - 1);
|
||||
float y_top_weight = actual_y - (float)(y_bottom);
|
||||
const float y_bottom_weight = 1.0f - y_top_weight;
|
||||
for (w = 0; w < new_width; w++) {
|
||||
float actual_x = (float)(w)*width_scale;
|
||||
int x_left = (int)(floor(actual_x));
|
||||
int x_right = x_left + 1 < in_w ? (x_left + 1) : (in_w - 1);
|
||||
float x_right_weight = actual_x - (float)(x_left);
|
||||
const float x_left_weight = 1.0f - x_right_weight;
|
||||
c = 0;
|
||||
int h, w;
|
||||
for (h = 0; h < new_height; h++) {
|
||||
float actual_y = (float)h * height_scale;
|
||||
int y_bottom = (int)(floor(actual_y));
|
||||
int y_top = y_bottom + 1 < in_h ? (y_bottom + 1) : (in_h - 1);
|
||||
float y_top_weight = actual_y - (float)(y_bottom);
|
||||
const float y_bottom_weight = 1.0f - y_top_weight;
|
||||
|
||||
y_bottoms[h] = y_bottom;
|
||||
y_tops[h] = y_top;
|
||||
y_bottom_weights[h] = y_bottom_weight;
|
||||
}
|
||||
for (w = 0; w < new_width; w++) {
|
||||
float actual_x = (float)(w)*width_scale;
|
||||
int x_left = (int)(floor(actual_x));
|
||||
int x_right = x_left + 1 < in_w ? (x_left + 1) : (in_w - 1);
|
||||
float x_right_weight = actual_x - (float)(x_left);
|
||||
const float x_left_weight = 1.0f - x_right_weight;
|
||||
|
||||
x_lefts[w] = x_left;
|
||||
x_rights[w] = x_right;
|
||||
x_left_weights[w] = x_left_weight;
|
||||
}
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
int ResizeBilinear(const float *input_data, float *output_data, const int *input_shape, const int *output_shape,
|
||||
int *y_bottoms, int *y_tops, int *x_lefts, int *x_rights, float *y_bottom_weights,
|
||||
float *x_left_weights, int n_h_begin, int n_h_end) {
|
||||
if (input_data == NULL || output_data == NULL || input_shape == NULL || output_shape == NULL || y_bottoms == NULL ||
|
||||
y_tops == NULL || x_lefts == NULL || x_rights == NULL || y_bottom_weights == NULL || x_left_weights == NULL) {
|
||||
return NNACL_NULL_PTR;
|
||||
}
|
||||
|
||||
int in_w = input_shape[2];
|
||||
int in_c = input_shape[3];
|
||||
|
||||
int new_height = output_shape[1];
|
||||
int new_width = output_shape[2];
|
||||
|
||||
int n_h, n, h, w, c;
|
||||
n = n_h_begin / new_height;
|
||||
h = n_h_begin % new_height;
|
||||
int n_h_stride = new_width * in_c;
|
||||
int out_offset = n_h_begin * n_h_stride;
|
||||
for (n_h = n_h_begin; n_h < n_h_end; n_h++, h++) {
|
||||
if (h == new_height) {
|
||||
h = 0;
|
||||
n++;
|
||||
}
|
||||
int y_bottom = y_bottoms[h];
|
||||
int y_top = y_tops[h];
|
||||
float y_bottom_weight = y_bottom_weights[h];
|
||||
float y_top_weight = 1.0f - y_bottom_weight;
|
||||
|
||||
for (w = 0; w < new_width; w++) {
|
||||
int x_left = x_lefts[w];
|
||||
int x_right = x_rights[w];
|
||||
float x_left_weight = x_left_weights[w];
|
||||
float x_right_weight = 1.0f - x_left_weight;
|
||||
float top_left_weight = y_top_weight * x_left_weight;
|
||||
float top_right_weight = y_top_weight * x_right_weight;
|
||||
float bottom_left_weight = y_bottom_weight * x_left_weight;
|
||||
float bottom_right_weight = y_bottom_weight * x_right_weight;
|
||||
|
||||
c = 0;
|
||||
int in_bottom_left_offset = offset(input_shape, n, y_bottom, x_left, c);
|
||||
int in_bottom_right_offset = in_bottom_left_offset + (x_right - x_left) * in_c;
|
||||
int in_top_left_offset = in_bottom_left_offset + (y_top - y_bottom) * in_w * in_c;
|
||||
int in_top_right_offset = in_bottom_right_offset + (y_top - y_bottom) * in_w * in_c;
|
||||
|
||||
#ifdef ENABLE_NEON
|
||||
for (; c <= in_c - 4; c += 4) {
|
||||
float32x4_t bottom_left = vld1q_f32(input_data + offset(input_shape, n, y_bottom, x_left, c));
|
||||
float32x4_t bottom_right = vld1q_f32(input_data + offset(input_shape, n, y_bottom, x_right, c));
|
||||
float32x4_t top_left = vld1q_f32(input_data + offset(input_shape, n, y_top, x_left, c));
|
||||
float32x4_t top_right = vld1q_f32(input_data + offset(input_shape, n, y_top, x_right, c));
|
||||
float32x4_t top_left_w = vdupq_n_f32(top_left_weight);
|
||||
float32x4_t top_right_w = vdupq_n_f32(top_right_weight);
|
||||
float32x4_t bottom_left_w = vdupq_n_f32(bottom_left_weight);
|
||||
float32x4_t bottom_right_w = vdupq_n_f32(bottom_right_weight);
|
||||
|
||||
float32x4_t y_top_w = vdupq_n_f32(y_top_weight);
|
||||
float32x4_t y_bottom_w = vdupq_n_f32(y_bottom_weight);
|
||||
float32x4_t x_left_w = vdupq_n_f32(x_left_weight);
|
||||
float32x4_t x_right_w = vdupq_n_f32(x_right_weight);
|
||||
for (; c <= in_c - 4; c += 4) {
|
||||
float32x4_t bottom_left = vld1q_f32(input_data + in_bottom_left_offset + c);
|
||||
float32x4_t bottom_right = vld1q_f32(input_data + in_bottom_right_offset + c);
|
||||
float32x4_t top_left = vld1q_f32(input_data + in_top_left_offset + c);
|
||||
float32x4_t top_right = vld1q_f32(input_data + in_top_right_offset + c);
|
||||
|
||||
float32x4_t interp_value = vdupq_n_f32(0.0);
|
||||
float32x4_t tmp = vmulq_f32(bottom_left, y_bottom_w);
|
||||
tmp = vmulq_f32(tmp, x_left_w);
|
||||
interp_value = vaddq_f32(interp_value, tmp);
|
||||
float32x4_t interp_value = vdupq_n_f32(0.0);
|
||||
|
||||
tmp = vmulq_f32(bottom_right, y_bottom_w);
|
||||
tmp = vmulq_f32(tmp, x_right_w);
|
||||
interp_value = vaddq_f32(interp_value, tmp);
|
||||
float32x4_t tmp = vmulq_f32(bottom_left, bottom_left_w);
|
||||
interp_value = vaddq_f32(interp_value, tmp);
|
||||
|
||||
tmp = vmulq_f32(top_left, y_top_w);
|
||||
tmp = vmulq_f32(tmp, x_left_w);
|
||||
interp_value = vaddq_f32(interp_value, tmp);
|
||||
tmp = vmulq_f32(bottom_right, bottom_right_w);
|
||||
interp_value = vaddq_f32(interp_value, tmp);
|
||||
|
||||
tmp = vmulq_f32(top_right, y_top_w);
|
||||
tmp = vmulq_f32(tmp, x_right_w);
|
||||
interp_value = vaddq_f32(interp_value, tmp);
|
||||
vst1q_f32(output_data + offset(output_shape, n, h, w, c), interp_value);
|
||||
}
|
||||
tmp = vmulq_f32(top_left, top_left_w);
|
||||
interp_value = vaddq_f32(interp_value, tmp);
|
||||
|
||||
tmp = vmulq_f32(top_right, top_right_w);
|
||||
interp_value = vaddq_f32(interp_value, tmp);
|
||||
vst1q_f32(output_data + out_offset, interp_value);
|
||||
out_offset += 4;
|
||||
}
|
||||
#endif
|
||||
for (; c < in_c; c++) {
|
||||
float bottom_left = input_data[offset(input_shape, n, y_bottom, x_left, c)];
|
||||
float bottom_right = input_data[offset(input_shape, n, y_bottom, x_right, c)];
|
||||
float top_left = input_data[offset(input_shape, n, y_top, x_left, c)];
|
||||
float top_right = input_data[offset(input_shape, n, y_top, x_right, c)];
|
||||
float interp_value = bottom_left * y_bottom_weight * x_left_weight +
|
||||
bottom_right * y_bottom_weight * x_right_weight +
|
||||
top_left * y_top_weight * x_left_weight + top_right * y_top_weight * x_right_weight;
|
||||
output_data[offset(output_shape, n, h, w, c)] = interp_value;
|
||||
}
|
||||
for (; c < in_c; c++) {
|
||||
float bottom_left = input_data[in_bottom_left_offset + c];
|
||||
float bottom_right = input_data[in_bottom_right_offset + c];
|
||||
float top_left = input_data[in_top_left_offset + c];
|
||||
float top_right = input_data[in_top_right_offset + c];
|
||||
float interp_value = bottom_left * bottom_left_weight + bottom_right * bottom_right_weight +
|
||||
top_left * top_left_weight + top_right * top_right_weight;
|
||||
output_data[out_offset] = interp_value;
|
||||
out_offset++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -25,9 +25,12 @@
|
|||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
int ResizeBilinear(const float *input_data, float *output_data, const int *input_shape, const int *output_shape,
|
||||
bool align_corners, int tid, int thread_num);
|
||||
|
||||
int PrepareResizeBilinear(const int *input_shape, const int *output_shape, bool align_corners, int *y_bottoms,
|
||||
int *y_tops, int *x_lefts, int *x_rights, float *y_bottom_weights, float *x_left_weights);
|
||||
int ResizeBilinear(const float *input_data, float *output_data, const int *input_shape, const int *output_shape,
|
||||
int *y_bottoms, int *y_tops, int *x_lefts, int *x_rights, float *y_bottom_weights,
|
||||
float *x_left_weights, int n_h_begin, int n_h_end);
|
||||
int ResizeNearestNeighbor(const float *input_data, float *output_data, const int *input_shape, const int *output_shape,
|
||||
int tid, int thread_num);
|
||||
#ifdef __cplusplus
|
||||
|
|
|
@ -41,7 +41,8 @@ int ResizeBaseCPUKernel::CheckParameters() {
|
|||
return RET_NULL_PTR;
|
||||
}
|
||||
method_ = parameter->method_;
|
||||
if (method_ != schema::ResizeMethod_BILINEAR && method_ != schema::ResizeMethod_NEAREST_NEIGHBOR) {
|
||||
if (method_ != static_cast<int>(schema::ResizeMethod_BILINEAR) &&
|
||||
method_ != static_cast<int>(schema::ResizeMethod_NEAREST_NEIGHBOR)) {
|
||||
MS_LOG(ERROR) << "Resize method should be bilinear or nearest_neighbor, but got " << method_;
|
||||
return RET_INVALID_OP_ATTR;
|
||||
}
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include "src/runtime/kernel/arm/fp32/resize.h"
|
||||
#include "schema/model_generated.h"
|
||||
#include "nnacl/fp32/resize.h"
|
||||
|
@ -38,6 +39,91 @@ int ResizeCPUKernel::Init() {
|
|||
return ReSize();
|
||||
}
|
||||
|
||||
int ResizeCPUKernel::ReSize() {
|
||||
int ret = RET_OK;
|
||||
if (method_ == static_cast<int>(schema::ResizeMethod_BILINEAR)) {
|
||||
FreeTmpBuffer();
|
||||
ret = MallocTmpBuffer();
|
||||
if (ret != RET_OK) {
|
||||
FreeTmpBuffer();
|
||||
return ret;
|
||||
}
|
||||
|
||||
auto input = in_tensors_.at(0);
|
||||
auto input_shape = input->shape();
|
||||
ret = PrepareResizeBilinear(input_shape.data(), out_tensors_[0]->shape().data(), align_corners_, y_bottoms_,
|
||||
y_tops_, x_lefts_, x_rights_, y_bottom_weights_, x_left_weights_);
|
||||
if (ret != RET_OK) {
|
||||
FreeTmpBuffer();
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ResizeCPUKernel::MallocTmpBuffer() {
|
||||
int h = new_height_;
|
||||
int w = new_width_;
|
||||
y_bottoms_ = reinterpret_cast<int *>(malloc(sizeof(int) * h));
|
||||
if (y_bottoms_ == nullptr) {
|
||||
MS_LOG(ERROR) << "malloc data failed";
|
||||
return RET_NULL_PTR;
|
||||
}
|
||||
y_tops_ = reinterpret_cast<int *>(malloc(sizeof(int) * h));
|
||||
if (y_tops_ == nullptr) {
|
||||
MS_LOG(ERROR) << "malloc data failed";
|
||||
return RET_NULL_PTR;
|
||||
}
|
||||
y_bottom_weights_ = reinterpret_cast<float *>(malloc(sizeof(float) * h));
|
||||
if (y_bottom_weights_ == nullptr) {
|
||||
MS_LOG(ERROR) << "malloc data failed";
|
||||
return RET_NULL_PTR;
|
||||
}
|
||||
|
||||
x_lefts_ = reinterpret_cast<int *>(malloc(sizeof(int) * w));
|
||||
if (x_lefts_ == nullptr) {
|
||||
MS_LOG(ERROR) << "malloc data failed";
|
||||
return RET_NULL_PTR;
|
||||
}
|
||||
x_rights_ = reinterpret_cast<int *>(malloc(sizeof(int) * w));
|
||||
if (x_rights_ == nullptr) {
|
||||
MS_LOG(ERROR) << "malloc data failed";
|
||||
return RET_NULL_PTR;
|
||||
}
|
||||
x_left_weights_ = reinterpret_cast<float *>(malloc(sizeof(float) * w));
|
||||
if (x_left_weights_ == nullptr) {
|
||||
MS_LOG(ERROR) << "malloc data failed";
|
||||
return RET_NULL_PTR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
void ResizeCPUKernel::FreeTmpBuffer() {
|
||||
if (y_bottoms_ != nullptr) {
|
||||
free(y_bottoms_);
|
||||
y_bottoms_ = nullptr;
|
||||
}
|
||||
if (y_tops_ != nullptr) {
|
||||
free(y_tops_);
|
||||
y_tops_ = nullptr;
|
||||
}
|
||||
if (y_bottom_weights_ != nullptr) {
|
||||
free(y_bottom_weights_);
|
||||
y_bottom_weights_ = nullptr;
|
||||
}
|
||||
|
||||
if (x_lefts_ != nullptr) {
|
||||
free(x_lefts_);
|
||||
x_lefts_ = nullptr;
|
||||
}
|
||||
if (x_rights_ != nullptr) {
|
||||
free(x_rights_);
|
||||
x_rights_ = nullptr;
|
||||
}
|
||||
if (x_left_weights_ != nullptr) {
|
||||
free(x_left_weights_);
|
||||
x_left_weights_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
int ResizeImpl(void *cdata, int task_id) {
|
||||
auto resize = reinterpret_cast<ResizeCPUKernel *>(cdata);
|
||||
auto error_code = resize->RunImpl(task_id);
|
||||
|
@ -66,8 +152,16 @@ int ResizeCPUKernel::RunImpl(int task_id) {
|
|||
int ret = 0;
|
||||
switch (method_) {
|
||||
case static_cast<int>(schema::ResizeMethod_BILINEAR): {
|
||||
ret = ResizeBilinear(input_data, output_data, input_shape.data(), out_tensors_[0]->shape().data(), align_corners_,
|
||||
task_id, context_->thread_num_);
|
||||
int n_h_begin, n_h_end;
|
||||
int n = out_tensors_.at(0)->shape()[0];
|
||||
int h = new_height_;
|
||||
int unit = UP_DIV(n * h, context_->thread_num_);
|
||||
n_h_begin = unit * task_id;
|
||||
n_h_end = std::min(n_h_begin + unit, n * h);
|
||||
|
||||
ret = ResizeBilinear(input_data, output_data, input_shape.data(), out_tensors_[0]->shape().data(), y_bottoms_,
|
||||
y_tops_, x_lefts_, x_rights_, y_bottom_weights_, x_left_weights_, n_h_begin, n_h_end);
|
||||
|
||||
break;
|
||||
}
|
||||
case static_cast<int>(schema::ResizeMethod_NEAREST_NEIGHBOR): {
|
||||
|
@ -97,8 +191,10 @@ int ResizeCPUKernel::Run() {
|
|||
int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ResizeImpl, this, context_->thread_num_);
|
||||
if (error_code != RET_OK) {
|
||||
MS_LOG(ERROR) << "Resize run error, error_code[" << error_code << "]";
|
||||
FreeTmpBuffer();
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
return RET_OK;
|
||||
}
|
||||
} // namespace mindspore::kernel
|
||||
|
|
|
@ -31,12 +31,22 @@ class ResizeCPUKernel : public ResizeBaseCPUKernel {
|
|||
const mindspore::lite::PrimitiveC *primitive)
|
||||
: ResizeBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
|
||||
|
||||
~ResizeCPUKernel() = default;
|
||||
~ResizeCPUKernel() { FreeTmpBuffer(); }
|
||||
|
||||
int Init() override;
|
||||
int ReSize() override { return 0; };
|
||||
int ReSize() override;
|
||||
int Run() override;
|
||||
int RunImpl(int task_id);
|
||||
int MallocTmpBuffer();
|
||||
void FreeTmpBuffer();
|
||||
|
||||
private:
|
||||
int *y_tops_ = nullptr;
|
||||
int *y_bottoms_ = nullptr;
|
||||
int *x_lefts_ = nullptr;
|
||||
int *x_rights_ = nullptr;
|
||||
float *y_bottom_weights_ = nullptr;
|
||||
float *x_left_weights_ = nullptr;
|
||||
};
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
|
|
Loading…
Reference in New Issue