forked from mindspore-Ecosystem/mindspore
!5494 optimize resize bilinear using cached lines
Merge pull request !5494 from zhaozhenlong/lite/op/optimize_resize_cache_line
This commit is contained in:
commit
437ae441f5
|
@ -154,6 +154,129 @@ int ResizeBilinear(const float *input_data, float *output_data, const int *input
|
|||
return NNACL_OK;
|
||||
}
|
||||
|
||||
// Horizontal (x-axis) linear interpolation of one source row.
// For each output column w, blends the in_c channels at source columns
// x_lefts[w] and x_rights[w] using weight x_left_weights[w] for the left
// sample and (1 - x_left_weights[w]) for the right sample, writing
// new_width * in_c floats to linear_output. Always returns 0.
int InterpRow(const float *src_line, float *linear_output, int new_width, float *x_left_weights, int *x_lefts,
              int *x_rights, int in_c) {
  int w;
  for (w = 0; w < new_width; w++) {
    int c = 0;
    // The source offsets depend only on w; hoist them so neither the
    // vector nor the scalar channel loop recomputes the multiply.
    int left_w_offset = x_lefts[w] * in_c;
    int right_w_offset = x_rights[w] * in_c;
#ifdef ENABLE_NEON
    float32x4_t left_w = vdupq_n_f32(x_left_weights[w]);
    float32x4_t right_w = vdupq_n_f32(1.0f - x_left_weights[w]);

    // Vector path: 4 channels per iteration.
    for (; c <= in_c - 4; c += 4) {
      float32x4_t left = vld1q_f32(src_line + left_w_offset + c);
      float32x4_t right = vld1q_f32(src_line + right_w_offset + c);

      // Explicit intrinsics (not vector operators) for compiler portability;
      // arithmetic is identical: left*w + right*(1-w).
      float32x4_t interp_value = vaddq_f32(vmulq_f32(left, left_w), vmulq_f32(right, right_w));
      vst1q_f32(linear_output + w * in_c + c, interp_value);
    }
#endif
    // Scalar tail (and the whole loop when NEON is unavailable).
    for (; c < in_c; c++) {
      float left = src_line[left_w_offset + c];
      float right = src_line[right_w_offset + c];
      linear_output[w * in_c + c] = left * x_left_weights[w] + right * (1.0f - x_left_weights[w]);
    }
  }
  return 0;
}
|
||||
|
||||
// Vertical (y-axis) linear interpolation between two already
// x-interpolated rows. Each of the new_width * in_c output floats is
// bottom * y_bottom_weight + top * (1 - y_bottom_weight).
// Always returns 0.
int InterpCol(const float *bottom_line, const float *top_line, float *output, int new_width, float y_bottom_weight,
              int in_c) {
  int w;
#ifdef ENABLE_NEON
  // The weights are the same for every column: broadcast them once
  // instead of once per output column.
  float32x4_t bottom_w = vdupq_n_f32(y_bottom_weight);
  float32x4_t top_w = vdupq_n_f32(1.0f - y_bottom_weight);
#endif
  for (w = 0; w < new_width; w++) {
    int c = 0;
#ifdef ENABLE_NEON
    // Vector path: 4 channels per iteration.
    for (; c <= in_c - 4; c += 4) {
      float32x4_t bottom = vld1q_f32(bottom_line + w * in_c + c);
      float32x4_t top = vld1q_f32(top_line + w * in_c + c);
      // Explicit intrinsics (not vector operators) for compiler portability.
      float32x4_t interp_value = vaddq_f32(vmulq_f32(bottom, bottom_w), vmulq_f32(top, top_w));
      vst1q_f32(output + w * in_c + c, interp_value);
    }
#endif
    // Scalar tail (and the whole loop when NEON is unavailable).
    for (; c < in_c; c++) {
      float bottom = bottom_line[w * in_c + c];
      float top = top_line[w * in_c + c];
      output[w * in_c + c] = bottom * y_bottom_weight + top * (1.0f - y_bottom_weight);
    }
  }
  return 0;
}
|
||||
|
||||
int ResizeBilinear2(const float *input_data, float *output_data, const int *input_shape, const int *output_shape,
|
||||
int *y_bottoms, int *y_tops, int *x_lefts, int *x_rights, float *y_bottom_weights,
|
||||
float *x_left_weights, float *line0, float *line1, int n_h_begin, int n_h_end) {
|
||||
if (input_data == NULL || output_data == NULL || input_shape == NULL || output_shape == NULL || y_bottoms == NULL ||
|
||||
y_tops == NULL || x_lefts == NULL || x_rights == NULL || y_bottom_weights == NULL || x_left_weights == NULL) {
|
||||
return NNACL_NULL_PTR;
|
||||
}
|
||||
|
||||
int in_h = input_shape[1];
|
||||
int in_w = input_shape[2];
|
||||
int in_c = input_shape[3];
|
||||
|
||||
int new_height = output_shape[1];
|
||||
int new_width = output_shape[2];
|
||||
|
||||
int n_h;
|
||||
int n_h_stride = new_width * in_c;
|
||||
|
||||
bool cache_line_used[2] = {false, false};
|
||||
int cache_line_num[2] = {-1, -1};
|
||||
float *const cache_line_ptr[2] = {line0, line1};
|
||||
float *current_line_ptr[2] = {line0, line1};
|
||||
int current_line_num[2] = {-1, -1};
|
||||
|
||||
for (n_h = n_h_begin; n_h < n_h_end; n_h++) {
|
||||
int n, h;
|
||||
n = n_h / new_height;
|
||||
h = n_h % new_height;
|
||||
|
||||
current_line_num[0] = n * in_h + y_bottoms[h];
|
||||
current_line_num[1] = n * in_h + y_tops[h];
|
||||
int i;
|
||||
for (i = 0; i < 2; i++) {
|
||||
cache_line_used[i] = false;
|
||||
}
|
||||
// search if we cached
|
||||
int j, k;
|
||||
for (j = 0; j < 2; j++) {
|
||||
bool find = false;
|
||||
for (k = 0; k < 2; k++) {
|
||||
if (current_line_num[j] == cache_line_num[k]) {
|
||||
cache_line_used[k] = true;
|
||||
current_line_ptr[j] = cache_line_ptr[k];
|
||||
find = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!find) {
|
||||
const float *line = input_data + current_line_num[j] * in_w * in_c;
|
||||
for (k = 0; k < 2; k++) {
|
||||
if (!cache_line_used[k]) {
|
||||
cache_line_num[k] = current_line_num[j];
|
||||
cache_line_used[k] = true;
|
||||
current_line_ptr[j] = cache_line_ptr[k];
|
||||
InterpRow(line, current_line_ptr[j], new_width, x_left_weights, x_lefts, x_rights, in_c);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// do col interp
|
||||
InterpCol(current_line_ptr[0], current_line_ptr[1], output_data + n_h * n_h_stride, new_width, y_bottom_weights[h],
|
||||
in_c);
|
||||
}
|
||||
|
||||
return NNACL_OK;
|
||||
}
|
||||
|
||||
int ResizeNearestNeighbor(const float *input_data, float *output_data, const int *input_shape, const int *output_shape,
|
||||
int tid, int thread_num) {
|
||||
int batch, y, x, c;
|
||||
|
|
|
@ -28,9 +28,15 @@ extern "C" {
|
|||
|
||||
int PrepareResizeBilinear(const int *input_shape, const int *output_shape, bool align_corners, int *y_bottoms,
|
||||
int *y_tops, int *x_lefts, int *x_rights, float *y_bottom_weights, float *x_left_weights);
|
||||
|
||||
int ResizeBilinear(const float *input_data, float *output_data, const int *input_shape, const int *output_shape,
|
||||
int *y_bottoms, int *y_tops, int *x_lefts, int *x_rights, float *y_bottom_weights,
|
||||
float *x_left_weights, int n_h_begin, int n_h_end);
|
||||
|
||||
int ResizeBilinear2(const float *input_data, float *output_data, const int *input_shape, const int *output_shape,
|
||||
int *y_bottoms, int *y_tops, int *x_lefts, int *x_rights, float *y_bottom_weights,
|
||||
float *x_left_weights, float *line0, float *line1, int n_h_begin, int n_h_end);
|
||||
|
||||
int ResizeNearestNeighbor(const float *input_data, float *output_data, const int *input_shape, const int *output_shape,
|
||||
int tid, int thread_num);
|
||||
#ifdef __cplusplus
|
||||
|
|
|
@ -61,6 +61,7 @@ int ResizeCPUKernel::ReSize() {
|
|||
}
|
||||
|
||||
int ResizeCPUKernel::MallocTmpBuffer() {
|
||||
int c = in_tensors_.at(0)->Channel();
|
||||
int h = new_height_;
|
||||
int w = new_width_;
|
||||
y_bottoms_ = reinterpret_cast<int *>(malloc(sizeof(int) * h));
|
||||
|
@ -94,6 +95,12 @@ int ResizeCPUKernel::MallocTmpBuffer() {
|
|||
MS_LOG(ERROR) << "malloc data failed";
|
||||
return RET_NULL_PTR;
|
||||
}
|
||||
line_buffer_ = reinterpret_cast<float *>(malloc(sizeof(float) * w * c * 2 * context_->thread_num_));
|
||||
if (line_buffer_ == nullptr) {
|
||||
MS_LOG(ERROR) << "malloc data failed";
|
||||
return RET_NULL_PTR;
|
||||
}
|
||||
|
||||
return RET_OK;
|
||||
}
|
||||
void ResizeCPUKernel::FreeTmpBuffer() {
|
||||
|
@ -122,6 +129,10 @@ void ResizeCPUKernel::FreeTmpBuffer() {
|
|||
free(x_left_weights_);
|
||||
x_left_weights_ = nullptr;
|
||||
}
|
||||
if (line_buffer_ != nullptr) {
|
||||
free(line_buffer_);
|
||||
line_buffer_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
int ResizeImpl(void *cdata, int task_id) {
|
||||
|
@ -158,9 +169,12 @@ int ResizeCPUKernel::RunImpl(int task_id) {
|
|||
int unit = UP_DIV(n * h, context_->thread_num_);
|
||||
n_h_begin = unit * task_id;
|
||||
n_h_end = std::min(n_h_begin + unit, n * h);
|
||||
|
||||
ret = ResizeBilinear(input_data, output_data, input_shape.data(), out_tensors_[0]->shape().data(), y_bottoms_,
|
||||
y_tops_, x_lefts_, x_rights_, y_bottom_weights_, x_left_weights_, n_h_begin, n_h_end);
|
||||
int c = in_tensors_.at(0)->shape()[3];
|
||||
line0_ = line_buffer_ + new_width_ * c * 2 * task_id;
|
||||
line1_ = line0_ + new_width_ * c;
|
||||
ret = ResizeBilinear2(input_data, output_data, input_shape.data(), out_tensors_[0]->shape().data(), y_bottoms_,
|
||||
y_tops_, x_lefts_, x_rights_, y_bottom_weights_, x_left_weights_, line0_, line1_, n_h_begin,
|
||||
n_h_end);
|
||||
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -47,6 +47,9 @@ class ResizeCPUKernel : public ResizeBaseCPUKernel {
|
|||
int *x_rights_ = nullptr;
|
||||
float *y_bottom_weights_ = nullptr;
|
||||
float *x_left_weights_ = nullptr;
|
||||
float *line_buffer_ = nullptr;
|
||||
float *line0_ = nullptr;
|
||||
float *line1_ = nullptr;
|
||||
};
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
|
|
|
@ -19,6 +19,8 @@
|
|||
#include "common/common_test.h"
|
||||
#include "nnacl/resize_parameter.h"
|
||||
#include "mindspore/lite/src/kernel_registry.h"
|
||||
#include "mindspore/lite/schema/ops_generated.h"
|
||||
using mindspore::schema::Format_NHWC;
|
||||
|
||||
namespace mindspore {
|
||||
|
||||
|
@ -52,6 +54,7 @@ void TestResizeBilinearFp32::Prepare(const std::vector<int> &input_shape, const
|
|||
float *input_data, float *output_data, const bool align_corners,
|
||||
const int thread_num) {
|
||||
in_tensor_.set_data_type(kNumberTypeFloat32);
|
||||
in_tensor_.SetFormat(Format_NHWC);
|
||||
in_tensor_.set_shape(input_shape);
|
||||
out_tensor_.set_data_type(kNumberTypeFloat32);
|
||||
out_tensor_.set_shape(output_shape);
|
||||
|
@ -377,4 +380,30 @@ TEST_F(TestResizeBilinearFp32, ResizeBilinearTest15) {
|
|||
|
||||
CompareOutputData(output_data, expect.data(), output_size, err_tol);
|
||||
}
|
||||
|
||||
// 5*5 -> 2*2
// Downscales a 1x5x5x5 NHWC float tensor to 1x2x2x5 with
// align_corners=false on 2 threads, exercising the multi-threaded
// cached-line bilinear path against precomputed expected values.
TEST_F(TestResizeBilinearFp32, ResizeBilinearTest16) {
  // Input holds the 125 consecutive values 0..124 laid out as NHWC (1,5,5,5),
  // i.e. channel-fastest, so each pixel is 5 consecutive values.
  float input_data[] = {
      0.0,   1.0,   2.0,   3.0,   4.0,   5.0,   6.0,   7.0,   8.0,   9.0,   10.0,  11.0,  12.0,  13.0,  14.0,  15.0,
      16.0,  17.0,  18.0,  19.0,  20.0,  21.0,  22.0,  23.0,  24.0,  25.0,  26.0,  27.0,  28.0,  29.0,  30.0,  31.0,
      32.0,  33.0,  34.0,  35.0,  36.0,  37.0,  38.0,  39.0,  40.0,  41.0,  42.0,  43.0,  44.0,  45.0,  46.0,  47.0,
      48.0,  49.0,  50.0,  51.0,  52.0,  53.0,  54.0,  55.0,  56.0,  57.0,  58.0,  59.0,  60.0,  61.0,  62.0,  63.0,
      64.0,  65.0,  66.0,  67.0,  68.0,  69.0,  70.0,  71.0,  72.0,  73.0,  74.0,  75.0,  76.0,  77.0,  78.0,  79.0,
      80.0,  81.0,  82.0,  83.0,  84.0,  85.0,  86.0,  87.0,  88.0,  89.0,  90.0,  91.0,  92.0,  93.0,  94.0,  95.0,
      96.0,  97.0,  98.0,  99.0,  100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, 110.0, 111.0,
      112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, 120.0, 121.0, 122.0, 123.0, 124.0};
  // 2x2 output, 5 channels per pixel.
  float output_data[20] = {0};
  std::vector<int> input_shape = {1, 5, 5, 5};
  std::vector<int> output_shape = {1, 2, 2, 5};
  // Expected bilinear samples for the four output pixels (half-pixel-free
  // mapping, align_corners=false).
  std::vector<float> expect = {0.0,  1.0,  2.0,  3.0,  4.0,  12.5, 13.5, 14.5, 15.5, 16.5,
                               62.5, 63.5, 64.5, 65.5, 66.5, 75.0, 76.0, 77.0, 78.0, 79.0};
  bool align_corners = false;
  auto output_size = 20;

  // thread_num = 2: splits the output rows across two tasks so each
  // thread uses its own slice of the shared line buffer.
  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 2);
  auto ret = kernel_->Run();
  EXPECT_EQ(0, ret);

  CompareOutputData(output_data, expect.data(), output_size, err_tol);
}
|
||||
} // namespace mindspore
|
||||
|
|
Loading…
Reference in New Issue