!10591 [MS][LITE][DDevelop] solve some problems for stack and gather ops

From: @pengyongrong Reviewed-by: @ddwsky,@zhang_xue_tong Signed-off-by: @ddwsky
2020-12-26 17:46:23 +08:00 · 2020-12-26 17:46:23 +08:00 · b936b9641d
parent 5488fe3fda 55fc235e6c
commit b936b9641d
5 changed files with 20 additions and 19 deletions
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/stack.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/stack.cl
@@ -14,16 +14,19 @@ __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE |
 // input -1D
 __kernel void stack_2input_3axis_1inshape(__read_only image2d_t input0, __read_only image2d_t input1,
                                          __write_only image2d_t output, int4 input_shape, int4 output_shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  if (X >= output_shape.x * output_shape.y || Y >= output_shape.z) {
+  int X = get_global_id(0);  // N*H
+  int Y = get_global_id(1);  // W*C
+  if (X >= output_shape.x * output_shape.y || Y >= output_shape.z * output_shape.w) {
    return;
  }
-  int coordinate_x_out = output_shape.w;
-  FLT4 result1 = READ_IMAGE(input0, smp_none, (int2)(0, (X)));
-  FLT4 result2 = READ_IMAGE(input1, smp_none, (int2)(0, (X)));
-  FLT4 result = {result1.x, result2.x, 0, 0};
-  WRITE_IMAGE(output, (int2)(Y, (X)), result);
+  FLT4 result1 = READ_IMAGE(input0, smp_none, (int2)(X, 0));
+  FLT result1_temp[4] = {result1.x, result1.y, result1.z, result1.w};
+  FLT4 result2 = READ_IMAGE(input1, smp_none, (int2)(X, 0));
+  FLT result2_temp[4] = {result2.x, result2.y, result2.z, result2.w};
+  for (int i = 0; i < C4NUM; ++i) {
+    FLT4 result = {result1_temp[i], result2_temp[i], 0, 0};
+    WRITE_IMAGE(output, (int2)(Y, (X * C4NUM + i)), result);
+  }
 }

 // input -2D -axis = 1
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc
@@ -126,8 +126,6 @@ int GatherOpenCLKernel::Prepare() {
 int GatherOpenCLKernel::ConvertTensorToweight() {
  auto allocator = ocl_runtime_->GetAllocator();
  GpuTensorInfo img_info(in_tensors_[1]);
-  size_t dtype = sizeof(cl_int);
-  stride_w = img_info.RowPitch() / dtype;
  auto indices_tensor = in_tensors_.at(1);
  auto indices_num = indices_tensor->ElementsNum();
  indices_data_ = reinterpret_cast<int32_t *>(allocator->Malloc(sizeof(int32_t) * indices_num));
@@ -140,7 +138,7 @@ int GatherOpenCLKernel::ConvertTensorToweight() {
  auto data = indices_tensor->data_c();
  if (data_type == kNumberTypeInt32) {
    for (int i = 0; i < indices_num; i++) {
-      indices_data_[i] = reinterpret_cast<int32_t *>(data)[i * stride_w];
+      indices_data_[i] = reinterpret_cast<int32_t *>(data)[i];
    }
  } else {
    MS_LOG(ERROR) << "Gather Only supported The DataType Of Intensor1 is Int32  "
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h
@@ -49,7 +49,6 @@ class GatherOpenCLKernel : public OpenCLKernel {
  int axis_ = {0};
  bool intensor1_is_tensor{false};
  bool enable_fp16_{false};
-  cl_int stride_w{1};
 };
 }  // namespace mindspore::kernel
 #endif
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc
@@ -119,8 +119,8 @@ void StackOpenCLKernel::SetGlobalLocal() {
    OW_ = out_shape_.s[2];
    OC_ = out_shape_.s[3];
  } else if (in_tensors_[0]->shape().size() == 1) {
-    OH_ = out_shape_.s[0] * out_shape_.s[1];
-    OW_ = out_shape_.s[2];
+    OH_ = UP_DIV(out_shape_.s[0], C4NUM);
+    OW_ = out_shape_.s[3];
  } else {
    OH_ = out_shape_.s[0];
    OW_ = out_shape_.s[1];
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/stack_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/stack_tests.cc
@@ -33,11 +33,12 @@ OpParameter *CreateParameter(int axis) {
 TEST_F(TestOpenCL_Stack, input2_ndim1_axis1) {
  constexpr int INPUT_NUM = 2;
  int axis = 1;
-  std::vector<int> input_shapes[INPUT_NUM] = {{8}, {8}};
-  std::vector<int> output_shape = {8, 2};
-  float input_datas[INPUT_NUM][8] = {{0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.03, 0.37},
-                                     {0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.47}};
-  float output_data[] = {0.75, 0.5, 0.06, 0.6, 0.74, 0.74, 0.30, 0.23, 0.9, 0.46, 0.59, 0.69, 0.03, 0.13, 0.37, 0.47};
+  std::vector<int> input_shapes[INPUT_NUM] = {{10}, {10}};
+  std::vector<int> output_shape = {10, 2};
+  float input_datas[INPUT_NUM][10] = {{0.75, 0.06, 0.74, 0.30, 0.9, 0.59, 0.03, 0.37, 0.13, 0.47},
+                                      {0.5, 0.6, 0.74, 0.23, 0.46, 0.69, 0.13, 0.47, 0.59, 0.03}};
+  float output_data[] = {0.75, 0.5,  0.06, 0.6,  0.74, 0.74, 0.30, 0.23, 0.9,  0.46,
+                         0.59, 0.69, 0.03, 0.13, 0.37, 0.47, 0.13, 0.59, 0.47, 0.03};

  for (auto fp16_enable : {true}) {
    auto *param = CreateParameter(axis);