!12267 [MS][LITE][GPU]adreno softmax fp16 acc error

From: @chenzupeng Reviewed-by: Signed-off-by:
2021-02-10 17:01:55 +08:00 · 2021-02-10 17:01:55 +08:00 · f9a2b2004f
parent 260c77f4ad 3ceeadf5c2
commit f9a2b2004f
5 changed files with 31 additions and 8 deletions
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl
@ -44,7 +44,7 @@ __kernel void SoftMaxAxis3_NHWC4(__read_only image2d_t input, __write_only image
  float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + C4 - 1, X)));
  result = exp(min(result - input_max_f4, 0)) / sum;
  result = result * mask;
-  WRITE_IMAGE(output, (int2)(Y * C4 + C4 - 1, X), TO_FLT4(result));
+  WRITE_IMAGEOUT(output, (int2)(Y * C4 + C4 - 1, X), OUT_FLT4(result));
 }

 __kernel void SoftMaxAxis1_NHWC4(__read_only image2d_t input, __write_only image2d_t output, const float4 mask,
@ -65,7 +65,7 @@ __kernel void SoftMaxAxis1_NHWC4(__read_only image2d_t input, __write_only image
  for (int d = 0; d < H; ++d) {
    float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(X * C4 + Y, d)));
    result = exp(result) / sum;
-    WRITE_IMAGE(output, (int2)(X * C4 + Y, d), TO_FLT4(result));
+    WRITE_IMAGEOUT(output, (int2)(X * C4 + Y, d), OUT_FLT4(result));
  }
 }

@ -87,7 +87,7 @@ __kernel void SoftMaxAxis2_NHWC4(__read_only image2d_t input, __write_only image
  for (int d = 0; d < W; ++d) {
    float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(d * C4 + Y, X)));
    result = exp(result) / sum;
-    WRITE_IMAGE(output, (int2)(d * C4 + Y, X), TO_FLT4(result));
+    WRITE_IMAGEOUT(output, (int2)(d * C4 + Y, X), OUT_FLT4(result));
  }
 }

@ -125,6 +125,6 @@ __kernel void SoftMax1x1_NHWC4(__read_only image2d_t input, __write_only image2d
  for (size_t i = tid; i < C4; i += 32) {
    float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, 0)));
    result = exp(result) * sum;
-    WRITE_IMAGE(output, (int2)(i, 0), TO_FLT4(result));
+    WRITE_IMAGEOUT(output, (int2)(i, 0), OUT_FLT4(result));
  }
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
@ -236,7 +236,7 @@ __kernel void to_format_NHWC4_to_NHWC_BUF_float(__read_only image2d_t src_data,
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
-  float4 data = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
+  float4 data = convert_float4(READ_IMAGEIN(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
  int offset = (X * shape.z + Y) * shape.w + Z * 4;
  __global float *dst_addr = (__global float *)dst_data;
  dst_addr += offset;
@ -320,7 +320,7 @@ __kernel void to_format_NHWC4_to_NHWC_BUF_half(__read_only image2d_t src_data, _
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
-  half4 data = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
+  half4 data = convert_half4(READ_IMAGEIN(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
  int offset = (X * shape.z + Y) * shape.w + Z * 4;
  __global half *dst_addr = (__global half *)dst_data;
  dst_addr += offset;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
@ -87,7 +87,13 @@ int SoftmaxOpenCLKernel::Prepare() {
 #else
  std::string program_name = "SoftMax";
  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  std::vector<std::string> ext_build_opt;
+  if (out_tensors_[0]->data_type() == kNumberTypeFloat32) {
+    ext_build_opt.push_back("-DOUT_FLT4=convert_float4 -DWRITE_IMAGEOUT=write_imagef");
+  } else {
+    ext_build_opt.push_back("-DOUT_FLT4=convert_half4 -DWRITE_IMAGEOUT=write_imageh");
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, ext_build_opt);
 #endif
  SetConstArgs();
  SetGlobalLocal();
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
@ -81,7 +81,13 @@ int ToFormatOpenCLKernel::Prepare() {
  std::string program_name = "to_format";
  std::string source = to_format_source;
  ocl_runtime_->LoadSource(program_name, source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  std::vector<std::string> ext_build_opt;
+  if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
+    ext_build_opt.push_back("-DREAD_IMAGEIN=read_imagef");
+  } else {
+    ext_build_opt.push_back("-DREAD_IMAGEIN=read_imageh");
+  }
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, ext_build_opt);
 #endif

  auto output = GpuTensorInfo(out_tensors_.front());
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
@ -251,6 +251,17 @@ int OpenCLSubGraph::UpdateTensorDataTypePass() {
    for (auto iv : nodes_) {
      MS_ASSERT(iv);
      auto cur_outs = iv->out_tensors();
+      // if softmax is last kernel, output fp32 tensor
+      if (iv->Type() == schema::PrimitiveType_SoftMax) {
+        bool last_kernel = true;
+        for (auto k : iv->out_kernels()) {
+          if (k->Type() != schema::PrimitiveType_ToFormat) {
+            last_kernel = false;
+            break;
+          }
+        }
+        if (last_kernel) continue;
+      }
      for (auto jv : cur_outs) {
        if (out_set.count(jv) == 0) {
          MS_ASSERT(jv);