!12267 [MS][LITE][GPU] Fix Adreno softmax fp16 accuracy error

From: @chenzupeng
Reviewed-by: 
Signed-off-by:
This commit is contained in:
mindspore-ci-bot 2021-02-10 17:01:55 +08:00 committed by Gitee
commit f9a2b2004f
5 changed files with 31 additions and 8 deletions

View File

@ -44,7 +44,7 @@ __kernel void SoftMaxAxis3_NHWC4(__read_only image2d_t input, __write_only image
float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + C4 - 1, X)));
result = exp(min(result - input_max_f4, 0)) / sum;
result = result * mask;
WRITE_IMAGE(output, (int2)(Y * C4 + C4 - 1, X), TO_FLT4(result));
WRITE_IMAGEOUT(output, (int2)(Y * C4 + C4 - 1, X), OUT_FLT4(result));
}
__kernel void SoftMaxAxis1_NHWC4(__read_only image2d_t input, __write_only image2d_t output, const float4 mask,
@ -65,7 +65,7 @@ __kernel void SoftMaxAxis1_NHWC4(__read_only image2d_t input, __write_only image
for (int d = 0; d < H; ++d) {
float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(X * C4 + Y, d)));
result = exp(result) / sum;
WRITE_IMAGE(output, (int2)(X * C4 + Y, d), TO_FLT4(result));
WRITE_IMAGEOUT(output, (int2)(X * C4 + Y, d), OUT_FLT4(result));
}
}
@ -87,7 +87,7 @@ __kernel void SoftMaxAxis2_NHWC4(__read_only image2d_t input, __write_only image
for (int d = 0; d < W; ++d) {
float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(d * C4 + Y, X)));
result = exp(result) / sum;
WRITE_IMAGE(output, (int2)(d * C4 + Y, X), TO_FLT4(result));
WRITE_IMAGEOUT(output, (int2)(d * C4 + Y, X), OUT_FLT4(result));
}
}
@ -125,6 +125,6 @@ __kernel void SoftMax1x1_NHWC4(__read_only image2d_t input, __write_only image2d
for (size_t i = tid; i < C4; i += 32) {
float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, 0)));
result = exp(result) * sum;
WRITE_IMAGE(output, (int2)(i, 0), TO_FLT4(result));
WRITE_IMAGEOUT(output, (int2)(i, 0), OUT_FLT4(result));
}
}

View File

@ -236,7 +236,7 @@ __kernel void to_format_NHWC4_to_NHWC_BUF_float(__read_only image2d_t src_data,
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
float4 data = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
float4 data = convert_float4(READ_IMAGEIN(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
int offset = (X * shape.z + Y) * shape.w + Z * 4;
__global float *dst_addr = (__global float *)dst_data;
dst_addr += offset;
@ -320,7 +320,7 @@ __kernel void to_format_NHWC4_to_NHWC_BUF_half(__read_only image2d_t src_data, _
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
half4 data = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
half4 data = convert_half4(READ_IMAGEIN(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
int offset = (X * shape.z + Y) * shape.w + Z * 4;
__global half *dst_addr = (__global half *)dst_data;
dst_addr += offset;

View File

@ -87,7 +87,13 @@ int SoftmaxOpenCLKernel::Prepare() {
#else
std::string program_name = "SoftMax";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
std::vector<std::string> ext_build_opt;
if (out_tensors_[0]->data_type() == kNumberTypeFloat32) {
ext_build_opt.push_back("-DOUT_FLT4=convert_float4 -DWRITE_IMAGEOUT=write_imagef");
} else {
ext_build_opt.push_back("-DOUT_FLT4=convert_half4 -DWRITE_IMAGEOUT=write_imageh");
}
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, ext_build_opt);
#endif
SetConstArgs();
SetGlobalLocal();

View File

@ -81,7 +81,13 @@ int ToFormatOpenCLKernel::Prepare() {
std::string program_name = "to_format";
std::string source = to_format_source;
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
std::vector<std::string> ext_build_opt;
if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
ext_build_opt.push_back("-DREAD_IMAGEIN=read_imagef");
} else {
ext_build_opt.push_back("-DREAD_IMAGEIN=read_imageh");
}
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, ext_build_opt);
#endif
auto output = GpuTensorInfo(out_tensors_.front());

View File

@ -251,6 +251,17 @@ int OpenCLSubGraph::UpdateTensorDataTypePass() {
for (auto iv : nodes_) {
MS_ASSERT(iv);
auto cur_outs = iv->out_tensors();
// If softmax is the last kernel (every consumer is a ToFormat node), skip the update below so that it outputs an fp32 tensor.
if (iv->Type() == schema::PrimitiveType_SoftMax) {
bool last_kernel = true;
for (auto k : iv->out_kernels()) {
if (k->Type() != schema::PrimitiveType_ToFormat) {
last_kernel = false;
break;
}
}
if (last_kernel) continue;
}
for (auto jv : cur_outs) {
if (out_set.count(jv) == 0) {
MS_ASSERT(jv);