forked from mindspore-Ecosystem/mindspore
!12267 [MS][LITE][GPU]adreno softmax fp16 acc error
From: @chenzupeng Reviewed-by: Signed-off-by:
This commit is contained in:
commit
f9a2b2004f
|
@ -44,7 +44,7 @@ __kernel void SoftMaxAxis3_NHWC4(__read_only image2d_t input, __write_only image
|
|||
float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + C4 - 1, X)));
|
||||
result = exp(min(result - input_max_f4, 0)) / sum;
|
||||
result = result * mask;
|
||||
WRITE_IMAGE(output, (int2)(Y * C4 + C4 - 1, X), TO_FLT4(result));
|
||||
WRITE_IMAGEOUT(output, (int2)(Y * C4 + C4 - 1, X), OUT_FLT4(result));
|
||||
}
|
||||
|
||||
__kernel void SoftMaxAxis1_NHWC4(__read_only image2d_t input, __write_only image2d_t output, const float4 mask,
|
||||
|
@ -65,7 +65,7 @@ __kernel void SoftMaxAxis1_NHWC4(__read_only image2d_t input, __write_only image
|
|||
for (int d = 0; d < H; ++d) {
|
||||
float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(X * C4 + Y, d)));
|
||||
result = exp(result) / sum;
|
||||
WRITE_IMAGE(output, (int2)(X * C4 + Y, d), TO_FLT4(result));
|
||||
WRITE_IMAGEOUT(output, (int2)(X * C4 + Y, d), OUT_FLT4(result));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -87,7 +87,7 @@ __kernel void SoftMaxAxis2_NHWC4(__read_only image2d_t input, __write_only image
|
|||
for (int d = 0; d < W; ++d) {
|
||||
float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(d * C4 + Y, X)));
|
||||
result = exp(result) / sum;
|
||||
WRITE_IMAGE(output, (int2)(d * C4 + Y, X), TO_FLT4(result));
|
||||
WRITE_IMAGEOUT(output, (int2)(d * C4 + Y, X), OUT_FLT4(result));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -125,6 +125,6 @@ __kernel void SoftMax1x1_NHWC4(__read_only image2d_t input, __write_only image2d
|
|||
for (size_t i = tid; i < C4; i += 32) {
|
||||
float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, 0)));
|
||||
result = exp(result) * sum;
|
||||
WRITE_IMAGE(output, (int2)(i, 0), TO_FLT4(result));
|
||||
WRITE_IMAGEOUT(output, (int2)(i, 0), OUT_FLT4(result));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -236,7 +236,7 @@ __kernel void to_format_NHWC4_to_NHWC_BUF_float(__read_only image2d_t src_data,
|
|||
if (X >= size.x || Y >= size.y || Z >= size.z) {
|
||||
return;
|
||||
}
|
||||
float4 data = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
|
||||
float4 data = convert_float4(READ_IMAGEIN(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
|
||||
int offset = (X * shape.z + Y) * shape.w + Z * 4;
|
||||
__global float *dst_addr = (__global float *)dst_data;
|
||||
dst_addr += offset;
|
||||
|
@ -320,7 +320,7 @@ __kernel void to_format_NHWC4_to_NHWC_BUF_half(__read_only image2d_t src_data, _
|
|||
if (X >= size.x || Y >= size.y || Z >= size.z) {
|
||||
return;
|
||||
}
|
||||
half4 data = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
|
||||
half4 data = convert_half4(READ_IMAGEIN(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
|
||||
int offset = (X * shape.z + Y) * shape.w + Z * 4;
|
||||
__global half *dst_addr = (__global half *)dst_data;
|
||||
dst_addr += offset;
|
||||
|
|
|
@ -87,7 +87,13 @@ int SoftmaxOpenCLKernel::Prepare() {
|
|||
#else
|
||||
std::string program_name = "SoftMax";
|
||||
ocl_runtime_->LoadSource(program_name, source);
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
|
||||
std::vector<std::string> ext_build_opt;
|
||||
if (out_tensors_[0]->data_type() == kNumberTypeFloat32) {
|
||||
ext_build_opt.push_back("-DOUT_FLT4=convert_float4 -DWRITE_IMAGEOUT=write_imagef");
|
||||
} else {
|
||||
ext_build_opt.push_back("-DOUT_FLT4=convert_half4 -DWRITE_IMAGEOUT=write_imageh");
|
||||
}
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, ext_build_opt);
|
||||
#endif
|
||||
SetConstArgs();
|
||||
SetGlobalLocal();
|
||||
|
|
|
@ -81,7 +81,13 @@ int ToFormatOpenCLKernel::Prepare() {
|
|||
std::string program_name = "to_format";
|
||||
std::string source = to_format_source;
|
||||
ocl_runtime_->LoadSource(program_name, source);
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
|
||||
std::vector<std::string> ext_build_opt;
|
||||
if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
|
||||
ext_build_opt.push_back("-DREAD_IMAGEIN=read_imagef");
|
||||
} else {
|
||||
ext_build_opt.push_back("-DREAD_IMAGEIN=read_imageh");
|
||||
}
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, ext_build_opt);
|
||||
#endif
|
||||
|
||||
auto output = GpuTensorInfo(out_tensors_.front());
|
||||
|
|
|
@ -251,6 +251,17 @@ int OpenCLSubGraph::UpdateTensorDataTypePass() {
|
|||
for (auto iv : nodes_) {
|
||||
MS_ASSERT(iv);
|
||||
auto cur_outs = iv->out_tensors();
|
||||
// if softmax is last kernel, output fp32 tensor
|
||||
if (iv->Type() == schema::PrimitiveType_SoftMax) {
|
||||
bool last_kernel = true;
|
||||
for (auto k : iv->out_kernels()) {
|
||||
if (k->Type() != schema::PrimitiveType_ToFormat) {
|
||||
last_kernel = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (last_kernel) continue;
|
||||
}
|
||||
for (auto jv : cur_outs) {
|
||||
if (out_set.count(jv) == 0) {
|
||||
MS_ASSERT(jv);
|
||||
|
|
Loading…
Reference in New Issue