forked from mindspore-Ecosystem/mindspore
!5635 [MS][LITE][GPU]fix bug in matmul and pooling
Merge pull request !5635 from chenzupeng/master-lite
This commit is contained in:
commit
8e442ce7ca
|
@ -1,4 +1,7 @@
|
|||
__kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape,
|
||||
#ifdef cl_khr_fp16
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
__kernel void AvgPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, const int4 input_shape,
|
||||
const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
|
||||
// axis to dst tensor coordinate
|
||||
int X = get_global_id(0);
|
||||
|
@ -10,10 +13,10 @@ __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output,
|
|||
return;
|
||||
}
|
||||
|
||||
float4 r = (float4)(0.0f);
|
||||
float window_size = 0.0f;
|
||||
int xs = X * stride.x + padding.x;
|
||||
int ys = Y * stride.y + padding.y;
|
||||
FLT4 r = (FLT4)(0.0f);
|
||||
FLT window_size = 0.0f;
|
||||
int xs = X * stride.x - padding.x;
|
||||
int ys = Y * stride.y - padding.y;
|
||||
|
||||
for (int kx = 0; kx < kernel_size.x; ++kx) {
|
||||
int x_c = xs + kx;
|
||||
|
@ -21,11 +24,11 @@ __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output,
|
|||
for (int ky = 0; ky < kernel_size.y; ++ky) {
|
||||
int y_c = ys + ky;
|
||||
bool outside = outside_x || y_c < 0 || y_c >= input_shape.y;
|
||||
r += !outside ? input[(input_shape.y * x_c + y_c) * output_shape.w + Z] : (float4)(0.0f);
|
||||
r += !outside ? input[(input_shape.y * x_c + y_c) * output_shape.w + Z] : (FLT4)(0.0f);
|
||||
window_size += !outside ? 1.0f : 0.0f;
|
||||
}
|
||||
}
|
||||
float4 result = convert_float4(r / window_size);
|
||||
FLT4 result = TO_FLT4(r / window_size);
|
||||
output[(output_shape.y * X + Y) * output_shape.w + Z] = result;
|
||||
}
|
||||
|
||||
|
@ -43,10 +46,10 @@ __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d
|
|||
return;
|
||||
}
|
||||
|
||||
float4 r = (float4)(0.0f);
|
||||
float window_size = 0.0f;
|
||||
int xs = X * stride.x + padding.x;
|
||||
int ys = Y * stride.y + padding.y;
|
||||
FLT4 r = (FLT4)(0.0f);
|
||||
FLT window_size = 0.0f;
|
||||
int xs = X * stride.x - padding.x;
|
||||
int ys = Y * stride.y - padding.y;
|
||||
|
||||
for (int ky = 0; ky < kernel_size.y; ++ky) {
|
||||
int y_c = ys + ky;
|
||||
|
@ -54,10 +57,10 @@ __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d
|
|||
for (int kx = 0; kx < kernel_size.x; ++kx) {
|
||||
int x_c = xs + kx;
|
||||
bool outside = outside_y || x_c < 0 || x_c >= input_shape.x;
|
||||
r += read_imagef(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c));
|
||||
r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (float4)(0.0f);
|
||||
window_size += !outside ? 1.0f : 0.0f;
|
||||
}
|
||||
}
|
||||
float4 result = convert_float4(r / window_size);
|
||||
write_imagef(output, (int2)(Y * output_shape.w + Z, X), result);
|
||||
FLT4 result = TO_FLT4(r / window_size);
|
||||
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), result);
|
||||
}
|
||||
|
|
|
@ -1,4 +1,7 @@
|
|||
__kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape,
|
||||
#ifdef cl_khr_fp16
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
__kernel void MaxPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, const int4 input_shape,
|
||||
const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
|
||||
// axis to dst tensor coordinate
|
||||
int X = get_global_id(0);
|
||||
|
@ -10,9 +13,9 @@ __kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output,
|
|||
return;
|
||||
}
|
||||
|
||||
float4 maximum = (float4)(-10000.0f);
|
||||
int xs = X * stride.x + padding.x;
|
||||
int ys = Y * stride.y + padding.y;
|
||||
FLT4 maximum = (FLT4)(-10000.0f);
|
||||
int xs = X * stride.x - padding.x;
|
||||
int ys = Y * stride.y - padding.y;
|
||||
|
||||
for (int kx = 0; kx < kernel_size.x; ++kx) {
|
||||
int x_c = xs + kx;
|
||||
|
@ -24,7 +27,7 @@ __kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output,
|
|||
if (y_c < 0 || y_c >= input_shape.y) {
|
||||
continue;
|
||||
}
|
||||
float4 src = input[(input_shape.y * x_c + y_c) * input_shape.w + Z];
|
||||
FLT4 src = input[(input_shape.y * x_c + y_c) * input_shape.w + Z];
|
||||
maximum = max(src, maximum);
|
||||
}
|
||||
}
|
||||
|
@ -45,18 +48,18 @@ __kernel void MaxPooling2d_IMG(__read_only image2d_t input, __write_only image2d
|
|||
return;
|
||||
}
|
||||
|
||||
float4 maximum = (float4)(-10000.0f);
|
||||
int xs = X * stride.x + padding.x;
|
||||
int ys = Y * stride.y + padding.y;
|
||||
FLT4 maximum = (FLT4)(-10000.0f);
|
||||
int xs = X * stride.x - padding.x;
|
||||
int ys = Y * stride.y - padding.y;
|
||||
for (int ky = 0; ky < kernel_size.y; ++ky) {
|
||||
int y_c = ys + ky;
|
||||
if (y_c < 0 || y_c >= input_shape.y) continue;
|
||||
for (int kx = 0; kx < kernel_size.x; ++kx) {
|
||||
int x_c = xs + kx;
|
||||
if (x_c < 0 || x_c >= input_shape.x) continue;
|
||||
float4 src = read_imagef(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c));
|
||||
FLT4 src = READ_IMAGE(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c));
|
||||
maximum = max(src, maximum);
|
||||
}
|
||||
}
|
||||
write_imagef(output, (int2)(Y * output_shape.w + Z, X), maximum);
|
||||
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), maximum);
|
||||
}
|
||||
|
|
|
@ -58,14 +58,13 @@ int MatMulOpenCLKernel::Init() {
|
|||
sizeCO = {co, UP_DIV(co, C4NUM)};
|
||||
PadWeight();
|
||||
in_ori_format_ = in_tensors_[0]->GetFormat();
|
||||
in_tensors_[0]->SetFormat(schema::Format_NHWC4);
|
||||
out_ori_format_ = out_tensors_[0]->GetFormat();
|
||||
out_tensors_[0]->SetFormat(schema::Format_NHWC4);
|
||||
if (out_tensors_[0]->shape().size() == 2) {
|
||||
out_ori_format_ = schema::Format_NC;
|
||||
out_tensors_[0]->SetFormat(schema::Format_NC4);
|
||||
in_ori_format_ = schema::Format_NC;
|
||||
in_tensors_[0]->SetFormat(schema::Format_NC4);
|
||||
} else {
|
||||
in_tensors_[0]->SetFormat(schema::Format_NHWC4);
|
||||
out_tensors_[0]->SetFormat(schema::Format_NHWC4);
|
||||
}
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
return RET_OK;
|
||||
|
|
|
@ -60,7 +60,7 @@ int PoolingOpenCLKernel::Init() {
|
|||
return RET_INVALID_OP_NAME;
|
||||
}
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
|
||||
enable_fp16_ = ocl_runtime->GetFp16Enable();
|
||||
#ifdef PROGRAM_WITH_IL
|
||||
kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
|
||||
#else
|
||||
|
@ -96,11 +96,10 @@ int PoolingOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)
|
|||
size_t im_dst_x, im_dst_y;
|
||||
im_dst_x = out_tensors_[0]->Width() * CO4;
|
||||
im_dst_y = out_tensors_[0]->Height();
|
||||
#ifdef ENABLE_FP16
|
||||
size_t img_dtype = CL_HALF_FLOAT;
|
||||
#else
|
||||
size_t img_dtype = CL_FLOAT;
|
||||
#endif
|
||||
if (enable_fp16_) {
|
||||
img_dtype = CL_HALF_FLOAT;
|
||||
}
|
||||
img_size->clear();
|
||||
std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
|
||||
*img_size = vec;
|
||||
|
@ -161,5 +160,6 @@ kernel::LiteKernel *OpenCLPooling2dKernelCreator(const std::vector<lite::tensor:
|
|||
}
|
||||
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Pooling, OpenCLPooling2dKernelCreator)
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Pooling, OpenCLPooling2dKernelCreator)
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -44,6 +44,7 @@ class PoolingOpenCLKernel : public OpenCLKernel {
|
|||
std::vector<size_t> InitGlobalSize() const;
|
||||
PoolingParameter *parameter_;
|
||||
cl::Kernel kernel_;
|
||||
bool enable_fp16_{false};
|
||||
};
|
||||
|
||||
} // namespace mindspore::kernel
|
||||
|
|
|
@ -50,6 +50,7 @@ class OpenCLKernel : public LiteKernel {
|
|||
}
|
||||
OpenCLMemType GetMemType() { return out_mem_type_; }
|
||||
void SetMemType(OpenCLMemType mem_type) { out_mem_type_ = mem_type; }
|
||||
void SetFormatType(schema::Format format_type) { op_format_ = format_type; }
|
||||
schema::Format GetInOriFormat() { return in_ori_format_; }
|
||||
schema::Format GetOutOriFormat() { return out_ori_format_; }
|
||||
|
||||
|
@ -57,6 +58,7 @@ class OpenCLKernel : public LiteKernel {
|
|||
OpenCLMemType out_mem_type_{OpenCLMemType::IMG};
|
||||
schema::Format in_ori_format_{schema::Format_NHWC};
|
||||
schema::Format out_ori_format_{schema::Format_NHWC4};
|
||||
schema::Format op_format_{schema::Format_NC4HW4};
|
||||
};
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
|
|
Loading…
Reference in New Issue