forked from OSSInnovation/mindspore
!3823 optimize opencl conv2d transpose
Merge pull request !3823 from chenzupeng/master-lite
This commit is contained in:
commit
d4f82d6c56
|
@ -1,52 +1,59 @@
|
|||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#define FLT half
|
||||
#define FLT4 half4
|
||||
#define FLT16 half16
|
||||
__kernel void conv2d_transpose2x2(__global FLT4 *inputx, __global FLT16 *weight, __global FLT4 *bias,
|
||||
__global FLT4 *output, int2 kernel_size, int2 stride, int2 padding, int4 src_size,
|
||||
int4 dst_size) {
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void conv2d_transpose2x2(__read_only image2d_t src_data, __global FLT16 *weight, __read_only image2d_t biases,
|
||||
__write_only image2d_t dst_data, int2 kernel_size, int2 stride, int2 padding,
|
||||
int4 src_size, int4 dst_size) {
|
||||
int h = get_global_id(0);
|
||||
int kh = h % 2;
|
||||
int src_h = h / 2;
|
||||
src_h = src_h * 2;
|
||||
int w = get_global_id(1);
|
||||
int kw = w % 2;
|
||||
int src_w = w / 2;
|
||||
src_w = src_w * 2;
|
||||
int co = get_global_id(2);
|
||||
if (h * 2 >= dst_size.x || w * 2 >= dst_size.y || co >= dst_size.z) return;
|
||||
FLT4 r0 = (FLT4)(0.f);
|
||||
FLT4 r1 = (FLT4)(0.f);
|
||||
FLT4 r2 = (FLT4)(0.f);
|
||||
FLT4 r3 = (FLT4)(0.f);
|
||||
int base_x = (h * src_size.y + w) * src_size.z;
|
||||
int base_w = co * src_size.z;
|
||||
int base_w = (co * 4 + kh + kw * 2) * src_size.z;
|
||||
for (int ci = 0; ci < src_size.z; ++ci) {
|
||||
FLT4 x = inputx[base_x + ci];
|
||||
FLT16 w0 = weight[(base_w + ci) * 4];
|
||||
FLT16 w1 = weight[(base_w + ci) * 4 + 1];
|
||||
FLT16 w2 = weight[(base_w + ci) * 4 + 2];
|
||||
FLT16 w3 = weight[(base_w + ci) * 4 + 3];
|
||||
r0 += x.x * w0.s0123;
|
||||
r0 += x.y * w0.s4567;
|
||||
r0 += x.z * w0.s89ab;
|
||||
r0 += x.w * w0.scdef;
|
||||
FLT4 x0 = read_imagef(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h));
|
||||
FLT4 x1 = read_imagef(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h + 1));
|
||||
FLT4 x2 = read_imagef(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h));
|
||||
FLT4 x3 = read_imagef(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h + 1));
|
||||
FLT16 weight_cache = weight[base_w++];
|
||||
r0 += x0.x * weight_cache.s0123;
|
||||
r0 += x0.y * weight_cache.s4567;
|
||||
r0 += x0.z * weight_cache.s89ab;
|
||||
r0 += x0.w * weight_cache.scdef;
|
||||
|
||||
r1 += x.x * w1.s0123;
|
||||
r1 += x.y * w1.s4567;
|
||||
r1 += x.z * w1.s89ab;
|
||||
r1 += x.w * w1.scdef;
|
||||
r1 += x1.x * weight_cache.s0123;
|
||||
r1 += x1.y * weight_cache.s4567;
|
||||
r1 += x1.z * weight_cache.s89ab;
|
||||
r1 += x1.w * weight_cache.scdef;
|
||||
|
||||
r2 += x.x * w2.s0123;
|
||||
r2 += x.y * w2.s4567;
|
||||
r2 += x.z * w2.s89ab;
|
||||
r2 += x.w * w2.scdef;
|
||||
r2 += x2.x * weight_cache.s0123;
|
||||
r2 += x2.y * weight_cache.s4567;
|
||||
r2 += x2.z * weight_cache.s89ab;
|
||||
r2 += x2.w * weight_cache.scdef;
|
||||
|
||||
r3 += x.x * w3.s0123;
|
||||
r3 += x.y * w3.s4567;
|
||||
r3 += x.z * w3.s89ab;
|
||||
r3 += x.w * w3.scdef;
|
||||
r3 += x3.x * weight_cache.s0123;
|
||||
r3 += x3.y * weight_cache.s4567;
|
||||
r3 += x3.z * weight_cache.s89ab;
|
||||
r3 += x3.w * weight_cache.scdef;
|
||||
}
|
||||
r0 += bias[co];
|
||||
r1 += bias[co];
|
||||
r2 += bias[co];
|
||||
r3 += bias[co];
|
||||
output[((2 * h + 0) * dst_size.y + 2 * w + 0) * dst_size.z + co] = r0;
|
||||
output[((2 * h + 0) * dst_size.y + 2 * w + 1) * dst_size.z + co] = r1;
|
||||
output[((2 * h + 1) * dst_size.y + 2 * w + 0) * dst_size.z + co] = r2;
|
||||
output[((2 * h + 1) * dst_size.y + 2 * w + 1) * dst_size.z + co] = r3;
|
||||
}
|
||||
FLT4 bias_val = read_imagef(biases, smp_zero, (int2)(co, 0));
|
||||
r0 += bias_val;
|
||||
r1 += bias_val;
|
||||
r2 += bias_val;
|
||||
r3 += bias_val;
|
||||
|
||||
write_imagef(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh), r0);
|
||||
write_imagef(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh + 2), r1);
|
||||
write_imagef(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh), r2);
|
||||
write_imagef(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh + 2), r3);
|
||||
}
|
||||
|
|
|
@ -1,51 +1,59 @@
|
|||
#define FLT float
|
||||
#define FLT4 float4
|
||||
#define FLT16 float16
|
||||
__kernel void conv2d_transpose2x2(__global FLT4 *inputx, __global FLT16 *weight, __global FLT4 *bias,
|
||||
__global FLT4 *output, int2 kernel_size, int2 stride, int2 padding, int4 src_size,
|
||||
int4 dst_size) {
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
__kernel void conv2d_transpose2x2(__read_only image2d_t src_data, __global FLT16 *weight, __read_only image2d_t biases,
|
||||
__write_only image2d_t dst_data, int2 kernel_size, int2 stride, int2 padding,
|
||||
int4 src_size, int4 dst_size) {
|
||||
int h = get_global_id(0);
|
||||
int kh = h % 2;
|
||||
int src_h = h / 2;
|
||||
src_h = src_h * 2;
|
||||
int w = get_global_id(1);
|
||||
int kw = w % 2;
|
||||
int src_w = w / 2;
|
||||
src_w = src_w * 2;
|
||||
int co = get_global_id(2);
|
||||
if (h * 2 >= dst_size.x || w * 2 >= dst_size.y || co >= dst_size.z) return;
|
||||
FLT4 r0 = (FLT4)(0.f);
|
||||
FLT4 r1 = (FLT4)(0.f);
|
||||
FLT4 r2 = (FLT4)(0.f);
|
||||
FLT4 r3 = (FLT4)(0.f);
|
||||
int base_x = (h * src_size.y + w) * src_size.z;
|
||||
int base_w = co * src_size.z;
|
||||
int base_w = (co * 4 + kh + kw * 2) * src_size.z;
|
||||
for (int ci = 0; ci < src_size.z; ++ci) {
|
||||
FLT4 x = inputx[base_x + ci];
|
||||
FLT16 w0 = weight[(base_w + ci) * 4];
|
||||
FLT16 w1 = weight[(base_w + ci) * 4 + 1];
|
||||
FLT16 w2 = weight[(base_w + ci) * 4 + 2];
|
||||
FLT16 w3 = weight[(base_w + ci) * 4 + 3];
|
||||
r0 += x.x * w0.s0123;
|
||||
r0 += x.y * w0.s4567;
|
||||
r0 += x.z * w0.s89ab;
|
||||
r0 += x.w * w0.scdef;
|
||||
FLT4 x0 = read_imagef(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h));
|
||||
FLT4 x1 = read_imagef(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h + 1));
|
||||
FLT4 x2 = read_imagef(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h));
|
||||
FLT4 x3 = read_imagef(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h + 1));
|
||||
FLT16 weight_cache = weight[base_w++];
|
||||
r0 += x0.x * weight_cache.s0123;
|
||||
r0 += x0.y * weight_cache.s4567;
|
||||
r0 += x0.z * weight_cache.s89ab;
|
||||
r0 += x0.w * weight_cache.scdef;
|
||||
|
||||
r1 += x.x * w1.s0123;
|
||||
r1 += x.y * w1.s4567;
|
||||
r1 += x.z * w1.s89ab;
|
||||
r1 += x.w * w1.scdef;
|
||||
r1 += x1.x * weight_cache.s0123;
|
||||
r1 += x1.y * weight_cache.s4567;
|
||||
r1 += x1.z * weight_cache.s89ab;
|
||||
r1 += x1.w * weight_cache.scdef;
|
||||
|
||||
r2 += x.x * w2.s0123;
|
||||
r2 += x.y * w2.s4567;
|
||||
r2 += x.z * w2.s89ab;
|
||||
r2 += x.w * w2.scdef;
|
||||
r2 += x2.x * weight_cache.s0123;
|
||||
r2 += x2.y * weight_cache.s4567;
|
||||
r2 += x2.z * weight_cache.s89ab;
|
||||
r2 += x2.w * weight_cache.scdef;
|
||||
|
||||
r3 += x.x * w3.s0123;
|
||||
r3 += x.y * w3.s4567;
|
||||
r3 += x.z * w3.s89ab;
|
||||
r3 += x.w * w3.scdef;
|
||||
r3 += x3.x * weight_cache.s0123;
|
||||
r3 += x3.y * weight_cache.s4567;
|
||||
r3 += x3.z * weight_cache.s89ab;
|
||||
r3 += x3.w * weight_cache.scdef;
|
||||
}
|
||||
r0 += bias[co];
|
||||
r1 += bias[co];
|
||||
r2 += bias[co];
|
||||
r3 += bias[co];
|
||||
output[((2 * h + 0) * dst_size.y + 2 * w + 0) * dst_size.z + co] = r0;
|
||||
output[((2 * h + 0) * dst_size.y + 2 * w + 1) * dst_size.z + co] = r1;
|
||||
output[((2 * h + 1) * dst_size.y + 2 * w + 0) * dst_size.z + co] = r2;
|
||||
output[((2 * h + 1) * dst_size.y + 2 * w + 1) * dst_size.z + co] = r3;
|
||||
}
|
||||
FLT4 bias_val = read_imagef(biases, smp_zero, (int2)(co, 0));
|
||||
r0 += bias_val;
|
||||
r1 += bias_val;
|
||||
r2 += bias_val;
|
||||
r3 += bias_val;
|
||||
|
||||
write_imagef(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh), r0);
|
||||
write_imagef(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh + 2), r1);
|
||||
write_imagef(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh), r2);
|
||||
write_imagef(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh + 2), r3);
|
||||
}
|
||||
|
|
|
@ -64,11 +64,8 @@ int Conv2dTransposeOpenCLKernel::Init() {
|
|||
auto allocator = ocl_runtime->GetAllocator();
|
||||
padWeight_ = reinterpret_cast<FLOAT_T *>(allocator->Malloc(div_ci * div_co * 16 * kh * kw * sizeof(FLOAT_T)));
|
||||
padWeight_ = reinterpret_cast<FLOAT_T *>(allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true));
|
||||
bias_ = reinterpret_cast<FLOAT_T *>(allocator->Malloc(div_co * 4 * sizeof(FLOAT_T)));
|
||||
bias_ = reinterpret_cast<FLOAT_T *>(allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true));
|
||||
PadWeight();
|
||||
allocator->UnmapBuffer(padWeight_);
|
||||
allocator->UnmapBuffer(bias_);
|
||||
outputs_[0]->SetFormat(schema::Format_NHWC4);
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
return 0;
|
||||
|
@ -77,7 +74,7 @@ int Conv2dTransposeOpenCLKernel::Init() {
|
|||
int Conv2dTransposeOpenCLKernel::ReSize() { return 0; }
|
||||
|
||||
void Conv2dTransposeOpenCLKernel::PadWeight() {
|
||||
// OHWI to OIHW4(I)4(O)
|
||||
// OHWI to OHWI4(I)4(O)
|
||||
ConvParameter *param = reinterpret_cast<ConvParameter *>(opParameter);
|
||||
int ci = param->input_channel_;
|
||||
int co = param->output_channel_;
|
||||
|
@ -86,13 +83,11 @@ void Conv2dTransposeOpenCLKernel::PadWeight() {
|
|||
int div_ci = UP_DIV(ci, 4);
|
||||
int div_co = UP_DIV(co, 4);
|
||||
auto origin_weight = reinterpret_cast<FLOAT_T *>(inputs_.at(kWeightIndex)->Data());
|
||||
auto origin_bias = reinterpret_cast<FLOAT_T *>(inputs_.at(kBiasIndex)->Data());
|
||||
bool has_bias = origin_bias != nullptr;
|
||||
int index = 0;
|
||||
for (int co_i = 0; co_i < div_co; co_i++) {
|
||||
for (int ci_i = 0; ci_i < div_ci; ci_i++) {
|
||||
for (int kw_i = 0; kw_i < kw; kw_i++) {
|
||||
for (int kh_i = 0; kh_i < kh; kh_i++) {
|
||||
for (int kw_i = 0; kw_i < kw; kw_i++) {
|
||||
for (int ci_i = 0; ci_i < div_ci; ci_i++) {
|
||||
for (int ci4_i = 0; ci4_i < 4; ci4_i++) {
|
||||
for (int co4_i = 0; co4_i < 4; co4_i++) {
|
||||
int co_offset = co_i * 4 + co4_i;
|
||||
|
@ -109,16 +104,6 @@ void Conv2dTransposeOpenCLKernel::PadWeight() {
|
|||
}
|
||||
}
|
||||
}
|
||||
for (int co_i = 0; co_i < div_co; co_i++) {
|
||||
for (int co4_i = 0; co4_i < 4; co4_i++) {
|
||||
int co_offset = co_i * 4 + co4_i;
|
||||
if (has_bias && co_offset < co) {
|
||||
bias_[co_offset] = origin_bias[co_offset];
|
||||
} else {
|
||||
bias_[co_offset] = 0.;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int Conv2dTransposeOpenCLKernel::Run() {
|
||||
|
@ -134,14 +119,31 @@ int Conv2dTransposeOpenCLKernel::Run() {
|
|||
int co = param->output_channel_;
|
||||
int kh = param->kernel_h_;
|
||||
int kw = param->kernel_w_;
|
||||
int pad = kh - 1 - param->pad_h_;
|
||||
int pad = param->pad_h_;
|
||||
int oh = outputs_[0]->shape()[1];
|
||||
int ow = outputs_[0]->shape()[2];
|
||||
int h = inputs_[0]->shape()[1];
|
||||
int w = inputs_[0]->shape()[2];
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
|
||||
cl::ImageFormat image_format;
|
||||
{
|
||||
image_format.image_channel_order = CL_RGBA;
|
||||
#ifdef ENABLE_FP16
|
||||
image_format.image_channel_data_type = CL_HALF_FLOAT;
|
||||
#else
|
||||
image_format.image_channel_data_type = CL_FLOAT;
|
||||
#endif
|
||||
}
|
||||
cl_int in_error_code, in_error_code_weight, in_error_code_bias, out_error_code;
|
||||
cl::Image2D img_x(*ocl_runtime->Context(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, image_format, w * ci / 4, h, 0,
|
||||
inputs_[0]->Data(), &in_error_code);
|
||||
cl::Image2D img_bias(*ocl_runtime->Context(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, image_format, co / 4, 1, 0,
|
||||
inputs_[2]->Data(), &in_error_code_bias);
|
||||
cl::Image2D out_mem(*ocl_runtime->Context(), CL_MEM_WRITE_ONLY, image_format, ow * co / 4, oh, 0, nullptr,
|
||||
&out_error_code);
|
||||
// local size should less than MAX_GROUP_SIZE
|
||||
std::vector<size_t> local = {4, 4, 32};
|
||||
std::vector<size_t> local = {16, 1, 16};
|
||||
std::vector<size_t> global = {UP_ROUND((size_t)oh / 2, local[0]), UP_ROUND((size_t)ow / 2, local[1]),
|
||||
UP_ROUND((size_t)co / 4, local[2])};
|
||||
|
||||
|
@ -150,16 +152,19 @@ int Conv2dTransposeOpenCLKernel::Run() {
|
|||
cl_int2 padding = {pad, pad};
|
||||
cl_int4 src_size = {h, w, UP_DIV(ci, 4), 1};
|
||||
cl_int4 dst_size = {oh, ow, UP_DIV(co, 4), 1};
|
||||
ocl_runtime->SetKernelArg(kernel_, 0, inputs_[0]->Data());
|
||||
ocl_runtime->SetKernelArg(kernel_, 0, img_x);
|
||||
ocl_runtime->SetKernelArg(kernel_, 1, padWeight_);
|
||||
ocl_runtime->SetKernelArg(kernel_, 2, bias_);
|
||||
ocl_runtime->SetKernelArg(kernel_, 3, outputs_[0]->Data());
|
||||
ocl_runtime->SetKernelArg(kernel_, 2, img_bias);
|
||||
ocl_runtime->SetKernelArg(kernel_, 3, out_mem);
|
||||
ocl_runtime->SetKernelArg(kernel_, 4, kernel_size);
|
||||
ocl_runtime->SetKernelArg(kernel_, 5, stride);
|
||||
ocl_runtime->SetKernelArg(kernel_, 6, padding);
|
||||
ocl_runtime->SetKernelArg(kernel_, 7, src_size);
|
||||
ocl_runtime->SetKernelArg(kernel_, 8, dst_size);
|
||||
ocl_runtime->RunKernel(kernel_, global, local, nullptr);
|
||||
auto origin = cl::array<cl::size_type, 3U>{0, 0, 0};
|
||||
auto region = cl::array<cl::size_type, 3U>{(size_t)(ow * co / 4), (size_t)(oh), 1};
|
||||
ocl_runtime->GetDefaultCommandQueue()->enqueueReadImage(out_mem, CL_TRUE, origin, region, 0, 0, outputs_[0]->Data());
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -180,4 +185,3 @@ kernel::LiteKernel *OpenCLConv2dTransposeKernelCreator(const std::vector<lite::t
|
|||
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_DeConv2D, OpenCLConv2dTransposeKernelCreator)
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
|
|
|
@ -94,8 +94,8 @@ void MatMulOpenCLKernel::PadWeight() {
|
|||
}
|
||||
}
|
||||
if (hasBias_) {
|
||||
memcpy(inputs_[2]->Data(), bias_, sizeof(FLOAT_T) * sizeCI.s[0]);
|
||||
for (int i = sizeCI.s[0]; i < sizeCI.s[1] * 4; i++) {
|
||||
memcpy(bias_, inputs_[2]->Data(), sizeof(FLOAT_T) * sizeCO.s[0]);
|
||||
for (int i = sizeCO.s[0]; i < sizeCO.s[1] * 4; i++) {
|
||||
bias_[i] = 0;
|
||||
}
|
||||
}
|
||||
|
@ -118,7 +118,7 @@ int MatMulOpenCLKernel::Run() {
|
|||
ocl_runtime->SetKernelArg(kernel_, 1, padWeight_);
|
||||
ocl_runtime->SetKernelArg(kernel_, 2, outputs_[0]->Data());
|
||||
if (hasBias_) {
|
||||
ocl_runtime->SetKernelArg(kernel_, 3, inputs_[2]->Data());
|
||||
ocl_runtime->SetKernelArg(kernel_, 3, bias_);
|
||||
} else {
|
||||
ocl_runtime->SetKernelArg(kernel_, 3, nullptr);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue