forked from mindspore-Ecosystem/mindspore
!7890 [MS][LITE][GPU]optimize arithmetic and pooling
Merge pull request !7890 from chenzupeng/master-lite
This commit is contained in:
commit
1bd0e8ba0b
|
@ -2,46 +2,14 @@
|
|||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
#define divide_no_check(a, b) (a / b)
|
||||
// Average pooling over a linear FLT4 buffer.
// One work-item per output element: global id (0,1,2) = (out row X, out col Y,
// channel slice Z). Shapes appear to be (H, W, -, C/4) — indexing compares
// axis 0 against .x, axis 1 against .y, slices against .w.
// Padding taps are excluded from the average (divide by in-bounds count).
__kernel void AvgPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, const int4 input_shape,
                               const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
  // axis to dst tensor coordinate
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);

  // boundary check: drop grid-tail work-items
  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
    return;
  }

  FLT4 r = (FLT4)(0.0f);
  FLT window_size = 0.0f;  // count of in-bounds taps actually summed
  int xs = X * stride.x - padding.x;  // window origin in input coordinates
  int ys = Y * stride.y - padding.y;

  for (int kx = 0; kx < kernel_size.x; ++kx) {
    int x_c = xs + kx;
    bool outside_x = x_c < 0 || x_c >= input_shape.x;
    for (int ky = 0; ky < kernel_size.y; ++ky) {
      int y_c = ys + ky;
      bool outside = outside_x || y_c < 0 || y_c >= input_shape.y;
      // Fix: index the *input* tensor with input_shape.w (was output_shape.w).
      // The two are equal for pooling, but indexing one tensor with the other
      // tensor's stride is fragile and inconsistent with MaxPooling2d_BUF.
      r += !outside ? input[(input_shape.y * x_c + y_c) * input_shape.w + Z] : (FLT4)(0.0f);
      window_size += !outside ? 1.0f : 0.0f;
    }
  }
  // divide_no_check (consistent with the IMG kernels): window_size >= 1 holds
  // for any launch whose output shape is consistent with stride/kernel/padding.
  FLT4 result = TO_FLT4(divide_no_check(r, window_size));
  output[(output_shape.y * X + Y) * output_shape.w + Z] = result;
}
|
||||
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
|
||||
__kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
|
||||
const int4 output_shape, const int2 stride, const int2 kernel_size,
|
||||
const int2 padding) {
|
||||
// axis to dst tensor coordinate
|
||||
int X = get_global_id(0);
|
||||
int X = get_global_id(2);
|
||||
int Y = get_global_id(1);
|
||||
int Z = get_global_id(2);
|
||||
int Z = get_global_id(0);
|
||||
|
||||
// boundary check
|
||||
if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
|
||||
|
@ -66,35 +34,3 @@ __kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only i
|
|||
FLT4 result = TO_FLT4(divide_no_check(r, window_size));
|
||||
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), result);
|
||||
}
|
||||
|
||||
// Average pooling, NC4HW4 image layout: image pixel (y, Z * H + x) holds
// channel slice Z of spatial position (x, y). One work-item per output
// element; padding taps are skipped and excluded from the mean.
__kernel void AvgPooling2d_NC4HW4_IMG(__read_only image2d_t input, __write_only image2d_t output,
                                      const int4 input_shape, const int4 output_shape, const int2 stride,
                                      const int2 kernel_size, const int2 padding) {
  // Destination coordinate of this work-item.
  const int out_x = get_global_id(0);
  const int out_y = get_global_id(1);
  const int slice = get_global_id(2);

  // Drop work-items launched past the output extent.
  if (out_x >= output_shape.x || out_y >= output_shape.y || slice >= output_shape.w) {
    return;
  }

  // Top-left corner of the pooling window in input coordinates.
  const int win_x0 = out_x * stride.x - padding.x;
  const int win_y0 = out_y * stride.y - padding.y;

  FLT4 acc = (FLT4)(0.0f);
  FLT valid_taps = 0.0f;  // number of in-bounds window positions summed
  for (int ky = 0; ky < kernel_size.y; ++ky) {
    const int in_y = win_y0 + ky;
    const bool row_oob = in_y < 0 || in_y >= input_shape.y;
    for (int kx = 0; kx < kernel_size.x; ++kx) {
      const int in_x = win_x0 + kx;
      if (row_oob || in_x < 0 || in_x >= input_shape.x) {
        continue;  // padding tap: contributes neither value nor count
      }
      acc += READ_IMAGE(input, smp_zero, (int2)(in_y, slice * input_shape.x + in_x));
      valid_taps += 1.0f;
    }
  }
  // divide_no_check: a consistent launch guarantees at least one valid tap.
  WRITE_IMAGE(output, (int2)(out_y, slice * output_shape.x + out_x), TO_FLT4(divide_no_check(acc, valid_taps)));
}
|
||||
|
|
|
@ -1,48 +1,14 @@
|
|||
#ifdef cl_khr_fp16
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
// Max pooling over a linear FLT4 buffer. One work-item per output element:
// global id (0,1,2) = (out row, out col, channel slice). Out-of-bounds window
// taps are skipped entirely.
__kernel void MaxPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, const int4 input_shape,
                               const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
  // Destination coordinate handled by this work-item.
  const int dst_x = get_global_id(0);
  const int dst_y = get_global_id(1);
  const int dst_z = get_global_id(2);

  // Ignore the grid tail beyond the output extent.
  if (dst_x >= output_shape.x || dst_y >= output_shape.y || dst_z >= output_shape.w) {
    return;
  }

  // NOTE(review): the running max starts at -10000.0f rather than -FLT_MAX —
  // inputs below -10000 would be clipped. Presumably an fp16-safe sentinel
  // (FLT may be half under cl_khr_fp16); confirm before reusing elsewhere.
  FLT4 best = (FLT4)(-10000.0f);
  const int src_x0 = dst_x * stride.x - padding.x;  // window origin in input coords
  const int src_y0 = dst_y * stride.y - padding.y;

  for (int kx = 0; kx < kernel_size.x; ++kx) {
    const int src_x = src_x0 + kx;
    const bool col_oob = src_x < 0 || src_x >= input_shape.x;
    for (int ky = 0; ky < kernel_size.y; ++ky) {
      const int src_y = src_y0 + ky;
      if (col_oob || src_y < 0 || src_y >= input_shape.y) {
        continue;  // out-of-bounds tap: skip
      }
      best = max(input[(input_shape.y * src_x + src_y) * input_shape.w + dst_z], best);
    }
  }
  output[(output_shape.y * dst_x + dst_y) * output_shape.w + dst_z] = best;
}
|
||||
|
||||
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
|
||||
|
||||
__kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
|
||||
const int4 output_shape, const int2 stride, const int2 kernel_size,
|
||||
const int2 padding) {
|
||||
// axis to dst tensor coordinate
|
||||
int X = get_global_id(0);
|
||||
int X = get_global_id(2);
|
||||
int Y = get_global_id(1);
|
||||
int Z = get_global_id(2);
|
||||
int Z = get_global_id(0);
|
||||
|
||||
// boundary check
|
||||
if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
|
||||
|
@ -69,9 +35,9 @@ __kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_o
|
|||
const int4 input_shape, const int4 output_shape, const int2 stride,
|
||||
const int2 kernel_size, const int2 padding) {
|
||||
// axis to dst tensor coordinate
|
||||
int X = get_global_id(0);
|
||||
int X = get_global_id(2);
|
||||
int Y = get_global_id(1);
|
||||
int Z = get_global_id(2);
|
||||
int Z = get_global_id(0);
|
||||
|
||||
// boundary check
|
||||
if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
|
||||
|
@ -93,32 +59,3 @@ __kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_o
|
|||
}
|
||||
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), max(maximum, (FLT4)(0.f)));
|
||||
}
|
||||
|
||||
// Max pooling, NC4HW4 image layout: image pixel (y, Z * H + x) holds channel
// slice Z of spatial position (x, y). One work-item per output element;
// out-of-bounds window taps are skipped.
__kernel void MaxPooling2d_NC4HW4_IMG(__read_only image2d_t input, __write_only image2d_t output,
                                      const int4 input_shape, const int4 output_shape, const int2 stride,
                                      const int2 kernel_size, const int2 padding) {
  // Output coordinate handled by this work-item.
  const int out_x = get_global_id(0);
  const int out_y = get_global_id(1);
  const int slice = get_global_id(2);

  // Skip work-items launched past the output bounds.
  if (out_x >= output_shape.x || out_y >= output_shape.y || slice >= output_shape.w) {
    return;
  }

  // NOTE(review): -10000.0f acts as "minus infinity" — inputs below it would
  // be clipped. Presumably chosen as an fp16-safe sentinel; confirm before reuse.
  FLT4 best = (FLT4)(-10000.0f);
  const int win_x0 = out_x * stride.x - padding.x;
  const int win_y0 = out_y * stride.y - padding.y;
  for (int ky = 0; ky < kernel_size.y; ++ky) {
    const int in_y = win_y0 + ky;
    const bool row_oob = in_y < 0 || in_y >= input_shape.y;
    for (int kx = 0; kx < kernel_size.x; ++kx) {
      const int in_x = win_x0 + kx;
      if (row_oob || in_x < 0 || in_x >= input_shape.x) {
        continue;
      }
      best = max(READ_IMAGE(input, smp_none, (int2)(in_y, slice * input_shape.x + in_x)), best);
    }
  }
  WRITE_IMAGE(output, (int2)(out_y, slice * output_shape.x + out_x), best);
}
|
||||
|
|
|
@ -52,16 +52,23 @@ std::vector<size_t> ArithmeticOpenCLKernel::InitGlobalSize() const {
|
|||
}
|
||||
|
||||
void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() {
|
||||
local_size_ = {16, 16};
|
||||
auto out_shape = out_tensors_[0]->shape();
|
||||
if (out_shape.size() == 2) {
|
||||
size_t H = out_shape[0];
|
||||
size_t W = UP_DIV(out_shape[1], C4NUM);
|
||||
global_size_ = {W, H};
|
||||
if (element_flag_) {
|
||||
local_size_ = {16, 16};
|
||||
auto out_shape = out_tensors_[0]->shape();
|
||||
if (out_shape.size() == 2) {
|
||||
size_t H = out_shape[0];
|
||||
size_t W = UP_DIV(out_shape[1], C4NUM);
|
||||
global_size_ = {W, H};
|
||||
} else {
|
||||
size_t H = out_shape[0] * out_shape[1];
|
||||
size_t W = out_shape[2] * UP_DIV(out_shape[3], C4NUM);
|
||||
global_size_ = {W, H};
|
||||
}
|
||||
} else {
|
||||
size_t H = out_shape[0] * out_shape[1];
|
||||
size_t W = out_shape[2] * UP_DIV(out_shape[3], C4NUM);
|
||||
global_size_ = {W, H};
|
||||
local_size_ = {};
|
||||
auto out_shape = GetNHWCShape(out_tensors_[0]->shape());
|
||||
global_size_ = {static_cast<size_t>(UP_DIV(out_shape[3], C4NUM)), static_cast<size_t>(out_shape[2]),
|
||||
static_cast<size_t>(out_shape[1] * out_shape[0])};
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -129,6 +136,27 @@ int ArithmeticOpenCLKernel::InitBuffer() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
int ArithmeticOpenCLKernel::SetArgs() {
|
||||
int arg_idx = 3;
|
||||
if (!element_flag_) {
|
||||
cl_int4 input0_shape = {inputs_nhwc_shapes_[0][0], inputs_nhwc_shapes_[0][1], inputs_nhwc_shapes_[0][2],
|
||||
UP_DIV(inputs_nhwc_shapes_[0][3], C4NUM)};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input0_shape);
|
||||
cl_int4 input1_shape = {inputs_nhwc_shapes_[1][0], inputs_nhwc_shapes_[1][1], inputs_nhwc_shapes_[1][2],
|
||||
UP_DIV(inputs_nhwc_shapes_[1][3], C4NUM)};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input1_shape);
|
||||
auto out_shape = GetNHWCShape(out_tensors_[0]->shape());
|
||||
cl_int4 output_shape{out_shape[0], out_shape[1], out_shape[2], UP_DIV(out_shape[3], C4NUM)};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
|
||||
} else {
|
||||
cl_int2 output_shape{static_cast<int>(global_size_[0]), static_cast<int>(global_size_[1])};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
|
||||
}
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ArithmeticOpenCLKernel::Init() {
|
||||
std::string kernel_name;
|
||||
auto *arithmetic_parameter = reinterpret_cast<const ArithmeticParameter *>(op_parameter_);
|
||||
|
@ -237,6 +265,7 @@ int ArithmeticOpenCLKernel::Init() {
|
|||
|
||||
Image2dGetWorkGroupSize();
|
||||
InitBuffer();
|
||||
SetArgs();
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
return RET_OK;
|
||||
}
|
||||
|
@ -250,29 +279,7 @@ int ArithmeticOpenCLKernel::Run() {
|
|||
auto input_1_ptr = inputs_weight_ptrs_[1] == nullptr ? in_tensors_[1]->data_c() : inputs_weight_ptrs_[1];
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
|
||||
if (!element_flag_) {
|
||||
cl_int4 input0_shape = {inputs_nhwc_shapes_[0][0], inputs_nhwc_shapes_[0][1], inputs_nhwc_shapes_[0][2],
|
||||
UP_DIV(inputs_nhwc_shapes_[0][3], C4NUM)};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input0_shape);
|
||||
cl_int4 input1_shape = {inputs_nhwc_shapes_[1][0], inputs_nhwc_shapes_[1][1], inputs_nhwc_shapes_[1][2],
|
||||
UP_DIV(inputs_nhwc_shapes_[1][3], C4NUM)};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input1_shape);
|
||||
auto out_shape = GetNHWCShape(out_tensors_[0]->shape());
|
||||
cl_int4 output_shape{out_shape[0], out_shape[1], out_shape[2], UP_DIV(out_shape[3], C4NUM)};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_);
|
||||
ocl_runtime_->RunKernel(kernel_,
|
||||
{static_cast<size_t>(UP_DIV(out_shape[3], C4NUM)), static_cast<size_t>(out_shape[2]),
|
||||
static_cast<size_t>(out_shape[1] * out_shape[0])},
|
||||
{}, nullptr);
|
||||
} else {
|
||||
cl_int2 output_shape{static_cast<int>(global_size_[0]), static_cast<int>(global_size_[1])};
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_);
|
||||
ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
|
||||
}
|
||||
ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -33,6 +33,7 @@ class ArithmeticOpenCLKernel : public OpenCLKernel {
|
|||
int Init() override;
|
||||
int Run() override;
|
||||
int InitBuffer() override;
|
||||
int SetArgs();
|
||||
|
||||
private:
|
||||
std::vector<size_t> InitGlobalSize() const;
|
||||
|
|
|
@ -83,17 +83,20 @@ int PoolingOpenCLKernel::Init() {
|
|||
ocl_runtime_->LoadSource(program_name, source);
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
#endif
|
||||
InitGlobalSize();
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
|
||||
return mindspore::lite::RET_OK;
|
||||
}
|
||||
|
||||
std::vector<size_t> PoolingOpenCLKernel::InitGlobalSize() const {
|
||||
void PoolingOpenCLKernel::InitGlobalSize() {
|
||||
const size_t global_x = out_tensors_[0]->shape()[1];
|
||||
const size_t global_y = out_tensors_[0]->shape()[2];
|
||||
const size_t global_z = UP_DIV(out_tensors_[0]->shape()[3], C4NUM);
|
||||
std::vector<size_t> global = {global_x, global_y, global_z};
|
||||
return global;
|
||||
global_size_ = {global_z, global_y, global_x};
|
||||
int max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())());
|
||||
local_size_ = GetCommonLocalSize(global_size_, max_work_group_size);
|
||||
global_size_ = GetCommonGlobalSize(local_size_, global_size_);
|
||||
}
|
||||
|
||||
int PoolingOpenCLKernel::Run() {
|
||||
|
@ -116,13 +119,7 @@ int PoolingOpenCLKernel::Run() {
|
|||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, kernel_size);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, padding);
|
||||
|
||||
std::vector<size_t> local_size;
|
||||
std::vector<size_t> global_size = InitGlobalSize();
|
||||
int max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())());
|
||||
local_size = GetCommonLocalSize(global_size, max_work_group_size);
|
||||
global_size = GetCommonGlobalSize(local_size, global_size);
|
||||
|
||||
ocl_runtime_->RunKernel(kernel_, global_size, local_size, nullptr);
|
||||
ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
|
||||
return mindspore::lite::RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -35,10 +35,12 @@ class PoolingOpenCLKernel : public OpenCLKernel {
|
|||
int Run() override;
|
||||
|
||||
private:
|
||||
std::vector<size_t> InitGlobalSize() const;
|
||||
void InitGlobalSize();
|
||||
PoolingParameter *parameter_;
|
||||
cl::Kernel kernel_;
|
||||
bool enable_fp16_{false};
|
||||
std::vector<size_t> local_size_;
|
||||
std::vector<size_t> global_size_;
|
||||
};
|
||||
|
||||
} // namespace mindspore::kernel
|
||||
|
|
Loading…
Reference in New Issue