From cbf7f21bdd97465e53fdddbd05a44fe58084d8db Mon Sep 17 00:00:00 2001 From: chenzomi Date: Mon, 3 Aug 2020 15:07:20 +0800 Subject: [PATCH] add oepncl arithmetic.cl --- .../kernel/opencl/cl/fp32/arithmetic.cl | 49 -------- .../opencl/cl/fp32/arithmetic_buffer.cl | 51 ++++++++ .../opencl/cl/fp32/arithmetic_image2d.cl | 15 +++ .../kernel/opencl/kernel/arithmetic.cc | 118 ++++++++++-------- .../runtime/kernel/opencl/kernel/arithmetic.h | 15 ++- 5 files changed, 142 insertions(+), 106 deletions(-) delete mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic.cl create mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl create mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic.cl deleted file mode 100644 index 255f8b3623f..00000000000 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic.cl +++ /dev/null @@ -1,49 +0,0 @@ -__kernel void ArithmeticAdd(__global float *input_a, - __global float *input_b, - __global float *output, - const unsigned int n) { - int id = get_global_id(0); - if (id < n) { - output[id] = input_a[id] + input_b[id]; - } -} -__kernel void ArithmeticSub(__global float *input_a, - __global float *input_b, - __global float *output, - const unsigned int n) { - int id = get_global_id(0); - if (id < n) { - output[id] = input_a[id] - input_b[id]; - } -} -__kernel void ArithmeticMul(__global float *input_a, - __global float *input_b, - __global float *output, - const unsigned int n) { - int id = get_global_id(0); - if (id < n) { - output[id] = input_a[id] * input_b[id]; - } -} -__kernel void ArithmeticDiv(__global float *input_a, - __global float *input_b, - __global float *output, - const unsigned int n) { - int id = get_global_id(0); - if (id < n) { - output[id] = input_a[id] * input_b[id]; - } -} - -__kernel void ArithmeticBiasAdd(__global float4 *input, - __global float4 *output, - const float weight, - const float bias, - const unsigned int n) { - int id = get_global_id(0); - float4 bias_vec = (float4)(bias, 0.0f, .0f, .0f); - float4 weight_vec = (float4)(weight, 0.0f, .0f, .0f); - if (id < n) { - output[id] = weight_vec * input[id] + bias_vec; - } -} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl new file mode 100644 index 00000000000..4e9a8422be6 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl @@ -0,0 +1,51 @@ +__kernel void ElementAdd(__global float *input_a, __global float *input_b, __global float *output, + const unsigned int n) { + int idx = get_global_id(0); + if (idx >= n) return; + output[idx] = input_a[idx] + input_b[idx]; +} + +__kernel void ElementSub(__global float *input_a, __global float *input_b, __global float *output, + const unsigned int n) { + int idx = get_global_id(0); + if (idx >= n) return; + output[idx] = input_a[idx] - input_b[idx]; +} + +__kernel void ElementMul(__global float *input_a, __global float *input_b, __global float *output, + const unsigned int n) { + int idx = get_global_id(0); + if (idx >= n) return; + output[idx] = input_a[idx] * input_b[idx]; +} + +__kernel void ElementDiv(__global float *input_a, __global float *input_b, __global float *output, + const unsigned int n) { + int idx = get_global_id(0); + if (idx >= n) return; + output[idx] = input_a[idx] * input_b[idx]; +} + +__kernel void BoardcastAdd(__global float *input_a, float input_b, __global float *output, const unsigned int n) { + int idx = get_global_id(0); + if (idx >= n) return; + output[idx] = input_a[idx] + input_b; +} + +__kernel void BoardcastSub(__global float *input_a, float input_b, __global float *output, const unsigned int n) { + int idx = get_global_id(0); + if (idx >= n) return; + output[idx] = input_a[idx] - input_b; +} + +__kernel void BoardcastMul(__global float *input_a, float input_b, __global float *output, const unsigned int n) { + int idx = get_global_id(0); + if (idx >= n) return; + output[idx] = input_a[idx] * input_b; +} + +__kernel void BoardcastDiv(__global float *input_a, float input_b, __global float *output, const unsigned int n) { + int idx = get_global_id(0); + if (idx >= n) return; + output[idx] = input_a[idx] * input_b; +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl new file mode 100644 index 00000000000..12f75438f96 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl @@ -0,0 +1,15 @@ +__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; + +__kernel void ElementAdd(__read_only image2d_t *input_a, __read_only image2d_t *input_b, __write_only image2d_t *output, + const int4 output_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) return; + + if (idx >= n) return; + float4 a = read_imagef(input_a, smp_none, (int2)(X, Y * output_shape.w + Z)); + float4 b = read_imagef(input_b, smp_none, (int2)(X, Y * output_shape.w + Z)); + src = a + b; + write_imagef(output, (int2)(0, 0), src); +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc index 58f594d539a..b6b9131db2e 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc @@ -1,5 +1,5 @@ /** - * Copyright 2019 Huawei Technologies Co., Ltd + * Copyright 2020 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,15 +13,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -#include "src/runtime/kernel/opencl/kernel/arithmetic.h" +#include #include #include -#include #include "schema/model_generated.h" #include "src/kernel_registry.h" +#include "src/runtime/kernel/opencl/utils.h" +#include "src/runtime/kernel/opencl/kernel/arithmetic.h" #ifndef PROGRAM_WITH_IL -#include "src/runtime/kernel/opencl/cl/fp32/arithmetic.cl.inc" +#include "src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl.inc" +#include "src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl.inc" #endif using mindspore::kernel::KERNEL_ARCH::kGPU; @@ -29,44 +30,53 @@ using mindspore::lite::KernelRegistrar; namespace mindspore::kernel { -int ArithmeticOpenCLKernel::Init() { - auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); - std::string kernel_name = "ArithmeticAdd"; +std::vector ArithmeticOpenCLKernel::InitGlobalSize() const { + const size_t global_x = outputs_[0]->Width(); + const size_t global_y = outputs_[0]->Height(); + const size_t global_z = UP_ROUND_DIV(outputs_[0]->Channel(), 4); + std::vector global = {global_x, global_y, global_z}; + return global; +} + +void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() { + global_size_ = InitGlobalSize(); + int max_work_group_size = runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*runtime_->Device())()); + local_size_ = GetLocalSize(global_size_, max_work_group_size); + global_size_ = GetGlobalSize(local_size_, global_size_); +} + +void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() { + uint32_t element_num = outputs_[0]->ElementsC4Num(); + global_size_ = {element_num}; +} + +int ArithmeticOpenCLKernel::Init() { + runtime_ = lite::opencl::OpenCLRuntime::GetInstance(); + std::string element_name; + std::string boardcast_name; - is_bias_add_ = false; if (inputs_[1]->TensorType() == schema::NodeType_ValueNode && inputs_[1]->Data() != nullptr) { - kernel_name = "ArithmeticBiasAdd"; - is_bias_add_ = true; + element_flag_ = false; + } else { + element_flag_ = true; } switch (opParameter->type_) { case PrimitiveType_Mul: - if (is_bias_add_) { - weight_ = static_cast(inputs_[1]->Data())[0]; - break; - } - kernel_name = "ArithmeticMul"; + element_name = "ElementMul"; + boardcast_name = "BoardcastMul"; break; case PrimitiveType_Add: - if (is_bias_add_) { - bias_ = static_cast(inputs_[1]->Data())[0]; - break; - } - kernel_name = "ArithmeticAdd"; + element_name = "ElementAdd"; + boardcast_name = "BoardcastAdd"; break; case PrimitiveType_Sub: - if (is_bias_add_) { - bias_ = -1 * static_cast(inputs_[1]->Data())[0]; - break; - } - kernel_name = "ArithmeticSub"; + element_name = "ElementSub"; + boardcast_name = "BoardcastSub"; break; case PrimitiveType_Div: - if (is_bias_add_) { - weight_ = 1 / static_cast(inputs_[1]->Data())[0]; - break; - } - kernel_name = "ArithmeticDiv"; + element_name = "ElementDiv"; + boardcast_name = "BoardcastDiv"; break; default: MS_LOG(ERROR) << "Error Operator type " << opParameter->type_; @@ -74,39 +84,44 @@ int ArithmeticOpenCLKernel::Init() { } #ifdef PROGRAM_WITH_IL - ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name); + runtime_->CreateKernelFromIL(kernel_(), kernel_name); #else std::string program_name = "Arithmetic"; std::set build_options; - std::string source = arithmetic_source_fp32; - ocl_runtime->LoadSource(program_name, source); - ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); + std::string source = arithmetic_buffer_source_fp32; + runtime_->LoadSource(program_name, source); + + if (element_flag_) { + runtime_->BuildKernel(kernel_, program_name, element_name, build_options); + MS_LOG(DEBUG) << element_name << " Init Done!"; + } else { + runtime_->BuildKernel(kernel_, program_name, boardcast_name, build_options); + MS_LOG(DEBUG) << boardcast_name << " Init Done!"; + } #endif outputs_[0]->SetFormat(schema::Format_NHWC4); - MS_LOG(DEBUG) << kernel_name << " Init Done!"; return 0; } int ArithmeticOpenCLKernel::Run() { MS_LOG(DEBUG) << this->Name() << " Running!"; - uint32_t element_num = outputs_[0]->ElementsC4Num(); - auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); - std::vector global = {element_num}; - std::vector local; + auto runtime_ = lite::opencl::OpenCLRuntime::GetInstance(); + BufferGetWorkGroupSize(); - ocl_runtime->SetKernelArg(kernel_, 0, inputs_[0]->Data()); - if (is_bias_add_) { - MS_LOG(DEBUG) << "weight: " << weight_ << " bias: " << bias_; - ocl_runtime->SetKernelArg(kernel_, 1, outputs_[0]->Data()); - ocl_runtime->SetKernelArg(kernel_, 2, weight_); - ocl_runtime->SetKernelArg(kernel_, 3, bias_); - ocl_runtime->SetKernelArg(kernel_, 4, element_num / C4NUM); + int arg_idx = 0; + uint32_t element_num = outputs_[0]->ElementsC4Num(); + + runtime_->SetKernelArg(kernel_, arg_idx++, inputs_[0]->Data()); + if (element_flag_) { + runtime_->SetKernelArg(kernel_, arg_idx++, inputs_[1]->Data()); } else { - ocl_runtime->SetKernelArg(kernel_, 1, inputs_[1]->Data()); - ocl_runtime->SetKernelArg(kernel_, 2, outputs_[0]->Data()); - ocl_runtime->SetKernelArg(kernel_, 3, element_num); + runtime_->SetKernelArg(kernel_, arg_idx++, static_cast(inputs_[1]->Data())[0]); } - return ocl_runtime->RunKernel(kernel_, global, local, nullptr); + runtime_->SetKernelArg(kernel_, arg_idx++, outputs_[0]->Data()); + runtime_->SetKernelArg(kernel_, arg_idx++, element_num); + + runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); + return 0; } kernel::LiteKernel *OpenCLArithmeticKernelCreator(const std::vector &inputs, @@ -132,4 +147,3 @@ REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Add, OpenCLArithmeticKernelCr REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sub, OpenCLArithmeticKernelCreator) REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Div, OpenCLArithmeticKernelCreator) } // namespace mindspore::kernel - diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h index df6c8438401..f95750406dc 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h @@ -28,18 +28,23 @@ class ArithmeticOpenCLKernel : public ArithmeticCPUKernel { explicit ArithmeticOpenCLKernel(OpParameter *parameter, const std::vector &inputs, const std::vector &outputs, const lite::Context *ctx) : ArithmeticCPUKernel(parameter, inputs, outputs, ctx) {} - ~ArithmeticOpenCLKernel() override {}; + ~ArithmeticOpenCLKernel() override{}; int Init() override; int Run() override; private: + std::vector InitGlobalSize() const; + void Image2dGetWorkGroupSize(); + void BufferGetWorkGroupSize(); + cl::Kernel kernel_; - bool is_bias_add_{false}; - float weight_{1.f}; - float bias_{.0f}; + lite::opencl::OpenCLRuntime *runtime_; + bool element_flag_{true}; + + std::vector local_size_; + std::vector global_size_; }; } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_BACKEND_OPENCL_ARITHMETIC_H_ -