!3871 add opencl arithmetic

Merge pull request !3871 from chenzhongming/master
mindspore-ci-bot 2020-08-04 15:49:01 +08:00 committed by Gitee
commit 00bf6808c5
5 changed files with 142 additions and 106 deletions

View File: src/runtime/kernel/opencl/cl/fp32/arithmetic.cl (deleted)

@@ -1,49 +0,0 @@
__kernel void ArithmeticAdd(__global float *input_a,
                            __global float *input_b,
                            __global float *output,
                            const unsigned int n) {
  int id = get_global_id(0);
  if (id < n) {
    output[id] = input_a[id] + input_b[id];
  }
}
__kernel void ArithmeticSub(__global float *input_a,
                            __global float *input_b,
                            __global float *output,
                            const unsigned int n) {
  int id = get_global_id(0);
  if (id < n) {
    output[id] = input_a[id] - input_b[id];
  }
}
__kernel void ArithmeticMul(__global float *input_a,
                            __global float *input_b,
                            __global float *output,
                            const unsigned int n) {
  int id = get_global_id(0);
  if (id < n) {
    output[id] = input_a[id] * input_b[id];
  }
}
__kernel void ArithmeticDiv(__global float *input_a,
                            __global float *input_b,
                            __global float *output,
                            const unsigned int n) {
  int id = get_global_id(0);
  if (id < n) {
    output[id] = input_a[id] / input_b[id];
  }
}
__kernel void ArithmeticBiasAdd(__global float4 *input,
                                __global float4 *output,
                                const float weight,
                                const float bias,
                                const unsigned int n) {
  int id = get_global_id(0);
  float4 bias_vec = (float4)(bias, 0.0f, 0.0f, 0.0f);
  float4 weight_vec = (float4)(weight, 0.0f, 0.0f, 0.0f);
  if (id < n) {
    output[id] = weight_vec * input[id] + bias_vec;
  }
}

View File: src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl (new)

@@ -0,0 +1,51 @@
__kernel void ElementAdd(__global float *input_a, __global float *input_b, __global float *output,
                         const unsigned int n) {
  int idx = get_global_id(0);
  if (idx >= n) return;
  output[idx] = input_a[idx] + input_b[idx];
}

__kernel void ElementSub(__global float *input_a, __global float *input_b, __global float *output,
                         const unsigned int n) {
  int idx = get_global_id(0);
  if (idx >= n) return;
  output[idx] = input_a[idx] - input_b[idx];
}

__kernel void ElementMul(__global float *input_a, __global float *input_b, __global float *output,
                         const unsigned int n) {
  int idx = get_global_id(0);
  if (idx >= n) return;
  output[idx] = input_a[idx] * input_b[idx];
}

__kernel void ElementDiv(__global float *input_a, __global float *input_b, __global float *output,
                         const unsigned int n) {
  int idx = get_global_id(0);
  if (idx >= n) return;
  output[idx] = input_a[idx] / input_b[idx];
}

__kernel void BoardcastAdd(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
  int idx = get_global_id(0);
  if (idx >= n) return;
  output[idx] = input_a[idx] + input_b;
}

__kernel void BoardcastSub(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
  int idx = get_global_id(0);
  if (idx >= n) return;
  output[idx] = input_a[idx] - input_b;
}

__kernel void BoardcastMul(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
  int idx = get_global_id(0);
  if (idx >= n) return;
  output[idx] = input_a[idx] * input_b;
}

__kernel void BoardcastDiv(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
  int idx = get_global_id(0);
  if (idx >= n) return;
  output[idx] = input_a[idx] / input_b;
}
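
The buffer kernels above are plain one-work-item-per-element passes, so they can be exercised outside MindSpore with a bare OpenCL host program. Below is a minimal sketch, assuming an OpenCL 2.0 SDK and using only the standard C API; the embedded source string, sizes, and the omitted error handling are illustrative and not part of this commit.

// Hypothetical standalone harness for the ElementAdd buffer kernel above.
// A minimal sketch, not MindSpore's OpenCL wrapper: it assumes an OpenCL 2.0
// platform and skips most error handling.
#define CL_TARGET_OPENCL_VERSION 200
#include <CL/cl.h>
#include <cstdio>
#include <vector>

static const char *kElementAddSrc = R"CLC(
__kernel void ElementAdd(__global float *input_a, __global float *input_b, __global float *output,
                         const unsigned int n) {
  int idx = get_global_id(0);
  if (idx >= n) return;
  output[idx] = input_a[idx] + input_b[idx];
}
)CLC";

int main() {
  const cl_uint n = 1024;
  std::vector<float> a(n, 1.0f), b(n, 2.0f), out(n, 0.0f);

  cl_platform_id platform = nullptr;
  cl_device_id device = nullptr;
  clGetPlatformIDs(1, &platform, nullptr);
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, nullptr);

  cl_int err = CL_SUCCESS;
  cl_context ctx = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &err);
  cl_command_queue queue = clCreateCommandQueueWithProperties(ctx, device, nullptr, &err);

  cl_program program = clCreateProgramWithSource(ctx, 1, &kElementAddSrc, nullptr, &err);
  clBuildProgram(program, 1, &device, "", nullptr, nullptr);
  cl_kernel kernel = clCreateKernel(program, "ElementAdd", &err);

  // Device buffers initialised from host data.
  cl_mem buf_a = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, n * sizeof(float), a.data(), &err);
  cl_mem buf_b = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, n * sizeof(float), b.data(), &err);
  cl_mem buf_out = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, n * sizeof(float), nullptr, &err);

  // Argument order mirrors the kernel signature: input_a, input_b, output, n.
  clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_a);
  clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_b);
  clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_out);
  clSetKernelArg(kernel, 3, sizeof(cl_uint), &n);

  // One work-item per element; the in-kernel bounds check tolerates rounding up.
  size_t global = n;
  clEnqueueNDRangeKernel(queue, kernel, 1, nullptr, &global, nullptr, 0, nullptr, nullptr);
  clEnqueueReadBuffer(queue, buf_out, CL_TRUE, 0, n * sizeof(float), out.data(), 0, nullptr, nullptr);
  printf("out[0] = %f\n", out[0]);  // expected: 3.000000

  clReleaseMemObject(buf_a);
  clReleaseMemObject(buf_b);
  clReleaseMemObject(buf_out);
  clReleaseKernel(kernel);
  clReleaseProgram(program);
  clReleaseCommandQueue(queue);
  clReleaseContext(ctx);
  return 0;
}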

View File: src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl (new)

@@ -0,0 +1,15 @@
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;

__kernel void ElementAdd(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
                         const int4 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) return;
  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y * output_shape.w + Z));
  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y * output_shape.w + Z));
  float4 src = a + b;
  write_imagef(output, (int2)(X, Y * output_shape.w + Z), src);
}
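
Both reads above index the image with (X, Y * output_shape.w + Z), which suggests the NHWC4 tensor is packed into an image output_shape.x texels wide and output_shape.y * output_shape.w texels tall, one float4 channel slice per texel. The small sketch below restates that assumed mapping on the host side; the struct and its names are illustrative, not MindSpore API.

// Illustrative mapping between the kernel's 3-D work-items and image texels:
// work-item (gx, gy, slice) touches texel (gx, gy * slices + slice), so the
// backing image is x_extent wide and y_extent * slices tall. The names and
// the layout interpretation are assumptions drawn from the kernel body.
struct Image2dIndexSketch {
  int x_extent;  // output_shape.x
  int y_extent;  // output_shape.y
  int slices;    // output_shape.w (C4 channel slices)

  int ImageWidth() const { return x_extent; }
  int ImageHeight() const { return y_extent * slices; }

  // Texel coordinate read/written by work-item (gx, gy, slice).
  void Texel(int gx, int gy, int slice, int *tx, int *ty) const {
    *tx = gx;
    *ty = gy * slices + slice;
  }
};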

View File: src/runtime/kernel/opencl/kernel/arithmetic.cc

@@ -1,5 +1,5 @@
/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -13,15 +13,16 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "src/runtime/kernel/opencl/kernel/arithmetic.h"
#include <set>
#include <vector>
#include <string>
#include <set>
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "src/runtime/kernel/opencl/kernel/arithmetic.h"
#ifndef PROGRAM_WITH_IL
#include "src/runtime/kernel/opencl/cl/fp32/arithmetic.cl.inc"
#include "src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl.inc"
#include "src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl.inc"
#endif
using mindspore::kernel::KERNEL_ARCH::kGPU;
@@ -29,44 +30,53 @@ using mindspore::lite::KernelRegistrar;
namespace mindspore::kernel {
int ArithmeticOpenCLKernel::Init() {
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  std::string kernel_name = "ArithmeticAdd";
std::vector<size_t> ArithmeticOpenCLKernel::InitGlobalSize() const {
  const size_t global_x = outputs_[0]->Width();
  const size_t global_y = outputs_[0]->Height();
  const size_t global_z = UP_ROUND_DIV(outputs_[0]->Channel(), 4);
  std::vector<size_t> global = {global_x, global_y, global_z};
  return global;
}
void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() {
  global_size_ = InitGlobalSize();
  int max_work_group_size = runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*runtime_->Device())());
  local_size_ = GetLocalSize(global_size_, max_work_group_size);
  global_size_ = GetGlobalSize(local_size_, global_size_);
}
void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() {
  uint32_t element_num = outputs_[0]->ElementsC4Num();
  global_size_ = {element_num};
}
int ArithmeticOpenCLKernel::Init() {
  runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
  std::string element_name;
  std::string boardcast_name;
  is_bias_add_ = false;
  if (inputs_[1]->TensorType() == schema::NodeType_ValueNode && inputs_[1]->Data() != nullptr) {
    kernel_name = "ArithmeticBiasAdd";
    is_bias_add_ = true;
    element_flag_ = false;
  } else {
    element_flag_ = true;
  }
  switch (opParameter->type_) {
    case PrimitiveType_Mul:
      if (is_bias_add_) {
        weight_ = static_cast<float *>(inputs_[1]->Data())[0];
        break;
      }
      kernel_name = "ArithmeticMul";
      element_name = "ElementMul";
      boardcast_name = "BoardcastMul";
      break;
    case PrimitiveType_Add:
      if (is_bias_add_) {
        bias_ = static_cast<float *>(inputs_[1]->Data())[0];
        break;
      }
      kernel_name = "ArithmeticAdd";
      element_name = "ElementAdd";
      boardcast_name = "BoardcastAdd";
      break;
    case PrimitiveType_Sub:
      if (is_bias_add_) {
        bias_ = -1 * static_cast<float *>(inputs_[1]->Data())[0];
        break;
      }
      kernel_name = "ArithmeticSub";
      element_name = "ElementSub";
      boardcast_name = "BoardcastSub";
      break;
    case PrimitiveType_Div:
      if (is_bias_add_) {
        weight_ = 1 / static_cast<float *>(inputs_[1]->Data())[0];
        break;
      }
      kernel_name = "ArithmeticDiv";
      element_name = "ElementDiv";
      boardcast_name = "BoardcastDiv";
      break;
    default:
      MS_LOG(ERROR) << "Error Operator type " << opParameter->type_;
@@ -74,39 +84,44 @@ int ArithmeticOpenCLKernel::Init() {
  }
#ifdef PROGRAM_WITH_IL
  ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
  runtime_->CreateKernelFromIL(kernel_(), kernel_name);
#else
  std::string program_name = "Arithmetic";
  std::set<std::string> build_options;
  std::string source = arithmetic_source_fp32;
  ocl_runtime->LoadSource(program_name, source);
  ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
  std::string source = arithmetic_buffer_source_fp32;
  runtime_->LoadSource(program_name, source);
  if (element_flag_) {
    runtime_->BuildKernel(kernel_, program_name, element_name, build_options);
    MS_LOG(DEBUG) << element_name << " Init Done!";
  } else {
    runtime_->BuildKernel(kernel_, program_name, boardcast_name, build_options);
    MS_LOG(DEBUG) << boardcast_name << " Init Done!";
  }
#endif
  outputs_[0]->SetFormat(schema::Format_NHWC4);
  MS_LOG(DEBUG) << kernel_name << " Init Done!";
  return 0;
}
int ArithmeticOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->Name() << " Running!";
  uint32_t element_num = outputs_[0]->ElementsC4Num();
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  std::vector<size_t> global = {element_num};
  std::vector<size_t> local;
  auto runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
  BufferGetWorkGroupSize();
  ocl_runtime->SetKernelArg(kernel_, 0, inputs_[0]->Data());
  if (is_bias_add_) {
    MS_LOG(DEBUG) << "weight: " << weight_ << " bias: " << bias_;
    ocl_runtime->SetKernelArg(kernel_, 1, outputs_[0]->Data());
    ocl_runtime->SetKernelArg(kernel_, 2, weight_);
    ocl_runtime->SetKernelArg(kernel_, 3, bias_);
    ocl_runtime->SetKernelArg(kernel_, 4, element_num / C4NUM);
  int arg_idx = 0;
  uint32_t element_num = outputs_[0]->ElementsC4Num();
  runtime_->SetKernelArg(kernel_, arg_idx++, inputs_[0]->Data());
  if (element_flag_) {
    runtime_->SetKernelArg(kernel_, arg_idx++, inputs_[1]->Data());
  } else {
    ocl_runtime->SetKernelArg(kernel_, 1, inputs_[1]->Data());
    ocl_runtime->SetKernelArg(kernel_, 2, outputs_[0]->Data());
    ocl_runtime->SetKernelArg(kernel_, 3, element_num);
    runtime_->SetKernelArg(kernel_, arg_idx++, static_cast<float *>(inputs_[1]->Data())[0]);
  }
  return ocl_runtime->RunKernel(kernel_, global, local, nullptr);
  runtime_->SetKernelArg(kernel_, arg_idx++, outputs_[0]->Data());
  runtime_->SetKernelArg(kernel_, arg_idx++, element_num);
  runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
  return 0;
}
kernel::LiteKernel *OpenCLArithmeticKernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
@@ -132,4 +147,3 @@ REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Add, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sub, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Div, OpenCLArithmeticKernelCreator)
} // namespace mindspore::kernel
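
Image2dGetWorkGroupSize depends on GetLocalSize and GetGlobalSize helpers that are not shown in this diff. A common pairing is to pick a local size within the device's maximum work-group size and round each global dimension up to a multiple of it, letting the kernels' bounds checks absorb the padding. The stand-ins below sketch that pattern under the assumption of a 3-D range; they are not the MindSpore implementations.

#include <cstddef>
#include <vector>

// Sketch of a GetLocalSize-style helper: grow a cubic local size while the
// total stays within the device's max work-group size (assumes a 3-D range).
std::vector<size_t> GetLocalSizeSketch(const std::vector<size_t> &global, size_t max_work_group_size) {
  size_t dim = 1;
  while ((dim * 2) * (dim * 2) * (dim * 2) <= max_work_group_size) dim *= 2;
  return std::vector<size_t>(global.size(), dim);
}

// Sketch of a GetGlobalSize-style helper: round every global dimension up to
// a multiple of the chosen local dimension (UP_ROUND-style arithmetic).
std::vector<size_t> GetGlobalSizeSketch(const std::vector<size_t> &local, const std::vector<size_t> &global) {
  std::vector<size_t> aligned(global.size());
  for (size_t i = 0; i < global.size(); ++i) {
    aligned[i] = (global[i] + local[i] - 1) / local[i] * local[i];
  }
  return aligned;
}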

View File: src/runtime/kernel/opencl/kernel/arithmetic.h

@@ -28,18 +28,23 @@ class ArithmeticOpenCLKernel : public ArithmeticCPUKernel {
  explicit ArithmeticOpenCLKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                  const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx)
      : ArithmeticCPUKernel(parameter, inputs, outputs, ctx) {}
  ~ArithmeticOpenCLKernel() override {};
  ~ArithmeticOpenCLKernel() override{};
  int Init() override;
  int Run() override;
 private:
  std::vector<size_t> InitGlobalSize() const;
  void Image2dGetWorkGroupSize();
  void BufferGetWorkGroupSize();
  cl::Kernel kernel_;
  bool is_bias_add_{false};
  float weight_{1.f};
  float bias_{.0f};
  lite::opencl::OpenCLRuntime *runtime_;
  bool element_flag_{true};
  std::vector<size_t> local_size_;
  std::vector<size_t> global_size_;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_BACKEND_OPENCL_ARITHMETIC_H_