forked from mindspore-Ecosystem/mindspore
!3871 add opengl arthimitic
Merge pull request !3871 from chenzhongming/master
This commit is contained in:
commit
00bf6808c5
|
@ -1,49 +0,0 @@
|
|||
__kernel void ArithmeticAdd(__global float *input_a,
|
||||
__global float *input_b,
|
||||
__global float *output,
|
||||
const unsigned int n) {
|
||||
int id = get_global_id(0);
|
||||
if (id < n) {
|
||||
output[id] = input_a[id] + input_b[id];
|
||||
}
|
||||
}
|
||||
__kernel void ArithmeticSub(__global float *input_a,
|
||||
__global float *input_b,
|
||||
__global float *output,
|
||||
const unsigned int n) {
|
||||
int id = get_global_id(0);
|
||||
if (id < n) {
|
||||
output[id] = input_a[id] - input_b[id];
|
||||
}
|
||||
}
|
||||
__kernel void ArithmeticMul(__global float *input_a,
|
||||
__global float *input_b,
|
||||
__global float *output,
|
||||
const unsigned int n) {
|
||||
int id = get_global_id(0);
|
||||
if (id < n) {
|
||||
output[id] = input_a[id] * input_b[id];
|
||||
}
|
||||
}
|
||||
__kernel void ArithmeticDiv(__global float *input_a,
|
||||
__global float *input_b,
|
||||
__global float *output,
|
||||
const unsigned int n) {
|
||||
int id = get_global_id(0);
|
||||
if (id < n) {
|
||||
output[id] = input_a[id] * input_b[id];
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void ArithmeticBiasAdd(__global float4 *input,
|
||||
__global float4 *output,
|
||||
const float weight,
|
||||
const float bias,
|
||||
const unsigned int n) {
|
||||
int id = get_global_id(0);
|
||||
float4 bias_vec = (float4)(bias, 0.0f, .0f, .0f);
|
||||
float4 weight_vec = (float4)(weight, 0.0f, .0f, .0f);
|
||||
if (id < n) {
|
||||
output[id] = weight_vec * input[id] + bias_vec;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
__kernel void ElementAdd(__global float *input_a, __global float *input_b, __global float *output,
|
||||
const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = input_a[idx] + input_b[idx];
|
||||
}
|
||||
|
||||
__kernel void ElementSub(__global float *input_a, __global float *input_b, __global float *output,
|
||||
const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = input_a[idx] - input_b[idx];
|
||||
}
|
||||
|
||||
__kernel void ElementMul(__global float *input_a, __global float *input_b, __global float *output,
|
||||
const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = input_a[idx] * input_b[idx];
|
||||
}
|
||||
|
||||
__kernel void ElementDiv(__global float *input_a, __global float *input_b, __global float *output,
|
||||
const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = input_a[idx] * input_b[idx];
|
||||
}
|
||||
|
||||
__kernel void BoardcastAdd(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = input_a[idx] + input_b;
|
||||
}
|
||||
|
||||
__kernel void BoardcastSub(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = input_a[idx] - input_b;
|
||||
}
|
||||
|
||||
__kernel void BoardcastMul(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = input_a[idx] * input_b;
|
||||
}
|
||||
|
||||
__kernel void BoardcastDiv(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
|
||||
int idx = get_global_id(0);
|
||||
if (idx >= n) return;
|
||||
output[idx] = input_a[idx] * input_b;
|
||||
}
|
|
@ -0,0 +1,15 @@
|
|||
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
|
||||
|
||||
__kernel void ElementAdd(__read_only image2d_t *input_a, __read_only image2d_t *input_b, __write_only image2d_t *output,
|
||||
const int4 output_shape) {
|
||||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
int Z = get_global_id(2);
|
||||
if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) return;
|
||||
|
||||
if (idx >= n) return;
|
||||
float4 a = read_imagef(input_a, smp_none, (int2)(X, Y * output_shape.w + Z));
|
||||
float4 b = read_imagef(input_b, smp_none, (int2)(X, Y * output_shape.w + Z));
|
||||
src = a + b;
|
||||
write_imagef(output, (int2)(0, 0), src);
|
||||
}
|
|
@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -13,15 +13,16 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/runtime/kernel/opencl/kernel/arithmetic.h"
|
||||
#include <set>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <set>
|
||||
#include "schema/model_generated.h"
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/kernel/opencl/utils.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/arithmetic.h"
|
||||
#ifndef PROGRAM_WITH_IL
|
||||
#include "src/runtime/kernel/opencl/cl/fp32/arithmetic.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl.inc"
|
||||
#include "src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl.inc"
|
||||
#endif
|
||||
|
||||
using mindspore::kernel::KERNEL_ARCH::kGPU;
|
||||
|
@ -29,44 +30,53 @@ using mindspore::lite::KernelRegistrar;
|
|||
|
||||
namespace mindspore::kernel {
|
||||
|
||||
int ArithmeticOpenCLKernel::Init() {
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
std::string kernel_name = "ArithmeticAdd";
|
||||
std::vector<size_t> ArithmeticOpenCLKernel::InitGlobalSize() const {
|
||||
const size_t global_x = outputs_[0]->Width();
|
||||
const size_t global_y = outputs_[0]->Height();
|
||||
const size_t global_z = UP_ROUND_DIV(outputs_[0]->Channel(), 4);
|
||||
std::vector<size_t> global = {global_x, global_y, global_z};
|
||||
return global;
|
||||
}
|
||||
|
||||
void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() {
|
||||
global_size_ = InitGlobalSize();
|
||||
int max_work_group_size = runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*runtime_->Device())());
|
||||
local_size_ = GetLocalSize(global_size_, max_work_group_size);
|
||||
global_size_ = GetGlobalSize(local_size_, global_size_);
|
||||
}
|
||||
|
||||
void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() {
|
||||
uint32_t element_num = outputs_[0]->ElementsC4Num();
|
||||
global_size_ = {element_num};
|
||||
}
|
||||
|
||||
int ArithmeticOpenCLKernel::Init() {
|
||||
runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
std::string element_name;
|
||||
std::string boardcast_name;
|
||||
|
||||
is_bias_add_ = false;
|
||||
if (inputs_[1]->TensorType() == schema::NodeType_ValueNode && inputs_[1]->Data() != nullptr) {
|
||||
kernel_name = "ArithmeticBiasAdd";
|
||||
is_bias_add_ = true;
|
||||
element_flag_ = false;
|
||||
} else {
|
||||
element_flag_ = true;
|
||||
}
|
||||
|
||||
switch (opParameter->type_) {
|
||||
case PrimitiveType_Mul:
|
||||
if (is_bias_add_) {
|
||||
weight_ = static_cast<float *>(inputs_[1]->Data())[0];
|
||||
break;
|
||||
}
|
||||
kernel_name = "ArithmeticMul";
|
||||
element_name = "ElementMul";
|
||||
boardcast_name = "BoardcastMul";
|
||||
break;
|
||||
case PrimitiveType_Add:
|
||||
if (is_bias_add_) {
|
||||
bias_ = static_cast<float *>(inputs_[1]->Data())[0];
|
||||
break;
|
||||
}
|
||||
kernel_name = "ArithmeticAdd";
|
||||
element_name = "ElementAdd";
|
||||
boardcast_name = "BoardcastAdd";
|
||||
break;
|
||||
case PrimitiveType_Sub:
|
||||
if (is_bias_add_) {
|
||||
bias_ = -1 * static_cast<float *>(inputs_[1]->Data())[0];
|
||||
break;
|
||||
}
|
||||
kernel_name = "ArithmeticSub";
|
||||
element_name = "ElementSub";
|
||||
boardcast_name = "BoardcastSub";
|
||||
break;
|
||||
case PrimitiveType_Div:
|
||||
if (is_bias_add_) {
|
||||
weight_ = 1 / static_cast<float *>(inputs_[1]->Data())[0];
|
||||
break;
|
||||
}
|
||||
kernel_name = "ArithmeticDiv";
|
||||
element_name = "ElementDiv";
|
||||
boardcast_name = "BoardcastDiv";
|
||||
break;
|
||||
default:
|
||||
MS_LOG(ERROR) << "Error Operator type " << opParameter->type_;
|
||||
|
@ -74,39 +84,44 @@ int ArithmeticOpenCLKernel::Init() {
|
|||
}
|
||||
|
||||
#ifdef PROGRAM_WITH_IL
|
||||
ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
|
||||
runtime_->CreateKernelFromIL(kernel_(), kernel_name);
|
||||
#else
|
||||
std::string program_name = "Arithmetic";
|
||||
std::set<std::string> build_options;
|
||||
std::string source = arithmetic_source_fp32;
|
||||
ocl_runtime->LoadSource(program_name, source);
|
||||
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
std::string source = arithmetic_buffer_source_fp32;
|
||||
runtime_->LoadSource(program_name, source);
|
||||
|
||||
if (element_flag_) {
|
||||
runtime_->BuildKernel(kernel_, program_name, element_name, build_options);
|
||||
MS_LOG(DEBUG) << element_name << " Init Done!";
|
||||
} else {
|
||||
runtime_->BuildKernel(kernel_, program_name, boardcast_name, build_options);
|
||||
MS_LOG(DEBUG) << boardcast_name << " Init Done!";
|
||||
}
|
||||
#endif
|
||||
outputs_[0]->SetFormat(schema::Format_NHWC4);
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ArithmeticOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->Name() << " Running!";
|
||||
uint32_t element_num = outputs_[0]->ElementsC4Num();
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
std::vector<size_t> global = {element_num};
|
||||
std::vector<size_t> local;
|
||||
auto runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
BufferGetWorkGroupSize();
|
||||
|
||||
ocl_runtime->SetKernelArg(kernel_, 0, inputs_[0]->Data());
|
||||
if (is_bias_add_) {
|
||||
MS_LOG(DEBUG) << "weight: " << weight_ << " bias: " << bias_;
|
||||
ocl_runtime->SetKernelArg(kernel_, 1, outputs_[0]->Data());
|
||||
ocl_runtime->SetKernelArg(kernel_, 2, weight_);
|
||||
ocl_runtime->SetKernelArg(kernel_, 3, bias_);
|
||||
ocl_runtime->SetKernelArg(kernel_, 4, element_num / C4NUM);
|
||||
int arg_idx = 0;
|
||||
uint32_t element_num = outputs_[0]->ElementsC4Num();
|
||||
|
||||
runtime_->SetKernelArg(kernel_, arg_idx++, inputs_[0]->Data());
|
||||
if (element_flag_) {
|
||||
runtime_->SetKernelArg(kernel_, arg_idx++, inputs_[1]->Data());
|
||||
} else {
|
||||
ocl_runtime->SetKernelArg(kernel_, 1, inputs_[1]->Data());
|
||||
ocl_runtime->SetKernelArg(kernel_, 2, outputs_[0]->Data());
|
||||
ocl_runtime->SetKernelArg(kernel_, 3, element_num);
|
||||
runtime_->SetKernelArg(kernel_, arg_idx++, static_cast<float *>(inputs_[1]->Data())[0]);
|
||||
}
|
||||
return ocl_runtime->RunKernel(kernel_, global, local, nullptr);
|
||||
runtime_->SetKernelArg(kernel_, arg_idx++, outputs_[0]->Data());
|
||||
runtime_->SetKernelArg(kernel_, arg_idx++, element_num);
|
||||
|
||||
runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
|
||||
return 0;
|
||||
}
|
||||
|
||||
kernel::LiteKernel *OpenCLArithmeticKernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
|
||||
|
@ -132,4 +147,3 @@ REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Add, OpenCLArithmeticKernelCr
|
|||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sub, OpenCLArithmeticKernelCreator)
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Div, OpenCLArithmeticKernelCreator)
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
|
|
|
@ -28,18 +28,23 @@ class ArithmeticOpenCLKernel : public ArithmeticCPUKernel {
|
|||
explicit ArithmeticOpenCLKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
|
||||
const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx)
|
||||
: ArithmeticCPUKernel(parameter, inputs, outputs, ctx) {}
|
||||
~ArithmeticOpenCLKernel() override {};
|
||||
~ArithmeticOpenCLKernel() override{};
|
||||
|
||||
int Init() override;
|
||||
int Run() override;
|
||||
|
||||
private:
|
||||
std::vector<size_t> InitGlobalSize() const;
|
||||
void Image2dGetWorkGroupSize();
|
||||
void BufferGetWorkGroupSize();
|
||||
|
||||
cl::Kernel kernel_;
|
||||
bool is_bias_add_{false};
|
||||
float weight_{1.f};
|
||||
float bias_{.0f};
|
||||
lite::opencl::OpenCLRuntime *runtime_;
|
||||
bool element_flag_{true};
|
||||
|
||||
std::vector<size_t> local_size_;
|
||||
std::vector<size_t> global_size_;
|
||||
};
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
#endif // MINDSPORE_LITE_SRC_BACKEND_OPENCL_ARITHMETIC_H_
|
||||
|
||||
|
|
Loading…
Reference in New Issue