forked from mindspore-Ecosystem/mindspore

add new op Split and modify matmul_strassen

This commit is contained in:
parent 2fd660a2e9
commit 8cf6298bc9
@@ -73,7 +73,7 @@ __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE |
     int coordinate_x = Y * input_shape0.w + Z;                                   \
     int coordinate_y = X;                                                        \
     result = READ_IMAGE(input0, smp_none, (int2)(coordinate_x, coordinate_y));   \
-  } else {                                                                       \
+  } else if (Y < boundary1) {                                                    \
     int coordinate_x = (Y - boundary0) * input_shape1.w + Z;                     \
     int coordinate_y = X;                                                        \
     result = READ_IMAGE(input1, smp_none, (int2)(coordinate_x, coordinate_y));   \
@@ -123,7 +123,7 @@ __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE |
     int coordinate_x = Y * input_shape0.w + Z;                                   \
     int coordinate_y = X;                                                        \
     result = READ_IMAGE(input0, smp_none, (int2)(coordinate_x, coordinate_y));   \
-  } else {                                                                       \
+  } else if (Z < boundary1) {                                                    \
     int coordinate_x = Y * input_shape1.w + Z - boundary0;                       \
     int coordinate_y = X;                                                        \
     result = READ_IMAGE(input1, smp_none, (int2)(coordinate_x, coordinate_y));   \
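
These two hunks tighten the input-selection macros (they appear to belong to the concat OpenCL kernel, given the ConcatOpenCLKernel changes further below): the bare `else` becomes an explicit `else if (... < boundary1)`, so a work-item whose index lies beyond the second input's range no longer falls through and reads from `input1`. A minimal host-side sketch of the same selection logic, with illustrative values that are not part of the commit:

#include <cstdio>

// Pick which input a packed index falls into, mirroring the kernel's
// boundary0 / boundary1 checks. Returns -1 when the index is past both inputs.
int PickInput(int idx, int boundary0, int boundary1) {
  if (idx < boundary0) return 0;  // read from input0
  if (idx < boundary1) return 1;  // read from input1 (the new, guarded branch)
  return -1;                      // out of range: do not touch input1 at all
}

int main() {
  // Example: input0 contributes 2 slices, input1 contributes 3 -> boundaries 2 and 5.
  for (int idx = 0; idx < 6; ++idx) {
    printf("idx=%d -> input %d\n", idx, PickInput(idx, 2, 5));
  }
  return 0;
}
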
@@ -1,5 +1,5 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#define C4NUM 4
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

 __kernel void gather(__write_only image2d_t dst_data, __read_only image2d_t src_data, __global int *indices,
@@ -0,0 +1,114 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
+#define C4NUM 4
+
+#define CHECK_IDX_ALIGN                                                                       \
+  const int X = get_global_id(0);                                                             \
+  const int Y = get_global_id(1);                                                             \
+  const int Z = get_global_id(2);                                                             \
+  if (X > in_shape.x * in_shape.y || Y > in_shape.z || Z > in_shape.w || in_shape.y == 0) {   \
+    return;                                                                                   \
+  }
+
+#define ARGS_ALIGN                                                                 \
+  const int IN = X / in_shape.y;                                                   \
+  const int IH = X % in_shape.y;                                                   \
+  int coordinate_x = IN * in_shape.y + IH;                                         \
+  int coordinate_y = Y * in_shape.w + Z;                                           \
+  FLT4 result = READ_IMAGE(input, smp_none, (int2)(coordinate_y, coordinate_x));
+
+__kernel void split_out2_axis3(__read_only image2d_t input, __write_only image2d_t output1,
+                               __write_only image2d_t output2, __global int *split_sizes_, int4 in_shape,
+                               int4 out_shape1, int4 out_shape2) {
+  CHECK_IDX_ALIGN;
+  ARGS_ALIGN;
+  int boundary = UP_DIV(split_sizes_[0], C4NUM);
+  if (Z < boundary) {
+    coordinate_x = IN * out_shape1.y + IH;
+    coordinate_y = Y * out_shape1.w + Z;
+    WRITE_IMAGE(output1, (int2)(coordinate_y, coordinate_x), result);
+  } else {
+    coordinate_x = IN * out_shape2.y + IH;
+    coordinate_y = Y * out_shape2.w + Z - boundary;
+    WRITE_IMAGE(output2, (int2)(coordinate_y, coordinate_x), result);
+  }
+}
+
+__kernel void split_out2_axis2(__read_only image2d_t input, __write_only image2d_t output1,
+                               __write_only image2d_t output2, __global int *split_sizes_, int4 in_shape,
+                               int4 out_shape1, int4 out_shape2) {
+  CHECK_IDX_ALIGN;
+  ARGS_ALIGN;
+  if (Y < split_sizes_[0]) {
+    coordinate_x = IN * out_shape1.y + IH;
+    coordinate_y = Y * out_shape1.w + Z;
+    WRITE_IMAGE(output1, (int2)(coordinate_y, coordinate_x), result);
+  } else {
+    coordinate_x = IN * out_shape2.y + IH;
+    coordinate_y = (Y - split_sizes_[0]) * out_shape2.w + Z;
+    WRITE_IMAGE(output2, (int2)(coordinate_y, coordinate_x), result);
+  }
+}
+
+__kernel void split_out2_axis1(__read_only image2d_t input, __write_only image2d_t output1,
+                               __write_only image2d_t output2, __global int *split_sizes_, int4 in_shape,
+                               int4 out_shape1, int4 out_shape2) {
+  CHECK_IDX_ALIGN;
+  ARGS_ALIGN;
+  if (IH < split_sizes_[0]) {
+    coordinate_x = IN * out_shape1.y + IH;
+    coordinate_y = Y * out_shape1.w + Z;
+    WRITE_IMAGE(output1, (int2)(coordinate_y, coordinate_x), result);
+  } else {
+    coordinate_x = IN * out_shape2.y + IH - split_sizes_[0];
+    coordinate_y = Y * out_shape2.w + Z;
+    WRITE_IMAGE(output2, (int2)(coordinate_y, coordinate_x), result);
+  }
+}
+
+// UnAlign in Axis C for concat
+#define CHECK_IDX_UNALIGN                                                     \
+  const int X = get_global_id(0);                                             \
+  const int Y = get_global_id(1);                                             \
+  if (X >= in_shape.x * in_shape.y || Y >= in_shape.z || in_shape.y == 0) {   \
+    return;                                                                   \
+  }
+
+#define ARGS_UNALIGN                                                          \
+  const int IN = X / in_shape.y, IH = X % in_shape.y;                         \
+  const int IW = Y;                                                           \
+  const int Align_inShape = UP_DIV(in_shape.w, C4NUM);                        \
+  int index_input = (IN * in_shape.y + IH) * stride_w + IW * Align_inShape * C4NUM;
+
+int dosplit(__global FLT *input, __write_only image2d_t output, int4 out_shape, int IN, int IH, int IW,
+            int index_input) {
+  int Remainder = out_shape.w % C4NUM;
+  int coordinate_x = IN * out_shape.y + IH;
+  int align_w = UP_DIV(out_shape.w, C4NUM);
+  for (int i = 0; i < align_w; ++i) {
+    int coordinate_y = IW * align_w + i;
+    if ((i + 1) * C4NUM <= out_shape.w) {
+      FLT4 result = {input[index_input], input[index_input + 1], input[index_input + 2], input[index_input + 3]};
+      WRITE_IMAGE(output, (int2)(coordinate_y, coordinate_x), result);
+      index_input += 4;
+    } else {
+      FLT result_temp[4] = {};
+      for (int j = 0; j < Remainder; ++j) {
+        result_temp[j] = input[index_input++];
+      }
+      FLT4 result = {result_temp[0], result_temp[1], result_temp[2], result_temp[3]};
+      WRITE_IMAGE(output, (int2)(coordinate_y, coordinate_x), result);
+    }
+  }
+  return index_input;
+}
+
+__kernel void split_out2_axis3_unalign(__global FLT *input, __write_only image2d_t output1,
+                                       __write_only image2d_t output2, __global int *split_sizes_, int4 in_shape,
+                                       int4 out_shape1, int4 out_shape2, int stride_w) {
+  CHECK_IDX_UNALIGN;
+  ARGS_UNALIGN;
+  index_input = dosplit(input, output1, out_shape1, IN, IH, IW, index_input);
+  index_input = dosplit(input, output2, out_shape2, IN, IH, IW, index_input);
+}
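
The new split.cl covers two layouts. When every output's channel count is a multiple of 4, the aligned kernels move whole FLT4 slices between image objects, and only the C4 boundary of the first output (`UP_DIV(split_sizes_[0], C4NUM)`) is needed to pick the destination. Otherwise the `*_unalign` kernel reads a plain buffer and repacks FLT4 vectors, zero-padding the last partial group. A small host-side sketch of that bookkeeping (values made up for illustration, not from the commit):

#include <cstdio>

constexpr int C4NUM = 4;
int UpDiv(int x, int y) { return (x + y - 1) / y; }  // same rounding as the UP_DIV macro

int main() {
  // Aligned case: split 12 channels into 8 + 4; C4 slices 0..1 go to output1, the rest to output2.
  printf("aligned boundary = %d C4 slices for output1\n", UpDiv(8, C4NUM));

  // Unaligned case: split into 6 + 6; each output ends with a half-filled FLT4,
  // so the *_unalign path copies element by element and zero-pads the tail.
  printf("unaligned: %d full group(s) + %d leftover element(s) per output\n", 6 / C4NUM, 6 % C4NUM);
  return 0;
}
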
@@ -161,6 +161,14 @@ void ConcatOpenCLKernel::SetGlobalLocal() {
 }

 int ConcatOpenCLKernel::Prepare() {
+  if (axis_ == 0) {
+    for (int i = 0; i < in_tensors_.size(); ++i) {
+      if (in_tensors_.at(0)->shape().size() != 1) {
+        return RET_OK;
+      }
+    }
+    axis_ = 3;
+  }
   for (int i = 0; i < in_tensors_.size(); ++i) {
     int length = in_tensors_[0]->shape().size();
     if (in_tensors_[i]->shape()[length - 1] % C4NUM != 0) {
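
The added block in ConcatOpenCLKernel::Prepare() special-cases axis_ == 0: when the inputs are one-dimensional it remaps the concat to the innermost axis (axis_ = 3) of the broadcast 4-D layout, and otherwise Prepare() returns early. A rough sketch of that intent (my reading of the hunk, not the literal kernel code):

#include <cstdio>
#include <vector>

// If all inputs are rank-1, treat axis 0 as the channel axis (3) of an
// implicit 1x1x1xC layout; otherwise leave the requested axis alone.
int NormalizeConcatAxis(int axis, const std::vector<std::vector<int>> &shapes) {
  if (axis != 0) return axis;
  for (const auto &s : shapes) {
    if (s.size() != 1) return axis;  // higher-rank input: keep axis 0 handling elsewhere
  }
  return 3;
}

int main() {
  printf("%d\n", NormalizeConcatAxis(0, {{8}, {16}}));        // 3: two plain vectors
  printf("%d\n", NormalizeConcatAxis(0, {{1, 8}, {1, 16}}));  // 0: not all rank-1
  return 0;
}
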
@@ -99,6 +99,9 @@ void GatherOpenCLKernel::SetGlobalLocal() {

 int GatherOpenCLKernel::Prepare() {
   std::string kernel_name = "gather";
+  if (in_tensors_.at(0)->shape().size() == 1 && axis_ == 0) {
+    axis_ = 3;
+  }
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else

@@ -106,17 +109,13 @@ int GatherOpenCLKernel::Prepare() {
   ocl_runtime_->LoadSource(program_name, gather_source);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
-  if (!in_tensors_.at(1)->IsConst()) {
-    intensor1_is_tensor = true;
-  }
-
-  if (!intensor1_is_tensor) {
+  if (in_tensors_.at(1)->IsConst()) {
+    intensor1_is_tensor = false;
     int ret = InitWeights();
     if (ret != RET_OK) {
       return ret;
     }
   }
-
   SetGlobalLocal();
   SetConstArgs();
   MS_LOG(DEBUG) << kernel_name << " Init Done!";

@@ -125,7 +124,6 @@ int GatherOpenCLKernel::Prepare() {

 int GatherOpenCLKernel::ConvertTensorToweight() {
   auto allocator = ocl_runtime_->GetAllocator();
-  GpuTensorInfo img_info(in_tensors_[1]);
   auto indices_tensor = in_tensors_.at(1);
   auto indices_num = indices_tensor->ElementsNum();
   indices_data_ = reinterpret_cast<int32_t *>(allocator->Malloc(sizeof(int32_t) * indices_num));
@@ -41,13 +41,10 @@ class GatherOpenCLKernel : public OpenCLKernel {
   int Tune() override { return lite::RET_OK; }
   int ConvertTensorToweight();

- protected:
-  int UpdateWeights();
-
  private:
   int32_t *indices_data_{nullptr};
   int axis_ = {0};
-  bool intensor1_is_tensor{false};
+  bool intensor1_is_tensor{true};
   bool enable_fp16_{false};
 };
 }  // namespace mindspore::kernel
@@ -17,17 +17,12 @@
 #include <set>
 #include <string>
 #include <map>
-#include "nnacl/fp32/common_func_fp32.h"
 #include "src/kernel_registry.h"
-#include "src/runtime/kernel/opencl/utils.h"
 #include "src/runtime/kernel/opencl/kernel/matmul.h"
-#include "src/common/utils.h"
+#include "src/runtime/kernel/opencl/kernel/strassen.h"

 #ifndef PROGRAM_WITH_IL
-
 #include "src/runtime/kernel/opencl/cl/matmul.cl.inc"
-#include "src/runtime/kernel/opencl/cl/strassen.cl.inc"
-
 #endif

 using mindspore::kernel::KERNEL_ARCH::kGPU;

@@ -36,6 +31,21 @@ using mindspore::schema::PrimitiveType_MatMul;

 namespace mindspore::kernel {

+bool IsUseStrassenMatmul(const std::vector<lite::Tensor *> &in_tensors_) {
+  if (in_tensors_.at(0)->shape().size() == 2) {
+    auto shape0 = in_tensors_.at(0)->shape();
+    auto shape1 = in_tensors_.at(1)->shape();
+    if (in_tensors_.at(1)->IsConst() && (shape0[0] == shape0[1]) && (shape1[0] == shape1[1]) &&
+        (shape0[0] == shape1[0]) && (shape0[0] % 8 == 0)) {
+      return true;
+    } else {
+      return false;
+    }
+  } else {
+    return false;
+  }
+}
+
 int MatMulOpenCLKernel::CheckSpecs() {
   if (in_tensors_.size() != 2 || out_tensors_.size() != 1) {
     MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
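
IsUseStrassenMatmul moves the old Prepare()-time check into a free function so the kernel creator added at the bottom of this file can pick between MatMulOpenCLKernel and StrassenOpenCLKernel before either object exists. The gate requires a 2-D matmul with a constant right-hand side and square, equal, 8-divisible dimensions. A condensed restatement of that predicate on plain shapes (hypothetical helper, not part of the commit):

#include <vector>

// True when an N x N (N % 8 == 0) matmul with a constant weight of the same
// square shape is eligible for the Strassen kernel, per IsUseStrassenMatmul.
bool EligibleForStrassen(const std::vector<int> &a, const std::vector<int> &b, bool b_is_const) {
  if (a.size() != 2 || b.size() != 2 || !b_is_const) return false;
  return a[0] == a[1] && b[0] == b[1] && a[0] == b[0] && a[0] % 8 == 0;
}

// e.g. EligibleForStrassen({256, 256}, {256, 256}, true) -> true
//      EligibleForStrassen({256, 300}, {300, 256}, true) -> false (not square)
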
@@ -74,35 +84,14 @@ int MatMulOpenCLKernel::Prepare() {
   }
   std::map<int, std::string> dims2str = {{2, "_2d"}, {3, "_4d"}, {4, "_4d"}};
   kernel_name += dims2str[dims];
-  if (in_tensors_.at(0)->shape().size() == 2) {
-    auto shape0 = in_tensors_.at(0)->shape();
-    auto shape1 = in_tensors_.at(1)->shape();
-    if (in_tensors_.at(1)->IsConst() && (shape0[0] == shape0[1]) && (shape1[0] == shape1[1]) &&
-        (shape0[0] == shape1[0]) && (shape0[0] % 8 == 0)) {
-      use_strassen = true;
-    }
-  }
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
   std::string source = matmul_source;
-  if (use_strassen) {
-    source.clear();
-    source = strassen_source;
-  }
   std::string program_name = "MatMul";
   ocl_runtime_->LoadSource(program_name, source);
-  if (use_strassen) {
-    kernel_name = "MatMul_Strassen_NHWC4_2d";
-    ocl_runtime_->BuildKernel(kernel_IMG_add_sub_2, program_name, "MatMul_IMG_Add_Sub_2");
-    ocl_runtime_->BuildKernel(kernel_BUF_add_sub_2, program_name, "MatMul_BUF_Add_Sub_2");
-    ocl_runtime_->BuildKernel(kernel_back_result, program_name, "Strassen_Back_Result");
     ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
-    ocl_runtime_->BuildKernel(MatMul_StrassenBUFFilled, program_name, "MatMul_BUF_Filled");
-    ocl_runtime_->BuildKernel(MatMul_StrassenIMGFilled, program_name, "MatMul_IMG_Filled");
-  } else {
-    ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
-  }
 #endif
   auto ret = InitWeights();
   if (ret != RET_OK) {
@@ -114,31 +103,6 @@ int MatMulOpenCLKernel::Prepare() {
   return mindspore::lite::RET_OK;
 }

-void MatMulOpenCLKernel::AllocatorMemoryForStrassen(int NumA, int NumB) {
-  std::vector<size_t> img_size;
-  img_size.push_back(UP_DIV(NumA, C4NUM));
-  img_size.push_back(NumA);
-  size_t img_dtype = enable_fp16_ ? CL_HALF_FLOAT : CL_FLOAT;
-  size_t dtype_size = enable_fp16_ ? sizeof(CL_HALF_FLOAT) : sizeof(CL_FLOAT);
-  img_size.push_back(img_dtype);
-  auto allocator = ocl_runtime_->GetAllocator();
-  size_t memA = NumA * NumA;
-
-  size_t memB = NumB * NumB * dtype_size;
-  for (int depth = 0; depth < MAXDEPTH; depth++) {
-    B_temp[depth] = allocator->Malloc(memB);
-    A_temp[depth] = allocator->Malloc(memA, img_size);
-
-    M1[depth] = allocator->Malloc(memA, img_size);
-    M2[depth] = allocator->Malloc(memA, img_size);
-    M3[depth] = allocator->Malloc(memA, img_size);
-    M4[depth] = allocator->Malloc(memA, img_size);
-    M5[depth] = allocator->Malloc(memA, img_size);
-    M6[depth] = allocator->Malloc(memA, img_size);
-    M7[depth] = allocator->Malloc(memA, img_size);
-  }
-}
-
 int MatMulOpenCLKernel::InitWeights() {
   if (act_weight_) {
     return RET_OK;
@@ -165,30 +129,6 @@ int MatMulOpenCLKernel::InitWeights() {
   auto originWeightFp32 = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->data_c());
   auto originWeightFp16 = reinterpret_cast<float16_t *>(in_tensors_.at(kWeightIndex)->data_c());
   bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16;
-  if (use_strassen) {
-    int NumA = in_tensors_[0]->shape()[0];
-    int NumB = in_tensors_[1]->shape()[0];
-    AllocatorMemoryForStrassen(NumA / 2, NumB / 2);
-    size_t size = NumA * NumB * dtype_size;
-    transposeB = false;
-    if (isModelFp16) {
-      if (enable_fp16_) {
-        memcpy(padWeightFp16, originWeightFp16, size);
-      } else {
-        for (int i = 0; i < NumA * NumB; ++i) {
-          padWeightFp32[i] = static_cast<float>(originWeightFp16[i]);
-        }
-      }
-    } else {
-      if (enable_fp16_) {
-        for (int i = 0; i < NumA * NumB; ++i) {
-          padWeightFp16[i] = static_cast<float16_t>(originWeightFp32[i]);
-        }
-      } else {
-        memcpy(padWeightFp32, originWeightFp32, size);
-      }
-    }
-  } else {
   // pad weight
   // ABCICO -> AB(CI4)(CO4)(4 from CO)(4 from CI)
   // if tranposeB, ABCOCI -> AB(CI4)(CO4)(4 from CO)(4 from CI)
@@ -229,262 +169,37 @@
       }
     }
   }
-  }

   allocator->UnmapBuffer(padWeight_);
   FreeDequantedWeight();
   return RET_OK;
 }

-void AlignStrassenGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local,
-                              cl::NDRange *global_range, cl::NDRange *local_range) {
-  *local_range = cl::NDRange(local[0], local[1], local[2]);
-  *global_range =
-    cl::NDRange(UP_ROUND(global[0], local[0]), UP_ROUND(global[1], local[1]), UP_ROUND(global[2], local[2]));
-}
-
-// 0 : global_size_, 1: global_size_add_sub
-void MatMulOpenCLKernel::StrassenSetGlobalLocal(size_t strassen_size, int type_flag) {
-  size_t strassen_size_C4 = UP_DIV(strassen_size, C4NUM);
-  local_size_add_sub = {16, 1, 16};
-  if (type_flag == 0) {
-    global_size_ = {strassen_size_C4, 1, strassen_size};
-    AlignGlobalLocal(global_size_, local_size_);
-  } else {
-    global_size_add_sub = {strassen_size_C4, 1, strassen_size};
-    AlignStrassenGlobalLocal(global_size_add_sub, local_size_add_sub, &global_add_sub_, &local_add_sub_);
-  }
-}
-
 void MatMulOpenCLKernel::SetGlobalLocal() {
   // local size should less than MAX_GROUP_SIZE
   local_size_ = {32, 4, 1};
   global_size_ = {1, 1, 1};
-  if (use_strassen) {
-    size_t strassen_size = outShape[3] / 2;
-    StrassenSetGlobalLocal(strassen_size, 0);  // set global_ and local
-    StrassenSetGlobalLocal(strassen_size, 1);  // set global_size_add_sub
-    StrassenSetGlobalLocal(strassen_size, 2);  // set global_size_weights
-  } else {
     global_size_ = {UP_DIV(static_cast<size_t>(outShape[3]), C4NUM),
                     4 * static_cast<size_t>(outShape[0]) * static_cast<size_t>(outShape[1]),
                     static_cast<size_t>(outShape[2])};
     AlignGlobalLocal(global_size_, local_size_);
   }
-}

-void MatMulOpenCLKernel::StrassenSetConstArgs(cl::Kernel *kernel, int index, int strassen_size, bool is_matmul_kernel) {
-  cl_int4 shape;
-  if (is_matmul_kernel) {
-    shape = {1, 1, strassen_size, strassen_size};
-  } else {
-    shape = {strassen_size, 1, 1, UP_DIV(strassen_size, C4NUM)};
-  }
-  ocl_runtime_->SetKernelArg(*kernel, index, shape);
-}
-
 void MatMulOpenCLKernel::SetConstArgs() {
   int arg_count = 2;
   cl_int4 in_shape = {inShape[0], inShape[1], inShape[2], inShape[3]};
   cl_int4 out_shape = {outShape[0], outShape[1], outShape[2], outShape[3]};
-  cl_int4 shape_offset = {0, 0, 0, 0};
-  if (use_strassen) {
-    int strassen_size = inShape[3] / 2;
-    out_shape.s[2] = in_shape.s[2] = in_shape.s[2] / 2;
-    out_shape.s[3] = in_shape.s[3] = in_shape.s[3] / 2;
-    StrassenSetConstArgs(&kernel_IMG_add_sub_2, 3, strassen_size, false);
-    StrassenSetConstArgs(&kernel_BUF_add_sub_2, 2, strassen_size, false);
-  } else {
     if (act_weight_) {
       arg_count++;
     } else {
       ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF);
     }
-  }
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape);
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape);
-  ocl_runtime_->SetKernelArg(kernel_, arg_count++, shape_offset);
-}
-
-// OriginSize = N*H*W*C typesize = sizeof(type data) width = W * UP_DIV(C,C4NUM) size = N
-void MatMulOpenCLKernel::PrintImage2d(void *IMGData, size_t typesize, size_t width, size_t size) {
-  auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
-  int alignment = runtime_wrapper.GetInstance()->GetImagePitchAlignment();
-  auto runtime = runtime_wrapper.GetInstance();
-  runtime->SyncCommandQueue();
-  MS_ASSERT(alignment);
-  size_t row_pitch = UP_ROUND(width, alignment) * typesize * C4NUM;
-  size_t OriginSize = size * size * typesize;
-  std::vector<char> data(OriginSize);
-  auto row_size = width * typesize * C4NUM;
-
-  for (int i = 0; i < size; ++i) {
-    memcpy(reinterpret_cast<char *>(data.data()) + i * row_size, static_cast<char *>(IMGData) + i * row_pitch,
-           row_size);
-  }
-  for (int i = 0; i < size * size; ++i) {
-    if ((i + 1) % size == 0) {
-      std::cout << std::endl;
-    }
-  }
-}
-
-void MatMulOpenCLKernel::StrassenDataFilled(cl::Kernel *kernel, void *input, void *output, const int size,
-                                            cl_int2 offset, lite::opencl::MemType mem_type) {
-  if (input == nullptr || output == nullptr) {
-    MS_LOG(ERROR) << "StrassenDataFilled input or output can not nullptr";
-    return;
-  }
-  if (mem_type == lite::opencl::MemType::IMG) {
-    ocl_runtime_->SetKernelArg(*kernel, 0, input);
-    ocl_runtime_->SetKernelArg(*kernel, 1, output);
-  } else {
-    ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::BUF);
-    ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::BUF);
-  }
-  StrassenSetConstArgs(kernel, 2, size, false);
-  ocl_runtime_->SetKernelArg(*kernel, 3, offset);
-  ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_);
-}
-
-void MatMulOpenCLKernel::StrassenAddSub(cl::Kernel *kernel, void *input, void *output, const int size, cl_int4 offset,
-                                        int flag, lite::opencl::MemType mem_type) {
-  if (input == nullptr || output == nullptr) {
-    MS_LOG(ERROR) << "StrassenAddSub input or output can not nullptr";
-    return;
-  }
-  if (mem_type == lite::opencl::MemType::IMG) {
-    ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::IMG);
-    ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::IMG);
-  } else {
-    ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::BUF);
-    ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::BUF);
-  }
-  StrassenSetConstArgs(kernel, 2, size, false);
-  ocl_runtime_->SetKernelArg(*kernel, 3, offset);
-  ocl_runtime_->SetKernelArg(*kernel, 4, flag);
-  ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_);
-}
-
-void MatMulOpenCLKernel::StrassenBackResult(cl::Kernel *kernel, void *input1, void *input2, void *input3, void *input4,
-                                            void *input5, void *input6, void *input7, void *output, const int size) {
-  if (input1 == nullptr || input2 == nullptr || input3 == nullptr || input4 == nullptr || input5 == nullptr ||
-      input6 == nullptr || input7 == nullptr || output == nullptr) {
-    MS_LOG(ERROR) << "StrassenBackResult input or output can not nullptr";
-    return;
-  }
-  ocl_runtime_->SetKernelArg(*kernel, 0, input1);
-  ocl_runtime_->SetKernelArg(*kernel, 1, input2);
-  ocl_runtime_->SetKernelArg(*kernel, 2, input3);
-  ocl_runtime_->SetKernelArg(*kernel, 3, input4);
-  ocl_runtime_->SetKernelArg(*kernel, 4, input5);
-  ocl_runtime_->SetKernelArg(*kernel, 5, input6);
-  ocl_runtime_->SetKernelArg(*kernel, 6, input7);
-  ocl_runtime_->SetKernelArg(*kernel, 7, output);
-  StrassenSetConstArgs(kernel, 8, size, false);
-  ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_);
-}
-
-void MatMulOpenCLKernel::StrassenRunMmatmul(void *input, void *weight, void *output, const int size) {
-  if (input == nullptr || weight == nullptr || output == nullptr) {
-    MS_LOG(ERROR) << "StrassenRunMmatmul input ,weight or output can not nullptr";
-    return;
-  }
-  ocl_runtime_->SetKernelArg(kernel_, 0, input);
-  ocl_runtime_->SetKernelArg(kernel_, 1, output);
-  ocl_runtime_->SetKernelArg(kernel_, 2, weight, lite::opencl::MemType::BUF);
-  StrassenSetConstArgs(&kernel_, 3, size, true);
-  StrassenSetConstArgs(&kernel_, 4, size, true);
-  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
-}
-
-void MatMulOpenCLKernel::DoStrassen(void *data, void *weight, void *result, const int size, const int depth,
-                                    const int threshold) {
-  const int size_2 = size / 2;
-  int C4 = UP_DIV(size_2, C4NUM);
-  if (size <= threshold) {
-    // run matmul;
-    StrassenSetGlobalLocal(size, 0);
-    StrassenRunMmatmul(data, weight, result, size);
-    return;
-  }
-  // flag = 0 : add otherwise flag = 1 : sub
-  // M1 = A11 * ( B12- B22)
-  StrassenSetGlobalLocal(size_2, 1);
-  StrassenDataFilled(&MatMul_StrassenIMGFilled, data, A_temp[depth + 1], size_2, {0, 0}, lite::opencl::MemType::IMG);
-  StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {0, C4, size_2, C4}, 1,
-                 lite::opencl::MemType::BUF);
-  DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M1[depth + 1], size_2, depth + 1, threshold);
-
-  // M2 = (A11 + A12) * B22
-  StrassenSetGlobalLocal(size_2, 1);
-  StrassenDataFilled(&MatMul_StrassenBUFFilled, weight, B_temp[depth + 1], size_2, {size_2, C4},
-                     lite::opencl::MemType::BUF);
-  StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {0, 0, 0, C4}, 0, lite::opencl::MemType::IMG);
-  DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M2[depth + 1], size_2, depth + 1, threshold);
-
-  // M3 = (A21 + A22) * B11
-  StrassenSetGlobalLocal(size_2, 1);
-  StrassenDataFilled(&MatMul_StrassenBUFFilled, weight, B_temp[depth + 1], size_2, {0, 0}, lite::opencl::MemType::BUF);
-  StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {size_2, 0, size_2, C4}, 0,
-                 lite::opencl::MemType::IMG);
-  DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M3[depth + 1], size_2, depth + 1, threshold);
-
-  // M4 = A22 * (B21 - B11)
-  StrassenSetGlobalLocal(size_2, 1);
-  StrassenDataFilled(&MatMul_StrassenIMGFilled, data, A_temp[depth + 1], size_2, {size_2, C4},
-                     lite::opencl::MemType::IMG);
-  StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {size_2, 0, 0, 0}, 1,
-                 lite::opencl::MemType::BUF);
-  DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M4[depth + 1], size_2, depth + 1, threshold);
-
-  // M5 = (A11 + A22) * (B11 + B22)
-  StrassenSetGlobalLocal(size_2, 1);
-  StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {0, 0, size_2, C4}, 0,
-                 lite::opencl::MemType::IMG);
-  // (B11 + B22)
-  StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {0, 0, size_2, C4}, 0,
-                 lite::opencl::MemType::BUF);
-  DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M5[depth + 1], size_2, depth + 1, threshold);
-
-  // M6 = (A12 - A22) * (B21 + B22)
-  StrassenSetGlobalLocal(size_2, 1);
-  StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {0, C4, size_2, C4}, 1,
-                 lite::opencl::MemType::IMG);
-  StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {size_2, 0, size_2, C4}, 0,
-                 lite::opencl::MemType::BUF);
-  DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M6[depth + 1], size_2, depth + 1, threshold);
-
-  // M7 = (A11 - A21) * (B11 + B12)
-  StrassenSetGlobalLocal(size_2, 1);
-  StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {0, 0, size_2, 0}, 1,
-                 lite::opencl::MemType::IMG);
-  StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {0, 0, 0, C4}, 0,
-                 lite::opencl::MemType::BUF);
-  DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M7[depth + 1], size_2, depth + 1, threshold);
-
-  // BackResult
-  StrassenSetGlobalLocal(size_2, 1);
-  StrassenBackResult(&kernel_back_result, M1[depth + 1], M2[depth + 1], M3[depth + 1], M4[depth + 1], M5[depth + 1],
-                     M6[depth + 1], M7[depth + 1], result, size_2);
 }

 int MatMulOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
-  if (use_strassen) {
-    int threshold = 0;
-    const int up_bound = 1024;
-    const int down_bound = 256;
-    if (in_tensors_.at(0)->shape()[0] >= up_bound) {
-      threshold = UP_DIV(in_tensors_.at(0)->shape()[0], C4NUM) / 2;
-    } else if (in_tensors_.at(0)->shape()[0] <= down_bound) {
-      threshold = in_tensors_.at(0)->shape()[0];
-    } else {
-      threshold = UP_DIV(in_tensors_.at(0)->shape()[0], C4NUM);
-    }
-    DoStrassen(in_tensors_.at(0)->data_c(), padWeight_, out_tensors_.at(0)->data_c(), in_tensors_.at(0)->shape()[0], 0,
-               threshold);
-  } else {
   int arg_count = 0;
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c());
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c());
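
For reference, the seven products named in the removed DoStrassen comments (M1 = A11(B12 − B22), M2 = (A11 + A12)B22, M3 = (A21 + A22)B11, M4 = A22(B21 − B11), M5 = (A11 + A22)(B11 + B22), M6 = (A12 − A22)(B21 + B22), M7 = (A11 − A21)(B11 + B12)) are a valid Strassen decomposition of a 2x2 block product, and the Strassen_Back_Result kernel has to recombine them into the output quadrants. Under that naming the standard recombination (derived from the identities above, not quoted from the .cl source) is:

$$C_{11} = M_5 + M_4 - M_2 + M_6,\qquad C_{12} = M_1 + M_2,\qquad C_{21} = M_3 + M_4,\qquad C_{22} = M_5 + M_1 - M_3 - M_7.$$

The removed Run() path also shows the recursion cut-off: once a sub-matrix is at or below the threshold derived from the input's first dimension (with the 1024/256 bounds), it falls back to the plain MatMul kernel instead of recursing further.
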
@@ -492,10 +207,35 @@ int MatMulOpenCLKernel::Run() {
     ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[1]->data_c());
   }
   ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
-  }
   return mindspore::lite::RET_OK;
 }

-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_MatMul, OpenCLKernelCreator<MatMulOpenCLKernel>)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_MatMul, OpenCLKernelCreator<MatMulOpenCLKernel>)
+kernel::LiteKernel *OpenCLMatMulKernelCreator(const std::vector<lite::Tensor *> &inputs,
+                                              const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
+                                              const lite::InnerContext *ctx, const kernel::KernelKey &desc,
+                                              const mindspore::lite::PrimitiveC *primitive) {
+  kernel::OpenCLKernel *kernel;
+  if (IsUseStrassenMatmul(inputs)) {
+    MS_LOG(DEBUG) << "use_matmul_strassen";
+    kernel = new (std::nothrow) StrassenOpenCLKernel(opParameter, inputs, outputs);
+  } else {
+    kernel = new (std::nothrow) MatMulOpenCLKernel(opParameter, inputs, outputs);
+  }
+  if (kernel == nullptr) {
+    MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr.";
+    free(opParameter);
+    return nullptr;
+  }
+  auto ret = kernel->CheckSpecs();
+  if (ret != mindspore::lite::RET_OK) {
+    MS_LOG(ERROR) << "Check " << opParameter->name_ << " specification failed!";
+    delete kernel;
+    return nullptr;
+  }
+  return kernel;
+}
+
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_MatMul, OpenCLMatMulKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_MatMul, OpenCLMatMulKernelCreator)

 }  // namespace mindspore::kernel
@@ -41,7 +41,7 @@ class MatMulOpenCLKernel : public OpenCLKernel {
   void SetGlobalLocal() override;
   int Tune() override { return lite::RET_OK; }

- private:
+ protected:
   void *padWeight_{nullptr};
   bool enable_fp16_{false};
   bool transposeA{false};

@@ -51,43 +51,6 @@ class MatMulOpenCLKernel : public OpenCLKernel {
   bool act_weight_{false};
   std::vector<int> inShape{std::vector<int>(MAX_DIMS, 1)};
   std::vector<int> outShape{std::vector<int>(MAX_DIMS, 1)};
-
-  // strassen
- private:
-  void AllocatorMemoryForStrassen(int NumA, int NumB);
-  void DoStrassen(void *data, void *weight, void *result, const int size, const int depth, const int threshold);
-  void StrassenSetGlobalLocal(size_t strassen_size, int type_flag);
-  void StrassenSetConstArgs(cl::Kernel *kernel, int index, int strassen_size, bool is_matmul_kernel);
-  void StrassenDataFilled(cl::Kernel *kernel, void *input, void *output, const int size, cl_int2 offset,
-                          lite::opencl::MemType mem_type);
-  void StrassenAddSub(cl::Kernel *kernel, void *input, void *output, const int size, cl_int4 offset, int flag,
-                      lite::opencl::MemType mem_type);
-  void StrassenBackResult(cl::Kernel *kernel, void *input1, void *input2, void *input3, void *input4, void *input5,
-                          void *input6, void *input7, void *output, const int size);
-  void StrassenRunMmatmul(void *input, void *weight, void *output, const int size);
-  void PrintImage2d(void *IMGData, size_t typesize, size_t width, size_t size);
-  bool use_strassen{false};
-  cl::Kernel kernel_IMG_add_sub_2;
-  cl::Kernel MatMul_StrassenBUFFilled;
-  cl::Kernel MatMul_StrassenIMGFilled;
-  cl::Kernel kernel_BUF_add_sub_2;
-  cl::Kernel kernel_back_result;
-  cl::NDRange global_add_sub_, local_add_sub_;
-  std::vector<size_t> global_size_add_sub;
-  std::vector<size_t> local_size_add_sub;
-  // image 2d
-  void *A_temp[MAXDEPTH] = {nullptr};
-
-  void *M1[MAXDEPTH] = {nullptr};
-  void *M2[MAXDEPTH] = {nullptr};
-  void *M3[MAXDEPTH] = {nullptr};
-  void *M4[MAXDEPTH] = {nullptr};
-  void *M5[MAXDEPTH] = {nullptr};
-  void *M6[MAXDEPTH] = {nullptr};
-  void *M7[MAXDEPTH] = {nullptr};
-
-  // buffer
-  void *B_temp[MAXDEPTH] = {nullptr};
 };
 }  // namespace mindspore::kernel
@@ -0,0 +1,206 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/opencl/kernel/split.h"
+#include <cstring>
+#include <string>
+#include <algorithm>
+#include <set>
+#include "src/kernel_registry.h"
+#include "src/runtime/kernel/opencl/cl/split.cl.inc"
+
+using mindspore::kernel::KERNEL_ARCH::kGPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_Split;
+namespace mindspore::kernel {
+
+int SplitOpenCLKernel::RunAxis0() {
+  auto allocator_ = ocl_runtime_->GetAllocator();
+  std::vector<size_t> img_size;
+  auto src_data = in_tensors_[0]->data_c();
+  cl::Image2D *in_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data));
+  if (in_image == nullptr) {
+    MS_LOG(ERROR) << "RunAxis0 in_image can not be nullptr";
+    return RET_ERROR;
+  }
+  auto src_area = cl::array<cl::size_type, 3U>{0, 0, 0};
+  for (int i = 0; i < out_tensors_.size(); i++) {
+    auto dst_data = out_tensors_[i]->data_c();
+    allocator_->GetImageSize(dst_data, &img_size);
+    auto dst_area = cl::array<cl::size_type, 3U>{0, 0, 0};
+    auto region = cl::array<cl::size_type, 3U>{img_size[0], img_size[1], 1};
+    cl::Image2D *out_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(dst_data));
+    if (out_image == nullptr) {
+      MS_LOG(ERROR) << "RunAxis0 out_image can not be nullptr";
+      return RET_ERROR;
+    }
+    ocl_runtime_->GetDefaultCommandQueue()->enqueueCopyImage(*in_image, *out_image, src_area, dst_area, region);
+    src_area[1] += region[1];
+  }
+  return RET_OK;
+}
+
+int SplitOpenCLKernel::CheckSpecs() {
+  if (out_tensors_.size() != 2 || in_tensors_.size() != 1) {
+    MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
+    return RET_ERROR;
+  }
+  if (in_tensors_.at(0)->IsConst()) {
+    MS_LOG(ERROR) << "in_tensors_ must be tensor";
+    return RET_ERROR;
+  }
+  for (auto &out_tensor : out_tensors_) {
+    if (out_tensor->IsConst()) {
+      MS_LOG(ERROR) << "out_tensor must be tensor";
+      return RET_ERROR;
+    }
+  }
+  auto param = reinterpret_cast<SplitParameter *>(this->op_parameter_);
+  if (param->num_split_ != 2 && param->num_split_ != 1) {
+    MS_LOG(ERROR) << "num_split_ only supported 1 or 2 yet";
+    return RET_ERROR;
+  }
+  if (param->split_dim_ < 0 || param->split_dim_ > 3) {
+    MS_LOG(ERROR) << "split_dim_ must between 0~3";
+    return RET_ERROR;
+  }
+  if (param->split_sizes_ == nullptr) {
+    MS_LOG(ERROR) << "split_sizes_ can not nullptr";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+void SplitOpenCLKernel::AlignSplitSizes(SplitParameter *param, const std::vector<int> &in_shape) {
+  auto allocator = ocl_runtime_->GetAllocator();
+  int shape_dim = in_shape.at(param->split_dim_);
+  if (num_split_ == 1) {
+    size_t num_split = UP_DIV(shape_dim, param->split_sizes_[0]);
+    split_sizes_ = reinterpret_cast<int *>(allocator->Malloc(num_split * sizeof(int)));
+    for (int i = 0; i < num_split - 1; ++i) {
+      split_sizes_[i] = (i + 1) * param->split_sizes_[0];
+    }
+  } else {
+    int sum = 0;
+    split_sizes_ = reinterpret_cast<int *>(allocator->Malloc(num_split_ * sizeof(int)));
+    for (int i = 0; i < num_split_ - 1; ++i) {
+      sum += param->split_sizes_[i];
+      split_sizes_[i] = sum;
+    }
+  }
+}
+
+int SplitOpenCLKernel::Prepare() {
+  auto param = reinterpret_cast<SplitParameter *>(this->op_parameter_);
+  auto in_shape = in_tensors_.at(0)->shape();
+  int increment_dim = C4NUM - in_shape.size();
+  split_dim_ = param->split_dim_ == 0 ? param->split_dim_ : param->split_dim_ + increment_dim;
+  num_split_ = param->num_split_;
+  if (split_dim_ == 0) {
+    return RET_OK;
+  }
+  for (int i = 0; i < out_tensors_.size(); ++i) {
+    int length = out_tensors_[0]->shape().size();
+    if (split_dim_ == 3) {
+      if (out_tensors_[i]->shape()[length - 1] % C4NUM != 0) {
+        Align_ = false;
+      }
+    }
+  }
+  AlignSplitSizes(param, in_shape);
+  std::string kernel_name = "split_out";
+  kernel_name += num_split_ == 1 ? std::to_string(out_tensors().size()) : std::to_string(num_split_);
+  kernel_name += "_axis" + std::to_string(split_dim_);
+  if (!Align_) {
+    kernel_name += "_unalign";
+  }
+  MS_LOG(DEBUG) << "kernel_name=: " << kernel_name;
+  std::string source = split_source;
+  std::string program_name = "split";
+  ocl_runtime_->LoadSource(program_name, source);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  MS_LOG(DEBUG) << kernel_name << " Init Done!";
+  SetConstArgs();
+  SetGlobalLocal();
+  return RET_OK;
+}
+
+void SplitOpenCLKernel::SetConstArgs() {
+  int arg_cn = out_tensors_.size() + 2;
+  cl_int4 shape = {};
+  for (int i = 0; i < in_tensors_[0]->shape().size(); ++i) {
+    shape.s[i] = in_tensors_[0]->shape()[i];
+  }
+  Broadcast2GpuShape(in_shape_.s, shape.s, out_tensors_[0]->shape().size(), 1);
+  if (Align_) {
+    in_shape_.s[3] = UP_DIV(in_shape_.s[3], C4NUM);
+  }
+  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_);
+
+  for (int i = 0; i < out_tensors_.size(); ++i) {
+    cl_int4 temp = {};
+    for (int j = 0; j < out_tensors_[i]->shape().size(); ++j) {
+      temp.s[j] = out_tensors_[i]->shape()[j];
+    }
+    Broadcast2GpuShape(out_shape_.s, temp.s, out_tensors_[i]->shape().size(), 1);
+    if (Align_) {
+      out_shape_.s[3] = UP_DIV(out_shape_.s[3], C4NUM);
+    }
+    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape_);
+  }
+  GpuTensorInfo img_info(in_tensors_.at(0));
+  size_t dtype = enable_fp16_ ? sizeof(cl_half) : sizeof(cl_float);
+  stride_w = img_info.RowPitch() / dtype;
+  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_w);
+  return;
+}
+
+void SplitOpenCLKernel::SetGlobalLocal() {
+  OH = in_shape_.s[0] * in_shape_.s[1];
+  OW = in_shape_.s[2];
+  if (Align_) {
+    OC = in_shape_.s[3];
+  }
+  global_size_ = {OH, OW, OC};
+  local_size_ = {1, 1, 1};
+  OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
+  return;
+}
+
+int SplitOpenCLKernel::Run() {
+  if (split_dim_ == 0) {
+    RunAxis0();
+    return RET_OK;
+  }
+  int arg_cn = 0;
+  if (Align_) {
+    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c());
+  } else {
+    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c(), lite::opencl::MemType::BUF);
+  }
+  for (int i = 0; i < out_tensors_.size(); ++i) {
+    ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_.at(i)->data_c());
+  }
+  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, split_sizes_, lite::opencl::MemType::BUF);
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
+  return RET_OK;
+}
+
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Split, OpenCLKernelCreator<SplitOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Split, OpenCLKernelCreator<SplitOpenCLKernel>)
+}  // namespace mindspore::kernel
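
SplitOpenCLKernel::Prepare() assembles the kernel name from the output count, the NHWC-adjusted split axis, and an `_unalign` suffix when some output's channel count is not a multiple of 4; that is how it selects among the split.cl entry points above. A small standalone sketch of that naming rule (simplified, with made-up inputs):

#include <cstdio>
#include <string>

// Mirrors the name construction in Prepare(): "split_out<N>_axis<D>[_unalign]".
std::string SplitKernelName(int num_outputs, int split_dim, bool aligned) {
  std::string name = "split_out" + std::to_string(num_outputs) + "_axis" + std::to_string(split_dim);
  if (!aligned) name += "_unalign";
  return name;
}

int main() {
  printf("%s\n", SplitKernelName(2, 3, true).c_str());   // split_out2_axis3
  printf("%s\n", SplitKernelName(2, 3, false).c_str());  // split_out2_axis3_unalign
  return 0;
}
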
@@ -0,0 +1,60 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_SPLIT_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_SPLIT_H_
+
+#include <vector>
+#include "src/runtime/kernel/opencl/opencl_kernel.h"
+#include "nnacl/split_parameter.h"
+
+namespace mindspore::kernel {
+
+class SplitOpenCLKernel : public OpenCLKernel {
+ public:
+  SplitOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+                    const std::vector<lite::Tensor *> &outputs)
+      : OpenCLKernel(parameter, inputs, outputs) {}
+
+  ~SplitOpenCLKernel() override = default;
+
+  int Prepare() override;
+
+  int CheckSpecs() override;
+  void SetConstArgs() override;
+  void SetGlobalLocal() override;
+  int Run() override;
+
+ private:
+  void AlignSplitSizes(SplitParameter *param, const std::vector<int> &in_shape);
+  int RunAxis0();
+
+ private:
+  cl_int4 in_shape_{};
+  cl_int4 out_shape_ = {};
+  bool Align_{true};
+  bool enable_fp16_{false};
+  size_t num_split_ = 1;
+  int *split_sizes_{nullptr};
+  int split_dim_ = 0;
+  cl_int stride_w{1};
+  uint32_t OH = {1};
+  uint32_t OW = {1};
+  uint32_t OC = {1};
+};
+
+}  // namespace mindspore::kernel
+#endif
@ -0,0 +1,361 @@
|
||||||
|
/**
|
||||||
|
* Copyright 2019 Huawei Technologies n., Ltd
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <set>
|
||||||
|
#include <string>
|
||||||
|
#include <map>
|
||||||
|
#include "src/runtime/kernel/opencl/kernel/matmul.h"
|
||||||
|
#include "src/runtime/kernel/opencl/kernel/strassen.h"
|
||||||
|
#include "src/common/utils.h"
|
||||||
|
|
||||||
|
#ifndef PROGRAM_WITH_IL
|
||||||
|
|
||||||
|
#include "src/runtime/kernel/opencl/cl/strassen.cl.inc"
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace mindspore::kernel {
|
||||||
|
|
||||||
|
int StrassenOpenCLKernel::Prepare() {
|
||||||
|
#ifdef PROGRAM_WITH_IL
|
||||||
|
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
|
||||||
|
#else
|
||||||
|
std::string kernel_name = "MatMul_Strassen_NHWC4_2d";
|
||||||
|
std::string source = strassen_source;
|
||||||
|
std::string program_name = "MatMul";
|
||||||
|
ocl_runtime_->LoadSource(program_name, source);
|
||||||
|
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
|
||||||
|
ocl_runtime_->BuildKernel(kernel_IMG_add_sub_2, program_name, "MatMul_IMG_Add_Sub_2");
|
||||||
|
ocl_runtime_->BuildKernel(kernel_BUF_add_sub_2, program_name, "MatMul_BUF_Add_Sub_2");
|
||||||
|
ocl_runtime_->BuildKernel(kernel_back_result, program_name, "Strassen_Back_Result");
|
||||||
|
ocl_runtime_->BuildKernel(MatMul_StrassenBUFFilled, program_name, "MatMul_BUF_Filled");
|
||||||
|
ocl_runtime_->BuildKernel(MatMul_StrassenIMGFilled, program_name, "MatMul_IMG_Filled");
|
||||||
|
|
||||||
|
#endif
|
||||||
|
auto ret = InitWeights();
|
||||||
|
if (ret != RET_OK) {
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
SetConstArgs();
|
||||||
|
SetGlobalLocal();
|
||||||
|
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||||
|
return mindspore::lite::RET_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
void StrassenOpenCLKernel::AllocatorMemoryForStrassen(int NumA, int NumB) {
|
||||||
|
std::vector<size_t> img_size;
|
||||||
|
img_size.push_back(UP_DIV(NumA, C4NUM));
|
||||||
|
img_size.push_back(NumA);
|
||||||
|
size_t img_dtype = enable_fp16_ ? CL_HALF_FLOAT : CL_FLOAT;
|
||||||
|
size_t dtype_size = enable_fp16_ ? sizeof(CL_HALF_FLOAT) : sizeof(CL_FLOAT);
|
||||||
|
img_size.push_back(img_dtype);
|
||||||
|
auto allocator = ocl_runtime_->GetAllocator();
|
||||||
|
size_t memA = NumA * NumA;
|
||||||
|
|
||||||
|
size_t memB = NumB * NumB * dtype_size;
|
||||||
|
for (int depth = 0; depth < MAXDEPTH; depth++) {
|
||||||
|
B_temp[depth] = allocator->Malloc(memB);
|
||||||
|
A_temp[depth] = allocator->Malloc(memA, img_size);
|
||||||
|
|
||||||
|
M1[depth] = allocator->Malloc(memA, img_size);
|
||||||
|
M2[depth] = allocator->Malloc(memA, img_size);
|
||||||
|
M3[depth] = allocator->Malloc(memA, img_size);
|
||||||
|
M4[depth] = allocator->Malloc(memA, img_size);
|
||||||
|
M5[depth] = allocator->Malloc(memA, img_size);
|
||||||
|
M6[depth] = allocator->Malloc(memA, img_size);
|
||||||
|
M7[depth] = allocator->Malloc(memA, img_size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int StrassenOpenCLKernel::InitWeights() {
|
||||||
|
// ABMCI @ ABCICO = ABMCO
|
||||||
|
auto ret = DequantWeight();
|
||||||
|
if (ret != RET_OK) {
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
auto allocator = ocl_runtime_->GetAllocator();
|
||||||
|
int NumA = in_tensors_[0]->shape()[0];
|
||||||
|
int NumB = in_tensors_[1]->shape()[0];
|
||||||
|
size_t dtype_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
|
||||||
|
padWeight_ = allocator->Malloc(NumA * NumB * dtype_size);
|
||||||
|
padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
|
||||||
|
auto padWeightFp32 = reinterpret_cast<float *>(padWeight_);
|
||||||
|
auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_);
|
||||||
|
memset(padWeight_, 0x00, NumA * NumB * dtype_size);
|
||||||
|
auto originWeightFp32 = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->data_c());
|
||||||
|
auto originWeightFp16 = reinterpret_cast<float16_t *>(in_tensors_.at(kWeightIndex)->data_c());
|
||||||
|
bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16;
|
||||||
|
AllocatorMemoryForStrassen(NumA / 2, NumB / 2);
|
||||||
|
size_t size = NumA * NumB * dtype_size;
|
||||||
|
if (isModelFp16) {
|
||||||
|
if (enable_fp16_) {
|
||||||
|
memcpy(padWeightFp16, originWeightFp16, size);
|
||||||
|
} else {
|
||||||
|
for (int i = 0; i < NumA * NumB; ++i) {
|
||||||
|
padWeightFp32[i] = static_cast<float>(originWeightFp16[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (enable_fp16_) {
|
||||||
|
for (int i = 0; i < NumA * NumB; ++i) {
|
||||||
|
padWeightFp16[i] = static_cast<float16_t>(originWeightFp32[i]);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
memcpy(padWeightFp32, originWeightFp32, size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
allocator->UnmapBuffer(padWeight_);
|
||||||
|
FreeDequantedWeight();
|
||||||
|
return RET_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
void AlignStrassenGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local,
                              cl::NDRange *global_range, cl::NDRange *local_range) {
  *local_range = cl::NDRange(local[0], local[1], local[2]);
  *global_range =
    cl::NDRange(UP_ROUND(global[0], local[0]), UP_ROUND(global[1], local[1]), UP_ROUND(global[2], local[2]));
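  // For example, assuming UP_ROUND(x, y) rounds x up to the nearest multiple of y, a global size of {100, 1, 100}
  // with a local size of {16, 1, 16} is aligned to an NDRange of {112, 1, 112}.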
}

// 0 : global_size_, 1 : global_size_add_sub
void StrassenOpenCLKernel::StrassenSetGlobalLocal(size_t strassen_size, int type_flag) {
  size_t strassen_size_C4 = UP_DIV(strassen_size, C4NUM);
  local_size_add_sub = {16, 1, 16};
  if (type_flag == 0) {
    global_size_ = {strassen_size_C4, 1, strassen_size};
    AlignGlobalLocal(global_size_, local_size_);
  } else {
    global_size_add_sub = {strassen_size_C4, 1, strassen_size};
    AlignStrassenGlobalLocal(global_size_add_sub, local_size_add_sub, &global_add_sub_, &local_add_sub_);
  }
}

void StrassenOpenCLKernel::SetGlobalLocal() {
  // local size should be less than MAX_GROUP_SIZE
  local_size_ = {32, 4, 1};
  global_size_ = {1, 1, 1};
  size_t strassen_size = outShape[3] / 2;
  StrassenSetGlobalLocal(strassen_size, 0);  // set global_ and local
  StrassenSetGlobalLocal(strassen_size, 1);  // set global_size_add_sub
  StrassenSetGlobalLocal(strassen_size, 2);  // set global_size_weights
}

void StrassenOpenCLKernel::StrassenSetConstArgs(cl::Kernel *kernel, int index, int strassen_size,
                                                bool is_matmul_kernel) {
  cl_int4 shape;
  if (is_matmul_kernel) {
    shape = {1, 1, strassen_size, strassen_size};
  } else {
    shape = {strassen_size, 1, 1, UP_DIV(strassen_size, C4NUM)};
  }
  ocl_runtime_->SetKernelArg(*kernel, index, shape);
}

void StrassenOpenCLKernel::SetConstArgs() {
  int arg_count = 2;
  cl_int4 in_shape = {inShape[0], inShape[1], inShape[2], inShape[3]};
  cl_int4 out_shape = {outShape[0], outShape[1], outShape[2], outShape[3]};
  cl_int4 shape_offset = {0, 0, 0, 0};
  int strassen_size = inShape[3] / 2;
  out_shape.s[2] = in_shape.s[2] = in_shape.s[2] / 2;
  out_shape.s[3] = in_shape.s[3] = in_shape.s[3] / 2;
  StrassenSetConstArgs(&kernel_IMG_add_sub_2, 3, strassen_size, false);
  StrassenSetConstArgs(&kernel_BUF_add_sub_2, 2, strassen_size, false);
  ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape);
  ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape);
  ocl_runtime_->SetKernelArg(kernel_, arg_count++, shape_offset);
}

// OriginSize = N*H*W*C, typesize = sizeof(data type), width = W * UP_DIV(C, C4NUM), size = N
void StrassenOpenCLKernel::PrintImage2d(void *IMGData, size_t typesize, size_t width, size_t size) {
  auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
  int alignment = runtime_wrapper.GetInstance()->GetImagePitchAlignment();
  auto runtime = runtime_wrapper.GetInstance();
  runtime->SyncCommandQueue();
  MS_ASSERT(alignment);
  size_t row_pitch = UP_ROUND(width, alignment) * typesize * C4NUM;
  size_t OriginSize = size * size * typesize;
  std::vector<char> data(OriginSize);
  auto row_size = width * typesize * C4NUM;

  for (int i = 0; i < size; ++i) {
    memcpy(reinterpret_cast<char *>(data.data()) + i * row_size, static_cast<char *>(IMGData) + i * row_pitch,
           row_size);
  }
  for (int i = 0; i < size * size; ++i) {
    // print each element, treating the buffer as fp16 when typesize indicates half-precision data
    if (typesize == sizeof(float)) {
      std::cout << reinterpret_cast<float *>(data.data())[i] << " ";
    } else {
      std::cout << static_cast<float>(reinterpret_cast<float16_t *>(data.data())[i]) << " ";
    }
    if ((i + 1) % size == 0) {
      std::cout << std::endl;
    }
  }
}

void StrassenOpenCLKernel::StrassenDataFilled(cl::Kernel *kernel, void *input, void *output, const int size,
                                              cl_int2 offset, lite::opencl::MemType mem_type) {
  if (input == nullptr || output == nullptr) {
    MS_LOG(ERROR) << "StrassenDataFilled input or output must not be nullptr";
    return;
  }
  if (mem_type == lite::opencl::MemType::IMG) {
    ocl_runtime_->SetKernelArg(*kernel, 0, input);
    ocl_runtime_->SetKernelArg(*kernel, 1, output);
  } else {
    ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::BUF);
    ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::BUF);
  }
  StrassenSetConstArgs(kernel, 2, size, false);
  ocl_runtime_->SetKernelArg(*kernel, 3, offset);
  ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_);
}

void StrassenOpenCLKernel::StrassenAddSub(cl::Kernel *kernel, void *input, void *output, const int size, cl_int4 offset,
                                          int flag, lite::opencl::MemType mem_type) {
  if (input == nullptr || output == nullptr) {
    MS_LOG(ERROR) << "StrassenAddSub input or output must not be nullptr";
    return;
  }
  if (mem_type == lite::opencl::MemType::IMG) {
    ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::IMG);
    ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::IMG);
  } else {
    ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::BUF);
    ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::BUF);
  }
  StrassenSetConstArgs(kernel, 2, size, false);
  ocl_runtime_->SetKernelArg(*kernel, 3, offset);
  ocl_runtime_->SetKernelArg(*kernel, 4, flag);
  ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_);
}

void StrassenOpenCLKernel::StrassenBackResult(cl::Kernel *kernel, void *input1, void *input2, void *input3,
                                              void *input4, void *input5, void *input6, void *input7, void *output,
                                              const int size) {
  if (input1 == nullptr || input2 == nullptr || input3 == nullptr || input4 == nullptr || input5 == nullptr ||
      input6 == nullptr || input7 == nullptr || output == nullptr) {
    MS_LOG(ERROR) << "StrassenBackResult inputs and output must not be nullptr";
    return;
  }
  ocl_runtime_->SetKernelArg(*kernel, 0, input1);
  ocl_runtime_->SetKernelArg(*kernel, 1, input2);
  ocl_runtime_->SetKernelArg(*kernel, 2, input3);
  ocl_runtime_->SetKernelArg(*kernel, 3, input4);
  ocl_runtime_->SetKernelArg(*kernel, 4, input5);
  ocl_runtime_->SetKernelArg(*kernel, 5, input6);
  ocl_runtime_->SetKernelArg(*kernel, 6, input7);
  ocl_runtime_->SetKernelArg(*kernel, 7, output);
  StrassenSetConstArgs(kernel, 8, size, false);
  ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_);
}

void StrassenOpenCLKernel::StrassenRunMmatmul(void *input, void *weight, void *output, const int size) {
  if (input == nullptr || weight == nullptr || output == nullptr) {
    MS_LOG(ERROR) << "StrassenRunMmatmul input, weight or output must not be nullptr";
    return;
  }
  ocl_runtime_->SetKernelArg(kernel_, 0, input);
  ocl_runtime_->SetKernelArg(kernel_, 1, output);
  ocl_runtime_->SetKernelArg(kernel_, 2, weight, lite::opencl::MemType::BUF);
  StrassenSetConstArgs(&kernel_, 3, size, true);
  StrassenSetConstArgs(&kernel_, 4, size, true);
  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
}

void StrassenOpenCLKernel::DoStrassen(void *data, void *weight, void *result, const int size, const int depth,
                                      const int threshold) {
  const int size_2 = size / 2;
  int C4 = UP_DIV(size_2, C4NUM);
  if (size <= threshold) {
    // run the plain matmul kernel
    StrassenSetGlobalLocal(size, 0);
    StrassenRunMmatmul(data, weight, result, size);
    return;
  }
  // flag = 0 : add, flag = 1 : sub
  // M1 = A11 * (B12 - B22)
  StrassenSetGlobalLocal(size_2, 1);
  StrassenDataFilled(&MatMul_StrassenIMGFilled, data, A_temp[depth + 1], size_2, {0, 0}, lite::opencl::MemType::IMG);
  StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {0, C4, size_2, C4}, 1,
                 lite::opencl::MemType::BUF);
  DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M1[depth + 1], size_2, depth + 1, threshold);

  // M2 = (A11 + A12) * B22
  StrassenSetGlobalLocal(size_2, 1);
  StrassenDataFilled(&MatMul_StrassenBUFFilled, weight, B_temp[depth + 1], size_2, {size_2, C4},
                     lite::opencl::MemType::BUF);
  StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {0, 0, 0, C4}, 0, lite::opencl::MemType::IMG);
  DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M2[depth + 1], size_2, depth + 1, threshold);

  // M3 = (A21 + A22) * B11
  StrassenSetGlobalLocal(size_2, 1);
  StrassenDataFilled(&MatMul_StrassenBUFFilled, weight, B_temp[depth + 1], size_2, {0, 0}, lite::opencl::MemType::BUF);
  StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {size_2, 0, size_2, C4}, 0,
                 lite::opencl::MemType::IMG);
  DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M3[depth + 1], size_2, depth + 1, threshold);

  // M4 = A22 * (B21 - B11)
  StrassenSetGlobalLocal(size_2, 1);
  StrassenDataFilled(&MatMul_StrassenIMGFilled, data, A_temp[depth + 1], size_2, {size_2, C4},
                     lite::opencl::MemType::IMG);
  StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {size_2, 0, 0, 0}, 1,
                 lite::opencl::MemType::BUF);
  DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M4[depth + 1], size_2, depth + 1, threshold);

  // M5 = (A11 + A22) * (B11 + B22)
  StrassenSetGlobalLocal(size_2, 1);
  StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {0, 0, size_2, C4}, 0,
                 lite::opencl::MemType::IMG);
  // (B11 + B22)
  StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {0, 0, size_2, C4}, 0,
                 lite::opencl::MemType::BUF);
  DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M5[depth + 1], size_2, depth + 1, threshold);

  // M6 = (A12 - A22) * (B21 + B22)
  StrassenSetGlobalLocal(size_2, 1);
  StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {0, C4, size_2, C4}, 1,
                 lite::opencl::MemType::IMG);
  StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {size_2, 0, size_2, C4}, 0,
                 lite::opencl::MemType::BUF);
  DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M6[depth + 1], size_2, depth + 1, threshold);

  // M7 = (A11 - A21) * (B11 + B12)
  StrassenSetGlobalLocal(size_2, 1);
  StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {0, 0, size_2, 0}, 1,
                 lite::opencl::MemType::IMG);
  StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {0, 0, 0, C4}, 0,
                 lite::opencl::MemType::BUF);
  DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M7[depth + 1], size_2, depth + 1, threshold);

  // BackResult
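  // The back-result kernel presumably recombines the seven products with the standard Strassen identities:
  //   C11 = M5 + M4 - M2 + M6
  //   C12 = M1 + M2
  //   C21 = M3 + M4
  //   C22 = M5 + M1 - M3 - M7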
  StrassenSetGlobalLocal(size_2, 1);
  StrassenBackResult(&kernel_back_result, M1[depth + 1], M2[depth + 1], M3[depth + 1], M4[depth + 1], M5[depth + 1],
                     M6[depth + 1], M7[depth + 1], result, size_2);
}

int StrassenOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running!";
  int threshold = 0;
  const int up_bound = 1024;
  const int down_bound = 256;
  if (in_tensors_.at(0)->shape()[0] >= up_bound) {
    threshold = UP_DIV(in_tensors_.at(0)->shape()[0], C4NUM) / 2;
  } else if (in_tensors_.at(0)->shape()[0] <= down_bound) {
    threshold = in_tensors_.at(0)->shape()[0];
  } else {
    threshold = UP_DIV(in_tensors_.at(0)->shape()[0], C4NUM);
  }
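  // For a hypothetical 512 x 512 input (between the two bounds), threshold = UP_DIV(512, C4NUM) = 128, so the
  // recursion in DoStrassen stops and falls back to the plain matmul kernel once sub-blocks reach 128 x 128.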
  DoStrassen(in_tensors_.at(0)->data_c(), padWeight_, out_tensors_.at(0)->data_c(), in_tensors_.at(0)->shape()[0], 0,
             threshold);
  return mindspore::lite::RET_OK;
}
}  // namespace mindspore::kernel

@ -0,0 +1,77 @@
/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_STRASSEN_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_STRASSEN_H_

#include <string>
#include <vector>
#include "src/runtime/kernel/opencl/kernel/matmul.h"

namespace mindspore::kernel {

class StrassenOpenCLKernel : public MatMulOpenCLKernel {
 public:
  StrassenOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                       const std::vector<lite::Tensor *> &outputs)
      : MatMulOpenCLKernel(parameter, inputs, outputs) {}
  ~StrassenOpenCLKernel() override = default;

 public:
  int Run() override;
  int Prepare() override;
  int InitWeights() override;
  void SetConstArgs() override;
  void SetGlobalLocal() override;

  // strassen
 private:
  void AllocatorMemoryForStrassen(int NumA, int NumB);
  void DoStrassen(void *data, void *weight, void *result, const int size, const int depth, const int threshold);
  void StrassenSetGlobalLocal(size_t strassen_size, int type_flag);
  void StrassenSetConstArgs(cl::Kernel *kernel, int index, int strassen_size, bool is_matmul_kernel);
  void StrassenDataFilled(cl::Kernel *kernel, void *input, void *output, const int size, cl_int2 offset,
                          lite::opencl::MemType mem_type);
  void StrassenAddSub(cl::Kernel *kernel, void *input, void *output, const int size, cl_int4 offset, int flag,
                      lite::opencl::MemType mem_type);
  void StrassenBackResult(cl::Kernel *kernel, void *input1, void *input2, void *input3, void *input4, void *input5,
                          void *input6, void *input7, void *output, const int size);
  void StrassenRunMmatmul(void *input, void *weight, void *output, const int size);
  void PrintImage2d(void *IMGData, size_t typesize, size_t width, size_t size);

  cl::Kernel kernel_IMG_add_sub_2;
  cl::Kernel MatMul_StrassenBUFFilled;
  cl::Kernel MatMul_StrassenIMGFilled;
  cl::Kernel kernel_BUF_add_sub_2;
  cl::Kernel kernel_back_result;
  cl::NDRange global_add_sub_, local_add_sub_;
  std::vector<size_t> global_size_add_sub;
  std::vector<size_t> local_size_add_sub;
  // image 2d
  void *A_temp[MAXDEPTH] = {nullptr};
  void *M1[MAXDEPTH] = {nullptr};
  void *M2[MAXDEPTH] = {nullptr};
  void *M3[MAXDEPTH] = {nullptr};
  void *M4[MAXDEPTH] = {nullptr};
  void *M5[MAXDEPTH] = {nullptr};
  void *M6[MAXDEPTH] = {nullptr};
  void *M7[MAXDEPTH] = {nullptr};
  // buffer
  void *B_temp[MAXDEPTH] = {nullptr};
};

}  // namespace mindspore::kernel

#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_STRASSEN_H_

@ -26,7 +26,156 @@ using mindspore::lite::KernelRegistry;
using mindspore::schema::Format::Format_NHWC;

namespace mindspore::lite::opencl::test {

// multi-output
void TestMain(const std::vector<ArgsTuple> &input_infos, const std::vector<ArgsTupleOut> &output_info,
              OpParameter *op_parameter, bool fp16_enable, float atol, float rtol, bool print_data) {
  std::vector<ArgsTupleWithDtype> input_infos_new;
  auto transform_fun = [](ArgsTuple in) -> ArgsTupleWithDtype {
    return ArgsTupleWithDtype(std::get<0>(in), std::get<1>(in), std::get<2>(in), kNumberTypeFloat32);
  };
  std::transform(input_infos.begin(), input_infos.end(), std::back_inserter(input_infos_new), transform_fun);
  TestMain(input_infos_new, output_info, op_parameter, fp16_enable, atol, rtol, print_data);
}

void TestMain(const std::vector<ArgsTupleWithDtype> &input_infos, const std::vector<ArgsTupleOut> &output_info,
              OpParameter *op_parameter, bool fp16_enable, float atol, float rtol, bool print_data) {
  auto primitive_type = static_cast<schema::PrimitiveType>(op_parameter->type_);
  static std::set<schema::PrimitiveType> packed_op = {
    schema::PrimitiveType_Conv2D, schema::PrimitiveType_DeConv2D, schema::PrimitiveType_DepthwiseConv2D,
    schema::PrimitiveType_DeDepthwiseConv2D, schema::PrimitiveType_MatMul};

  // simulating benchmark: session::LiteSession::CreateSession() -> session->Init()
  MS_LOG(DEBUG) << "initialize OpenCLRuntime and OpenCLAllocator";
  auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
  auto ocl_runtime = runtime_wrapper.GetInstance();
  ocl_runtime->SetFp16Enable(fp16_enable);
  EXPECT_TRUE(ocl_runtime->Init() == RET_OK);

  // simulating benchmark: session_->CompileGraph() -> ConvertTensors()
  MS_LOG(DEBUG) << "create Tensors & init weight data";
  std::vector<std::shared_ptr<Tensor>> in_tensors;
  std::vector<std::shared_ptr<Tensor>> out_tensors;
  // firstly, create all Tensors
  in_tensors.reserve(input_infos.size());  // the vector's capacity() starts at 0, so reserve() avoids reallocations
  for (auto input_info : input_infos) {
    auto &shape = std::get<0>(input_info);
    auto category = std::get<2>(input_info);
    auto data_type = std::get<3>(input_info);
    in_tensors.emplace_back(std::make_shared<Tensor>(data_type, shape, Format_NHWC, category));
  }
  for (auto out_info : output_info) {
    const std::vector<int> &output_shape = std::get<0>(out_info);
    out_tensors.emplace_back(std::make_shared<Tensor>(kNumberTypeFloat32, output_shape, Format_NHWC, VAR));
  }
  // secondly, init weight Tensor's data
  std::vector<Tensor *> kernel_inputs;
  std::vector<Tensor *> subgraph_inputs;
  std::vector<Tensor *> outputs;
  std::map<Tensor *, float *> subgraph_inputs_data;
  for (int i = 0; i < in_tensors.size(); ++i) {
    auto tensor = in_tensors[i];
    auto *input_data = std::get<1>(input_infos[i]);
    kernel_inputs.push_back(tensor.get());
    if (tensor->category() != VAR) {  // tensor is weight
      // simulating src/lite_session.cc:WeightTensorNeedCopy()
      if (packed_op.count(primitive_type)) {
        tensor->set_data(input_data);
      } else {
        memcpy(tensor->MutableData(), input_data, tensor->Size());
      }
    } else {
      EXPECT_TRUE(tensor->data_type() == kNumberTypeFloat32 || tensor->data_type() == kNumberTypeInt32);
      subgraph_inputs.push_back(tensor.get());
      subgraph_inputs_data[tensor.get()] = reinterpret_cast<float *>(input_data);
    }
  }
  for (int i = 0; i < out_tensors.size(); ++i) {
    auto out_tensor = out_tensors[i];
    outputs.push_back(out_tensor.get());
  }

  // simulating benchmark: session_->CompileGraph() -> scheduler.Schedule() -> BuildKernels()
  MS_LOG(DEBUG) << "create OpenCLKernel";
  kernel::KernelKey key{kernel::kGPU, kernel_inputs.front()->data_type(), primitive_type};
  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
  if (creator == nullptr) {
    std::cerr << "can't get registry function for: " << schema::EnumNamePrimitiveType(primitive_type)
              << ". Maybe you forgot to set op_parameter_.type_ for OpParameter." << std::endl;
    free(op_parameter);
    FAIL();
  }
  auto *kernel = creator(kernel_inputs, outputs, op_parameter, nullptr, key, nullptr);
  if (kernel == nullptr) {
    std::cerr << "call registry function error: " << schema::EnumNamePrimitiveType(primitive_type) << std::endl;
    free(op_parameter);
    FAIL();
  }
  kernel->set_name(schema::EnumNamesPrimitiveType()[primitive_type]);

  // simulating benchmark: session_->CompileGraph() -> scheduler.Schedule() -> ConstructSubGraphs()
  MS_LOG(DEBUG) << "create SubGraph";
  std::vector<LiteKernel *> kernels{kernel};
  auto sub_graph = new (std::nothrow) OpenCLSubGraph(subgraph_inputs, outputs, kernels, kernels, kernels);
  if (sub_graph == nullptr) {
    return;
  }

  // call sub_graph->Init() after constructing the subgraph, like scheduler.cc
  MS_LOG(DEBUG) << "call sub_graph->Init()";
  EXPECT_TRUE(sub_graph->Init() == RET_OK);

  // simulating benchmark: session_->CompileGraph() -> PrepareKernels() -> OpenCLSubGraph.Prepare()
  MS_LOG(DEBUG) << "call sub_graph->Prepare()";
  EXPECT_TRUE(sub_graph->Prepare() == RET_OK);  // will set the Tensor's allocator to the OpenCLAllocator

  // simulating benchmark: model->Free(), clear weight data in input_infos
  std::vector<std::unique_ptr<uint8_t[]>> saved_weights;
  for (int i = 0; i < in_tensors.size(); ++i) {
    auto &tensor = in_tensors[i];
    if (tensor->category() != VAR) {
      saved_weights.emplace_back(new uint8_t[tensor->Size()]);
      auto *weight_data = std::get<1>(input_infos[i]);
      memcpy(saved_weights.back().get(), weight_data, tensor->Size());
      srand(time(nullptr));
      memset(weight_data, rand(), tensor->Size());
    }
  }

  // simulating benchmark: LoadInput()
  MS_LOG(DEBUG) << "malloc and init input data";
  for (auto input : subgraph_inputs) {
    EXPECT_TRUE(input->MutableData() != nullptr);  // malloc Image2D & call MapBuffer()
    memcpy(input->data_c(), subgraph_inputs_data[input], input->Size());
  }

  // simulating benchmark: MarkAccuracy() -> session_->RunGraph() -> executor_->Run() -> OpenCLSubGraph->Run()
  MS_LOG(DEBUG) << "run SubGraph & compare result";
  EXPECT_TRUE(sub_graph->Run() == RET_OK);  // will call UnmapBuffer() for input

  for (int i = 0; i < outputs.size(); ++i) {
    ocl_runtime->GetAllocator()->MapBuffer(outputs[i]->data_c(), CL_MAP_READ, nullptr, true);
    float *expect_data = reinterpret_cast<float *>(std::get<1>(output_info[i]));
    CompareOutput<float>(outputs[i]->data_c(), expect_data, outputs[i]->ElementsNum(), atol, rtol, print_data);
    ocl_runtime->GetAllocator()->UnmapBuffer(outputs[i]->data_c());
  }

  MS_LOG(DEBUG) << "release resources";
  for (auto &tensor : in_tensors) {
    if (tensor->category() != VAR && packed_op.count(primitive_type)) {
      tensor->set_data(nullptr);
    }
  }
  for (int i = 0, j = 0; i < in_tensors.size(); ++i) {  // restore weight data to input_infos
    auto &tensor = in_tensors[i];
    if (tensor->category() != VAR) {
      auto *weight_data = std::get<1>(input_infos[i]);
      memcpy(weight_data, saved_weights[j++].get(), tensor->Size());
    }
  }
  delete sub_graph;
}

// single-output
void TestMain(const std::vector<ArgsTupleWithDtype> &input_infos, std::tuple<std::vector<int>, float *> output_info,
              OpParameter *op_parameter, bool fp16_enable, float atol, float rtol, bool print_data) {
  auto primitive_type = static_cast<schema::PrimitiveType>(op_parameter->type_);

@ -31,6 +31,7 @@
using Tensor = mindspore::lite::Tensor;
using ArgsTuple = std::tuple<std::vector<int>, void *, Tensor::Category>;
using ArgsTupleOut = std::tuple<std::vector<int>, void *>;
using ArgsTupleWithDtype = std::tuple<std::vector<int>, void *, Tensor::Category, mindspore::TypeId>;
constexpr Tensor::Category VAR = Tensor::VAR;
constexpr Tensor::Category CONST_TENSOR = Tensor::Category::CONST_TENSOR;

@ -89,10 +90,17 @@ T *CreateParameter(schema::PrimitiveType type) {
  return param;
}

void TestMain(const std::vector<ArgsTupleWithDtype> &input_infos, std::tuple<std::vector<int>, float *> output_info,
void TestMain(const std::vector<ArgsTupleWithDtype> &input_infos, const std::vector<ArgsTupleOut> &output_info,
              OpParameter *op_parameter, bool fp16_enable = false, float atol = 1e-9, float rtol = 1e-9,
              bool print_output = false);

void TestMain(const std::vector<ArgsTuple> &input_infos, const std::vector<ArgsTupleOut> &output_info,
              OpParameter *op_parameter, bool fp16_enable = false, float atol = 1e-9, float rtol = 1e-9,
              bool print_output = false);

void TestMain(const std::vector<ArgsTupleWithDtype> &input_infos, std::tuple<std::vector<int>, float *> output_info,
              OpParameter *op_parameter, bool fp16_enable = false, float atol = 1e-9, float rtol = 1e-9,
              bool print_output = false);

void TestMain(const std::vector<ArgsTuple> &input_infos, std::tuple<std::vector<int>, float *> output_info,
              OpParameter *op_parameter, bool fp16_enable = false, float atol = 1e-9, float rtol = 1e-9,
              bool print_output = false);

@ -44,6 +44,21 @@ TEST_F(TestOpenCL_Concat, input2_axis0) {
  }
}

TEST_F(TestOpenCL_Concat, input2_axis0_shape1) {
  std::vector<int> input0_shape = {1};
  std::vector<int> input1_shape = {1};
  std::vector<int> output_shape = {2};
  int axis = 0;
  float input0_data[] = {0.75};
  float input1_data[] = {0.5};
  float output_data[] = {0.75, 0.5};
  for (auto fp16_enable : {false}) {
    auto *param = CreateParameter(axis);
    TestMain({{input0_shape, input0_data, VAR}, {input1_shape, input1_data, VAR}}, {output_shape, output_data}, param,
             fp16_enable, fp16_enable ? 1e-3 : 1e-9);
  }
}

TEST_F(TestOpenCL_Concat, input2_axis1_Align) {
  std::vector<int> input0_shape = {2, 2, 2, 8};
  std::vector<int> input1_shape = {2, 2, 2, 8};

@ -32,13 +32,30 @@ OpParameter *CreateParameter(int axis) {
TEST_F(TestOpenCL_Gather, Axis0) {
  int axis = 0;
  std::vector<int> input_shape = {10};
  std::vector<int> indices_shape = {2};
  std::vector<int> indices_shape = {5};
  std::vector<int> output_shape = {2};
  std::vector<int> output_shape = {5};
  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
  int32_t indices[] = {1, 3};
  int32_t indices[] = {1, 3, 5, 7, 9};
  float output_data[] = {1, 3};
  float output_data[] = {1, 3, 5, 7, 9};

  for (auto fp16_enable : {false, true}) {
  for (auto fp16_enable : {false}) {
    auto *param = CreateParameter(axis);
    TestMain(
      {{input_shape, input_data, VAR, kNumberTypeFloat32}, {indices_shape, indices, CONST_TENSOR, kNumberTypeInt32}},
      {output_shape, output_data}, param, fp16_enable);
  }
}

TEST_F(TestOpenCL_Gather, Axis0ConstTensor) {
  int axis = 0;
  std::vector<int> input_shape = {10};
  std::vector<int> indices_shape = {1};
  std::vector<int> output_shape = {1};
  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
  int32_t indices[] = {1};
  float output_data[] = {1};

  for (auto fp16_enable : {false}) {
    auto *param = CreateParameter(axis);
    TestMain(
      {{input_shape, input_data, VAR, kNumberTypeFloat32}, {indices_shape, indices, CONST_TENSOR, kNumberTypeInt32}},

@ -49,11 +66,11 @@ TEST_F(TestOpenCL_Gather, Axis0) {
TEST_F(TestOpenCL_Gather, Axis0_Tensor) {
  int axis = 0;
  std::vector<int> input_shape = {10};
  std::vector<int> indices_shape = {2};
  std::vector<int> indices_shape = {1};
  std::vector<int> output_shape = {2};
  std::vector<int> output_shape = {1};
  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
  int32_t indices[] = {1, 3};
  int32_t indices[] = {1};
  float output_data[] = {1, 3};
  float output_data[] = {1};

  for (auto fp16_enable : {false}) {
    auto *param = CreateParameter(axis);

@ -45,7 +45,7 @@ TEST_F(TestOpenCL_MatMul, 2Dfile) {
  auto output_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size));

  for (auto fp16_enable : {false}) {
    auto *param = CreateParameter();
    auto *param = CreateParameter(false, false);
    TestMain({{input_shape, input_data, VAR}, {weight_shape, weight_data, CONST_TENSOR}}, {output_shape, output_data},
             param, fp16_enable, fp16_enable ? 1e-3 : 1e-3);
  }

@ -0,0 +1,57 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "ut/src/runtime/kernel/opencl/common.h"
#include "nnacl/split_parameter.h"

namespace mindspore::lite::opencl::test {

class TestOpenCL_Split : public CommonTest {};

namespace {
// PrimitiveType_Split: src/ops/populate/split_populate.cc
OpParameter *CreateParameter(int split_dim_, int num_split_, std::vector<int> split_sizes_) {
  auto *param = test::CreateParameter<SplitParameter>(schema::PrimitiveType_Split);
  param->split_dim_ = split_dim_;
  param->num_split_ = num_split_;
  param->split_sizes_ = reinterpret_cast<int *>(malloc(param->num_split_ * sizeof(int)));
  for (int i = 0; i < param->num_split_; ++i) {
    param->split_sizes_[i] = split_sizes_[i];
  }
  return reinterpret_cast<OpParameter *>(param);
}
}  // namespace

TEST_F(TestOpenCL_Split, input2_axis3) {
  std::vector<int> input_shape = {2, 2, 2, 12};
  std::vector<int> output_shape1 = {2, 2, 2, 6};
  std::vector<int> output_shape2 = {2, 2, 2, 6};
  int split_dim_ = 3;
  int num_split_ = 2;  // length of split_sizes_
  std::vector<int> split_sizes_{6, 6};
  size_t input_size, output1_size, output2_size;
  std::string inputPath = "./test_data/splitfp32_input.bin";
  std::string output1Path = "./test_data/splitfp32_output1.bin";
  std::string output2Path = "./test_data/splitfp32_output2.bin";
  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(inputPath.c_str(), &input_size));
  auto output_data1 = reinterpret_cast<float *>(mindspore::lite::ReadFile(output1Path.c_str(), &output1_size));
  auto output_data2 = reinterpret_cast<float *>(mindspore::lite::ReadFile(output2Path.c_str(), &output2_size));
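  // Assuming the .bin files store raw float32 values in NHWC order, the input holds 2 * 2 * 2 * 12 = 96 floats and
  // each expected output holds 2 * 2 * 2 * 6 = 48 floats.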
  for (auto fp16_enable : {false}) {
    auto *param = CreateParameter(split_dim_, num_split_, split_sizes_);
    TestMain({{input_shape, input_data, VAR}}, {{output_shape1, output_data1}, {output_shape2, output_data2}}, param,
             fp16_enable, fp16_enable ? 1e-3 : 1e-9);
  }
}
}  // namespace mindspore::lite::opencl::test
Binary file not shown.
Binary file not shown.
Binary file not shown.